Compare commits

...

75 Commits

Author SHA1 Message Date
e53382cc2e Sage Moore fixes for full cuda graph support for DeepEP+DeepGEMM LL
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-06-24 11:21:52 -04:00
26d34eb67e refactor example - qwen3_reranker (#19847)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-24 14:03:20 +00:00
53da4cd397 [Bugfix][CPU] Fix InputBatch for pooling models in the CPU v1 (#20014)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-06-24 13:20:04 +00:00
9a3b88328f [PERF] Speedup of MRoPE prepare inputs (#19939)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@centml.ai>
2025-06-23 23:01:26 -07:00
3014c920da add some examples for other benchmark scripts (#19893)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-24 05:57:46 +00:00
0eed516951 [doc] Fix broken link in the installation for CPU (#19980)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-06-24 12:04:11 +08:00
ee5ad8d2c5 [Misc][Tools][Benchmark] Add profile to autotune script (#19711)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-06-24 00:59:41 +00:00
a738dbb2a1 Update test case parameter to have the throughput above 8.0 (#19994)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-06-24 00:18:10 +00:00
33d5e29be9 [TPU] Fix tpu model runner test (#19995)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-06-23 16:04:28 -07:00
4671ac6e2a [Bugfix][Benchmark] Fix Marlin benchmark (#19929) 2025-06-24 07:25:12 +09:00
dd2ccf8dde Feat Dynamic Quantization for MoE Layers in GPTQ Marlin Backend (#19395) 2025-06-24 07:23:28 +09:00
a3bc76e4b5 [CI/Build] Push latest tag for cpu and neuron docker image (#19897)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-23 14:15:37 -07:00
e6327c9b3e [Feature] Support sequence parallelism for static fp8 quantization (#19181)
Signed-off-by: cascade812 <cascade812@outlook.com>
2025-06-23 16:09:02 -04:00
d0132f025d [Misc] Add type alias ReqId and EngineId for better readability (#19880)
Signed-off-by: Linkun Chen <github@lkchen.net>
2025-06-23 12:57:57 -07:00
61f4fc5dc6 [Bugfix][v1] Fix step pooler implementation and step pooling usage in v1 (#19956)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-23 18:38:06 +00:00
68aaeb3749 [EP+DP] Optimize the little operations in the DeepGEMM + DeepEP low latency case (#19885)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-06-23 11:07:47 -07:00
c3649e4fee [Docs] Fix syntax highlighting of shell commands (#19870)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-06-23 17:59:09 +00:00
53243e5c42 [doc] improve readability for long commands (#19920)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-23 14:27:07 +00:00
a6e6604d32 [Bugfix] Fix CI bitsandbytes failure (#19969)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-23 21:30:55 +08:00
b82e0f82cb [doc] use MkDocs collapsible blocks - supplement (#19973)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-23 10:54:16 +00:00
5111642a6f [Doc] Update V1 status for decoder-only embedding models (#19952)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-23 09:31:06 +00:00
1bcd15edc7 [BugFix][P/D] Fix for cases where _recving_transfers can be cleaned up when *all* transfer done (#19874)
Signed-off-by: Linkun Chen <github@lkchen.net>
2025-06-22 22:41:53 -07:00
2ebff5b77c [P/D][NixlConnector] Support tp_size > num_kv_heads deployments (#19691)
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-06-22 22:41:50 -07:00
f17aec0d63 [doc] Fold long code blocks to improve readability (#19926)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-23 05:24:23 +00:00
493c275352 Fix(models/siglip): Add compatibility for Gemma models quantized by llm-compressor (#19643)
Signed-off-by: Vensenmu <vensenmu@gmail.com>
2025-06-23 03:40:28 +00:00
f39ab2d4bd [Misc] Configurable timeout for execute_model RPC calls via env var (#19544)
Signed-off-by: jinqinn <goodqinjin@163.com>
2025-06-22 20:36:26 -07:00
4a0f7888a3 [Core] feat: Implement Priority Scheduling in V1 Engine (#19057)
Signed-off-by: amit <amit.man@gmail.com>
Co-authored-by: Roger Wang <Rogerw0108@gmail.com>
2025-06-22 20:18:08 -07:00
c4cf260677 [Perf][CLI] Improve overall startup time (#19941) 2025-06-22 23:11:22 +00:00
33d51f599e [BugFix] Add an env to disable moe chunking to work around compile incompatibility (#19642)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-06-22 15:17:49 -07:00
e91386cde1 [Chore] dedup logs (#19955) 2025-06-22 19:43:07 +00:00
2c11a29f0b [Misc] Simplify vllm bench cli subcommand implementation (#19948) 2025-06-22 12:34:48 -04:00
c76a506bd6 [Misc] Update model-specific PR tagging (#19949)
Signed-off-by: Roger Wang <hey@rogerw.me>
2025-06-22 12:16:08 +00:00
ec0db6f51c [doc] use snippets for contact us (#19944)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-22 10:26:13 +00:00
c305a2109d [CI/Build] Auto tag perf benchmarks related PRs (#19943)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-22 08:46:21 +00:00
202c5df935 [Benchmark] fix request loss if "ping" is returned (#19535)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-22 07:21:04 +00:00
2bb246b8f7 [MISC] add cpu_kvcache_space_bytes to CacheConfig (#19812)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-22 13:39:09 +08:00
4c409cabc2 [Misc] add vllm_config in __init__ (#19866)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-21 23:10:46 -04:00
3b1e4c6a23 [Docs] Add GPT2ForSequenceClassification to supported models in docs (#19932)
Signed-off-by: nie3e <adrcwiek@gmail.com>
2025-06-21 20:57:19 +00:00
2c5302fadd [Multimodal] Optimize Qwen2/2.5-VL startup time (#19756)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-06-21 20:01:07 +00:00
caa680fd2e [doc] add contact us in community (#19922)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-21 17:29:06 +00:00
c3bf9bad11 [New model support]Support Tarsier2 (#19887)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-06-21 04:01:51 +00:00
6f170f11dd [Bugfix] Fix bnb 8bit model weights loading (#19917)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-21 03:29:09 +00:00
8ca81bb069 Fix: Check the type of params to be a Sequence not list. (#19910)
Signed-off-by: Rabin Adhikari <rabin.adk1@gmail.com>
2025-06-20 23:03:17 +00:00
e773a9e1c2 [Misc] Clean up useless code (#19889)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-20 21:09:09 +00:00
71baf85ae1 [Kernel] mark TorchSDPABackend swap_blocks NotImplementedError (#19749) 2025-06-20 18:18:11 +00:00
79f2f1c2a1 [CPU][CI] Fallback sliding window to v0 and fix CPU pooling model tests (#19901)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-06-20 15:30:36 +00:00
2e3e3c86dc Export NaNs in logits to scheduler_stats if output is corrupted (#18777)
Signed-off-by: Vlad Mihailescu <vtmihailescu@gmail.com>
2025-06-20 22:47:16 +08:00
7e8977fcd4 [custom_op][vllm-plugin] update custom_op class to use op_registry (#19164)
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
2025-06-20 07:44:56 -07:00
f1e840e842 [Model] GPT2ForSequenceClassification model (#19663)
Signed-off-by: nie3e <adrcwiek@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-20 12:07:41 +00:00
7771d1de88 [Fix] import regex instead of re (#19875)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-06-20 11:16:48 +00:00
71d1219545 [Kernel] correct cpu worker function parameter type (#19745)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-20 10:50:13 +00:00
e384f2f108 [Misc] refactor example - openai_transcription_client (#19851)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-20 08:02:21 +00:00
089a306f19 [Misc] update cuda version (#19526)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-20 07:25:15 +00:00
5e666f72cd [Bugfix][Ray] Set the cuda context eagerly in the ray worker (#19583) 2025-06-19 22:01:16 -07:00
e3a3e4db46 [Bugfix] Enable PP with AITER+V1 (#19822)
Signed-off-by: Qiang Li <qiang.li2@amd.com>
2025-06-20 12:43:20 +08:00
e41bf15cd0 [Chore]: qwen3-moe-type-hints-mistake (#19860)
Co-authored-by: xinnan.hou <hxn02029096@alibaba-inc.com>
2025-06-19 21:43:07 -07:00
5aa4a015ce [Benchmark] Fix Value of type "SampleRequest" is not indexable (#18032)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-06-19 21:28:55 -07:00
b6bad3d186 [CI][Neuron] Fail and exit on first error (#19622)
Signed-off-by: Elaine Zhao <elaineyz@amazon.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-20 12:27:51 +08:00
ee9a1531aa [CI/Build][Bugfix] Fix deadlock on v1 engine test CI (#19872)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-20 09:51:07 +08:00
10d82f9ac5 [Benchmark][Bugfix] Fix Dataset Length Calculation (#19868)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2025-06-19 18:30:41 -07:00
ea10dd9d9e [Frontend] early return chat format resolution when specified (#19735) 2025-06-19 18:49:59 +00:00
ead2110297 [Core][Bugfix] Fix Online MM Beam Search (#19688)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-06-19 17:18:07 +00:00
01220ce89a [CI][CPU] Improve dummy Triton interfaces and fix the CPU CI (#19838)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-06-19 15:46:09 +00:00
6f68c49220 [Doc] Update V1 user guide for embedding models (#19842)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-19 09:43:27 +00:00
4719460644 Fixing Chunked Prefill Test. (#19762)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-06-19 01:36:16 -07:00
466166dcfd [Frontend] Add optional token-level progress bar to LLM.beam_search (#19301)
Signed-off-by: Ruosen Li <rxl190028@utdallas.edu>
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: Ubuntu <ubuntu@ip-172-31-71-179.ec2.internal>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-19 03:21:41 -04:00
1d0ae26c85 Add xLAM tool parser support (#17148) 2025-06-19 14:26:41 +08:00
6021999573 [Minor] Allow redirecting model path for HfRunner in test (#19795)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-18 23:04:10 -07:00
c7b370c603 raise exception for pin_lora (#19809)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-18 22:57:35 -07:00
aa20d10a91 [Misc] [ROCm] Prevent surplus tensor reshape (#19803)
Signed-off-by: Zsolt Borbely <zsolt.borbely@htecgroup.com>
2025-06-19 13:57:16 +08:00
2de12be428 [ROCm] [AITER] [Bugfix] Patch for AITER commit 648764942e552a8bb5fe16026703716a81f05374 (#18990)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-06-18 22:56:31 -07:00
83ca9ae47b Mark invariant normalizer in Gemma as non-persistent (#19788)
Signed-off-by: Yu-Hang Tang <Tang.Maxin@gmail.com>
2025-06-18 22:56:03 -07:00
e2148dc5ea [Bugfix] Add check_health to v1 async client. (#19821)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
2025-06-18 21:47:01 -07:00
b1098b4072 [Bugfix] Fix the linter (#19826)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-18 21:44:41 -07:00
799397ee4f Support embedding models in V1 (#16188)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Signed-off-by: Max de Bayser <maxdebayser@gmail.com>
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-18 21:36:33 -07:00
249 changed files with 10046 additions and 4709 deletions

View File

@ -16,7 +16,7 @@ Please download the visualization scripts in the post
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code:
```console
```bash
export HF_TOKEN=<your HF token>
apt update
apt install -y git

View File

@ -102,6 +102,7 @@ steps:
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
@ -117,6 +118,7 @@ steps:
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"

View File

@ -54,10 +54,11 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \
--name "${container_name}" \
${image_name} \
/bin/bash -c "
set -e; # Exit on first error
python3 /workspace/vllm/examples/offline_inference/neuron.py;
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
echo 'Running test file: '$f;
echo \"Running test file: \$f\";
python3 -m pytest \$f -v --capture=tee-sys;
done
"

View File

@ -4,8 +4,8 @@ CONTAINER_NAME=vllm-tpu
# vllm config
MODEL=meta-llama/Llama-3.1-8B-Instruct
MAX_NUM_SEQS=512
MAX_NUM_BATCHED_TOKENS=512
MAX_NUM_SEQS=256
MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist

View File

@ -89,7 +89,7 @@ steps:
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Chunked Prefill Test
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_chunked_prefill
@ -271,6 +271,15 @@ steps:
commands:
- pytest -v -s prefix_caching
- label: Platform Tests (CUDA)
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/cuda
commands:
- pytest -v -s cuda/test_cuda_context.py
- label: Samplers Test # 36min
mirror_hardwares: [amdexperimental]
source_file_dependencies:

.github/mergify.yml vendored
View File

@ -45,6 +45,7 @@ pull_request_rules:
- files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
- files~=^vllm/model_executor/models/.*llama.*\.py
- files~=^vllm/transformers_utils/configs/.*llama.*\.py
- title~=(?i)llama
actions:
label:
add:
@ -65,6 +66,19 @@ pull_request_rules:
add:
- multi-modality
- name: label-performance
description: Automatically apply performance label
conditions:
- or:
- files~=^benchmarks/
- files~=^vllm/benchmarks/
- files~=^tests/benchmarks/
- files~=^\.buildkite/nightly-benchmarks/
actions:
label:
add:
- performance
- name: label-qwen
description: Automatically apply qwen label
conditions:
@ -74,7 +88,6 @@ pull_request_rules:
- files~=^vllm/model_executor/models/.*qwen.*\.py
- files~=^vllm/reasoning/.*qwen.*\.py
- title~=(?i)Qwen
- body~=(?i)Qwen
actions:
label:
add:

View File

@ -115,6 +115,11 @@ repos:
entry: python tools/check_spdx_header.py
language: python
types: [python]
- id: check-root-lazy-imports
name: Check root lazy imports
entry: python tools/check_init_lazy_imports.py
language: python
types: [python]
- id: check-filenames
name: Check for spaces in all filenames
entry: bash

View File

@ -154,11 +154,13 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Contact Us
<!-- --8<-- [start:contact-us] -->
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
<!-- --8<-- [end:contact-us] -->
## Media Kit

View File

@ -387,3 +387,178 @@ python3 vllm/benchmarks/benchmark_throughput.py \
--enable-lora \
--lora-path yard1/llama-2-7b-sql-lora-test
```
---
## Example - Structured Output Benchmark
Benchmark the performance of structured output generation (JSON, grammar, regex).
### Server Setup
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
```
### JSON Schema Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset json \
--structured-output-ratio 1.0 \
--request-rate 10 \
--num-prompts 1000
```
### Grammar-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset grammar \
--structure-type grammar \
--request-rate 10 \
--num-prompts 1000
```
### Regex-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset regex \
--request-rate 10 \
--num-prompts 1000
```
### Choice-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset choice \
--request-rate 10 \
--num-prompts 1000
```
### XGrammar Benchmark Dataset
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset xgrammar_bench \
--request-rate 10 \
--num-prompts 1000
```
---
## Example - Long Document QA Throughput Benchmark
Benchmark the performance of long document question-answering with prefix caching.
### Basic Long Document QA Test
```bash
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 16 \
--document-length 2000 \
--output-len 50 \
--repeat-count 5
```
### Different Repeat Modes
```bash
# Random mode (default) - shuffle prompts randomly
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode random
# Tile mode - repeat entire prompt list in sequence
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode tile
# Interleave mode - repeat each prompt consecutively
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode interleave
```
---
## Example - Prefix Caching Benchmark
Benchmark the efficiency of automatic prefix caching.
### Fixed Prompt with Prefix Caching
```bash
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-prompts 1 \
--repeat-count 100 \
--input-length-range 128:256
```
### ShareGPT Dataset with Prefix Caching
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
--enable-prefix-caching \
--num-prompts 20 \
--repeat-count 5 \
--input-length-range 128:256
```
---
## Example - Request Prioritization Benchmark
Benchmark the performance of request prioritization in vLLM.
### Basic Prioritization Test
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority
```
### Multiple Sequences per Prompt
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority \
--n 2
```

View File

@ -10,6 +10,7 @@
# 3. Set variables (ALL REQUIRED)
# BASE: your directory for vllm repo
# MODEL: the model served by vllm
# SYSTEM: the hardware, either TPU or GPU; for other systems, "get best profile" may not be supported.
# TP: ways of tensor parallelism
# DOWNLOAD_DIR: directory to download and load model weights.
# INPUT_LEN: request input len
@ -34,6 +35,7 @@
TAG=$(date +"%Y_%m_%d_%H_%M")
BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct"
SYSTEM="TPU"
TP=1
DOWNLOAD_DIR=""
INPUT_LEN=4000
@ -45,12 +47,15 @@ NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
PROFILE_PATH="$LOG_FOLDER/profile"
echo "result file: $RESULT"
echo "model: $MODEL"
rm -rf $LOG_FOLDER
rm -rf $PROFILE_PATH
mkdir -p $LOG_FOLDER
mkdir -p $PROFILE_PATH
cd "$BASE/vllm"
@ -70,10 +75,11 @@ start_server() {
local max_num_seqs=$2
local max_num_batched_tokens=$3
local vllm_log=$4
local profile_dir=$5
pkill -f vllm
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
--disable-log-requests \
--port 8004 \
--gpu-memory-utilization $gpu_memory_utilization \
@ -105,19 +111,37 @@ start_server() {
fi
}
update_best_profile() {
local profile_dir=$1
local profile_index=$2
sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
selected_profile_file=
if [[ "$SYSTEM" == "TPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
fi
if [[ "$SYSTEM" == "GPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}"
fi
rm -f $PROFILE_PATH/*
cp $selected_profile_file $PROFILE_PATH
}
run_benchmark() {
local max_num_seqs=$1
local max_num_batched_tokens=$2
local gpu_memory_utilization=$3
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
mkdir -p $profile_dir
pkill -f vllm
local profile_index=0
echo "starting server..."
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
result=$?
if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
@ -144,7 +168,8 @@ run_benchmark() {
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \
--random-prefix-len $prefix_len \
--port 8004 &> "$bm_log"
--port 8004 \
--profile &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
@ -158,6 +183,7 @@ run_benchmark() {
# start from request-rate as int(throughput) + 1
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
profile_index=$((profile_index+1))
# clear prefix cache
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
sleep 5
@ -195,6 +221,12 @@ run_benchmark() {
best_max_num_seqs=$max_num_seqs
best_num_batched_tokens=$max_num_batched_tokens
best_goodput=$goodput
if [[ "$SYSTEM" == "TPU" ]]; then
update_best_profile "$profile_dir/plugins/profile" $profile_index
fi
if [[ "$SYSTEM" == "GPU" ]]; then
update_best_profile "$profile_dir" $profile_index
fi
fi
else
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
@ -239,6 +271,6 @@ for num_seqs in "${num_seqs_list[@]}"; do
done
done
echo "finish permutations"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"

View File

@ -404,8 +404,14 @@ async def async_request_openai_chat_completions(
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
# NOTE: SSE comments (often used as pings) start with a colon.
# These are not JSON data payloads and should be skipped.
if chunk_bytes.startswith(":"):
continue
chunk = chunk_bytes.removeprefix("data: ")
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
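As a rough, self-contained illustration of the skip logic added above (not the benchmark's actual streaming loop), SSE comment lines beginning with a colon can be filtered out before JSON parsing like this:

```python
import json

def iter_sse_payloads(lines):
    """Yield parsed JSON payloads from raw SSE lines, skipping comment pings."""
    for chunk_bytes in lines:
        chunk_bytes = chunk_bytes.strip()
        if not chunk_bytes:
            continue  # keep-alive blank line
        text = chunk_bytes.decode("utf-8")
        if text.startswith(":"):
            continue  # SSE comment (often used as a ping), not a JSON data payload
        chunk = text.removeprefix("data: ")
        if chunk == "[DONE]":
            break
        yield json.loads(chunk)

# A ":ping" comment is dropped instead of being counted as a lost request.
stream = [b": ping", b'data: {"id": 1}', b"data: [DONE]"]
print(list(iter_sse_payloads(stream)))  # [{'id': 1}]
```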

View File

@ -353,7 +353,7 @@ class RandomDataset(BenchmarkDataset):
: input_lens[i]
]
prompt = tokenizer.decode(re_encoded_sequence)
total_input_len = prefix_len + int(input_lens[i])
total_input_len = len(re_encoded_sequence)
requests.append(
SampleRequest(
prompt=prompt,

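The replaced line assumed the final prompt length is exactly `prefix_len + input_lens[i]`, but a decode/re-encode round trip does not always preserve the token count, so the new code measures the re-encoded sequence directly. A rough illustration of the effect, using `gpt2` purely as an example tokenizer (not the benchmark's configuration):

```python
import random

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # illustrative tokenizer choice

# Sample some arbitrary token ids, similar in spirit to the random dataset.
random.seed(0)
token_ids = [random.randrange(tok.vocab_size) for _ in range(32)]

text = tok.decode(token_ids)
re_encoded = tok(text, add_special_tokens=False).input_ids

# Decode + re-encode may merge or split tokens, so the two lengths can differ;
# len(re_encoded) is what the model actually receives as the input length.
print(len(token_ids), len(re_encoded))
```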
View File

@ -97,7 +97,7 @@ def run_vllm(
assert lora_requests is None, "BeamSearch API does not support LoRA"
prompts = [request.prompt for request in requests]
# output_len should be the same for all requests.
output_len = requests[0][2]
output_len = requests[0].expected_output_len
for request in requests:
assert request.expected_output_len == output_len
start = time.perf_counter()
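For context on the change above: `SampleRequest` is a dataclass, so its fields must be read by attribute rather than by position. A minimal sketch (the field names other than `expected_output_len` are illustrative, not the benchmark's exact definition):

```python
from dataclasses import dataclass

@dataclass
class SampleRequest:
    prompt: str
    prompt_len: int            # illustrative field
    expected_output_len: int

req = SampleRequest(prompt="Hello", prompt_len=1, expected_output_len=64)

print(req.expected_output_len)  # 64 -- attribute access works
try:
    req[2]                      # dataclasses are not subscriptable by default
except TypeError as err:
    print(err)                  # 'SampleRequest' object is not subscriptable
```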

View File

@ -22,8 +22,16 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
MARLIN_SUPPORTED_GROUP_SIZES,
query_marlin_supported_quant_types,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
FP4_MARLIN_SUPPORTED_GROUP_SIZES,
rand_marlin_weight_fp4_like,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
marlin_quant_fp8_torch,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
MarlinWorkspace,
awq_marlin_quantize,
marlin_quantize,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
@ -35,7 +43,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
quantize_weights,
sort_weights,
)
from vllm.scalar_type import ScalarType
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
@ -57,80 +65,144 @@ def bench_run(
size_n: int,
):
label = "Quant Matmul"
sub_label = "{}, act={} k_full={}, q={}, g={}, MKN=({}x{}x{})".format(
model, act_order, is_k_full, str(quant_type), group_size, size_m, size_k, size_n
)
print(f"Testing: {sub_label}")
a = torch.randn(size_m, size_k).to(torch.half).cuda()
b = torch.rand(size_k, size_n).to(torch.half).cuda()
has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8]
if act_order and (group_size == -1 or group_size == size_k or has_zp):
return
if size_k % group_size != 0:
return
a_tmp = torch.zeros(size_m, size_k).to(torch.half).cuda()
# Marlin quant
(
marlin_w_ref,
marlin_q_w,
marlin_s,
marlin_g_idx,
marlin_sort_indices,
marlin_rand_perm,
) = marlin_quantize(b, quant_type, group_size, act_order)
# Marlin_24 quant
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
marlin_24_quantize(b, quant_type, group_size)
marlin_24_supported = (
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
)
marlin_zp = torch.empty(0, dtype=torch.int, device=b.device)
# GPTQ quant
(w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights(
b, quant_type, group_size, act_order
repack_supported = (
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in MARLIN_SUPPORTED_GROUP_SIZES
)
q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
# For act_order, sort the "weights" and "g_idx"
# so that group ids are increasing
repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
if act_order:
(q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
# Prepare
marlin_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
)
marlin_24_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
)
marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int)
# AllSpark W8A16 quant
as_supported_case = (
allspark_supported = (
quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
and group_size == -1
and not act_order
and is_k_full
)
if as_supported_case:
properties = torch.cuda.get_device_properties(b.device.index)
sm_count = properties.multi_processor_count
sm_version = properties.major * 10 + properties.minor
supported_arch = sm_version >= 80 and sm_version < 90
as_supported_case = as_supported_case and supported_arch
if supported_arch:
has_zp = False
w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp)
qw = qw.to(torch.uint8)
qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
qw, s, zp, has_zp
def gen_marlin_params():
# Marlin quant
marlin_g_idx = marlin_sort_indices = marlin_zp = marlin_s2 = None
if quant_type == scalar_types.float4_e2m1f:
if group_size != 16 or act_order:
return
marlin_w_ref, marlin_q_w, marlin_s, marlin_s2 = rand_marlin_weight_fp4_like(
b.T, group_size
)
CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD
elif quant_type == scalar_types.float8_e4m3fn:
if group_size not in [-1, 128] or act_order:
return
marlin_w_ref, marlin_q_w, marlin_s = marlin_quant_fp8_torch(b.T, group_size)
elif group_size == 16:
return
elif has_zp:
marlin_w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize(
b, quant_type, group_size
)
else:
marlin_w_ref, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, _ = (
marlin_quantize(b, quant_type, group_size, act_order)
)
return (
marlin_w_ref,
marlin_q_w,
marlin_s,
marlin_s2,
marlin_zp,
marlin_g_idx,
marlin_sort_indices,
)
def gen_marlin_24_params():
marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None
if marlin_24_supported:
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
marlin_24_quantize(b, quant_type, group_size)
)
return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s)
def gen_repack_params():
q_w_gptq = None
repack_sort_indices = None
if repack_supported:
(w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights(
b, quant_type, group_size, act_order
)
q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
# For act_order, sort the "weights" and "g_idx"
# so that group ids are increasing
repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
if act_order:
(q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
return q_w_gptq, repack_sort_indices
def gen_allspark_params():
qw_reorder = s_reorder = zp_reorder = sm_count = sm_version = (
CUBLAS_M_THRESHOLD
) = None
nonlocal allspark_supported
if allspark_supported:
properties = torch.cuda.get_device_properties(b.device.index)
sm_count = properties.multi_processor_count
sm_version = properties.major * 10 + properties.minor
supported_arch = sm_version >= 80 and sm_version < 90
allspark_supported = allspark_supported and supported_arch
if supported_arch:
w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp)
qw = qw.to(torch.uint8)
qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
qw, s, zp, has_zp
)
CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD
return (
qw_reorder,
s_reorder,
zp_reorder,
sm_count,
sm_version,
CUBLAS_M_THRESHOLD,
)
(
marlin_w_ref,
marlin_q_w,
marlin_s,
marlin_s2,
marlin_zp,
marlin_g_idx,
marlin_sort_indices,
) = gen_marlin_params()
marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = (
gen_marlin_24_params()
)
q_w_gptq, repack_sort_indices = gen_repack_params()
qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = (
gen_allspark_params()
)
# Prepare
marlin_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
)
marlin_24_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
)
globals = {
# Gen params
@ -140,15 +212,14 @@ def bench_run(
"size_n": size_n,
"size_k": size_k,
"a": a,
"a_tmp": a_tmp,
# Marlin params
"marlin_w_ref": marlin_w_ref,
"marlin_q_w": marlin_q_w,
"marlin_s": marlin_s,
"marlin_s2": marlin_s2,
"marlin_zp": marlin_zp,
"marlin_g_idx": marlin_g_idx,
"marlin_sort_indices": marlin_sort_indices,
"marlin_rand_perm": marlin_rand_perm,
"marlin_workspace": marlin_workspace,
"is_k_full": is_k_full,
# Marlin_24 params
@ -161,12 +232,12 @@ def bench_run(
"q_w_gptq": q_w_gptq,
"repack_sort_indices": repack_sort_indices,
# AllSpark W8A16 params
"qw_reorder": qw_reorder if as_supported_case else None,
"s_reorder": s_reorder if as_supported_case else None,
"zp_reorder": zp_reorder if as_supported_case else None,
"sm_count": sm_count if as_supported_case else None,
"sm_version": sm_version if as_supported_case else None,
"CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD if as_supported_case else None,
"qw_reorder": qw_reorder,
"s_reorder": s_reorder,
"zp_reorder": zp_reorder,
"sm_count": sm_count,
"sm_version": sm_version,
"CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD,
# Kernels
"gptq_marlin_gemm": ops.gptq_marlin_gemm,
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
@ -177,7 +248,7 @@ def bench_run(
min_run_time = 1
# Warmup pytorch
for i in range(5):
for _ in range(5):
torch.matmul(a, marlin_w_ref)
results.append(
@ -192,17 +263,17 @@ def bench_run(
results.append(
benchmark.Timer(
stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="gptq_marlin_gemm_fp16",
description="gptq_marlin_gemm",
).blocked_autorange(min_run_time=min_run_time)
)
results.append(
benchmark.Timer(
stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@ -210,10 +281,7 @@ def bench_run(
).blocked_autorange(min_run_time=min_run_time)
)
if (
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
):
if marlin_24_supported:
results.append(
benchmark.Timer(
stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501
@ -224,17 +292,18 @@ def bench_run(
).blocked_autorange(min_run_time=min_run_time)
)
results.append(
benchmark.Timer(
stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="gptq_marlin_repack",
).blocked_autorange(min_run_time=min_run_time)
)
if repack_supported:
results.append(
benchmark.Timer(
stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="gptq_marlin_repack",
).blocked_autorange(min_run_time=min_run_time)
)
if as_supported_case:
if allspark_supported:
results.append(
benchmark.Timer(
stmt="output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)", # noqa: E501
@ -250,7 +319,6 @@ def main(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
results: list[benchmark.Measurement] = []
for model in args.models:
@ -278,14 +346,17 @@ def main(args):
):
continue
for quant_type in query_marlin_supported_quant_types(False):
for quant_type in query_marlin_supported_quant_types():
if (
len(args.limit_num_bits) > 0
and quant_type.size_bits not in args.limit_num_bits
):
continue
for group_size in MARLIN_SUPPORTED_GROUP_SIZES:
for group_size in (
MARLIN_SUPPORTED_GROUP_SIZES
+ FP4_MARLIN_SUPPORTED_GROUP_SIZES
):
if (
len(args.limit_group_size) > 0
and group_size not in args.limit_group_size

View File

@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="1a7f4dfa"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="c1debd8"
ARG AITER_BRANCH="6487649"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base

View File

@ -91,7 +91,7 @@ source to unblock the update process.
### FlashInfer
Here is how to build and install it from source with torch2.7.0+cu128 in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271):
```
```bash
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
export FLASHINFER_ENABLE_SM90=1
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"
@ -105,14 +105,14 @@ team if you want to get the package published there.
### xFormers
Similar to FlashInfer, here is how to build and install xFormers from source:
```
```bash
export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
MAX_JOBS=16 uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
```
### Mamba
```
```bash
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
```

View File

@ -16,35 +16,33 @@ vllm {chat,complete,serve,bench,collect-env,run-batch}
Start the vLLM OpenAI Compatible API server.
Examples:
??? Examples
```bash
# Start with a model
vllm serve meta-llama/Llama-2-7b-hf
```bash
# Start with a model
vllm serve meta-llama/Llama-2-7b-hf
# Specify the port
vllm serve meta-llama/Llama-2-7b-hf --port 8100
# Specify the port
vllm serve meta-llama/Llama-2-7b-hf --port 8100
# Check with --help for more options
# To list all groups
vllm serve --help=listgroup
# Check with --help for more options
# To list all groups
vllm serve --help=listgroup
# To view an argument group
vllm serve --help=ModelConfig
# To view an argument group
vllm serve --help=ModelConfig
# To view a single argument
vllm serve --help=max-num-seqs
# To view a single argument
vllm serve --help=max-num-seqs
# To search by keyword
vllm serve --help=max
```
# To search by keyword
vllm serve --help=max
```
## chat
Generate chat completions via the running API server.
Examples:
```bash
# Directly connect to localhost API without arguments
vllm chat
@ -60,8 +58,6 @@ vllm chat --quick "hi"
Generate text completions based on the given prompt via the running API server.
Examples:
```bash
# Directly connect to localhost API without arguments
vllm complete
@ -73,6 +69,8 @@ vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1
vllm complete --quick "The future of AI is"
```
</details>
## bench
Run benchmark tests for latency, online serving throughput, and offline inference throughput.
@ -89,8 +87,6 @@ vllm bench {latency, serve, throughput}
Benchmark the latency of a single batch of requests.
Example:
```bash
vllm bench latency \
--model meta-llama/Llama-3.2-1B-Instruct \
@ -104,8 +100,6 @@ vllm bench latency \
Benchmark the online serving throughput.
Example:
```bash
vllm bench serve \
--model meta-llama/Llama-3.2-1B-Instruct \
@ -120,8 +114,6 @@ vllm bench serve \
Benchmark offline inference throughput.
Example:
```bash
vllm bench throughput \
--model meta-llama/Llama-3.2-1B-Instruct \
@ -143,7 +135,8 @@ vllm collect-env
Run batch prompts and write results to file.
Examples:
<details>
<summary>Examples</summary>
```bash
# Running with a local file
@ -159,6 +152,8 @@ vllm run-batch \
--model meta-llama/Meta-Llama-3-8B-Instruct
```
</details>
## More Help
For detailed options of any subcommand, use:

View File

@ -0,0 +1,6 @@
---
title: Contact Us
---
[](){ #contactus }
--8<-- "README.md:contact-us"

View File

@ -57,19 +57,21 @@ By default, we optimize model inference using CUDA graphs which take up extra me
You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage:
```python
from vllm import LLM
from vllm.config import CompilationConfig, CompilationLevel
??? Code
llm = LLM(
model="meta-llama/Llama-3.1-8B-Instruct",
compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE,
# By default, it goes up to max_num_seqs
cudagraph_capture_sizes=[1, 2, 4, 8, 16],
),
)
```
```python
from vllm import LLM
from vllm.config import CompilationConfig, CompilationLevel
llm = LLM(
model="meta-llama/Llama-3.1-8B-Instruct",
compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE,
# By default, it goes up to max_num_seqs
cudagraph_capture_sizes=[1, 2, 4, 8, 16],
),
)
```
You can disable graph capturing completely via the `enforce_eager` flag:
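The code block this sentence refers to falls outside the captured diff context; a minimal sketch of the flag in use (the model name is just an example) would look like:

```python
from vllm import LLM

# Skip CUDA graph capture entirely, trading some decode speed for lower memory use.
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True)
```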
@ -127,18 +129,20 @@ reduce the size of the processed multi-modal inputs, which in turn saves memory.
Here are some examples:
```python
from vllm import LLM
??? Code
# Available for Qwen2-VL series models
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
mm_processor_kwargs={
"max_pixels": 768 * 768, # Default is 1280 * 28 * 28
})
```python
from vllm import LLM
# Available for InternVL series models
llm = LLM(model="OpenGVLab/InternVL2-2B",
mm_processor_kwargs={
"max_dynamic_patch": 4, # Default is 12
})
```
# Available for Qwen2-VL series models
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
mm_processor_kwargs={
"max_pixels": 768 * 768, # Default is 1280 * 28 * 28
})
# Available for InternVL series models
llm = LLM(model="OpenGVLab/InternVL2-2B",
mm_processor_kwargs={
"max_dynamic_patch": 4, # Default is 12
})
```

View File

@ -7,6 +7,8 @@ vLLM uses the following environment variables to configure the system:
All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables).
```python
--8<-- "vllm/envs.py:env-vars-definition"
```
??? Code
```python
--8<-- "vllm/envs.py:env-vars-definition"
```

View File

@ -93,25 +93,27 @@ For additional features and advanced configurations, refer to the official [MkDo
## Testing
```bash
pip install -r requirements/dev.txt
??? note "Commands"
# Linting, formatting and static type checking
pre-commit install --hook-type pre-commit --hook-type commit-msg
```bash
pip install -r requirements/dev.txt
# You can manually run pre-commit with
pre-commit run --all-files
# Linting, formatting and static type checking
pre-commit install --hook-type pre-commit --hook-type commit-msg
# To manually run something from CI that does not run
# locally by default, you can run:
pre-commit run mypy-3.9 --hook-stage manual --all-files
# You can manually run pre-commit with
pre-commit run --all-files
# Unit tests
pytest tests/
# To manually run something from CI that does not run
# locally by default, you can run:
pre-commit run mypy-3.9 --hook-stage manual --all-files
# Run tests for a single test file with detailed output
pytest -s -v tests/test_logger.py
```
# Unit tests
pytest tests/
# Run tests for a single test file with detailed output
pytest -s -v tests/test_logger.py
```
!!! tip
Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.

View File

@ -27,33 +27,35 @@ All vLLM modules within the model must include a `prefix` argument in their cons
The initialization code should look like this:
```python
from torch import nn
from vllm.config import VllmConfig
from vllm.attention import Attention
??? Code
class MyAttention(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):
super().__init__()
self.attn = Attention(prefix=f"{prefix}.attn")
```python
from torch import nn
from vllm.config import VllmConfig
from vllm.attention import Attention
class MyDecoderLayer(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):
super().__init__()
self.self_attn = MyAttention(prefix=f"{prefix}.self_attn")
class MyAttention(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):
super().__init__()
self.attn = Attention(prefix=f"{prefix}.attn")
class MyModel(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):
super().__init__()
self.layers = nn.ModuleList(
[MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
)
class MyDecoderLayer(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):
super().__init__()
self.self_attn = MyAttention(prefix=f"{prefix}.self_attn")
class MyModelForCausalLM(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
```
class MyModel(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):
super().__init__()
self.layers = nn.ModuleList(
[MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
)
class MyModelForCausalLM(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
```
### Computation Code

View File

@ -25,59 +25,63 @@ Further update the model as follows:
- Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.
```python
class YourModelForImage2Seq(nn.Module):
...
??? Code
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
```python
class YourModelForImage2Seq(nn.Module):
...
assert self.vision_encoder is not None
image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features)
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
assert self.vision_encoder is not None
image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features)
# Validate the multimodal input keyword arguments
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
# Run multimodal inputs through encoder and projector
vision_embeddings = self._process_image_input(image_input)
return vision_embeddings
```
# Validate the multimodal input keyword arguments
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
# Run multimodal inputs through encoder and projector
vision_embeddings = self._process_image_input(image_input)
return vision_embeddings
```
!!! important
The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g., image) of the request.
- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
```python
from .utils import merge_multimodal_embeddings
??? Code
class YourModelForImage2Seq(nn.Module):
...
```python
from .utils import merge_multimodal_embeddings
def get_input_embeddings(
self,
input_ids: torch.Tensor,
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
class YourModelForImage2Seq(nn.Module):
...
# `get_input_embeddings` should already be implemented for the language
# model as one of the requirements of basic vLLM model implementation.
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
def get_input_embeddings(
self,
input_ids: torch.Tensor,
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
if multimodal_embeddings is not None:
inputs_embeds = merge_multimodal_embeddings(
input_ids=input_ids,
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
placeholder_token_id=self.config.image_token_index)
# `get_input_embeddings` should already be implemented for the language
# model as one of the requirements of basic vLLM model implementation.
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
return inputs_embeds
```
if multimodal_embeddings is not None:
inputs_embeds = merge_multimodal_embeddings(
input_ids=input_ids,
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
placeholder_token_id=self.config.image_token_index)
return inputs_embeds
```
- Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.
@ -135,42 +139,46 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
Looking at the code of HF's `LlavaForConditionalGeneration`:
```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
n_image_features = image_features.shape[0] * image_features.shape[1]
??? Code
if n_image_tokens != n_image_features:
raise ValueError(
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
n_image_features = image_features.shape[0] * image_features.shape[1]
if n_image_tokens != n_image_features:
raise ValueError(
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
)
special_image_mask = (
(input_ids == self.config.image_token_index)
.unsqueeze(-1)
.expand_as(inputs_embeds)
.to(inputs_embeds.device)
)
special_image_mask = (
(input_ids == self.config.image_token_index)
.unsqueeze(-1)
.expand_as(inputs_embeds)
.to(inputs_embeds.device)
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
```
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
```
The number of placeholder feature tokens per image is `image_features.shape[1]`.
`image_features` is calculated inside the `get_image_features` method:
```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
??? Code
selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
if vision_feature_select_strategy == "default":
selected_image_feature = selected_image_feature[:, 1:]
elif vision_feature_select_strategy == "full":
selected_image_feature = selected_image_feature
else:
raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
image_features = self.multi_modal_projector(selected_image_feature)
return image_features
```
```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
if vision_feature_select_strategy == "default":
selected_image_feature = selected_image_feature[:, 1:]
elif vision_feature_select_strategy == "full":
selected_image_feature = selected_image_feature
else:
raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
image_features = self.multi_modal_projector(selected_image_feature)
return image_features
```
We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower
(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model).
@ -193,20 +201,22 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:
```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
??? Code
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
if interpolate_pos_encoding:
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
else:
embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
```
```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
if interpolate_pos_encoding:
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
else:
embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
```
We can infer that `embeddings.shape[1] == self.num_positions`, where
@ -218,55 +228,59 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
Overall, the number of placeholder feature tokens for an image can be calculated as:
??? Code
    ```python
    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        hf_config = self.get_hf_config()
        hf_processor = self.get_hf_processor()

        image_size = hf_config.vision_config.image_size
        patch_size = hf_config.vision_config.patch_size

        num_image_tokens = (image_size // patch_size) ** 2 + 1
        if hf_processor.vision_feature_select_strategy == "default":
            num_image_tokens -= 1

        return num_image_tokens
    ```
Notice that the number of image tokens doesn't depend on the image width and height.
We can simply use a dummy `image_size` to calculate the multimodal profiling data:
??? Code
    ```python
    # NOTE: In actuality, this is usually implemented as part of the
    # model's subclass of `BaseProcessingInfo`, but we show it as is
    # here for simplicity.
    def get_image_size_with_most_features(self) -> ImageSize:
        hf_config = self.get_hf_config()
        width = height = hf_config.image_size
        return ImageSize(width=width, height=height)

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)

        target_width, target_height = \
            self.info.get_image_size_with_most_features()

        return {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
                                   num_images=num_images)
        }
    ```
For the text, we simply expand the multimodal image token from the model config to match the desired number of images.
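A minimal sketch of that text-side counterpart is shown below; the hook name and the `image_token` attribute are assumptions based on the LLaVA HF processor, and the real implementation lives in the model's dummy inputs builder:
```python
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    num_images = mm_counts.get("image", 0)

    # Assumed: the HF processor exposes the placeholder string,
    # e.g. "<image>" for LLaVA.
    image_token = self.info.get_hf_processor().image_token

    return image_token * num_images
```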
@ -284,21 +298,23 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
Looking at the code of HF's `FuyuForCausalLM`:
??? Code
    ```python
    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322
    if image_patches is not None and past_key_values is None:
        patch_embeddings = [
            self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype))
            .squeeze(0)
            .to(inputs_embeds.device)
            for patch in image_patches
        ]
        inputs_embeds = self.gather_continuous_embeddings(
            word_embeddings=inputs_embeds,
            continuous_embeddings=patch_embeddings,
            image_patch_input_indices=image_patches_indices,
        )
    ```
The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`,
which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`.
@ -312,92 +328,98 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`,
returning the dimensions after resizing (but before padding) as metadata.
??? Code
    ```python
    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544
    image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
    batch_images = image_encoding["images"]
    image_unpadded_heights = image_encoding["image_unpadded_heights"]
    image_unpadded_widths = image_encoding["image_unpadded_widths"]

    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L
    if do_resize:
        batch_images = [
            [self.resize(image, size=size, input_data_format=input_data_format) for image in images]
            for images in batch_images
        ]

    image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
    image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
    image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]

    if do_pad:
        batch_images = [
            [
                self.pad_image(
                    image,
                    size=size,
                    mode=padding_mode,
                    constant_values=padding_value,
                    input_data_format=input_data_format,
                )
                for image in images
            ]
            for images in batch_images
        ]
    ```
In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata:
??? Code
    ```python
    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425
    model_image_input = self.image_processor.preprocess_with_tokenizer_info(
        image_input=tensor_batch_images,
        image_present=image_present,
        image_unpadded_h=image_unpadded_heights,
        image_unpadded_w=image_unpadded_widths,
        image_placeholder_id=image_placeholder_id,
        image_newline_id=image_newline_id,
        variable_sized=True,
    )

    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658
    image_height, image_width = image.shape[1], image.shape[2]
    if variable_sized:  # variable_sized=True
        new_h = min(
            image_height,
            math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
        )
        new_w = min(
            image_width,
            math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
        )
        image = image[:, :new_h, :new_w]
        image_height, image_width = new_h, new_w

    num_patches = self.get_num_patches(image_height=image_height, image_width=image_width)
    tensor_of_image_ids = torch.full(
        [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
    )
    patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
    assert num_patches == patches.shape[0]
    ```
The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`:
??? Code
    ```python
    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562
    patch_size = patch_size if patch_size is not None else self.patch_size
    patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]

    if image_height % patch_height != 0:
        raise ValueError(f"{image_height=} must be divisible by {patch_height}")
    if image_width % patch_width != 0:
        raise ValueError(f"{image_width=} must be divisible by {patch_width}")

    num_patches_per_dim_h = image_height // patch_height
    num_patches_per_dim_w = image_width // patch_width
    num_patches = num_patches_per_dim_h * num_patches_per_dim_w
    ```
These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized
to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.
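A minimal sketch of the corresponding helper, assuming the target dimensions live in the processor's `size` dict (for the released Fuyu-8B processor this is commonly 1920×1080 with 30×30 patches, i.e. a 64×36 grid of 2304 patches):
```python
def get_image_size_with_most_features(self) -> ImageSize:
    image_processor = self.get_image_processor()

    # Assumed layout of the HF image processor config:
    # size = {"width": 1920, "height": 1080}
    return ImageSize(width=image_processor.size["width"],
                     height=image_processor.size["height"])
```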
@ -419,23 +441,25 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
For the multimodal image profiling data, the logic is very similar to LLaVA:
??? Code
    ```python
    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        target_width, target_height = \
            self.info.get_image_size_with_most_features()
        num_images = mm_counts.get("image", 0)

        return {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
                                   num_images=num_images)
        }
    ```
## 4. Specify processing details
@ -455,6 +479,7 @@ return a schema of the tensors outputted by the HF processor that are related to
The output of `CLIPImageProcessor` is a simple tensor with shape
`(num_images, num_channels, image_height, image_width)`:
```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345
images = [
@ -505,35 +530,37 @@ return a schema of the tensors outputted by the HF processor that are related to
In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA,
we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]:
??? Code
    ```python
    def _call_hf_processor(
        self,
        prompt: str,
        mm_data: Mapping[str, object],
        mm_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        processed_outputs = super()._call_hf_processor(
            prompt=prompt,
            mm_data=mm_data,
            mm_kwargs=mm_kwargs,
        )

        image_patches = processed_outputs.get("image_patches")
        if image_patches is not None:
            images = mm_data["images"]
            assert isinstance(images, list)

            # Original output: (1, num_images, Pn, Px * Py * C)
            # New output: (num_images, Pn, Px * Py * C)
            assert (isinstance(image_patches, list)
                    and len(image_patches) == 1)
            assert (isinstance(image_patches[0], torch.Tensor)
                    and len(image_patches[0]) == len(images))

            processed_outputs["image_patches"] = image_patches[0]

        return processed_outputs
    ```
!!! note
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
@ -573,35 +600,37 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows:
??? Code
    ```python
    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptUpdate]:
        hf_config = self.info.get_hf_config()
        image_token_id = hf_config.image_token_index

        def get_replacement(item_idx: int):
            images = mm_items.get_items("image", ImageProcessorItems)

            image_size = images.get_image_size(item_idx)
            num_image_tokens = self.info.get_num_image_tokens(
                image_width=image_size.width,
                image_height=image_size.height,
            )

            return [image_token_id] * num_image_tokens

        return [
            PromptReplacement(
                modality="image",
                target=[image_token_id],
                replacement=get_replacement,
            ),
        ]
    ```
=== "Handling additional tokens: Fuyu"
@ -616,117 +645,90 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
We define a helper function to return `ncols` and `nrows` directly:
??? Code
    ```python
    def get_image_feature_grid_size(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> tuple[int, int]:
        image_processor = self.get_image_processor()
        target_width = image_processor.size["width"]
        target_height = image_processor.size["height"]
        patch_width = image_processor.patch_size["width"]
        patch_height = image_processor.patch_size["height"]

        if not (image_width <= target_width and image_height <= target_height):
            height_scale_factor = target_height / image_height
            width_scale_factor = target_width / image_width
            optimal_scale_factor = min(height_scale_factor, width_scale_factor)

            image_height = int(image_height * optimal_scale_factor)
            image_width = int(image_width * optimal_scale_factor)

        ncols = math.ceil(image_width / patch_width)
        nrows = math.ceil(image_height / patch_height)
        return ncols, nrows
    ```
Based on this, we can initially define our replacement tokens as:
??? Code
    ```python
    def get_replacement(item_idx: int):
        images = mm_items.get_items("image", ImageProcessorItems)
        image_size = images.get_image_size(item_idx)

        ncols, nrows = self.info.get_image_feature_grid_size(
            image_width=image_size.width,
            image_height=image_size.height,
        )

        # `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|`
        # `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|`
        return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
    ```
However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
a BOS token (`<s>`) is also added to the prompt:
??? Code
    ```python
    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
    model_image_input = self.image_processor.preprocess_with_tokenizer_info(
        image_input=tensor_batch_images,
        image_present=image_present,
        image_unpadded_h=image_unpadded_heights,
        image_unpadded_w=image_unpadded_widths,
        image_placeholder_id=image_placeholder_id,
        image_newline_id=image_newline_id,
        variable_sized=True,
    )
    prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
        tokenizer=self.tokenizer,
        prompts=prompts,
        scale_factors=scale_factors,
        max_tokens_to_generate=self.max_tokens_to_generate,
        max_position_embeddings=self.max_position_embeddings,
        add_BOS=True,
        add_beginning_of_answer_token=True,
    )
    ```
To assign the vision embeddings to only the image tokens, instead of a string
you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]:
??? Code
    ```python
    hf_config = self.info.get_hf_config()
    bos_token_id = hf_config.bos_token_id  # `<s>`
    assert isinstance(bos_token_id, int)

    def get_replacement_fuyu(item_idx: int):
        images = mm_items.get_items("image", ImageProcessorItems)
        image_size = images.get_image_size(item_idx)

        ncols, nrows = self.info.get_image_feature_grid_size(
            image_width=image_size.width,
            image_height=image_size.height,
        )
        image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                        [_NEWLINE_TOKEN_ID]) * nrows

        return PromptUpdateDetails.select_token_id(
            image_tokens + [bos_token_id],
            embed_token_id=_IMAGE_TOKEN_ID,
        )
    ```
@ -742,15 +744,52 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
we can search for it to conduct the replacement at the start of the string:
??? Code
    ```python
    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptUpdate]:
        hf_config = self.info.get_hf_config()
        bos_token_id = hf_config.bos_token_id
        assert isinstance(bos_token_id, int)

        tokenizer = self.info.get_tokenizer()
        eot_token_id = tokenizer.bos_token_id
        assert isinstance(eot_token_id, int)

        def get_replacement_fuyu(item_idx: int):
            images = mm_items.get_items("image", ImageProcessorItems)
            image_size = images.get_image_size(item_idx)

            ncols, nrows = self.info.get_image_feature_grid_size(
                image_width=image_size.width,
                image_height=image_size.height,
            )
            image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                            [_NEWLINE_TOKEN_ID]) * nrows

            return PromptUpdateDetails.select_token_id(
                image_tokens + [bos_token_id],
                embed_token_id=_IMAGE_TOKEN_ID,
            )

        return [
            PromptReplacement(
                modality="image",
                target=[eot_token_id],
                replacement=get_replacement_fuyu,
            )
        ]
    ```
## 5. Register processor-related classes

View File

@ -30,13 +30,21 @@ Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example
#### OpenAI Server
```bash
VLLM_TORCH_PROFILER_DIR=./vllm_profile \
python -m vllm.entrypoints.openai.api_server \
--model meta-llama/Meta-Llama-3-70B
```
benchmark_serving.py:
```bash
python benchmarks/benchmark_serving.py \
--backend vllm \
--model meta-llama/Meta-Llama-3-70B \
--dataset-name sharegpt \
--dataset-path sharegpt.json \
--profile \
--num-prompts 2
```
## Profile with NVIDIA Nsight Systems
@ -64,7 +72,16 @@ For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fo
The following is an example using the `benchmarks/benchmark_latency.py` script:
```bash
nsys profile -o report.nsys-rep \
--trace-fork-before-exec=true \
--cuda-graph-trace=node \
python benchmarks/benchmark_latency.py \
--model meta-llama/Llama-3.1-8B-Instruct \
--num-iters-warmup 5 \
--num-iters 1 \
--batch-size 16 \
--input-len 512 \
--output-len 8
```
#### OpenAI Server
@ -73,10 +90,21 @@ To profile the server, you will want to prepend your `vllm serve` command with `
```bash
# server
nsys profile -o report.nsys-rep \
--trace-fork-before-exec=true \
--cuda-graph-trace=node \
--delay 30 \
--duration 60 \
vllm serve meta-llama/Llama-3.1-8B-Instruct
# client
python benchmarks/benchmark_serving.py \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--num-prompts 1 \
--dataset-name random \
--random-input 1024 \
--random-output 512
```
In practice, you should set the `--duration` argument to a large value. Whenever you want the server to stop profiling, run:
@ -97,26 +125,26 @@ to manually kill the profiler and generate your `nsys-rep` report.
You can view these profiles either as summaries in the CLI, using `nsys stats [profile-file]`, or in the GUI by installing Nsight [locally following the directions here](https://developer.nvidia.com/nsight-systems/get-started).
??? CLI example
    ```bash
    nsys stats report1.nsys-rep
    ...
    ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

    Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
    -------- --------------- --------- ----------- ----------- -------- --------- ----------- ----------------------------------------------------------------------------------------------------
    46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of…
    14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of…
    12.1 2,692,284,876 14,280 188,535.4 83,904.0 19,328 2,862,237 497,999.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x128x64_warpgroupsize1x1x1_execute_segment_k_off…
    9.5 2,116,600,578 33,920 62,399.8 21,504.0 15,326 2,532,285 290,954.1 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x64x64_warpgroupsize1x1x1_execute_segment_k_off_…
    5.0 1,119,749,165 18,912 59,208.4 9,056.0 6,784 2,578,366 271,581.7 void vllm::act_and_mul_kernel<c10::BFloat16, &vllm::silu_kernel<c10::BFloat16>, (bool)1>(T1 *, cons…
    4.1 916,662,515 21,312 43,011.6 19,776.0 8,928 2,586,205 199,790.1 void cutlass::device_kernel<flash::enable_sm90_or_later<flash::FlashAttnFwdSm90<flash::CollectiveMa…
    2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern…
    1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in…
    0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0…
    ...
    ```
GUI example:

View File

@ -10,7 +10,7 @@ title: Using Docker
vLLM offers an official Docker image for deployment.
The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags).
```bash
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
@ -22,7 +22,7 @@ docker run --runtime nvidia --gpus all \
This image can also be used with other container engines such as [Podman](https://podman.io/).
```bash
podman run --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
@ -71,7 +71,7 @@ You can add any other [engine-args][engine-args] you need after the image tag (`
You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:
```bash
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
DOCKER_BUILDKIT=1 docker build . \
--target vllm-openai \
@ -97,26 +97,28 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits.
Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
??? Command
    ```bash
    # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
    python3 use_existing_torch.py
    DOCKER_BUILDKIT=1 docker build . \
        --file docker/Dockerfile \
        --target vllm-openai \
        --platform "linux/arm64" \
        -t vllm/vllm-gh200-openai:latest \
        --build-arg max_jobs=66 \
        --build-arg nvcc_threads=2 \
        --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \
        --build-arg vllm_fa_cmake_gpu_arches="90-real"
    ```
!!! note
If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.
Run the following command on your host machine to register QEMU user static handlers:
```bash
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
```
@ -126,7 +128,7 @@ DOCKER_BUILDKIT=1 docker build . \
To run vLLM with the custom-built Docker image:
```bash
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
-p 8000:8000 \

View File

@ -15,7 +15,7 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac
- Start the vLLM server with the supported chat completion model, e.g.
```bash
vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
```

View File

@ -11,7 +11,7 @@ title: AutoGen
- Setup [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment
```bash
pip install vllm
# Install AgentChat and OpenAI client from Extensions
@ -23,58 +23,60 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"
- Start the vLLM server with the supported chat completion model, e.g.
```bash
python -m vllm.entrypoints.openai.api_server \
--model mistralai/Mistral-7B-Instruct-v0.2
```
- Call it with AutoGen:
??? Code
    ```python
    import asyncio
    from autogen_core.models import UserMessage
    from autogen_ext.models.openai import OpenAIChatCompletionClient
    from autogen_core.models import ModelFamily


    async def main() -> None:
        # Create a model client
        model_client = OpenAIChatCompletionClient(
            model="mistralai/Mistral-7B-Instruct-v0.2",
            base_url="http://{your-vllm-host-ip}:{your-vllm-host-port}/v1",
            api_key="EMPTY",
            model_info={
                "vision": False,
                "function_calling": False,
                "json_output": False,
                "family": ModelFamily.MISTRAL,
                "structured_output": True,
            },
        )

        messages = [UserMessage(content="Write a very short story about a dragon.", source="user")]

        # Create a stream.
        stream = model_client.create_stream(messages=messages)

        # Iterate over the stream and print the responses.
        print("Streamed responses:")
        async for response in stream:
            if isinstance(response, str):
                # A partial response is a string.
                print(response, flush=True, end="")
            else:
                # The last response is a CreateResult object with the complete message.
                print("\n\n------------\n")
                print("The complete response:", flush=True)
                print(response.content, flush=True)

        # Close the client when done.
        await model_client.close()


    asyncio.run(main())
    ```
For details, see the tutorial:

View File

@ -11,14 +11,14 @@ vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebr
To install the Cerebrium client, run:
```bash
pip install cerebrium
cerebrium login
```
Next, to create your Cerebrium project, run:
```bash
cerebrium init vllm-project
```
@ -34,75 +34,81 @@ vllm = "latest"
Next, to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your `main.py`:
??? Code
    ```python
    from vllm import LLM, SamplingParams

    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")

    def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):

        sampling_params = SamplingParams(temperature=temperature, top_p=top_p)
        outputs = llm.generate(prompts, sampling_params)

        # Print the outputs.
        results = []
        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            results.append({"prompt": prompt, "generated_text": generated_text})

        return {"results": results}
    ```
Then, run the following code to deploy it to the cloud:
```bash
cerebrium deploy
```
If successful, you should be returned a curl command that you can run inference against. Just remember to end the URL with the name of the function you are calling (in our case `/run`):
??? Command
    ```bash
    curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
        -H 'Content-Type: application/json' \
        -H 'Authorization: <JWT TOKEN>' \
        --data '{
            "prompts": [
                "Hello, my name is",
                "The president of the United States is",
                "The capital of France is",
                "The future of AI is"
            ]
        }'
    ```
You should get a response like:
??? Response
    ```json
    {
        "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
        "result": {
            "result": [
                {
                    "prompt": "Hello, my name is",
                    "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of"
                },
                {
                    "prompt": "The president of the United States is",
                    "generated_text": " elected every four years. This is a democratic system.\n\n5. What"
                },
                {
                    "prompt": "The capital of France is",
                    "generated_text": " Paris.\n"
                },
                {
                    "prompt": "The future of AI is",
                    "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective."
                }
            ]
        },
        "run_time_ms": 152.53663063049316
    }
    ```
You now have an autoscaling endpoint where you only pay for the compute you use!

View File

@ -15,7 +15,7 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac
- Start the vLLM server with the supported chat completion model, e.g.
```bash
vllm serve qwen/Qwen1.5-0.5B-Chat
```

View File

@ -18,13 +18,13 @@ This guide walks you through deploying Dify using a vLLM backend.
- Start the vLLM server with the supported chat completion model, e.g.
```bash
vllm serve Qwen/Qwen1.5-7B-Chat
```
- Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)):
```bash
git clone https://github.com/langgenius/dify.git
cd dify
cd docker

View File

@ -11,14 +11,14 @@ vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/),
To install dstack client, run:
```bash
pip install "dstack[all]
dstack server
```
Next, to configure your dstack project, run:
```bash
mkdir -p vllm-dstack
cd vllm-dstack
dstack init
@ -26,75 +26,81 @@ dstack init
Next, to provision a VM instance with the LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
??? Config
    ```yaml
    type: service

    python: "3.11"
    env:
      - MODEL=NousResearch/Llama-2-7b-chat-hf
    port: 8000
    resources:
      gpu: 24GB
    commands:
      - pip install vllm
      - vllm serve $MODEL --port 8000
    model:
      format: openai
      type: chat
      name: NousResearch/Llama-2-7b-chat-hf
    ```
Then, run the following CLI for provisioning:
??? Command
    ```console
    $ dstack run . -f serve.dstack.yml

    ⠸ Getting run plan...
    Configuration       serve.dstack.yml
    Project             deep-diver-main
    User                deep-diver
    Min resources       2..xCPU, 8GB.., 1xGPU (24GB)
    Max price           -
    Max duration        -
    Spot policy         auto
    Retry policy        no

    #  BACKEND  REGION       INSTANCE       RESOURCES                               SPOT  PRICE
    1  gcp      us-central1  g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
    2  gcp      us-east1     g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
    3  gcp      us-west1     g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
    ...
    Shown 3 of 193 offers, $5.876 max

    Continue? [y/n]: y
    ⠙ Submitting run...
    ⠏ Launching spicy-treefrog-1 (pulling)
    spicy-treefrog-1 provisioning completed (running)
    Service is published at ...
    ```
After the provisioning, you can interact with the model by using the OpenAI SDK:
??? Code
    ```python
    from openai import OpenAI

    client = OpenAI(
        base_url="https://gateway.<gateway domain>",
        api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
    )

    completion = client.chat.completions.create(
        model="NousResearch/Llama-2-7b-chat-hf",
        messages=[
            {
                "role": "user",
                "content": "Compose a poem that explains the concept of recursion in programming.",
            }
        ]
    )

    print(completion.choices[0].message.content)
    ```
!!! note
dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision a dstack `Task` instead of a `Service`. The `Task` is for development purposes only. If you want more hands-on material on how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm)

View File

@ -13,7 +13,7 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac
- Setup vLLM and Haystack environment
```bash
pip install vllm haystack-ai
```
@ -21,35 +21,35 @@ pip install vllm haystack-ai
- Start the vLLM server with the supported chat completion model, e.g.
```bash
vllm serve mistralai/Mistral-7B-Instruct-v0.1
```
- Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server.
??? Code
    ```python
    from haystack.components.generators.chat import OpenAIChatGenerator
    from haystack.dataclasses import ChatMessage
    from haystack.utils import Secret

    generator = OpenAIChatGenerator(
        # for compatibility with the OpenAI API, a placeholder api_key is needed
        api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
        model="mistralai/Mistral-7B-Instruct-v0.1",
        api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
        generation_kwargs = {"max_tokens": 512}
    )

    response = generator.run(
        messages=[ChatMessage.from_user("Hi. Can you help me plan my next trip to Italy?")]
    )

    print("-"*30)
    print(response)
    print("-"*30)
    ```
Output e.g.:
```console
------------------------------

View File

@ -22,7 +22,7 @@ Before you begin, ensure that you have the following:
To install the chart with the release name `test-vllm`:
```bash
helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY
```
@ -30,7 +30,7 @@ helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f val
To uninstall the `test-vllm` deployment:
```bash
helm uninstall test-vllm --namespace=ns-vllm
```

View File

@ -18,7 +18,7 @@ And LiteLLM supports all models on VLLM.
- Setup vLLM and litellm environment
```bash
pip install vllm litellm
```
@ -28,33 +28,35 @@ pip install vllm litellm
- Start the vLLM server with the supported chat completion model, e.g.
```bash
vllm serve qwen/Qwen1.5-0.5B-Chat
```
- Call it with litellm:
??? Code
    ```python
    import litellm

    messages = [{ "content": "Hello, how are you?","role": "user"}]

    # hosted_vllm is prefix key word and necessary
    response = litellm.completion(
        model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
        messages=messages,
        api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
        temperature=0.2,
        max_tokens=80)

    print(response)
    ```
### Embeddings
- Start the vLLM server with the supported embedding model, e.g.
```bash
vllm serve BAAI/bge-base-en-v1.5
```

View File

@ -17,99 +17,101 @@ vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kuber
Deploy the following yaml file `lws.yaml`
```yaml
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
name: vllm
spec:
replicas: 2
leaderWorkerTemplate:
size: 2
restartPolicy: RecreateGroupOnPodRestart
leaderTemplate:
metadata:
labels:
role: leader
spec:
containers:
- name: vllm-leader
image: docker.io/vllm/vllm-openai:latest
env:
- name: HUGGING_FACE_HUB_TOKEN
value: <your-hf-token>
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
resources:
limits:
nvidia.com/gpu: "8"
memory: 1124Gi
ephemeral-storage: 800Gi
requests:
ephemeral-storage: 800Gi
cpu: 125
ports:
- containerPort: 8080
readinessProbe:
tcpSocket:
port: 8080
initialDelaySeconds: 15
periodSeconds: 10
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
workerTemplate:
spec:
containers:
- name: vllm-worker
image: docker.io/vllm/vllm-openai:latest
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
resources:
limits:
nvidia.com/gpu: "8"
memory: 1124Gi
ephemeral-storage: 800Gi
requests:
ephemeral-storage: 800Gi
cpu: 125
env:
- name: HUGGING_FACE_HUB_TOKEN
value: <your-hf-token>
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
---
apiVersion: v1
kind: Service
metadata:
name: vllm-leader
spec:
ports:
- name: http
port: 8080
protocol: TCP
targetPort: 8080
selector:
leaderworkerset.sigs.k8s.io/name: vllm
role: leader
type: ClusterIP
```
??? Yaml
```yaml
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
name: vllm
spec:
replicas: 2
leaderWorkerTemplate:
size: 2
restartPolicy: RecreateGroupOnPodRestart
leaderTemplate:
metadata:
labels:
role: leader
spec:
containers:
- name: vllm-leader
image: docker.io/vllm/vllm-openai:latest
env:
- name: HUGGING_FACE_HUB_TOKEN
value: <your-hf-token>
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
resources:
limits:
nvidia.com/gpu: "8"
memory: 1124Gi
ephemeral-storage: 800Gi
requests:
ephemeral-storage: 800Gi
cpu: 125
ports:
- containerPort: 8080
readinessProbe:
tcpSocket:
port: 8080
initialDelaySeconds: 15
periodSeconds: 10
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
workerTemplate:
spec:
containers:
- name: vllm-worker
image: docker.io/vllm/vllm-openai:latest
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
resources:
limits:
nvidia.com/gpu: "8"
memory: 1124Gi
ephemeral-storage: 800Gi
requests:
ephemeral-storage: 800Gi
cpu: 125
env:
- name: HUGGING_FACE_HUB_TOKEN
value: <your-hf-token>
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
---
apiVersion: v1
kind: Service
metadata:
name: vllm-leader
spec:
ports:
- name: http
port: 8080
protocol: TCP
targetPort: 8080
selector:
leaderworkerset.sigs.k8s.io/name: vllm
role: leader
type: ClusterIP
```
```bash
kubectl apply -f lws.yaml
@ -175,25 +177,27 @@ curl http://localhost:8080/v1/completions \
The output should be similar to the following
??? Output
    ```text
    {
        "id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
        "object": "text_completion",
        "created": 1715138766,
        "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
        "choices": [
            {
                "index": 0,
                "text": " top destination for foodies, with",
                "logprobs": null,
                "finish_reason": "length",
                "stop_reason": null
            }
        ],
        "usage": {
            "prompt_tokens": 5,
            "total_tokens": 12,
            "completion_tokens": 7
        }
    }
    ```

View File

@ -7,13 +7,13 @@ title: Open WebUI
2. Start the vLLM server with the supported chat completion model, e.g.
```bash
vllm serve qwen/Qwen1.5-0.5B-Chat
```
1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port):
```bash
docker run -d -p 3000:8080 \
--name open-webui \
-v open-webui:/app/backend/data \

View File

@ -15,7 +15,7 @@ Here are the integrations:
- Setup vLLM and langchain environment
```bash
pip install -U vllm \
langchain_milvus langchain_openai \
langchain_community beautifulsoup4 \
@ -26,14 +26,14 @@ pip install -U vllm \
- Start the vLLM server with the supported embedding model, e.g.
```bash
# Start embedding service (port 8000)
vllm serve ssmits/Qwen2-7B-Instruct-embed-base
```
- Start the vLLM server with the supported chat completion model, e.g.
```bash
# Start chat service (port 8001)
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
```
@ -52,7 +52,7 @@ python retrieval_augmented_generation_with_langchain.py
- Setup vLLM and llamaindex environment
```bash
pip install vllm \
llama-index llama-index-readers-web \
llama-index-llms-openai-like \
@ -64,14 +64,14 @@ pip install vllm \
- Start the vLLM server with the supported embedding model, e.g.
```bash
# Start embedding service (port 8000)
vllm serve ssmits/Qwen2-7B-Instruct-embed-base
```
- Start the vLLM server with the supported chat completion model, e.g.
```bash
# Start chat service (port 8001)
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
```

View File

@ -15,7 +15,7 @@ vLLM can be **run and scaled to multiple service replicas on clouds and Kubernet
- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)).
- Check that `sky check` shows clouds or Kubernetes are enabled.
```bash
pip install skypilot-nightly
sky check
```
@ -24,52 +24,54 @@ sky check
See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml).
??? Yaml
    ```yaml
    resources:
      accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
      use_spot: True
      disk_size: 512  # Ensure model checkpoints can fit.
      disk_tier: best
      ports: 8081  # Expose to internet traffic.

    envs:
      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
      HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.

    setup: |
      conda create -n vllm python=3.10 -y
      conda activate vllm

      pip install vllm==0.4.0.post1
      # Install Gradio for web UI.
      pip install gradio openai
      pip install flash-attn==2.5.7

    run: |
      conda activate vllm
      echo 'Starting vllm api server...'
      python -u -m vllm.entrypoints.openai.api_server \
        --port 8081 \
        --model $MODEL_NAME \
        --trust-remote-code \
        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
        2>&1 | tee api_server.log &

      echo 'Waiting for vllm api server to start...'
      while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done

      echo 'Starting gradio server...'
      git clone https://github.com/vllm-project/vllm.git || true
      python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
        -m $MODEL_NAME \
        --port 8811 \
        --model-url http://localhost:8081/v1 \
        --stop-token-ids 128009,128001
    ```
Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...):
```bash
HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN
```
@ -81,7 +83,7 @@ Check the output of the command. There will be a shareable gradio link (like the
**Optional**: Serve the 70B model instead of the default 8B and use more GPU:
```bash
HF_TOKEN="your-huggingface-token" \
sky launch serving.yaml \
--gpus A100:8 \
@ -93,72 +95,71 @@ HF_TOKEN="your-huggingface-token" \
SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file.
```yaml
service:
replicas: 2
# An actual request for readiness probe.
readiness_probe:
path: /v1/chat/completions
post_data:
model: $MODEL_NAME
messages:
- role: user
content: Hello! What is your name?
max_completion_tokens: 1
```
??? Yaml

    ```yaml
    service:
      replicas: 2
      # An actual request for readiness probe.
      readiness_probe:
        path: /v1/chat/completions
        post_data:
          model: $MODEL_NAME
          messages:
            - role: user
              content: Hello! What is your name?
          max_completion_tokens: 1

    resources:
      accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
      use_spot: True
      disk_size: 512  # Ensure model checkpoints can fit.
      disk_tier: best
      ports: 8081  # Expose to internet traffic.

    envs:
      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
      HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.

    setup: |
      conda create -n vllm python=3.10 -y
      conda activate vllm

      pip install vllm==0.4.0.post1
      # Install Gradio for web UI.
      pip install gradio openai
      pip install flash-attn==2.5.7

    run: |
      conda activate vllm
      echo 'Starting vllm api server...'
      python -u -m vllm.entrypoints.openai.api_server \
        --port 8081 \
        --model $MODEL_NAME \
        --trust-remote-code \
        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
        2>&1 | tee api_server.log
    ```
Start serving the Llama-3 8B model on multiple replicas:
```bash
HF_TOKEN="your-huggingface-token" \
sky serve up -n vllm serving.yaml \
--env HF_TOKEN
@ -166,12 +167,11 @@ HF_TOKEN="your-huggingface-token" \
Wait until the service is ready:
```bash
watch -n10 sky serve status vllm
```
Example outputs:
```console
Services
@ -184,29 +184,29 @@ vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) R
vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4
```
After the service is READY, you can find a single endpoint for the service and access it via that endpoint:
??? Commands

    ```bash
    ENDPOINT=$(sky serve status --endpoint 8081 vllm)
    curl -L http://$ENDPOINT/v1/chat/completions \
        -H "Content-Type: application/json" \
        -d '{
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
            "messages": [
                {
                    "role": "system",
                    "content": "You are a helpful assistant."
                },
                {
                    "role": "user",
                    "content": "Who are you?"
                }
            ],
            "stop_token_ids": [128009, 128001]
        }'
    ```
To enable autoscaling, you could replace the `replicas` with the following configs in `service`:
@ -220,67 +220,64 @@ service:
This will scale the service up when the QPS exceeds 2 for each replica.
??? Yaml

    ```yaml
    service:
      replica_policy:
        min_replicas: 2
        max_replicas: 4
        target_qps_per_replica: 2
      # An actual request for readiness probe.
      readiness_probe:
        path: /v1/chat/completions
        post_data:
          model: $MODEL_NAME
          messages:
            - role: user
              content: Hello! What is your name?
          max_completion_tokens: 1

    resources:
      accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
      use_spot: True
      disk_size: 512  # Ensure model checkpoints can fit.
      disk_tier: best
      ports: 8081  # Expose to internet traffic.

    envs:
      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
      HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.

    setup: |
      conda create -n vllm python=3.10 -y
      conda activate vllm

      pip install vllm==0.4.0.post1
      # Install Gradio for web UI.
      pip install gradio openai
      pip install flash-attn==2.5.7

    run: |
      conda activate vllm
      echo 'Starting vllm api server...'
      python -u -m vllm.entrypoints.openai.api_server \
        --port 8081 \
        --model $MODEL_NAME \
        --trust-remote-code \
        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
        2>&1 | tee api_server.log
    ```
To update the service with the new config:
```bash
HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN
```
To stop the service:
```bash
sky serve down vllm
```
@ -288,42 +285,39 @@ sky serve down vllm
It is also possible to access the Llama-3 service with a separate GUI frontend, so that user requests sent to the GUI are load-balanced across replicas.
??? Yaml

    ```yaml
    envs:
      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
      ENDPOINT: x.x.x.x:3031  # Address of the API server running vllm.

    resources:
      cpus: 2

    setup: |
      conda create -n vllm python=3.10 -y
      conda activate vllm

      # Install Gradio for web UI.
      pip install gradio openai

    run: |
      conda activate vllm
      export PATH=$PATH:/sbin

      echo 'Starting gradio server...'
      git clone https://github.com/vllm-project/vllm.git || true
      python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
        -m $MODEL_NAME \
        --port 8811 \
        --model-url http://$ENDPOINT/v1 \
        --stop-token-ids 128009,128001 | tee ~/gradio.log
    ```
1. Start the chat web UI:
```bash
sky launch \
-c gui ./gui.yaml \
--env ENDPOINT=$(sky serve status --endpoint vllm)


@ -15,13 +15,13 @@ It can be quickly integrated with vLLM as a backend API server, enabling powerfu
- Start the vLLM server with a supported chat completion model, e.g.
```bash
vllm serve qwen/Qwen1.5-0.5B-Chat
```
- Install streamlit and openai:
```bash
pip install streamlit openai
```
@ -29,7 +29,7 @@ pip install streamlit openai
- Start the Streamlit web UI and start chatting:
```bash
streamlit run streamlit_openai_chatbot_webserver.py
# or specify the VLLM_API_BASE or VLLM_API_KEY


@ -7,7 +7,7 @@ vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-sta
To install Llama Stack, run
```bash
pip install llama-stack -q
```


@ -60,22 +60,22 @@ And then you can send out a query to the OpenAI-compatible API to check the avai
curl -o- http://localhost:30080/models
```
Expected output:
??? Output

    ```json
    {
        "object": "list",
        "data": [
            {
                "id": "facebook/opt-125m",
                "object": "model",
                "created": 1737428424,
                "owned_by": "vllm",
                "root": null
            }
        ]
    }
    ```
To send an actual chat request, you can issue a curl request to the OpenAI `/completions` endpoint:
@ -89,23 +89,23 @@ curl -X POST http://localhost:30080/completions \
}'
```
Expected output:
??? Output

    ```json
    {
        "id": "completion-id",
        "object": "text_completion",
        "created": 1737428424,
        "model": "facebook/opt-125m",
        "choices": [
            {
                "text": " there was a brave knight who...",
                "index": 0,
                "finish_reason": "length"
            }
        ]
    }
    ```
### Uninstall
@ -121,23 +121,25 @@ sudo helm uninstall vllm
The core vLLM production stack configuration is managed with YAML. Here is the example configuration used in the installation above:
??? Yaml

    ```yaml
    servingEngineSpec:
      runtimeClassName: ""
      modelSpec:
      - name: "opt125m"
        repository: "vllm/vllm-openai"
        tag: "latest"
        modelURL: "facebook/opt-125m"

        replicaCount: 1

        requestCPU: 6
        requestMemory: "16Gi"
        requestGPU: 1

        pvcStorage: "10Gi"
    ```
In this YAML configuration:
* **`modelSpec`** includes:


@ -29,89 +29,93 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model:
??? Config

    ```bash
    cat <<EOF |kubectl apply -f -
    apiVersion: v1
    kind: PersistentVolumeClaim
    metadata:
      name: vllm-models
    spec:
      accessModes:
        - ReadWriteOnce
      volumeMode: Filesystem
      resources:
        requests:
          storage: 50Gi
    ---
    apiVersion: v1
    kind: Secret
    metadata:
      name: hf-token-secret
    type: Opaque
    data:
      token: $(HF_TOKEN)
    EOF
    ```
Next, start the vLLM server as a Kubernetes Deployment and Service:
??? Config

    ```bash
    cat <<EOF |kubectl apply -f -
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: vllm-server
    spec:
      replicas: 1
      selector:
        matchLabels:
          app.kubernetes.io/name: vllm
      template:
        metadata:
          labels:
            app.kubernetes.io/name: vllm
        spec:
          containers:
          - name: vllm
            image: vllm/vllm-openai:latest
            command: ["/bin/sh", "-c"]
            args: [
              "vllm serve meta-llama/Llama-3.2-1B-Instruct"
            ]
            env:
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: token
            ports:
            - containerPort: 8000
            volumeMounts:
            - name: llama-storage
              mountPath: /root/.cache/huggingface
          volumes:
          - name: llama-storage
            persistentVolumeClaim:
              claimName: vllm-models
    ---
    apiVersion: v1
    kind: Service
    metadata:
      name: vllm-server
    spec:
      selector:
        app.kubernetes.io/name: vllm
      ports:
      - protocol: TCP
        port: 8000
        targetPort: 8000
      type: ClusterIP
    EOF
    ```
We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
```bash
kubectl logs -l app.kubernetes.io/name=vllm
...
INFO: Started server process [1]
@ -128,6 +132,9 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
A PVC is used to store the model cache; it is optional, and you can use hostPath or other storage options instead
<details>
<summary>Yaml</summary>
```yaml
apiVersion: v1
kind: PersistentVolumeClaim
@ -144,6 +151,8 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
volumeMode: Filesystem
```
</details>
The Secret is optional and only required for accessing gated models; you can skip this step if you are not using gated models
```yaml
@ -156,13 +165,16 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
stringData:
token: "REPLACE_WITH_TOKEN"
```
Next, create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.
Here are two examples: one for NVIDIA GPUs and one for AMD GPUs.
NVIDIA GPU:
<details>
<summary>Yaml</summary>
```yaml
apiVersion: apps/v1
kind: Deployment
@ -233,10 +245,15 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
periodSeconds: 5
```
</details>
AMD GPU:
You can refer to the `deployment.yaml` below if you are using an AMD ROCm GPU such as the MI300X.
<details>
<summary>Yaml</summary>
```yaml
apiVersion: apps/v1
kind: Deployment
@ -305,12 +322,17 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
mountPath: /dev/shm
```
</details>
You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>.
2. Create a Kubernetes Service for vLLM
Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:
<details>
<summary>Yaml</summary>
```yaml
apiVersion: v1
kind: Service
@ -330,18 +352,20 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
type: ClusterIP
```
</details>
3. Deploy and Test
Apply the deployment and service configurations using `kubectl apply -f <filename>`:
```bash
kubectl apply -f deployment.yaml
kubectl apply -f service.yaml
```
To test the deployment, run the following `curl` command:
```bash
curl http://mistral-7b.default.svc.cluster.local/v1/completions \
-H "Content-Type: application/json" \
-d '{


@ -11,13 +11,13 @@ This document shows how to launch multiple vLLM serving containers and use Nginx
This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory.
```bash
export vllm_root=`pwd`
```
Create a file named `Dockerfile.nginx`:
```dockerfile
FROM nginx:latest
RUN rm /etc/nginx/conf.d/default.conf
EXPOSE 80
@ -26,7 +26,7 @@ CMD ["nginx", "-g", "daemon off;"]
Build the container:
```bash
docker build . -f Dockerfile.nginx --tag nginx-lb
```
@ -36,36 +36,38 @@ docker build . -f Dockerfile.nginx --tag nginx-lb
Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the example below we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`.
??? Config

    ```console
    upstream backend {
        least_conn;
        server vllm0:8000 max_fails=3 fail_timeout=10000s;
        server vllm1:8000 max_fails=3 fail_timeout=10000s;
    }
    server {
        listen 80;
        location / {
            proxy_pass http://backend;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
        }
    }
    ```
[](){ #nginxloadbalancer-nginx-vllm-container }
## Build vLLM Container
```bash
cd $vllm_root
docker build -f docker/Dockerfile . --tag vllm
```
If you are behind a proxy, you can pass the proxy settings to the docker build command as shown below:
```bash
cd $vllm_root
docker build \
-f docker/Dockerfile . \
@ -78,7 +80,7 @@ docker build \
## Create Docker Network
```bash
docker network create vllm_nginx
```
@ -93,30 +95,32 @@ Notes:
- The example below assumes a GPU backend is used. If you are using the CPU backend, remove `--gpus device=ID` and add the `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command.
- Adjust the model name that you want to use in your vLLM servers if you don't want to use `Llama-2-7b-chat-hf`.
??? Commands

    ```console
    mkdir -p ~/.cache/huggingface/hub/
    hf_cache_dir=~/.cache/huggingface/
    docker run \
        -itd \
        --ipc host \
        --network vllm_nginx \
        --gpus device=0 \
        --shm-size=10.24gb \
        -v $hf_cache_dir:/root/.cache/huggingface/ \
        -p 8081:8000 \
        --name vllm0 vllm \
        --model meta-llama/Llama-2-7b-chat-hf
    docker run \
        -itd \
        --ipc host \
        --network vllm_nginx \
        --gpus device=1 \
        --shm-size=10.24gb \
        -v $hf_cache_dir:/root/.cache/huggingface/ \
        -p 8082:8000 \
        --name vllm1 vllm \
        --model meta-llama/Llama-2-7b-chat-hf
    ```
!!! note
    If you are behind a proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
@ -125,7 +129,7 @@ docker run \
## Launch Nginx
```bash
docker run \
-itd \
-p 8000:80 \
@ -138,7 +142,7 @@ docker run \
## Verify That vLLM Servers Are Ready
```bash
docker logs vllm0 | grep Uvicorn
docker logs vllm1 | grep Uvicorn
```


@ -22,31 +22,33 @@ server.
Here is a sample of `LLM` class usage:
??? Code

    ```python
    from vllm import LLM, SamplingParams

    # Define a list of input prompts
    prompts = [
        "Hello, my name is",
        "The capital of France is",
        "The largest ocean is",
    ]

    # Define sampling parameters
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Initialize the LLM engine with the OPT-125M model
    llm = LLM(model="facebook/opt-125m")

    # Generate outputs for the input prompts
    outputs = llm.generate(prompts, sampling_params)

    # Print the generated outputs
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ```
More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
@ -178,32 +180,34 @@ vision-language model.
To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:
??? Code

    ```python
    class MyOldModel(nn.Module):
        def __init__(
            self,
            config,
            cache_config: Optional[CacheConfig] = None,
            quant_config: Optional[QuantizationConfig] = None,
            lora_config: Optional[LoRAConfig] = None,
            prefix: str = "",
        ) -> None:
            ...

    from vllm.config import VllmConfig
    class MyNewModel(MyOldModel):
        def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
            config = vllm_config.model_config.hf_config
            cache_config = vllm_config.cache_config
            quant_config = vllm_config.quant_config
            lora_config = vllm_config.lora_config
            super().__init__(config, cache_config, quant_config, lora_config, prefix)

    if __version__ >= "0.6.4":
        MyModel = MyNewModel
    else:
        MyModel = MyOldModel
    ```
This way, the model can work with both old and new versions of vLLM.


@ -448,27 +448,29 @@ elements of the entire head for all context tokens. However, overall,
all results for output have been calculated but are just stored in
different thread register memory.
??? Code

    ```cpp
    float* out_smem = reinterpret_cast<float*>(shared_mem);
    for (int i = NUM_WARPS; i > 1; i /= 2) {
        // Upper warps write to shared memory.
        ...
        float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
        for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
            ...
            dst[row_idx] = accs[i];
        }

        // Lower warps update the output.
        const float* src = &out_smem[warp_idx * HEAD_SIZE];
        for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
            ...
            accs[i] += src[row_idx];
        }

        // Write out the accs.
    }
    ```
## Output


@ -13,28 +13,30 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture (
vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:
??? Code

    ```python
    # inside `setup.py` file
    from setuptools import setup

    setup(name='vllm_add_dummy_model',
          version='0.1',
          packages=['vllm_add_dummy_model'],
          entry_points={
              'vllm.general_plugins':
              ["register_dummy_model = vllm_add_dummy_model:register"]
          })

    # inside `vllm_add_dummy_model.py` file
    def register():
        from vllm import ModelRegistry

        if "MyLlava" not in ModelRegistry.get_supported_archs():
            ModelRegistry.register_model(
                "MyLlava",
                "vllm_add_dummy_model.my_llava:MyLlava",
            )
    ```
For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
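For completeness, here is a minimal sketch of the consumer side of this mechanism: discovering and invoking registered plugins with the standard library. It is a generic illustration of `entry_points`, not vLLM's exact plugin-loading code; the group name `vllm.general_plugins` is simply reused from the example above.
```python
from importlib.metadata import entry_points

def load_general_plugins() -> None:
    # Find every function registered under the group used in the example above
    # and call it, letting each package register its models.
    for ep in entry_points(group="vllm.general_plugins"):
        register_fn = ep.load()  # imports vllm_add_dummy_model and returns `register`
        register_fn()

if __name__ == "__main__":
    load_general_plugins()
```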


@ -61,23 +61,25 @@ To address the above issues, I have designed and developed a local Tensor memory
# Install vLLM
??? Commands

    ```shell
    # Enter the home directory or your working directory.
    cd /home

    # Download the installation package, and I will update the commit-id in time. You can directly copy the command.
    wget https://vllm-wheels.s3.us-west-2.amazonaws.com/9112b443a042d8d815880b8780633882ad32b183/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl

    # Download the code repository.
    git clone -b xpyd-v1 https://github.com/Abatom/vllm.git
    cd vllm

    # Set the installation package path.
    export VLLM_PRECOMPILED_WHEEL_LOCATION=/home/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl

    # installation
    pip install -e . -v
    ```
# Run xPyD
@ -104,83 +106,91 @@ python3 disagg_prefill_proxy_xpyd.py &
### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)
??? Command

    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
        --host 0.0.0.0 \
        --port 20005 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
        --dtype float16 \
        --max-model-len 10000 \
        --max-num-batched-tokens 10000 \
        --max-num-seqs 256 \
        --trust-remote-code \
        --gpu-memory-utilization 0.9 \
        --disable-log-request \
        --kv-transfer-config \
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```
### Decode1 (e.g. 10.0.1.3 or 10.0.1.1)
??? Command

    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
        --host 0.0.0.0 \
        --port 20009 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
        --dtype float16 \
        --max-model-len 10000 \
        --max-num-batched-tokens 10000 \
        --max-num-seqs 256 \
        --trust-remote-code \
        --gpu-memory-utilization 0.7 \
        --disable-log-request \
        --kv-transfer-config \
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```
### Decode2 (e.g. 10.0.1.4 or 10.0.1.1)
??? Command

    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
        --host 0.0.0.0 \
        --port 20003 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
        --dtype float16 \
        --max-model-len 10000 \
        --max-num-batched-tokens 10000 \
        --max-num-seqs 256 \
        --trust-remote-code \
        --gpu-memory-utilization 0.7 \
        --disable-log-request \
        --kv-transfer-config \
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```
### Decode3 (e.g. 10.0.1.5 or 10.0.1.1)
??? Command

    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
        --host 0.0.0.0 \
        --port 20008 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
        --dtype float16 \
        --max-model-len 10000 \
        --max-num-batched-tokens 10000 \
        --max-num-seqs 256 \
        --trust-remote-code \
        --gpu-memory-utilization 0.7 \
        --disable-log-request \
        --kv-transfer-config \
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```
## Run 3P1D
@ -193,83 +203,91 @@ python3 disagg_prefill_proxy_xpyd.py &
### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)
??? Command

    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
        --host 0.0.0.0 \
        --port 20005 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
        --dtype float16 \
        --max-model-len 10000 \
        --max-num-batched-tokens 10000 \
        --max-num-seqs 256 \
        --trust-remote-code \
        --gpu-memory-utilization 0.9 \
        --disable-log-request \
        --kv-transfer-config \
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```
### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1)
??? Command

    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
        --host 0.0.0.0 \
        --port 20009 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
        --dtype float16 \
        --max-model-len 10000 \
        --max-num-batched-tokens 10000 \
        --max-num-seqs 256 \
        --trust-remote-code \
        --gpu-memory-utilization 0.9 \
        --disable-log-request \
        --kv-transfer-config \
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```
### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1)
??? Command

    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
        --host 0.0.0.0 \
        --port 20003 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
        --dtype float16 \
        --max-model-len 10000 \
        --max-num-batched-tokens 10000 \
        --max-num-seqs 256 \
        --trust-remote-code \
        --gpu-memory-utilization 0.9 \
        --disable-log-request \
        --kv-transfer-config \
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```
### Decode1 (e.g. 10.0.1.5 or 10.0.1.1)
??? Command

    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
        --host 0.0.0.0 \
        --port 20008 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
        --dtype float16 \
        --max-model-len 10000 \
        --max-num-batched-tokens 10000 \
        --max-num-seqs 256 \
        --trust-remote-code \
        --gpu-memory-utilization 0.7 \
        --disable-log-request \
        --kv-transfer-config \
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```
# Single request
@ -286,25 +304,27 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \
# Benchmark
??? Command

    ```shell
    python3 benchmark_serving.py \
        --backend vllm \
        --model base_model \
        --tokenizer meta-llama/Llama-3.1-8B-Instruct \
        --dataset-name "random" \
        --host 10.0.1.1 \
        --port 10001 \
        --random-input-len 1024 \
        --random-output-len 1024 \
        --ignore-eos \
        --burstiness 100 \
        --percentile-metrics "ttft,tpot,itl,e2el" \
        --metric-percentiles "90,95,99" \
        --seed $(date +%s) \
        --trust-remote-code \
        --request-rate 3 \
        --num-prompts 1000
    ```
# Shut down


@ -28,27 +28,29 @@ A unique aspect of vLLM's `torch.compile` integration, is that we guarantee all
In the very verbose logs, we can see:
??? Logs

    ```text
    DEBUG 03-07 03:06:52 [decorators.py:203] Start compiling function <code object forward at 0x7f08acf40c90, file "xxx/vllm/model_executor/models/llama.py", line 339>
    DEBUG 03-07 03:06:54 [backends.py:370] Traced files (to be considered for compilation cache):
    DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/_dynamo/polyfills/builtins.py
    DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/container.py
    DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/module.py
    DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/attention/layer.py
    DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/communication_op.py
    DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/parallel_state.py
    DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/custom_op.py
    DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/activation.py
    DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/layernorm.py
    DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/linear.py
    DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/rotary_embedding.py
    DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/vocab_parallel_embedding.py
    DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/models/llama.py
    DEBUG 03-07 03:07:07 [backends.py:462] Computation graph saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/computation_graph.py
    DEBUG 03-07 03:07:07 [wrapper.py:105] Dynamo transformed code saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/transformed_code.py
    ```
This is about the Python code compilation, i.e. graph capture by Dynamo. It tries to trace the function at `xxx/vllm/model_executor/models/llama.py:339`, which is the `forward` function of the model we compile. During the forward pass, other functions are also called and inlined by Dynamo, as shown by the logs, including some PyTorch functions from `xxx/torch/nn/modules/module.py` (used by PyTorch `nn.Module`, because module attribute access triggers a function call) and some communication / attention / activation functions from vLLM. All the traced files are considered when we decide which cache directory to use, so any code change in the above files will trigger a compilation cache miss and therefore a recompilation.
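To make the cache-invalidation behavior concrete, here is a minimal sketch of how a cache directory can be derived from the contents of the traced files. It is an illustration only, not vLLM's actual hashing code, which also folds in configuration and other factors.
```python
import hashlib
from pathlib import Path

def cache_dir_for(traced_files: list[str],
                  base: str = "~/.cache/vllm/torch_compile_cache") -> Path:
    """Derive a cache directory name from the contents of the traced files.

    Editing any traced file changes the digest, so the next run lands in a
    fresh directory: a cache miss followed by recompilation.
    """
    digest = hashlib.sha256()
    for path in sorted(traced_files):
        digest.update(Path(path).read_bytes())
    return Path(base).expanduser() / digest.hexdigest()[:10]

# Example (hypothetical path): the directory changes as soon as llama.py is edited.
# print(cache_dir_for(["vllm/model_executor/models/llama.py"]))
```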
@ -99,28 +101,31 @@ This time, Inductor compilation is completely bypassed, and we will load from di
The above example just uses Inductor to compile for a general shape (i.e. a symbolic shape). We can also use Inductor to compile for specific shapes, for example:
```bash
vllm serve meta-llama/Llama-3.2-1B \
    --compilation_config '{"compile_sizes": [1, 2, 4, 8]}'
```
Then it will also compile a specific kernel just for batch sizes `1, 2, 4, 8`. At this point, all of the shapes in the computation graph are static and known, and we turn on auto-tuning to tune for maximum performance. This can be slow the first time you run it, but the next time you run it, we can directly bypass the tuning and run the tuned kernel.
When all the shapes are known, `torch.compile` can compare different configs and often finds better configs to run the kernel. For example, we can see the following log:
??? Logs

    ```
    AUTOTUNE mm(8x2048, 2048x3072)
    triton_mm_4 0.0130 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
    triton_mm_8 0.0134 ms 97.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
    triton_mm_12 0.0148 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
    mm 0.0160 ms 81.6%
    triton_mm_16 0.0165 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
    triton_mm_3 0.0199 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
    triton_mm_1 0.0203 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2
    triton_mm_7 0.0203 ms 64.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
    triton_mm_2 0.0208 ms 62.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
    triton_mm_11 0.0215 ms 60.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
    SingleProcess AUTOTUNE benchmarking takes 2.0428 seconds and 7.5727 seconds precompiling
    ```
This means that, for a matrix multiplication with shape `8x2048x3072`, `torch.compile` tries Triton templates with various configs, and they are much faster than the default code (which dispatches to the cuBLAS library).
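If you want to reproduce similar autotuning output outside of vLLM, a small standalone script is enough. `mode="max-autotune"` is a standard `torch.compile` option and the shapes below mirror the log above; this assumes a CUDA-capable GPU is available.
```python
import torch

# Same shape as the log above: (8, 2048) @ (2048, 3072).
a = torch.randn(8, 2048, device="cuda", dtype=torch.float16)
b = torch.randn(2048, 3072, device="cuda", dtype=torch.float16)

@torch.compile(mode="max-autotune")  # ask Inductor to benchmark candidate kernels
def mm(x, y):
    return x @ y

# The first call triggers autotuning (slow); later calls reuse the chosen kernel.
print(mm(a, b).shape)
```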
@ -136,8 +141,9 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh
By default, vLLM will try to determine a set of sizes for which to capture cudagraphs. You can also override it using the config `cudagraph_capture_sizes`:
```bash
vllm serve meta-llama/Llama-3.2-1B \
    --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}'
```
Then it will only capture cudagraphs for the specified sizes, which is useful when you want fine-grained control over cudagraph capture.
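If you use the offline API instead of `vllm serve`, the same override can be passed through the `LLM` constructor. This assumes your vLLM version forwards `compilation_config` to the engine arguments, which recent versions do; treat it as a sketch and check your version's API.
```python
from vllm import LLM, SamplingParams

# Assumption: this vLLM version accepts `compilation_config` on the LLM
# constructor and forwards it to the engine, mirroring the CLI flag above.
llm = LLM(
    model="meta-llama/Llama-3.2-1B",
    compilation_config={"cudagraph_capture_sizes": [1, 2, 4, 8]},
)
out = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=8))
print(out[0].outputs[0].text)
```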


@ -29,24 +29,26 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa
of `LoRARequest` is a human-identifiable name, the second parameter is a globally unique ID for the adapter, and
the third parameter is the path to the LoRA adapter.
??? Code

    ```python
    sampling_params = SamplingParams(
        temperature=0,
        max_tokens=256,
        stop=["[/assistant]"]
    )

    prompts = [
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
    ]

    outputs = llm.generate(
        prompts,
        sampling_params,
        lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
    )
    ```
Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
@ -68,24 +70,26 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora
etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it):
??? Command

    ```bash
    curl localhost:8000/v1/models | jq .
    {
        "object": "list",
        "data": [
            {
                "id": "meta-llama/Llama-2-7b-hf",
                "object": "model",
                ...
            },
            {
                "id": "sql-lora",
                "object": "model",
                ...
            }
        ]
    }
    ```
Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be
processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
@ -168,36 +172,36 @@ Alternatively, follow these example steps to implement your own plugin:
1. Implement the LoRAResolver interface.
??? Example of a simple S3 LoRAResolver implementation

    ```python
    import os
    import s3fs
    from vllm.lora.request import LoRARequest
    from vllm.lora.resolver import LoRAResolver

    class S3LoRAResolver(LoRAResolver):
        def __init__(self):
            self.s3 = s3fs.S3FileSystem()
            self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
            self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")

        async def resolve_lora(self, base_model_name, lora_name):
            s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
            local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)

            # Download the LoRA from S3 to the local path
            await self.s3._get(
                s3_path, local_path, recursive=True, maxdepth=1
            )

            lora_request = LoRARequest(
                lora_name=lora_name,
                lora_path=local_path,
                lora_int_id=abs(hash(lora_name))
            )
            return lora_request
    ```
2. Register the `LoRAResolver` plugin.
@ -234,38 +238,40 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
- The `root` field points to the artifact location of the LoRA adapter.
??? Command output

    ```bash
    $ curl http://localhost:8000/v1/models

    {
        "object": "list",
        "data": [
            {
                "id": "meta-llama/Llama-2-7b-hf",
                "object": "model",
                "created": 1715644056,
                "owned_by": "vllm",
                "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
                "parent": null,
                "permission": [
                    {
                        .....
                    }
                ]
            },
            {
                "id": "sql-lora",
                "object": "model",
                "created": 1715644056,
                "owned_by": "vllm",
                "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
                "parent": meta-llama/Llama-2-7b-hf,
                "permission": [
                    {
                        ....
                    }
                ]
            }
        ]
    }
    ```


@ -20,111 +20,117 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
??? Code

    ```python
    from vllm import LLM

    llm = LLM(model="llava-hf/llava-1.5-7b-hf")

    # Refer to the HuggingFace repo for the correct format to use
    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"

    # Load the image using PIL.Image
    image = PIL.Image.open(...)

    # Single prompt inference
    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {"image": image},
    })

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)

    # Batch inference
    image_1 = PIL.Image.open(...)
    image_2 = PIL.Image.open(...)
    outputs = llm.generate(
        [
            {
                "prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
                "multi_modal_data": {"image": image_1},
            },
            {
                "prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
                "multi_modal_data": {"image": image_2},
            }
        ]
    )

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    ```
Full example: <gh-file:examples/offline_inference/vision_language.py>
To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
??? Code

    ```python
    from vllm import LLM

    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,  # Required to load Phi-3.5-vision
        max_model_len=4096,  # Otherwise, it may not fit in smaller GPUs
        limit_mm_per_prompt={"image": 2},  # The maximum number to accept
    )

    # Refer to the HuggingFace repo for the correct format to use
    prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"

    # Load the images using PIL.Image
    image1 = PIL.Image.open(...)
    image2 = PIL.Image.open(...)

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {
            "image": [image1, image2]
        },
    })

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    ```
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
??? Code

    ```python
    from vllm import LLM

    # Specify the maximum number of frames per video to be 4. This can be changed.
    llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})

    # Create the request payload.
    video_frames = ...  # load your video making sure it only has the number of frames specified earlier.
    message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
        ],
    }
    for i in range(len(video_frames)):
        base64_image = encode_image(video_frames[i])  # base64 encoding.
        new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
        message["content"].append(new_image)

    # Perform inference and log output.
    outputs = llm.chat([message])

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    ```
### Video Inputs
@ -144,68 +150,72 @@ Full example: <gh-file:examples/offline_inference/audio_language.py>
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
??? Code

    ```python
    from vllm import LLM

    # Inference with image embeddings as input
    llm = LLM(model="llava-hf/llava-1.5-7b-hf")

    # Refer to the HuggingFace repo for the correct format to use
    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"

    # Embeddings for single image
    # torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
    image_embeds = torch.load(...)

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {"image": image_embeds},
    })

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    ```
For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:
??? Code

    ```python
    # Construct the prompt based on your model
    prompt = ...

    # Embeddings for multiple images
    # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
    image_embeds = torch.load(...)

    # Qwen2-VL
    llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
    mm_data = {
        "image": {
            "image_embeds": image_embeds,
            # image_grid_thw is needed to calculate positional encoding.
            "image_grid_thw": torch.load(...),  # torch.Tensor of shape (1, 3),
        }
    }

    # MiniCPM-V
    llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
    mm_data = {
        "image": {
            "image_embeds": image_embeds,
            # image_sizes is needed to calculate details of the sliced image.
            "image_sizes": [image.size for image in images],  # list of image sizes
        }
    }

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": mm_data,
    })

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    ```
## Online Serving
@ -235,51 +245,53 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
Then, you can use the OpenAI client as follows:
??? Code

    ```python
    from openai import OpenAI

    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"

    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    # Single-image input inference
    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

    chat_response = client.chat.completions.create(
        model="microsoft/Phi-3.5-vision-instruct",
        messages=[{
            "role": "user",
            "content": [
                # NOTE: The prompt formatting with the image token `<image>` is not needed
                # since the prompt will be processed automatically by the API server.
                {"type": "text", "text": "What's in this image?"},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }],
    )
    print("Chat completion output:", chat_response.choices[0].message.content)

    # Multi-image input inference
    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"

    chat_response = client.chat.completions.create(
        model="microsoft/Phi-3.5-vision-instruct",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "What are the animals in these images?"},
                {"type": "image_url", "image_url": {"url": image_url_duck}},
                {"type": "image_url", "image_url": {"url": image_url_lion}},
            ],
        }],
    )
    print("Chat completion output:", chat_response.choices[0].message.content)
    ```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@ -295,7 +307,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
By default, the timeout for fetching images through HTTP URL is `5` seconds.
You can override this by setting the environment variable:
```bash
export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
```
@ -311,44 +323,46 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model
Then, you can use the OpenAI client as follows:
??? Code

    ```python
    from openai import OpenAI

    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"

    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"

    ## Use video url in the payload
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this video?"
                },
                {
                    "type": "video_url",
                    "video_url": {
                        "url": video_url
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from video url:", result)
    ```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@ -356,7 +370,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
By default, the timeout for fetching videos through HTTP URL is `30` seconds.
You can override this by setting the environment variable:
```bash
export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
```
@ -373,84 +387,88 @@ vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b
Then, you can use the OpenAI client as follows:
??? Code

    ```python
    import base64
    import requests
    from openai import OpenAI
    from vllm.assets.audio import AudioAsset

    def encode_base64_content_from_url(content_url: str) -> str:
        """Encode a content retrieved from a remote url to base64 format."""

        with requests.get(content_url) as response:
            response.raise_for_status()
            result = base64.b64encode(response.content).decode('utf-8')

        return result

    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"

    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    # Any format supported by librosa is supported
    audio_url = AudioAsset("winning_call").url
    audio_base64 = encode_base64_content_from_url(audio_url)

    chat_completion_from_base64 = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?"
                },
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": audio_base64,
                        "format": "wav"
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from input audio:", result)
    ```
Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input:
??? Code

    ```python
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?"
                },
                {
                    "type": "audio_url",
                    "audio_url": {
                        "url": audio_url
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from audio url:", result)
    ```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@ -458,7 +476,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
By default, the timeout for fetching audios through HTTP URL is `10` seconds.
You can override this by setting the environment variable:
```bash
export VLLM_AUDIO_FETCH_TIMEOUT=<timeout>
```
@ -470,61 +488,63 @@ pass a tensor of shape to the corresponding field of the multi-modal dictionary.
For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field.
The following example demonstrates how to pass image embeddings to the OpenAI server:
??? Code

    ```python
    image_embedding = torch.load(...)
    grid_thw = torch.load(...)  # Required by Qwen/Qwen2-VL-2B-Instruct

    buffer = io.BytesIO()
    torch.save(image_embedding, buffer)
    buffer.seek(0)
    binary_data = buffer.read()
    base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')

    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    # Basic usage - this is equivalent to the LLaVA example for offline inference
    model = "llava-hf/llava-1.5-7b-hf"
    embeds = {
        "type": "image_embeds",
        "image_embeds": f"{base64_image_embedding}"
    }

    # Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
    model = "Qwen/Qwen2-VL-2B-Instruct"
    embeds = {
        "type": "image_embeds",
        "image_embeds": {
            "image_embeds": f"{base64_image_embedding}",  # Required
            "image_grid_thw": f"{base64_image_grid_thw}"  # Required by Qwen/Qwen2-VL-2B-Instruct
        },
    }

    model = "openbmb/MiniCPM-V-2_6"
    embeds = {
        "type": "image_embeds",
        "image_embeds": {
            "image_embeds": f"{base64_image_embedding}",  # Required
            "image_sizes": f"{base64_image_sizes}"  # Required by openbmb/MiniCPM-V-2_6
        },
    }

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": [
                {
                    "type": "text",
                    "text": "What's in this image?",
                },
                embeds,
            ],
            },
        ],
        model=model,
    )
    ```
!!! note
Only one message can contain `{"type": "image_embeds"}`.

View File

@ -9,39 +9,41 @@ The main benefits are lower latency and memory usage.
You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Huggingface](https://huggingface.co/models?search=awq).
```bash
pip install autoawq
```
After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
??? Code

    ```python
    from awq import AutoAWQForCausalLM
    from transformers import AutoTokenizer

    model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
    quant_path = 'mistral-instruct-v0.2-awq'
    quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }

    # Load model
    model = AutoAWQForCausalLM.from_pretrained(
        model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Quantize
    model.quantize(tokenizer, quant_config=quant_config)

    # Save quantized model
    model.save_quantized(quant_path)
    tokenizer.save_pretrained(quant_path)

    print(f'Model is quantized and saved at "{quant_path}"')
    ```
To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
```bash
python examples/offline_inference/llm_engine_example.py \
--model TheBloke/Llama-2-7b-Chat-AWQ \
--quantization awq
@ -49,27 +51,29 @@ python examples/offline_inference/llm_engine_example.py \
AWQ models are also supported directly through the LLM entrypoint:
??? Code

    ```python
    from vllm import LLM, SamplingParams

    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Create an LLM.
    llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")

    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ```

View File

@ -12,7 +12,7 @@ vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more effic
Below are the steps to utilize BitBLAS with vLLM.
```bash
pip install bitblas>=0.1.0
```
@ -43,17 +43,19 @@ llm = LLM(
## Read gptq format checkpoint
??? Code

    ```python
    from vllm import LLM
    import torch

    # "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
    model_id = "hxbgsyxh/llama-13b-4bit-g-1"
    llm = LLM(
        model=model_id,
        dtype=torch.float16,
        trust_remote_code=True,
        quantization="bitblas",
        max_model_len=1024
    )
    ```

View File

@ -9,7 +9,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal
Below are the steps to utilize BitsAndBytes with vLLM.
```bash
pip install bitsandbytes>=0.45.3
```
@ -54,6 +54,6 @@ llm = LLM(
Append the following to your model arguments for 4bit inflight quantization:
```bash
--quantization bitsandbytes
```
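For offline inference, the same in-flight quantization can be requested through the `LLM` entrypoint. A minimal sketch (the model name is only illustrative):

```python
from vllm import LLM

# In-flight 4-bit quantization of an unquantized Hugging Face checkpoint;
# any BitsAndBytes-compatible model can be substituted here.
llm = LLM(model="huggyllama/llama-7b", quantization="bitsandbytes")

outputs = llm.generate("The capital of France is")
print(outputs[0].outputs[0].text)
```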

View File

@ -23,7 +23,7 @@ The FP8 types typically supported in hardware have two distinct representations,
To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
```bash
pip install llmcompressor
```
@ -58,28 +58,30 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r
Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.
??? Code

    ```python
    from llmcompressor.transformers import oneshot
    from llmcompressor.modifiers.quantization import QuantizationModifier

    # Configure the simple PTQ quantization
    recipe = QuantizationModifier(
        targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])

    # Apply the quantization algorithm.
    oneshot(model=model, recipe=recipe)

    # Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
    SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
    model.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)
    ```
### 3. Evaluating Accuracy
Install `vllm` and `lm-evaluation-harness` for evaluation:
```bash
pip install vllm lm-eval==0.4.4
```
@ -97,9 +99,9 @@ Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`):
!!! note
Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations.
```bash
MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic
lm_eval \
--model vllm \
--model_args pretrained=$MODEL,add_bos_token=True \
--tasks gsm8k --num_fewshot 5 --batch_size auto --limit 250

View File

@ -11,7 +11,7 @@ title: GGUF
To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command:
```bash
wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
# We recommend using the tokenizer from the base model to avoid a slow and potentially buggy tokenizer conversion.
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
@ -20,7 +20,7 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs:
```bash
# We recommend using the tokenizer from the base model to avoid a slow and potentially buggy tokenizer conversion.
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
@ -32,7 +32,7 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
GGUF assumes that Hugging Face can convert the metadata to a config file. In case Hugging Face doesn't support your model, you can manually create a config and pass it via `--hf-config-path`:
```bash
# If your model is not supported by Hugging Face, you can manually provide a Hugging Face-compatible config path
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
@ -41,42 +41,44 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
You can also use the GGUF model directly through the LLM entrypoint:
??? Code

    ```python
    from vllm import LLM, SamplingParams

    # In this script, we demonstrate how to pass input to the chat method:
    conversation = [
        {
            "role": "system",
            "content": "You are a helpful assistant"
        },
        {
            "role": "user",
            "content": "Hello"
        },
        {
            "role": "assistant",
            "content": "Hello! How can I assist you today?"
        },
        {
            "role": "user",
            "content": "Write an essay about the importance of higher education.",
        },
    ]

    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Create an LLM.
    llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
              tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.chat(conversation, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ```

View File

@ -21,7 +21,7 @@ for more details on this and other advanced features.
You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?search=gptq).
```bash
pip install -U gptqmodel --no-build-isolation -v
```
@ -31,34 +31,36 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t
Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
??? Code

    ```python
    from datasets import load_dataset
    from gptqmodel import GPTQModel, QuantizeConfig

    model_id = "meta-llama/Llama-3.2-1B-Instruct"
    quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"

    calibration_dataset = load_dataset(
        "allenai/c4",
        data_files="en/c4-train.00001-of-01024.json.gz",
        split="train"
    ).select(range(1024))["text"]

    quant_config = QuantizeConfig(bits=4, group_size=128)

    model = GPTQModel.load(model_id, quant_config)

    # increase `batch_size` to match gpu/vram specs to speed up quantization
    model.quantize(calibration_dataset, batch_size=2)

    model.save(quant_path)
    ```
## Running a quantized model with vLLM
To run a GPTQModel-quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command:
```bash
python examples/offline_inference/llm_engine_example.py \
--model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
```
@ -67,32 +69,34 @@ python examples/offline_inference/llm_engine_example.py \
GPTQModel quantized models are also supported directly through the LLM entrypoint:
??? Code

    ```python
    from vllm import LLM, SamplingParams

    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.6, top_p=0.9)

    # Create an LLM.
    llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")

    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    print("-"*50)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-"*50)
    ```

View File

@ -14,13 +14,13 @@ Please visit the HF collection of [quantized INT4 checkpoints of popular LLMs re
To use INT4 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
```bash
pip install llmcompressor
```
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash
pip install vllm lm-eval==0.4.4
```
@ -53,51 +53,55 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd
It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
??? Code

    ```python
    from datasets import load_dataset

    NUM_CALIBRATION_SAMPLES = 512
    MAX_SEQUENCE_LENGTH = 2048

    # Load and preprocess the dataset
    ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
    ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

    def preprocess(example):
        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
    ds = ds.map(tokenize, remove_columns=ds.column_names)
    ```
### 3. Applying Quantization
Now, apply the quantization algorithms:
??? Code

    ```python
    from llmcompressor.transformers import oneshot
    from llmcompressor.modifiers.quantization import GPTQModifier
    from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

    # Configure the quantization algorithms
    recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

    # Apply quantization
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )

    # Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
    SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)
    ```
This process creates a W4A16 model with weights quantized to 4-bit integers.
@ -112,8 +116,8 @@ model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
To evaluate accuracy, you can use `lm_eval`:
```bash
lm_eval --model vllm \
--model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128",add_bos_token=true \
--tasks gsm8k \
--num_fewshot 5 \
@ -137,34 +141,36 @@ $ lm_eval --model vllm \
The following is an example of an expanded quantization recipe you can tune to your own use case:
??? Code

    ```python
    from compressed_tensors.quantization import (
        QuantizationArgs,
        QuantizationScheme,
        QuantizationStrategy,
        QuantizationType,
    )
    recipe = GPTQModifier(
        targets="Linear",
        config_groups={
            "config_group": QuantizationScheme(
                targets=["Linear"],
                weights=QuantizationArgs(
                    num_bits=4,
                    type=QuantizationType.INT,
                    strategy=QuantizationStrategy.GROUP,
                    group_size=128,
                    symmetric=True,
                    dynamic=False,
                    actorder="weight",
                ),
            ),
        },
        ignore=["lm_head"],
        update_size=NUM_CALIBRATION_SAMPLES,
        dampening_frac=0.01
    )
    ```
## Troubleshooting and Support

View File

@ -15,13 +15,13 @@ Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs re
To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
```bash
pip install llmcompressor
```
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash
pip install vllm lm-eval==0.4.4
```
@ -54,54 +54,60 @@ When quantizing activations to INT8, you need sample data to estimate the activa
It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
??? Code

    ```python
    from datasets import load_dataset

    NUM_CALIBRATION_SAMPLES = 512
    MAX_SEQUENCE_LENGTH = 2048

    # Load and preprocess the dataset
    ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
    ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

    def preprocess(example):
        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
    ds = ds.map(tokenize, remove_columns=ds.column_names)
    ```
### 3. Applying Quantization
Now, apply the quantization algorithms:
??? Code

    ```python
    from llmcompressor.transformers import oneshot
    from llmcompressor.modifiers.quantization import GPTQModifier
    from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

    # Configure the quantization algorithms
    recipe = [
        SmoothQuantModifier(smoothing_strength=0.8),
        GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
    ]

    # Apply quantization
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )

    # Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
    SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)
    ```
This process creates a W8A8 model with weights and activations quantized to 8-bit integers.
@ -116,8 +122,8 @@ model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
To evaluate accuracy, you can use `lm_eval`:
```bash
lm_eval --model vllm \
--model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \
--tasks gsm8k \
--num_fewshot 5 \

View File

@ -4,7 +4,7 @@ The [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-O
We recommend installing the library with:
```bash
pip install nvidia-modelopt
```
@ -14,24 +14,26 @@ You can quantize HuggingFace models using the example scripts provided in the Te
Below is an example showing how to quantize a model using modelopt's PTQ API:
??? Code

    ```python
    import modelopt.torch.quantization as mtq
    from transformers import AutoModelForCausalLM

    # Load the model from HuggingFace
    model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")

    # Select the quantization config, for example, FP8
    config = mtq.FP8_DEFAULT_CFG

    # Define a forward loop function for calibration
    def forward_loop(model):
        for data in calib_set:
            model(data)

    # PTQ with in-place replacement of quantized modules
    model = mtq.quantize(model, config, forward_loop)
    ```
After the model is quantized, you can export it to a quantized checkpoint using the export API:
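A sketch of that export step, assuming Model Optimizer's `export_hf_checkpoint` helper in `modelopt.torch.export` (verify the exact function and signature against your installed version):

```python
import torch
from modelopt.torch.export import export_hf_checkpoint

# Export the quantized model from the PTQ step above as a HuggingFace-style
# checkpoint that vLLM can later load with quantization="modelopt".
with torch.inference_mode():
    export_hf_checkpoint(model, export_dir="exported-fp8-checkpoint")
```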
@ -48,31 +50,33 @@ with torch.inference_mode():
The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM:
??? Code

    ```python
    from vllm import LLM, SamplingParams

    def main():
        model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
        # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
        llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)

        sampling_params = SamplingParams(temperature=0.8, top_p=0.9)

        prompts = [
            "Hello, my name is",
            "The president of the United States is",
            "The capital of France is",
            "The future of AI is",
        ]

        outputs = llm.generate(prompts, sampling_params)

        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    if __name__ == "__main__":
        main()
    ```

View File

@ -35,20 +35,22 @@ Studies have shown that FP8 E4M3 quantization typically only minimally degrades
Here is an example of how to enable FP8 quantization:
??? Code

    ```python
    # To calculate kv cache scales on the fly enable the calculate_kv_scales
    # parameter

    from vllm import LLM, SamplingParams

    sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
    llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
              kv_cache_dtype="fp8",
              calculate_kv_scales=True)
    prompt = "London is the capital of"
    out = llm.generate(prompt, sampling_params)[0].outputs[0].text
    print(out)
    ```
The `kv_cache_dtype` argument specifies the data type for KV cache storage:
- `"auto"`: Uses the model's default "unquantized" data type
@ -63,7 +65,7 @@ For optimal model quality when using FP8 KV Cache, we recommend using calibrated
First, install the required dependencies:
```bash
pip install llmcompressor
```
@ -71,67 +73,69 @@ pip install llmcompressor
Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern):
??? Code

    ```python
    from datasets import load_dataset
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from llmcompressor.transformers import oneshot

    # Select model and load it
    MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # Select calibration dataset
    DATASET_ID = "HuggingFaceH4/ultrachat_200k"
    DATASET_SPLIT = "train_sft"

    # Configure calibration parameters
    NUM_CALIBRATION_SAMPLES = 512  # 512 samples is a good starting point
    MAX_SEQUENCE_LENGTH = 2048

    # Load and preprocess dataset
    ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
    ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

    def process_and_tokenize(example):
        text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
        return tokenizer(
            text,
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)

    # Configure quantization settings
    recipe = """
    quant_stage:
        quant_modifiers:
            QuantizationModifier:
                kv_cache_scheme:
                    num_bits: 8
                    type: float
                    strategy: tensor
                    dynamic: false
                    symmetric: true
    """

    # Apply quantization
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )

    # Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
    SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)
    ```
The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales.
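To use the calibrated scales at inference time, load the saved checkpoint with the FP8 KV cache enabled; a minimal sketch mirroring the on-the-fly example earlier in this file:

```python
from vllm import LLM, SamplingParams

sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
# Load the calibrated checkpoint produced by the script above.
llm = LLM(model="./Llama-3.1-8B-Instruct-FP8-KV", kv_cache_dtype="fp8")
print(llm.generate("London is the capital of", sampling_params)[0].outputs[0].text)
```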

View File

@ -13,7 +13,7 @@ AWQ, GPTQ, Rotation and SmoothQuant.
Before quantizing models, you need to install Quark. The latest release of Quark can be installed with pip:
```bash
pip install amd-quark
```
@ -22,13 +22,13 @@ for more installation details.
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash
pip install vllm lm-eval==0.4.4
```
## Quantization Process
After installing Quark, we will use an example to illustrate how to use Quark.
The Quark quantization process consists of the following five steps:
1. Load the model
@ -42,20 +42,22 @@ The Quark quantization process can be listed for 5 steps as below:
Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index)
to fetch model and tokenizer.
??? Code

    ```python
    from transformers import AutoTokenizer, AutoModelForCausalLM

    MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
    MAX_SEQ_LEN = 512

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, device_map="auto", torch_dtype="auto",
    )
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
    tokenizer.pad_token = tokenizer.eos_token
    ```
### 2. Prepare the Calibration Dataloader
@ -63,22 +65,24 @@ Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basic
to load calibration data. For more details about how to use calibration datasets efficiently, please refer
to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html).
??? Code

    ```python
    from datasets import load_dataset
    from torch.utils.data import DataLoader

    BATCH_SIZE = 1
    NUM_CALIBRATION_DATA = 512

    # Load the dataset and get calibration data.
    dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
    text_data = dataset["text"][:NUM_CALIBRATION_DATA]

    tokenized_outputs = tokenizer(text_data, return_tensors="pt",
                                  padding=True, truncation=True, max_length=MAX_SEQ_LEN)
    calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
                                  batch_size=BATCH_SIZE, drop_last=True)
    ```
### 3. Set the Quantization Configuration
@ -94,42 +98,44 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
AutoSmoothQuant config file for Llama is
`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.
??? Code

    ```python
    from quark.torch.quantization import (Config, QuantizationConfig,
                                          FP8E4M3PerTensorSpec,
                                          load_quant_algo_config_from_file)

    # Define fp8/per-tensor/static spec.
    FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
                                               is_dynamic=False).to_quantization_spec()

    # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
    global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
                                             weight=FP8_PER_TENSOR_SPEC)

    # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
    KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
    kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
    kv_cache_quant_config = {name:
        QuantizationConfig(input_tensors=global_quant_config.input_tensors,
                           weight=global_quant_config.weight,
                           output_tensors=KV_CACHE_SPEC)
        for name in kv_cache_layer_names_for_llama}
    layer_quant_config = kv_cache_quant_config.copy()

    # Define algorithm config by config file.
    LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = (
        'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json')
    algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)

    EXCLUDE_LAYERS = ["lm_head"]
    quant_config = Config(
        global_quant_config=global_quant_config,
        layer_quant_config=layer_quant_config,
        kv_cache_quant_config=kv_cache_quant_config,
        exclude=EXCLUDE_LAYERS,
        algo_config=algo_config)
    ```
### 4. Quantize the Model and Export
@ -139,68 +145,72 @@ HuggingFace `safetensors`, you can refer to
[HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html)
for more exporting format details.
??? Code

    ```python
    import torch
    from quark.torch import ModelQuantizer, ModelExporter
    from quark.torch.export import ExporterConfig, JsonExporterConfig

    # Apply quantization.
    quantizer = ModelQuantizer(quant_config)
    quant_model = quantizer.quantize_model(model, calib_dataloader)

    # Freeze quantized model to export.
    freezed_model = quantizer.freeze(model)

    # Define export config.
    LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
    export_config = ExporterConfig(json_export_config=JsonExporterConfig())
    export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP

    # Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
    EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
    exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
    with torch.no_grad():
        exporter.export_safetensors_model(freezed_model,
            quant_config=quant_config, tokenizer=tokenizer)
    ```
### 5. Evaluation in vLLM
Now, you can load and run the Quark quantized model directly through the LLM entrypoint:
??? Code

    ```python
    from vllm import LLM, SamplingParams

    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Create an LLM.
    llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
              kv_cache_dtype='fp8', quantization='quark')

    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    print("\nGenerated Outputs:\n" + "-" * 60)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}")
        print(f"Output: {generated_text!r}")
        print("-" * 60)
    ```
Or, you can use `lm_eval` to evaluate accuracy:
```bash
lm_eval --model vllm \
  --model_args pretrained=Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant,kv_cache_dtype='fp8',quantization='quark' \
  --tasks gsm8k
```
@ -212,7 +222,7 @@ to quantize large language models more conveniently. It supports quantizing mode
of different quantization schemes and optimization algorithms. It can export the quantized model
and run evaluation tasks on the fly. With the script, the example above can be:
```bash
python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \
--output_dir /path/to/output \
--quant_scheme w_fp8_a_fp8 \

View File

@ -4,7 +4,7 @@ TorchAO is an architecture optimization library for PyTorch, it provides high pe
We recommend installing the latest torchao nightly with
```bash
# Install the latest TorchAO nightly build
# Choose the CUDA version that matches your system (cu126, cu128, etc.)
pip install \
@ -15,26 +15,28 @@ pip install \
## Quantizing HuggingFace Models
You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code:
??? Code

    ```Python
    import torch
    from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
    from torchao.quantization import Int8WeightOnlyConfig

    model_name = "meta-llama/Meta-Llama-3-8B"
    quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
    quantized_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto",
        quantization_config=quantization_config
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    input_text = "What are we having for dinner?"
    input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

    hub_repo = # YOUR HUB REPO ID
    tokenizer.push_to_hub(hub_repo)
    quantized_model.push_to_hub(hub_repo, safe_serialization=False)
    ```
Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI.

View File

@ -33,34 +33,36 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
Next, make a request to the model that should return the reasoning content in the response.
??? Code

    ```python
    from openai import OpenAI

    # Modify OpenAI's API key and API base to use vLLM's API server.
    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"

    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    # Round 1
    messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
    # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
    # For Qwen3 series, if you want to disable thinking in reasoning mode, add:
    # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
    response = client.chat.completions.create(model=model, messages=messages)

    reasoning_content = response.choices[0].message.reasoning_content
    content = response.choices[0].message.content

    print("reasoning_content:", reasoning_content)
    print("content:", content)
    ```
The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
@ -68,77 +70,81 @@ The `reasoning_content` field contains the reasoning steps that led to the final
Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
```json
{
"id": "chatcmpl-123",
"object": "chat.completion.chunk",
"created": 1694268190,
"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"system_fingerprint": "fp_44709d6fcb",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"reasoning_content": "is",
},
"logprobs": null,
"finish_reason": null
}
]
}
```
??? Json
```json
{
"id": "chatcmpl-123",
"object": "chat.completion.chunk",
"created": 1694268190,
"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"system_fingerprint": "fp_44709d6fcb",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"reasoning_content": "is",
},
"logprobs": null,
"finish_reason": null
}
]
}
```
The OpenAI Python client library does not officially support the `reasoning_content` attribute for streaming output, but the client does allow extra attributes in the response. You can use `hasattr` to check whether the `reasoning_content` attribute is present in the response. For example:
??? Code

    ```python
    from openai import OpenAI

    # Modify OpenAI's API key and API base to use vLLM's API server.
    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"

    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
    # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
    # For Qwen3 series, if you want to disable thinking in reasoning mode, add:
    # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
    stream = client.chat.completions.create(model=model,
                                            messages=messages,
                                            stream=True)

    print("client: Start streaming chat completions...")
    printed_reasoning_content = False
    printed_content = False

    for chunk in stream:
        reasoning_content = None
        content = None
        # Check the content is reasoning_content or content
        if hasattr(chunk.choices[0].delta, "reasoning_content"):
            reasoning_content = chunk.choices[0].delta.reasoning_content
        elif hasattr(chunk.choices[0].delta, "content"):
            content = chunk.choices[0].delta.content

        if reasoning_content is not None:
            if not printed_reasoning_content:
                printed_reasoning_content = True
                print("reasoning_content:", end="", flush=True)
            print(reasoning_content, end="", flush=True)
        elif content is not None:
            if not printed_content:
                printed_content = True
                print("\ncontent:", end="", flush=True)
            # Extract and print the content
            print(content, end="", flush=True)
    ```
Remember to check whether `reasoning_content` exists in the response before accessing it. You can check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
@ -146,41 +152,43 @@ Remember to check whether the `reasoning_content` exists in the response before
The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
??? Code

    ```python
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
                },
                "required": ["location", "unit"]
            }
        }
    }]

    response = client.chat.completions.create(
        model=client.models.list().data[0].id,
        messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
        tools=tools,
        tool_choice="auto"
    )

    print(response)
    tool_call = response.choices[0].message.tool_calls[0].function

    print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
    print(f"Function called: {tool_call.name}")
    print(f"Arguments: {tool_call.arguments}")
    ```
For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>.
@ -192,85 +200,89 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_
You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
??? Code

    ```python
    # import the required packages

    from vllm.reasoning import ReasoningParser, ReasoningParserManager
    from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                                  DeltaMessage)

    # define a reasoning parser and register it to vllm
    # the name list in register_module can be used
    # in --reasoning-parser.
    @ReasoningParserManager.register_module(["example"])
    class ExampleParser(ReasoningParser):
        def __init__(self, tokenizer: AnyTokenizer):
            super().__init__(tokenizer)

        def extract_reasoning_content_streaming(
            self,
            previous_text: str,
            current_text: str,
            delta_text: str,
            previous_token_ids: Sequence[int],
            current_token_ids: Sequence[int],
            delta_token_ids: Sequence[int],
        ) -> Union[DeltaMessage, None]:
            """
            Instance method that should be implemented for extracting reasoning
            from an incomplete response; for use when handling reasoning calls and
            streaming. Has to be an instance method because it requires state -
            the current tokens/diffs, but also the information about what has
            previously been parsed and extracted (see constructor)
            """

        def extract_reasoning_content(
            self, model_output: str, request: ChatCompletionRequest
        ) -> tuple[Optional[str], Optional[str]]:
            """
            Extract reasoning content from a complete model-generated string.

            Used for non-streaming responses where we have the entire model response
            available before sending to the client.

            Parameters:
            model_output: str
                The model-generated string to extract reasoning content from.

            request: ChatCompletionRequest
                The request object that was used to generate the model_output.

            Returns:
            tuple[Optional[str], Optional[str]]
                A tuple containing the reasoning content and the content.
            """
    ```
Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
??? Code

    ```python
    @dataclass
    class DeepSeekReasoner(Reasoner):
        """
        Reasoner for DeepSeek R series models.
        """
        start_token_id: int
        end_token_id: int

        start_token: str = "<think>"
        end_token: str = "</think>"

        @classmethod
        def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
            return cls(start_token_id=tokenizer.encode(
                "<think>", add_special_tokens=False)[0],
                end_token_id=tokenizer.encode("</think>",
                add_special_tokens=False)[0])

        def is_reasoning_end(self, input_ids: list[int]) -> bool:
            return self.end_token_id in input_ids
        ...
    ```
A structured output engine like [xgrammar](https://github.com/mlc-ai/xgrammar) will use `end_token_id` to check whether reasoning content is still present in the model output and skip structured output enforcement while that is the case.
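As a minimal, hypothetical sketch (the helper name and token ids below are illustrative only, not part of the vLLM or xgrammar API), this is the kind of check such an engine can perform before turning grammar constraints on:

```python
# Hypothetical helper: grammar constraints only apply once the reasoning
# end token has been generated.
def reasoning_finished(end_token_id: int, generated_token_ids: list[int]) -> bool:
    return end_token_id in generated_token_ids

# Example with made-up token ids: 128014 stands in for the id of "</think>".
print(reasoning_finished(128014, [101, 202, 128014, 303]))  # True -> enforce the schema
print(reasoning_finished(128014, [101, 202]))               # False -> still reasoning
```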

View File

@ -18,29 +18,31 @@ Speculative decoding is a technique which improves inter-token latency in memory
The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.
??? Code

    ```python
    from vllm import LLM, SamplingParams

    prompts = [
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    llm = LLM(
        model="facebook/opt-6.7b",
        tensor_parallel_size=1,
        speculative_config={
            "model": "facebook/opt-125m",
            "num_speculative_tokens": 5,
        },
    )
    outputs = llm.generate(prompts, sampling_params)

    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ```
To do the same in online mode, launch the server:
@ -60,69 +62,73 @@ python -m vllm.entrypoints.openai.api_server \
Then use a client:
??? Code

    ```python
    from openai import OpenAI

    # Modify OpenAI's API key and API base to use vLLM's API server.
    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"

    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    # Completion API
    stream = False
    completion = client.completions.create(
        model=model,
        prompt="The future of AI is",
        echo=False,
        n=1,
        stream=stream,
    )

    print("Completion results:")
    if stream:
        for c in completion:
            print(c)
    else:
        print(completion)
    ```
## Speculating by matching n-grams in the prompt
The following code configures vLLM to use speculative decoding where proposals are generated by
matching n-grams in the prompt. For more information read [this thread.](https://x.com/joao_gante/status/1747322413006643259)
??? Code

    ```python
    from vllm import LLM, SamplingParams

    prompts = [
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    llm = LLM(
        model="facebook/opt-6.7b",
        tensor_parallel_size=1,
        speculative_config={
            "method": "ngram",
            "num_speculative_tokens": 5,
            "prompt_lookup_max": 4,
        },
    )
    outputs = llm.generate(prompts, sampling_params)

    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ```
## Speculating using MLP speculators
@ -131,29 +137,31 @@ draft models that conditioning draft predictions on both context vectors and sam
For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or
[this technical report](https://arxiv.org/abs/2404.19124).
??? Code

    ```python
    from vllm import LLM, SamplingParams

    prompts = [
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    llm = LLM(
        model="meta-llama/Meta-Llama-3.1-70B-Instruct",
        tensor_parallel_size=4,
        speculative_config={
            "model": "ibm-ai-platform/llama3-70b-accelerator",
            "draft_tensor_parallel_size": 1,
        },
    )
    outputs = llm.generate(prompts, sampling_params)

    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ```
Note that these speculative models currently need to be run without tensor parallelism, although
it is possible to run the main model using tensor parallelism (see example above). Since the
@ -177,31 +185,33 @@ A variety of speculative models of this type are available on HF hub:
The following code configures vLLM to use speculative decoding where proposals are generated by
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py).
??? Code

    ```python
    from vllm import LLM, SamplingParams

    prompts = [
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    llm = LLM(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        tensor_parallel_size=4,
        speculative_config={
            "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
            "draft_tensor_parallel_size": 1,
        },
    )

    outputs = llm.generate(prompts, sampling_params)

    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ```
A few important things to consider when using the EAGLE based draft models:

View File

@ -33,39 +33,43 @@ text.
Now let's see an example for each of the cases, starting with `guided_choice`, as it's the easiest one:
??? Code

    ```python
    from openai import OpenAI

    client = OpenAI(
        base_url="http://localhost:8000/v1",
        api_key="-",
    )

    model = client.models.list().data[0].id

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
        ],
        extra_body={"guided_choice": ["positive", "negative"]},
    )
    print(completion.choices[0].message.content)
    ```
The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
??? Code

    ```python
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
            }
        ],
        extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
    )
    print(completion.choices[0].message.content)
    ```
One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
For this we can use the `guided_json` parameter in two different ways:
@ -75,41 +79,43 @@ For this we can use the `guided_json` parameter in two different ways:
The next example shows how to use the `guided_json` parameter with a Pydantic model:
??? Code

    ```python
    from pydantic import BaseModel
    from enum import Enum

    class CarType(str, Enum):
        sedan = "sedan"
        suv = "SUV"
        truck = "Truck"
        coupe = "Coupe"

    class CarDescription(BaseModel):
        brand: str
        model: str
        car_type: CarType

    json_schema = CarDescription.model_json_schema()

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
            }
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "car-description",
                "schema": CarDescription.model_json_schema()
            },
        },
    )
    print(completion.choices[0].message.content)
    ```
!!! tip
    While not strictly necessary, normally it's better to indicate in the prompt the
@ -121,33 +127,35 @@ difficult to use, but it´s really powerful. It allows us to define complete
languages like SQL queries. It works by using a context-free EBNF grammar.
As an example, we can use it to define a specific format of simplified SQL queries:
??? Code

    ```python
    simplified_sql_grammar = """
        root ::= select_statement

        select_statement ::= "SELECT " column " from " table " where " condition

        column ::= "col_1 " | "col_2 "

        table ::= "table_1 " | "table_2 "

        condition ::= column "= " number

        number ::= "1 " | "2 "
    """

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
            }
        ],
        extra_body={"guided_grammar": simplified_sql_grammar},
    )
    print(completion.choices[0].message.content)
    ```
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
@ -161,34 +169,36 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r
Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema:
??? Code

    ```python
    from pydantic import BaseModel

    class People(BaseModel):
        name: str
        age: int

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": "Generate a JSON with the name and age of one random person.",
            }
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "people",
                "schema": People.model_json_schema()
            }
        },
    )
    print("reasoning_content: ", completion.choices[0].message.reasoning_content)
    print("content: ", completion.choices[0].message.content)
    ```
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
@ -202,33 +212,33 @@ For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3.
Here is a simple example demonstrating how to get structured output using Pydantic models:
??? Code

    ```python
    from pydantic import BaseModel
    from openai import OpenAI

    class Info(BaseModel):
        name: str
        age: int

    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
    model = client.models.list().data[0].id
    completion = client.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
        ],
        response_format=Info,
    )

    message = completion.choices[0].message
    print(message)
    assert message.parsed
    print("Name:", message.parsed.name)
    print("Age:", message.parsed.age)
    ```

Output:
```console
ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28))
@ -238,35 +248,37 @@ Age: 28
Here is a more complex example using nested Pydantic models to handle a step-by-step math solution:
??? Code

    ```python
    from typing import List
    from pydantic import BaseModel
    from openai import OpenAI

    class Step(BaseModel):
        explanation: str
        output: str

    class MathResponse(BaseModel):
        steps: list[Step]
        final_answer: str

    completion = client.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful expert math tutor."},
            {"role": "user", "content": "Solve 8x + 31 = 2."},
        ],
        response_format=MathResponse,
    )

    message = completion.choices[0].message
    print(message)
    assert message.parsed
    for i, step in enumerate(message.parsed.steps):
        print(f"Step #{i}:", step)
    print("Answer:", message.parsed.final_answer)
    ```
Output:
@ -296,19 +308,21 @@ These parameters can be used in the same way as the parameters from the Online
Serving examples above. One example for the usage of the `choice` parameter is
shown below:
??? Code

    ```python
    from vllm import LLM, SamplingParams
    from vllm.sampling_params import GuidedDecodingParams

    llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")

    guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
    sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
    outputs = llm.generate(
        prompts="Classify this sentiment: vLLM is wonderful!",
        sampling_params=sampling_params,
    )
    print(outputs[0].outputs[0].text)
    ```
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)

View File

@ -15,44 +15,46 @@ vllm serve meta-llama/Llama-3.1-8B-Instruct \
Next, make a request to the model that should result in it using the available tools:
??? Code

    ```python
    from openai import OpenAI
    import json

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

    def get_weather(location: str, unit: str):
        return f"Getting the weather for {location} in {unit}..."
    tool_functions = {"get_weather": get_weather}

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
                },
                "required": ["location", "unit"]
            }
        }
    }]

    response = client.chat.completions.create(
        model=client.models.list().data[0].id,
        messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
        tools=tools,
        tool_choice="auto"
    )

    tool_call = response.choices[0].message.tool_calls[0].function
    print(f"Function called: {tool_call.name}")
    print(f"Arguments: {tool_call.arguments}")
    print(f"Result: {get_weather(**json.loads(tool_call.arguments))}")
    ```
Example output:
@ -226,6 +228,25 @@ AI21's Jamba-1.5 models are supported.
Flags: `--tool-call-parser jamba`
### xLAM Models (`xlam`)
The xLAM tool parser is designed to support models that generate tool calls in various JSON formats. It detects function calls in several different output styles:
1. Direct JSON arrays: Output strings that are JSON arrays starting with `[` and ending with `]`
2. Thinking tags: Using `<think>...</think>` tags containing JSON arrays
3. Code blocks: JSON in code blocks (```json ...```)
4. Tool calls tags: Using `[TOOL_CALLS]` or `<tool_call>...</tool_call>` tags
Parallel function calls are supported, and the parser can effectively separate text content from tool calls.
Supported models:
* Salesforce Llama-xLAM models: `Salesforce/Llama-xLAM-2-8B-fc-r`, `Salesforce/Llama-xLAM-2-70B-fc-r`
* Qwen-xLAM models: `Salesforce/xLAM-1B-fc-r`, `Salesforce/xLAM-3B-fc-r`, `Salesforce/Qwen-xLAM-32B-fc-r`
Flags (an example launch command is sketched after this list):
* For Llama-based xLAM models: `--tool-call-parser xlam --chat-template examples/tool_chat_template_xlam_llama.jinja`
* For Qwen-based xLAM models: `--tool-call-parser xlam --chat-template examples/tool_chat_template_xlam_qwen.jinja`
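Putting the flags together, a launch command for one of the Llama-based xLAM models might look like the following sketch (other server options are up to you):

```bash
vllm serve Salesforce/Llama-xLAM-2-8B-fc-r \
    --enable-auto-tool-choice \
    --tool-call-parser xlam \
    --chat-template examples/tool_chat_template_xlam_llama.jinja
```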
### Qwen Models
For Qwen2.5, the chat template in `tokenizer_config.json` already includes support for Hermes-style tool use, so you can use the `hermes` parser to enable tool calls for Qwen models. For more detailed information, please refer to the official [Qwen documentation](https://qwen.readthedocs.io/en/latest/framework/function_call.html#vllm).
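As a minimal sketch (the model name is only an illustration), serving a Qwen2.5 model with Hermes-style tool calling enabled could look like:

```bash
vllm serve Qwen/Qwen2.5-7B-Instruct \
    --enable-auto-tool-choice \
    --tool-call-parser hermes
```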
@ -282,53 +303,55 @@ A tool parser plugin is a Python file containing one or more ToolParser implemen
Here is a summary of a plugin file:
??? Code

    ```python
    # import the required packages

    # define a tool parser and register it to vllm
    # the name list in register_module can be used
    # in --tool-call-parser. you can define as many
    # tool parsers as you want here.
    @ToolParserManager.register_module(["example"])
    class ExampleToolParser(ToolParser):
        def __init__(self, tokenizer: AnyTokenizer):
            super().__init__(tokenizer)

        # adjust request. e.g.: set skip special tokens
        # to False for tool call output.
        def adjust_request(
                self, request: ChatCompletionRequest) -> ChatCompletionRequest:
            return request

        # implement the tool call parse for stream call
        def extract_tool_calls_streaming(
            self,
            previous_text: str,
            current_text: str,
            delta_text: str,
            previous_token_ids: Sequence[int],
            current_token_ids: Sequence[int],
            delta_token_ids: Sequence[int],
            request: ChatCompletionRequest,
        ) -> Union[DeltaMessage, None]:
            return delta

        # implement the tool parse for non-stream call
        def extract_tool_calls(
            self,
            model_output: str,
            request: ChatCompletionRequest,
        ) -> ExtractedToolCallInformation:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=text)
    ```
Then you can use this plugin in the command line like this.
```bash
--enable-auto-tool-choice \
--tool-parser-plugin <absolute path of the plugin file> \
--tool-call-parser example \

View File

@ -26,7 +26,7 @@ The easiest way to launch a Trainium or Inferentia instance with pre-installed N
- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance
- Once inside your instance, activate the pre-installed virtual environment for inference by running
```bash
source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
```
@ -47,7 +47,7 @@ Currently, there are no pre-built Neuron wheels.
To build and install vLLM from source, run:
```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
pip install -U -r requirements/neuron.txt
@ -66,7 +66,7 @@ Refer to [vLLM User Guide for NxD Inference](https://awsdocs-neuron.readthedocs-
To install the AWS Neuron fork, run the following:
```bash
git clone -b neuron-2.23-vllm-v0.7.2 https://github.com/aws-neuron/upstreaming-to-vllm.git
cd upstreaming-to-vllm
pip install -r requirements/neuron.txt
@ -100,7 +100,7 @@ to perform most of the heavy lifting which includes PyTorch model initialization
To configure NxD Inference features through the vLLM entrypoint, use the `override_neuron_config` setting. Provide the configs you want to override
as a dictionary (or JSON object when starting vLLM from the CLI). For example, to disable auto bucketing, include
```python
override_neuron_config={
"enable_bucketing":False,
}
@ -108,7 +108,7 @@ override_neuron_config={
or when launching vLLM from the CLI, pass
```bash
--override-neuron-config "{\"enable_bucketing\":false}"
```
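The same override can also be passed through the Python API. The following is a minimal sketch, assuming a Neuron-supported model (the model name is only an example) and that the keyword is forwarded to the engine just like the CLI flag:

```python
from vllm import LLM

# Disable auto bucketing when constructing the engine directly in Python.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    override_neuron_config={
        "enable_bucketing": False,
    },
)
```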

View File

@ -76,21 +76,25 @@ Currently, there are no pre-built CPU wheels.
### Build image from source
??? Commands

    ```bash
    docker build -f docker/Dockerfile.cpu \
        --tag vllm-cpu-env \
        --target vllm-openai .

    # Launching OpenAI server
    docker run --rm \
        --privileged=true \
        --shm-size=4g \
        -p 8000:8000 \
        -e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
        -e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
        vllm-cpu-env \
        --model=meta-llama/Llama-3.2-1B-Instruct \
        --dtype=bfloat16 \
        other vLLM OpenAI server arguments
    ```
!!! tip
For ARM or Apple silicon, use `docker/Dockerfile.arm`
@ -119,7 +123,7 @@ vLLM CPU backend supports the following vLLM features:
- We highly recommend using TCMalloc for high-performance memory allocation and better cache locality. For example, on Ubuntu 22.04, you can run:
```bash
sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
find / -name "*libtcmalloc*" # find the dynamic link library path
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
@ -128,7 +132,7 @@ python examples/offline_inference/basic/basic.py # run vLLM
- When using online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserve CPUs 30 and 31 for the framework and use CPUs 0-29 for OpenMP:
```bash
export VLLM_CPU_KVCACHE_SPACE=40
export VLLM_CPU_OMP_THREADS_BIND=0-29
vllm serve facebook/opt-125m
@ -136,7 +140,7 @@ vllm serve facebook/opt-125m
or using default auto thread binding:
```bash
export VLLM_CPU_KVCACHE_SPACE=40
export VLLM_CPU_NUM_OF_RESERVED_CPU=2
vllm serve facebook/opt-125m
@ -144,32 +148,34 @@ vllm serve facebook/opt-125m
- If using the vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread to each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`, or to rely on the default auto thread binding. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
??? Commands

    ```console
    $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores

    # The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core.
    CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE    MAXMHZ   MINMHZ      MHZ
    0    0      0    0 0:0:0:0          yes 2401.0000 800.0000  800.000
    1    0      0    1 1:1:1:0          yes 2401.0000 800.0000  800.000
    2    0      0    2 2:2:2:0          yes 2401.0000 800.0000  800.000
    3    0      0    3 3:3:3:0          yes 2401.0000 800.0000  800.000
    4    0      0    4 4:4:4:0          yes 2401.0000 800.0000  800.000
    5    0      0    5 5:5:5:0          yes 2401.0000 800.0000  800.000
    6    0      0    6 6:6:6:0          yes 2401.0000 800.0000  800.000
    7    0      0    7 7:7:7:0          yes 2401.0000 800.0000  800.000
    8    0      0    0 0:0:0:0          yes 2401.0000 800.0000  800.000
    9    0      0    1 1:1:1:0          yes 2401.0000 800.0000  800.000
    10   0      0    2 2:2:2:0          yes 2401.0000 800.0000  800.000
    11   0      0    3 3:3:3:0          yes 2401.0000 800.0000  800.000
    12   0      0    4 4:4:4:0          yes 2401.0000 800.0000  800.000
    13   0      0    5 5:5:5:0          yes 2401.0000 800.0000  800.000
    14   0      0    6 6:6:6:0          yes 2401.0000 800.0000  800.000
    15   0      0    7 7:7:7:0          yes 2401.0000 800.0000  800.000

    # On this platform, it is recommended to only bind openMP threads on logical CPU cores 0-7 or 8-15
    $ export VLLM_CPU_OMP_THREADS_BIND=0-7
    $ python examples/offline_inference/basic/basic.py
    ```
- If using the vLLM CPU backend on a multi-socket machine with NUMA, be sure to set the CPU cores via `VLLM_CPU_OMP_THREADS_BIND` to avoid cross-NUMA-node memory access.
@ -183,14 +189,20 @@ $ python examples/offline_inference/basic/basic.py
- Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:
```bash
VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" \
    vllm serve meta-llama/Llama-2-7b-chat-hf \
    -tp=2 \
    --distributed-executor-backend mp
```
or using default auto thread binding:
```bash
VLLM_CPU_KVCACHE_SPACE=40 \
    vllm serve meta-llama/Llama-2-7b-chat-hf \
    -tp=2 \
    --distributed-executor-backend mp
```
- For each thread id list in `VLLM_CPU_OMP_THREADS_BIND`, users should ensure that all threads in the list belong to the same NUMA node; a quick way to check the node-to-core mapping is shown below.
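A short sketch for inspecting the mapping before choosing the thread lists (the exact core ranges depend on your machine):

```bash
# Show which logical CPUs belong to each NUMA node.
lscpu | grep "NUMA node"
# Optionally, numactl prints the per-node CPU lists and memory sizes.
numactl --hardware
```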

View File

@ -25,11 +25,11 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source.
```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
pip install -r requirements/cpu.txt
pip install -e .
```
!!! note

View File

@ -23,7 +23,7 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]
--8<-- "docs/getting_started/installation/cpu/build.inc.md"
Testing has been conducted on AWS Graviton3 instances for compatibility.

View File

@ -1,6 +1,6 @@
First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:
```bash
sudo apt-get update -y
sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
@ -8,14 +8,14 @@ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /
Second, clone vLLM project:
```bash
git clone https://github.com/vllm-project/vllm.git vllm_source
cd vllm_source
```
Third, install Python packages for vLLM CPU backend building:
```bash
pip install --upgrade pip
pip install "cmake>=3.26.1" wheel packaging ninja "setuptools-scm>=8" numpy
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
@ -23,13 +23,13 @@ pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorc
Finally, build and install vLLM CPU backend:
```bash
VLLM_TARGET_DEVICE=cpu python setup.py install
```
If you want to develop vllm, install it in editable mode instead.
```bash
VLLM_TARGET_DEVICE=cpu python setup.py develop
```

View File

@ -26,7 +26,7 @@ Currently the CPU implementation for s390x architecture supports FP32 datatype o
Install the following packages from the package manager before building vLLM. For example, on RHEL 9.4:
```bash
dnf install -y \
which procps findutils tar vim git gcc g++ make patch make cython zlib-devel \
libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
@ -35,7 +35,7 @@ dnf install -y \
Install Rust >= 1.80, which is needed to install the `outlines-core` and `uvloop` Python packages.
```bash
curl https://sh.rustup.rs -sSf | sh -s -- -y && \
. "$HOME/.cargo/env"
```
@ -45,7 +45,7 @@ Execute the following commands to build and install vLLM from the source.
!!! tip
    Please build the following dependencies, `torchvision` and `pyarrow`, from source before building vLLM.
```bash
sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds
pip install -v \
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \


@ -24,7 +24,7 @@ vLLM initially supports basic model inferencing and serving on x86 CPU platform,
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]
--8<-- "docs/getting_started/installation/cpu/cpu/build.inc.md"
--8<-- "docs/getting_started/installation/cpu/build.inc.md"
!!! note
- AVX512_BF16 is an ISA extension that provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16; a manual check is sketched below.
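A quick way to check whether the host CPU advertises the flag before building (illustrative only; the build script performs its own detection):
```bash
# Prints "avx512_bf16" once if the CPU supports it; prints nothing otherwise.
grep -o 'avx512_bf16' /proc/cpuinfo | head -n 1
```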


@ -68,7 +68,7 @@ For more information about using TPUs with GKE, see:
Create a TPU v5e with 4 TPU chips:
```console
```bash
gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
--node-id TPU_NAME \
--project PROJECT_ID \
@ -156,13 +156,13 @@ See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for i
You can use <gh-file:docker/Dockerfile.tpu> to build a Docker image with TPU support.
```console
```bash
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
```
Run the Docker image with the following command:
```console
```bash
# Make sure to add `--privileged --net host --shm-size=16G`.
docker run --privileged --net host --shm-size=16G -it vllm-tpu
```
@ -185,6 +185,6 @@ docker run --privileged --net host --shm-size=16G -it vllm-tpu
Install OpenBLAS with the following command:
```console
```bash
sudo apt-get install --no-install-recommends --yes libopenblas-base libopenmpi-dev libomp-dev
```


@ -22,7 +22,7 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I
You can install vLLM using either `pip` or `uv pip`:
```console
```bash
# Install vLLM with CUDA 12.8.
# If you are using pip.
pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128
@ -37,7 +37,7 @@ We recommend leveraging `uv` to [automatically select the appropriate PyTorch in
As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions:
```console
```bash
# Install vLLM with CUDA 11.8.
export VLLM_VERSION=0.6.1.post1
export PYTHON_VERSION=312
@ -52,7 +52,7 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe
##### Install the latest code using `pip`
```console
```bash
pip install -U vllm \
--pre \
--extra-index-url https://wheels.vllm.ai/nightly
@ -62,7 +62,7 @@ pip install -U vllm \
Another way to install the latest code is to use `uv`:
```console
```bash
uv pip install -U vllm \
--torch-backend=auto \
--extra-index-url https://wheels.vllm.ai/nightly
@ -72,7 +72,7 @@ uv pip install -U vllm \
If you want to access the wheels for previous commits (e.g., to bisect a behavior change or performance regression), due to a limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL:
```console
```bash
export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
```
@ -83,7 +83,7 @@ Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.p
If you want to access the wheels for previous commits (e.g., to bisect a behavior change or performance regression), you can specify the commit hash in the URL:
```console
```bash
export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
uv pip install vllm \
--torch-backend=auto \
@ -99,7 +99,7 @@ The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-rememb
If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM:
```console
```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
VLLM_USE_PRECOMPILED=1 pip install --editable .
@ -118,7 +118,7 @@ This command will do the following:
If you see an error about the wheel not being found when running the above command, it might be because the commit you based your changes on in the main branch was just merged and the wheel is still being built. In this case, you can wait for around an hour and try again, or manually point the installation at an earlier commit using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable.
```console
```bash
export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
pip install --editable .
@ -134,7 +134,7 @@ You can find more information about vLLM's wheels in [install-the-latest-code][i
If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:
```console
```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
pip install -e .
@ -160,7 +160,7 @@ There are scenarios where the PyTorch dependency cannot be easily installed via
To build vLLM using an existing PyTorch installation:
```console
```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
python use_existing_torch.py
@ -173,7 +173,7 @@ pip install --no-build-isolation -e .
Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead.
To achieve this, you can set the environment variable `VLLM_CUTLASS_SRC_DIR` to point to your local cutlass directory.
```console
```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e .
@ -184,7 +184,7 @@ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e .
To avoid your system being overloaded, you can limit the number of compilation jobs
to be run simultaneously, via the environment variable `MAX_JOBS`. For example:
```console
```bash
export MAX_JOBS=6
pip install -e .
```
@ -194,7 +194,7 @@ A side effect is a much slower build process.
Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
```console
```bash
# Use `--ipc=host` to make sure the shared memory is large enough.
docker run \
--gpus all \
@ -205,14 +205,14 @@ docker run \
If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.:
```console
```bash
export CUDA_HOME=/usr/local/cuda
export PATH="${CUDA_HOME}/bin:$PATH"
```
Here is a sanity check to verify that the CUDA Toolkit is correctly installed:
```console
```bash
nvcc --version # verify that nvcc is in your PATH
${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME
```
@ -223,7 +223,7 @@ vLLM can fully run only on Linux but for development purposes, you can still bui
Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing:
```console
```bash
export VLLM_TARGET_DEVICE=empty
pip install -e .
```
@ -238,7 +238,7 @@ See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for i
Another way to access the latest code is to use the docker images:
```console
```bash
export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}
```


@ -31,17 +31,17 @@ Currently, there are no pre-built ROCm wheels.
Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/). Example:
```console
```bash
# Install PyTorch
$ pip uninstall torch -y
$ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3
pip uninstall torch -y
pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3
```
1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton)
Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md)
```console
```bash
python3 -m pip install ninja cmake wheel pybind11
pip uninstall -y triton
git clone https://github.com/OpenAI/triton.git
@ -62,7 +62,7 @@ Currently, there are no pre-built ROCm wheels.
For example, for ROCm 6.3, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`.
```console
```bash
git clone https://github.com/ROCm/flash-attention.git
cd flash-attention
git checkout b7d29fb
@ -76,7 +76,7 @@ Currently, there are no pre-built ROCm wheels.
3. If you choose to build AITER yourself to use a certain branch or commit, you can build AITER using the following steps:
```console
```bash
python3 -m pip uninstall -y aiter
git clone --recursive https://github.com/ROCm/aiter.git
cd aiter
@ -90,24 +90,26 @@ Currently, there are no pre-built ROCm wheels.
4. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps:
```bash
pip install --upgrade pip
??? Commands
# Build & install AMD SMI
pip install /opt/rocm/share/amd_smi
```bash
pip install --upgrade pip
# Install dependencies
pip install --upgrade numba \
scipy \
huggingface-hub[cli,hf_transfer] \
setuptools_scm
pip install "numpy<2"
pip install -r requirements/rocm.txt
# Build & install AMD SMI
pip install /opt/rocm/share/amd_smi
# Build vLLM for MI210/MI250/MI300.
export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
python3 setup.py develop
```
# Install dependencies
pip install --upgrade numba \
scipy \
huggingface-hub[cli,hf_transfer] \
setuptools_scm
pip install "numpy<2"
pip install -r requirements/rocm.txt
# Build vLLM for MI210/MI250/MI300.
export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
python3 setup.py develop
```
This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
@ -146,7 +148,7 @@ If you choose to build this rocm_base image yourself, the steps are as follows.
It is important to kick off the docker build using BuildKit. Either set `DOCKER_BUILDKIT=1` as an environment variable when calling the `docker build` command, or set up BuildKit in the Docker daemon configuration `/etc/docker/daemon.json` as follows and restart the daemon:
```console
```json
{
"features": {
"buildkit": true
@ -156,7 +158,7 @@ It is important that the user kicks off the docker build using buildkit. Either
To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:
```console
```bash
DOCKER_BUILDKIT=1 docker build \
-f docker/Dockerfile.rocm_base \
-t rocm/vllm-dev:base .
@ -167,7 +169,7 @@ DOCKER_BUILDKIT=1 docker build \
First, build a docker image from <gh-file:docker/Dockerfile.rocm> and launch a docker container from the image.
It is important to kick off the docker build using BuildKit. Either set `DOCKER_BUILDKIT=1` as an environment variable when calling the `docker build` command, or set up BuildKit in the Docker daemon configuration `/etc/docker/daemon.json` as follows and restart the daemon:
```console
```json
{
"features": {
"buildkit": true
@ -185,13 +187,13 @@ Their values can be passed in when running `docker build` with `--build-arg` opt
To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:
```console
```bash
DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm .
```
To build vllm on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image:
```console
```bash
DOCKER_BUILDKIT=1 docker build \
--build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" \
-f docker/Dockerfile.rocm \
@ -201,19 +203,21 @@ DOCKER_BUILDKIT=1 docker build \
To run the above docker image `vllm-rocm`, use the below command:
```console
docker run -it \
--network=host \
--group-add=video \
--ipc=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v <path/to/model>:/app/model \
vllm-rocm \
bash
```
??? Command
```bash
docker run -it \
--network=host \
--group-add=video \
--ipc=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v <path/to/model>:/app/model \
vllm-rocm \
bash
```
Where `<path/to/model>` is the location where the model is stored, for example, the weights for Llama 2 or Llama 3 models.


@ -25,7 +25,7 @@ Currently, there are no pre-built XPU wheels.
- First, install the required driver and Intel oneAPI 2025.0 or later.
- Second, install the Python packages required to build the vLLM XPU backend:
```console
```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
pip install --upgrade pip
@ -34,7 +34,7 @@ pip install -v -r requirements/xpu.txt
- Then, build and install vLLM XPU backend:
```console
```bash
VLLM_TARGET_DEVICE=xpu python setup.py install
```
@ -53,9 +53,9 @@ Currently, there are no pre-built XPU images.
# --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source]
```console
$ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
$ docker run -it \
```bash
docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
docker run -it \
--rm \
--network=host \
--device /dev/dri \
@ -68,7 +68,7 @@ $ docker run -it \
The XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution looks like the following:
```console
```bash
python -m vllm.entrypoints.openai.api_server \
--model=facebook/opt-13b \
--dtype=bfloat16 \


@ -24,7 +24,7 @@ please follow the methods outlined in the
To verify that the Intel Gaudi software was correctly installed, run:
```console
```bash
hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
@ -42,7 +42,7 @@ for more details.
Use the following commands to run a Docker image:
```console
```bash
docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
docker run \
-it \
@ -65,7 +65,7 @@ Currently, there are no pre-built Intel Gaudi wheels.
To build and install vLLM from source, run:
```console
```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
pip install -r requirements/hpu.txt
@ -74,7 +74,7 @@ python setup.py develop
Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to the vLLM main repo. To install the latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following:
```console
```bash
git clone https://github.com/HabanaAI/vllm-fork.git
cd vllm-fork
git checkout habana_main
@ -90,7 +90,7 @@ Currently, there are no pre-built Intel Gaudi images.
### Build image from source
```console
```bash
docker build -f docker/Dockerfile.hpu -t vllm-hpu-env .
docker run \
-it \
@ -200,7 +200,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 1
`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, the interval between `min` and `step` has special handling: `min` is multiplied by consecutive powers of two until `step` is reached. We call this the ramp-up phase, and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
Example (with ramp-up)
Example (with ramp-up):
```text
min = 2, step = 32, max = 64
@ -209,7 +209,7 @@ min = 2, step = 32, max = 64
=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64)
```
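The generation rule above can be sketched as a small shell loop (an illustration only, not vLLM code; the values mirror the ramp-up example):
```bash
# Ramp-up phase: multiply min by powers of two until step is reached,
# then a stable phase that advances by step up to max.
min=2; step=32; max=64
b=$min; buckets=()
while (( b < step )); do buckets+=("$b"); b=$(( b * 2 )); done
while (( b <= max )); do buckets+=("$b"); b=$(( b + step )); done
echo "${buckets[@]}" # -> 2 4 8 16 32 64
```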
Example (without ramp-up)
Example (without ramp-up):
```text
min = 128, step = 128, max = 512
@ -232,19 +232,21 @@ As an example, if a request of 3 sequences, with max sequence length of 412 come
Warmup is an optional but highly recommended step that occurs before the vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and avoid any graph compilation overhead within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup:
```text
INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB
INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB
INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB
...
INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB
INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB
INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB
...
INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB
INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
```
??? Logs
```text
INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB
INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB
INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB
...
INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB
INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB
INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB
...
INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB
INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
```
This example uses the same buckets as in the [Bucketing Mechanism][gaudi-bucketing-mechanism] section. Each output line corresponds to the execution of a single bucket. When a bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
@ -279,37 +281,39 @@ When there's large amount of requests pending, vLLM scheduler will attempt to fi
Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
```text
INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used)
INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used)
INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache
INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0
INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used)
INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB
...
INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3)
INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
...
INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB
INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB
...
INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB
INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB
INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB
INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB
INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB
INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)]
INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory
INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used)
```
??? Logs
```text
INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used)
INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used)
INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache
INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0
INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used)
INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB
...
INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3)
INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
...
INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB
INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB
...
INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB
INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB
INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB
INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB
INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB
INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)]
INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory
INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used)
```
### Recommended vLLM Parameters


@ -1,6 +1,6 @@
It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
```console
```bash
uv venv --python 3.12 --seed
source .venv/bin/activate
```


@ -19,7 +19,7 @@ If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/
It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
```console
```bash
uv venv --python 3.12 --seed
source .venv/bin/activate
uv pip install vllm --torch-backend=auto
@ -29,13 +29,13 @@ uv pip install vllm --torch-backend=auto
Another delightful way is to use `uv run` with the `--with [dependency]` option, which allows you to run commands such as `vllm serve` without creating any permanent environment:
```console
```bash
uv run --with vllm vllm --help
```
You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. You can install `uv` to the conda environment through `pip` if you want to manage it within the environment.
```console
```bash
conda create -n myenv python=3.12 -y
conda activate myenv
pip install --upgrade uv
@ -110,7 +110,7 @@ By default, it starts the server at `http://localhost:8000`. You can specify the
Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model:
```console
```bash
vllm serve Qwen/Qwen2.5-1.5B-Instruct
```
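To bind the server to a different address or port, pass the standard `--host` and `--port` flags (the values here are only an illustration):
```bash
# Listen on all interfaces on a non-default port.
vllm serve Qwen/Qwen2.5-1.5B-Instruct \
--host 0.0.0.0 \
--port 8080
```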
@ -124,7 +124,7 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct
This server can be queried in the same format as OpenAI API. For example, to list the models:
```console
```bash
curl http://localhost:8000/v1/models
```
@ -134,7 +134,7 @@ You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY`
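For instance (a sketch; the key value is a placeholder), start the server with a key and send it back as a bearer token:
```bash
# Start the server with an API key (placeholder value).
vllm serve Qwen/Qwen2.5-1.5B-Instruct --api-key token-abc123

# Requests must then include the key in the Authorization header.
curl http://localhost:8000/v1/models \
-H "Authorization: Bearer token-abc123"
```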
Once your server is started, you can query the model with input prompts:
```console
```bash
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
@ -147,20 +147,22 @@ curl http://localhost:8000/v1/completions \
Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package:
```python
from openai import OpenAI
??? Code
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
prompt="San Francisco is a")
print("Completion result:", completion)
```
```python
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
prompt="San Francisco is a")
print("Completion result:", completion)
```
A more detailed client example can be found here: <gh-file:examples/online_serving/openai_completion_client.py>
@ -170,7 +172,7 @@ vLLM is designed to also support the OpenAI Chat Completions API. The chat inter
You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model:
```console
```bash
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
@ -184,26 +186,28 @@ curl http://localhost:8000/v1/chat/completions \
Alternatively, you can use the `openai` Python package:
```python
from openai import OpenAI
# Set OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
??? Code
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
```python
from openai import OpenAI
# Set OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
chat_response = client.chat.completions.create(
model="Qwen/Qwen2.5-1.5B-Instruct",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Tell me a joke."},
]
)
print("Chat response:", chat_response)
```
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
chat_response = client.chat.completions.create(
model="Qwen/Qwen2.5-1.5B-Instruct",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Tell me a joke."},
]
)
print("Chat response:", chat_response)
```
## On Attention Backends


@ -9,27 +9,27 @@ Further reading can be found in [Run:ai Model Streamer Documentation](https://gi
vLLM supports loading weights in Safetensors format using the Run:ai Model Streamer.
You first need to install vLLM RunAI optional dependency:
```console
```bash
pip3 install vllm[runai]
```
To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag:
```console
```bash
vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \
--load-format runai_streamer
```
To run a model from the AWS S3 object store, run:
```console
```bash
vllm serve s3://core-llm/Llama-3-8b \
--load-format runai_streamer
```
To run a model from an S3-compatible object store, run:
```console
```bash
RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 \
AWS_EC2_METADATA_DISABLED=true \
AWS_ENDPOINT_URL=https://storage.googleapis.com \
@ -44,7 +44,7 @@ You can tune parameters using `--model-loader-extra-config`:
You can tune `concurrency`, which controls the level of concurrency and the number of OS threads reading tensors from the file into the CPU buffer.
For reading from S3, it will be the number of client instances the host is opening to the S3 server.
```console
```bash
vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \
--load-format runai_streamer \
--model-loader-extra-config '{"concurrency":16}'
@ -53,7 +53,7 @@ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \
You can control the size of the CPU memory buffer into which tensors are read from the file, and limit this size.
You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit).
```console
```bash
vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \
--load-format runai_streamer \
--model-loader-extra-config '{"memory_limit":5368709120}'
@ -66,13 +66,13 @@ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \
vLLM also supports loading sharded models using Run:ai Model Streamer. This is particularly useful for large models that are split across multiple files. To use this feature, use the `--load-format runai_streamer_sharded` flag:
```console
```bash
vllm serve /path/to/sharded/model --load-format runai_streamer_sharded
```
The sharded loader expects model files to follow the same naming pattern as the regular sharded state loader: `model-rank-{rank}-part-{part}.safetensors`. You can customize this pattern using the `pattern` parameter in `--model-loader-extra-config`:
```console
```bash
vllm serve /path/to/sharded/model \
--load-format runai_streamer_sharded \
--model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}'
@ -82,7 +82,7 @@ To create sharded model files, you can use the script provided in <gh-file:examp
The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. These can be configured in the same way:
```console
```bash
vllm serve /path/to/sharded/model \
--load-format runai_streamer_sharded \
--model-loader-extra-config '{"concurrency":16, "memory_limit":5368709120}'


@ -85,35 +85,37 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
In general, only instruction-tuned models have a chat template.
Base models may perform poorly as they are not trained to respond to chat conversations.
```python
from vllm import LLM
??? Code
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Hello"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "Write an essay about the importance of higher education.",
},
]
outputs = llm.chat(conversation)
```python
from vllm import LLM
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Hello"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "Write an essay about the importance of higher education.",
},
]
outputs = llm.chat(conversation)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
A code example can be found here: <gh-file:examples/offline_inference/basic/chat.py>


@ -70,7 +70,10 @@ To make your model compatible with the Transformers backend, it needs:
2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
3. `MyModel` must contain `_supports_attention_backend = True`.
```python title="modeling_my_model.py"
<details>
<summary>modeling_my_model.py</summary>
```python
from transformers import PreTrainedModel
from torch import nn
@ -93,6 +96,8 @@ class MyModel(PreTrainedModel):
_supports_attention_backend = True
```
</details>
Here is what happens in the background when this model is loaded:
1. The config is loaded.
@ -103,7 +108,10 @@ That's it!
For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class:
```python title="configuration_my_model.py"
<details>
<summary>configuration_my_model.py</summary>
```python
from transformers import PretrainedConfig
@ -123,6 +131,8 @@ class MyConfig(PretrainedConfig):
}
```
</details>
- `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported).
- `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s:
* You only need to do this for layers which are not present on all pipeline stages
@ -168,7 +178,7 @@ Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project
If you prefer, you can use the Hugging Face CLI to [download a model](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-download) or specific files from a model repository:
```console
```bash
# Download a model
huggingface-cli download HuggingFaceH4/zephyr-7b-beta
@ -183,7 +193,7 @@ huggingface-cli download HuggingFaceH4/zephyr-7b-beta eval_results.json
Use the Hugging Face CLI to [manage models](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#scan-your-cache) stored in local cache:
```console
```bash
# List cached models
huggingface-cli scan-cache
@ -198,6 +208,9 @@ huggingface-cli scan-cache --dir ~/.cache/huggingface/hub
Use the Hugging Face CLI to interactively [delete downloaded model](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#clean-your-cache) from the cache:
<details>
<summary>Commands</summary>
```console
# The `delete-cache` command requires extra dependencies to work with the TUI.
# Please run `pip install huggingface_hub[cli]` to install them.
@ -224,6 +237,8 @@ Start deletion.
Done. Deleted 1 repo(s) and 0 revision(s) for a total of 438.9M.
```
</details>
#### Using a proxy
Here are some tips for loading/downloading models from Hugging Face using a proxy:
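A minimal sketch (the proxy address is a placeholder) is to export the standard proxy environment variables, which are honored by `huggingface_hub` and most HTTP clients, before launching vLLM or the Hugging Face CLI:
```bash
# Placeholder proxy address -- replace with your own.
export http_proxy=http://your.proxy.server:8080
export https_proxy=http://your.proxy.server:8080

# Downloads now go through the proxy.
huggingface-cli download Qwen/Qwen2.5-1.5B-Instruct
```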
@ -392,15 +407,15 @@ Specified using `--task embed`.
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|--------------------------------------------------------|---------------------|---------------------------------------------------------------------------------------------------------------------|----------------------|---------------------------|-----------------------|
| `BertModel` | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | |
| `Gemma2Model` | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | | |
| `Gemma2Model` | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | | ✅︎ |
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | |
| `GteModel` | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | |
| `GteNewModel` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | |
| `ModernBertModel` | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | |
| `NomicBertModel` | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | |
| `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | |
| `Qwen2Model`, `Qwen2ForCausalLM` | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | |
| `Qwen3Model`, `Qwen3ForCausalLM` | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | |
| `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2Model`, `Qwen2ForCausalLM` | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen3Model`, `Qwen3ForCausalLM` | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | |
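For instance, any of the models above can be served in embedding mode with `--task embed` (the model choice here is only an illustration):
```bash
# Serve an embedding model; --task embed selects the embedding runner.
vllm serve intfloat/e5-mistral-7b-instruct --task embed
```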
!!! note
@ -427,9 +442,10 @@ Specified using `--task reward`.
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|---------------------------|-----------------|------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------|
| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | |
| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | |
| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | |
| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
If your model is not in the above list, we will try to automatically convert the model using
[as_reward_model][vllm.model_executor.models.adapters.as_reward_model]. By default, we return the hidden states of each token directly.
@ -445,7 +461,7 @@ Specified using `--task classify`.
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|----------------------------------|----------|----------------------------------------|------------------------|-----------------------------|-----------------------|
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | |
| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ |
If your model is not in the above list, we will try to automatically convert the model using
[as_classification_model][vllm.model_executor.models.adapters.as_classification_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
@ -456,7 +472,7 @@ Specified using `--task score`.
| Architecture | Models | Example HF Models | [V1](gh-issue:8779) |
|---------------------------------------|-------------------|--------------------------------------------------------------------------------------|-----------------------|
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | |
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | |
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ |
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | |
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | |
@ -562,6 +578,7 @@ Specified using `--task generate`.
| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`,`omni-search/Tarsier-34b` | | ✅︎ | ✅︎ |
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`,`omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ |
<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
&nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:
@ -600,27 +617,29 @@ Specified using `--task generate`.
For the best results, we recommend using the following dependency versions (tested on A10 and L40):
```text
# Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40)
torch==2.5.1
torchvision==0.20.1
transformers==4.48.1
tokenizers==0.21.0
tiktoken==0.7.0
vllm==0.7.0
??? Dependency versions
# Optional but recommended for improved performance and stability
triton==3.1.0
xformers==0.0.28.post3
uvloop==0.21.0
protobuf==5.29.3
openai==1.60.2
opencv-python-headless==4.11.0.86
pillow==10.4.0
```text
# Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40)
torch==2.5.1
torchvision==0.20.1
transformers==4.48.1
tokenizers==0.21.0
tiktoken==0.7.0
vllm==0.7.0
# Installed FlashAttention (for float16 only)
flash-attn>=2.5.6 # Not used in float32, but should be documented
```
# Optional but recommended for improved performance and stability
triton==3.1.0
xformers==0.0.28.post3
uvloop==0.21.0
protobuf==5.29.3
openai==1.60.2
opencv-python-headless==4.11.0.86
pillow==10.4.0
# Installed FlashAttention (for float16 only)
flash-attn>=2.5.6 # Not used in float32, but should be documented
```
**Note:** Make sure you understand the security implications of using outdated packages.


@ -34,15 +34,15 @@ output = llm.generate("San Francisco is a")
To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run the API server on 4 GPUs:
```console
vllm serve facebook/opt-13b \
```bash
vllm serve facebook/opt-13b \
--tensor-parallel-size 4
```
You can additionally specify `--pipeline-parallel-size` to enable pipeline parallelism. For example, to run the API server on 8 GPUs with pipeline parallelism and tensor parallelism:
```console
vllm serve gpt2 \
```bash
vllm serve gpt2 \
--tensor-parallel-size 4 \
--pipeline-parallel-size 2
```
@ -55,7 +55,7 @@ The first step, is to start containers and organize them into a cluster. We have
Pick a node as the head node, and run the following command:
```console
```bash
bash run_cluster.sh \
vllm/vllm-openai \
ip_of_head_node \
@ -66,7 +66,7 @@ bash run_cluster.sh \
On the rest of the worker nodes, run the following command:
```console
```bash
bash run_cluster.sh \
vllm/vllm-openai \
ip_of_head_node \
@ -87,7 +87,7 @@ Then, on any node, use `docker exec -it node /bin/bash` to enter the container,
After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as if you had all the GPUs on one node: vLLM will be able to leverage the GPU resources of all nodes in the Ray cluster, so you only need to run the `vllm` command on this node, not on the other nodes. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
```console
```bash
vllm serve /path/to/the/model/in/the/container \
--tensor-parallel-size 8 \
--pipeline-parallel-size 2
@ -95,7 +95,7 @@ After that, on any node, use `docker exec -it node /bin/bash` to enter the conta
You can also use tensor parallelism without pipeline parallelism; just set the tensor parallel size to the total number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16:
```console
```bash
vllm serve /path/to/the/model/in/the/container \
--tensor-parallel-size 16
```


@ -7,25 +7,27 @@ vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain
To install LangChain, run
```console
```bash
pip install langchain langchain_community -q
```
To run inference on a single GPU or multiple GPUs, use the `VLLM` class from `langchain`.
```python
from langchain_community.llms import VLLM
??? Code
llm = VLLM(model="mosaicml/mpt-7b",
trust_remote_code=True, # mandatory for hf models
max_new_tokens=128,
top_k=10,
top_p=0.95,
temperature=0.8,
# tensor_parallel_size=... # for distributed inference
)
```python
from langchain_community.llms import VLLM
print(llm("What is the capital of France ?"))
```
llm = VLLM(model="mosaicml/mpt-7b",
trust_remote_code=True, # mandatory for hf models
max_new_tokens=128,
top_k=10,
top_p=0.95,
temperature=0.8,
# tensor_parallel_size=... # for distributed inference
)
print(llm("What is the capital of France ?"))
```
Please refer to this [Tutorial](https://python.langchain.com/docs/integrations/llms/vllm) for more details.


@ -7,7 +7,7 @@ vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index
To install LlamaIndex, run
```console
```bash
pip install llama-index-llms-vllm -q
```


@ -15,22 +15,24 @@ vllm serve NousResearch/Meta-Llama-3-8B-Instruct \
To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python).
```python
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:8000/v1",
api_key="token-abc123",
)
??? Code
completion = client.chat.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct",
messages=[
{"role": "user", "content": "Hello!"}
]
)
```python
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:8000/v1",
api_key="token-abc123",
)
print(completion.choices[0].message)
```
completion = client.chat.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct",
messages=[
{"role": "user", "content": "Hello!"}
]
)
print(completion.choices[0].message)
```
!!! tip
vLLM supports some parameters that are not supported by OpenAI, `top_k` for example.
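For example, `top_k` can be passed directly in the request body alongside the standard fields (a minimal illustration against the server started above):
```bash
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
    "model": "NousResearch/Meta-Llama-3-8B-Instruct",
    "messages": [{"role": "user", "content": "Hello!"}],
    "top_k": 20
}'
```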
@ -147,27 +149,29 @@ with `--enable-request-id-headers`.
> rather than within the vLLM layer for this reason.
> See [this PR](https://github.com/vllm-project/vllm/pull/11529) for more details.
```python
completion = client.chat.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct",
messages=[
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
],
extra_headers={
"x-request-id": "sentiment-classification-00001",
}
)
print(completion._request_id)
??? Code
completion = client.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct",
prompt="A robot may not injure a human being",
extra_headers={
"x-request-id": "completion-test",
}
)
print(completion._request_id)
```
```python
completion = client.chat.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct",
messages=[
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
],
extra_headers={
"x-request-id": "sentiment-classification-00001",
}
)
print(completion._request_id)
completion = client.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct",
prompt="A robot may not injure a human being",
extra_headers={
"x-request-id": "completion-test",
}
)
print(completion._request_id)
```
## API Reference
@ -184,15 +188,19 @@ Code example: <gh-file:examples/online_serving/openai_completion_client.py>
The following [sampling parameters][sampling-params] are supported.
```python
--8<-- "vllm/entrypoints/openai/protocol.py:completion-sampling-params"
```
??? Code
```python
--8<-- "vllm/entrypoints/openai/protocol.py:completion-sampling-params"
```
The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params"
```
??? Code
```python
--8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params"
```
[](){ #chat-api }
@ -212,15 +220,19 @@ Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py>
The following [sampling parameters][sampling-params] are supported.
```python
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-sampling-params"
```
??? Code
```python
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-sampling-params"
```
The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params"
```
??? Code
```python
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params"
```
[](){ #embeddings-api }
@ -259,29 +271,31 @@ and passing a list of `messages` in the request. Refer to the examples below for
Since the request schema is not defined by the OpenAI client, we post a request to the server using the lower-level `requests` library:
```python
import requests
??? Code
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
```python
import requests
response = requests.post(
"http://localhost:8000/v1/embeddings",
json={
"model": "TIGER-Lab/VLM2Vec-Full",
"messages": [{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "text", "text": "Represent the given image."},
],
}],
"encoding_format": "float",
},
)
response.raise_for_status()
response_json = response.json()
print("Embedding output:", response_json["data"][0]["embedding"])
```
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
response = requests.post(
"http://localhost:8000/v1/embeddings",
json={
"model": "TIGER-Lab/VLM2Vec-Full",
"messages": [{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "text", "text": "Represent the given image."},
],
}],
"encoding_format": "float",
},
)
response.raise_for_status()
response_json = response.json()
print("Embedding output:", response_json["data"][0]["embedding"])
```
=== "DSE-Qwen2-MRL"
@ -316,15 +330,19 @@ The following [pooling parameters][pooling-params] are supported.
The following extra parameters are supported by default:
```python
--8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params"
```
??? Code
```python
--8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params"
```
For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead:
```python
--8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params"
```
??? Code
```python
--8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params"
```
[](){ #transcriptions-api }
@ -343,15 +361,19 @@ Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
The following [sampling parameters][sampling-params] are supported.
```python
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params"
```
??? Code
```python
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params"
```
The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
```
??? Code
```python
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
```
[](){ #tokenizer-api }
@ -387,8 +409,6 @@ Code example: <gh-file:examples/online_serving/openai_classification_client.py>
You can classify multiple texts by passing an array of strings:
Request:
```bash
curl -v "http://127.0.0.1:8000/classify" \
-H "Content-Type: application/json" \
@ -401,47 +421,45 @@ curl -v "http://127.0.0.1:8000/classify" \
}'
```
Response:
??? Response
```bash
{
"id": "classify-7c87cac407b749a6935d8c7ce2a8fba2",
"object": "list",
"created": 1745383065,
"model": "jason9693/Qwen2.5-1.5B-apeach",
"data": [
```bash
{
"index": 0,
"label": "Default",
"probs": [
0.565970778465271,
0.4340292513370514
"id": "classify-7c87cac407b749a6935d8c7ce2a8fba2",
"object": "list",
"created": 1745383065,
"model": "jason9693/Qwen2.5-1.5B-apeach",
"data": [
{
"index": 0,
"label": "Default",
"probs": [
0.565970778465271,
0.4340292513370514
],
"num_classes": 2
},
{
"index": 1,
"label": "Spoiled",
"probs": [
0.26448777318000793,
0.7355121970176697
],
"num_classes": 2
}
],
"num_classes": 2
},
{
"index": 1,
"label": "Spoiled",
"probs": [
0.26448777318000793,
0.7355121970176697
],
"num_classes": 2
"usage": {
"prompt_tokens": 20,
"total_tokens": 20,
"completion_tokens": 0,
"prompt_tokens_details": null
}
}
],
"usage": {
"prompt_tokens": 20,
"total_tokens": 20,
"completion_tokens": 0,
"prompt_tokens_details": null
}
}
```
```
You can also pass a string directly to the `input` field:
Request:
```bash
curl -v "http://127.0.0.1:8000/classify" \
-H "Content-Type: application/json" \
@ -451,33 +469,33 @@ curl -v "http://127.0.0.1:8000/classify" \
}'
```
Response:
??? Response
```bash
{
"id": "classify-9bf17f2847b046c7b2d5495f4b4f9682",
"object": "list",
"created": 1745383213,
"model": "jason9693/Qwen2.5-1.5B-apeach",
"data": [
```bash
{
"index": 0,
"label": "Default",
"probs": [
0.565970778465271,
0.4340292513370514
"id": "classify-9bf17f2847b046c7b2d5495f4b4f9682",
"object": "list",
"created": 1745383213,
"model": "jason9693/Qwen2.5-1.5B-apeach",
"data": [
{
"index": 0,
"label": "Default",
"probs": [
0.565970778465271,
0.4340292513370514
],
"num_classes": 2
}
],
"num_classes": 2
"usage": {
"prompt_tokens": 10,
"total_tokens": 10,
"completion_tokens": 0,
"prompt_tokens_details": null
}
}
],
"usage": {
"prompt_tokens": 10,
"total_tokens": 10,
"completion_tokens": 0,
"prompt_tokens_details": null
}
}
```
```
#### Extra parameters
@ -508,8 +526,6 @@ Code example: <gh-file:examples/online_serving/openai_cross_encoder_score.py>
You can pass a string to both `text_1` and `text_2`, forming a single sentence pair.
Request:
```bash
curl -X 'POST' \
'http://127.0.0.1:8000/score' \
@ -523,24 +539,24 @@ curl -X 'POST' \
}'
```
Response:
??? Response
```bash
{
"id": "score-request-id",
"object": "list",
"created": 693447,
"model": "BAAI/bge-reranker-v2-m3",
"data": [
```bash
{
"index": 0,
"object": "score",
"score": 1
"id": "score-request-id",
"object": "list",
"created": 693447,
"model": "BAAI/bge-reranker-v2-m3",
"data": [
{
"index": 0,
"object": "score",
"score": 1
}
],
"usage": {}
}
],
"usage": {}
}
```
```
#### Batch inference
@ -548,95 +564,95 @@ You can pass a string to `text_1` and a list to `text_2`, forming multiple sente
where each pair is built from `text_1` and a string in `text_2`.
The total number of pairs is `len(text_2)`.
Request:
??? Request
```bash
curl -X 'POST' \
'http://127.0.0.1:8000/score' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "BAAI/bge-reranker-v2-m3",
"text_1": "What is the capital of France?",
"text_2": [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris."
]
}'
```
```bash
curl -X 'POST' \
'http://127.0.0.1:8000/score' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "BAAI/bge-reranker-v2-m3",
"text_1": "What is the capital of France?",
"text_2": [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris."
]
}'
```
Response:
??? Response
```bash
{
"id": "score-request-id",
"object": "list",
"created": 693570,
"model": "BAAI/bge-reranker-v2-m3",
"data": [
```bash
{
"index": 0,
"object": "score",
"score": 0.001094818115234375
},
{
"index": 1,
"object": "score",
"score": 1
"id": "score-request-id",
"object": "list",
"created": 693570,
"model": "BAAI/bge-reranker-v2-m3",
"data": [
{
"index": 0,
"object": "score",
"score": 0.001094818115234375
},
{
"index": 1,
"object": "score",
"score": 1
}
],
"usage": {}
}
],
"usage": {}
}
```
```
You can pass a list to both `text_1` and `text_2`, forming multiple sentence pairs
where each pair is built from a string in `text_1` and the corresponding string in `text_2` (similar to `zip()`).
The total number of pairs is `len(text_2)`.
Request:
??? Request
```bash
curl -X 'POST' \
'http://127.0.0.1:8000/score' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "BAAI/bge-reranker-v2-m3",
"encoding_format": "float",
"text_1": [
"What is the capital of Brazil?",
"What is the capital of France?"
],
"text_2": [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris."
]
}'
```
```bash
curl -X 'POST' \
'http://127.0.0.1:8000/score' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "BAAI/bge-reranker-v2-m3",
"encoding_format": "float",
"text_1": [
"What is the capital of Brazil?",
"What is the capital of France?"
],
"text_2": [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris."
]
}'
```
Response:
??? Response
```bash
{
"id": "score-request-id",
"object": "list",
"created": 693447,
"model": "BAAI/bge-reranker-v2-m3",
"data": [
```bash
{
"index": 0,
"object": "score",
"score": 1
},
{
"index": 1,
"object": "score",
"score": 1
"id": "score-request-id",
"object": "list",
"created": 693447,
"model": "BAAI/bge-reranker-v2-m3",
"data": [
{
"index": 0,
"object": "score",
"score": 1
},
{
"index": 1,
"object": "score",
"score": 1
}
],
"usage": {}
}
],
"usage": {}
}
```
```
#### Extra parameters
@ -675,51 +691,51 @@ Code example: <gh-file:examples/online_serving/jinaai_rerank_client.py>
Note that the `top_n` request parameter is optional and will default to the length of the `documents` field.
Result documents will be sorted by relevance, and the `index` property can be used to determine original order.
Request:
??? Request
```bash
curl -X 'POST' \
'http://127.0.0.1:8000/v1/rerank' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "BAAI/bge-reranker-base",
"query": "What is the capital of France?",
"documents": [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
"Horses and cows are both animals"
]
}'
```
```bash
curl -X 'POST' \
'http://127.0.0.1:8000/v1/rerank' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "BAAI/bge-reranker-base",
"query": "What is the capital of France?",
"documents": [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
"Horses and cows are both animals"
]
}'
```
Response:
??? Response
```bash
{
"id": "rerank-fae51b2b664d4ed38f5969b612edff77",
"model": "BAAI/bge-reranker-base",
"usage": {
"total_tokens": 56
},
"results": [
```bash
{
"index": 1,
"document": {
"text": "The capital of France is Paris."
"id": "rerank-fae51b2b664d4ed38f5969b612edff77",
"model": "BAAI/bge-reranker-base",
"usage": {
"total_tokens": 56
},
"relevance_score": 0.99853515625
},
{
"index": 0,
"document": {
"text": "The capital of Brazil is Brasilia."
},
"relevance_score": 0.0005860328674316406
"results": [
{
"index": 1,
"document": {
"text": "The capital of France is Paris."
},
"relevance_score": 0.99853515625
},
{
"index": 0,
"document": {
"text": "The capital of Brazil is Brasilia."
},
"relevance_score": 0.0005860328674316406
}
]
}
]
}
```
```
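Because `top_n` defaults to the number of documents, you can set it explicitly to return only the most relevant results; a minimal sketch using the `requests` library against the same server and model:
```python
import requests

response = requests.post(
    "http://127.0.0.1:8000/v1/rerank",
    json={
        "model": "BAAI/bge-reranker-base",
        "query": "What is the capital of France?",
        "documents": [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
            "Horses and cows are both animals",
        ],
        # return only the single most relevant document
        "top_n": 1,
    },
)
response.raise_for_status()
print(response.json()["results"])
```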
#### Extra parameters

View File

@ -6,34 +6,38 @@ OpenAI compatible API server.
You can start the server using Python, or using [Docker][deployment-docker]:
```console
```bash
vllm serve unsloth/Llama-3.2-1B-Instruct
```
Then query the endpoint to get the latest metrics from the server:
```console
$ curl http://0.0.0.0:8000/metrics
??? Output
# HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step.
# TYPE vllm:iteration_tokens_total histogram
vllm:iteration_tokens_total_sum{model_name="unsloth/Llama-3.2-1B-Instruct"} 0.0
vllm:iteration_tokens_total_bucket{le="1.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="8.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="16.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="32.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="64.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="128.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="256.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
...
```
```console
$ curl http://0.0.0.0:8000/metrics
# HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step.
# TYPE vllm:iteration_tokens_total histogram
vllm:iteration_tokens_total_sum{model_name="unsloth/Llama-3.2-1B-Instruct"} 0.0
vllm:iteration_tokens_total_bucket{le="1.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="8.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="16.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="32.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="64.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="128.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="256.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
...
```
The following metrics are exposed:
```python
--8<-- "vllm/engine/metrics.py:metrics-definitions"
```
??? Code
```python
--8<-- "vllm/engine/metrics.py:metrics-definitions"
```
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,

View File

@ -60,79 +60,84 @@ To identify the particular CUDA operation that causes the error, you can add `--
If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
```python
# Test PyTorch NCCL
import torch
import torch.distributed as dist
dist.init_process_group(backend="nccl")
local_rank = dist.get_rank() % torch.cuda.device_count()
torch.cuda.set_device(local_rank)
data = torch.FloatTensor([1,] * 128).to("cuda")
dist.all_reduce(data, op=dist.ReduceOp.SUM)
torch.cuda.synchronize()
value = data.mean().item()
world_size = dist.get_world_size()
assert value == world_size, f"Expected {world_size}, got {value}"
??? Code
print("PyTorch NCCL is successful!")
```python
# Test PyTorch NCCL
import torch
import torch.distributed as dist
dist.init_process_group(backend="nccl")
local_rank = dist.get_rank() % torch.cuda.device_count()
torch.cuda.set_device(local_rank)
data = torch.FloatTensor([1,] * 128).to("cuda")
dist.all_reduce(data, op=dist.ReduceOp.SUM)
torch.cuda.synchronize()
value = data.mean().item()
world_size = dist.get_world_size()
assert value == world_size, f"Expected {world_size}, got {value}"
# Test PyTorch GLOO
gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo")
cpu_data = torch.FloatTensor([1,] * 128)
dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group)
value = cpu_data.mean().item()
assert value == world_size, f"Expected {world_size}, got {value}"
print("PyTorch NCCL is successful!")
print("PyTorch GLOO is successful!")
# Test PyTorch GLOO
gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo")
cpu_data = torch.FloatTensor([1,] * 128)
dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group)
value = cpu_data.mean().item()
assert value == world_size, f"Expected {world_size}, got {value}"
if world_size <= 1:
exit()
print("PyTorch GLOO is successful!")
# Test vLLM NCCL, with cuda graph
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
if world_size <= 1:
exit()
pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank)
# pynccl is enabled by default for 0.6.5+,
# but for 0.6.4 and below, we need to enable it manually.
# keep the code for backward compatibility because people
# prefer to read the latest documentation.
pynccl.disabled = False
# Test vLLM NCCL, with cuda graph
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank)
# pynccl is enabled by default for 0.6.5+,
# but for 0.6.4 and below, we need to enable it manually.
# keep the code for backward compatibility because people
# prefer to read the latest documentation.
pynccl.disabled = False
s = torch.cuda.Stream()
with torch.cuda.stream(s):
data.fill_(1)
out = pynccl.all_reduce(data, stream=s)
value = out.mean().item()
assert value == world_size, f"Expected {world_size}, got {value}"
print("vLLM NCCL is successful!")
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(cuda_graph=g, stream=s):
out = pynccl.all_reduce(data, stream=torch.cuda.current_stream())
s = torch.cuda.Stream()
with torch.cuda.stream(s):
data.fill_(1)
out = pynccl.all_reduce(data, stream=s)
g.replay()
torch.cuda.current_stream().synchronize()
value = out.mean().item()
assert value == world_size, f"Expected {world_size}, got {value}"
print("vLLM NCCL is successful!")
print("vLLM NCCL with cuda graph is successful!")
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(cuda_graph=g, stream=s):
out = pynccl.all_reduce(data, stream=torch.cuda.current_stream())
data.fill_(1)
g.replay()
torch.cuda.current_stream().synchronize()
value = out.mean().item()
assert value == world_size, f"Expected {world_size}, got {value}"
print("vLLM NCCL with cuda graph is successful!")
dist.destroy_process_group(gloo_group)
dist.destroy_process_group()
```
dist.destroy_process_group(gloo_group)
dist.destroy_process_group()
```
If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use:
```console
```bash
NCCL_DEBUG=TRACE torchrun --nproc-per-node=<number-of-GPUs> test.py
```
If you are testing with multi-nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address of the master node, reachable from all nodes. Then, run:
```console
NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py
```bash
NCCL_DEBUG=TRACE torchrun --nnodes 2 \
--nproc-per-node=2 \
--rdzv_backend=c10d \
--rdzv_endpoint=$MASTER_ADDR test.py
```
If the script runs successfully, you should see all of the success messages printed, ending with `vLLM NCCL with cuda graph is successful!`.
@ -165,25 +170,27 @@ WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
or an error from Python that looks like this:
```console
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
??? Logs
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
```console
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
if __name__ == '__main__':
freeze_support()
...
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
if __name__ == '__main__':
freeze_support()
...
To fix this issue, refer to the "Safe importing of main module"
section in https://docs.python.org/3/library/multiprocessing.html
```
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
To fix this issue, refer to the "Safe importing of main module"
section in https://docs.python.org/3/library/multiprocessing.html
```
then you must update your Python code to guard usage of `vllm` behind an `if
__name__ == '__main__':` block. For example, instead of this:
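The before/after snippets for this fix are elided by the hunk boundary here; a minimal sketch of the guarded form, with an illustrative model name:
```python
from vllm import LLM

def main():
    # Construct the engine inside the guarded entry point so that vLLM's
    # worker processes can be spawned safely.
    llm = LLM(model="facebook/opt-125m")  # illustrative model name
    outputs = llm.generate("Hello, my name is")
    print(outputs[0].outputs[0].text)

if __name__ == "__main__":
    main()
```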
@ -207,20 +214,22 @@ if __name__ == '__main__':
vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](https://github.com/vllm-project/vllm/pull/10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script:
```python
import torch
??? Code
@torch.compile
def f(x):
# a simple function to test torch.compile
x = x + 1
x = x * 2
x = x.sin()
return x
```python
import torch
x = torch.randn(4, 4).cuda()
print(f(x))
```
@torch.compile
def f(x):
# a simple function to test torch.compile
x = x + 1
x = x * 2
x = x.sin()
return x
x = torch.randn(4, 4).cuda()
print(f(x))
```
If it raises errors from `torch/_inductor` directory, usually it means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See [this issue](https://github.com/vllm-project/vllm/issues/12219) for example.

View File

@ -10,36 +10,38 @@ The list of data collected by the latest version of vLLM can be found here: <gh-
Here is an example as of v0.4.0:
```json
{
"uuid": "fbe880e9-084d-4cab-a395-8984c50f1109",
"provider": "GCP",
"num_cpu": 24,
"cpu_type": "Intel(R) Xeon(R) CPU @ 2.20GHz",
"cpu_family_model_stepping": "6,85,7",
"total_memory": 101261135872,
"architecture": "x86_64",
"platform": "Linux-5.10.0-28-cloud-amd64-x86_64-with-glibc2.31",
"gpu_count": 2,
"gpu_type": "NVIDIA L4",
"gpu_memory_per_device": 23580639232,
"model_architecture": "OPTForCausalLM",
"vllm_version": "0.3.2+cu123",
"context": "LLM_CLASS",
"log_time": 1711663373492490000,
"source": "production",
"dtype": "torch.float16",
"tensor_parallel_size": 1,
"block_size": 16,
"gpu_memory_utilization": 0.9,
"quantization": null,
"kv_cache_dtype": "auto",
"enable_lora": false,
"enable_prefix_caching": false,
"enforce_eager": false,
"disable_custom_all_reduce": true
}
```
??? Output
```json
{
"uuid": "fbe880e9-084d-4cab-a395-8984c50f1109",
"provider": "GCP",
"num_cpu": 24,
"cpu_type": "Intel(R) Xeon(R) CPU @ 2.20GHz",
"cpu_family_model_stepping": "6,85,7",
"total_memory": 101261135872,
"architecture": "x86_64",
"platform": "Linux-5.10.0-28-cloud-amd64-x86_64-with-glibc2.31",
"gpu_count": 2,
"gpu_type": "NVIDIA L4",
"gpu_memory_per_device": 23580639232,
"model_architecture": "OPTForCausalLM",
"vllm_version": "0.3.2+cu123",
"context": "LLM_CLASS",
"log_time": 1711663373492490000,
"source": "production",
"dtype": "torch.float16",
"tensor_parallel_size": 1,
"block_size": 16,
"gpu_memory_utilization": 0.9,
"quantization": null,
"kv_cache_dtype": "auto",
"enable_lora": false,
"enable_prefix_caching": false,
"enforce_eager": false,
"disable_custom_all_reduce": true
}
```
You can preview the collected data by running the following command:

View File

@ -39,12 +39,24 @@ This living user guide outlines a few known **important changes and limitations*
For each item, our progress towards V1 support falls into one of the following states:
- **🚀 Optimized**: Nearly fully optimized, with no further work currently planned.
- **🟢 Functional**: Fully operational, with ongoing optimizations.
- **🚧 WIP**: Under active development.
- **🟡 Planned**: Scheduled for future implementation (some may have open PRs/RFCs).
- **🟢 Functional**: Fully operational, with ongoing optimizations.
- **🚧 WIP**: Under active development.
- **🟡 Planned**: Scheduled for future implementation (some may have open PRs/RFCs).
- **🟠 Delayed**: Temporarily dropped in V1 but planned to be re-introduced later.
- **🔴 Deprecated**: Not planned for V1 unless there is strong demand.
!!! note
vLLM V1's unified scheduler treats both prompt and output tokens the same
way by using a simple dictionary (e.g., `{request_id: num_tokens}`) to dynamically
allocate a fixed token budget per request, enabling features like chunked prefills,
prefix caching, and speculative decoding without a strict separation between prefill
and decode phases.
The V1 scheduler supports multiple scheduling policies, including First-Come,
First-Served (FCFS) and priority-based scheduling (where requests are processed
based on assigned priority, with FCFS as a tie-breaker), configurable via the
`--scheduling-policy` argument.
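For example, priority scheduling can be selected at serve time (the model name is only illustrative):
```bash
vllm serve unsloth/Llama-3.2-1B-Instruct --scheduling-policy priority
```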
### Hardware
| Hardware | Status |
@ -70,7 +82,7 @@ For each item, our progress towards V1 support falls into one of the following s
|-----------------------------|------------------------------------------------------------------------------------|
| **Decoder-only Models** | <nobr>🚀 Optimized</nobr> |
| **Encoder-Decoder Models** | <nobr>🟠 Delayed</nobr> |
| **Embedding Models** | <nobr>🚧 WIP ([PR #16188](https://github.com/vllm-project/vllm/pull/16188))</nobr> |
| **Embedding Models** | <nobr>🟢 Functional</nobr> |
| **Mamba Models** | <nobr>🚧 WIP ([PR #19327](https://github.com/vllm-project/vllm/pull/19327))</nobr> |
| **Multimodal Models** | <nobr>🟢 Functional</nobr> |
@ -80,11 +92,11 @@ vLLM V1 currently excludes model architectures with the `SupportsV0Only` protoco
This corresponds to the V1 column in our [list of supported models][supported-models].
See below for the status of models that are still not yet supported in V1.
See below for the status of models that are not yet supported or have more features planned in V1.
#### Embedding Models
The initial support will be provided by [PR #16188](https://github.com/vllm-project/vllm/pull/16188).
The initial basic support is now functional.
Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249),
which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360)

View File

@ -12,7 +12,10 @@ def parse_args():
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="intfloat/e5-mistral-7b-instruct", task="embed", enforce_eager=True
model="intfloat/e5-mistral-7b-instruct",
task="embed",
enforce_eager=True,
max_model_len=1024,
)
return parser.parse_args()

View File

@ -29,14 +29,14 @@ We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` e
To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
```console
```bash
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl
```
Once you've created your batch file, it should look like this:
```console
$ cat offline_inference/openai_batch/openai_example_batch.jsonl
```bash
cat offline_inference/openai_batch/openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```
@ -47,7 +47,7 @@ The batch running tool is designed to be used from the command line.
You can run the batch with the following command, which will write its results to a file called `results.jsonl`
```console
```bash
python -m vllm.entrypoints.openai.run_batch \
-i offline_inference/openai_batch/openai_example_batch.jsonl \
-o results.jsonl \
@ -56,7 +56,7 @@ python -m vllm.entrypoints.openai.run_batch \
or use command-line:
```console
```bash
vllm run-batch \
-i offline_inference/openai_batch/openai_example_batch.jsonl \
-o results.jsonl \
@ -67,8 +67,8 @@ vllm run-batch \
You should now have your results at `results.jsonl`. You can check your results by running `cat results.jsonl`
```console
$ cat results.jsonl
```bash
cat results.jsonl
{"id":"vllm-383d1c59835645aeb2e07d004d62a826","custom_id":"request-1","response":{"id":"cmpl-61c020e54b964d5a98fa7527bfcdd378","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"Hello! It's great to meet you! I'm here to help with any questions or tasks you may have. What's on your mind today?"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":25,"total_tokens":56,"completion_tokens":31}},"error":null}
{"id":"vllm-42e3d09b14b04568afa3f1797751a267","custom_id":"request-2","response":{"id":"cmpl-f44d049f6b3a42d4b2d7850bb1e31bcc","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"*silence*"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":27,"total_tokens":32,"completion_tokens":5}},"error":null}
```
@ -79,7 +79,7 @@ The batch runner supports remote input and output urls that are accessible via h
For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl`, you can run
```console
```bash
python -m vllm.entrypoints.openai.run_batch \
-i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
-o results.jsonl \
@ -88,7 +88,7 @@ python -m vllm.entrypoints.openai.run_batch \
or use command-line:
```console
```bash
vllm run-batch \
-i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
-o results.jsonl \
@ -112,21 +112,21 @@ To integrate with cloud blob storage, we recommend using presigned urls.
To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
```console
```bash
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl
```
Once you've created your batch file, it should look like this:
```console
$ cat offline_inference/openai_batch/openai_example_batch.jsonl
```bash
cat offline_inference/openai_batch/openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```
Now upload your batch file to your S3 bucket.
```console
```bash
aws s3 cp offline_inference/openai_batch/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
```
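The presigned-URL generation step itself is elided between these hunks; one possible way to create such URLs is with `boto3` (bucket and key names are placeholders, not part of the original example):
```python
import boto3

s3 = boto3.client("s3")

# Presigned GET URL for reading the input file and PUT URL for writing results.
input_url = s3.generate_presigned_url(
    "get_object",
    Params={"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"},
    ExpiresIn=3600,
)
output_url = s3.generate_presigned_url(
    "put_object",
    Params={"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"},
    ExpiresIn=3600,
)
print(f"{input_url=}")
print(f"{output_url=}")
```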
@ -181,7 +181,7 @@ output_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AW
You can now run the batch runner, using the urls generated in the previous section.
```console
```bash
python -m vllm.entrypoints.openai.run_batch \
-i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
-o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
@ -190,7 +190,7 @@ python -m vllm.entrypoints.openai.run_batch \
or use command-line:
```console
```bash
vllm run-batch \
-i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
-o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
@ -201,7 +201,7 @@ vllm run-batch \
Your results are now on S3. You can view them in your terminal by running
```console
```bash
aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl -
```
@ -230,8 +230,8 @@ You can run the batch using the same command as in earlier examples.
You can check your results by running `cat results.jsonl`
```console
$ cat results.jsonl
```bash
cat results.jsonl
{"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null}
...
```
@ -261,8 +261,8 @@ You can run the batch using the same command as in earlier examples.
You can check your results by running `cat results.jsonl`
```console
$ cat results.jsonl
```bash
cat results.jsonl
{"id":"vllm-f87c5c4539184f618e555744a2965987","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-806ab64512e44071b37d3f7ccd291413","body":{"id":"score-4ee45236897b4d29907d49b01298cdb1","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.0010900497436523438},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}
{"id":"vllm-41990c51a26d4fac8419077f12871099","custom_id":"request-2","response":{"status_code":200,"request_id":"vllm-batch-73ce66379026482699f81974e14e1e99","body":{"id":"score-13f2ffe6ba40460fbf9f7f00ad667d75","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.001094818115234375},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}
```

View File

@ -22,15 +22,19 @@ model_name = "Qwen/Qwen3-Reranker-0.6B"
# If you want to load the official original version, the init parameters are
# as follows.
model = LLM(
model=model_name,
task="score",
hf_overrides={
"architectures": ["Qwen3ForSequenceClassification"],
"classifier_from_token": ["no", "yes"],
"is_original_qwen3_reranker": True,
},
)
def get_model() -> LLM:
"""Initializes and returns the LLM model for Qwen3-Reranker."""
return LLM(
model=model_name,
task="score",
hf_overrides={
"architectures": ["Qwen3ForSequenceClassification"],
"classifier_from_token": ["no", "yes"],
"is_original_qwen3_reranker": True,
},
)
# Why do we need hf_overrides for the official original version:
# vllm converts it to Qwen3ForSequenceClassification when loaded for
@ -51,7 +55,8 @@ suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
document_template = "<Document>: {doc}{suffix}"
if __name__ == "__main__":
def main() -> None:
instruction = (
"Given a web search query, retrieve relevant passages that answer the query"
)
@ -72,6 +77,13 @@ if __name__ == "__main__":
]
documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents]
model = get_model()
outputs = model.score(queries, documents)
print("-" * 30)
print([output.outputs.score for output in outputs])
print("-" * 30)
if __name__ == "__main__":
main()

View File

@ -1040,6 +1040,37 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
)
def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
model_name = "omni-research/Tarsier2-Recap-7b"
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
limit_mm_per_prompt={modality: 1},
)
if modality == "image":
placeholder = "<|image_pad|>"
elif modality == "video":
placeholder = "<|video_pad|>"
prompts = [
(
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -1112,6 +1143,7 @@ model_example_map = {
"skywork_chat": run_skyworkr1v,
"smolvlm": run_smolvlm,
"tarsier": run_tarsier,
"tarsier2": run_tarsier2,
}

View File

@ -94,6 +94,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
engine_args = EngineArgs(
model="TIGER-Lab/VLM2Vec-Full",
task="embed",
max_model_len=4096,
trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4},
limit_mm_per_prompt={"image": 1},

View File

@ -828,6 +828,32 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "omni-research/Tarsier2-Recap-7b"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=32768,
limit_mm_per_prompt={"image": len(image_urls)},
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
)
prompt = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{'<|image_pad|>' * len(image_urls)}"
f"<|vision_end|>{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
image_data = [fetch_image(url) for url in image_urls]
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=image_data,
)
model_example_map = {
"aria": load_aria,
"aya_vision": load_aya_vision,
@ -853,6 +879,7 @@ model_example_map = {
"qwen2_5_vl": load_qwen2_5_vl,
"smolvlm": load_smolvlm,
"tarsier": load_tarsier,
"tarsier2": load_tarsier2,
}

View File

@ -0,0 +1,244 @@
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled for xLAM-2 models:
vllm serve --model Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
OR
vllm serve --model Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
"""
import json
import time
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "empty"
openai_api_base = "http://localhost:8000/v1"
# Define tool functions
def get_weather(location: str, unit: str):
return f"Weather in {location} is 22 degrees {unit}."
def calculate_expression(expression: str):
try:
result = eval(expression)
return f"The result of {expression} is {result}"
except Exception as e:
return f"Could not calculate {expression}: {e}"
def translate_text(text: str, target_language: str):
return f"Translation of '{text}' to {target_language}: [translated content]"
# Define tools
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City and state, e.g., 'San Francisco, CA'",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location", "unit"],
},
},
},
{
"type": "function",
"function": {
"name": "calculate_expression",
"description": "Calculate a mathematical expression",
"parameters": {
"type": "object",
"properties": {
"expression": {
"type": "string",
"description": "Mathematical expression to evaluate, needs to be a valid python expression",
}
},
"required": ["expression"],
},
},
},
{
"type": "function",
"function": {
"name": "translate_text",
"description": "Translate text to another language",
"parameters": {
"type": "object",
"properties": {
"text": {"type": "string", "description": "Text to translate"},
"target_language": {
"type": "string",
"description": "Target language for translation",
},
},
"required": ["text", "target_language"],
},
},
},
]
# Map of function names to implementations
tool_functions = {
"get_weather": get_weather,
"calculate_expression": calculate_expression,
"translate_text": translate_text,
}
def process_response(response, tool_functions, original_query):
"""Process a non-streaming response with possible tool calls"""
print("\n--- Response Output ---")
# Check if the response has content
if response.choices[0].message.content:
print(f"Content: {response.choices[0].message.content}")
# Check if the response has tool calls
if response.choices[0].message.tool_calls:
print("--------------------------------")
print(f"Tool calls: {response.choices[0].message.tool_calls}")
print("--------------------------------")
# Collect all tool calls and results before making follow-up request
tool_results = []
assistant_message = {"role": "assistant"}
if response.choices[0].message.content:
assistant_message["content"] = response.choices[0].message.content
assistant_tool_calls = []
# Process each tool call
for tool_call in response.choices[0].message.tool_calls:
function_name = tool_call.function.name
function_args = tool_call.function.arguments
function_id = tool_call.id
print(f"Function called: {function_name}")
print(f"Arguments: {function_args}")
print(f"Function ID: {function_id}")
# Execute the function
try:
# Parse the JSON arguments
args = json.loads(function_args)
# Call the function with the arguments
function_result = tool_functions[function_name](**args)
print(f"\n--- Function Result ---\n{function_result}\n")
# Add tool call to assistant message
assistant_tool_calls.append(
{
"id": function_id,
"type": "function",
"function": {"name": function_name, "arguments": function_args},
}
)
# Add tool result to tool_results
tool_results.append(
{
"role": "tool",
"tool_call_id": function_id,
"content": function_result,
}
)
except Exception as e:
print(f"Error executing function: {e}")
# Add tool_calls to assistant message
assistant_message["tool_calls"] = assistant_tool_calls
# Create a follow-up message with all function results
follow_up_messages = [
{"role": "user", "content": original_query},
assistant_message,
]
# Add all tool results to the messages
follow_up_messages.extend(tool_results)
# Get completion with all tool results in a single follow-up
follow_up_response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=follow_up_messages,
stream=False,
)
print("\n--- Follow-up Response ---")
print(follow_up_response.choices[0].message.content)
print("--- End Follow-up ---\n")
print("--- End Response ---\n")
def run_test_case(query, test_name):
"""Run a single test case with the given query"""
print(f"\n{'=' * 50}\nTEST CASE: {test_name}\n{'=' * 50}")
print(f"Query: '{query}'")
start_time = time.time()
# Create non-streaming chat completion request
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": query}],
tools=tools,
tool_choice="auto",
stream=False,
)
# Process the non-streaming response, passing the original query
process_response(response, tool_functions, query)
end_time = time.time()
print(f"Test completed in {end_time - start_time:.2f} seconds")
def main():
# Initialize OpenAI client
global client
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Run test cases
test_cases = [
("I want to know the weather in San Francisco", "Weather Information"),
("Calculate 25 * 17 + 31", "Math Calculation"),
("Translate 'Hello world' to Spanish", "Text Translation"),
("What is the weather in Tokyo and New York in celsius", "Multiple Tool Usage"),
]
# Execute all test cases
for query, test_name in test_cases:
run_test_case(query, test_name)
time.sleep(1) # Small delay between tests
print("\nAll tests completed.")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,272 @@
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled for xLAM-2 models:
vllm serve --model Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
OR
vllm serve --model Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
This example demonstrates streaming tool calls with xLAM models.
"""
import json
import time
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "empty"
openai_api_base = "http://localhost:8000/v1"
# Define tool functions
def get_weather(location: str, unit: str):
return f"Weather in {location} is 22 degrees {unit}."
def calculate_expression(expression: str):
try:
result = eval(expression)
return f"The result of {expression} is {result}"
except Exception as e:
return f"Could not calculate {expression}: {e}"
def translate_text(text: str, target_language: str):
return f"Translation of '{text}' to {target_language}: [translated content]"
# Define tools
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City and state, e.g., 'San Francisco, CA'",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location", "unit"],
},
},
},
{
"type": "function",
"function": {
"name": "calculate_expression",
"description": "Calculate a mathematical expression",
"parameters": {
"type": "object",
"properties": {
"expression": {
"type": "string",
"description": "Mathematical expression to evaluate, needs to be a valid Python expression",
}
},
"required": ["expression"],
},
},
},
{
"type": "function",
"function": {
"name": "translate_text",
"description": "Translate text to another language",
"parameters": {
"type": "object",
"properties": {
"text": {"type": "string", "description": "Text to translate"},
"target_language": {
"type": "string",
"description": "Target language for translation",
},
},
"required": ["text", "target_language"],
},
},
},
]
# Map of function names to implementations
tool_functions = {
"get_weather": get_weather,
"calculate_expression": calculate_expression,
"translate_text": translate_text,
}
def process_stream(response, tool_functions, original_query):
"""Process a streaming response with possible tool calls"""
# Track multiple tool calls
tool_calls = {} # Dictionary to store tool calls by ID
current_id = None
print("\n--- Stream Output ---")
for chunk in response:
# Handle tool calls in the stream
if chunk.choices[0].delta.tool_calls:
for tool_call_chunk in chunk.choices[0].delta.tool_calls:
# Get the tool call ID
if hasattr(tool_call_chunk, "id") and tool_call_chunk.id:
current_id = tool_call_chunk.id
if current_id not in tool_calls:
tool_calls[current_id] = {
"function_name": None,
"function_args": "",
"function_id": current_id,
}
# Extract function information as it comes in chunks
if (
hasattr(tool_call_chunk, "function")
and current_id
and current_id in tool_calls
):
if (
hasattr(tool_call_chunk.function, "name")
and tool_call_chunk.function.name
):
tool_calls[current_id]["function_name"] = (
tool_call_chunk.function.name
)
print(f"Function called: {tool_call_chunk.function.name}")
if (
hasattr(tool_call_chunk.function, "arguments")
and tool_call_chunk.function.arguments
):
tool_calls[current_id]["function_args"] += (
tool_call_chunk.function.arguments
)
print(f"Arguments chunk: {tool_call_chunk.function.arguments}")
# Handle regular content in the stream
elif chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="")
print("\n--- End Stream ---\n")
# Execute each function call and build messages for follow-up
follow_up_messages = [{"role": "user", "content": original_query}]
for tool_id, tool_data in tool_calls.items():
function_name = tool_data["function_name"]
function_args = tool_data["function_args"]
function_id = tool_data["function_id"]
if function_name and function_args:
try:
# Parse the JSON arguments
args = json.loads(function_args)
# Call the function with the arguments
function_result = tool_functions[function_name](**args)
print(
f"\n--- Function Result ({function_name}) ---\n{function_result}\n"
)
# Add the assistant message with tool call
follow_up_messages.append(
{
"role": "assistant",
"tool_calls": [
{
"id": function_id,
"type": "function",
"function": {
"name": function_name,
"arguments": function_args,
},
}
],
}
)
# Add the tool message with function result
follow_up_messages.append(
{
"role": "tool",
"tool_call_id": function_id,
"content": function_result,
}
)
except Exception as e:
print(f"Error executing function: {e}")
# Only send follow-up if we have results to process
if len(follow_up_messages) > 1:
# Create a follow-up message with all the function results
follow_up_response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=follow_up_messages,
stream=True,
)
print("\n--- Follow-up Response ---")
for chunk in follow_up_response:
if chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="")
print("\n--- End Follow-up ---\n")
def run_test_case(query, test_name):
"""Run a single test case with the given query"""
print(f"\n{'=' * 50}\nTEST CASE: {test_name}\n{'=' * 50}")
print(f"Query: '{query}'")
start_time = time.time()
# Create streaming chat completion request
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": query}],
tools=tools,
tool_choice="auto",
stream=True,
)
# Process the streaming response
process_stream(response, tool_functions, query)
end_time = time.time()
print(f"Test completed in {end_time - start_time:.2f} seconds")
def main():
# Initialize OpenAI client
global client
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Run test cases
test_cases = [
("I want to know the weather in San Francisco", "Weather Information"),
("Calculate 25 * 17 + 31", "Math Calculation"),
("Translate 'Hello world' to Spanish", "Text Translation"),
("What is the weather in Tokyo and New York in celsius", "Multiple Tool Usage"),
]
# Execute all test cases
for query, test_name in test_cases:
run_test_case(query, test_name)
time.sleep(1) # Small delay between tests
print("\nAll tests completed.")
if __name__ == "__main__":
main()

View File

@ -1,5 +1,23 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This script demonstrates how to use the vLLM API server to perform audio
transcription with the `openai/whisper-large-v3` model.
Before running this script, you must start the vLLM server with the following command:
vllm serve openai/whisper-large-v3
Requirements:
- vLLM with audio support
- openai Python SDK
- httpx for streaming support
The script performs:
1. Synchronous transcription using OpenAI-compatible API.
2. Streaming transcription using raw HTTP request to the vLLM server.
"""
import asyncio
import json
@ -21,6 +39,9 @@ client = OpenAI(
def sync_openai():
"""
Perform synchronous transcription using OpenAI-compatible API.
"""
with open(str(mary_had_lamb), "rb") as f:
transcription = client.audio.transcriptions.create(
file=f,
@ -37,11 +58,11 @@ def sync_openai():
print("transcription result:", transcription.text)
sync_openai()
# OpenAI Transcription API client does not support streaming.
async def stream_openai_response():
"""
Perform streaming transcription using vLLM's raw HTTP streaming API.
"""
data = {
"language": "en",
"stream": True,
@ -68,7 +89,15 @@ async def stream_openai_response():
# Extract and print the content
content = chunk["choices"][0].get("delta", {}).get("content")
print(content, end="")
print() # Final newline after stream ends
# Run the asynchronous function
asyncio.run(stream_openai_response())
def main():
sync_openai()
# Run the asynchronous function
asyncio.run(stream_openai_response())
if __name__ == "__main__":
main()

Some files were not shown because too many files have changed in this diff Show More