updated

Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
updated
2025-03-24 01:00:20 +00:00 · 2025-03-24 00:46:46 +00:00 · 2025-03-23 23:03:42 +00:00 · 2025-03-23 22:56:53 +00:00 · 2025-03-23 22:50:20 +00:00 · 2025-03-23 22:44:36 +00:00
654 changed files with 38211 additions and 12964 deletions
--- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
@ -4,8 +4,8 @@ tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
-    value: 0.233
+    value: 0.231
  - name: "exact_match,flexible-extract"
-    value: 0.236
+    value: 0.22
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -13,6 +13,7 @@ from pathlib import Path

 import lm_eval
 import numpy
+import pytest
 import yaml

 RTOL = 0.05
@ -46,6 +47,10 @@ def test_lm_eval_correctness():
    eval_config = yaml.safe_load(
        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))

+    if eval_config[
+            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
+        pytest.skip("FBGEMM is currently failing on main.")
+
    # Launch eval requests.
    results = launch_lm_eval(eval_config)

--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -361,7 +361,7 @@ main() {
  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn of the reporting of the status of each request, to clean up the terminal output
-  export VLLM_LOG_LEVEL="WARNING"
+  export VLLM_LOGGING_LEVEL="WARNING"

  # prepare for benchmarking
  cd benchmarks || exit 1
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -57,8 +57,6 @@ steps:
    agents:
      queue: tpu_queue_postmerge
    commands:
-      - "rm -f /var/log/syslog"
-      - "rm -f /var/log/kern.log"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
      - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
@ -84,7 +82,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@ -101,16 +101,30 @@ if [[ $commands == *" kernels "* ]]; then
  --ignore=kernels/test_permute_cols.py"
 fi

-#ignore certain Entrypoints tests
+#ignore certain Entrypoints/openai tests
 if [[ $commands == *" entrypoints/openai "* ]]; then
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
-  --ignore=entrypoints/openai/test_accuracy.py \
  --ignore=entrypoints/openai/test_audio.py \
-  --ignore=entrypoints/openai/test_encoder_decoder.py \
-  --ignore=entrypoints/openai/test_embedding.py \
-  --ignore=entrypoints/openai/test_oot_registration.py "}
+  --ignore=entrypoints/openai/test_chat.py \
+  --ignore=entrypoints/openai/test_shutdown.py \
+  --ignore=entrypoints/openai/test_completion.py \
+  --ignore=entrypoints/openai/test_sleep.py \
+  --ignore=entrypoints/openai/test_models.py \
+  --ignore=entrypoints/openai/test_prompt_validation.py "}
 fi

+#ignore certain Entrypoints/llm tests
+if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
+  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
+fi
+
+# --ignore=entrypoints/openai/test_encoder_decoder.py \
+# --ignore=entrypoints/openai/test_embedding.py \
+# --ignore=entrypoints/openai/test_oot_registration.py
+# --ignore=entrypoints/openai/test_accuracy.py \
+# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
+
+
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
 if [[ $commands == *"--shard-id="* ]]; then
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@ -44,11 +44,11 @@ remove_docker_container() {
 trap remove_docker_container EXIT

 # Run the image
-docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
+docker run --rm -it --device=/dev/neuron0 --network bridge \
       -v "${HF_CACHE}:${HF_MOUNT}" \
       -e "HF_HOME=${HF_MOUNT}" \
       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
       --name "${container_name}" \
       ${image_name} \
-       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@ -1,16 +0,0 @@
-#!/bin/bash
-
-# This script build the OpenVINO docker image and run the offline inference inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -ex
-
-# Try building the docker image
-docker build -t openvino-test -f Dockerfile.openvino .
-
-# Setup cleanup
-remove_docker_container() { docker rm -f openvino-test || true; }
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m
--- a/.buildkite/run-tpu-v1-test.sh
+++ b/.buildkite/run-tpu-v1-test.sh
@ -19,7 +19,20 @@ docker run --privileged --net host --shm-size=16G -it \
    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
    && python3 -m pip install pytest \
    && python3 -m pip install lm_eval[api]==0.4.4 \
-    && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+    && export VLLM_USE_V1=1 \
+    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
+    && echo TEST_1 \
    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
-    && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
-    && python3 /workspace/vllm/examples/offline_inference/tpu.py"
+    && echo TEST_2 \
+    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
+    && echo TEST_3 \
+    && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
+    && echo TEST_4 \
+    && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+    && echo TEST_5 \
+    && python3 /workspace/vllm/examples/offline_inference/tpu.py" \
+
+
+# TODO: This test fails because it uses RANDOM_SEED sampling
+# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@ -4,16 +4,28 @@
 # It serves a sanity check for compilation and basic model usage.
 set -ex

+image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
+container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
 # Try building the docker image
-docker build -t xpu-test -f Dockerfile.xpu .
+docker build -t ${image_name} -f Dockerfile.xpu .

 # Setup cleanup
-remove_docker_container() { docker rm -f xpu-test || true; }
+remove_docker_container() { 
+  docker rm -f "${container_name}" || true; 
+  docker image rm -f "${image_name}" || true;
+  docker system prune -f || true;
+}
 trap remove_docker_container EXIT
-remove_docker_container

 # Run the image and test offline inference/tensor parallel
-docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
+docker run \
+    --device /dev/dri \
+    -v /dev/dri/by-path:/dev/dri/by-path \
+    --entrypoint="" \
+    --name "${container_name}" \
+    "${image_name}" \
+    sh -c '
+    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -41,7 +41,6 @@ steps:
  - grep \"sig sig-object py\" build/html/api/inference_params.html

 - label: Async Engine, Inputs, Utils, Worker Test # 24min
-  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/mq_llm_engine
@ -118,15 +117,14 @@ steps:
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
-  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
  - pytest -v -s entrypoints/test_chat_utils.py
-  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Distributed Tests (4 GPUs) # 10min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
-  fast_check: true
  source_file_dependencies:
  - vllm/distributed/
  - vllm/core/
@ -138,7 +136,11 @@ steps:
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
  commands:
-  - VLLM_USE_V1=1 python3 ../examples/offline_inference/data_parallel.py
+  # test with tp=2 and external_dp=2
+  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with internal dp
+  - python3 ../examples/offline_inference/data_parallel.py
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
@ -152,7 +154,6 @@ steps:

 - label: Metrics, Tracing Test # 10min
  num_gpus: 2
-  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/metrics
@ -200,16 +201,19 @@ steps:
    - tests/v1
  commands:
    # split the test to avoid interference
-    - VLLM_USE_V1=1 pytest -v -s v1/core
-    - VLLM_USE_V1=1 pytest -v -s v1/engine
-    - VLLM_USE_V1=1 pytest -v -s v1/sample
-    - VLLM_USE_V1=1 pytest -v -s v1/worker
-    - VLLM_USE_V1=1 pytest -v -s v1/structured_output
-    - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
-    - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
+    - pytest -v -s v1/core
+    - pytest -v -s v1/entrypoints
+    - pytest -v -s v1/engine
+    - pytest -v -s v1/entrypoints
+    - pytest -v -s v1/sample
+    - pytest -v -s v1/worker
+    - pytest -v -s v1/structured_output
+    - pytest -v -s v1/test_stats.py
+    - pytest -v -s v1/test_utils.py
+    - pytest -v -s v1/test_oracle.py
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-    - VLLM_USE_V1=1 pytest -v -s v1/e2e
+    - pytest -v -s v1/e2e
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@ -227,14 +231,17 @@ steps:
    - python3 offline_inference/basic/chat.py
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
-    - python3 offline_inference/vision_language.py
-    - python3 offline_inference/vision_language_multi_image.py
-    - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 offline_inference/audio_language.py --seed 0
+    - python3 offline_inference/vision_language.py --seed 0
+    - python3 offline_inference/vision_language_embedding.py --seed 0
+    - python3 offline_inference/vision_language_multi_image.py --seed 0
+    - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/encoder_decoder.py
+    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
-    - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
+    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

 - label: Prefix Caching Test # 9min
  mirror_hardwares: [amd]
@ -284,7 +291,6 @@ steps:
  parallelism: 4

 - label: PyTorch Fullgraph Smoke Test # 9min
-  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/compile
@ -293,6 +299,7 @@ steps:
  # these tests need to be separated, cannot combine
  - pytest -v -s compile/piecewise/test_simple.py
  - pytest -v -s compile/piecewise/test_toy_llama.py
+  - pytest -v -s compile/test_pass_manager.py

 - label: PyTorch Fullgraph Test # 18min
  source_file_dependencies:
@ -379,7 +386,8 @@ steps:
  commands:
    - pytest -v -s models/test_transformers.py
    - pytest -v -s models/test_registry.py
-    - pytest -v -s models/test_initialization.py
+    # V1 Test: https://github.com/vllm-project/vllm/issues/14531
+    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py

 - label: Language Models Test (Standard) # 32min
  #mirror_hardwares: [amd]
@ -508,8 +516,6 @@ steps:
  - entrypoints/llm/test_collective_rpc.py
  commands:
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
-  - VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
-  - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@ -522,13 +528,12 @@ steps:
  # this test fails consistently.
  # TODO: investigate and fix
  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py

 - label: Plugin Tests (2 GPUs) # 40min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
-  fast_check: true
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
--- a/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
+++ b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
@ -1,28 +0,0 @@
-name: 🎲 Misc/random discussions that do not fit into the above categories.
-description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
-title: "[Misc]: "
-labels: ["misc"]
-
-body:
- type: markdown
-  attributes:
-    value: >
-      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: textarea
-  attributes:
-    label: Anything you want to discuss about vllm.
-    description: >
-      Anything you want to discuss about vllm.
-  validations:
-    required: true
- type: markdown
-  attributes:
-    value: >
-      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@ -1 +1,5 @@
 blank_issues_enabled: false
+contact_links:
+  - name: Questions
+    url: https://discuss.vllm.ai
+    about: Ask questions and discuss with other vLLM community members
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -53,7 +53,7 @@ repos:
    entry: tools/mypy.sh 0 "local"
    language: python
    types: [python]
-    additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
    stages: [pre-commit] # Don't run in CI
  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.9
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -46,8 +46,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")

 #
 # Try to find python package with an executable that exactly matches
@ -319,7 +319,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # Only build AllSpark kernels if we are building for at least some compatible archs.
  cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
-  if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND ALLSPARK_ARCHS)
+  if (ALLSPARK_ARCHS)
    set(ALLSPARK_SRCS
       "csrc/quantization/gptq_allspark/allspark_repack.cu"
       "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
@ -330,7 +330,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
  else()
    message(STATUS "Not building AllSpark kernels as no compatible archs found"
-                   " in CUDA target architectures, or CUDA not >= 12.0")
+                   " in CUDA target architectures")
  endif()


--- a/2
+++ b/2
@ -222,7 +222,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist

 RUN --mount=type=cache,target=/root/.cache/uv \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-    uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \
+    uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
 fi
 COPY examples examples

--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@ -1,29 +0,0 @@
-# The vLLM Dockerfile is used to construct vLLM image that can be directly used
-# to run the OpenAI compatible server.
-
-FROM ubuntu:22.04 AS dev
-
-RUN apt-get update -y && \
-    apt-get install -y \
-        git python3-pip \
-        ffmpeg libsm6 libxext6 libgl1
-WORKDIR /workspace
-
-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
-
-RUN python3 -m pip install -U pip
-# install build requirements
-RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements/build.txt
-# build vLLM with OpenVINO backend
-RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
-
-COPY examples/ /workspace/examples
-COPY benchmarks/ /workspace/benchmarks
-
-# install development dependencies (for testing)
-RUN python3 -m pip install -e tests/vllm_test_utils
-
-CMD ["/bin/bash"]
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@ -61,6 +61,7 @@ RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
    cd /install \
    && pip install -U -r requirements/rocm.txt \
+    && pip install -U -r requirements/rocm-test.txt \
    && pip uninstall -y vllm \
    && pip install *.whl

--- a/Dockerfile.rocm_base
+++ b/Dockerfile.rocm_base
@ -12,6 +12,8 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="b7d29fb"
 ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
+ARG AITER_BRANCH="21d47a9"
+ARG AITER_REPO="https://github.com/ROCm/aiter.git"

 FROM ${BASE_IMAGE} AS base

@ -129,8 +131,18 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
 RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl

+ARG AITER_REPO
+ARG AITER_BRANCH
+RUN git clone --recursive ${AITER_REPO}
+RUN cd aiter \
+    && git checkout ${AITER_BRANCH} \
+    && git submodule update --init --recursive \
+    && pip install -r requirements.txt \
+    && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter 
+
 ARG BASE_IMAGE
 ARG HIPBLASLT_BRANCH
+ARG HIPBLAS_COMMON_BRANCH
 ARG LEGACY_HIPBLASLT_OPTION
 ARG RCCL_BRANCH
 ARG RCCL_REPO
@ -155,4 +167,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
    && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
    && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
-    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt
+    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
+    && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
+    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@ -1,11 +1,7 @@
-FROM intel/deep-learning-essentials:2025.0.1-0-devel-ubuntu22.04 AS vllm-base
+# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually.
+FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base

-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
-    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
-    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
-    chmod 644 /usr/share/keyrings/intel-graphics.gpg
+RUN rm /etc/apt/sources.list.d/intel-graphics.list

 RUN apt-get update -y && \
    apt-get install -y --no-install-recommends --fix-missing \
@ -21,8 +17,6 @@ RUN apt-get update -y && \
    python3 \
    python3-dev \
    python3-pip \
-    libze-intel-gpu-dev \
-    libze-intel-gpu1 \
    wget

 WORKDIR /workspace/vllm
--- a/README.md
+++ b/README.md
@ -10,37 +10,25 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>

 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>

 ---

-We’re excited to invite you to the first **vLLM China Meetup** on **March 16** in **Beijing**!  
+[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!

-Join us to connect with the **vLLM team** and explore how vLLM is leveraged in **post-training, fine-tuning, and deployment**, including [verl](https://github.com/volcengine/verl), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), and [vllm-ascend](https://github.com/vllm-project/vllm-ascend).
-
-👉 **[Register Now](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)** to be part of the discussion!  
+[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)

 ---

 *Latest News* 🔥

+- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
+- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
 - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
+- [2024/12] vLLM joins [PyTorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!

 ---

@ -151,10 +139,11 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs

 ## Contact Us

- For technical questions and feature requests, please use GitHub issues or discussions.
- For discussing with fellow users and coordinating contributions and development, please use Slack.
- For security disclosures, please use GitHub's security advisory feature.
- For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
+- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
+- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
+- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
+- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
+- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)

 ## Media Kit

--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -42,21 +42,27 @@ become available.
    </tr>
    <tr>
      <td><strong>HuggingFace</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">🚧</td>
+      <td style="text-align: center;">🟡</td>
+      <td style="text-align: center;">🟡</td>
      <td>Specify your dataset path on HuggingFace</td>
    </tr>
    <tr>
      <td><strong>VisionArena</strong></td>
      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">🚧</td>
+      <td style="text-align: center;">✅</td>
      <td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
    </tr>
  </tbody>
 </table>
-✅: supported  
+
+✅: supported
+
 🚧: to be supported

+🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
+similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
+If you need support for other dataset formats, please consider contributing.
+
 **Note**: VisionArena’s `dataset-name` should be set to `hf`

 ---
@ -76,10 +82,10 @@ Then run the benchmarking script
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
 NUM_PROMPTS=10
-BACKEND="openai-chat"
+BACKEND="vllm"
 DATASET_NAME="sharegpt"
 DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
-python3 benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/chat/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
+python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
 ```

 If successful, you will see the following output
@ -123,7 +129,7 @@ DATASET_NAME="hf"
 DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
 DATASET_SPLIT='train'

-python3 benchmarks/benchmark_serving.py \
+python3 vllm/benchmarks/benchmark_serving.py \
  --backend "${BACKEND}" \
  --model "${MODEL_NAME}" \
  --endpoint "/v1/chat/completions" \
@ -133,6 +139,57 @@ python3 benchmarks/benchmark_serving.py \
  --num-prompts "${NUM_PROMPTS}"
 ```

+### HuggingFaceDataset Examples
+
+Currently, HuggingFaceDataset only supports dataset formats
+similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
+formats, please consider contributing.
+
+```bash
+# need a model with vision capability here
+vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
+```
+
+**`lmms-lab/LLaVA-OneVision-Data`**
+
+```bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+BACKEND="openai-chat"
+DATASET_NAME="hf"
+DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
+DATASET_SPLIT='train'
+DATASET_SUBSET='chart2text(cauldron)'
+python3 vllm/benchmarks/benchmark_serving.py \
+  --backend "${BACKEND}" \
+  --model "${MODEL_NAME}" \
+  --endpoint "/v1/chat/completions" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --hf-split "${DATASET_SPLIT}" \
+  --num-prompts "${NUM_PROMPTS}" \
+  --hf-subset "${DATASET_SUBSET}"
+```
+
+**`Aeala/ShareGPT_Vicuna_unfiltered`**
+
+```bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+BACKEND="openai-chat"
+DATASET_NAME="hf"
+DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
+DATASET_SPLIT='train'
+python3 vllm/benchmarks/benchmark_serving.py \
+  --backend "${BACKEND}" \
+  --model "${MODEL_NAME}" \
+  --endpoint "/v1/chat/completions" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --hf-split "${DATASET_SPLIT}" \
+  --num-prompts "${NUM_PROMPTS}" \
+```
+
 ---
 ## Example - Offline Throughput Benchmark

@ -140,35 +197,65 @@ python3 benchmarks/benchmark_serving.py \
 MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
 NUM_PROMPTS=10
 DATASET_NAME="sonnet"
-DATASET_PATH="benchmarks/sonnet.txt"
+DATASET_PATH="vllm/benchmarks/sonnet.txt"

-python3 benchmarks/benchmark_throughput.py \
+python3 vllm/benchmarks/benchmark_throughput.py \
  --model "${MODEL_NAME}" \
  --dataset-name "${DATASET_NAME}" \
  --dataset-path "${DATASET_PATH}" \
  --num-prompts "${NUM_PROMPTS}"
-  ```
+```

 If successful, you will see the following output

 ```
-Throughput: 7.35 requests/s, 4789.20 total tokens/s, 1102.83 output tokens/s
+Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
+Total num prompt tokens:  5014
+Total num output tokens:  1500
+```
+
+### VisionArena Benchmark for Vision Language Models
+
+``` bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+DATASET_NAME="hf"
+DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
+DATASET_SPLIT="train"
+
+python3 vllm/benchmarks/benchmark_throughput.py \
+  --model "${MODEL_NAME}" \
+  --backend "vllm-chat" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --num-prompts "${NUM_PROMPTS}" \
+  --hf-split "${DATASET_SPLIT}"
+```
+
+The `num prompt tokens` now includes image token counts
+
+```
+Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
+Total num prompt tokens:  14527
+Total num output tokens:  1280
 ```

 ### Benchmark with LoRA Adapters

 ``` bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 MODEL_NAME="meta-llama/Llama-2-7b-hf"
 BACKEND="vllm"
 DATASET_NAME="sharegpt"
-DATASET_PATH="/home/jovyan/data/vllm_benchmark_datasets/ShareGPT_V3_unfiltered_cleaned_split.json"
+DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
 NUM_PROMPTS=10
 MAX_LORAS=2
 MAX_LORA_RANK=8
 ENABLE_LORA="--enable-lora"
 LORA_PATH="yard1/llama-2-7b-sql-lora-test"

-python3 benchmarks/benchmark_throughput.py \
+python3 vllm/benchmarks/benchmark_throughput.py \
  --model "${MODEL_NAME}" \
  --backend "${BACKEND}" \
  --dataset_path "${DATASET_PATH}" \
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -14,7 +14,8 @@ from tqdm.asyncio import tqdm
 from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)

-from vllm.model_executor.model_loader.weight_utils import get_lock
+# NOTE(simon): do not import vLLM here so the benchmark script
+# can run without vLLM installed.

 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)

@ -62,7 +63,7 @@ async def async_request_tgi(
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
            "truncate": request_func_input.prompt_len,
-            # TGI does not accept ignore_eos flag.
+            "ignore_eos_token": request_func_input.ignore_eos,
        }
        payload = {
            "inputs": request_func_input.prompt,
@ -70,6 +71,10 @@ async def async_request_tgi(
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
+        if request_func_input.ignore_eos:
+            output.output_tokens = request_func_input.output_len
+        else:
+            output.output_tokens = None

        ttft = 0.0
        st = time.perf_counter()
@ -333,7 +338,7 @@ async def async_request_openai_chat_completions(
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
-        "chat/completions"
+        ("chat/completions", "profile")
    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

    async with aiohttp.ClientSession(trust_env=True,
@ -427,6 +432,8 @@ def get_model(pretrained_model_name_or_path: str) -> str:
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
        from modelscope import snapshot_download

+        from vllm.model_executor.model_loader.weight_utils import get_lock
+
        # Use file lock to prevent multiple processes from
        # downloading the same model weights at the same time.
        with get_lock(pretrained_model_name_or_path):
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@ -17,6 +17,7 @@ SampleRequest instances, similar to the approach used in ShareGPT.
 import base64
 import io
 import json
+import logging
 import random
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
@ -35,6 +36,8 @@ from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer

+logger = logging.getLogger(__name__)
+
 # -----------------------------------------------------------------------------
 # Data Classes
 # -----------------------------------------------------------------------------
@ -46,7 +49,7 @@ class SampleRequest:
    Represents a single inference request for benchmarking.
    """

-    prompt: str
+    prompt: Union[str, Any]
    prompt_len: int
    expected_output_len: int
    multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
@ -61,9 +64,6 @@ class SampleRequest:
 class BenchmarkDataset(ABC):
    DEFAULT_SEED = 0

-    # num_requests has default 1000 in both the benchmark_serving.py and
-    # benchmark_throughput.py
-
    def __init__(
        self,
        dataset_path: Optional[str] = None,
@ -84,13 +84,27 @@ class BenchmarkDataset(ABC):
                            if random_seed is not None else self.DEFAULT_SEED)
        self.data = None

+    def apply_multimodal_chat_transformation(
+            self,
+            prompt: str,
+            mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
+        """
+        Transform a prompt and optional multimodal content into a chat format.
+        This method is used for chat models that expect a specific conversation
+        format.
+        """
+        content = [{"text": prompt, "type": "text"}]
+        if mm_content is not None:
+            content.append(mm_content)
+        return [{"role": "user", "content": content}]
+
    def load_data(self) -> None:
        """
        Load data from the dataset path into self.data.
-        
+
        This method must be overridden by subclasses since the method to load
        data will vary depending on the dataset format and source.
-        
+
        Raises:
            NotImplementedError: If a subclass does not implement this method.
        """
@ -107,18 +121,18 @@ class BenchmarkDataset(ABC):
        """
        Optionally select a random LoRA request and return its associated
        tokenizer.
-        
+
        This method is used when LoRA parameters are provided.  It randomly
        selects a LoRA based on max_loras and retrieves a cached tokenizer for
        that LoRA if available. Otherwise, it returns the base tokenizer.
-        
+
        Args:
            tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
            LoRA is selected.  max_loras (Optional[int]): The maximum number of
            LoRAs available. If None, LoRA is not used.  lora_path
            (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA
            is not used.
-        
+
        Returns:
            tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first
            element is a LoRARequest (or None if not applicable) and the second
@ -146,21 +160,39 @@ class BenchmarkDataset(ABC):
               num_requests: int) -> list[SampleRequest]:
        """
        Abstract method to generate sample requests from the dataset.
-        
+
        Subclasses must override this method to implement dataset-specific logic
        for generating a list of SampleRequest objects.
-        
+
        Args:
            tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
             for processing the dataset's text.
            num_requests (int): The number of sample requests to generate.
-        
+
        Returns:
            list[SampleRequest]: A list of sample requests generated from the
            dataset.
        """
        raise NotImplementedError("sample must be implemented in subclasses.")

+    def maybe_oversample_requests(self, requests: list[SampleRequest],
+                                  num_requests: int) -> None:
+        """
+        Oversamples the list of requests if its size is less than the desired
+        number.
+
+        Args:
+            requests (List[SampleRequest]): The current list of sampled
+            requests.  num_requests (int): The target number of requests.
+        """
+        if len(requests) < num_requests:
+            random.seed(self.random_seed)
+            additional = random.choices(requests,
+                                        k=num_requests - len(requests))
+            requests.extend(additional)
+            logger.info("Oversampled requests to reach %d total samples.",
+                        num_requests)
+

 # -----------------------------------------------------------------------------
 # Utility Functions and Global Caches
@ -262,15 +294,16 @@ class RandomDataset(BenchmarkDataset):
    ) -> None:
        super().__init__(**kwargs)

-    def sample(self,
-               tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               prefix_len: int = DEFAULT_PREFIX_LEN,
-               range_ratio: float = DEFAULT_RANGE_RATIO,
-               input_len: int = DEFAULT_INPUT_LEN,
-               output_len: int = DEFAULT_OUTPUT_LEN,
-               **kwargs) -> list[SampleRequest]:
-
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        prefix_len: int = DEFAULT_PREFIX_LEN,
+        range_ratio: float = DEFAULT_RANGE_RATIO,
+        input_len: int = DEFAULT_INPUT_LEN,
+        output_len: int = DEFAULT_OUTPUT_LEN,
+        **kwargs,
+    ) -> list[SampleRequest]:
        vocab_size = tokenizer.vocab_size

        prefix_token_ids = (np.random.randint(
@ -332,19 +365,24 @@ class ShareGPTDataset(BenchmarkDataset):
        random.seed(self.random_seed)
        random.shuffle(self.data)

-    def sample(self,
-               tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               lora_path: Optional[str] = None,
-               max_loras: Optional[int] = None,
-               output_len: Optional[int] = None,
-               **kwargs) -> list:
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        lora_path: Optional[str] = None,
+        max_loras: Optional[int] = None,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        **kwargs,
+    ) -> list:
        samples: list = []
        for entry in self.data:
            if len(samples) >= num_requests:
                break
-            prompt, completion = entry["conversations"][0]["value"],\
-                entry["conversations"][1]["value"]
+            prompt, completion = (
+                entry["conversations"][0]["value"],
+                entry["conversations"][1]["value"],
+            )

            lora_request, tokenizer = self.get_random_lora_request(
                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
@ -358,6 +396,9 @@ class ShareGPTDataset(BenchmarkDataset):
                                     skip_min_output_len_check=output_len
                                     is not None):
                continue
+            if enable_multimodal_chat:
+                prompt = self.apply_multimodal_chat_transformation(
+                    prompt, None)
            samples.append(
                SampleRequest(
                    prompt=prompt,
@ -365,6 +406,7 @@ class ShareGPTDataset(BenchmarkDataset):
                    expected_output_len=new_output_len,
                    lora_request=lora_request,
                ))
+        self.maybe_oversample_requests(samples, num_requests)
        return samples


@ -397,19 +439,20 @@ class SonnetDataset(BenchmarkDataset):
        with open(self.dataset_path, encoding="utf-8") as f:
            self.data = f.readlines()

-    def sample(self,
-               tokenizer,
-               num_requests: int,
-               prefix_len: int = DEFAULT_PREFIX_LEN,
-               input_len: int = DEFAULT_INPUT_LEN,
-               output_len: int = DEFAULT_OUTPUT_LEN,
-               return_prompt_formatted: bool = False,
-               **kwargs) -> list:
+    def sample(
+        self,
+        tokenizer,
+        num_requests: int,
+        prefix_len: int = DEFAULT_PREFIX_LEN,
+        input_len: int = DEFAULT_INPUT_LEN,
+        output_len: int = DEFAULT_OUTPUT_LEN,
+        return_prompt_formatted: bool = False,
+        **kwargs,
+    ) -> list:
        # Calculate average token length for a poem line.
        tokenized_lines = [tokenizer(line).input_ids for line in self.data]
        avg_len = sum(len(tokens)
-                      for tokens in \
-                        tokenized_lines) / len(tokenized_lines)
+                      for tokens in tokenized_lines) / len(tokenized_lines)

        # Build the base prompt.
        base_prompt = "Pick as many lines as you can from these poem lines:\n"
@ -488,12 +531,14 @@ class BurstGPTDataset(BenchmarkDataset):
        # Convert the dataframe to a list of lists.
        return data.values.tolist()

-    def sample(self,
-               tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               max_loras: Optional[int] = None,
-               lora_path: Optional[str] = None,
-               **kwargs) -> list[SampleRequest]:
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        max_loras: Optional[int] = None,
+        lora_path: Optional[str] = None,
+        **kwargs,
+    ) -> list[SampleRequest]:
        samples = []
        data = self._sample_loaded_data(num_requests=num_requests)
        for i in range(num_requests):
@ -526,7 +571,6 @@ class HuggingFaceDataset(BenchmarkDataset):
    Dataset class for processing a HuggingFace dataset with conversation data
    and optional images.
    """
-    DEFAULT_NUM_REQUESTS = 1000

    def __init__(
        self,
@ -550,10 +594,13 @@ class HuggingFaceDataset(BenchmarkDataset):
            split=self.dataset_split,
            streaming=True,
        )
-
-        if "conversations" not in self.data.features:
-            raise ValueError("HF Dataset must have a 'conversations' column.")
-
+        if self.data.features is None or "conversations" \
+            not in self.data.features:
+            raise ValueError(
+                "HuggingFaceDataset currently only supports datasets with "
+                "a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
+                "Please consider contributing if you would like to add "
+                "support for additional dataset formats.")
        # Shuffle and filter examples with at least 2 conversations.
        self.data = self.data.shuffle(seed=self.random_seed).filter(
            lambda x: len(x["conversations"]) >= 2)
@ -561,9 +608,8 @@ class HuggingFaceDataset(BenchmarkDataset):
    def sample(self,
               tokenizer: PreTrainedTokenizerBase,
               num_requests: int,
-               lora_path: Optional[str] = None,
-               max_loras: Optional[int] = None,
               output_len: Optional[int] = None,
+               enable_multimodal_chat: bool = False,
               **kwargs) -> list:
        sampled_requests = []
        dynamic_output = output_len is None
@ -571,13 +617,9 @@ class HuggingFaceDataset(BenchmarkDataset):
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
-
            conv = item["conversations"]
            prompt, completion = conv[0]["value"], conv[1]["value"]

-            lora_request, tokenizer = self.get_random_lora_request(
-                tokenizer, lora_path=lora_path, max_loras=max_loras)
-
            prompt_ids = tokenizer(prompt).input_ids
            completion_ids = tokenizer(completion).input_ids
            prompt_len = len(prompt_ids)
@ -587,17 +629,22 @@ class HuggingFaceDataset(BenchmarkDataset):
            if dynamic_output and not is_valid_sequence(
                    prompt_len, completion_len):
                continue
-
            mm_content = process_image(
                item["image"]) if "image" in item else None
+            if enable_multimodal_chat:
+                # Note: when chat is enabled the request prompt_len is no longer
+                # accurate and we will be using request output to count the
+                # actual prompt len and output len
+                prompt = self.apply_multimodal_chat_transformation(
+                    prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
-                    lora_request=lora_request,
                ))
+        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests


@ -606,25 +653,19 @@ class HuggingFaceDataset(BenchmarkDataset):
 # -----------------------------------------------------------------------------


-class VisionArenaDataset(BenchmarkDataset):
+class VisionArenaDataset(HuggingFaceDataset):
    """
    Vision Arena Dataset.
    """

    DEFAULT_OUTPUT_LEN = 128
-    DEFAULT_NUM_REQUESTS = 1000
    VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1"

    def __init__(
        self,
-        dataset_split: str,
-        dataset_subset: Optional[str] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
-        self.dataset_split = dataset_split
-        self.dataset_subset = dataset_subset
-
        if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
            raise ValueError(f"Only support Vision Arena dataset.\
                    This data path {self.dataset_path} is not valid.")
@ -642,12 +683,14 @@ class VisionArenaDataset(BenchmarkDataset):
        )
        self.data = dataset.shuffle(seed=self.random_seed)

-    def sample(self,
-               tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               output_len: int = DEFAULT_OUTPUT_LEN,
-               **kwargs) -> list:
-        # TODO (jenniferzhao): Add support for offline benchmark sampling
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        **kwargs,
+    ) -> list:
        output_len = (output_len
                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
        sampled_requests = []
@ -655,8 +698,14 @@ class VisionArenaDataset(BenchmarkDataset):
            if len(sampled_requests) >= num_requests:
                break
            prompt = item["turns"][0][0]["content"]
-            prompt_len = len(tokenizer(prompt).input_ids)
            mm_content = process_image(item["images"][0])
+            prompt_len = len(tokenizer(prompt).input_ids)
+            if enable_multimodal_chat:
+                # Note: when chat is enabled the request prompt_len is no longer
+                # accurate and we will be using request output to count the
+                # actual prompt len
+                prompt = self.apply_multimodal_chat_transformation(
+                    prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
@ -664,4 +713,5 @@ class VisionArenaDataset(BenchmarkDataset):
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                ))
+        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -684,6 +684,15 @@ def main(args: argparse.Namespace):
                        "Invalid metadata format. Please use KEY=VALUE format."
                    )

+        if not args.save_detailed:
+            # Remove fields with too many data points
+            for field in [
+                    "input_lens", "output_lens", "ttfts", "itls",
+                    "generated_texts", "errors"
+            ]:
+                if field in result_json:
+                    del result_json[field]
+
        # Traffic
        result_json["request_rate"] = (args.request_rate if args.request_rate
                                       < float("inf") else "inf")
@ -828,6 +837,12 @@ if __name__ == "__main__":
        action="store_true",
        help="Specify to save benchmark results to a json file",
    )
+    parser.add_argument(
+        "--save-detailed",
+        action="store_true",
+        help="When saving the results, whether to include per request "
+        "information such as response, error, ttfs, tpots, etc.",
+    )
    parser.add_argument(
        "--metadata",
        metavar="KEY=VALUE",
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -732,8 +732,11 @@ def main(args: argparse.Namespace):
        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
        base_url = f"http://{args.host}:{args.port}"

-    tokenizer = get_tokenizer(tokenizer_id,
-                              trust_remote_code=args.trust_remote_code)
+    tokenizer = get_tokenizer(
+        tokenizer_id,
+        trust_remote_code=args.trust_remote_code,
+        tokenizer_mode=args.tokenizer_mode,
+    )

    if args.dataset == 'grammar':
        args.structure_type = 'guided_grammar'
@ -876,6 +879,13 @@ if __name__ == "__main__":
        help=
        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
+    parser.add_argument(
+        "--tokenizer-mode",
+        type=str,
+        default="auto",
+        help=
+        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+    )
    parser.add_argument(
        "--num-prompts",
        type=int,
@ -989,11 +999,12 @@ if __name__ == "__main__":
                        type=float,
                        default=1.0,
                        help="Ratio of Structured Outputs requests")
-    parser.add_argument("--structured-output-backend",
-                        type=str,
-                        choices=["outlines", "lm-format-enforcer", "xgrammar"],
-                        default="xgrammar",
-                        help="Backend to use for structured outputs")
+    parser.add_argument(
+        "--structured-output-backend",
+        type=str,
+        choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"],
+        default="xgrammar",
+        help="Backend to use for structured outputs")

    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -11,8 +11,9 @@ from typing import Any, Optional, Union

 import torch
 import uvloop
-from benchmark_dataset import (BurstGPTDataset, RandomDataset, SampleRequest,
-                               ShareGPTDataset, SonnetDataset)
+from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
+                               RandomDataset, SampleRequest, ShareGPTDataset,
+                               SonnetDataset, VisionArenaDataset)
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
@ -23,6 +24,7 @@ from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
 from vllm.inputs import TextPrompt, TokensPrompt
 from vllm.lora.request import LoRARequest
+from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators

@ -32,7 +34,7 @@ def run_vllm(
    n: int,
    engine_args: EngineArgs,
    disable_detokenize: bool = False,
-) -> float:
+) -> tuple[float, Optional[list[RequestOutput]]]:
    from vllm import LLM, SamplingParams
    llm = LLM(**dataclasses.asdict(engine_args))
    assert all(
@ -66,12 +68,13 @@ def run_vllm(

    use_beam_search = False

+    outputs = None
    if not use_beam_search:
        start = time.perf_counter()
-        llm.generate(prompts,
-                     sampling_params,
-                     lora_request=lora_requests,
-                     use_tqdm=True)
+        outputs = llm.generate(prompts,
+                               sampling_params,
+                               lora_request=lora_requests,
+                               use_tqdm=True)
        end = time.perf_counter()
    else:
        assert lora_requests is None, "BeamSearch API does not support LoRA"
@ -89,7 +92,46 @@ def run_vllm(
                ignore_eos=True,
            ))
        end = time.perf_counter()
-    return end - start
+    return end - start, outputs
+
+
+def run_vllm_chat(
+        requests: list[SampleRequest],
+        n: int,
+        engine_args: EngineArgs,
+        disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
+    """
+    Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
+    multimodal models as it properly handles multimodal inputs and chat
+    formatting. For non-multimodal models, use run_vllm() instead.
+    """
+    from vllm import LLM, SamplingParams
+    llm = LLM(**dataclasses.asdict(engine_args))
+
+    assert all(
+        llm.llm_engine.model_config.max_model_len >= (
+            request.prompt_len + request.expected_output_len)
+        for request in requests), (
+            "Please ensure that max_model_len is greater than the sum of "
+            "prompt_len and expected_output_len for all requests.")
+
+    prompts = []
+    sampling_params: list[SamplingParams] = []
+    for request in requests:
+        prompts.append(request.prompt)
+        sampling_params.append(
+            SamplingParams(
+                n=n,
+                temperature=1.0,
+                top_p=1.0,
+                ignore_eos=True,
+                max_tokens=request.expected_output_len,
+                detokenize=not disable_detokenize,
+            ))
+    start = time.perf_counter()
+    outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
+    end = time.perf_counter()
+    return end - start, outputs


 async def run_vllm_async(
@ -264,6 +306,8 @@ def get_requests(args, tokenizer):
        dataset_cls = RandomDataset
    elif args.dataset_name == "sharegpt":
        dataset_cls = ShareGPTDataset
+        if args.backend == "vllm-chat":
+            sample_kwargs["enable_multimodal_chat"] = True
    elif args.dataset_name == "sonnet":
        assert tokenizer.chat_template or tokenizer.default_chat_template, (
            "Tokenizer/model must have chat template for sonnet dataset.")
@ -272,6 +316,19 @@ def get_requests(args, tokenizer):
        sample_kwargs["return_prompt_formatted"] = True
    elif args.dataset_name == "burstgpt":
        dataset_cls = BurstGPTDataset
+    elif args.dataset_name == "hf":
+        if args.backend != "vllm-chat":
+            raise ValueError(
+                "hf datasets only are supported by vllm-chat backend")
+        # Choose between VisionArenaDataset and HuggingFaceDataset based on
+        # provided parameters.
+        dataset_cls = (VisionArenaDataset if args.dataset_path
+                       == VisionArenaDataset.VISION_ARENA_DATASET_PATH
+                       and args.hf_subset is None else HuggingFaceDataset)
+        common_kwargs['dataset_subset'] = args.hf_subset
+        common_kwargs['dataset_split'] = args.hf_split
+        sample_kwargs["enable_multimodal_chat"] = True
+
    else:
        raise ValueError(f"Unknown dataset name: {args.dataset_name}")
    # Remove None values
@ -290,6 +347,7 @@ def main(args: argparse.Namespace):
    requests = get_requests(args, tokenizer)
    is_multi_modal = any(request.multi_modal_data is not None
                         for request in requests)
+    request_outputs: Optional[list[RequestOutput]] = None
    if args.backend == "vllm":
        if args.async_engine:
            elapsed_time = uvloop.run(
@ -301,9 +359,9 @@ def main(args: argparse.Namespace):
                    args.disable_detokenize,
                ))
        else:
-            elapsed_time = run_vllm(requests, args.n,
-                                    EngineArgs.from_cli_args(args),
-                                    args.disable_detokenize)
+            elapsed_time, request_outputs = run_vllm(
+                requests, args.n, EngineArgs.from_cli_args(args),
+                args.disable_detokenize)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@ -312,20 +370,45 @@ def main(args: argparse.Namespace):
    elif args.backend == "mii":
        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                               args.output_len)
+    elif args.backend == "vllm-chat":
+        elapsed_time, request_outputs = run_vllm_chat(
+            requests, args.n, EngineArgs.from_cli_args(args),
+            args.disable_detokenize)
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
-    total_num_tokens = sum(request.prompt_len + request.expected_output_len
-                           for request in requests)
-    total_output_tokens = sum(request.expected_output_len
-                              for request in requests)
-    if is_multi_modal:
-        print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
+
+    if request_outputs:
+        # Note: with the vllm and vllm-chat backends,
+        # we have request_outputs, which we use to count tokens.
+        total_prompt_tokens = 0
+        total_output_tokens = 0
+        for ro in request_outputs:
+            if not isinstance(ro, RequestOutput):
+                continue
+            total_prompt_tokens += len(
+                ro.prompt_token_ids) if ro.prompt_token_ids else 0
+            total_output_tokens += sum(
+                len(o.token_ids) for o in ro.outputs if o)
+        total_num_tokens = total_prompt_tokens + total_output_tokens
+    else:
+        total_num_tokens = sum(r.prompt_len + r.expected_output_len
+                               for r in requests)
+        total_output_tokens = sum(r.expected_output_len for r in requests)
+        total_prompt_tokens = total_num_tokens - total_output_tokens
+
+    if is_multi_modal and args.backend != "vllm-chat":
+        print("\033[91mWARNING\033[0m: Multi-modal request with "
+              f"{args.backend} backend detected. The "
              "following metrics are not accurate because image tokens are not"
              " counted. See vllm-project/vllm/issues/9778 for details.")
        # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
+        # vllm-chat backend counts the image tokens now
+
    print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
          f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
          f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
+    print(f"Total num prompt tokens:  {total_prompt_tokens}")
+    print(f"Total num output tokens:  {total_output_tokens}")

    # Output JSON results if specified
    if args.output_json:
@ -341,17 +424,100 @@ def main(args: argparse.Namespace):
        save_to_pytorch_benchmark_format(args, results)


+def validate_args(args):
+    """
+    Validate command-line arguments.
+    """
+
+    # === Deprecation and Defaulting ===
+    if args.dataset is not None:
+        warnings.warn(
+            "The '--dataset' argument will be deprecated in the next release. "
+            "Please use '--dataset-name' and '--dataset-path' instead.",
+            stacklevel=2)
+        args.dataset_path = args.dataset
+
+    if not getattr(args, "tokenizer", None):
+        args.tokenizer = args.model
+
+    # === Backend Validation ===
+    valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
+    if args.backend not in valid_backends:
+        raise ValueError(f"Unsupported backend: {args.backend}")
+
+    # === Dataset Configuration ===
+    if not args.dataset and not args.dataset_path:
+        print(
+            "When dataset path is not set, it will default to random dataset")
+        args.dataset_name = 'random'
+        if args.input_len is None:
+            raise ValueError("input_len must be provided for a random dataset")
+
+    # === Dataset Name Specific Checks ===
+    # --hf-subset and --hf-split: only used
+    # when dataset_name is 'hf'
+    if args.dataset_name != "hf" and (
+            getattr(args, "hf_subset", None) is not None
+            or getattr(args, "hf_split", None) is not None):
+        warnings.warn("--hf-subset and --hf-split will be ignored \
+                since --dataset-name is not 'hf'.",
+                      stacklevel=2)
+    elif args.dataset_name == "hf" and args.backend != "vllm-chat":
+        raise ValueError(
+            "When --dataset-name is 'hf', backend must be 'vllm-chat'")
+
+    # --random-range-ratio: only used when dataset_name is 'random'
+    if args.dataset_name != 'random' and args.random_range_ratio is not None:
+        warnings.warn("--random-range-ratio will be ignored since \
+                --dataset-name is not 'random'.",
+                      stacklevel=2)
+
+    # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
+    # set.
+    if args.dataset_name not in {"random", "sonnet", None
+                                 } and args.prefix_len is not None:
+        warnings.warn("--prefix-len will be ignored since --dataset-name\
+                 is not 'random', 'sonnet', or not set.",
+                      stacklevel=2)
+
+    # === LoRA Settings ===
+    if getattr(args, "enable_lora", False) and args.backend != "vllm":
+        raise ValueError(
+            "LoRA benchmarking is only supported for vLLM backend")
+    if getattr(args, "enable_lora", False) and args.lora_path is None:
+        raise ValueError("LoRA path must be provided when enable_lora is True")
+
+    # === Backend-specific Validations ===
+    if args.backend == "hf" and args.hf_max_batch_size is None:
+        raise ValueError("HF max batch size is required for HF backend")
+    if args.backend != "hf" and args.hf_max_batch_size is not None:
+        raise ValueError("HF max batch size is only for HF backend.")
+
+    if args.backend in {"hf", "mii"} and getattr(args, "quantization",
+                                                 None) is not None:
+        raise ValueError("Quantization is only for vLLM backend.")
+
+    if args.backend == "mii" and args.dtype != "auto":
+        raise ValueError("dtype must be auto for MII backend.")
+    if args.backend == "mii" and args.n != 1:
+        raise ValueError("n must be 1 for MII backend.")
+    if args.backend == "mii" and args.tokenizer != args.model:
+        raise ValueError(
+            "Tokenizer must be the same as the model for MII backend.")
+
+
 if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument("--backend",
                        type=str,
-                        choices=["vllm", "hf", "mii"],
+                        choices=["vllm", "hf", "mii", "vllm-chat"],
                        default="vllm")
-    parser.add_argument("--dataset-name",
-                        type=str,
-                        choices=["sharegpt", "random", "sonnet", "burstgpt"],
-                        help="Name of the dataset to benchmark on.",
-                        default="sharegpt")
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
+        help="Name of the dataset to benchmark on.",
+        default="sharegpt")
    parser.add_argument(
        "--dataset",
        type=str,
@ -419,55 +585,24 @@ if __name__ == "__main__":
    parser.add_argument(
        "--random-range-ratio",
        type=float,
-        default=1.0,
+        default=None,
        help="Range of sampled ratio of input/output length, "
        "used only for RandomDataSet.",
    )

+    # hf dtaset
+    parser.add_argument("--hf-subset",
+                        type=str,
+                        default=None,
+                        help="Subset of the HF dataset.")
+    parser.add_argument("--hf-split",
+                        type=str,
+                        default=None,
+                        help="Split of the HF dataset.")
+
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
-    if args.dataset is not None:
-        warnings.warn(
-            "The '--dataset' argument will be deprecated in the next "
-            "release. Please use '--dataset-name' and "
-            "'--dataset-path' in the future runs.",
-            stacklevel=2)
-        args.dataset_path = args.dataset
-    if args.dataset is None and args.dataset_path is None:
-        # for random dataset, the default sampling setting is in
-        # benchmark_dataset.RandomDataset
-        print("When dataset is not set, it will default to random dataset")
-    else:
-        assert args.input_len is None
-    if args.enable_lora:
-        assert args.lora_path is not None
-
-    if args.backend == "vllm":
-        if args.hf_max_batch_size is not None:
-            raise ValueError("HF max batch size is only for HF backend.")
-    elif args.backend == "hf":
-        if args.hf_max_batch_size is None:
-            raise ValueError("HF max batch size is required for HF backend.")
-        if args.quantization is not None:
-            raise ValueError("Quantization is only for vLLM backend.")
-        if args.enable_lora is not None:
-            raise ValueError("LoRA benchmarking is only supported for vLLM"
-                             " backend")
-    elif args.backend == "mii":
-        if args.dtype != "auto":
-            raise ValueError("dtype must be auto for MII backend.")
-        if args.n != 1:
-            raise ValueError("n must be 1 for MII backend.")
-        if args.quantization is not None:
-            raise ValueError("Quantization is only for vLLM backend.")
-        if args.hf_max_batch_size is not None:
-            raise ValueError("HF max batch size is only for HF backend.")
-        if args.tokenizer != args.model:
-            raise ValueError("Tokenizer must be the same as the model for MII "
-                             "backend.")
-        if args.enable_lora is not None:
-            raise ValueError("LoRA benchmarking is only supported for vLLM"
-                             " backend")
+    validate_args(args)
    main(args)
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@ -17,13 +17,8 @@ from torch.utils.benchmark import Measurement as TMeasurement
 from utils import ArgPool, Bench, CudaGraphBenchParams
 from weight_shapes import WEIGHT_SHAPES

-from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand
-from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice
-from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink
-from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand
-from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink
+from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
 from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
-from vllm.lora.ops.triton_ops.v1 import V1KernelMeta, v1_expand, v1_shrink
 from vllm.utils import FlexibleArgumentParser

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
@ -167,69 +162,25 @@ class OpType(Enum):
    """
    LoRA Ops to benchmark and its properties.
    """
-    SGMV_SHRINK = auto()
-    BGMV_SHRINK = auto()
-    SGMV_EXPAND = auto()
-    BGMV_EXPAND = auto()
-    BGMV_EXPAND_SLICE = auto()
-    V1_SHRINK = auto()
-    V1_EXPAND = auto()
+    LORA_SHRINK = auto()
+    LORA_EXPAND = auto()

    @staticmethod
    def from_str(s: str) -> "OpType":
-        if s.lower() == 'sgmv_shrink':
-            return OpType.SGMV_SHRINK
-        if s.lower() == 'sgmv_expand':
-            return OpType.SGMV_EXPAND
-        if s.lower() == 'bgmv_shrink':
-            return OpType.BGMV_SHRINK
-        if s.lower() == 'bgmv_expand':
-            return OpType.BGMV_EXPAND
-        if s.lower() == "bgmv_expand_slice":
-            return OpType.BGMV_EXPAND_SLICE
-        if s.lower() == "v1_shrink":
-            return OpType.V1_SHRINK
-        if s.lower() == "v1_expand":
-            return OpType.V1_EXPAND
+        if s.lower() == "lora_shrink":
+            return OpType.LORA_SHRINK
+        if s.lower() == "lora_expand":
+            return OpType.LORA_EXPAND
        raise ValueError(f"Unrecognized str {s} to convert to OpType")

    def is_shrink_fn(self) -> bool:
-        return self in [
-            OpType.SGMV_SHRINK, OpType.BGMV_SHRINK, OpType.V1_SHRINK
-        ]
+        return self in [OpType.LORA_SHRINK]

    def is_expand_fn(self) -> bool:
-        return self in [
-            OpType.SGMV_EXPAND, OpType.BGMV_EXPAND, OpType.V1_EXPAND
-        ]
-
-    def is_prefill_op(self) -> bool:
-        return self in [
-            OpType.SGMV_SHRINK, OpType.SGMV_EXPAND, OpType.V1_SHRINK,
-            OpType.V1_EXPAND
-        ]
-
-    def is_decode_op(self) -> bool:
-        return self in [
-            OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE,
-            OpType.V1_SHRINK, OpType.V1_EXPAND
-        ]
-
-    def is_expand_slice_fn(self) -> bool:
-        return self in [OpType.BGMV_EXPAND_SLICE]
+        return self in [OpType.LORA_EXPAND]

    def num_slices(self) -> list[int]:
-        if self in [
-                OpType.SGMV_EXPAND, OpType.SGMV_SHRINK, OpType.V1_SHRINK,
-                OpType.V1_EXPAND
-        ]:
-            # SGMV kernels and v1 kernels supports slices
-            return [1, 2, 3]
-        if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]:
-            return [1]
-        if self in [OpType.BGMV_EXPAND_SLICE]:
-            return [2, 3]
-        raise ValueError(f"Unrecognized OpType {self}")
+        return [1, 2, 3]

    def mkn(self, batch_size: int, seq_length: int, hidden_size: int,
            lora_rank: int) -> tuple[int, int, int]:
@ -239,7 +190,7 @@ class OpType(Enum):
            k = hidden_size
            n = lora_rank
        else:
-            assert self.is_expand_fn() or self.is_expand_slice_fn()
+            assert self.is_expand_fn()
            m = num_tokens
            k = lora_rank
            n = hidden_size
@ -254,7 +205,7 @@ class OpType(Enum):
        if self.is_shrink_fn():
            return op_dtype, op_dtype, torch.float32
        else:
-            assert self.is_expand_fn() or self.is_expand_slice_fn()
+            assert self.is_expand_fn()
            return torch.float32, op_dtype, op_dtype

    def matmul_shapes(
@ -268,43 +219,19 @@ class OpType(Enum):
        m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank)

        b_shape = (num_loras, n, k)  # col-major
-        if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
-            # SGMV shrink and V1 shrink kernels support num_slices inherently
-            # in the kernel.
+        if self in [OpType.LORA_SHRINK]:
+            # LoRA shrink kernels support num_slices inherently in the kernel.
            return ((m, k), b_shape, (num_slices, m, n))
-        if self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
-            # SGMV expand and V1 expand kernels support num_slices inherently
-            # in the kernel
+        if self in [OpType.LORA_EXPAND]:
+            # LoRA expand kernels support num_slices inherently in the kernel
            return ((num_slices, m, k), b_shape, (m, n * num_slices))
-        if self == OpType.BGMV_SHRINK:
-            return ((m, k), b_shape, (m, n))
-        if self == OpType.BGMV_EXPAND:
-            return ((m, k), b_shape, (m, n))
-        if self == OpType.BGMV_EXPAND_SLICE:
-            return ((num_slices, m, k), b_shape, (m, n * num_slices))
-
        raise ValueError(f"Unrecognized op_type {self}")

    def bench_fn(self) -> Callable:
-
-        def emulate_bgmv_expand_slice(kwargs_list: list[dict[str, Any]]):
-            for x in kwargs_list:
-                bgmv_expand_slice(**x)
-
-        if self == OpType.SGMV_SHRINK:
-            return sgmv_shrink
-        if self == OpType.SGMV_EXPAND:
-            return sgmv_expand
-        if self == OpType.BGMV_SHRINK:
-            return bgmv_shrink
-        if self == OpType.BGMV_EXPAND:
-            return bgmv_expand
-        if self == OpType.BGMV_EXPAND_SLICE:
-            return emulate_bgmv_expand_slice
-        if self == OpType.V1_SHRINK:
-            return v1_shrink
-        if self == OpType.V1_EXPAND:
-            return v1_expand
+        if self == OpType.LORA_SHRINK:
+            return lora_shrink
+        if self == OpType.LORA_EXPAND:
+            return lora_expand

        raise ValueError(f"Unrecognized optype {self}")

@ -318,34 +245,13 @@ class OpType(Enum):
        """
        w_dtype = lora_weights[0].dtype
        num_slices = len(lora_weights)
-        if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
+        if self in [OpType.LORA_SHRINK]:
            for slice_idx in range(num_slices):
                ref_group_gemm(ref_out=output[slice_idx, :],
                               input=input,
                               lora_weights=lora_weights[slice_idx],
                               **kwargs)
-        elif self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
-            hidden_size = lora_weights[0].shape[1]
-            for slice_idx in range(num_slices):
-                slice_offset = slice_idx * hidden_size
-                ref_group_gemm(
-                    ref_out=output[:, slice_offset:slice_offset + hidden_size],
-                    input=input[slice_idx].clone().to(dtype=w_dtype),
-                    lora_weights=lora_weights[slice_idx],
-                    **kwargs)
-        elif self == OpType.BGMV_SHRINK:
-            assert num_slices == 1
-            ref_group_gemm(ref_out=output,
-                           input=input,
-                           lora_weights=lora_weights[0],
-                           **kwargs)
-        elif self == OpType.BGMV_EXPAND:
-            assert num_slices == 1
-            ref_group_gemm(ref_out=output,
-                           input=input.clone().to(dtype=w_dtype),
-                           lora_weights=lora_weights[0],
-                           **kwargs)
-        elif self == OpType.BGMV_EXPAND_SLICE:
+        elif self in [OpType.LORA_EXPAND]:
            hidden_size = lora_weights[0].shape[1]
            for slice_idx in range(num_slices):
                slice_offset = slice_idx * hidden_size
@ -411,13 +317,11 @@ class BenchmarkTensors:
    input: torch.Tensor
    lora_weights_lst: list[torch.Tensor]
    output: torch.Tensor
-    # metadata tensors
+    # LoRA kernel metadata
+    lora_kernel_meta: LoRAKernelMeta
+    # Metadata tensors used in testing correctness
    seq_lens: torch.Tensor
-    seq_start_loc: torch.Tensor
    prompt_lora_mapping: torch.Tensor
-    token_lora_mapping: torch.Tensor
-    # v1 kernel metadata
-    v1_kernel_meta: Optional[V1KernelMeta] = None

    def io_types(self) -> str:
        return (f"{dtype_to_str(self.input.dtype)}x"
@ -444,35 +348,29 @@ class BenchmarkTensors:
        assert ctx.num_active_loras <= ctx.num_loras
        total_tokens = ctx.batch_size * ctx.seq_length

+        # Make metadata tensors involved in correctness testing.
        # Prepare seq lens tensor
        seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1,
                                       (ctx.batch_size, ))
-        # Prepare seq_start_loc tensor
-        seq_start_loc_tensor = torch.cumsum(torch.tensor(
-            [0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
-                                            dim=0)
        assert total_tokens == seq_len_tensor.sum()
        # Prepare prompt lora indices tensor
        prompt_lora_indices_tensor = make_prompt_lora_mapping(
            ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu")
-        # Prepare token lora indices tensor
+
+        # Make LoRAKernelMeta
        token_lora_indices_tensor = make_token_lora_mapping(
            total_tokens, ctx.batch_size, prompt_lora_indices_tensor,
            seq_len_tensor, "cpu")
-
-        v1_kernel_meta = None
-        if op_type in [OpType.V1_SHRINK, OpType.V1_EXPAND]:
-            v1_kernel_meta = V1KernelMeta.make(
-                max_loras=ctx.num_loras,
-                max_num_tokens=token_lora_indices_tensor.size(0),
-                device="cpu")
-            v1_kernel_meta.prepare_tensors(
-                token_lora_mapping=token_lora_indices_tensor)
+        lora_kernel_meta = LoRAKernelMeta.make(
+            max_loras=ctx.num_loras,
+            max_num_tokens=token_lora_indices_tensor.size(0),
+            device="cpu")
+        lora_kernel_meta.prepare_tensors(
+            token_lora_mapping=token_lora_indices_tensor)

        return BenchmarkTensors(input_tensor, lora_weights, output_tensor,
-                                seq_len_tensor, seq_start_loc_tensor,
-                                prompt_lora_indices_tensor,
-                                token_lora_indices_tensor, v1_kernel_meta)
+                                lora_kernel_meta, seq_len_tensor,
+                                prompt_lora_indices_tensor)

    def sanity_check(self) -> None:
        """
@ -482,9 +380,9 @@ class BenchmarkTensors:
        # check metadata tensors
        assert torch.sum(self.seq_lens) == num_tokens
        num_seqs = self.seq_lens.shape[0]
-        assert self.seq_start_loc.shape[0] == num_seqs
+        #assert self.seq_start_loc.shape[0] == num_seqs
        assert self.prompt_lora_mapping.shape[0] == num_seqs
-        assert self.token_lora_mapping.shape[0] == num_tokens
+        assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens

    def to_device(self, device: str):
        """
@ -499,220 +397,27 @@ class BenchmarkTensors:
        self.input = to_device(self.input)
        self.output = to_device(self.output)
        self.seq_lens = to_device(self.seq_lens)
-        self.seq_start_loc = to_device(self.seq_start_loc)
        self.prompt_lora_mapping = to_device(self.prompt_lora_mapping)
-        self.token_lora_mapping = to_device(self.token_lora_mapping)
        for i in range(len(self.lora_weights_lst)):
            self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i])

-        # v1 meta
-        if self.v1_kernel_meta:
-            for field_name in V1KernelMeta.__dataclass_fields__:
-                field = getattr(self.v1_kernel_meta, field_name)
-                assert isinstance(field, torch.Tensor)
-                setattr(self.v1_kernel_meta, field_name, to_device(field))
+        # LoRA meta
+        for field_name in LoRAKernelMeta.__dataclass_fields__:
+            field = getattr(self.lora_kernel_meta, field_name)
+            assert isinstance(field, torch.Tensor)
+            setattr(self.lora_kernel_meta, field_name, to_device(field))

    def metadata(self) -> tuple[int, int, int]:
        """
        Return num_seqs, num_tokens and max_seq_len
        """
        num_seqs = self.seq_lens.shape[0]
-        num_tokens = self.token_lora_mapping.shape[0]
+        num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0]
        max_seq_len = torch.max(self.seq_lens).item()
        num_slices = len(self.lora_weights_lst)
        return num_seqs, num_tokens, max_seq_len, num_slices

-    def convert_to_sgmv_benchmark_tensors(self):
-        """
-        For sgmv punica kernels, when consecutive sequences have the
-        same LoRA ID, we just merge them together.
-        This happens in punica.py::compute_metadata
-        """
-
-        # Collapse seq_lens and seq_start_loc
-        _, seq_lens = torch.unique_consecutive(self.token_lora_mapping,
-                                               return_counts=True)
-        cum_result = torch.cumsum(seq_lens, dim=0)
-        seq_start_loc = torch.zeros_like(seq_lens)
-        seq_start_loc[1:].copy_(cum_result[:-1])
-
-        # Collapse prompt mapping
-        prompt_lora_mapping = torch.unique_consecutive(
-            self.prompt_lora_mapping)
-
-        assert torch.sum(seq_lens) == torch.sum(self.seq_lens), \
-         f"dont match - new {torch.sum(seq_lens)} vs {torch.sum(self.seq_lens)}"
-
-        self.prompt_lora_mapping = prompt_lora_mapping.to(
-            dtype=self.prompt_lora_mapping.dtype)
-        self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype)
-        self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype)
-
-    def as_sgmv_shrink_kwargs(self) -> dict[str, Any]:
-        self.convert_to_sgmv_benchmark_tensors()
-        self.sanity_check()
-        self.to_device(self.input.device)
-
-        num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
-
-        # Sanity check matrix shapes.
-        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
-            0].shape, self.output.shape
-        # Expected input shape [num_tokens, hidden_size]
-        assert len(i_shape) == 2
-        assert i_shape[0] == num_tokens
-        hidden_size = i_shape[1]
-        # Expected lora weight shape [num_loras, lora_rank, hidden_size]
-        assert len(lw_shape) == 3
-        assert lw_shape[2] == hidden_size
-        lora_rank = lw_shape[1]
-        # Expected output shape [num_slices, num_tokens, lora_rank]
-        assert len(o_shape) == 3
-        assert o_shape == (num_slices, num_tokens, lora_rank)
-
-        return {
-            'inputs': self.input,
-            'lora_a_weights': self.lora_weights_lst,
-            'output_tensor': self.output,
-            'b_seq_start_loc': self.seq_start_loc,
-            'seq_len_tensor': self.seq_lens,
-            'lora_indices_tensor': self.prompt_lora_mapping,
-            'batches': num_seqs,
-            'max_seq_length': max_seq_len,
-            'token_nums': num_tokens,
-            'scaling': 1.0,
-        }
-
-    def as_sgmv_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
-
-        self.convert_to_sgmv_benchmark_tensors()
-        self.sanity_check()
-        self.to_device(self.input.device)
-
-        num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
-
-        # Sanity check matrix shapes.
-        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
-            0].shape, self.output.shape
-        # Expected input shape : [num_slices, num_tokens, lora_rank]
-        assert len(i_shape) == 3
-        assert i_shape[0] == num_slices
-        assert i_shape[1] == num_tokens
-        lora_rank = i_shape[2]
-        # Expected lora weight shape : [num_lora, hidden_size, lora_rank]
-        assert len(lw_shape) == 3
-        assert lw_shape[2] == lora_rank
-        hidden_size = lw_shape[1]
-        # Expected output shape : [num_tokens, hidden_size * num_slices]
-        assert len(o_shape) == 2
-        assert o_shape == (num_tokens, hidden_size * num_slices)
-
-        return {
-            'inputs': self.input,
-            'lora_b_weights': self.lora_weights_lst,
-            'output_tensor': self.output,
-            'b_seq_start_loc': self.seq_start_loc,
-            'seq_len_tensor': self.seq_lens,
-            'lora_indices_tensor': self.prompt_lora_mapping,
-            'batches': num_seqs,
-            'max_seq_length': max_seq_len,
-            'token_nums': num_tokens,
-            'offset_start': 0,
-            'add_inputs': add_inputs,
-        }
-
-    def as_bgmv_shrink_kwargs(self) -> dict[str, Any]:
-        assert len(self.lora_weights_lst) == 1
-        self.to_device(self.input.device)
-
-        _, num_tokens, _, _ = self.metadata()
-        # Sanity check shapes
-        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
-            0].shape, self.output.shape
-        # Expected input shape [num_tokens, hidden_size]
-        assert len(i_shape) == 2
-        assert i_shape[0] == num_tokens
-        hidden_size = i_shape[1]
-        # Expected lora weight shape [num_loras, lora_rank, hidden_size]
-        assert len(lw_shape) == 3
-        assert lw_shape[2] == hidden_size
-        lora_rank = lw_shape[1]
-        # Expected output shape [num_tokens, lora_rank]
-        assert len(o_shape) == 2
-        assert o_shape == (num_tokens, lora_rank)
-
-        return {
-            'inputs': self.input,
-            'lora_a_weights': self.lora_weights_lst[0],
-            'output_tensor': self.output,
-            'lora_indices_tensor': self.token_lora_mapping,
-            'scaling': 1.0
-        }
-
-    def as_bgmv_expand_kwargs(self, add_inputs: bool):
-        assert len(self.lora_weights_lst) == 1
-        self.to_device(self.input.device)
-
-        _, num_tokens, _, _ = self.metadata()
-        # Sanity check shapes
-        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
-            0].shape, self.output.shape
-        # Expected input shape [num_tokens, lora_rank]
-        assert len(i_shape) == 2
-        assert i_shape[0] == num_tokens
-        lora_rank = i_shape[1]
-        # Expected lora weight shape [num_loras, hidden_size, lora_rank]
-        assert len(lw_shape) == 3
-        assert lw_shape[2] == lora_rank
-        hidden_size = lw_shape[1]
-        # Expected output shape [num_tokens, hidden_size]
-        assert len(o_shape) == 2
-        assert o_shape == (num_tokens, hidden_size)
-
-        return {
-            'inputs': self.input,
-            'lora_b_weights': self.lora_weights_lst[0],
-            'output_tensor': self.output,
-            'lora_indices_tensor': self.token_lora_mapping,
-            'add_inputs': add_inputs
-        }
-
-    def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> dict[str, Any]:
-
-        _, num_tokens, _, num_slices = self.metadata()
-        # Sanity check shapes
-        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
-            0].shape, self.output.shape
-        # Expected input shape [num_slices, num_tokens, lora_rank]
-        assert len(i_shape) == 3
-        assert i_shape[0] == num_slices
-        assert i_shape[1] == num_tokens
-        lora_rank = i_shape[2]
-        # Expected lora weight shape [num_loras, hidden_size, lora_rank]
-        assert len(lw_shape) == 3
-        assert lw_shape[2] == lora_rank
-        hidden_size = lw_shape[1]
-        # Expected output shape [num_tokens, hidden_size * num_slices]
-        assert len(o_shape) == 2
-        assert o_shape == (num_tokens, hidden_size * num_slices)
-
-        self.to_device(self.input.device)
-
-        kwargs_list = []
-        for i in range(num_slices):
-            kwargs_list.append({
-                'inputs': self.input[i],
-                'lora_b_weights': self.lora_weights_lst[i],
-                'output_tensor': self.output,
-                'lora_indices_tensor': self.token_lora_mapping,
-                'slice_offset': i * hidden_size,
-                'slice_size': hidden_size,
-                'add_inputs': add_inputs,
-            })
-        return {'kwargs_list': kwargs_list}
-
-    def as_v1_shrink_kwargs(self) -> dict[str, Any]:
-        assert self.v1_kernel_meta is not None
+    def as_lora_shrink_kwargs(self) -> dict[str, Any]:
        self.sanity_check()
        self.to_device(self.input.device)

@ -737,17 +442,16 @@ class BenchmarkTensors:
            'inputs': self.input,
            'lora_a_weights': self.lora_weights_lst,
            'output_tensor': self.output,
-            'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
+            'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping,
            'token_indices_sorted_by_lora_ids':
-            self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
-            'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
-            'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
-            'lora_ids': self.v1_kernel_meta.active_lora_ids,
+            self.lora_kernel_meta.token_indices_sorted_by_lora_ids,
+            'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora,
+            'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc,
+            'lora_ids': self.lora_kernel_meta.active_lora_ids,
            'scaling': 1.0,
        }

-    def as_v1_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
-        assert self.v1_kernel_meta is not None
+    def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
        self.sanity_check()
        self.to_device(self.input.device)

@ -773,12 +477,12 @@ class BenchmarkTensors:
            'inputs': self.input,
            'lora_b_weights': self.lora_weights_lst,
            'output_tensor': self.output,
-            'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
+            'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping,
            'token_indices_sorted_by_lora_ids':
-            self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
-            'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
-            'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
-            'lora_ids': self.v1_kernel_meta.active_lora_ids,
+            self.lora_kernel_meta.token_indices_sorted_by_lora_ids,
+            'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora,
+            'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc,
+            'lora_ids': self.lora_kernel_meta.active_lora_ids,
            'offset_start': 0,
            'add_inputs': add_inputs,
        }
@ -791,20 +495,10 @@ class BenchmarkTensors:
        else:
            assert add_inputs is not None

-        if op_type == OpType.SGMV_SHRINK:
-            return self.as_sgmv_shrink_kwargs()
-        if op_type == OpType.SGMV_EXPAND:
-            return self.as_sgmv_expand_kwargs(add_inputs)
-        if op_type == OpType.BGMV_SHRINK:
-            return self.as_bgmv_shrink_kwargs()
-        if op_type == OpType.BGMV_EXPAND:
-            return self.as_bgmv_expand_kwargs(add_inputs)
-        if op_type == OpType.BGMV_EXPAND_SLICE:
-            return self.as_bgmv_expand_slice_kwargs(add_inputs)
-        if op_type == OpType.V1_SHRINK:
-            return self.as_v1_shrink_kwargs()
-        if op_type == OpType.V1_EXPAND:
-            return self.as_v1_expand_kwargs(add_inputs)
+        if op_type == OpType.LORA_SHRINK:
+            return self.as_lora_shrink_kwargs()
+        if op_type == OpType.LORA_EXPAND:
+            return self.as_lora_expand_kwargs(add_inputs)
        raise ValueError(f"Unrecognized optype {self}")

    def test_correctness(self, op_type: OpType,
@ -993,10 +687,6 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):
    for bench_ctx in bench_ctxs:
        for seq_len in args.seq_lengths:
            bench_ops: list[OpType] = args.op_types
-            if seq_len > 1:
-                # bench only prefill ops
-                bench_ops = [op for op in args.op_types if op.is_prefill_op()]
-
            seq_len_timers = []
            for bench_op in bench_ops:
                for num_slices in bench_op.num_slices():
@ -1206,13 +896,13 @@ Benchmark LoRA kernels:
    {use_cuda_graph_recommendation()}

    list_bench example:
-        python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
+        python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32

    model_bench example:
-        python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b  --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16  --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 
+        python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b  --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16  --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 

    range_bench example:
-        python3 benchmarks/kernels/benchmark_lora.py range_bench  --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16   --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 
+        python3 benchmarks/kernels/benchmark_lora.py range_bench  --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16   --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 
            """,  # noqa: E501
        formatter_class=argparse.RawTextHelpFormatter)

--- a/benchmarks/run_structured_output_benchmark.sh
+++ b/benchmarks/run_structured_output_benchmark.sh
@ -54,6 +54,7 @@ for qps in "${QPS_VALUES[@]}"; do
  python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
    --request-rate $qps \
    --result-filename "$FILENAME" \
+    --tokenizer-mode ${TOKENIZER_MODE:-"auto"} \
    --port ${PORT:-8000}

  echo "Completed benchmark with QPS: $qps"
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 9bfa9869829d8c593527eb34c5271d0090f7ccc9 
+          GIT_TAG dc9d410b3e2d6534a4c70724c2515f4def670a22
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@ -350,8 +350,8 @@ __global__ void concat_and_cache_mla_kernel(

 }  // namespace vllm

-// KV_T is the stored data type of kv-cache.
-// CACHE_T is the data type of key and value tensors.
+// KV_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
 #define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE)               \
  vllm::reshape_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE>             \
@ -393,8 +393,8 @@ void reshape_and_cache(
                             CALL_RESHAPE_AND_CACHE)
 }

-// KV_T is the stored data type of kv-cache.
-// CACHE_T is the data type of key and value tensors.
+// KV_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
 #define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE)         \
  vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE>       \
@ -446,8 +446,8 @@ void reshape_and_cache_flash(
                             CALL_RESHAPE_AND_CACHE_FLASH);
 }

-// KV_T is the stored data type of kv-cache.
-// CACHE_T is the data type of key and value tensors.
+// KV_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
 #define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE)              \
  vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE>            \
--- a/csrc/cpu/cache.cpp
+++ b/csrc/cpu/cache.cpp
@ -3,6 +3,12 @@

 #include "cpu_types.hpp"

+#if defined(__x86_64__)
+  #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2
+#else
+  #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES
+#endif
+
 namespace {
 template <typename scalar_t>
 void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches,
@ -95,13 +101,12 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
  }

  const int element_num_per_block = key_caches[0][0].numel();
-  VLLM_DISPATCH_FLOATING_TYPES(
-      key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
-        CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
-        copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, block_mapping,
-                                       element_num_per_block, num_layers);
-        CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
-      });
+  DISPATCH_MACRO(key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
+    CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
+    copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, block_mapping,
+                                   element_num_per_block, num_layers);
+    CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
+  });
 }

 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
@ -118,16 +123,15 @@ void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
  int key_stride = key.stride(0);
  int value_stride = value.stride(0);

-  VLLM_DISPATCH_FLOATING_TYPES(
-      key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
-        CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl)
-        reshape_and_cache_cpu_impl<scalar_t>(
-            key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
-            key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
-            slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride,
-            value_stride, num_heads, head_size, block_size, x);
-        CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl)
-      });
+  DISPATCH_MACRO(key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
+    CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl)
+    reshape_and_cache_cpu_impl<scalar_t>(
+        key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
+        key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
+        slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride, value_stride,
+        num_heads, head_size, block_size, x);
+    CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl)
+  });
 }

 void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@ -16,9 +16,18 @@ namespace vec_op {
  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)

+#define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...)        \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)
+
 #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

+#define VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME,                                \
+                     VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(__VA_ARGS__))
+
 #ifndef CPU_OP_GUARD
  #define CPU_KERNEL_GUARD_IN(NAME)
  #define CPU_KERNEL_GUARD_OUT(NAME)
--- a/csrc/cpu/pos_encoding.cpp
+++ b/csrc/cpu/pos_encoding.cpp
@ -170,7 +170,7 @@ void rotary_embedding_gptj_impl(
 void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                      torch::Tensor& key, int64_t head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox) {
-  int num_tokens = query.numel() / query.size(-1);
+  int num_tokens = positions.numel();
  int rot_dim = cos_sin_cache.size(1);
  int num_heads = query.size(-1) / head_size;
  int num_kv_heads = key.size(-1) / head_size;
--- a/csrc/prepare_inputs/advance_step.cu
+++ b/csrc/prepare_inputs/advance_step.cu
@ -274,7 +274,7 @@ void advance_step_flashinfer(
  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
  cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);

-  int block_tables_stride = block_tables.stride(0);
+  [[maybe_unused]] int block_tables_stride = block_tables.stride(0);
  TORCH_CHECK((blocks * threads > num_queries),
              "multi-step: not enough threads to map to num_queries = ",
              num_queries, " block_tables.stride(0) = ", block_tables.stride(0),
--- a/csrc/quantization/fp8/amd/quant_utils.cuh
+++ b/csrc/quantization/fp8/amd/quant_utils.cuh
@ -19,12 +19,24 @@ __device__ __forceinline__ fp8_type cvt_c10(float const r) {
  return {};
 }

+// __hip_fp8_e4m3 only exists starting in ROCm 6.3. The macro
+// HIP_FP8_TYPE_OCP comes from the hip_fp8.h header and also makes
+// its first appearance in ROCm 6.3. Since VLLM_DISPATCH_FP8_TYPES
+// on ROCm instantiates both OCP and FNUZ kernels, we need to replace
+// the new HW cvt with something reasonable that doesn't rely on the
+// ROCm 6.3 feature. This allows compiling on ROCm 6.2 or newer.
 template <>
 __device__ __forceinline__ c10::Float8_e4m3fn cvt_c10(float const r) {
+    #if HIP_FP8_TYPE_OCP
  return c10::Float8_e4m3fn(
      __hip_cvt_float_to_fp8(r, __hip_fp8_e4m3::__default_saturation,
                             __hip_fp8_e4m3::__default_interpret),
      c10::Float8_e4m3fn::from_bits());
+    #else
+  // Cast implemented by pytorch. Uses bit manipulation instead of HW cvt.
+  // HW cvt above is faster when it is available (ROCm 6.3 or newer).
+  return static_cast<c10::Float8_e4m3fn>(r);
+    #endif
 }

 template <>
@ -434,7 +446,7 @@ scaled_vec_conversion<uint16_t, uint8_t>(const uint8_t& a, float scale) {
 template <>
 __inline__ __device__ uint32_t
 scaled_vec_conversion<uint32_t, uint16_t>(const uint16_t& a, float scale) {
-  __half2_raw h2r =
+  [[maybe_unused]] __half2_raw h2r =
      __hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret);
  union {
    __half2_raw h2r;
--- a/csrc/quantization/fused_kernels/layernorm_utils.cuh
+++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh
@ -24,7 +24,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
  // sum of squares
  float ss = 0.0f;

-  for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+  for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
    float x = static_cast<float>(input[token_offset + i]);
    if constexpr (has_residual) {
      x += static_cast<float>(residual[token_offset + i]);
@ -58,7 +58,7 @@ __device__ void compute_dynamic_per_token_scales(
  constexpr scalar_out_t qmax{std::numeric_limits<scalar_out_t>::max()};

  float block_absmax_val_maybe = 0.0f;
-  for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+  for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
    float x = static_cast<float>(input[token_offset + i]);
    if constexpr (has_residual) {
      x += static_cast<float>(residual[token_offset + i]);
@ -103,7 +103,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
  int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
  ;

-  for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+  for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
    float x = static_cast<float>(input[token_offset + i]);
    if constexpr (has_residual) {
      x += static_cast<float>(residual[token_offset + i]);
@ -142,7 +142,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
  int32_t const num_vec_elems = hidden_size >> 2;

 #pragma unroll 4
-  for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
+  for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
    vec4_t<scalar_t> in = vec_input[i];

    vec4_t<float> x;
@ -206,7 +206,7 @@ __device__ void compute_dynamic_per_token_scales(
  float block_absmax_val_maybe = 0.0f;

 #pragma unroll 4
-  for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
+  for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
    vec4_t<scalar_t> in = vec_input[i];
    vec4_t<scalar_t> const w = vec_weight[i];

@ -286,7 +286,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
 // TODO(luka/varun) extract into type-agnostic vectorized quant function to
 //  replace scaled_fp8_conversion_vec
 #pragma unroll 4
-  for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
+  for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
    vec4_t<scalar_t> const in = vec_input[i];
    vec4_t<scalar_t> const w = vec_weight[i];

--- a/csrc/quantization/gguf/dequantize.cuh
+++ b/csrc/quantization/gguf/dequantize.cuh
@ -101,10 +101,10 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
 template<typename dst_t>
 static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_q2_K * x = (const block_q2_K *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int n   = tid/32;
    const int l   = tid - 32*n;
    const int is  = 8*n + l/16;
@ -123,10 +123,10 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
 template<typename dst_t>
 static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i = blockIdx.x;
+    const auto i = blockIdx.x;
    const block_q3_K * x = (const block_q3_K *) vx;

-    const int r = threadIdx.x/4;
+    const auto r = threadIdx.x/4;
    const int tid = r/2;
    const int is0 = r%2;
    const int l0 = 16*is0 + 4*(threadIdx.x%4);
@ -164,10 +164,10 @@ template<typename dst_t>
 static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const block_q4_K * x = (const block_q4_K *) vx;

-    const int i = blockIdx.x;
+    const auto i = blockIdx.x;

    // assume 32 threads
-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il  = tid/8;
    const int ir  = tid%8;
    const int is  = 2*il;
@ -197,10 +197,10 @@ template<typename dst_t>
 static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const block_q5_K * x = (const block_q5_K *) vx;

-    const int i = blockIdx.x;
+    const auto i = blockIdx.x;

    // assume 64 threads - this is very slightly better than the one below
-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il  = tid/16;   // il is in 0...3
    const int ir  = tid%16;   // ir is in 0...15
    const int is  = 2*il;     // is is in 0...6
@ -231,10 +231,10 @@ template<typename dst_t>
 static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const block_q6_K * x = (const block_q6_K *) vx;

-    const int i = blockIdx.x;
+    const auto i = blockIdx.x;

    // assume 64 threads - this is very slightly better than the one below
-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int ip  = tid/32;   // ip is 0 or 1
    const int il  = tid - 32*ip; // 0...32
    const int is  = 8*ip + il/16;
@ -256,10 +256,10 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
 template<typename dst_t>
 static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@ -275,10 +275,10 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
 template<typename dst_t>
 static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq2_xs * x = (const block_iq2_xs *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@ -293,10 +293,10 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
 template<typename dst_t>
 static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq2_s * x = (const block_iq2_s *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@ -309,10 +309,10 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
 template<typename dst_t>
 static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@ -332,10 +332,10 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
 template<typename dst_t>
 static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq3_s * x = (const block_iq3_s *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@ -399,10 +399,10 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
@ -417,10 +417,10 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst

 template<typename dst_t>
 static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq4_xs * x = (const block_iq4_xs *)vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
@ -565,4 +565,4 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(int64_t type) {
        default:
            return nullptr;
    }
-}
+}
--- a/csrc/quantization/gguf/gguf_kernel.cu
+++ b/csrc/quantization/gguf/gguf_kernel.cu
@ -19,11 +19,11 @@ template <typename scalar_t>
 static __global__ void quantize_q8_1(const scalar_t* __restrict__ x,
                                     void* __restrict__ vy, const int kx,
                                     const int kx_padded) {
-  const int ix = blockDim.x * blockIdx.x + threadIdx.x;
+  const auto ix = blockDim.x * blockIdx.x + threadIdx.x;
  if (ix >= kx_padded) {
    return;
  }
-  const int iy = blockDim.y * blockIdx.y + threadIdx.y;
+  const auto iy = blockDim.y * blockIdx.y + threadIdx.y;
  const int i_padded = iy * kx_padded + ix;

  block_q8_1* y = (block_q8_1*)vy;
--- a/csrc/quantization/gguf/mmq.cuh
+++ b/csrc/quantization/gguf/mmq.cuh
@ -14,10 +14,10 @@ static __device__ __forceinline__ void mul_mat_q(

    const int & ncols_dst = ncols_y;

-    const int row_dst_0 = blockIdx.x*mmq_y;
+    const auto row_dst_0 = blockIdx.x*mmq_y;
    const int & row_x_0 = row_dst_0;

-    const int col_dst_0 = blockIdx.y*mmq_x;
+    const auto col_dst_0 = blockIdx.y*mmq_x;
    const int & col_y_0 = col_dst_0;

    int   * tile_x_ql = nullptr;
@ -39,7 +39,7 @@ static __device__ __forceinline__ void mul_mat_q(

 #pragma unroll
        for (int ir = 0; ir < qr && ib0 + ir * blocks_per_warp/qr < blocks_per_row_x; ++ir) {
-            const int kqs = ir*WARP_SIZE_GGUF + threadIdx.x;
+            const auto kqs = ir*WARP_SIZE_GGUF + threadIdx.x;
            const int kbxd = kqs / QI8_1;

 #pragma unroll
@ -53,7 +53,7 @@ static __device__ __forceinline__ void mul_mat_q(
 #pragma unroll
            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE_GGUF/QI8_1)) % mmq_x;
-                const int kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1);
+                const auto kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1);
                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);

                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
@ -87,14 +87,14 @@ static __device__ __forceinline__ void mul_mat_q(

 #pragma unroll
    for (int j = 0; j < mmq_x; j += nwarps) {
-        const int col_dst = col_dst_0 + j + threadIdx.y;
+        const auto col_dst = col_dst_0 + j + threadIdx.y;
        if (col_dst >= ncols_dst) {
            return;
        }

 #pragma unroll
        for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
-            const int row_dst = row_dst_0 + threadIdx.x + i;
+            const auto row_dst = row_dst_0 + threadIdx.x + i;
            if (row_dst >= nrows_dst) {
                continue;
            }
--- a/csrc/quantization/gguf/mmvq.cuh
+++ b/csrc/quantization/gguf/mmvq.cuh
@ -1,7 +1,7 @@
 // copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu
 template <typename scalar_t, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, const int ncols, const int nrows) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const auto row = blockIdx.x*blockDim.y + threadIdx.y;

    if (row >= nrows) {
        return;
@ -16,7 +16,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

-    for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
+    for (auto i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row*blocks_per_row + i; // x block index

        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
--- a/csrc/quantization/gguf/moe.cuh
+++ b/csrc/quantization/gguf/moe.cuh
@ -19,10 +19,10 @@ static __device__ __forceinline__ void moe_q(

  const int ncols_dst = ncols_y * top_k;

-  const int row_dst_0 = blockIdx.x * mmq_y;
+  const auto row_dst_0 = blockIdx.x * mmq_y;
  const int& row_x_0 = row_dst_0;

-  const int col_dst_0 = blockIdx.y * mmq_x;
+  const auto col_dst_0 = blockIdx.y * mmq_x;

  int token_offs[mmq_x / nwarps];
  for (int i = 0; i < mmq_x; i += nwarps) {
@ -56,7 +56,7 @@ static __device__ __forceinline__ void moe_q(
    const int n_per_r = ((qk * blocks_per_warp) / qr);
 #pragma unroll
    for (int ir = 0; ir < qr && ib0 * qk + ir * n_per_r < ncols_x; ++ir) {
-      const int kqs = ir * WARP_SIZE_GGUF + threadIdx.x;
+      const auto kqs = ir * WARP_SIZE_GGUF + threadIdx.x;
      const int kbxd = kqs / QI8_1;

 #pragma unroll
@ -73,7 +73,7 @@ static __device__ __forceinline__ void moe_q(
      }

      if (threadIdx.x < n_per_r / QK8_1) {
-        const int kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1);
+        const auto kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1);
        const int col_y_eff = token_offs[threadIdx.y] / top_k;
        const int block_x =
            ib0 * (qk / QK8_1) + ir * (WARP_SIZE_GGUF / QI8_1) + kby;
@ -119,7 +119,7 @@ static __device__ __forceinline__ void moe_q(

 #pragma unroll
    for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
-      const int row_dst = row_dst_0 + threadIdx.x + i;
+      const auto row_dst = row_dst_0 + threadIdx.x + i;
      if (row_dst >= nrows_dst) {
        continue;
      }
--- a/csrc/quantization/gptq/q_gemm.cu
+++ b/csrc/quantization/gptq/q_gemm.cu
@ -199,15 +199,15 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
  MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

-  int t = threadIdx.x;
+  auto t = threadIdx.x;

  // Block
-  int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
-  int offset_m = blockIdx.y * m_count;
-  int offset_k = blockIdx.z * BLOCK_KN_SIZE;
+  auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
+  auto offset_m = blockIdx.y * m_count;
+  auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

-  int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
-  int end_m = min(offset_m + m_count, size_m);
+  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
+  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  int n = offset_n + t * 4;
@ -337,15 +337,15 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
  MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

-  int t = threadIdx.x;
+  auto t = threadIdx.x;

  // Block
-  int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
-  int offset_m = blockIdx.y * m_count;
-  int offset_k = blockIdx.z * BLOCK_KN_SIZE;
+  auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
+  auto offset_m = blockIdx.y * m_count;
+  auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

-  int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
-  int end_m = min(offset_m + m_count, size_m);
+  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
+  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  int n = offset_n + t * 4;
@ -458,15 +458,15 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
  MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

-  int t = threadIdx.x;
+  auto t = threadIdx.x;

  // Block
-  int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
-  int offset_m = blockIdx.y * m_count;
-  int offset_k = blockIdx.z * BLOCK_KN_SIZE;
+  auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
+  auto offset_m = blockIdx.y * m_count;
+  auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

-  int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
-  int end_m = min(offset_m + m_count, size_m);
+  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
+  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  int n = offset_n + t * 4;
@ -586,15 +586,15 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
  MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

-  int t = threadIdx.x;
+  auto t = threadIdx.x;

  // Block
-  int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
-  int offset_m = blockIdx.y * m_count;
-  int offset_k = blockIdx.z * BLOCK_KN_SIZE;
+  auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
+  auto offset_m = blockIdx.y * m_count;
+  auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

-  int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
-  int end_m = min(offset_m + m_count, size_m);
+  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
+  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  int n = offset_n + t * 4;
@ -765,14 +765,14 @@ __global__ void reconstruct_exllama_8bit_kernel(
  MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

-  int offset_k = BLOCK_KN_SIZE * blockIdx.y;
-  int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
+  auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
+  auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;

  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  // Preload remapping table
  __shared__ int perm[BLOCK_KN_SIZE];
-  int t = threadIdx.x;
+  auto t = threadIdx.x;

  if (b_q_perm) {
    if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
@ -862,14 +862,14 @@ __global__ void reconstruct_exllama_4bit_kernel(
  MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

-  int offset_k = BLOCK_KN_SIZE * blockIdx.y;
-  int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
+  auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
+  auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;

  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  // Preload remapping table
  __shared__ int perm[BLOCK_KN_SIZE];
-  int t = threadIdx.x;
+  auto t = threadIdx.x;

  if (b_q_perm) {
    if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
@ -967,14 +967,14 @@ __global__ void reconstruct_exllama_3bit_kernel(
  MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

-  int offset_k = BLOCK_KN_SIZE * blockIdx.y;
-  int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
+  auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
+  auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;

  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  // Preload remapping table
  __shared__ int perm[BLOCK_KN_SIZE];
-  int t = threadIdx.x;
+  auto t = threadIdx.x;

  if (b_q_perm) {
    if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
@ -1065,14 +1065,14 @@ __global__ void reconstruct_exllama_2bit_kernel(
  MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

-  int offset_k = BLOCK_KN_SIZE * blockIdx.y;
-  int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
+  auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
+  auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;

  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  // Preload remapping table
  __shared__ int perm[BLOCK_KN_SIZE];
-  int t = threadIdx.x;
+  auto t = threadIdx.x;

  if (b_q_perm) {
    if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
@ -1181,11 +1181,11 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
  int zero_width = width / 8;
  int vec_height = height * 4;
  const int blockwidth2 = BLOCK_KN_SIZE / 2;
-  int b = blockIdx.y * BLOCK_M_SIZE_MAX;
+  auto b = blockIdx.y * BLOCK_M_SIZE_MAX;
  int b_end = min(BLOCK_M_SIZE_MAX, batch - b);
-  int h = BLOCK_KN_SIZE * blockIdx.z / 8;
+  auto h = BLOCK_KN_SIZE * blockIdx.z / 8;
  int h_end = min(BLOCK_KN_SIZE / 8, height - h) * 4;
-  int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
+  auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;

  __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
  if (threadIdx.x < h_end) {
@ -1197,8 +1197,8 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
  }

  __shared__ half2 deq2[256][8];
-  int val = threadIdx.x / 8;
-  int off = threadIdx.x % 8;
+  auto val = threadIdx.x / 8;
+  auto off = threadIdx.x % 8;
  for (; val < 256; val += BLOCK_KN_SIZE / 8) {
    deq2[val][off] =
        __halves2half2(__int2half_rn(val & 0xF), __int2half_rn(val >> 4));
@ -1280,11 +1280,11 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
  int zero_width = width / 4;
  int vec_height = height * 2;
  const int blockwidth2 = BLOCK_KN_SIZE / 2;
-  int b = blockIdx.y * BLOCK_M_SIZE_MAX;
+  auto b = blockIdx.y * BLOCK_M_SIZE_MAX;
  int b_end = min(BLOCK_M_SIZE_MAX, batch - b);
-  int h = BLOCK_KN_SIZE * blockIdx.z / 4;
+  auto h = BLOCK_KN_SIZE * blockIdx.z / 4;
  int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2;
-  int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
+  auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;

  __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
  if (threadIdx.x < h_end) {
@ -1393,8 +1393,8 @@ __global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
                                        half* __restrict__ out) {
  // Start of block

-  int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
-  int row = blockIdx.y * 32 / bit;
+  auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
+  auto row = blockIdx.y * 32 / bit;
  if (column >= width) return;

  // Views
@ -1425,8 +1425,8 @@ __global__ void reconstruct_gptq_3bit_kernel(
    const int height, const int width, const int group,
    half* __restrict__ out) {
  // Start of block
-  int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
-  int row = blockIdx.y * 32;
+  auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
+  auto row = blockIdx.y * 32;
  if (column >= width) return;

  // Views
@ -1542,7 +1542,7 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,

 __global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight,
                                    const int size_k, const int size_n) {
-  int n = blockIdx.x * THREADS_X + threadIdx.x;
+  auto n = blockIdx.x * THREADS_X + threadIdx.x;
  if (n >= size_n) return;
  int k = 0;
  uint32_t* b_ptr = b_q_weight + n;
@ -1555,7 +1555,7 @@ __global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight,

 __global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight,
                                    const int size_k, const int size_n) {
-  int n = blockIdx.x * THREADS_X + threadIdx.x;
+  auto n = blockIdx.x * THREADS_X + threadIdx.x;
  if (n >= size_n) return;
  int k = 0;
  uint32_t* b_ptr = b_q_weight + n;
@ -1568,7 +1568,7 @@ __global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight,

 __global__ void shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight,
                                    const int size_k, const int size_n) {
-  int n = blockIdx.x * THREADS_X + threadIdx.x;
+  auto n = blockIdx.x * THREADS_X + threadIdx.x;
  if (n >= size_n) return;
  int k = 0;
  uint32_t* b_ptr = b_q_weight + n;
@ -1581,7 +1581,7 @@ __global__ void shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight,

 __global__ void shuffle_3bit_kernel(uint32_t* __restrict__ b_q_weight,
                                    const int size_k, const int size_n) {
-  int n = blockIdx.x * THREADS_X + threadIdx.x;
+  auto n = blockIdx.x * THREADS_X + threadIdx.x;
  if (n >= size_n) return;
  int k = 0;
  uint32_t* b_ptr = b_q_weight + n;
@ -1599,9 +1599,9 @@ __global__ void make_sequential_4bit_kernel(const uint32_t* __restrict__ w,
  const uint64_t* w2 = (uint64_t*)w;
  uint64_t* w_new2 = (uint64_t*)w_new;
  int w2_stride = w_width >> 1;
-  int w2_column = THREADS_X * blockIdx.x + threadIdx.x;
+  auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
  if (w2_column >= w2_stride) return;
-  int w_new2_row = blockIdx.y;
+  auto w_new2_row = blockIdx.y;
  int q_perm_idx = w_new2_row << 3;
  uint64_t dst = 0;

@ -1630,9 +1630,9 @@ __global__ void make_sequential_2bit_kernel(const uint32_t* __restrict__ w,
  const uint64_t* w2 = (uint64_t*)w;
  uint64_t* w_new2 = (uint64_t*)w_new;
  int w2_stride = w_width >> 1;
-  int w2_column = THREADS_X * blockIdx.x + threadIdx.x;
+  auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
  if (w2_column >= w2_stride) return;
-  int w_new2_row = blockIdx.y;
+  auto w_new2_row = blockIdx.y;
  int q_perm_idx = w_new2_row << 4;
  uint64_t dst = 0;

@ -1658,10 +1658,10 @@ __global__ void make_sequential_3bit_kernel(const uint32_t* __restrict__ w,
                                            uint32_t* __restrict__ w_new,
                                            const int* __restrict__ q_perm,
                                            const int w_width) {
-  int w_column = THREADS_X * blockIdx.x + threadIdx.x;
+  auto w_column = THREADS_X * blockIdx.x + threadIdx.x;
  if (w_column >= w_width) return;
-  int w_new_row = blockIdx.y * 3;
-  int q_perm_idx = blockIdx.y << 5;
+  auto w_new_row = blockIdx.y * 3;
+  auto q_perm_idx = blockIdx.y << 5;
  uint32_t dst[3] = {0, 0, 0};

 #pragma unroll
@ -1744,9 +1744,9 @@ __global__ void make_sequential_8bit_kernel(const uint32_t* __restrict__ w,
  const uint64_t* w2 = (uint64_t*)w;
  uint64_t* w_new2 = (uint64_t*)w_new;
  int w2_stride = w_width >> 1;
-  int w2_column = THREADS_X * blockIdx.x + threadIdx.x;
+  auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
  if (w2_column >= w2_stride) return;
-  int w_new2_row = blockIdx.y;
+  auto w_new2_row = blockIdx.y;
  int q_perm_idx = w_new2_row << 2;
  uint64_t dst = 0;

--- a/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu
+++ b/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu
@ -55,11 +55,11 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
    this_block_B_base_ptr = params.B_ptr + blockIdx.y * Ntile * params.K +
                            blockIdx.z * params.SplitK * 4;

-    const int lane_id = threadIdx.x % WARP_SIZE;
+    const auto lane_id = threadIdx.x % WARP_SIZE;

    // For matrix A, a block load/store Mtile(row) x 32(col) elements in
    // multiple iters, 8x4 warp load/store 8(row) x 32(col) elements per iter
-    const int Aldg_row_base_idx = threadIdx.x / 4;
+    const auto Aldg_row_base_idx = threadIdx.x / 4;
    Aldg_col_idx = (threadIdx.x % 4) * LDG_ELEMENT_CNT_A;
    const int Aldg_base_offset = Aldg_row_base_idx * params.K + Aldg_col_idx;

@ -67,7 +67,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
    // elements of N32K16 packing in multiple iters, 4x8 warp load/store 4(row)
    // * 128(col) per iter
    Bldg_col_idx = (threadIdx.x % 8) * LDG_ELEMENT_CNT_B;
-    const int Bldg_row_base_idx = threadIdx.x / 8;
+    const auto Bldg_row_base_idx = threadIdx.x / 8;
    const int Bldg_base_offset =
        Bldg_row_base_idx * params.K * 4 + Bldg_col_idx;

@ -89,7 +89,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
    B_ldg_guard = 0;
  #pragma unroll
    for (int i = 0; i < (Mtile + M_SIZE_ONE_LOAD - 1) / M_SIZE_ONE_LOAD; ++i) {
-      int m_idx = blockIdx.x * Mtile + Aldg_row_base_idx + i * M_SIZE_ONE_LOAD;
+      auto m_idx = blockIdx.x * Mtile + Aldg_row_base_idx + i * M_SIZE_ONE_LOAD;
      if (m_idx < params.M) {
        A_ldg_guard |= (1u << i);
      }
@ -98,8 +98,8 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
    const int N_padded = (params.N + 31) / 32 * 32;
  #pragma unroll
    for (int i = 0; i < (Ntile + N_SIZE_ONE_LOAD - 1) / N_SIZE_ONE_LOAD; ++i) {
-      int n_idx = blockIdx.y * Ntile + (Bldg_row_base_idx / 8) * 32 +
-                  i * N_SIZE_ONE_LOAD;
+      auto n_idx = blockIdx.y * Ntile + (Bldg_row_base_idx / 8) * 32 +
+                   i * N_SIZE_ONE_LOAD;
      if (n_idx < N_padded) {
        B_ldg_guard |= (1u << i);
      }
@ -355,7 +355,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
  __device__ void fused_splitk_reduce() {
    // need splitk-reduce if enable splitk
    if (gridDim.z > 1) {
-      int blk_red_idx = blockIdx.x * gridDim.y + blockIdx.y;
+      auto blk_red_idx = blockIdx.x * gridDim.y + blockIdx.y;
      // Wait for all previous blocks in the splitk direction to accumulate the
      // results into C_tmp
      if (threadIdx.x == 0) {
@ -371,7 +371,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
      }
      __syncthreads();

-      int C_tmp_base_offset = blk_red_idx * Mtile * Ntile + threadIdx.x * 4;
+      auto C_tmp_base_offset = blk_red_idx * Mtile * Ntile + threadIdx.x * 4;
      if (blockIdx.z != 0) {
        // expecting that temporary register here reuses the previous A&B frag
        // register
@ -437,9 +437,10 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
      for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) {
  #pragma unroll
        for (int k_idx = 0; k_idx < 2; ++k_idx) {
-          FType low16 = static_cast<FType>(C_frag[m_idx][n_idx][k_idx * 2]);
+          FType low16 =
+              ScalarType<FType>::float2num(C_frag[m_idx][n_idx][k_idx * 2]);
          FType high16 =
-              static_cast<FType>(C_frag[m_idx][n_idx][k_idx * 2 + 1]);
+              ScalarType<FType>::float2num(C_frag[m_idx][n_idx][k_idx * 2 + 1]);
          uint32_t tmp = (reinterpret_cast<uint32_t&>(low16) & 0xffff) |
                         (reinterpret_cast<uint32_t&>(high16) << 16);
          int sts_offset =
@ -455,7 +456,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {

    FType* C_base_ptr = this_block_C_base_ptr + store_c_base_offset;
    // C_tile lds and stg
-    int m_base_idx = store_c_row_base_idx + blockIdx.x * Mtile;
+    auto m_base_idx = store_c_row_base_idx + blockIdx.x * Mtile;
    bool n_guard = (store_c_col_idx + blockIdx.y * Ntile) < params.N;
    if (WARP_NTILE == 32) {
      int lds_c_base_offset = warp_id * Mtile * WARP_NTILE +
@ -579,9 +580,9 @@ __global__ void __launch_bounds__(BLOCK)
  int sts_stage_idx = 0;
  int lds_stage_idx = 0;

-  int tb_k_slice = blockIdx.z * params.SplitK + params.SplitK <= params.K
-                       ? params.SplitK
-                       : params.K - blockIdx.z * params.SplitK;
+  auto tb_k_slice = blockIdx.z * params.SplitK + params.SplitK <= params.K
+                        ? params.SplitK
+                        : params.K - blockIdx.z * params.SplitK;
  int k_tiles = (tb_k_slice + 31) / 32;
  int first_k_tile = tb_k_slice - (k_tiles - 1) * 32;

@ -776,13 +777,13 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
    const QT* qdata, const FT* scales, const FT* zeros, FT* fdata,
    const int N_32align, const int N, const int K) {
  __shared__ FT smem[64 * 32];
-  int warp_id = threadIdx.x / 32;
-  int lane_id = threadIdx.x % 32;
-  const int src_row_idx = blockIdx.x * 8 + lane_id / 4;
+  auto warp_id = threadIdx.x / 32;
+  auto lane_id = threadIdx.x % 32;
+  const auto src_row_idx = blockIdx.x * 8 + lane_id / 4;
  const int src_col_idx =
      blockIdx.y * 64 * 4 + warp_id * 16 * 4 + (lane_id % 4) * 16;
  const int src_offset = src_row_idx * K * 4 + src_col_idx;
-  int params_nidx = blockIdx.x * 32 + (lane_id / 4) * 4;
+  auto params_nidx = blockIdx.x * 32 + (lane_id / 4) * 4;

  QT qval_reg[16];
  const QT* pdata = qdata + src_offset;
@ -793,7 +794,7 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
  FT scale_reg[4];
  *(reinterpret_cast<uint2*>(scale_reg)) =
      *(reinterpret_cast<const uint2*>(scales + params_nidx));
-  FT zero_reg[4] = {0};
+  FT zero_reg[4];
  if (zeros != nullptr) {
    *(reinterpret_cast<uint2*>(zero_reg)) =
        *(reinterpret_cast<const uint2*>(zeros + params_nidx));
@ -809,8 +810,10 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
        reinterpret_cast<typename HalfType<FT>::T2*>(&(fval_reg[ni * 4])));
  #pragma unroll
    for (int ki = 0; ki < 4; ++ki) {
-      fval_reg[ni * 4 + ki] =
-          (fval_reg[ni * 4 + ki] - zero_reg[ni]) * scale_reg[ni];
+      if (zeros != nullptr) {
+        fval_reg[ni * 4 + ki] = __hsub(fval_reg[ni * 4 + ki], zero_reg[ni]);
+      }
+      fval_reg[ni * 4 + ki] = __hmul(fval_reg[ni * 4 + ki], scale_reg[ni]);
      int sts_offset = sts_base_offset + ((ki / 2) * 8 + (ki % 2)) * 32 +
                       ((ni + lane_id % 4) % 4) * 8;
      smem[sts_offset] = fval_reg[ni * 4 + ki];
@ -826,8 +829,8 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
        *reinterpret_cast<uint4*>(smem + lds_base_offset + i * 32 * 32);
  }

-  const int dst_row_base_kidx = blockIdx.y * 64 + threadIdx.x / 4;
-  const int dst_col_nidx = blockIdx.x * 32 + (threadIdx.x % 4) * 8;
+  const auto dst_row_base_kidx = blockIdx.y * 64 + threadIdx.x / 4;
+  const auto dst_col_nidx = blockIdx.x * 32 + (threadIdx.x % 4) * 8;
  #pragma unroll
  for (int i = 0; i < 2; ++i) {
    int dst_row_kidx = dst_row_base_kidx + i * 32;
@ -1005,4 +1008,4 @@ torch::Tensor allspark_w8a16_gemm(

 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
  m.impl("allspark_w8a16_gemm", &allspark_w8a16_gemm);
-}
+}
--- a/csrc/quantization/gptq_allspark/allspark_repack.cu
+++ b/csrc/quantization/gptq_allspark/allspark_repack.cu
@ -13,8 +13,8 @@ __global__ void __launch_bounds__(128)
        const uint8_t* B, const FType* B_scale, const FType* B_zero,
        uint8_t* B_result, FType* B_scale_result, FType* B_zero_result,
        const int K, const int N, const int N_32align) {
-  const int lane_id = threadIdx.x % 32;
-  const int warp_id = threadIdx.x / 32;
+  const auto lane_id = threadIdx.x % 32;
+  const auto warp_id = threadIdx.x / 32;

  if (blockIdx.x != gridDim.x - 1) {
    // Load B
@ -50,7 +50,7 @@ __global__ void __launch_bounds__(128)
    }

    // Store B
-    const int dst_row_base_idx = blockIdx.y * (128 / 4) + (lane_id / 8) * 8;
+    const auto dst_row_base_idx = blockIdx.y * (128 / 4) + (lane_id / 8) * 8;
    const int dst_col_idx =
        blockIdx.x * (64 * 4) + warp_id * 64 + (lane_id % 8) * 8;
    for (int i = 0; i < 8; ++i) {
@ -65,7 +65,7 @@ __global__ void __launch_bounds__(128)
  } else {
    // Load B_scale and B_zero
    FType b_scale_reg, b_zero_reg;
-    int src_offset = blockIdx.y * 128 + threadIdx.x;
+    auto src_offset = blockIdx.y * 128 + threadIdx.x;
    ldg16_cg_0(b_scale_reg, B_scale + src_offset, src_offset < N);
    if (B_zero != nullptr)
      ldg16_cg_0(b_zero_reg, B_zero + src_offset, src_offset < N);
--- a/csrc/quantization/gptq_allspark/allspark_utils.cuh
+++ b/csrc/quantization/gptq_allspark/allspark_utils.cuh
@ -7,6 +7,8 @@
 #include <cuda_fp16.h>
 #include <cuda_bf16.h>
 #include <iostream>
+#include "../gptq_marlin/marlin_dtypes.cuh"
+using marlin::ScalarType;

 namespace allspark {

@ -60,20 +62,20 @@ template <typename FType, int BLOCK, int N_MATRIX>
 __global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C,
                                              uint32_t n, uint32_t n_matrix,
                                              uint32_t matrix_size) {
-  int idx = blockIdx.x * BLOCK + threadIdx.x;
+  auto idx = blockIdx.x * BLOCK + threadIdx.x;

  if (idx >= matrix_size) {
    return;
  }

-  FType sum(0);
+  float sum = 0.f;

  int n_mat = N_MATRIX > 0 ? N_MATRIX : (int)n_matrix;
  for (int i = 0; i < n_mat; ++i) {
-    sum += C_split[idx + i * matrix_size];
+    sum += ScalarType<FType>::num2float(C_split[idx + i * matrix_size]);
  }

-  C[idx] = sum;
+  C[idx] = ScalarType<FType>::float2num(sum);
 }

 template <typename FType>
@ -405,4 +407,4 @@ static __device__ half2 inline num2num2(const half x) {
  return __half2half2(x);
 }

-}  // namespace allspark
+}  // namespace allspark
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@ -127,7 +127,7 @@ __device__ __forceinline__ T from_float(const float& inp) {

 template <typename T>
 __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
-  union tmpcvt {
+  [[maybe_unused]] union tmpcvt {
    uint16_t u;
    _Float16 f;
    __hip_bfloat16 b;
@ -160,7 +160,7 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
 template <typename T>
 __device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1,
                                        const _B16x4& inp2) {
-  union tmpcvt {
+  [[maybe_unused]] union tmpcvt {
    uint16_t u;
    _Float16 f;
    __hip_bfloat16 b;
@ -308,8 +308,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(

  constexpr int GQA_RATIO4 = DIVIDE_ROUND_UP(GQA_RATIO, 4);

-  __shared__ float shared_qk_max[NWARPS][16 + 1];
-  __shared__ float shared_exp_sum[NWARPS][16 + 1];
+  [[maybe_unused]] __shared__ float shared_qk_max[NWARPS][16 + 1];
+  [[maybe_unused]] __shared__ float shared_exp_sum[NWARPS][16 + 1];
  // shared_logits is used for multiple purposes
  __shared__ _B16x4 shared_logits[NWARPS][4][16][4];

@ -426,7 +426,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
    const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
    const int klocal_token_idx =
        TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
-    const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
+    [[maybe_unused]] const int kglobal_token_idx =
+        partition_start_token_idx + klocal_token_idx;
    const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
    const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;

@ -1272,9 +1273,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
  const int seq_idx = blockIdx.y;
  const int context_len = context_lens[seq_idx];
  const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
-  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
  const int warpid = threadIdx.x / WARP_SIZE;
-  const int laneid = threadIdx.x % WARP_SIZE;
+  [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;

  __shared__ float shared_global_exp_sum;
  // max num partitions supported is warp_size * NPAR_LOOPS
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -370,7 +370,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> "
      "bool");
  ops.impl("cutlass_scaled_mm_supports_block_fp8",
-           &cutlass_scaled_mm_supports_fp8);
+           &cutlass_scaled_mm_supports_block_fp8);

  // Check if cutlass sparse scaled_mm is supported for CUDA devices of the
  // given capability
--- a/docs/source/community/meetups.md
+++ b/docs/source/community/meetups.md
@ -4,6 +4,7 @@

 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:

+- [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0)
 - [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing)
 - [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing)
 - [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing)
--- a/docs/source/contributing/model/multimodal.md
+++ b/docs/source/contributing/model/multimodal.md
@ -34,7 +34,8 @@ Further update the model as follows:
            image_features = self.vision_encoder(image_input)
            return self.multi_modal_projector(image_features)

-        def get_multimodal_embeddings(self, **kwargs: object) -> Optional[NestedTensors]:
+        def get_multimodal_embeddings(
+                self, **kwargs: object) -> Optional[MultiModalEmbeddings]:

            # Validate the multimodal input keyword arguments
            image_input = self._parse_and_validate_image_input(**kwargs)
@ -61,7 +62,7 @@ Further update the model as follows:
        def get_input_embeddings(
            self,
            input_ids: torch.Tensor,
-            multimodal_embeddings: Optional[NestedTensors] = None,
+            multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
        ) -> torch.Tensor:

            # `get_input_embeddings` should already be implemented for the language 
--- a/docs/source/contributing/profiling/profiling_index.md
+++ b/docs/source/contributing/profiling/profiling_index.md
@ -124,3 +124,52 @@ nsys stats report1.nsys-rep
 GUI example:

 <img width="1799" alt="Screenshot 2025-03-05 at 11 48 42 AM" src="https://github.com/user-attachments/assets/c7cff1ae-6d6f-477d-a342-bd13c4fc424c" />
+
+## Profiling vLLM Python Code
+
+The Python standard library includes
+[cProfile](https://docs.python.org/3/library/profile.html) for profiling Python
+code. vLLM includes a couple of helpers that make it easy to apply it to a section of vLLM.
+Both the `vllm.utils.cprofile` and `vllm.utils.cprofile_context` functions can be
+used to profile a section of code.
+
+### Example usage - decorator
+
+The first helper is a Python decorator that can be used to profile a function.
+If a filename is specified, the profile will be saved to that file. If no filename is
+specified, profile data will be printed to stdout.
+
+```python
+import vllm.utils
+
+@vllm.utils.cprofile("expensive_function.prof")
+def expensive_function():
+    # some expensive code
+    pass
+```
+
+### Example Usage - context manager
+
+The second helper is a context manager that can be used to profile a block of
+code. Similar to the decorator, the filename is optional.
+
+```python
+import vllm.utils
+
+def another_function():
+    # more expensive code
+    pass
+
+with vllm.utils.cprofile_context("another_function.prof"):
+    another_function()
+```
+
+### Analyzing Profile Results
+
+There are multiple tools available that can help analyze the profile results.
+One example is [snakeviz](https://jiffyclub.github.io/snakeviz/).
+
+```bash
+pip install snakeviz
+snakeviz expensive_function.prof
+```
--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@ -34,11 +34,11 @@ If you need to use those dependencies (having accepted the license terms),
 create a custom Dockerfile on top of the base image with an extra layer that installs them:

 ```Dockerfile
-FROM vllm/vllm-openai:v0.7.3
+FROM vllm/vllm-openai:v0.8.0

 # e.g. install the `audio` and `video` optional dependencies
 # NOTE: Make sure the version of vLLM matches the base image!
-RUN uv pip install --system vllm[audio,video]==0.7.3
+RUN uv pip install vllm[audio,video]==0.8.0
 ```

 :::
@ -52,7 +52,7 @@ with an extra layer that installs their code from source:
 ```Dockerfile
 FROM vllm/vllm-openai:latest

-RUN uv pip install --system git+https://github.com/huggingface/transformers.git
+RUN uv pip install git+https://github.com/huggingface/transformers.git
 ```

 :::
--- a/docs/source/deployment/frameworks/helm.md
+++ b/docs/source/deployment/frameworks/helm.md
@ -4,9 +4,9 @@

 A Helm chart to deploy vLLM for Kubernetes

-Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLMm Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variables values.
+Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values.

-This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm install and documentation on architecture and values file.
+This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm installation and documentation on architecture and values file.

 ## Prerequisites

--- a/docs/source/deployment/frameworks/lws.md
+++ b/docs/source/deployment/frameworks/lws.md
@ -7,5 +7,192 @@ A major use case is for multi-host/multi-node distributed inference.

 vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kubernetes for distributed model serving.

-Please see [this guide](https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/vllm) for more details on
-deploying vLLM on Kubernetes using LWS.
+## Prerequisites
+
+* At least two Kubernetes nodes, each with 8 GPUs, are required.
+* Install LWS by following the instructions found [here](https://lws.sigs.k8s.io/docs/installation/).
+
+## Deploy and Serve
+
+Deploy the following yaml file `lws.yaml`
+
+```yaml
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: vllm
+spec:
+  replicas: 2
+  leaderWorkerTemplate:
+    size: 2
+    restartPolicy: RecreateGroupOnPodRestart
+    leaderTemplate:
+      metadata:
+        labels:
+          role: leader
+      spec:
+        containers:
+          - name: vllm-leader
+            image: docker.io/vllm/vllm-openai:latest
+            env:
+              - name: HUGGING_FACE_HUB_TOKEN
+                value: <your-hf-token>
+            command:
+              - sh
+              - -c
+              - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); 
+                 python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
+            resources:
+              limits:
+                nvidia.com/gpu: "8"
+                memory: 1124Gi
+                ephemeral-storage: 800Gi
+              requests:
+                ephemeral-storage: 800Gi
+                cpu: 125
+            ports:
+              - containerPort: 8080
+            readinessProbe:
+              tcpSocket:
+                port: 8080
+              initialDelaySeconds: 15
+              periodSeconds: 10
+            volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+        volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 15Gi
+    workerTemplate:
+      spec:
+        containers:
+          - name: vllm-worker
+            image: docker.io/vllm/vllm-openai:latest
+            command:
+              - sh
+              - -c
+              - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
+            resources:
+              limits:
+                nvidia.com/gpu: "8"
+                memory: 1124Gi
+                ephemeral-storage: 800Gi
+              requests:
+                ephemeral-storage: 800Gi
+                cpu: 125
+            env:
+              - name: HUGGING_FACE_HUB_TOKEN
+                value: <your-hf-token>
+            volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm   
+        volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 15Gi
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-leader
+spec:
+  ports:
+    - name: http
+      port: 8080
+      protocol: TCP
+      targetPort: 8080
+  selector:
+    leaderworkerset.sigs.k8s.io/name: vllm
+    role: leader
+  type: ClusterIP
+```
+
+```bash
+kubectl apply -f lws.yaml
+```
+
+Verify the status of the pods:
+
+```bash
+kubectl get pods
+```
+
+Should get an output similar to this:
+
+```bash
+NAME       READY   STATUS    RESTARTS   AGE
+vllm-0     1/1     Running   0          2s
+vllm-0-1   1/1     Running   0          2s
+vllm-1     1/1     Running   0          2s
+vllm-1-1   1/1     Running   0          2s
+```
+
+Verify that the distributed tensor-parallel inference works:
+
+```bash
+kubectl logs vllm-0 |grep -i "Loading model weights took" 
+```
+
+Should get something similar to this:
+
+```text
+INFO 05-08 03:20:24 model_runner.py:173] Loading model weights took 0.1189 GB
+(RayWorkerWrapper pid=169, ip=10.20.0.197) INFO 05-08 03:20:28 model_runner.py:173] Loading model weights took 0.1189 GB
+```
+
+## Access ClusterIP service
+
+```bash
+# Listen on port 8080 locally, forwarding to the targetPort of the service's port 8080 in a pod selected by the service
+kubectl port-forward svc/vllm-leader 8080:8080
+```
+
+The output should be similar to the following:
+
+```text
+Forwarding from 127.0.0.1:8080 -> 8080
+Forwarding from [::1]:8080 -> 8080
+```
+
+## Serve the model
+
+Open another terminal and send a request
+
+```text
+curl http://localhost:8080/v1/completions \
+-H "Content-Type: application/json" \
+-d '{
+    "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
+    "prompt": "San Francisco is a",
+    "max_tokens": 7,
+    "temperature": 0
+}'
+```
+
+The output should be similar to the following
+
+```text
+{
+  "id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
+  "object": "text_completion",
+  "created": 1715138766,
+  "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
+  "choices": [
+    {
+      "index": 0,
+      "text": " top destination for foodies, with",
+      "logprobs": null,
+      "finish_reason": "length",
+      "stop_reason": null
+    }
+  ],
+  "usage": {
+    "prompt_tokens": 5,
+    "total_tokens": 12,
+    "completion_tokens": 7
+  }
+}
+```
--- a/docs/source/deployment/k8s.md
+++ b/docs/source/deployment/k8s.md
@ -4,17 +4,19 @@

 Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes.

--------
-
-Alternatively, you can also deploy Kubernetes using [helm chart](https://docs.vllm.ai/en/latest/deployment/frameworks/helm.html). There are also open-source projects available to make your deployment even smoother.
-
-* [vLLM production-stack](https://github.com/vllm-project/production-stack): Born out of a Berkeley-UChicago collaboration, vLLM production stack is a project that contains latest research and community effort, while still delivering production-level stability and performance. Checkout the [documentation page](https://docs.vllm.ai/en/latest/deployment/integrations/production-stack.html) for more details and examples.
-
--------
+Alternatively, you can deploy vLLM to Kubernetes using any of the following:
+* [Helm](frameworks/helm.md)
+* [InftyAI/llmaz](integrations/llmaz.md)
+* [KServe](integrations/kserve.md)
+* [kubernetes-sigs/lws](frameworks/lws.md)
+* [meta-llama/llama-stack](integrations/llamastack.md)
+* [substratusai/kubeai](integrations/kubeai.md)
+* [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
+* [vllm-project/production-stack](integrations/production-stack.md)

 ## Pre-requisite

-Ensure that you have a running Kubernetes environment with GPU (you can follow [this tutorial](https://github.com/vllm-project/production-stack/blob/main/tutorials/00-install-kubernetes-env.md) to install a Kubernetes environment on a bare-medal GPU machine).
+Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).

 ## Deployment using native K8s

--- a/docs/source/design/kernel/paged_attention.md
+++ b/docs/source/design/kernel/paged_attention.md
@ -419,7 +419,7 @@ List of `v_vec` for one thread
  which is also `V_VEC_SIZE` elements from `logits`. Overall, with
  multiple inner iterations, each warp will process one block of value
  tokens. And with multiple outer iterations, the whole context value
-  tokens are processd
+  tokens are processed

  ```cpp
  float accs[NUM_ROWS_PER_THREAD];
--- a/docs/source/design/v1/metrics.md
+++ b/docs/source/design/v1/metrics.md
@ -13,7 +13,7 @@ Ensure the v1 LLM Engine exposes a superset of the metrics available in v0.
 Metrics in vLLM can be categorized as follows:

 1. Server-level metrics: these are global metrics that track the state and performance of the LLM engine. These are typically exposed as Gauges or Counters in Prometheus.
-2. Request-level metrics: these are metrics that track the characteristics - e.g. size and timing - of individual requests. These are typically exposed as Histrograms in Prometheus, and are often the SLO that an SRE monitoring vLLM will be tracking.
+2. Request-level metrics: these are metrics that track the characteristics - e.g. size and timing - of individual requests. These are typically exposed as Histograms in Prometheus, and are often the SLO that an SRE monitoring vLLM will be tracking.

 The mental model is that the "Server-level Metrics" explain why the "Request-level Metrics" are what they are.

@ -47,7 +47,7 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics`
 - `vllm:tokens_total` (Counter)
 - `vllm:iteration_tokens_total` (Histogram)
 - `vllm:time_in_queue_requests` (Histogram)
- `vllm:model_forward_time_milliseconds` (Histogram
+- `vllm:model_forward_time_milliseconds` (Histogram)
 - `vllm:model_execute_time_milliseconds` (Histogram)
 - `vllm:request_params_n` (Histogram)
 - `vllm:request_params_max_tokens` (Histogram)
--- a/docs/source/design/v1/prefix_caching.md
+++ b/docs/source/design/v1/prefix_caching.md
@ -191,7 +191,7 @@ When the head block (least recently used block) of the free queue is cached, we

 In this example, we assume the block size is 4 (each block can cache 4 tokens), and we have 10 blocks in the KV-cache manager in total.

-**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 2 of 4 tokens.
+**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens.

 :::{image} /assets/design/v1/prefix_caching/example-time-1.png
 :alt: Example Time 1
@ -203,7 +203,7 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),
 :alt: Example Time 3
 :::

-**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 11 tokens are the same as request 0.** We can see that only 2 blocks (11 tokens) hit the cache, because the 3rd block only matches 3 of 4 tokens.
+**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.

 :::{image} /assets/design/v1/prefix_caching/example-time-4.png
 :alt: Example Time 4
--- a/docs/source/features/lora.md
+++ b/docs/source/features/lora.md
@ -110,7 +110,7 @@ In addition to serving LoRA adapters at server startup, the vLLM server now supp
 LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility
 to change models on-the-fly is needed.

-Note: Enabling this feature in production environments is risky as user may participate model adapter management.
+Note: Enabling this feature in production environments is risky as users may participate in model adapter management.

 To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING`
 is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active.
--- a/docs/source/features/quantization/bnb.md
+++ b/docs/source/features/quantization/bnb.md
@ -25,7 +25,7 @@ import torch
 # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
 model_id = "unsloth/tinyllama-bnb-4bit"
 llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
-quantization="bitsandbytes", load_format="bitsandbytes")
+quantization="bitsandbytes")
 ```

 ## Inflight quantization: load as 4bit quantization
@ -35,7 +35,7 @@ from vllm import LLM
 import torch
 model_id = "huggyllama/llama-7b"
 llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
-quantization="bitsandbytes", load_format="bitsandbytes")
+quantization="bitsandbytes")
 ```

 ## OpenAI Compatible Server
@ -43,5 +43,5 @@ quantization="bitsandbytes", load_format="bitsandbytes")
 Append the following to your 4bit model arguments:

 ```console
--quantization bitsandbytes --load-format bitsandbytes
+--quantization bitsandbytes
 ```
--- a/docs/source/features/spec_decode.md
+++ b/docs/source/features/spec_decode.md
@ -162,7 +162,7 @@ A variety of speculative models of this type are available on HF hub:
 ## Speculating using EAGLE based draft models

 The following code configures vLLM to use speculative decoding where proposals are generated by
-an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model.
+an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](<gh-file:examples/offline_inference/eagle.py>).

 ```python
 from vllm import LLM, SamplingParams
--- a/docs/source/getting_started/faq.md
+++ b/docs/source/getting_started/faq.md
@ -15,7 +15,7 @@ more are listed [here](#supported-models).

 By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B),
 [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models,
-but they are expected be inferior to models that are specifically trained on embedding tasks.
+but they are expected to be inferior to models that are specifically trained on embedding tasks.

 ______________________________________________________________________

--- a/docs/source/getting_started/installation.md
+++ b/docs/source/getting_started/installation.md
@ -26,4 +26,3 @@ installation/ai_accelerator
  - Google TPU
  - Intel Gaudi
  - AWS Neuron
-  - OpenVINO
--- a/docs/source/getting_started/installation/ai_accelerator.md
+++ b/docs/source/getting_started/installation/ai_accelerator.md
@ -36,16 +36,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you

 ::::

-::::{tab-item} OpenVINO
-:sync: openvino
-
-:::{include} ai_accelerator/openvino.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
-
-::::
-
 :::::

 ## Requirements
@ -83,16 +73,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you

 ::::

-::::{tab-item} OpenVINO
-:sync: openvino
-
-:::{include} ai_accelerator/openvino.inc.md
-:start-after: "## Requirements"
-:end-before: "## Set up using Python"
-:::
-
-::::
-
 :::::

 ## Configure a new environment
@ -130,14 +110,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you

 ::::

-::::{tab-item} OpenVINO
-:sync: openvino
-
-:::{include} python_env_setup.inc.md
-:::
-
-::::
-
 :::::

 ## Set up using Python
@ -177,16 +149,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you

 ::::

-::::{tab-item} OpenVINO
-:sync: openvino
-
-:::{include} ai_accelerator/openvino.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-:::
-
-::::
-
 :::::

 ### Build wheel from source
@ -224,16 +186,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you

 ::::

-::::{tab-item} OpenVINO
-:sync: openvino
-
-:::{include} ai_accelerator/openvino.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
-
-::::
-
 :::::

 ## Set up using Docker
@ -273,16 +225,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you

 ::::

-::::{tab-item} OpenVINO
-:sync: openvino
-
-:::{include} ai_accelerator/openvino.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-:::
-
-::::
-
 :::::

 ### Build image from source
@ -320,16 +262,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you

 ::::

-::::{tab-item} OpenVINO
-:sync: openvino
-
-:::{include} ai_accelerator/openvino.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Extra information"
-:::
-
-::::
-
 :::::

 ## Extra information
@ -364,13 +296,4 @@ vLLM is a Python library that supports the following AI accelerators. Select you

 ::::

-::::{tab-item} OpenVINO
-:sync: openvino
-
-:::{include} ai_accelerator/openvino.inc.md
-:start-after: "## Extra information"
-:::
-
-::::
-
 :::::
--- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
@ -119,7 +119,7 @@ If you're observing the following error: `docker: Error response from daemon: Un

 ## Supported configurations

-The following configurations have been validated to be function with
+The following configurations have been validated to function with
 Gaudi2 devices. Configurations that are not listed may or may not work.

 - [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b)
--- a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
@ -1,110 +0,0 @@
-# Installation
-
-vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)).
-
-:::{attention}
-There are no pre-built wheels or images for this device, so you must build vLLM from source.
-:::
-
-## Requirements
-
- OS: Linux
- Instruction set architecture (ISA) requirement: at least AVX2.
-
-## Set up using Python
-
-### Pre-built wheels
-
-Currently, there are no pre-built OpenVINO wheels.
-
-### Build wheel from source
-
-First, install Python and ensure you lave the latest pip. For example, on Ubuntu 22.04, you can run:
-
-```console
-sudo apt-get update  -y
-sudo apt-get install python3
-pip install --upgrade pip
-```
-
-Second, clone vLLM and install prerequisites for the vLLM OpenVINO backend installation:
-
-```console
-git clone https://github.com/vllm-project/vllm.git
-cd vllm
-pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu
-```
-
-Finally, install vLLM with OpenVINO backend:
-
-```console
-PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
-```
-
-:::{tip}
-To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html).
-:::
-
-## Set up using Docker
-
-### Pre-built images
-
-Currently, there are no pre-built OpenVINO images.
-
-### Build image from source
-
-```console
-docker build -f Dockerfile.openvino -t vllm-openvino-env .
-docker run -it --rm vllm-openvino-env
-```
-
-## Extra information
-
-## Supported features
-
-OpenVINO vLLM backend supports the following advanced vLLM features:
-
- Prefix caching (`--enable-prefix-caching`)
- Chunked prefill (`--enable-chunked-prefill`)
-
-## Performance tips
-
-### vLLM OpenVINO backend environment variables
-
- `VLLM_OPENVINO_DEVICE` to specify which device utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, `VLLM_OPENVINO_DEVICE=GPU.1`). If the value is not specified, CPU device is used by default.
- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export model with different compression techniques using `optimum-cli` and pass exported folder as `<model_id>`
-
-### CPU performance tips
-
-CPU uses the following environment variables to control behavior:
-
- `VLLM_OPENVINO_KVCACHE_SPACE` to specify the KV Cache size (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
- `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` to control KV cache precision. By default, FP16 / BF16 is used depending on platform.
-
-To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (`--enable-chunked-prefill`). Based on the experiments, the recommended batch size is `256` (`--max-num-batched-tokens`)
-
-OpenVINO best known configuration for CPU is:
-
-```console
-$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
-    python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
-```
-
-### GPU performance tips
-
-GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account `gpu_memory_utilization` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using `VLLM_OPENVINO_KVCACHE_SPACE` environment variable (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=8` means 8 GB space for KV cache).
-
-Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`.
-
-OpenVINO best known configuration for GPU is:
-
-```console
-$ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
-    python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json
-```
-
-## Limitations
-
- LoRA serving is not supported.
- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration.
- Tensor and pipeline parallelism are not currently enabled in vLLM integration.
--- a/docs/source/getting_started/installation/cpu.md
+++ b/docs/source/getting_started/installation/cpu.md
@ -189,12 +189,13 @@ vLLM CPU backend supports the following vLLM features:
 - Model Quantization (`INT8 W8A8, AWQ, GPTQ`)
 - Chunked-prefill
 - Prefix-caching
- FP8-E5M2 KV-Caching (TODO)
+- FP8-E5M2 KV cache

 ## Related runtime environment variables

 - `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
 - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
+- `VLLM_CPU_MOE_PREPACK`: whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).

 ## Performance tips

--- a/docs/source/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/source/getting_started/installation/gpu/cuda.inc.md
@ -131,6 +131,8 @@ Building from source requires a lot of compilation. If you are building from sou
 For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` .
 As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.

+When using `ccache` with `pip install -e .`, you should run `CCACHE_NOHASHDIR="true" pip install --no-build-isolation -e .`. This is because `pip` creates a new folder with a random name for each build, preventing `ccache` from recognizing that the same files are being built.
+
 [sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments.
 The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.
 :::
--- a/docs/source/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/source/getting_started/installation/gpu/xpu.inc.md
@ -1,6 +1,6 @@
 # Installation

-vLLM initially supports basic model inferencing and serving on Intel GPU platform.
+vLLM initially supports basic model inference and serving on Intel GPU platform.

 :::{attention}
 There are no pre-built wheels or images for this device, so you must build vLLM from source.
@ -65,7 +65,7 @@ $ docker run -it \

 ## Supported features

-XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following:
+XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution like following:

 ```console
 python -m vllm.entrypoints.openai.api_server \
@ -78,6 +78,6 @@ python -m vllm.entrypoints.openai.api_server \
     -tp=8
 ```

-By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.
+By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.

-There are some new features coming with ipex-xpu 2.6, eg: **chunked prefill**, **V1 engine support**, **lora**, **MoE**, etc.
+There are some new features coming with ipex-xpu 2.6, e.g. **chunked prefill**, **V1 engine support**, **lora**, **MoE**, etc.
--- a/docs/source/getting_started/v1_user_guide.md
+++ b/docs/source/getting_started/v1_user_guide.md
@ -0,0 +1,161 @@
+# vLLM V1 User Guide
+
+V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
+
+To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason!
+
+## Why vLLM V1?
+
+vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.
+
+Building on V0’s success, vLLM V1 retains the stable and proven components from V0
+(such as the models, GPU kernels, and utilities). At the same time, it significantly
+re-architects the core systems, covering the scheduler, KV cache manager, worker,
+sampler, and API server, to provide a cohesive, maintainable framework that better
+accommodates continued growth and innovation.
+
+Specifically, V1 aims to:
+
+- Provide a **simple, modular, and easy-to-hack codebase**.
+- Ensure **high performance** with near-zero CPU overhead.
+- **Combine key optimizations** into a unified architecture.
+- Require **zero configs** by enabling features/optimizations by default.
+
+We see significant performance improvements from upgrading to V1 core engine, in
+particular for long context scenarios. Please see performance benchmark (To be
+added).
+
+For more details, check out the vLLM V1 blog post [vLLM V1: A Major
+Upgrade to vLLM’s Core Architecture](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html) (published Jan 27, 2025).
+
+This living user guide outlines a few known **important changes and limitations** introduced by vLLM V1. The team has been working actively to bring V1 as the default engine, therefore this guide will be updated constantly as more features get supported on vLLM V1.
+
+### Supports Overview
+#### Hardware
+
+| Hardware | Status                                   |
+|----------|------------------------------------------|
+| **NVIDIA** | <nobr>🚀 Natively Supported</nobr>         |
+| **AMD**    | <nobr>🚧 WIP</nobr>           |
+| **TPU**    | <nobr>🚧 WIP</nobr>           |
+#### Feature / Model
+
+| Feature / Model | Status |
+|-----------------|-----------------------------------------------------------------------------------|
+| **Prefix Caching**                    | <nobr>🚀 Optimized</nobr>                                                        |
+| **Chunked Prefill**                    | <nobr>🚀 Optimized</nobr>                                                        |
+| **Logprobs Calculation**                    | <nobr>🟢 Functional</nobr>                                                        |
+| **LoRA**                                    | <nobr>🟢 Functional ([PR #13096](https://github.com/vllm-project/vllm/pull/13096))</nobr>|
+| **Multimodal Models**                       | <nobr>🟢 Functional</nobr>                                                        |
+| **Spec Decode**                             | <nobr>🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))</nobr>|
+| **Prompt Logprobs with Prefix Caching**     | <nobr>🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))</nobr>|
+| **FP8 KV Cache**                            | <nobr>🟡 Planned</nobr>                                                           |
+| **Structured Output Alternative Backends**  | <nobr>🟡 Planned</nobr>                                                           |
+| **Embedding Models**                        | <nobr>🟡 Planned ([RFC #12249](https://github.com/vllm-project/vllm/issues/12249))</nobr> |
+| **Mamba Models**                            | <nobr>🟡 Planned</nobr>                                                           |
+| **Encoder-Decoder Models**                  | <nobr>🟡 Planned</nobr>                                                           |
+| **Request-level Structured Output Backend** | <nobr>🔴 Deprecated</nobr>                                                        |
+| **best_of**                                 | <nobr>🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))</nobr>|
+| **Per-Request Logits Processors**           | <nobr>🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360))</nobr> |
+| **GPU <> CPU KV Cache Swapping**            | <nobr>🔴 Deprecated</nobr>                                                        |
+
+- **🚀 Optimized**: Nearly fully optimized, with no further work currently planned.
+- **🟢 Functional**: Fully operational, with ongoing optimizations.  
+- **🚧 WIP**: Under active development.  
+- **🟡 Planned**: Scheduled for future implementation (some may have open PRs/RFCs).  
+- **🔴 Deprecated**: Not planned for v1 unless there is strong demand.
+
+**Note**: vLLM V1’s unified scheduler treats both prompt and output tokens the same
+way by using a simple dictionary (e.g., {request_id: num_tokens}) to dynamically
+allocate a fixed token budget per request, enabling features like chunked prefills,
+prefix caching, and speculative decoding without a strict separation between prefill
+and decode phases.
+
+### Semantic Changes and Deprecated Features
+
+#### Logprobs
+
+vLLM V1 supports logprobs and prompt logprobs. However, there are some important semantic
+differences compared to V0:
+
+**Logprobs Calculation**
+
+Logprobs in V1 are now returned immediately once computed from the model’s raw output (i.e.
+before applying any logits post-processing such as temperature scaling or penalty
+adjustments). As a result, the returned logprobs do not reflect the final adjusted
+probabilities used during sampling.
+
+Support for logprobs with post-sampling adjustments is in progress and will be added in future updates.
+
+**Prompt Logprobs with Prefix Caching**
+
+Currently prompt logprobs are only supported when prefix caching is turned off via `--no-enable-prefix-caching`. In a future release, prompt logprobs will be compatible with prefix caching, but a recomputation will be triggered to recover the full prompt logprobs even upon a prefix cache hit. See details in [RFC #13414](https://github.com/vllm-project/vllm/issues/13414).
+
+#### Deprecated Features
+
+As part of the major architectural rework in vLLM V1, several legacy features have been deprecated.
+
+**Sampling features**
+
+- **best_of**: This feature has been deprecated due to limited usage. See details at [RFC #13361](https://github.com/vllm-project/vllm/issues/13361).
+- **Per-Request Logits Processors**: In V0, users could pass custom
+  processing functions to adjust logits on a per-request basis. In vLLM V1, this
+  feature has been deprecated. Instead, the design is moving toward supporting **global logits
+  processors**, a feature the team is actively working on for future releases. See details at [RFC #13360](https://github.com/vllm-project/vllm/pull/13360).
+
+**KV Cache features**
+
+- **GPU <> CPU KV Cache Swapping**: with the new simplified core architecture, vLLM V1 no longer requires KV cache swapping
+to handle request preemptions.
+
+**Structured Output features**
+
+- **Request-level Structured Output Backend**: Deprecated, alternative backends
+  (outlines, guidance) with fallbacks is WIP.
+### Feature & Model Support in Progress
+
+Although we have re-implemented and partially optimized many features and models from V0 in vLLM V1, optimization work is still ongoing for some, and others remain unsupported.
+
+#### Features to Be Optimized
+
+These features are already supported in vLLM V1, but their optimization is still
+in progress.
+
+- **LoRA**: LoRA is functionally working on vLLM V1 but its performance is
+  inferior to that of V0. The team is actively working on improving its
+  performance
+(e.g., see [PR #13096](https://github.com/vllm-project/vllm/pull/13096)).
+
+- **Spec Decode**: Currently, only ngram-based spec decode is supported in V1. There
+  will be follow-up work to support other types of spec decode (e.g., see [PR #13933](https://github.com/vllm-project/vllm/pull/13933)). We will prioritize the support for Eagle, MTP compared to draft model based spec decode.
+
+#### Features to Be Supported
+
+- **FP8 KV Cache**: While vLLM V1 introduces new FP8 kernels for model weight quantization, support for an FP8 key–value cache is not yet available. Users must continue using FP16 (or other supported precisions) for the KV cache.
+
+- **Structured Output Alternative Backends**: Structured output alternative backends (outlines, guidance) support is planned. V1 currently
+  supports only the `xgrammar:no_fallback` mode, meaning that it will error out if the output schema is unsupported by xgrammar.
+  Details about the structured outputs can be found
+  [here](https://docs.vllm.ai/en/latest/features/structured_outputs.html).
+
+#### Models to Be Supported
+
+vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol,
+and the majority fall into the following categories. V1 support for these models will be added eventually.
+
+**Embedding Models**  
+Instead of having a separate model runner, hidden states processor [RFC #12249](https://github.com/vllm-project/vllm/issues/12249), which is based on global logits processor [RFC #13360](https://github.com/vllm-project/vllm/pull/13360), has been proposed to enable simultaneous generation and embedding using the same engine instance in V1. It is still in the planning stage.
+
+**Mamba Models**  
+Models using selective state-space mechanisms (instead of standard transformer attention)
+are not yet supported (e.g., `MambaForCausalLM`, `JambaForCausalLM`).
+
+**Encoder-Decoder Models**  
+vLLM V1 is currently optimized for decoder-only transformers. Models requiring
+  cross-attention between separate encoder and decoder are not yet supported (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`).
+
+For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html).
+
+## FAQ
+
+TODO
--- a/docs/source/index.md
+++ b/docs/source/index.md
@ -67,6 +67,8 @@ getting_started/quickstart
 getting_started/examples/examples_index
 getting_started/troubleshooting
 getting_started/faq
+getting_started/v1_user_guide
+
 :::

 % What does vLLM support?
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@ -101,7 +101,7 @@ class MyAttention(nn.Module):

  def forward(self, hidden_states, **kwargs): # <- kwargs are required
    ...
-    attention_interface = attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+    attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
    attn_output, attn_weights = attention_interface(
      self,
      query_states,
@ -472,11 +472,21 @@ See [this page](#generative-models) for more information on how to use generativ
  * `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc.
  * ✅︎
  * ✅︎
+- * `TeleFLMForCausalLM`
+  * TeleFLM
+  * `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc.
+  * ✅︎
+  * ✅︎
 - * `XverseForCausalLM`
  * XVERSE
  * `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
  * ✅︎
  * ✅︎
+- * `Zamba2ForCausalLM`
+  * Zamba2
+  * `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc.
+  *
+  *
 :::

 :::{note}
@ -879,7 +889,7 @@ See [this page](#generative-models) for more information on how to use generativ
 - * `PixtralForConditionalGeneration`
  * Pixtral
  * T + I<sup>+</sup>
-  * `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b`, etc.
+  * `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc.
  *
  * ✅︎
  * ✅︎
@ -946,7 +956,7 @@ V0 correctly implements the model's attention pattern:

 V1 currently uses a simplified attention pattern:
 - Uses causal attention for all tokens, including image tokens
- Generates reasonable outputs but does not match the original model's attention for text + image inputs
+- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": True}`
 - Will be updated in the future to support the correct behavior

 This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@ -20,7 +20,7 @@ There is one edge case: if the model fits in a single node with multiple GPUs, b

 ## Running vLLM on a single node

-vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.
+vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inference currently requires Ray.

 Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured `tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the `LLM` class `distributed_executor_backend` argument or `--distributed-executor-backend` API server argument. Set it to `mp` for multiprocessing or `ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.

@ -29,7 +29,7 @@ To run multi-GPU inference with the `LLM` class, set the `tensor_parallel_size`
 ```python
 from vllm import LLM
 llm = LLM("facebook/opt-13b", tensor_parallel_size=4)
-output = llm.generate("San Franciso is a")
+output = llm.generate("San Francisco is a")
 ```

 To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
@ -83,7 +83,7 @@ Since this is a ray cluster of **containers**, all the following commands should

 Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` and `ray list nodes` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.

-After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
+After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as you have all the GPUs on one node: vLLM will be able to leverage GPU resources of all nodes in the Ray cluster, and therefore, only run the `vllm` command on this node but not other nodes. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:

 ```console
 vllm serve /path/to/the/model/in/the/container \
--- a/docs/source/serving/metrics.md
+++ b/docs/source/serving/metrics.md
@ -39,7 +39,16 @@ The following metrics are exposed:

 The following metrics are deprecated and due to be removed in a future version:

- *(No metrics are currently deprecated)*
+- `vllm:num_requests_swapped`, `vllm:cpu_cache_usage_perc`, and
+  `vllm:cpu_prefix_cache_hit_rate` because KV cache offloading is not
+  used in V1.
+- `vllm:gpu_prefix_cache_hit_rate` is replaced by queries+hits
+  counters in V1.
+- `vllm:time_in_queue_requests` because it duplicates
+  `vllm:request_queue_time_seconds`.
+- `vllm:model_forward_time_milliseconds` and
+  `vllm:model_execute_time_milliseconds` because
+  prefill/decode/inference time metrics should be used instead.

 Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
 but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@ -29,6 +29,11 @@ completion = client.chat.completions.create(
 print(completion.choices[0].message)
 ```

+:::{tip}
+vLLM supports some parameters that are not supported by OpenAI, `top_k` for example.
+You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`.
+:::
+
 ## Supported APIs

 We currently support the following OpenAI APIs:
--- a/docs/source/training/rlhf.md
+++ b/docs/source/training/rlhf.md
@ -1,6 +1,6 @@
 # Reinforcement Learning from Human Feedback

-Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviours.
+Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors.

 vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl).

--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@ -7,11 +7,13 @@ For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
 import os
+from dataclasses import asdict
+from typing import NamedTuple, Optional

 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer

-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.lora.request import LoRARequest
 from vllm.utils import FlexibleArgumentParser
@ -23,21 +25,31 @@ question_per_audio_count = {
    2: "What sport and what nursery rhyme are referenced?"
 }

+
+class ModelRequestData(NamedTuple):
+    engine_args: EngineArgs
+    prompt: str
+    stop_token_ids: Optional[list[int]] = None
+    lora_requests: Optional[list[LoRARequest]] = None
+
+
 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
 # lower-end GPUs.
 # Unless specified, these settings have been tested to work on a single L4.


 # MiniCPM-O
-def run_minicpmo(question: str, audio_count: int):
+def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
    model_name = "openbmb/MiniCPM-o-2_6"
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
-    llm = LLM(model=model_name,
-              trust_remote_code=True,
-              max_model_len=4096,
-              max_num_seqs=5,
-              limit_mm_per_prompt={"audio": audio_count})
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"audio": audio_count},
+    )

    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
@ -52,11 +64,16 @@ def run_minicpmo(question: str, audio_count: int):
                                           tokenize=False,
                                           add_generation_prompt=True,
                                           chat_template=audio_chat_template)
-    return llm, prompt, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+    )


 # Phi-4-multimodal-instruct
-def run_phi4mm(questions: str, audio_count: int):
+def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
    """
    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
    show how to process audio inputs.
@ -67,36 +84,35 @@ def run_phi4mm(questions: str, audio_count: int):
    speech_lora_path = os.path.join(model_path, "speech-lora")
    placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)])

-    prompts = f"<|user|>{placeholders}{questions}<|end|><|assistant|>"
+    prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        enable_lora=True,
        max_lora_rank=320,
-        lora_extra_vocab_size=0,
        limit_mm_per_prompt={"audio": audio_count},
    )
-    lora_request = LoRARequest("speech", 1, speech_lora_path)
-    # To maintain code compatibility in this script, we add LoRA here.
-    llm.llm_engine.add_lora(lora_request=lora_request)
-    # You can also add LoRA using:
-    # llm.generate(prompts, lora_request=lora_request,...)

-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompts,
+        lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
+    )


 # Qwen2-Audio
-def run_qwen2_audio(question: str, audio_count: int):
+def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
    model_name = "Qwen/Qwen2-Audio-7B-Instruct"

-    llm = LLM(model=model_name,
-              max_model_len=4096,
-              max_num_seqs=5,
-              limit_mm_per_prompt={"audio": audio_count})
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"audio": audio_count},
+    )

    audio_in_prompt = "".join([
        f"Audio {idx+1}: "
@ -107,12 +123,15 @@ def run_qwen2_audio(question: str, audio_count: int):
              "<|im_start|>user\n"
              f"{audio_in_prompt}{question}<|im_end|>\n"
              "<|im_start|>assistant\n")
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+    )


 # Ultravox 0.5-1B
-def run_ultravox(question: str, audio_count: int):
+def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
    model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
@ -124,29 +143,39 @@ def run_ultravox(question: str, audio_count: int):
                                           tokenize=False,
                                           add_generation_prompt=True)

-    llm = LLM(model=model_name,
-              max_model_len=4096,
-              max_num_seqs=5,
-              trust_remote_code=True,
-              limit_mm_per_prompt={"audio": audio_count})
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=5,
+        trust_remote_code=True,
+        limit_mm_per_prompt={"audio": audio_count},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+    )


 # Whisper
-def run_whisper(question: str, audio_count: int):
+def run_whisper(question: str, audio_count: int) -> ModelRequestData:
    assert audio_count == 1, (
        "Whisper only support single audio input per prompt")
    model_name = "openai/whisper-large-v3-turbo"

    prompt = "<|startoftranscript|>"

-    llm = LLM(model=model_name,
-              max_model_len=448,
-              max_num_seqs=5,
-              limit_mm_per_prompt={"audio": audio_count})
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=448,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"audio": audio_count},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+    )


 model_example_map = {
@ -164,14 +193,24 @@ def main(args):
        raise ValueError(f"Model type {model} is not supported.")

    audio_count = args.num_audios
-    llm, prompt, stop_token_ids = model_example_map[model](
-        question_per_audio_count[audio_count], audio_count)
+    req_data = model_example_map[model](question_per_audio_count[audio_count],
+                                        audio_count)
+
+    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    llm = LLM(**engine_args)
+
+    # To maintain code compatibility in this script, we add LoRA here.
+    # You can also add LoRA using:
+    # llm.generate(prompts, lora_request=lora_request,...)
+    if req_data.lora_requests:
+        for lora_request in req_data.lora_requests:
+            llm.llm_engine.add_lora(lora_request=lora_request)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
-                                     stop_token_ids=stop_token_ids)
+                                     stop_token_ids=req_data.stop_token_ids)

    mm_data = {}
    if audio_count > 0:
@ -183,7 +222,7 @@ def main(args):
        }

    assert args.num_prompts > 0
-    inputs = {"prompt": prompt, "multi_modal_data": mm_data}
+    inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
    if args.num_prompts > 1:
        # Batch inference
        inputs = [inputs] * args.num_prompts
@ -214,6 +253,10 @@ if __name__ == "__main__":
                        default=1,
                        choices=[0, 1, 2],
                        help="Number of audio items per prompt.")
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")

    args = parser.parse_args()
    main(args)
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@ -76,5 +76,10 @@ if __name__ == "__main__":
                             GPUs_per_dp_rank))
        proc.start()
        procs.append(proc)
+    exit_code = 0
    for proc in procs:
        proc.join()
+        if proc.exitcode:
+            exit_code = proc.exitcode
+
+    exit(exit_code)
--- a/examples/offline_inference/eagle.py
+++ b/examples/offline_inference/eagle.py
@ -0,0 +1,93 @@
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+import json
+import os
+
+from transformers import AutoTokenizer
+
+from vllm import LLM, SamplingParams
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+    "--dataset",
+    type=str,
+    default="./examples/data/gsm8k.jsonl",
+    help="downloaded from the eagle repo " \
+    "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/"
+)
+parser.add_argument("--max_num_seqs", type=int, default=8)
+parser.add_argument("--num_prompts", type=int, default=80)
+parser.add_argument("--num_spec_tokens", type=int, default=2)
+parser.add_argument("--tp", type=int, default=1)
+parser.add_argument("--draft_tp", type=int, default=1)
+parser.add_argument("--enforce_eager", action='store_true')
+parser.add_argument("--enable_chunked_prefill", action='store_true')
+parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
+parser.add_argument("--temp", type=float, default=0)
+
+args = parser.parse_args()
+
+print(args)
+
+model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
+eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"
+
+max_model_len = 2048
+
+tokenizer = AutoTokenizer.from_pretrained(model_dir)
+
+if os.path.exists(args.dataset):
+    prompts = []
+    num_prompts = args.num_prompts
+    with open(args.dataset) as f:
+        for line in f:
+            data = json.loads(line)
+            prompts.append(data["turns"][0])
+else:
+    prompts = ["The future of AI is", "The president of the United States is"]
+
+prompts = prompts[:args.num_prompts]
+num_prompts = len(prompts)
+
+prompt_ids = [
+    tokenizer.apply_chat_template([{
+        "role": "user",
+        "content": prompt
+    }],
+                                  add_generation_prompt=True)
+    for prompt in prompts
+]
+
+llm = LLM(
+    model=model_dir,
+    trust_remote_code=True,
+    tensor_parallel_size=args.tp,
+    enable_chunked_prefill=args.enable_chunked_prefill,
+    max_num_batched_tokens=args.max_num_batched_tokens,
+    enforce_eager=args.enforce_eager,
+    max_model_len=max_model_len,
+    max_num_seqs=args.max_num_seqs,
+    gpu_memory_utilization=0.8,
+    speculative_model=eagle_dir,
+    num_speculative_tokens=args.num_spec_tokens,
+    speculative_draft_tensor_parallel_size=args.draft_tp,
+    speculative_max_model_len=max_model_len,
+    disable_log_stats=False,
+)
+
+sampling_params = SamplingParams(temperature=args.temp, max_tokens=256)
+
+outputs = llm.generate(prompt_token_ids=prompt_ids,
+                       sampling_params=sampling_params)
+
+# calculate the average number of accepted tokens per forward pass, +1 is
+# to account for the token from the target model that's always going to be
+# accepted
+acceptance_counts = [0] * (args.num_spec_tokens + 1)
+for output in outputs:
+    for step, count in enumerate(output.metrics.spec_token_acceptance_counts):
+        acceptance_counts[step] += count
+
+print(f"mean acceptance length: \
+    {sum(acceptance_counts) / acceptance_counts[0]:.2f}")
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@ -4,16 +4,23 @@ This example shows how to use vLLM for running offline inference with
 the explicit/implicit prompt format on enc-dec LMMs for text generation.
 """
 import time
+from collections.abc import Sequence
+from dataclasses import asdict
+from typing import NamedTuple

-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, PromptType, SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
 from vllm.utils import FlexibleArgumentParser


+class ModelRequestData(NamedTuple):
+    engine_args: EngineArgs
+    prompts: Sequence[PromptType]
+
+
 def run_florence2():
-    # Create a Florence-2 encoder/decoder model instance
-    llm = LLM(
+    engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="facebook/bart-large",
        max_num_seqs=8,
@ -39,12 +46,15 @@ def run_florence2():
            "decoder_prompt": "",
        },
    ]
-    return llm, prompts
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 def run_mllama():
-    # Create a Mllama encoder/decoder model instance
-    llm = LLM(
+    engine_args = EngineArgs(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        max_model_len=4096,
        max_num_seqs=2,
@ -69,12 +79,15 @@ def run_mllama():
            "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",   # noqa: E501
        },
    ]
-    return llm, prompts
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 def run_whisper():
-    # Create a Whisper encoder/decoder model instance
-    llm = LLM(
+    engine_args = EngineArgs(
        model="openai/whisper-large-v3-turbo",
        max_model_len=448,
        max_num_seqs=16,
@ -99,7 +112,11 @@ def run_whisper():
            "decoder_prompt": "<|startoftranscript|>",
        }
    ]
-    return llm, prompts
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 model_example_map = {
@ -114,7 +131,12 @@ def main(args):
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

-    llm, prompts = model_example_map[model]()
+    req_data = model_example_map[model]()
+
+    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    llm = LLM(**engine_args)
+
+    prompts = req_data.prompts

    # Create a sampling params object.
    sampling_params = SamplingParams(
@ -153,6 +175,10 @@ if __name__ == "__main__":
                        default="mllama",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")

    args = parser.parse_args()
    main(args)
--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@ -83,7 +83,6 @@ def initialize_engine(model: str, quantization: str,
        engine_args = EngineArgs(model=model,
                                 quantization=quantization,
                                 qlora_adapter_name_or_path=lora_repo,
-                                 load_format="bitsandbytes",
                                 enable_lora=True,
                                 max_lora_rank=64)
    else:
--- a/examples/offline_inference/mistral-small.py
+++ b/examples/offline_inference/mistral-small.py
@ -6,14 +6,16 @@ import argparse
 from vllm import LLM
 from vllm.sampling_params import SamplingParams

-# This script is an offline demo for running Pixtral.
+# This script is an offline demo for running Mistral-Small-3.1
 #
 # If you want to run a server/client setup, please follow this code:
 #
 # - Server:
 #
 # ```bash
-# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
+# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
+#   --tokenizer-mode mistral --config-format mistral --load-format mistral \
+#   --limit-mm-per-prompt 'image=4' --max-model-len 16384
 # ```
 #
 # - Client:
@ -23,7 +25,7 @@ from vllm.sampling_params import SamplingParams
 # --header 'Content-Type: application/json' \
 # --header 'Authorization: Bearer token' \
 # --data '{
-#     "model": "mistralai/Pixtral-12B-2409",
+#     "model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
 #     "messages": [
 #       {
 #         "role": "user",
@ -43,12 +45,20 @@ from vllm.sampling_params import SamplingParams
 #     python demo.py advanced


-def run_simple_demo():
-    model_name = "mistralai/Pixtral-12B-2409"
+def run_simple_demo(args: argparse.Namespace):
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
    sampling_params = SamplingParams(max_tokens=8192)

-    # Lower max_num_seqs or max_model_len on low-VRAM GPUs.
-    llm = LLM(model=model_name, tokenizer_mode="mistral")
+    # Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
+    llm = LLM(
+        model=model_name,
+        tokenizer_mode="mistral",
+        config_format="mistral",
+        load_format="mistral",
+        max_model_len=4096,
+        max_num_seqs=2,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )

    prompt = "Describe this image in one sentence."
    image_url = "https://picsum.photos/id/237/200/300"
@ -76,8 +86,8 @@ def run_simple_demo():
    print(outputs[0].outputs[0].text)


-def run_advanced_demo():
-    model_name = "mistralai/Pixtral-12B-2409"
+def run_advanced_demo(args: argparse.Namespace):
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
    max_img_per_msg = 5
    max_tokens_per_img = 4096

@ -85,8 +95,11 @@ def run_advanced_demo():
    llm = LLM(
        model=model_name,
        tokenizer_mode="mistral",
+        config_format="mistral",
+        load_format="mistral",
        limit_mm_per_prompt={"image": max_img_per_msg},
        max_model_len=max_img_per_msg * max_tokens_per_img,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompt = "Describe the following image."
@ -153,14 +166,19 @@ def main():
        help="Specify the demo mode: 'simple' or 'advanced'",
    )

+    parser.add_argument(
+        '--disable-mm-preprocessor-cache',
+        action='store_true',
+        help='If True, disables caching of multi-modal preprocessor/mapper.')
+
    args = parser.parse_args()

    if args.mode == "simple":
        print("Running simple demo...")
-        run_simple_demo()
+        run_simple_demo(args)
    elif args.mode == "advanced":
        print("Running advanced demo...")
-        run_advanced_demo()
+        run_advanced_demo(args)


 if __name__ == "__main__":
--- a/examples/offline_inference/reproduciblity.py
+++ b/examples/offline_inference/reproduciblity.py
@ -0,0 +1,36 @@
+# SPDX-License-Identifier: Apache-2.0
+import os
+
+from vllm import LLM, SamplingParams
+
+# vLLM does not guarantee the reproducibility of the results by default,
+# for the sake of performance. You need to do the following to achieve
+# reproducible results:
+# 1. Turn off multiprocessing to make the scheduling deterministic.
+#    NOTE(woosuk): This is not needed and will be ignored for V0.
+os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
+# 2. Fix the global seed for reproducibility. The default seed is None, which is
+# not reproducible.
+SEED = 42
+
+# NOTE(woosuk): Even with the above two settings, vLLM only provides
+# reproducibility when it runs on the same hardware and the same vLLM version.
+# Also, the online serving API (`vllm serve`) does not support reproducibility
+# because it is almost impossible to make the scheduling deterministic in the
+# online serving setting.
+
+llm = LLM(model="facebook/opt-125m", seed=SEED)
+
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+outputs = llm.generate(prompts, sampling_params)
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@ -8,126 +8,167 @@ on HuggingFace model repository.
 """
 import os
 import random
+from dataclasses import asdict
+from typing import NamedTuple, Optional

 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer

-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.lora.request import LoRARequest
 from vllm.utils import FlexibleArgumentParser

+
+class ModelRequestData(NamedTuple):
+    engine_args: EngineArgs
+    prompts: list[str]
+    stop_token_ids: Optional[list[int]] = None
+    lora_requests: Optional[list[LoRARequest]] = None
+
+
 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
 # lower-end GPUs.
 # Unless specified, these settings have been tested to work on a single L4.


 # Aria
-def run_aria(questions: list[str], modality: str):
+def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "rhymes-ai/Aria"

    # NOTE: Need L40 (or equivalent) to avoid OOM
-    llm = LLM(model=model_name,
-              max_model_len=4096,
-              max_num_seqs=2,
-              dtype="bfloat16",
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
+        dtype="bfloat16",
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )

    prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
                "<|im_end|>\n<|im_start|>assistant\n")
               for question in questions]

    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
-    return llm, prompts, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )


 # BLIP-2
-def run_blip2(questions: list[str], modality: str):
+def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
    prompts = [f"Question: {question} Answer:" for question in questions]
-    llm = LLM(model="Salesforce/blip2-opt-2.7b",
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+    engine_args = EngineArgs(
+        model="Salesforce/blip2-opt-2.7b",
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Chameleon
-def run_chameleon(questions: list[str], modality: str):
+def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"{question}<image>" for question in questions]
-    llm = LLM(model="facebook/chameleon-7b",
-              max_model_len=4096,
-              max_num_seqs=2,
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+    engine_args = EngineArgs(
+        model="facebook/chameleon-7b",
+        max_model_len=4096,
+        max_num_seqs=2,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Deepseek-VL2
-def run_deepseek_vl2(questions: list[str], modality: str):
+def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "deepseek-ai/deepseek-vl2-tiny"

-    llm = LLM(model=model_name,
-              max_model_len=4096,
-              max_num_seqs=2,
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
-              hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
+    )

    prompts = [
        f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
        for question in questions
    ]
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Florence2
-def run_florence2(question: str, modality: str):
+def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

-    llm = LLM(model="microsoft/Florence-2-large",
-              tokenizer="facebook/bart-large",
-              max_num_seqs=8,
-              trust_remote_code=True,
-              dtype="bfloat16",
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+    engine_args = EngineArgs(
+        model="microsoft/Florence-2-large",
+        tokenizer="facebook/bart-large",
+        max_num_seqs=8,
+        trust_remote_code=True,
+        dtype="bfloat16",
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )

-    prompt = "<MORE_DETAILED_CAPTION>"
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Fuyu
-def run_fuyu(questions: list[str], modality: str):
+def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"{question}\n" for question in questions]
-    llm = LLM(model="adept/fuyu-8b",
-              max_model_len=2048,
-              max_num_seqs=2,
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+    engine_args = EngineArgs(
+        model="adept/fuyu-8b",
+        max_model_len=2048,
+        max_num_seqs=2,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Gemma 3
-def run_gemma3(questions: list[str], modality: str):
+def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "google/gemma-3-4b-it"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
-        # Default is False; setting it to True is not supported in V1 yet
        mm_processor_kwargs={"do_pan_and_scan": True},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
@ -135,22 +176,27 @@ def run_gemma3(questions: list[str], modality: str):
    prompts = [("<bos><start_of_turn>user\n"
                f"<start_of_image>{question}<end_of_turn>\n"
                "<start_of_turn>model\n") for question in questions]
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # GLM-4v
-def run_glm4v(questions: list[str], modality: str):
+def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "THUDM/glm-4v-9b"

-    llm = LLM(model=model_name,
-              max_model_len=2048,
-              max_num_seqs=2,
-              trust_remote_code=True,
-              enforce_eager=True,
-              hf_overrides={"architectures": ["GLM4VForCausalLM"]},
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=2048,
+        max_num_seqs=2,
+        trust_remote_code=True,
+        enforce_eager=True,
+        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )

    prompts = [
        f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
@ -158,16 +204,21 @@ def run_glm4v(questions: list[str], modality: str):
    ]

    stop_token_ids = [151329, 151336, 151338]
-    return llm, prompts, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )


 # H2OVL-Mississippi
-def run_h2ovl(questions: list[str], modality: str):
+def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "h2oai/h2ovl-mississippi-800m"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
@ -187,15 +238,20 @@ def run_h2ovl(questions: list[str], modality: str):
    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]
-    return llm, prompts, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )


 # Idefics3-8B-Llama3
-def run_idefics3(questions: list[str], modality: str):
+def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
@ -212,17 +268,20 @@ def run_idefics3(questions: list[str], modality: str):
    prompts = [(
        f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
    ) for question in questions]
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # InternVL
-def run_internvl(questions: list[str], modality: str):
+def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "OpenGVLab/InternVL2-2B"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
@ -245,53 +304,75 @@ def run_internvl(questions: list[str], modality: str):
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
-    return llm, prompts, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )


 # LLaVA-1.5
-def run_llava(questions: list[str], modality: str):
+def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [
        f"USER: <image>\n{question}\nASSISTANT:" for question in questions
    ]

-    llm = LLM(model="llava-hf/llava-1.5-7b-hf",
-              max_model_len=4096,
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+    engine_args = EngineArgs(
+        model="llava-hf/llava-1.5-7b-hf",
+        max_model_len=4096,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # LLaVA-1.6/LLaVA-NeXT
-def run_llava_next(questions: list[str], modality: str):
+def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"[INST] <image>\n{question} [/INST]" for question in questions]
-    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
-              max_model_len=8192,
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+    engine_args = EngineArgs(
+        model="llava-hf/llava-v1.6-mistral-7b-hf",
+        max_model_len=8192,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # LlaVA-NeXT-Video
 # Currently only support for video input
-def run_llava_next_video(questions: list[str], modality: str):
+def run_llava_next_video(questions: list[str],
+                         modality: str) -> ModelRequestData:
    assert modality == "video"

    prompts = [
        f"USER: <video>\n{question} ASSISTANT:" for question in questions
    ]
-    llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf",
-              max_model_len=8192,
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+    engine_args = EngineArgs(
+        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
+        max_model_len=8192,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # LLaVA-OneVision
-def run_llava_onevision(questions: list[str], modality: str):
+def run_llava_onevision(questions: list[str],
+                        modality: str) -> ModelRequestData:

    if modality == "video":
        prompts = [
@ -305,15 +386,20 @@ def run_llava_onevision(questions: list[str], modality: str):
        <|im_start|>assistant\n" for question in questions
        ]

-    llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
-              max_model_len=16384,
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+    engine_args = EngineArgs(
+        model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
+        max_model_len=16384,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Mantis
-def run_mantis(questions: list[str], modality: str):
+def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'  # noqa: E501
@ -322,14 +408,19 @@ def run_mantis(questions: list[str], modality: str):
        for question in questions
    ]

-    llm = LLM(
+    engine_args = EngineArgs(
        model="TIGER-Lab/Mantis-8B-siglip-llama3",
        max_model_len=4096,
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    stop_token_ids = [128009]
-    return llm, prompts, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )


 # MiniCPM-V
@ -357,7 +448,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
    # model_name = "openbmb/MiniCPM-o-2_6"
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
@ -389,19 +480,24 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
            tokenize=False,
            add_generation_prompt=True) for question in questions
    ]
-    return llm, prompts, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )


-def run_minicpmo(questions: list[str], modality: str):
+def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData:
    return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-o-2_6")


-def run_minicpmv(questions: list[str], modality: str):
+def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
    return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6")


 # LLama 3.2
-def run_mllama(questions: list[str], modality: str):
+def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
@ -411,7 +507,7 @@ def run_mllama(questions: list[str], modality: str):
    # You may lower either to run this example on lower-end GPUs.

    # The configuration below has been confirmed to launch on a single L40 GPU.
-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=16,
@ -432,17 +528,20 @@ def run_mllama(questions: list[str], modality: str):
    prompts = tokenizer.apply_chat_template(messages,
                                            add_generation_prompt=True,
                                            tokenize=False)
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Molmo
-def run_molmo(questions: list[str], modality: str):
+def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "allenai/Molmo-7B-D-0924"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        dtype="bfloat16",
@ -453,18 +552,21 @@ def run_molmo(questions: list[str], modality: str):
        f"<|im_start|>user <image>\n{question}<|im_end|> \
        <|im_start|>assistant\n" for question in questions
    ]
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # NVLM-D
-def run_nvlm_d(questions: list[str], modality: str):
+def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
@ -481,36 +583,47 @@ def run_nvlm_d(questions: list[str], modality: str):
    prompts = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # PaliGemma
-def run_paligemma(question: str, modality: str):
+def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    # PaliGemma has special prompt format for VQA
-    prompt = ["caption en"]
-    llm = LLM(model="google/paligemma-3b-mix-224",
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    prompts = ["caption en" for _ in questions]
+    engine_args = EngineArgs(
+        model="google/paligemma-3b-mix-224",
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # PaliGemma 2
-def run_paligemma2(question: str, modality: str):
+def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    # PaliGemma 2 has special prompt format for VQA
-    prompt = ["caption en"]
-    llm = LLM(model="google/paligemma2-3b-ft-docci-448",
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    prompts = ["caption en" for _ in questions]
+    engine_args = EngineArgs(
+        model="google/paligemma2-3b-ft-docci-448",
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Phi-3-Vision
-def run_phi3v(questions: list[str], modality: str):
+def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [
@ -530,7 +643,7 @@ def run_phi3v(questions: list[str], modality: str):
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
-    llm = LLM(
+    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
@ -539,12 +652,15 @@ def run_phi3v(questions: list[str], modality: str):
        mm_processor_kwargs={"num_crops": 16},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Phi-4-multimodal-instruct
-def run_phi4mm(questions: list[str], modality: str):
+def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
    """
    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
    show how to process image inputs.
@ -558,33 +674,30 @@ def run_phi4mm(questions: list[str], modality: str):
        f"<|user|><|image_1|>{question}<|end|><|assistant|>"
        for question in questions
    ]
-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        enable_lora=True,
        max_lora_rank=320,
-        lora_extra_vocab_size=0,
    )
-    lora_request = LoRARequest("vision", 1, vision_lora_path)
-    # To maintain code compatibility in this script, we add LoRA here.
-    llm.llm_engine.add_lora(lora_request=lora_request)
-    # You can also add LoRA using:
-    # llm.generate(prompts, lora_request=lora_request,...)

-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
+    )


 # Pixtral HF-format
-def run_pixtral_hf(questions: list[str], modality: str):
+def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "mistral-community/pixtral-12b"

    # NOTE: Need L40 (or equivalent) to avoid OOM
-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
@ -592,15 +705,18 @@ def run_pixtral_hf(questions: list[str], modality: str):
    )

    prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Qwen
-def run_qwen_vl(questions: list[str], modality: str):
+def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

-    llm = LLM(
+    engine_args = EngineArgs(
        model="Qwen/Qwen-VL",
        trust_remote_code=True,
        max_model_len=1024,
@ -610,16 +726,19 @@ def run_qwen_vl(questions: list[str], modality: str):
    )

    prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Qwen2-VL
-def run_qwen2_vl(questions: list[str], modality: str):
+def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
@ -642,16 +761,19 @@ def run_qwen2_vl(questions: list[str], modality: str):
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Qwen2.5-VL
-def run_qwen2_5_vl(questions: list[str], modality: str):
+def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
@ -674,8 +796,11 @@ def run_qwen2_5_vl(questions: list[str], modality: str):
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 model_example_map = {
@ -789,18 +914,28 @@ def main(args):
    data = mm_input["data"]
    questions = mm_input["questions"]

-    llm, prompts, stop_token_ids = model_example_map[model](questions,
-                                                            modality)
+    req_data = model_example_map[model](questions, modality)
+
+    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    llm = LLM(**engine_args)
+
+    # To maintain code compatibility in this script, we add LoRA here.
+    # You can also add LoRA using:
+    # llm.generate(prompts, lora_request=lora_request,...)
+    if req_data.lora_requests:
+        for lora_request in req_data.lora_requests:
+            llm.llm_engine.add_lora(lora_request=lora_request)
+
    # Don't want to check the flag multiple times, so just hijack `prompts`.
-    prompts = prompts if args.use_different_prompt_per_request else [
-        prompts[0]
+    prompts = req_data.prompts if args.use_different_prompt_per_request else [
+        req_data.prompts[0]
    ]

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
-                                     stop_token_ids=stop_token_ids)
+                                     stop_token_ids=req_data.stop_token_ids)

    assert args.num_prompts > 0
    if args.num_prompts == 1:
@ -865,6 +1000,10 @@ if __name__ == "__main__":
                        type=int,
                        default=16,
                        help='Number of frames to extract from the video.')
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")

    parser.add_argument(
        '--image-repeat-prob',
--- a/examples/offline_inference/vision_language_embedding.py
+++ b/examples/offline_inference/vision_language_embedding.py
@ -7,11 +7,12 @@ For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
 from argparse import Namespace
+from dataclasses import asdict
 from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args

 from PIL.Image import Image

-from vllm import LLM
+from vllm import LLM, EngineArgs
 from vllm.multimodal.utils import fetch_image
 from vllm.utils import FlexibleArgumentParser

@ -37,12 +38,12 @@ Query = Union[TextQuery, ImageQuery, TextImageQuery]


 class ModelRequestData(NamedTuple):
-    llm: LLM
+    engine_args: EngineArgs
    prompt: str
    image: Optional[Image]


-def run_e5_v(query: Query):
+def run_e5_v(query: Query) -> ModelRequestData:
    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501

    if query["modality"] == "text":
@ -58,20 +59,20 @@ def run_e5_v(query: Query):
        modality = query['modality']
        raise ValueError(f"Unsupported query modality: '{modality}'")

-    llm = LLM(
+    engine_args = EngineArgs(
        model="royokong/e5-v",
        task="embed",
        max_model_len=4096,
    )

    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
        image=image,
    )


-def run_vlm2vec(query: Query):
+def run_vlm2vec(query: Query) -> ModelRequestData:
    if query["modality"] == "text":
        text = query["text"]
        prompt = f"Find me an everyday image that matches the given caption: {text}"  # noqa: E501
@ -87,7 +88,7 @@ def run_vlm2vec(query: Query):
        modality = query['modality']
        raise ValueError(f"Unsupported query modality: '{modality}'")

-    llm = LLM(
+    engine_args = EngineArgs(
        model="TIGER-Lab/VLM2Vec-Full",
        task="embed",
        trust_remote_code=True,
@ -95,7 +96,7 @@ def run_vlm2vec(query: Query):
    )

    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
        image=image,
    )
@ -126,15 +127,18 @@ def get_query(modality: QueryModality):
    raise ValueError(msg)


-def run_encode(model: str, modality: QueryModality):
+def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
    query = get_query(modality)
    req_data = model_example_map[model](query)

+    engine_args = asdict(req_data.engine_args) | {"seed": seed}
+    llm = LLM(**engine_args)
+
    mm_data = {}
    if req_data.image is not None:
        mm_data["image"] = req_data.image

-    outputs = req_data.llm.embed({
+    outputs = llm.embed({
        "prompt": req_data.prompt,
        "multi_modal_data": mm_data,
    })
@ -144,7 +148,7 @@ def run_encode(model: str, modality: QueryModality):


 def main(args: Namespace):
-    run_encode(args.model_name, args.modality)
+    run_encode(args.model_name, args.modality, args.seed)


 model_example_map = {
@ -167,5 +171,10 @@ if __name__ == "__main__":
                        default="image",
                        choices=get_args(QueryModality),
                        help='Modality of the input.')
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")
+
    args = parser.parse_args()
    main(args)
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@ -6,13 +6,14 @@ using the chat template defined by the model.
 """
 import os
 from argparse import Namespace
+from dataclasses import asdict
 from typing import NamedTuple, Optional

 from huggingface_hub import snapshot_download
 from PIL.Image import Image
 from transformers import AutoProcessor, AutoTokenizer

-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, SamplingParams
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.utils import fetch_image
 from vllm.utils import FlexibleArgumentParser
@ -25,11 +26,12 @@ IMAGE_URLS = [


 class ModelRequestData(NamedTuple):
-    llm: LLM
+    engine_args: EngineArgs
    prompt: str
-    stop_token_ids: Optional[list[int]]
    image_data: list[Image]
-    chat_template: Optional[str]
+    stop_token_ids: Optional[list[int]] = None
+    chat_template: Optional[str] = None
+    lora_requests: Optional[list[LoRARequest]] = None


 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@ -37,58 +39,58 @@ class ModelRequestData(NamedTuple):
 # Unless specified, these settings have been tested to work on a single L4.


-def load_aria(question, image_urls: list[str]) -> ModelRequestData:
+def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "rhymes-ai/Aria"
-    llm = LLM(model=model_name,
-              tokenizer_mode="slow",
-              trust_remote_code=True,
-              dtype="bfloat16",
-              limit_mm_per_prompt={"image": len(image_urls)})
+    engine_args = EngineArgs(
+        model=model_name,
+        tokenizer_mode="slow",
+        trust_remote_code=True,
+        dtype="bfloat16",
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
    placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
    prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n"
              "<|im_start|>assistant\n")
    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]

    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
-        chat_template=None,
    )


-def load_deepseek_vl2(question: str, image_urls: list[str]):
+def load_deepseek_vl2(question: str,
+                      image_urls: list[str]) -> ModelRequestData:
    model_name = "deepseek-ai/deepseek-vl2-tiny"

-    llm = LLM(model=model_name,
-              max_model_len=4096,
-              max_num_seqs=2,
-              hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
-              limit_mm_per_prompt={"image": len(image_urls)})
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
+        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )

    placeholder = "".join(f"image_{i}:<image>\n"
                          for i, _ in enumerate(image_urls, start=1))
    prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"

    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
-        stop_token_ids=None,
        image_data=[fetch_image(url) for url in image_urls],
-        chat_template=None,
    )


-def load_gemma3(question, image_urls: list[str]) -> ModelRequestData:
+def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "google/gemma-3-4b-it"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
-        # Default is False; setting it to True is not supported in V1 yet
-        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={"image": len(image_urls)},
    )

@ -112,18 +114,16 @@ def load_gemma3(question, image_urls: list[str]) -> ModelRequestData:
                                           add_generation_prompt=True)

    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
-        stop_token_ids=None,
        image_data=[fetch_image(url) for url in image_urls],
-        chat_template=None,
    )


 def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "h2oai/h2ovl-mississippi-800m"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
@ -146,19 +146,18 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
    stop_token_ids = [tokenizer.eos_token_id]

    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
-        chat_template=None,
    )


-def load_idefics3(question, image_urls: list[str]) -> ModelRequestData:
+def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

    # The configuration below has been confirmed to launch on a single L40 GPU.
-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
@ -177,18 +176,16 @@ def load_idefics3(question, image_urls: list[str]) -> ModelRequestData:
                             for i, _ in enumerate(image_urls, start=1))
    prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
-        stop_token_ids=None,
        image_data=[fetch_image(url) for url in image_urls],
-        chat_template=None,
    )


 def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "OpenGVLab/InternVL2-2B"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
@ -214,19 +211,18 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
-        chat_template=None,
    )


-def load_mllama(question, image_urls: list[str]) -> ModelRequestData:
+def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # The configuration below has been confirmed to launch on a single L40 GPU.
-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=16,
@ -236,19 +232,17 @@ def load_mllama(question, image_urls: list[str]) -> ModelRequestData:
    placeholders = "<|image|>" * len(image_urls)
    prompt = f"{placeholders}<|begin_of_text|>{question}"
    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
-        stop_token_ids=None,
        image_data=[fetch_image(url) for url in image_urls],
-        chat_template=None,
    )


-def load_nvlm_d(question: str, image_urls: list[str]):
+def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
@ -266,14 +260,11 @@ def load_nvlm_d(question: str, image_urls: list[str]):
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)
-    stop_token_ids = None

    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
-        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
-        chat_template=None,
    )


@ -281,7 +272,7 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "mistral-community/pixtral-12b"

    # Adjust this as necessary to fit in GPU
-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
@ -291,14 +282,11 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:

    placeholders = "[IMG]" * len(image_urls)
    prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"
-    stop_token_ids = None

    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
-        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
-        chat_template=None,
    )


@ -315,7 +303,7 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
-    llm = LLM(
+    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
@ -326,14 +314,11 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
    placeholders = "\n".join(f"<|image_{i}|>"
                             for i, _ in enumerate(image_urls, start=1))
    prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
-    stop_token_ids = None

    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
-        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
-        chat_template=None,
    )


@ -347,7 +332,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(model_path, "vision-lora")
-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
        max_model_len=10000,
@ -355,32 +340,24 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
        limit_mm_per_prompt={"image": len(image_urls)},
        enable_lora=True,
        max_lora_rank=320,
-        lora_extra_vocab_size=0,
    )
-    lora_request = LoRARequest("vision", 1, vision_lora_path)
-    # To maintain code compatibility in this script, we add LoRA here.
-    llm.llm_engine.add_lora(lora_request=lora_request)
-    # You can also add LoRA using:
-    # llm.generate(prompts, lora_request=lora_request,...)

    placeholders = "".join(f"<|image_{i}|>"
                           for i, _ in enumerate(image_urls, start=1))
    prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
-    stop_token_ids = None

    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
-        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
-        chat_template=None,
+        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
    )


 def load_qwen_vl_chat(question: str,
                      image_urls: list[str]) -> ModelRequestData:
    model_name = "Qwen/Qwen-VL-Chat"
-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=1024,
@ -411,7 +388,7 @@ def load_qwen_vl_chat(question: str,
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
@ -419,7 +396,7 @@ def load_qwen_vl_chat(question: str,
    )


-def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
+def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    try:
        from qwen_vl_utils import process_vision_info
    except ModuleNotFoundError:
@ -431,7 +408,7 @@ def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    # Tested on L40
-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768 if process_vision_info is None else 4096,
        max_num_seqs=5,
@ -460,23 +437,19 @@ def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
                                           tokenize=False,
                                           add_generation_prompt=True)

-    stop_token_ids = None
-
    if process_vision_info is None:
        image_data = [fetch_image(url) for url in image_urls]
    else:
        image_data, _ = process_vision_info(messages)

    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
-        stop_token_ids=stop_token_ids,
        image_data=image_data,
-        chat_template=None,
    )


-def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
+def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    try:
        from qwen_vl_utils import process_vision_info
    except ModuleNotFoundError:
@ -487,7 +460,7 @@ def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768 if process_vision_info is None else 4096,
        max_num_seqs=5,
@ -516,8 +489,6 @@ def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
                                           tokenize=False,
                                           add_generation_prompt=True)

-    stop_token_ids = None
-
    if process_vision_info is None:
        image_data = [fetch_image(url) for url in image_urls]
    else:
@ -525,11 +496,9 @@ def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
                                            return_video_kwargs=False)

    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
-        stop_token_ids=stop_token_ids,
        image_data=image_data,
-        chat_template=None,
    )


@ -551,14 +520,25 @@ model_example_map = {
 }


-def run_generate(model, question: str, image_urls: list[str]):
+def run_generate(model, question: str, image_urls: list[str],
+                 seed: Optional[int]):
    req_data = model_example_map[model](question, image_urls)

+    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    llm = LLM(**engine_args)
+
+    # To maintain code compatibility in this script, we add LoRA here.
+    # You can also add LoRA using:
+    # llm.generate(prompts, lora_request=lora_request,...)
+    if req_data.lora_requests:
+        for lora_request in req_data.lora_requests:
+            llm.llm_engine.add_lora(lora_request=lora_request)
+
    sampling_params = SamplingParams(temperature=0.0,
                                     max_tokens=128,
                                     stop_token_ids=req_data.stop_token_ids)

-    outputs = req_data.llm.generate(
+    outputs = llm.generate(
        {
            "prompt": req_data.prompt,
            "multi_modal_data": {
@ -572,13 +552,24 @@ def run_generate(model, question: str, image_urls: list[str]):
        print(generated_text)


-def run_chat(model: str, question: str, image_urls: list[str]):
+def run_chat(model: str, question: str, image_urls: list[str],
+             seed: Optional[int]):
    req_data = model_example_map[model](question, image_urls)

+    engine_args = asdict(req_data.engine_args) | {"seed": seed}
+    llm = LLM(**engine_args)
+
+    # To maintain code compatibility in this script, we add LoRA here.
+    # You can also add LoRA using:
+    # llm.generate(prompts, lora_request=lora_request,...)
+    if req_data.lora_requests:
+        for lora_request in req_data.lora_requests:
+            llm.llm_engine.add_lora(lora_request=lora_request)
+
    sampling_params = SamplingParams(temperature=0.0,
                                     max_tokens=128,
                                     stop_token_ids=req_data.stop_token_ids)
-    outputs = req_data.llm.chat(
+    outputs = llm.chat(
        [{
            "role":
            "user",
@ -607,11 +598,12 @@ def run_chat(model: str, question: str, image_urls: list[str]):
 def main(args: Namespace):
    model = args.model_type
    method = args.method
+    seed = args.seed

    if method == "generate":
-        run_generate(model, QUESTION, IMAGE_URLS)
+        run_generate(model, QUESTION, IMAGE_URLS, seed)
    elif method == "chat":
-        run_chat(model, QUESTION, IMAGE_URLS)
+        run_chat(model, QUESTION, IMAGE_URLS, seed)
    else:
        raise ValueError(f"Invalid method: {method}")

@ -632,6 +624,10 @@ if __name__ == "__main__":
                        default="generate",
                        choices=["generate", "chat"],
                        help="The method to run in `vllm.LLM`.")
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")

    args = parser.parse_args()
    main(args)
--- a/examples/online_serving/api_client.py
+++ b/examples/online_serving/api_client.py
@ -42,7 +42,7 @@ def post_http_request(prompt: str,
 def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
    for chunk in response.iter_lines(chunk_size=8192,
                                     decode_unicode=False,
-                                     delimiter=b"\0"):
+                                     delimiter=b"\n"):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            output = data["text"]
--- a/examples/online_serving/disaggregated_prefill.sh
+++ b/examples/online_serving/disaggregated_prefill.sh
@ -8,6 +8,9 @@ set -xe
 echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
 sleep 1

+# meta-llama/Meta-Llama-3.1-8B-Instruct or deepseek-ai/DeepSeek-V2-Lite
+MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct}
+
 # Trap the SIGINT signal (triggered by Ctrl+C)
 trap 'cleanup' INT

@ -44,18 +47,20 @@ wait_for_server() {
 # You can also adjust --kv-ip and --kv-port for distributed inference.

 # prefilling instance, which is the KV producer
-CUDA_VISIBLE_DEVICES=0 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \
+CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
    --port 8100 \
    --max-model-len 100 \
    --gpu-memory-utilization 0.8 \
+    --trust-remote-code \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &

 # decoding instance, which is the KV consumer
-CUDA_VISIBLE_DEVICES=1 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \
+CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
    --port 8200 \
    --max-model-len 100 \
    --gpu-memory-utilization 0.8 \
+    --trust-remote-code \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &

@ -78,7 +83,7 @@ sleep 1
 output1=$(curl -X POST -s http://localhost:8000/v1/completions \
 -H "Content-Type: application/json" \
 -d '{
-"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+"model": "'"$MODEL_NAME"'",
 "prompt": "San Francisco is a",
 "max_tokens": 10,
 "temperature": 0
@ -87,7 +92,7 @@ output1=$(curl -X POST -s http://localhost:8000/v1/completions \
 output2=$(curl -X POST -s http://localhost:8000/v1/completions \
 -H "Content-Type: application/json" \
 -d '{
-"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+"model": "'"$MODEL_NAME"'",
 "prompt": "Santa Clara is a",
 "max_tokens": 10,
 "temperature": 0
--- a/examples/online_serving/disaggregated_prefill_zmq.sh
+++ b/examples/online_serving/disaggregated_prefill_zmq.sh
@ -0,0 +1,123 @@
+#!/bin/bash
+# This file demonstrates the example usage of disaggregated prefilling with ZMQ
+# We will launch 2 vllm instances (1 for prefill and 1 for decode),
+# and then transfer the KV cache between them.
+
+set -xe
+
+echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
+sleep 1
+
+# Trap the SIGINT signal (triggered by Ctrl+C)
+trap 'cleanup' INT
+
+# Cleanup function
+cleanup() {
+    echo "Caught Ctrl+C, cleaning up..."
+    # Cleanup commands
+    pgrep python | xargs kill -9
+    pkill -f python
+    echo "Cleanup complete. Exiting."
+    exit 0
+}
+
+export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+
+# a function that waits vLLM connect to start
+wait_for_server() {
+  local port=$1
+  timeout 1200 bash -c "
+    until curl -s localhost:${port}/v1/completions > /dev/null; do
+      sleep 1
+    done" && return 0 || return 1
+}
+
+
+# a function that waits vLLM disagg to start
+wait_for_disagg_server() {
+  local log_file=$1
+  timeout 1200 bash -c "
+    until grep -q 'PDWorker is ready' $log_file; do
+      sleep 1
+    done" && return 0 || return 1
+}
+
+
+# You can also adjust --kv-ip and --kv-port for distributed inference.
+MODEL=meta-llama/Llama-3.1-8B-Instruct
+CONTROLLER_ADDR=controller.ipc
+PREFILL_WORKER_ADDR=prefill.ipc
+DECODE_WORKER_ADDR=decode.ipc
+PORT=8001
+
+# prefilling instance, which is the KV producer
+CUDA_VISIBLE_DEVICES=0 python3 ../../vllm/entrypoints/disaggregated/worker.py \
+    --model $MODEL \
+    --controller-addr $CONTROLLER_ADDR \
+    --worker-addr $PREFILL_WORKER_ADDR \
+    --max-model-len 100 \
+    --gpu-memory-utilization 0.8 \
+    --kv-transfer-config \
+    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' > vllm_disagg_prefill.log 2>&1 &
+
+# decoding instance, which is the KV consumer
+CUDA_VISIBLE_DEVICES=1 python3 ../../vllm/entrypoints/disaggregated/worker.py \
+    --model $MODEL \
+    --controller-addr $CONTROLLER_ADDR \
+    --worker-addr $DECODE_WORKER_ADDR \
+    --max-model-len 100 \
+    --gpu-memory-utilization 0.8 \
+    --kv-transfer-config \
+    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' > vllm_disagg_decode.log 2>&1 &
+
+# launch a proxy server that opens the service at port 8000
+# the workflow of this proxy:
+# - Send req to prefill instance, wait until complete.
+# - Send req to decode instance, streaming tokens.
+python3 ../../vllm/entrypoints/disaggregated/api_server.py \
+    --port $PORT \
+    --model $MODEL \
+    --controller-addr $CONTROLLER_ADDR \
+    --prefill-addr $PREFILL_WORKER_ADDR \
+    --decode-addr $DECODE_WORKER_ADDR &
+
+# wait until prefill, decode instances and proxy are ready
+wait_for_server $PORT
+wait_for_disagg_server vllm_disagg_prefill.log
+wait_for_disagg_server vllm_disagg_decode.log 
+
+# serve two example requests
+output1=$(curl -X POST -s http://localhost:8001/v1/completions \
+-H "Content-Type: application/json" \
+-d '{
+"model": "meta-llama/Llama-3.1-8B-Instruct",
+"prompt": "San Francisco is a",
+"max_tokens": 10,
+"temperature": 0
+}')
+
+output2=$(curl -X POST -s http://localhost:8001/v1/completions \
+-H "Content-Type: application/json" \
+-d '{
+"model": "meta-llama/Llama-3.1-8B-Instruct",
+"prompt": "Santa Clara is a",
+"max_tokens": 10,
+"temperature": 0
+}')
+
+
+# Cleanup commands
+pgrep python | xargs kill -9
+pkill -f python
+
+echo ""
+
+sleep 1
+
+# Print the outputs of the curl requests
+echo ""
+echo "Output of first request: $output1"
+echo "Output of second request: $output2"
+
+echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉"
+echo ""
--- a/examples/online_serving/gradio_webserver.py
+++ b/examples/online_serving/gradio_webserver.py
@ -21,7 +21,7 @@ def http_bot(prompt):

    for chunk in response.iter_lines(chunk_size=8192,
                                     decode_unicode=False,
-                                     delimiter=b"\0"):
+                                     delimiter=b"\n"):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            output = data["text"][0]
--- a/examples/other/logging_configuration.md
+++ b/examples/other/logging_configuration.md
@ -127,7 +127,7 @@ configuration for the root vLLM logger and for the logger you wish to silence:
    "vllm": {
      "handlers": ["vllm"],
      "level": "DEBUG",
-      "propagage": false
+      "propagate": false
    },
    "vllm.example_noisy_logger": {
      "propagate": false
--- a/examples/template_teleflm.jinja
+++ b/examples/template_teleflm.jinja
@ -0,0 +1,12 @@
+{%- for message in messages %}
+    {%- if message['role'] == 'user' %}
+        {{- '<_user>' + message['content']|trim }}
+    {%- elif message['role'] == 'system' %}
+        {{- '<_system>' + message['content']|trim }}
+    {%- elif message['role'] == 'assistant' %}
+        {{- '<_bot>' + message['content'] }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<_bot>' }}
+{%- endif %}
--- a/pyproject.toml
+++ b/pyproject.toml
@ -6,7 +6,7 @@ requires = [
    "packaging",
    "setuptools>=61",
    "setuptools-scm>=8.0",
-    "torch == 2.5.1",
+    "torch == 2.6.0",
    "wheel",
    "jinja2",
 ]
--- a/requirements/build.txt
+++ b/requirements/build.txt
@ -4,6 +4,6 @@ ninja
 packaging
 setuptools>=61
 setuptools-scm>=8
-torch==2.5.1
+torch==2.6.0
 wheel
-jinja2
+jinja2>=3.1.6
--- a/requirements/common.txt
+++ b/requirements/common.txt
@ -1,3 +1,4 @@
+cachetools
 psutil
 sentencepiece  # Required for LLaMA tokenizer.
 numpy < 2.0.0
@ -17,9 +18,10 @@ pillow  # Required for image processing
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.11, < 0.11
+llguidance >= 0.7.2, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
 outlines == 0.1.11
 lark == 1.2.2
-xgrammar == 0.1.15; platform_machine == "x86_64" or platform_machine == "aarch64"
+xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
@ -27,7 +29,7 @@ pyzmq
 msgspec
 gguf == 0.10.0
 importlib_metadata
-mistral_common[opencv] >= 1.5.0
+mistral_common[opencv] >= 1.5.4
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
@ -38,3 +40,4 @@ cloudpickle # allows pickling lambda functions in model_executor/models/registry
 watchfiles # required for http server to monitor the updates of TLS files
 python-json-logger # Used by logging as per examples/other/logging_configuration.md
 scipy # Required for phi-4-multimodal-instruct
+ninja # Required for xgrammar, rocm, tpu, xpu
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@ -3,7 +3,8 @@

 # Dependencies for CPUs
 torch==2.6.0+cpu; platform_machine == "x86_64"
-torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
+torch==2.6.0; platform_system == "Darwin"
+torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64"
 torch==2.7.0.dev20250304; platform_machine == "s390x"

 # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@ -4,9 +4,9 @@
 numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding

 # Dependencies for NVIDIA GPUs
-ray[cgraph] >= 2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch == 2.5.1
-torchaudio==2.5.1
+ray[cgraph]>=2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
+torch==2.6.0
+torchaudio==2.6.0
 # These must be updated alongside torch
-torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.5.1
+torchvision==0.21.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+xformers==0.0.29.post2; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.6.0
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@ -9,12 +9,13 @@ msgspec
 cloudpickle

 # packages to install to build the documentation
+cachetools
 pydantic >= 2.8
 -f https://download.pytorch.org/whl/cpu
 torch
 py-cpuinfo
 transformers
-mistral_common >= 1.5.0
+mistral_common >= 1.5.4
 aiohttp
 starlette
 openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
--- a/requirements/openvino.txt
+++ b/requirements/openvino.txt
@ -1,8 +0,0 @@
-# Common dependencies
-r common.txt
-
-torch == 2.5.1 #  should be aligned with "common" vLLM torch version
-openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
-
-optimum @ git+https://github.com/huggingface/optimum.git # latest optimum is used to support latest transformers version
-optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git # latest optimum-intel is used to support latest transformers version
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@ -1,16 +1,15 @@
 # Common dependencies
 -r common.txt

--extra-index-url https://download.pytorch.org/whl/rocm6.2
-torch==2.5.1
-torchvision==0.20.1
-torchaudio==2.5.1
+--extra-index-url https://download.pytorch.org/whl/rocm6.2.4
+torch==2.6.0
+torchvision==0.21.0
+torchaudio==2.6.0

 cmake>=3.26
-ninja
 packaging
 setuptools>=61
 setuptools-scm>=8
 wheel
-jinja2
+jinja2>=3.1.6
 amdsmi==6.2.4
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@ -0,0 +1,23 @@
+
+# entrypoints test
+# librosa==0.10.2.post1 # required by audio tests in entrypoints/openai
+audioread==3.0.1
+cffi==1.17.1
+decorator==5.2.1
+lazy-loader==0.4
+platformdirs==4.3.6
+pooch==1.8.2
+#pycparse==2.22
+soundfile==0.13.1
+soxr==0.5.0.post1
+librosa==0.10.2.post1
+
+# entrypoints test
+#vllm[video] # required by entrypoints/openai/test_video.py
+decord==0.6.0
+
+# entrypoints test
+#sentence-transformers # required by entrypoints/openai/test_score.py
+sentence-transformers==3.4.1
+
+
--- a/Show More
+++ b/Show More