Compare commits


1 commit

Author SHA1 Message Date
90eb28ca21 [V1][Scheduler] Use dict for running queue
This is just a random idea; it still needs to be benchmarked.

Potential advantages for large batch sizes:
- Don't need to copy entire list every iteration
- O(1) removal of aborted requests

Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-13 16:11:07 -04:00
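A minimal sketch of the idea in the commit message (hypothetical names; the real V1 scheduler and Request types carry more state than shown here):

```python
class Request:
    """Placeholder for the V1 request object; only the ID matters here."""

    def __init__(self, request_id: str) -> None:
        self.request_id = request_id


class RunningQueue:
    """Running queue backed by an insertion-ordered dict instead of a list."""

    def __init__(self) -> None:
        # Python dicts preserve insertion order, so iteration still visits
        # requests in the order they started running, like the old list.
        self._running: dict[str, Request] = {}

    def add(self, request: Request) -> None:
        self._running[request.request_id] = request

    def abort(self, request_id: str) -> None:
        # O(1) removal of an aborted request; a list needs an O(n) scan
        # plus a copy/shift of the remaining elements.
        self._running.pop(request_id, None)

    def __iter__(self):
        return iter(self._running.values())
```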
552 changed files with 10120 additions and 34056 deletions


@@ -4,8 +4,8 @@ tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.231
+    value: 0.233
   - name: "exact_match,flexible-extract"
-    value: 0.22
+    value: 0.236
   limit: 1000
   num_fewshot: 5


@@ -13,7 +13,6 @@ from pathlib import Path
 import lm_eval
 import numpy
-import pytest
 import yaml
 
 RTOL = 0.05
@@ -47,10 +46,6 @@ def test_lm_eval_correctness():
     eval_config = yaml.safe_load(
         Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
 
-    if eval_config[
-            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
-        pytest.skip("FBGEMM is currently failing on main.")
-
     # Launch eval requests.
     results = launch_lm_eval(eval_config)


@@ -57,6 +57,8 @@ steps:
     agents:
       queue: tpu_queue_postmerge
     commands:
+      - "rm -f /var/log/syslog"
+      - "rm -f /var/log/kern.log"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
       - "docker push vllm/vllm-tpu:nightly"
       - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"


@@ -101,30 +101,16 @@ if [[ $commands == *" kernels "* ]]; then
     --ignore=kernels/test_permute_cols.py"
 fi
 
-#ignore certain Entrypoints/openai tests
+#ignore certain Entrypoints tests
 if [[ $commands == *" entrypoints/openai "* ]]; then
   commands=${commands//" entrypoints/openai "/" entrypoints/openai \
-  --ignore=entrypoints/openai/test_accuracy.py \
   --ignore=entrypoints/openai/test_audio.py \
-  --ignore=entrypoints/openai/test_chat.py \
-  --ignore=entrypoints/openai/test_shutdown.py \
-  --ignore=entrypoints/openai/test_completion.py \
-  --ignore=entrypoints/openai/test_sleep.py \
-  --ignore=entrypoints/openai/test_models.py \
-  --ignore=entrypoints/openai/test_prompt_validation.py "}
+  --ignore=entrypoints/openai/test_encoder_decoder.py \
+  --ignore=entrypoints/openai/test_embedding.py \
+  --ignore=entrypoints/openai/test_oot_registration.py "}
 fi
 
-#ignore certain Entrypoints/llm tests
-if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
-  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
-fi
-
-# --ignore=entrypoints/openai/test_encoder_decoder.py \
-# --ignore=entrypoints/openai/test_embedding.py \
-# --ignore=entrypoints/openai/test_oot_registration.py
-# --ignore=entrypoints/openai/test_accuracy.py \
-# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
-
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then


@@ -44,11 +44,11 @@ remove_docker_container() {
 trap remove_docker_container EXIT
 
 # Run the image
-docker run --rm -it --device=/dev/neuron0 --network bridge \
+docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
        -v "${HF_CACHE}:${HF_MOUNT}" \
        -e "HF_HOME=${HF_MOUNT}" \
        -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
        -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
        --name "${container_name}" \
        ${image_name} \
-       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"

.buildkite/run-tpu-test.sh (new executable file, 25 lines)

@ -0,0 +1,25 @@
#!/bin/bash
set -e
# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"


@ -1,36 +0,0 @@
#!/bin/bash
set -e
# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& echo TEST_1 \
&& VLLM_USE_V1=1 python3 /workspace/vllm/tests/tpu/test_compilation.py \
&& echo TEST_2 \
&& VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
&& echo TEST_3 \
&& VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
&& echo TEST_4 \
&& VLLM_USE_V1=1 pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
&& echo TEST_5 \
&& VLLM_USE_V1=1 python3 /workspace/vllm/examples/offline_inference/tpu.py" \
# TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \


@@ -4,28 +4,16 @@
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 
-image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
-container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
 # Try building the docker image
-docker build -t ${image_name} -f Dockerfile.xpu .
+docker build -t xpu-test -f Dockerfile.xpu .
 
 # Setup cleanup
-remove_docker_container() {
-  docker rm -f "${container_name}" || true;
-  docker image rm -f "${image_name}" || true;
-  docker system prune -f || true;
-}
+remove_docker_container() { docker rm -f xpu-test || true; }
 trap remove_docker_container EXIT
+remove_docker_container
 
 # Run the image and test offline inference/tensor parallel
-docker run \
-  --device /dev/dri \
-  -v /dev/dri/by-path:/dev/dri/by-path \
-  --entrypoint="" \
-  --name "${container_name}" \
-  "${image_name}" \
-  sh -c '
-  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
-  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
+docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
+  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '


@@ -41,6 +41,7 @@ steps:
   - grep \"sig sig-object py\" build/html/api/inference_params.html
 
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
+  fast_check: true
   source_file_dependencies:
   - vllm/
   - tests/mq_llm_engine
@@ -117,14 +118,15 @@ steps:
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
-  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
   - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
   - pytest -v -s entrypoints/test_chat_utils.py
-  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
 - label: Distributed Tests (4 GPUs) # 10min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
+  fast_check: true
   source_file_dependencies:
   - vllm/distributed/
   - vllm/core/
@@ -136,7 +138,7 @@ steps:
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
   commands:
-  - python3 ../examples/offline_inference/data_parallel.py
+  - VLLM_USE_V1=1 python3 ../examples/offline_inference/data_parallel.py
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
@@ -150,6 +152,7 @@ steps:
 - label: Metrics, Tracing Test # 10min
   num_gpus: 2
+  fast_check: true
   source_file_dependencies:
   - vllm/
   - tests/metrics
@@ -197,19 +200,16 @@ steps:
   - tests/v1
   commands:
   # split the test to avoid interference
-  - pytest -v -s v1/core
-  - pytest -v -s v1/entrypoints
-  - pytest -v -s v1/engine
-  - pytest -v -s v1/entrypoints
-  - pytest -v -s v1/sample
-  - pytest -v -s v1/worker
-  - pytest -v -s v1/structured_output
-  - pytest -v -s v1/test_stats.py
-  - pytest -v -s v1/test_utils.py
-  - pytest -v -s v1/test_oracle.py
+  - VLLM_USE_V1=1 pytest -v -s v1/core
+  - VLLM_USE_V1=1 pytest -v -s v1/engine
+  - VLLM_USE_V1=1 pytest -v -s v1/sample
+  - VLLM_USE_V1=1 pytest -v -s v1/worker
+  - VLLM_USE_V1=1 pytest -v -s v1/structured_output
+  - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
+  - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
   # TODO: accuracy does not match, whether setting
   # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-  - pytest -v -s v1/e2e
+  - VLLM_USE_V1=1 pytest -v -s v1/e2e
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -227,17 +227,14 @@ steps:
   - python3 offline_inference/basic/chat.py
   - python3 offline_inference/prefix_caching.py
   - python3 offline_inference/llm_engine_example.py
-  - python3 offline_inference/audio_language.py --seed 0
-  - python3 offline_inference/vision_language.py --seed 0
-  - python3 offline_inference/vision_language_embedding.py --seed 0
-  - python3 offline_inference/vision_language_multi_image.py --seed 0
-  - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+  - python3 offline_inference/vision_language.py
+  - python3 offline_inference/vision_language_multi_image.py
+  - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/encoder_decoder.py
-  - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
   - python3 offline_inference/basic/classify.py
   - python3 offline_inference/basic/embed.py
   - python3 offline_inference/basic/score.py
-  - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
+  - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Prefix Caching Test # 9min
   mirror_hardwares: [amd]
@@ -287,6 +284,7 @@ steps:
   parallelism: 4
 
 - label: PyTorch Fullgraph Smoke Test # 9min
+  fast_check: true
   source_file_dependencies:
   - vllm/
   - tests/compile
@@ -381,8 +379,7 @@ steps:
   commands:
   - pytest -v -s models/test_transformers.py
   - pytest -v -s models/test_registry.py
-  # V1 Test: https://github.com/vllm-project/vllm/issues/14531
-  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
+  - pytest -v -s models/test_initialization.py
 
 - label: Language Models Test (Standard) # 32min
   #mirror_hardwares: [amd]
@@ -525,12 +522,13 @@ steps:
   # this test fails consistently.
   # TODO: investigate and fix
   # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
 
 - label: Plugin Tests (2 GPUs) # 40min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
+  fast_check: true
   source_file_dependencies:
   - vllm/plugins/
   - tests/plugins/


@@ -53,7 +53,7 @@ repos:
         entry: tools/mypy.sh 0 "local"
         language: python
         types: [python]
-        additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
+        additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
         stages: [pre-commit] # Don't run in CI
   - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.9


@@ -46,8 +46,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
 # requirements.txt files and should be kept consistent. The ROCm torch
 # versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
 
 #
 # Try to find python package with an executable that exactly matches
@@ -319,7 +319,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Only build AllSpark kernels if we are building for at least some compatible archs.
   cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
-  if (ALLSPARK_ARCHS)
+  if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND ALLSPARK_ARCHS)
     set(ALLSPARK_SRCS
       "csrc/quantization/gptq_allspark/allspark_repack.cu"
       "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
@@ -330,7 +330,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
   else()
     message(STATUS "Not building AllSpark kernels as no compatible archs found"
-                   " in CUDA target architectures")
+                   " in CUDA target architectures, or CUDA not >= 12.0")
   endif()


@@ -222,7 +222,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 RUN --mount=type=cache,target=/root/.cache/uv \
     if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-        uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
+        uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \
     fi
 COPY examples examples


@@ -61,7 +61,6 @@ RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     cd /install \
     && pip install -U -r requirements/rocm.txt \
-    && pip install -U -r requirements/rocm-test.txt \
     && pip uninstall -y vllm \
     && pip install *.whl


@@ -1,7 +1,11 @@
-# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually.
-FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base
+FROM intel/deep-learning-essentials:2025.0.1-0-devel-ubuntu22.04 AS vllm-base
 
-RUN rm /etc/apt/sources.list.d/intel-graphics.list
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+    chmod 644 /usr/share/keyrings/intel-graphics.gpg
 
 RUN apt-get update -y && \
     apt-get install -y --no-install-recommends --fix-missing \
@@ -17,6 +21,8 @@ RUN apt-get update -y && \
     python3 \
     python3-dev \
     python3-pip \
+    libze-intel-gpu-dev \
+    libze-intel-gpu1 \
     wget
 
 WORKDIR /workspace/vllm


@@ -13,10 +13,18 @@ Easy, fast, and cheap LLM serving for everyone
 | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>
 
+---
+
+We're excited to invite you to the first **vLLM China Meetup** on **March 16** in **Beijing**!
+
+Join us to connect with the **vLLM team** and explore how vLLM is leveraged in **post-training, fine-tuning, and deployment**, including [verl](https://github.com/volcengine/verl), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), and [vllm-ascend](https://github.com/vllm-project/vllm-ascend).
+
+👉 **[Register Now](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)** to be part of the discussion!
+
+---
+
 *Latest News* 🔥
-- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
-- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
 - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).


@@ -43,26 +43,20 @@ become available.
     <tr>
       <td><strong>HuggingFace</strong></td>
       <td style="text-align: center;"></td>
-      <td style="text-align: center;">🟡</td>
+      <td style="text-align: center;">🚧</td>
       <td>Specify your dataset path on HuggingFace</td>
     </tr>
     <tr>
       <td><strong>VisionArena</strong></td>
       <td style="text-align: center;"></td>
-      <td style="text-align: center;"></td>
+      <td style="text-align: center;">🚧</td>
       <td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
     </tr>
   </tbody>
 </table>
 
 ✅: supported
 🚧: to be supported
-🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
-similar to `lmms-lab/LLaVA-OneVision-Data`. If you need support for other dataset
-formats, please consider contributing.
 
 **Note**: VisionArena's `dataset-name` should be set to `hf`
 
 ---
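For context, the 🟡 partial-support note above refers to ShareGPT/LLaVA-OneVision-style records that carry a `conversations` column. A hypothetical record of that shape (field values invented purely for illustration) might look like:

```python
# Hypothetical record in the lmms-lab/LLaVA-OneVision-Data style referenced
# above; the "image" field is optional and every value here is made up.
example_record = {
    "image": "<PIL.Image.Image or image file>",
    "conversations": [
        {"from": "human", "value": "What is shown in this image?"},
        {"from": "gpt", "value": "A city skyline at sunset."},
    ],
}
```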
@ -82,10 +76,10 @@ Then run the benchmarking script
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10 NUM_PROMPTS=10
BACKEND="vllm" BACKEND="openai-chat"
DATASET_NAME="sharegpt" DATASET_NAME="sharegpt"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json" DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS} python3 benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/chat/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
``` ```
If successful, you will see the following output If successful, you will see the following output
@ -129,7 +123,7 @@ DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1" DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT='train' DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \ python3 benchmarks/benchmark_serving.py \
--backend "${BACKEND}" \ --backend "${BACKEND}" \
--model "${MODEL_NAME}" \ --model "${MODEL_NAME}" \
--endpoint "/v1/chat/completions" \ --endpoint "/v1/chat/completions" \
@ -146,65 +140,35 @@ python3 vllm/benchmarks/benchmark_serving.py \
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10 NUM_PROMPTS=10
DATASET_NAME="sonnet" DATASET_NAME="sonnet"
DATASET_PATH="vllm/benchmarks/sonnet.txt" DATASET_PATH="benchmarks/sonnet.txt"
python3 vllm/benchmarks/benchmark_throughput.py \ python3 benchmarks/benchmark_throughput.py \
--model "${MODEL_NAME}" \ --model "${MODEL_NAME}" \
--dataset-name "${DATASET_NAME}" \ --dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \ --dataset-path "${DATASET_PATH}" \
--num-prompts "${NUM_PROMPTS}" --num-prompts "${NUM_PROMPTS}"
``` ```
If successful, you will see the following output If successful, you will see the following output
``` ```
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s Throughput: 7.35 requests/s, 4789.20 total tokens/s, 1102.83 output tokens/s
Total num prompt tokens: 5014
Total num output tokens: 1500
```
### VisionArena Benchmark for Vision Language Models
``` bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT="train"
python3 vllm/benchmarks/benchmark_throughput.py \
--model "${MODEL_NAME}" \
--backend "vllm-chat" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--num-prompts "${NUM_PROMPTS}" \
--hf-split "${DATASET_SPLIT}"
```
The `num prompt tokens` now includes image token counts
```
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
Total num prompt tokens: 14527
Total num output tokens: 1280
``` ```
### Benchmark with LoRA Adapters ### Benchmark with LoRA Adapters
``` bash ``` bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="meta-llama/Llama-2-7b-hf" MODEL_NAME="meta-llama/Llama-2-7b-hf"
BACKEND="vllm" BACKEND="vllm"
DATASET_NAME="sharegpt" DATASET_NAME="sharegpt"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json" DATASET_PATH="/home/jovyan/data/vllm_benchmark_datasets/ShareGPT_V3_unfiltered_cleaned_split.json"
NUM_PROMPTS=10 NUM_PROMPTS=10
MAX_LORAS=2 MAX_LORAS=2
MAX_LORA_RANK=8 MAX_LORA_RANK=8
ENABLE_LORA="--enable-lora" ENABLE_LORA="--enable-lora"
LORA_PATH="yard1/llama-2-7b-sql-lora-test" LORA_PATH="yard1/llama-2-7b-sql-lora-test"
python3 vllm/benchmarks/benchmark_throughput.py \ python3 benchmarks/benchmark_throughput.py \
--model "${MODEL_NAME}" \ --model "${MODEL_NAME}" \
--backend "${BACKEND}" \ --backend "${BACKEND}" \
--dataset_path "${DATASET_PATH}" \ --dataset_path "${DATASET_PATH}" \


@@ -14,8 +14,7 @@ from tqdm.asyncio import tqdm
 from transformers import (AutoTokenizer, PreTrainedTokenizer,
                           PreTrainedTokenizerFast)
 
-# NOTE(simon): do not import vLLM here so the benchmark script
-# can run without vLLM installed.
+from vllm.model_executor.model_loader.weight_utils import get_lock
 
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@@ -334,7 +333,7 @@ async def async_request_openai_chat_completions(
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(
-        ("chat/completions", "profile")
+        "chat/completions"
     ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 
     async with aiohttp.ClientSession(trust_env=True,
@@ -428,8 +427,6 @@ def get_model(pretrained_model_name_or_path: str) -> str:
     if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
         from modelscope import snapshot_download
 
-        from vllm.model_executor.model_loader.weight_utils import get_lock
-
         # Use file lock to prevent multiple processes from
         # downloading the same model weights at the same time.
         with get_lock(pretrained_model_name_or_path):


@ -46,7 +46,7 @@ class SampleRequest:
Represents a single inference request for benchmarking. Represents a single inference request for benchmarking.
""" """
prompt: Union[str, Any] prompt: str
prompt_len: int prompt_len: int
expected_output_len: int expected_output_len: int
multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
@ -84,20 +84,6 @@ class BenchmarkDataset(ABC):
if random_seed is not None else self.DEFAULT_SEED) if random_seed is not None else self.DEFAULT_SEED)
self.data = None self.data = None
def apply_multimodal_chat_transformation(
self,
prompt: str,
mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
"""
Transform a prompt and optional multimodal content into a chat format.
This method is used for chat models that expect a specific
conversation format.
"""
content = [{"text": prompt, "type": "text"}]
if mm_content is not None:
content.append(mm_content)
return [{"role": "user", "content": content}]
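For reference, a standalone usage sketch of the `apply_multimodal_chat_transformation` helper shown in the hunk above (the function body mirrors the diff; the image payload shape is an assumed OpenAI-style entry, not something defined in this diff):

```python
from typing import Optional

def apply_multimodal_chat_transformation(
        prompt: str, mm_content: Optional[dict] = None) -> list[dict]:
    # Mirrors the helper above: wrap the prompt (plus optional multimodal
    # content) into a single-user-turn chat message list.
    content = [{"text": prompt, "type": "text"}]
    if mm_content is not None:
        content.append(mm_content)
    return [{"role": "user", "content": content}]

# Assumed OpenAI-style image payload; invented here for illustration.
image_part = {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}}
messages = apply_multimodal_chat_transformation("What is in this image?", image_part)
# -> [{"role": "user", "content": [{"text": "...", "type": "text"}, image_part]}]
```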
def load_data(self) -> None: def load_data(self) -> None:
""" """
Load data from the dataset path into self.data. Load data from the dataset path into self.data.
@ -352,7 +338,6 @@ class ShareGPTDataset(BenchmarkDataset):
lora_path: Optional[str] = None, lora_path: Optional[str] = None,
max_loras: Optional[int] = None, max_loras: Optional[int] = None,
output_len: Optional[int] = None, output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list: **kwargs) -> list:
samples: list = [] samples: list = []
for entry in self.data: for entry in self.data:
@ -373,9 +358,6 @@ class ShareGPTDataset(BenchmarkDataset):
skip_min_output_len_check=output_len skip_min_output_len_check=output_len
is not None): is not None):
continue continue
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(
prompt, None)
samples.append( samples.append(
SampleRequest( SampleRequest(
prompt=prompt, prompt=prompt,
@ -568,13 +550,10 @@ class HuggingFaceDataset(BenchmarkDataset):
split=self.dataset_split, split=self.dataset_split,
streaming=True, streaming=True,
) )
if self.data.features is None or "conversations" \
not in self.data.features: if "conversations" not in self.data.features:
raise ValueError( raise ValueError("HF Dataset must have a 'conversations' column.")
"HuggingFaceDataset currently only supports datasets with "
"a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
"Please consider contributing if you would like to add "
"support for additional dataset formats.")
# Shuffle and filter examples with at least 2 conversations. # Shuffle and filter examples with at least 2 conversations.
self.data = self.data.shuffle(seed=self.random_seed).filter( self.data = self.data.shuffle(seed=self.random_seed).filter(
lambda x: len(x["conversations"]) >= 2) lambda x: len(x["conversations"]) >= 2)
@ -582,8 +561,9 @@ class HuggingFaceDataset(BenchmarkDataset):
def sample(self, def sample(self,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
num_requests: int, num_requests: int,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None, output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list: **kwargs) -> list:
sampled_requests = [] sampled_requests = []
dynamic_output = output_len is None dynamic_output = output_len is None
@ -591,9 +571,13 @@ class HuggingFaceDataset(BenchmarkDataset):
for item in self.data: for item in self.data:
if len(sampled_requests) >= num_requests: if len(sampled_requests) >= num_requests:
break break
conv = item["conversations"] conv = item["conversations"]
prompt, completion = conv[0]["value"], conv[1]["value"] prompt, completion = conv[0]["value"], conv[1]["value"]
lora_request, tokenizer = self.get_random_lora_request(
tokenizer, lora_path=lora_path, max_loras=max_loras)
prompt_ids = tokenizer(prompt).input_ids prompt_ids = tokenizer(prompt).input_ids
completion_ids = tokenizer(completion).input_ids completion_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_ids) prompt_len = len(prompt_ids)
@ -603,20 +587,16 @@ class HuggingFaceDataset(BenchmarkDataset):
if dynamic_output and not is_valid_sequence( if dynamic_output and not is_valid_sequence(
prompt_len, completion_len): prompt_len, completion_len):
continue continue
mm_content = process_image( mm_content = process_image(
item["image"]) if "image" in item else None item["image"]) if "image" in item else None
if enable_multimodal_chat:
# Note: when chat is enabled the request prompt_len is no longer
# accurate and we will be using request output to count the
# actual prompt len and output len
prompt = self.apply_multimodal_chat_transformation(
prompt, mm_content)
sampled_requests.append( sampled_requests.append(
SampleRequest( SampleRequest(
prompt=prompt, prompt=prompt,
prompt_len=prompt_len, prompt_len=prompt_len,
expected_output_len=output_len, expected_output_len=output_len,
multi_modal_data=mm_content, multi_modal_data=mm_content,
lora_request=lora_request,
)) ))
return sampled_requests return sampled_requests
@ -626,7 +606,7 @@ class HuggingFaceDataset(BenchmarkDataset):
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
class VisionArenaDataset(HuggingFaceDataset): class VisionArenaDataset(BenchmarkDataset):
""" """
Vision Arena Dataset. Vision Arena Dataset.
""" """
@ -637,9 +617,14 @@ class VisionArenaDataset(HuggingFaceDataset):
def __init__( def __init__(
self, self,
dataset_split: str,
dataset_subset: Optional[str] = None,
**kwargs, **kwargs,
) -> None: ) -> None:
super().__init__(**kwargs) super().__init__(**kwargs)
self.dataset_split = dataset_split
self.dataset_subset = dataset_subset
if self.dataset_path != self.VISION_ARENA_DATASET_PATH: if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
raise ValueError(f"Only support Vision Arena dataset.\ raise ValueError(f"Only support Vision Arena dataset.\
This data path {self.dataset_path} is not valid.") This data path {self.dataset_path} is not valid.")
@ -660,9 +645,9 @@ class VisionArenaDataset(HuggingFaceDataset):
def sample(self, def sample(self,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
num_requests: int, num_requests: int,
output_len: Optional[int] = None, output_len: int = DEFAULT_OUTPUT_LEN,
enable_multimodal_chat: bool = False,
**kwargs) -> list: **kwargs) -> list:
# TODO (jenniferzhao): Add support for offline benchmark sampling
output_len = (output_len output_len = (output_len
if output_len is not None else self.DEFAULT_OUTPUT_LEN) if output_len is not None else self.DEFAULT_OUTPUT_LEN)
sampled_requests = [] sampled_requests = []
@ -670,14 +655,8 @@ class VisionArenaDataset(HuggingFaceDataset):
if len(sampled_requests) >= num_requests: if len(sampled_requests) >= num_requests:
break break
prompt = item["turns"][0][0]["content"] prompt = item["turns"][0][0]["content"]
mm_content = process_image(item["images"][0])
prompt_len = len(tokenizer(prompt).input_ids) prompt_len = len(tokenizer(prompt).input_ids)
if enable_multimodal_chat: mm_content = process_image(item["images"][0])
# Note: when chat is enabled the request prompt_len is no longer
# accurate and we will be using request output to count the
# actual prompt len
prompt = self.apply_multimodal_chat_transformation(
prompt, mm_content)
sampled_requests.append( sampled_requests.append(
SampleRequest( SampleRequest(
prompt=prompt, prompt=prompt,


@@ -684,15 +684,6 @@ def main(args: argparse.Namespace):
                 "Invalid metadata format. Please use KEY=VALUE format."
             )
 
-    if not args.save_detailed:
-        # Remove fields with too many data points
-        for field in [
-                "input_lens", "output_lens", "ttfts", "itls",
-                "generated_texts", "errors"
-        ]:
-            if field in result_json:
-                del result_json[field]
-
     # Traffic
     result_json["request_rate"] = (args.request_rate if args.request_rate
                                    < float("inf") else "inf")
@@ -837,12 +828,6 @@ if __name__ == "__main__":
         action="store_true",
         help="Specify to save benchmark results to a json file",
     )
-    parser.add_argument(
-        "--save-detailed",
-        action="store_true",
-        help="When saving the results, whether to include per request "
-        "information such as response, error, ttfs, tpots, etc.",
-    )
     parser.add_argument(
         "--metadata",
         metavar="KEY=VALUE",


@@ -732,11 +732,8 @@ def main(args: argparse.Namespace):
         api_url = f"http://{args.host}:{args.port}{args.endpoint}"
         base_url = f"http://{args.host}:{args.port}"
 
-    tokenizer = get_tokenizer(
-        tokenizer_id,
-        trust_remote_code=args.trust_remote_code,
-        tokenizer_mode=args.tokenizer_mode,
-    )
+    tokenizer = get_tokenizer(tokenizer_id,
+                              trust_remote_code=args.trust_remote_code)
 
     if args.dataset == 'grammar':
         args.structure_type = 'guided_grammar'
@@ -879,13 +876,6 @@ if __name__ == "__main__":
         help=
         "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
     )
-    parser.add_argument(
-        "--tokenizer-mode",
-        type=str,
-        default="auto",
-        help=
-        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
-    )
     parser.add_argument(
         "--num-prompts",
         type=int,


@ -11,9 +11,8 @@ from typing import Any, Optional, Union
import torch import torch
import uvloop import uvloop
from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset, from benchmark_dataset import (BurstGPTDataset, RandomDataset, SampleRequest,
RandomDataset, SampleRequest, ShareGPTDataset, ShareGPTDataset, SonnetDataset)
SonnetDataset, VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from tqdm import tqdm from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer, from transformers import (AutoModelForCausalLM, AutoTokenizer,
@ -24,7 +23,6 @@ from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args) build_async_engine_client_from_engine_args)
from vllm.inputs import TextPrompt, TokensPrompt from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser, merge_async_iterators from vllm.utils import FlexibleArgumentParser, merge_async_iterators
@ -34,7 +32,7 @@ def run_vllm(
n: int, n: int,
engine_args: EngineArgs, engine_args: EngineArgs,
disable_detokenize: bool = False, disable_detokenize: bool = False,
) -> tuple[float, Optional[list[RequestOutput]]]: ) -> float:
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args)) llm = LLM(**dataclasses.asdict(engine_args))
assert all( assert all(
@ -68,13 +66,12 @@ def run_vllm(
use_beam_search = False use_beam_search = False
outputs = None
if not use_beam_search: if not use_beam_search:
start = time.perf_counter() start = time.perf_counter()
outputs = llm.generate(prompts, llm.generate(prompts,
sampling_params, sampling_params,
lora_request=lora_requests, lora_request=lora_requests,
use_tqdm=True) use_tqdm=True)
end = time.perf_counter() end = time.perf_counter()
else: else:
assert lora_requests is None, "BeamSearch API does not support LoRA" assert lora_requests is None, "BeamSearch API does not support LoRA"
@ -92,46 +89,7 @@ def run_vllm(
ignore_eos=True, ignore_eos=True,
)) ))
end = time.perf_counter() end = time.perf_counter()
return end - start, outputs return end - start
def run_vllm_chat(
requests: list[SampleRequest],
n: int,
engine_args: EngineArgs,
disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
"""
Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
multimodal models as it properly handles multimodal inputs and chat
formatting. For non-multimodal models, use run_vllm() instead.
"""
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
assert all(
llm.llm_engine.model_config.max_model_len >= (
request.prompt_len + request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of "
"prompt_len and expected_output_len for all requests.")
prompts = []
sampling_params: list[SamplingParams] = []
for request in requests:
prompts.append(request.prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
start = time.perf_counter()
outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
return end - start, outputs
async def run_vllm_async( async def run_vllm_async(
@ -306,8 +264,6 @@ def get_requests(args, tokenizer):
dataset_cls = RandomDataset dataset_cls = RandomDataset
elif args.dataset_name == "sharegpt": elif args.dataset_name == "sharegpt":
dataset_cls = ShareGPTDataset dataset_cls = ShareGPTDataset
if args.backend == "vllm-chat":
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_name == "sonnet": elif args.dataset_name == "sonnet":
assert tokenizer.chat_template or tokenizer.default_chat_template, ( assert tokenizer.chat_template or tokenizer.default_chat_template, (
"Tokenizer/model must have chat template for sonnet dataset.") "Tokenizer/model must have chat template for sonnet dataset.")
@ -316,19 +272,6 @@ def get_requests(args, tokenizer):
sample_kwargs["return_prompt_formatted"] = True sample_kwargs["return_prompt_formatted"] = True
elif args.dataset_name == "burstgpt": elif args.dataset_name == "burstgpt":
dataset_cls = BurstGPTDataset dataset_cls = BurstGPTDataset
elif args.dataset_name == "hf":
if args.backend != "vllm-chat":
raise ValueError(
"hf datasets only are supported by vllm-chat backend")
# Choose between VisionArenaDataset and HuggingFaceDataset based on
# provided parameters.
dataset_cls = (VisionArenaDataset if args.dataset_path
== VisionArenaDataset.VISION_ARENA_DATASET_PATH
and args.hf_subset is None else HuggingFaceDataset)
common_kwargs['dataset_subset'] = args.hf_subset
common_kwargs['dataset_split'] = args.hf_split
sample_kwargs["enable_multimodal_chat"] = True
else: else:
raise ValueError(f"Unknown dataset name: {args.dataset_name}") raise ValueError(f"Unknown dataset name: {args.dataset_name}")
# Remove None values # Remove None values
@ -347,7 +290,6 @@ def main(args: argparse.Namespace):
requests = get_requests(args, tokenizer) requests = get_requests(args, tokenizer)
is_multi_modal = any(request.multi_modal_data is not None is_multi_modal = any(request.multi_modal_data is not None
for request in requests) for request in requests)
request_outputs: Optional[list[RequestOutput]] = None
if args.backend == "vllm": if args.backend == "vllm":
if args.async_engine: if args.async_engine:
elapsed_time = uvloop.run( elapsed_time = uvloop.run(
@ -359,9 +301,9 @@ def main(args: argparse.Namespace):
args.disable_detokenize, args.disable_detokenize,
)) ))
else: else:
elapsed_time, request_outputs = run_vllm( elapsed_time = run_vllm(requests, args.n,
requests, args.n, EngineArgs.from_cli_args(args), EngineArgs.from_cli_args(args),
args.disable_detokenize) args.disable_detokenize)
elif args.backend == "hf": elif args.backend == "hf":
assert args.tensor_parallel_size == 1 assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n, elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@ -370,45 +312,20 @@ def main(args: argparse.Namespace):
elif args.backend == "mii": elif args.backend == "mii":
elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
args.output_len) args.output_len)
elif args.backend == "vllm-chat":
elapsed_time, request_outputs = run_vllm_chat(
requests, args.n, EngineArgs.from_cli_args(args),
args.disable_detokenize)
else: else:
raise ValueError(f"Unknown backend: {args.backend}") raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(request.prompt_len + request.expected_output_len
if request_outputs: for request in requests)
# Note: with the vllm and vllm-chat backends, total_output_tokens = sum(request.expected_output_len
# we have request_outputs, which we use to count tokens. for request in requests)
total_prompt_tokens = 0 if is_multi_modal:
total_output_tokens = 0 print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
for ro in request_outputs:
if not isinstance(ro, RequestOutput):
continue
total_prompt_tokens += len(
ro.prompt_token_ids) if ro.prompt_token_ids else 0
total_output_tokens += sum(
len(o.token_ids) for o in ro.outputs if o)
total_num_tokens = total_prompt_tokens + total_output_tokens
else:
total_num_tokens = sum(r.prompt_len + r.expected_output_len
for r in requests)
total_output_tokens = sum(r.expected_output_len for r in requests)
total_prompt_tokens = total_num_tokens - total_output_tokens
if is_multi_modal and args.backend != "vllm-chat":
print("\033[91mWARNING\033[0m: Multi-modal request with "
f"{args.backend} backend detected. The "
"following metrics are not accurate because image tokens are not" "following metrics are not accurate because image tokens are not"
" counted. See vllm-project/vllm/issues/9778 for details.") " counted. See vllm-project/vllm/issues/9778 for details.")
# TODO(vllm-project/vllm/issues/9778): Count multi-modal token length. # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
# vllm-chat backend counts the image tokens now
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
f"{total_output_tokens / elapsed_time:.2f} output tokens/s") f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
print(f"Total num prompt tokens: {total_prompt_tokens}")
print(f"Total num output tokens: {total_output_tokens}")
# Output JSON results if specified # Output JSON results if specified
if args.output_json: if args.output_json:
@ -424,100 +341,17 @@ def main(args: argparse.Namespace):
save_to_pytorch_benchmark_format(args, results) save_to_pytorch_benchmark_format(args, results)
def validate_args(args):
"""
Validate command-line arguments.
"""
# === Deprecation and Defaulting ===
if args.dataset is not None:
warnings.warn(
"The '--dataset' argument will be deprecated in the next release. "
"Please use '--dataset-name' and '--dataset-path' instead.",
stacklevel=2)
args.dataset_path = args.dataset
if not getattr(args, "tokenizer", None):
args.tokenizer = args.model
# === Backend Validation ===
valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
if args.backend not in valid_backends:
raise ValueError(f"Unsupported backend: {args.backend}")
# === Dataset Configuration ===
if not args.dataset and not args.dataset_path:
print(
"When dataset path is not set, it will default to random dataset")
args.dataset_name = 'random'
if args.input_len is None:
raise ValueError("input_len must be provided for a random dataset")
# === Dataset Name Specific Checks ===
# --hf-subset and --hf-split: only used
# when dataset_name is 'hf'
if args.dataset_name != "hf" and (
getattr(args, "hf_subset", None) is not None
or getattr(args, "hf_split", None) is not None):
warnings.warn("--hf-subset and --hf-split will be ignored \
since --dataset-name is not 'hf'.",
stacklevel=2)
elif args.dataset_name == "hf" and args.backend != "vllm-chat":
raise ValueError(
"When --dataset-name is 'hf', backend must be 'vllm-chat'")
# --random-range-ratio: only used when dataset_name is 'random'
if args.dataset_name != 'random' and args.random_range_ratio is not None:
warnings.warn("--random-range-ratio will be ignored since \
--dataset-name is not 'random'.",
stacklevel=2)
# --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
# set.
if args.dataset_name not in {"random", "sonnet", None
} and args.prefix_len is not None:
warnings.warn("--prefix-len will be ignored since --dataset-name\
is not 'random', 'sonnet', or not set.",
stacklevel=2)
# === LoRA Settings ===
if getattr(args, "enable_lora", False) and args.backend != "vllm":
raise ValueError(
"LoRA benchmarking is only supported for vLLM backend")
if getattr(args, "enable_lora", False) and args.lora_path is None:
raise ValueError("LoRA path must be provided when enable_lora is True")
# === Backend-specific Validations ===
if args.backend == "hf" and args.hf_max_batch_size is None:
raise ValueError("HF max batch size is required for HF backend")
if args.backend != "hf" and args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
if args.backend in {"hf", "mii"} and getattr(args, "quantization",
None) is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.backend == "mii" and args.dtype != "auto":
raise ValueError("dtype must be auto for MII backend.")
if args.backend == "mii" and args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.backend == "mii" and args.tokenizer != args.model:
raise ValueError(
"Tokenizer must be the same as the model for MII backend.")
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend", parser.add_argument("--backend",
type=str, type=str,
choices=["vllm", "hf", "mii", "vllm-chat"], choices=["vllm", "hf", "mii"],
default="vllm") default="vllm")
parser.add_argument( parser.add_argument("--dataset-name",
"--dataset-name", type=str,
type=str, choices=["sharegpt", "random", "sonnet", "burstgpt"],
choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"], help="Name of the dataset to benchmark on.",
help="Name of the dataset to benchmark on.", default="sharegpt")
default="sharegpt")
parser.add_argument( parser.add_argument(
"--dataset", "--dataset",
type=str, type=str,
@ -585,24 +419,55 @@ if __name__ == "__main__":
parser.add_argument( parser.add_argument(
"--random-range-ratio", "--random-range-ratio",
type=float, type=float,
default=None, default=1.0,
help="Range of sampled ratio of input/output length, " help="Range of sampled ratio of input/output length, "
"used only for RandomDataSet.", "used only for RandomDataSet.",
) )
# hf dtaset
parser.add_argument("--hf-subset",
type=str,
default=None,
help="Subset of the HF dataset.")
parser.add_argument("--hf-split",
type=str,
default=None,
help="Split of the HF dataset.")
parser = AsyncEngineArgs.add_cli_args(parser) parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args() args = parser.parse_args()
if args.tokenizer is None: if args.tokenizer is None:
args.tokenizer = args.model args.tokenizer = args.model
validate_args(args) if args.dataset is not None:
warnings.warn(
"The '--dataset' argument will be deprecated in the next "
"release. Please use '--dataset-name' and "
"'--dataset-path' in the future runs.",
stacklevel=2)
args.dataset_path = args.dataset
if args.dataset is None and args.dataset_path is None:
# for random dataset, the default sampling setting is in
# benchmark_dataset.RandomDataset
print("When dataset is not set, it will default to random dataset")
else:
assert args.input_len is None
if args.enable_lora:
assert args.lora_path is not None
if args.backend == "vllm":
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
elif args.backend == "hf":
if args.hf_max_batch_size is None:
raise ValueError("HF max batch size is required for HF backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.enable_lora is not None:
raise ValueError("LoRA benchmarking is only supported for vLLM"
" backend")
elif args.backend == "mii":
if args.dtype != "auto":
raise ValueError("dtype must be auto for MII backend.")
if args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
if args.tokenizer != args.model:
raise ValueError("Tokenizer must be the same as the model for MII "
"backend.")
if args.enable_lora is not None:
raise ValueError("LoRA benchmarking is only supported for vLLM"
" backend")
main(args) main(args)
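The most surprising rule introduced by the new `validate_args` path above is that HF datasets are only wired up for the `vllm-chat` backend. The following standalone sketch is purely illustrative (the `check_hf_dataset` helper and the `Namespace` values are invented for the example and are not part of the benchmark script):

```python
# Standalone illustration of the new dataset-name/backend constraint,
# applied to a hypothetical parsed-args object.
from argparse import Namespace


def check_hf_dataset(args: Namespace) -> None:
    # The HF dataset path is only supported by the vllm-chat backend.
    if args.dataset_name == "hf" and args.backend != "vllm-chat":
        raise ValueError(
            "When --dataset-name is 'hf', backend must be 'vllm-chat'")


check_hf_dataset(Namespace(dataset_name="hf", backend="vllm-chat"))  # accepted
try:
    check_hf_dataset(Namespace(dataset_name="hf", backend="vllm"))   # rejected
except ValueError as err:
    print(err)
```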

View File

@ -17,8 +17,13 @@ from torch.utils.benchmark import Measurement as TMeasurement
from utils import ArgPool, Bench, CudaGraphBenchParams from utils import ArgPool, Bench, CudaGraphBenchParams
from weight_shapes import WEIGHT_SHAPES from weight_shapes import WEIGHT_SHAPES
from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand
from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice
from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink
from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand
from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.lora.ops.triton_ops.v1 import V1KernelMeta, v1_expand, v1_shrink
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
@ -162,25 +167,69 @@ class OpType(Enum):
""" """
LoRA Ops to benchmark and its properties. LoRA Ops to benchmark and its properties.
""" """
LORA_SHRINK = auto() SGMV_SHRINK = auto()
LORA_EXPAND = auto() BGMV_SHRINK = auto()
SGMV_EXPAND = auto()
BGMV_EXPAND = auto()
BGMV_EXPAND_SLICE = auto()
V1_SHRINK = auto()
V1_EXPAND = auto()
@staticmethod @staticmethod
def from_str(s: str) -> "OpType": def from_str(s: str) -> "OpType":
if s.lower() == "lora_shrink": if s.lower() == 'sgmv_shrink':
return OpType.LORA_SHRINK return OpType.SGMV_SHRINK
if s.lower() == "lora_expand": if s.lower() == 'sgmv_expand':
return OpType.LORA_EXPAND return OpType.SGMV_EXPAND
if s.lower() == 'bgmv_shrink':
return OpType.BGMV_SHRINK
if s.lower() == 'bgmv_expand':
return OpType.BGMV_EXPAND
if s.lower() == "bgmv_expand_slice":
return OpType.BGMV_EXPAND_SLICE
if s.lower() == "v1_shrink":
return OpType.V1_SHRINK
if s.lower() == "v1_expand":
return OpType.V1_EXPAND
raise ValueError(f"Unrecognized str {s} to convert to OpType") raise ValueError(f"Unrecognized str {s} to convert to OpType")
def is_shrink_fn(self) -> bool: def is_shrink_fn(self) -> bool:
return self in [OpType.LORA_SHRINK] return self in [
OpType.SGMV_SHRINK, OpType.BGMV_SHRINK, OpType.V1_SHRINK
]
def is_expand_fn(self) -> bool: def is_expand_fn(self) -> bool:
return self in [OpType.LORA_EXPAND] return self in [
OpType.SGMV_EXPAND, OpType.BGMV_EXPAND, OpType.V1_EXPAND
]
def is_prefill_op(self) -> bool:
return self in [
OpType.SGMV_SHRINK, OpType.SGMV_EXPAND, OpType.V1_SHRINK,
OpType.V1_EXPAND
]
def is_decode_op(self) -> bool:
return self in [
OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE,
OpType.V1_SHRINK, OpType.V1_EXPAND
]
def is_expand_slice_fn(self) -> bool:
return self in [OpType.BGMV_EXPAND_SLICE]
def num_slices(self) -> list[int]: def num_slices(self) -> list[int]:
return [1, 2, 3] if self in [
OpType.SGMV_EXPAND, OpType.SGMV_SHRINK, OpType.V1_SHRINK,
OpType.V1_EXPAND
]:
# SGMV kernels and v1 kernels supports slices
return [1, 2, 3]
if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]:
return [1]
if self in [OpType.BGMV_EXPAND_SLICE]:
return [2, 3]
raise ValueError(f"Unrecognized OpType {self}")
def mkn(self, batch_size: int, seq_length: int, hidden_size: int, def mkn(self, batch_size: int, seq_length: int, hidden_size: int,
lora_rank: int) -> tuple[int, int, int]: lora_rank: int) -> tuple[int, int, int]:
@ -190,7 +239,7 @@ class OpType(Enum):
k = hidden_size k = hidden_size
n = lora_rank n = lora_rank
else: else:
assert self.is_expand_fn() assert self.is_expand_fn() or self.is_expand_slice_fn()
m = num_tokens m = num_tokens
k = lora_rank k = lora_rank
n = hidden_size n = hidden_size
@ -205,7 +254,7 @@ class OpType(Enum):
if self.is_shrink_fn(): if self.is_shrink_fn():
return op_dtype, op_dtype, torch.float32 return op_dtype, op_dtype, torch.float32
else: else:
assert self.is_expand_fn() assert self.is_expand_fn() or self.is_expand_slice_fn()
return torch.float32, op_dtype, op_dtype return torch.float32, op_dtype, op_dtype
def matmul_shapes( def matmul_shapes(
@ -219,19 +268,43 @@ class OpType(Enum):
m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank) m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank)
b_shape = (num_loras, n, k) # col-major b_shape = (num_loras, n, k) # col-major
if self in [OpType.LORA_SHRINK]: if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
# LoRA shrink kernels support num_slices inherently in the kernel. # SGMV shrink and V1 shrink kernels support num_slices inherently
# in the kernel.
return ((m, k), b_shape, (num_slices, m, n)) return ((m, k), b_shape, (num_slices, m, n))
if self in [OpType.LORA_EXPAND]: if self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
# LoRA expand kernels support num_slices inherently in the kernel # SGMV expand and V1 expand kernels support num_slices inherently
# in the kernel
return ((num_slices, m, k), b_shape, (m, n * num_slices)) return ((num_slices, m, k), b_shape, (m, n * num_slices))
if self == OpType.BGMV_SHRINK:
return ((m, k), b_shape, (m, n))
if self == OpType.BGMV_EXPAND:
return ((m, k), b_shape, (m, n))
if self == OpType.BGMV_EXPAND_SLICE:
return ((num_slices, m, k), b_shape, (m, n * num_slices))
raise ValueError(f"Unrecognized op_type {self}") raise ValueError(f"Unrecognized op_type {self}")
def bench_fn(self) -> Callable: def bench_fn(self) -> Callable:
if self == OpType.LORA_SHRINK:
return lora_shrink def emulate_bgmv_expand_slice(kwargs_list: list[dict[str, Any]]):
if self == OpType.LORA_EXPAND: for x in kwargs_list:
return lora_expand bgmv_expand_slice(**x)
if self == OpType.SGMV_SHRINK:
return sgmv_shrink
if self == OpType.SGMV_EXPAND:
return sgmv_expand
if self == OpType.BGMV_SHRINK:
return bgmv_shrink
if self == OpType.BGMV_EXPAND:
return bgmv_expand
if self == OpType.BGMV_EXPAND_SLICE:
return emulate_bgmv_expand_slice
if self == OpType.V1_SHRINK:
return v1_shrink
if self == OpType.V1_EXPAND:
return v1_expand
raise ValueError(f"Unrecognized optype {self}") raise ValueError(f"Unrecognized optype {self}")
@ -245,13 +318,34 @@ class OpType(Enum):
""" """
w_dtype = lora_weights[0].dtype w_dtype = lora_weights[0].dtype
num_slices = len(lora_weights) num_slices = len(lora_weights)
if self in [OpType.LORA_SHRINK]: if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
for slice_idx in range(num_slices): for slice_idx in range(num_slices):
ref_group_gemm(ref_out=output[slice_idx, :], ref_group_gemm(ref_out=output[slice_idx, :],
input=input, input=input,
lora_weights=lora_weights[slice_idx], lora_weights=lora_weights[slice_idx],
**kwargs) **kwargs)
elif self in [OpType.LORA_EXPAND]: elif self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
hidden_size = lora_weights[0].shape[1]
for slice_idx in range(num_slices):
slice_offset = slice_idx * hidden_size
ref_group_gemm(
ref_out=output[:, slice_offset:slice_offset + hidden_size],
input=input[slice_idx].clone().to(dtype=w_dtype),
lora_weights=lora_weights[slice_idx],
**kwargs)
elif self == OpType.BGMV_SHRINK:
assert num_slices == 1
ref_group_gemm(ref_out=output,
input=input,
lora_weights=lora_weights[0],
**kwargs)
elif self == OpType.BGMV_EXPAND:
assert num_slices == 1
ref_group_gemm(ref_out=output,
input=input.clone().to(dtype=w_dtype),
lora_weights=lora_weights[0],
**kwargs)
elif self == OpType.BGMV_EXPAND_SLICE:
hidden_size = lora_weights[0].shape[1] hidden_size = lora_weights[0].shape[1]
for slice_idx in range(num_slices): for slice_idx in range(num_slices):
slice_offset = slice_idx * hidden_size slice_offset = slice_idx * hidden_size
@ -317,11 +411,13 @@ class BenchmarkTensors:
input: torch.Tensor input: torch.Tensor
lora_weights_lst: list[torch.Tensor] lora_weights_lst: list[torch.Tensor]
output: torch.Tensor output: torch.Tensor
# LoRA kernel metadata # metadata tensors
lora_kernel_meta: LoRAKernelMeta
# Metadata tensors used in testing correctness
seq_lens: torch.Tensor seq_lens: torch.Tensor
seq_start_loc: torch.Tensor
prompt_lora_mapping: torch.Tensor prompt_lora_mapping: torch.Tensor
token_lora_mapping: torch.Tensor
# v1 kernel metadata
v1_kernel_meta: Optional[V1KernelMeta] = None
def io_types(self) -> str: def io_types(self) -> str:
return (f"{dtype_to_str(self.input.dtype)}x" return (f"{dtype_to_str(self.input.dtype)}x"
@ -348,29 +444,35 @@ class BenchmarkTensors:
assert ctx.num_active_loras <= ctx.num_loras assert ctx.num_active_loras <= ctx.num_loras
total_tokens = ctx.batch_size * ctx.seq_length total_tokens = ctx.batch_size * ctx.seq_length
# Make metadata tensors involved in correctness testing.
# Prepare seq lens tensor # Prepare seq lens tensor
seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1, seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1,
(ctx.batch_size, )) (ctx.batch_size, ))
# Prepare seq_start_loc tensor
seq_start_loc_tensor = torch.cumsum(torch.tensor(
[0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
dim=0)
assert total_tokens == seq_len_tensor.sum() assert total_tokens == seq_len_tensor.sum()
# Prepare prompt lora indices tensor # Prepare prompt lora indices tensor
prompt_lora_indices_tensor = make_prompt_lora_mapping( prompt_lora_indices_tensor = make_prompt_lora_mapping(
ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu") ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu")
# Prepare token lora indices tensor
# Make LoRAKernelMeta
token_lora_indices_tensor = make_token_lora_mapping( token_lora_indices_tensor = make_token_lora_mapping(
total_tokens, ctx.batch_size, prompt_lora_indices_tensor, total_tokens, ctx.batch_size, prompt_lora_indices_tensor,
seq_len_tensor, "cpu") seq_len_tensor, "cpu")
lora_kernel_meta = LoRAKernelMeta.make(
max_loras=ctx.num_loras, v1_kernel_meta = None
max_num_tokens=token_lora_indices_tensor.size(0), if op_type in [OpType.V1_SHRINK, OpType.V1_EXPAND]:
device="cpu") v1_kernel_meta = V1KernelMeta.make(
lora_kernel_meta.prepare_tensors( max_loras=ctx.num_loras,
token_lora_mapping=token_lora_indices_tensor) max_num_tokens=token_lora_indices_tensor.size(0),
device="cpu")
v1_kernel_meta.prepare_tensors(
token_lora_mapping=token_lora_indices_tensor)
return BenchmarkTensors(input_tensor, lora_weights, output_tensor, return BenchmarkTensors(input_tensor, lora_weights, output_tensor,
lora_kernel_meta, seq_len_tensor, seq_len_tensor, seq_start_loc_tensor,
prompt_lora_indices_tensor) prompt_lora_indices_tensor,
token_lora_indices_tensor, v1_kernel_meta)
def sanity_check(self) -> None: def sanity_check(self) -> None:
""" """
@ -380,9 +482,9 @@ class BenchmarkTensors:
# check metadata tensors # check metadata tensors
assert torch.sum(self.seq_lens) == num_tokens assert torch.sum(self.seq_lens) == num_tokens
num_seqs = self.seq_lens.shape[0] num_seqs = self.seq_lens.shape[0]
#assert self.seq_start_loc.shape[0] == num_seqs assert self.seq_start_loc.shape[0] == num_seqs
assert self.prompt_lora_mapping.shape[0] == num_seqs assert self.prompt_lora_mapping.shape[0] == num_seqs
assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens assert self.token_lora_mapping.shape[0] == num_tokens
def to_device(self, device: str): def to_device(self, device: str):
""" """
@ -397,27 +499,220 @@ class BenchmarkTensors:
self.input = to_device(self.input) self.input = to_device(self.input)
self.output = to_device(self.output) self.output = to_device(self.output)
self.seq_lens = to_device(self.seq_lens) self.seq_lens = to_device(self.seq_lens)
self.seq_start_loc = to_device(self.seq_start_loc)
self.prompt_lora_mapping = to_device(self.prompt_lora_mapping) self.prompt_lora_mapping = to_device(self.prompt_lora_mapping)
self.token_lora_mapping = to_device(self.token_lora_mapping)
for i in range(len(self.lora_weights_lst)): for i in range(len(self.lora_weights_lst)):
self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i]) self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i])
# LoRA meta # v1 meta
for field_name in LoRAKernelMeta.__dataclass_fields__: if self.v1_kernel_meta:
field = getattr(self.lora_kernel_meta, field_name) for field_name in V1KernelMeta.__dataclass_fields__:
assert isinstance(field, torch.Tensor) field = getattr(self.v1_kernel_meta, field_name)
setattr(self.lora_kernel_meta, field_name, to_device(field)) assert isinstance(field, torch.Tensor)
setattr(self.v1_kernel_meta, field_name, to_device(field))
def metadata(self) -> tuple[int, int, int]: def metadata(self) -> tuple[int, int, int]:
""" """
Return num_seqs, num_tokens and max_seq_len Return num_seqs, num_tokens and max_seq_len
""" """
num_seqs = self.seq_lens.shape[0] num_seqs = self.seq_lens.shape[0]
num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0] num_tokens = self.token_lora_mapping.shape[0]
max_seq_len = torch.max(self.seq_lens).item() max_seq_len = torch.max(self.seq_lens).item()
num_slices = len(self.lora_weights_lst) num_slices = len(self.lora_weights_lst)
return num_seqs, num_tokens, max_seq_len, num_slices return num_seqs, num_tokens, max_seq_len, num_slices
def as_lora_shrink_kwargs(self) -> dict[str, Any]: def convert_to_sgmv_benchmark_tensors(self):
"""
For sgmv punica kernels, when consecutive sequences have the
same LoRA ID, we just merge them together.
This happens in punica.py::compute_metadata
"""
# Collapse seq_lens and seq_start_loc
_, seq_lens = torch.unique_consecutive(self.token_lora_mapping,
return_counts=True)
cum_result = torch.cumsum(seq_lens, dim=0)
seq_start_loc = torch.zeros_like(seq_lens)
seq_start_loc[1:].copy_(cum_result[:-1])
# Collapse prompt mapping
prompt_lora_mapping = torch.unique_consecutive(
self.prompt_lora_mapping)
assert torch.sum(seq_lens) == torch.sum(self.seq_lens), \
f"dont match - new {torch.sum(seq_lens)} vs {torch.sum(self.seq_lens)}"
self.prompt_lora_mapping = prompt_lora_mapping.to(
dtype=self.prompt_lora_mapping.dtype)
self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype)
self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype)
def as_sgmv_shrink_kwargs(self) -> dict[str, Any]:
self.convert_to_sgmv_benchmark_tensors()
self.sanity_check()
self.to_device(self.input.device)
num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
# Sanity check matrix shapes.
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
0].shape, self.output.shape
# Expected input shape [num_tokens, hidden_size]
assert len(i_shape) == 2
assert i_shape[0] == num_tokens
hidden_size = i_shape[1]
# Expected lora weight shape [num_loras, lora_rank, hidden_size]
assert len(lw_shape) == 3
assert lw_shape[2] == hidden_size
lora_rank = lw_shape[1]
# Expected output shape [num_slices, num_tokens, lora_rank]
assert len(o_shape) == 3
assert o_shape == (num_slices, num_tokens, lora_rank)
return {
'inputs': self.input,
'lora_a_weights': self.lora_weights_lst,
'output_tensor': self.output,
'b_seq_start_loc': self.seq_start_loc,
'seq_len_tensor': self.seq_lens,
'lora_indices_tensor': self.prompt_lora_mapping,
'batches': num_seqs,
'max_seq_length': max_seq_len,
'token_nums': num_tokens,
'scaling': 1.0,
}
def as_sgmv_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
self.convert_to_sgmv_benchmark_tensors()
self.sanity_check()
self.to_device(self.input.device)
num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
# Sanity check matrix shapes.
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
0].shape, self.output.shape
# Expected input shape : [num_slices, num_tokens, lora_rank]
assert len(i_shape) == 3
assert i_shape[0] == num_slices
assert i_shape[1] == num_tokens
lora_rank = i_shape[2]
# Expected lora weight shape : [num_lora, hidden_size, lora_rank]
assert len(lw_shape) == 3
assert lw_shape[2] == lora_rank
hidden_size = lw_shape[1]
# Expected output shape : [num_tokens, hidden_size * num_slices]
assert len(o_shape) == 2
assert o_shape == (num_tokens, hidden_size * num_slices)
return {
'inputs': self.input,
'lora_b_weights': self.lora_weights_lst,
'output_tensor': self.output,
'b_seq_start_loc': self.seq_start_loc,
'seq_len_tensor': self.seq_lens,
'lora_indices_tensor': self.prompt_lora_mapping,
'batches': num_seqs,
'max_seq_length': max_seq_len,
'token_nums': num_tokens,
'offset_start': 0,
'add_inputs': add_inputs,
}
def as_bgmv_shrink_kwargs(self) -> dict[str, Any]:
assert len(self.lora_weights_lst) == 1
self.to_device(self.input.device)
_, num_tokens, _, _ = self.metadata()
# Sanity check shapes
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
0].shape, self.output.shape
# Expected input shape [num_tokens, hidden_size]
assert len(i_shape) == 2
assert i_shape[0] == num_tokens
hidden_size = i_shape[1]
# Expected lora weight shape [num_loras, lora_rank, hidden_size]
assert len(lw_shape) == 3
assert lw_shape[2] == hidden_size
lora_rank = lw_shape[1]
# Expected output shape [num_tokens, lora_rank]
assert len(o_shape) == 2
assert o_shape == (num_tokens, lora_rank)
return {
'inputs': self.input,
'lora_a_weights': self.lora_weights_lst[0],
'output_tensor': self.output,
'lora_indices_tensor': self.token_lora_mapping,
'scaling': 1.0
}
def as_bgmv_expand_kwargs(self, add_inputs: bool):
assert len(self.lora_weights_lst) == 1
self.to_device(self.input.device)
_, num_tokens, _, _ = self.metadata()
# Sanity check shapes
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
0].shape, self.output.shape
# Expected input shape [num_tokens, lora_rank]
assert len(i_shape) == 2
assert i_shape[0] == num_tokens
lora_rank = i_shape[1]
# Expected lora weight shape [num_loras, hidden_size, lora_rank]
assert len(lw_shape) == 3
assert lw_shape[2] == lora_rank
hidden_size = lw_shape[1]
# Expected output shape [num_tokens, hidden_size]
assert len(o_shape) == 2
assert o_shape == (num_tokens, hidden_size)
return {
'inputs': self.input,
'lora_b_weights': self.lora_weights_lst[0],
'output_tensor': self.output,
'lora_indices_tensor': self.token_lora_mapping,
'add_inputs': add_inputs
}
def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> dict[str, Any]:
_, num_tokens, _, num_slices = self.metadata()
# Sanity check shapes
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
0].shape, self.output.shape
# Expected input shape [num_slices, num_tokens, lora_rank]
assert len(i_shape) == 3
assert i_shape[0] == num_slices
assert i_shape[1] == num_tokens
lora_rank = i_shape[2]
# Expected lora weight shape [num_loras, hidden_size, lora_rank]
assert len(lw_shape) == 3
assert lw_shape[2] == lora_rank
hidden_size = lw_shape[1]
# Expected output shape [num_tokens, hidden_size * num_slices]
assert len(o_shape) == 2
assert o_shape == (num_tokens, hidden_size * num_slices)
self.to_device(self.input.device)
kwargs_list = []
for i in range(num_slices):
kwargs_list.append({
'inputs': self.input[i],
'lora_b_weights': self.lora_weights_lst[i],
'output_tensor': self.output,
'lora_indices_tensor': self.token_lora_mapping,
'slice_offset': i * hidden_size,
'slice_size': hidden_size,
'add_inputs': add_inputs,
})
return {'kwargs_list': kwargs_list}
def as_v1_shrink_kwargs(self) -> dict[str, Any]:
assert self.v1_kernel_meta is not None
self.sanity_check() self.sanity_check()
self.to_device(self.input.device) self.to_device(self.input.device)
@ -442,16 +737,17 @@ class BenchmarkTensors:
'inputs': self.input, 'inputs': self.input,
'lora_a_weights': self.lora_weights_lst, 'lora_a_weights': self.lora_weights_lst,
'output_tensor': self.output, 'output_tensor': self.output,
'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping, 'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
'token_indices_sorted_by_lora_ids': 'token_indices_sorted_by_lora_ids':
self.lora_kernel_meta.token_indices_sorted_by_lora_ids, self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora, 'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc, 'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
'lora_ids': self.lora_kernel_meta.active_lora_ids, 'lora_ids': self.v1_kernel_meta.active_lora_ids,
'scaling': 1.0, 'scaling': 1.0,
} }
def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: def as_v1_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
assert self.v1_kernel_meta is not None
self.sanity_check() self.sanity_check()
self.to_device(self.input.device) self.to_device(self.input.device)
@ -477,12 +773,12 @@ class BenchmarkTensors:
'inputs': self.input, 'inputs': self.input,
'lora_b_weights': self.lora_weights_lst, 'lora_b_weights': self.lora_weights_lst,
'output_tensor': self.output, 'output_tensor': self.output,
'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping, 'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
'token_indices_sorted_by_lora_ids': 'token_indices_sorted_by_lora_ids':
self.lora_kernel_meta.token_indices_sorted_by_lora_ids, self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora, 'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc, 'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
'lora_ids': self.lora_kernel_meta.active_lora_ids, 'lora_ids': self.v1_kernel_meta.active_lora_ids,
'offset_start': 0, 'offset_start': 0,
'add_inputs': add_inputs, 'add_inputs': add_inputs,
} }
@ -495,10 +791,20 @@ class BenchmarkTensors:
else: else:
assert add_inputs is not None assert add_inputs is not None
if op_type == OpType.LORA_SHRINK: if op_type == OpType.SGMV_SHRINK:
return self.as_lora_shrink_kwargs() return self.as_sgmv_shrink_kwargs()
if op_type == OpType.LORA_EXPAND: if op_type == OpType.SGMV_EXPAND:
return self.as_lora_expand_kwargs(add_inputs) return self.as_sgmv_expand_kwargs(add_inputs)
if op_type == OpType.BGMV_SHRINK:
return self.as_bgmv_shrink_kwargs()
if op_type == OpType.BGMV_EXPAND:
return self.as_bgmv_expand_kwargs(add_inputs)
if op_type == OpType.BGMV_EXPAND_SLICE:
return self.as_bgmv_expand_slice_kwargs(add_inputs)
if op_type == OpType.V1_SHRINK:
return self.as_v1_shrink_kwargs()
if op_type == OpType.V1_EXPAND:
return self.as_v1_expand_kwargs(add_inputs)
raise ValueError(f"Unrecognized optype {self}") raise ValueError(f"Unrecognized optype {self}")
def test_correctness(self, op_type: OpType, def test_correctness(self, op_type: OpType,
@ -687,6 +993,10 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):
for bench_ctx in bench_ctxs: for bench_ctx in bench_ctxs:
for seq_len in args.seq_lengths: for seq_len in args.seq_lengths:
bench_ops: list[OpType] = args.op_types bench_ops: list[OpType] = args.op_types
if seq_len > 1:
# bench only prefill ops
bench_ops = [op for op in args.op_types if op.is_prefill_op()]
seq_len_timers = [] seq_len_timers = []
for bench_op in bench_ops: for bench_op in bench_ops:
for num_slices in bench_op.num_slices(): for num_slices in bench_op.num_slices():
@ -896,13 +1206,13 @@ Benchmark LoRA kernels:
{use_cuda_graph_recommendation()} {use_cuda_graph_recommendation()}
list_bench example: list_bench example:
python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
model_bench example: model_bench example:
python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
range_bench example: range_bench example:
python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8
""", # noqa: E501 """, # noqa: E501
formatter_class=argparse.RawTextHelpFormatter) formatter_class=argparse.RawTextHelpFormatter)
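For readers unfamiliar with the shrink/expand terminology used by the `mkn` and `matmul_shapes` helpers above: shrink projects token activations from `hidden_size` down to `lora_rank` with the LoRA A weight, and expand projects them back up to `hidden_size` with the LoRA B weight. The sketch below shows the reference shapes for a single LoRA and a single slice; the sizes are arbitrary example values, not benchmark defaults.

```python
import torch

num_tokens, hidden_size, lora_rank = 8, 64, 16
x = torch.randn(num_tokens, hidden_size)       # token activations
lora_a = torch.randn(lora_rank, hidden_size)   # one slice of the (num_loras, n, k) A weight
lora_b = torch.randn(hidden_size, lora_rank)   # one slice of the (num_loras, n, k) B weight

shrunk = x @ lora_a.T         # shrink: [num_tokens, hidden_size] -> [num_tokens, lora_rank]
expanded = shrunk @ lora_b.T  # expand: [num_tokens, lora_rank]  -> [num_tokens, hidden_size]

assert shrunk.shape == (num_tokens, lora_rank)
assert expanded.shape == (num_tokens, hidden_size)
```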

View File

@ -54,7 +54,6 @@ for qps in "${QPS_VALUES[@]}"; do
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \ python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
--request-rate $qps \ --request-rate $qps \
--result-filename "$FILENAME" \ --result-filename "$FILENAME" \
--tokenizer-mode ${TOKENIZER_MODE:-"auto"} \
--port ${PORT:-8000} --port ${PORT:-8000}
echo "Completed benchmark with QPS: $qps" echo "Completed benchmark with QPS: $qps"

View File

@ -350,8 +350,8 @@ __global__ void concat_and_cache_mla_kernel(
} // namespace vllm } // namespace vllm
// KV_T is the data type of key and value tensors. // KV_T is the stored data type of kv-cache.
// CACHE_T is the stored data type of kv-cache. // CACHE_T is the data type of key and value tensors.
// KV_DTYPE is the real data type of kv-cache. // KV_DTYPE is the real data type of kv-cache.
#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \ #define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \
vllm::reshape_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \ vllm::reshape_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \
@ -393,8 +393,8 @@ void reshape_and_cache(
CALL_RESHAPE_AND_CACHE) CALL_RESHAPE_AND_CACHE)
} }
// KV_T is the data type of key and value tensors. // KV_T is the stored data type of kv-cache.
// CACHE_T is the stored data type of kv-cache. // CACHE_T is the data type of key and value tensors.
// KV_DTYPE is the real data type of kv-cache. // KV_DTYPE is the real data type of kv-cache.
#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \ #define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \
vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE> \ vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE> \
@ -446,8 +446,8 @@ void reshape_and_cache_flash(
CALL_RESHAPE_AND_CACHE_FLASH); CALL_RESHAPE_AND_CACHE_FLASH);
} }
// KV_T is the data type of key and value tensors. // KV_T is the stored data type of kv-cache.
// CACHE_T is the stored data type of kv-cache. // CACHE_T is the data type of key and value tensors.
// KV_DTYPE is the real data type of kv-cache. // KV_DTYPE is the real data type of kv-cache.
#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \ #define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \ vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \

View File

@ -3,12 +3,6 @@
#include "cpu_types.hpp" #include "cpu_types.hpp"
#if defined(__x86_64__)
#define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2
#else
#define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES
#endif
namespace { namespace {
template <typename scalar_t> template <typename scalar_t>
void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches, void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches,
@ -101,12 +95,13 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
} }
const int element_num_per_block = key_caches[0][0].numel(); const int element_num_per_block = key_caches[0][0].numel();
DISPATCH_MACRO(key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] { VLLM_DISPATCH_FLOATING_TYPES(
CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl) key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, block_mapping, CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
element_num_per_block, num_layers); copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, block_mapping,
CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl) element_num_per_block, num_layers);
}); CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
});
} }
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
@ -123,15 +118,16 @@ void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
int key_stride = key.stride(0); int key_stride = key.stride(0);
int value_stride = value.stride(0); int value_stride = value.stride(0);
DISPATCH_MACRO(key.scalar_type(), "reshape_and_cache_cpu_impl", [&] { VLLM_DISPATCH_FLOATING_TYPES(
CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl) key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
reshape_and_cache_cpu_impl<scalar_t>( CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl)
key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(), reshape_and_cache_cpu_impl<scalar_t>(
key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(), key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride, value_stride, key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
num_heads, head_size, block_size, x); slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride,
CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl) value_stride, num_heads, head_size, block_size, x);
}); CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl)
});
} }
void swap_blocks(torch::Tensor& src, torch::Tensor& dst, void swap_blocks(torch::Tensor& src, torch::Tensor& dst,

View File

@ -16,18 +16,9 @@ namespace vec_op {
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
#define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
#define VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, \
VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(__VA_ARGS__))
#ifndef CPU_OP_GUARD #ifndef CPU_OP_GUARD
#define CPU_KERNEL_GUARD_IN(NAME) #define CPU_KERNEL_GUARD_IN(NAME)
#define CPU_KERNEL_GUARD_OUT(NAME) #define CPU_KERNEL_GUARD_OUT(NAME)

View File

@ -170,7 +170,7 @@ void rotary_embedding_gptj_impl(
void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
torch::Tensor& key, int64_t head_size, torch::Tensor& key, int64_t head_size,
torch::Tensor& cos_sin_cache, bool is_neox) { torch::Tensor& cos_sin_cache, bool is_neox) {
int num_tokens = positions.numel(); int num_tokens = query.numel() / query.size(-1);
int rot_dim = cos_sin_cache.size(1); int rot_dim = cos_sin_cache.size(1);
int num_heads = query.size(-1) / head_size; int num_heads = query.size(-1) / head_size;
int num_kv_heads = key.size(-1) / head_size; int num_kv_heads = key.size(-1) / head_size;

View File

@ -274,7 +274,7 @@ void advance_step_flashinfer(
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
[[maybe_unused]] int block_tables_stride = block_tables.stride(0); int block_tables_stride = block_tables.stride(0);
TORCH_CHECK((blocks * threads > num_queries), TORCH_CHECK((blocks * threads > num_queries),
"multi-step: not enough threads to map to num_queries = ", "multi-step: not enough threads to map to num_queries = ",
num_queries, " block_tables.stride(0) = ", block_tables.stride(0), num_queries, " block_tables.stride(0) = ", block_tables.stride(0),

View File

@ -19,24 +19,12 @@ __device__ __forceinline__ fp8_type cvt_c10(float const r) {
return {}; return {};
} }
// __hip_fp8_e4m3 only exists starting in ROCm 6.3. The macro
// HIP_FP8_TYPE_OCP comes from the hip_fp8.h header and also makes
// its first appearance in ROCm 6.3. Since VLLM_DISPATCH_FP8_TYPES
// on ROCm instantiates both OCP and FNUZ kernels, we need to replace
// the new HW cvt with something reasonable that doesn't rely on the
// ROCm 6.3 feature. This allows compiling on ROCm 6.2 or newer.
template <> template <>
__device__ __forceinline__ c10::Float8_e4m3fn cvt_c10(float const r) { __device__ __forceinline__ c10::Float8_e4m3fn cvt_c10(float const r) {
#if HIP_FP8_TYPE_OCP
return c10::Float8_e4m3fn( return c10::Float8_e4m3fn(
__hip_cvt_float_to_fp8(r, __hip_fp8_e4m3::__default_saturation, __hip_cvt_float_to_fp8(r, __hip_fp8_e4m3::__default_saturation,
__hip_fp8_e4m3::__default_interpret), __hip_fp8_e4m3::__default_interpret),
c10::Float8_e4m3fn::from_bits()); c10::Float8_e4m3fn::from_bits());
#else
// Cast implemented by pytorch. Uses bit manipulation instead of HW cvt.
// HW cvt above is faster when it is available (ROCm 6.3 or newer).
return static_cast<c10::Float8_e4m3fn>(r);
#endif
} }
template <> template <>
@ -446,7 +434,7 @@ scaled_vec_conversion<uint16_t, uint8_t>(const uint8_t& a, float scale) {
template <> template <>
__inline__ __device__ uint32_t __inline__ __device__ uint32_t
scaled_vec_conversion<uint32_t, uint16_t>(const uint16_t& a, float scale) { scaled_vec_conversion<uint32_t, uint16_t>(const uint16_t& a, float scale) {
[[maybe_unused]] __half2_raw h2r = __half2_raw h2r =
__hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret); __hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret);
union { union {
__half2_raw h2r; __half2_raw h2r;

View File

@ -206,8 +206,8 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
int offset_m = blockIdx.y * m_count; int offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE; int offset_k = blockIdx.z * BLOCK_KN_SIZE;
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
int n = offset_n + t * 4; int n = offset_n + t * 4;
@ -344,8 +344,8 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
int offset_m = blockIdx.y * m_count; int offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE; int offset_k = blockIdx.z * BLOCK_KN_SIZE;
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
int n = offset_n + t * 4; int n = offset_n + t * 4;
@ -465,8 +465,8 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
int offset_m = blockIdx.y * m_count; int offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE; int offset_k = blockIdx.z * BLOCK_KN_SIZE;
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
int n = offset_n + t * 4; int n = offset_n + t * 4;
@ -593,8 +593,8 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
int offset_m = blockIdx.y * m_count; int offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE; int offset_k = blockIdx.z * BLOCK_KN_SIZE;
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
int n = offset_n + t * 4; int n = offset_n + t * 4;

View File

@ -437,10 +437,9 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) { for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) {
#pragma unroll #pragma unroll
for (int k_idx = 0; k_idx < 2; ++k_idx) { for (int k_idx = 0; k_idx < 2; ++k_idx) {
FType low16 = FType low16 = static_cast<FType>(C_frag[m_idx][n_idx][k_idx * 2]);
ScalarType<FType>::float2num(C_frag[m_idx][n_idx][k_idx * 2]);
FType high16 = FType high16 =
ScalarType<FType>::float2num(C_frag[m_idx][n_idx][k_idx * 2 + 1]); static_cast<FType>(C_frag[m_idx][n_idx][k_idx * 2 + 1]);
uint32_t tmp = (reinterpret_cast<uint32_t&>(low16) & 0xffff) | uint32_t tmp = (reinterpret_cast<uint32_t&>(low16) & 0xffff) |
(reinterpret_cast<uint32_t&>(high16) << 16); (reinterpret_cast<uint32_t&>(high16) << 16);
int sts_offset = int sts_offset =
@ -794,7 +793,7 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
FT scale_reg[4]; FT scale_reg[4];
*(reinterpret_cast<uint2*>(scale_reg)) = *(reinterpret_cast<uint2*>(scale_reg)) =
*(reinterpret_cast<const uint2*>(scales + params_nidx)); *(reinterpret_cast<const uint2*>(scales + params_nidx));
FT zero_reg[4]; FT zero_reg[4] = {0};
if (zeros != nullptr) { if (zeros != nullptr) {
*(reinterpret_cast<uint2*>(zero_reg)) = *(reinterpret_cast<uint2*>(zero_reg)) =
*(reinterpret_cast<const uint2*>(zeros + params_nidx)); *(reinterpret_cast<const uint2*>(zeros + params_nidx));
@ -810,10 +809,8 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
reinterpret_cast<typename HalfType<FT>::T2*>(&(fval_reg[ni * 4]))); reinterpret_cast<typename HalfType<FT>::T2*>(&(fval_reg[ni * 4])));
#pragma unroll #pragma unroll
for (int ki = 0; ki < 4; ++ki) { for (int ki = 0; ki < 4; ++ki) {
if (zeros != nullptr) { fval_reg[ni * 4 + ki] =
fval_reg[ni * 4 + ki] = __hsub(fval_reg[ni * 4 + ki], zero_reg[ni]); (fval_reg[ni * 4 + ki] - zero_reg[ni]) * scale_reg[ni];
}
fval_reg[ni * 4 + ki] = __hmul(fval_reg[ni * 4 + ki], scale_reg[ni]);
int sts_offset = sts_base_offset + ((ki / 2) * 8 + (ki % 2)) * 32 + int sts_offset = sts_base_offset + ((ki / 2) * 8 + (ki % 2)) * 32 +
((ni + lane_id % 4) % 4) * 8; ((ni + lane_id % 4) % 4) * 8;
smem[sts_offset] = fval_reg[ni * 4 + ki]; smem[sts_offset] = fval_reg[ni * 4 + ki];

View File

@ -7,8 +7,6 @@
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <cuda_bf16.h> #include <cuda_bf16.h>
#include <iostream> #include <iostream>
#include "../gptq_marlin/marlin_dtypes.cuh"
using marlin::ScalarType;
namespace allspark { namespace allspark {
@ -68,14 +66,14 @@ __global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C,
return; return;
} }
float sum = 0.f; FType sum(0);
int n_mat = N_MATRIX > 0 ? N_MATRIX : (int)n_matrix; int n_mat = N_MATRIX > 0 ? N_MATRIX : (int)n_matrix;
for (int i = 0; i < n_mat; ++i) { for (int i = 0; i < n_mat; ++i) {
sum += ScalarType<FType>::num2float(C_split[idx + i * matrix_size]); sum += C_split[idx + i * matrix_size];
} }
C[idx] = ScalarType<FType>::float2num(sum); C[idx] = sum;
} }
template <typename FType> template <typename FType>

View File

@ -127,7 +127,7 @@ __device__ __forceinline__ T from_float(const float& inp) {
template <typename T> template <typename T>
__device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) { __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
[[maybe_unused]] union tmpcvt { union tmpcvt {
uint16_t u; uint16_t u;
_Float16 f; _Float16 f;
__hip_bfloat16 b; __hip_bfloat16 b;
@ -160,7 +160,7 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
template <typename T> template <typename T>
__device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1, __device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1,
const _B16x4& inp2) { const _B16x4& inp2) {
[[maybe_unused]] union tmpcvt { union tmpcvt {
uint16_t u; uint16_t u;
_Float16 f; _Float16 f;
__hip_bfloat16 b; __hip_bfloat16 b;
@ -308,8 +308,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
constexpr int GQA_RATIO4 = DIVIDE_ROUND_UP(GQA_RATIO, 4); constexpr int GQA_RATIO4 = DIVIDE_ROUND_UP(GQA_RATIO, 4);
[[maybe_unused]] __shared__ float shared_qk_max[NWARPS][16 + 1]; __shared__ float shared_qk_max[NWARPS][16 + 1];
[[maybe_unused]] __shared__ float shared_exp_sum[NWARPS][16 + 1]; __shared__ float shared_exp_sum[NWARPS][16 + 1];
// shared_logits is used for multiple purposes // shared_logits is used for multiple purposes
__shared__ _B16x4 shared_logits[NWARPS][4][16][4]; __shared__ _B16x4 shared_logits[NWARPS][4][16][4];
@ -426,8 +426,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride; const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
const int klocal_token_idx = const int klocal_token_idx =
TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
[[maybe_unused]] const int kglobal_token_idx = const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
partition_start_token_idx + klocal_token_idx;
const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE; const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX; const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;
@ -1273,9 +1272,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
const int seq_idx = blockIdx.y; const int seq_idx = blockIdx.y;
const int context_len = context_lens[seq_idx]; const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
[[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const int warpid = threadIdx.x / WARP_SIZE; const int warpid = threadIdx.x / WARP_SIZE;
[[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE; const int laneid = threadIdx.x % WARP_SIZE;
__shared__ float shared_global_exp_sum; __shared__ float shared_global_exp_sum;
// max num partitions supported is warp_size * NPAR_LOOPS // max num partitions supported is warp_size * NPAR_LOOPS

View File

@ -370,7 +370,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> " "cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> "
"bool"); "bool");
ops.impl("cutlass_scaled_mm_supports_block_fp8", ops.impl("cutlass_scaled_mm_supports_block_fp8",
&cutlass_scaled_mm_supports_block_fp8); &cutlass_scaled_mm_supports_fp8);
// Check if cutlass sparse scaled_mm is supported for CUDA devices of the // Check if cutlass sparse scaled_mm is supported for CUDA devices of the
// given capability // given capability

View File

@ -4,7 +4,6 @@
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
- [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0)
- [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) - [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing)
- [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing) - [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing)
- [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing) - [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing)

View File

@ -34,8 +34,7 @@ Further update the model as follows:
image_features = self.vision_encoder(image_input) image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features) return self.multi_modal_projector(image_features)
def get_multimodal_embeddings( def get_multimodal_embeddings(self, **kwargs: object) -> Optional[NestedTensors]:
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
# Validate the multimodal input keyword arguments # Validate the multimodal input keyword arguments
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
@ -62,7 +61,7 @@ Further update the model as follows:
def get_input_embeddings( def get_input_embeddings(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
multimodal_embeddings: Optional[MultiModalEmbeddings] = None, multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor: ) -> torch.Tensor:
# `get_input_embeddings` should already be implemented for the language # `get_input_embeddings` should already be implemented for the language

View File

@ -34,11 +34,11 @@ If you need to use those dependencies (having accepted the license terms),
create a custom Dockerfile on top of the base image with an extra layer that installs them: create a custom Dockerfile on top of the base image with an extra layer that installs them:
```Dockerfile ```Dockerfile
FROM vllm/vllm-openai:v0.8.0 FROM vllm/vllm-openai:v0.7.3
# e.g. install the `audio` and `video` optional dependencies # e.g. install the `audio` and `video` optional dependencies
# NOTE: Make sure the version of vLLM matches the base image! # NOTE: Make sure the version of vLLM matches the base image!
RUN uv pip install vllm[audio,video]==0.8.0 RUN uv pip install --system vllm[audio,video]==0.7.3
``` ```
::: :::
@ -52,7 +52,7 @@ with an extra layer that installs their code from source:
```Dockerfile ```Dockerfile
FROM vllm/vllm-openai:latest FROM vllm/vllm-openai:latest
RUN uv pip install git+https://github.com/huggingface/transformers.git RUN uv pip install --system git+https://github.com/huggingface/transformers.git
``` ```
::: :::

View File

@ -4,9 +4,9 @@
A Helm chart to deploy vLLM for Kubernetes A Helm chart to deploy vLLM for Kubernetes
Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values. Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLMm Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variables values.
This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm installation and documentation on architecture and values file. This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm install and documentation on architecture and values file.
## Prerequisites ## Prerequisites

View File

@ -4,19 +4,17 @@
Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes. Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes.
Alternatively, you can deploy vLLM to Kubernetes using any of the following: --------
* [Helm](frameworks/helm.md)
* [InftyAI/llmaz](integrations/llmaz.md) Alternatively, you can also deploy Kubernetes using [helm chart](https://docs.vllm.ai/en/latest/deployment/frameworks/helm.html). There are also open-source projects available to make your deployment even smoother.
* [KServe](integrations/kserve.md)
* [kubernetes-sigs/lws](frameworks/lws.md) * [vLLM production-stack](https://github.com/vllm-project/production-stack): Born out of a Berkeley-UChicago collaboration, vLLM production stack is a project that contains latest research and community effort, while still delivering production-level stability and performance. Checkout the [documentation page](https://docs.vllm.ai/en/latest/deployment/integrations/production-stack.html) for more details and examples.
* [meta-llama/llama-stack](integrations/llamastack.md)
* [substratusai/kubeai](integrations/kubeai.md) --------
* [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
* [vllm-project/production-stack](integrations/production-stack.md)
## Pre-requisite ## Pre-requisite
Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/). Ensure that you have a running Kubernetes environment with GPU (you can follow [this tutorial](https://github.com/vllm-project/production-stack/blob/main/tutorials/00-install-kubernetes-env.md) to install a Kubernetes environment on a bare-medal GPU machine).
## Deployment using native K8s ## Deployment using native K8s

View File

@ -419,7 +419,7 @@ List of `v_vec` for one thread
which is also `V_VEC_SIZE` elements from `logits`. Overall, with which is also `V_VEC_SIZE` elements from `logits`. Overall, with
multiple inner iterations, each warp will process one block of value multiple inner iterations, each warp will process one block of value
tokens. And with multiple outer iterations, the whole context value tokens. And with multiple outer iterations, the whole context value
tokens are processed tokens are processd
```cpp ```cpp
float accs[NUM_ROWS_PER_THREAD]; float accs[NUM_ROWS_PER_THREAD];

View File

@ -13,7 +13,7 @@ Ensure the v1 LLM Engine exposes a superset of the metrics available in v0.
Metrics in vLLM can be categorized as follows: Metrics in vLLM can be categorized as follows:
1. Server-level metrics: these are global metrics that track the state and performance of the LLM engine. These are typically exposed as Gauges or Counters in Prometheus. 1. Server-level metrics: these are global metrics that track the state and performance of the LLM engine. These are typically exposed as Gauges or Counters in Prometheus.
2. Request-level metrics: these are metrics that track the characteristics - e.g. size and timing - of individual requests. These are typically exposed as Histograms in Prometheus, and are often the SLO that an SRE monitoring vLLM will be tracking. 2. Request-level metrics: these are metrics that track the characteristics - e.g. size and timing - of individual requests. These are typically exposed as Histrograms in Prometheus, and are often the SLO that an SRE monitoring vLLM will be tracking.
The mental model is that the "Server-level Metrics" explain why the "Request-level Metrics" are what they are. The mental model is that the "Server-level Metrics" explain why the "Request-level Metrics" are what they are.
@ -47,7 +47,7 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics`
- `vllm:tokens_total` (Counter) - `vllm:tokens_total` (Counter)
- `vllm:iteration_tokens_total` (Histogram) - `vllm:iteration_tokens_total` (Histogram)
- `vllm:time_in_queue_requests` (Histogram) - `vllm:time_in_queue_requests` (Histogram)
- `vllm:model_forward_time_milliseconds` (Histogram) - `vllm:model_forward_time_milliseconds` (Histogram
- `vllm:model_execute_time_milliseconds` (Histogram) - `vllm:model_execute_time_milliseconds` (Histogram)
- `vllm:request_params_n` (Histogram) - `vllm:request_params_n` (Histogram)
- `vllm:request_params_max_tokens` (Histogram) - `vllm:request_params_max_tokens` (Histogram)
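All of the series above are exported through the Prometheus-compatible `/metrics` endpoint mentioned earlier in this document. A small standalone sketch (not vLLM code) that dumps only the `vllm:`-prefixed series, assuming a server is already running on localhost:8000:

```python
import urllib.request

with urllib.request.urlopen("http://localhost:8000/metrics") as resp:
    body = resp.read().decode()

for line in body.splitlines():
    if line.startswith("vllm:"):   # keep only vLLM's own series
        print(line)
```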

View File

@ -191,7 +191,7 @@ When the head block (least recently used block) of the free queue is cached, we
In this example, we assume the block size is 4 (each block can cache 4 tokens), and we have 10 blocks in the KV-cache manager in total. In this example, we assume the block size is 4 (each block can cache 4 tokens), and we have 10 blocks in the KV-cache manager in total.
**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens. **Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 2 of 4 tokens.
:::{image} /assets/design/v1/prefix_caching/example-time-1.png :::{image} /assets/design/v1/prefix_caching/example-time-1.png
:alt: Example Time 1 :alt: Example Time 1
@ -203,7 +203,7 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),
:alt: Example Time 3 :alt: Example Time 3
::: :::
**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens. **Time 4: Request 1 comes in with the 14 prompt tokens, where the first 11 tokens are the same as request 0.** We can see that only 2 blocks (11 tokens) hit the cache, because the 3rd block only matches 3 of 4 tokens.
:::{image} /assets/design/v1/prefix_caching/example-time-4.png :::{image} /assets/design/v1/prefix_caching/example-time-4.png
:alt: Example Time 4 :alt: Example Time 4
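A purely illustrative sketch of the block-aligned matching described above (the block size, token sequences, and helper function are made up for this example and are not the KV-cache manager's actual code):

```python
# Illustrative sketch of block-aligned prefix matching (not vLLM's implementation).
BLOCK_SIZE = 4

def num_cached_blocks(cached_tokens: list[int], new_tokens: list[int]) -> int:
    """Count how many leading full blocks of new_tokens hit the cache."""
    hits = 0
    for start in range(0, len(new_tokens), BLOCK_SIZE):
        block = new_tokens[start:start + BLOCK_SIZE]
        cached_block = cached_tokens[start:start + BLOCK_SIZE]
        # Only full, exactly matching blocks can be reused from the cache.
        if len(block) == BLOCK_SIZE and block == cached_block:
            hits += 1
        else:
            break
    return hits

request_0 = list(range(15))                      # 15 prompt tokens cached earlier
request_1 = list(range(10)) + [99, 98, 97, 96]   # 14 tokens, first 10 shared
print(num_cached_blocks(request_0, request_1))   # -> 2 (8 tokens hit the cache)
```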
View File
@ -110,7 +110,7 @@ In addition to serving LoRA adapters at server startup, the vLLM server now supp
LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility
to change models on-the-fly is needed. to change models on-the-fly is needed.
Note: Enabling this feature in production environments is risky as users may participate in model adapter management. Note: Enabling this feature in production environments is risky as user may participate model adapter management.
To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING`
is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active. is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active.
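A hedged sketch of calling the dynamic load/unload endpoints from Python is shown below; the adapter name, adapter path, and server address are placeholders.

```python
# Hedged sketch: driving the dynamic LoRA endpoints with `requests`.
# The adapter name/path and server address below are placeholders.
import requests

BASE_URL = "http://localhost:8000"

# Load an adapter at runtime (requires VLLM_ALLOW_RUNTIME_LORA_UPDATING=True).
resp = requests.post(
    f"{BASE_URL}/v1/load_lora_adapter",
    json={"lora_name": "sql_adapter", "lora_path": "/path/to/sql-lora"},
)
resp.raise_for_status()

# Unload it again when it is no longer needed.
resp = requests.post(
    f"{BASE_URL}/v1/unload_lora_adapter",
    json={"lora_name": "sql_adapter"},
)
resp.raise_for_status()
```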
View File
@ -162,7 +162,7 @@ A variety of speculative models of this type are available on HF hub:
## Speculating using EAGLE based draft models ## Speculating using EAGLE based draft models
The following code configures vLLM to use speculative decoding where proposals are generated by The following code configures vLLM to use speculative decoding where proposals are generated by
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](<gh-file:examples/offline_inference/eagle.py>). an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model.
```python ```python
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
View File
@ -15,7 +15,7 @@ more are listed [here](#supported-models).
By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B), By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B),
[Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models, [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models,
but they are expected to be inferior to models that are specifically trained on embedding tasks. but they are expected be inferior to models that are specifically trained on embedding tasks.
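As a hedged sketch, assuming the engine's `task="embed"` option, a generative checkpoint can be run for embeddings roughly like this (the model choice is illustrative):

```python
# Hedged sketch: running a generative checkpoint as an embedding model.
# Assumes the `task="embed"` engine option; the model name is illustrative.
from vllm import LLM

llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.3", task="embed")
outputs = llm.embed(["Hello, my name is", "The capital of France is"])
for output in outputs:
    # Each output carries one pooled embedding vector for its prompt.
    print(len(output.outputs.embedding))
```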
______________________________________________________________________ ______________________________________________________________________
View File
@ -119,7 +119,7 @@ If you're observing the following error: `docker: Error response from daemon: Un
## Supported configurations ## Supported configurations
The following configurations have been validated to function with The following configurations have been validated to be function with
Gaudi2 devices. Configurations that are not listed may or may not work. Gaudi2 devices. Configurations that are not listed may or may not work.
- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) - [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b)
View File
@ -19,7 +19,7 @@ Currently, there are no pre-built OpenVINO wheels.
### Build wheel from source ### Build wheel from source
First, install Python and ensure you have the latest pip. For example, on Ubuntu 22.04, you can run: First, install Python and ensure you lave the latest pip. For example, on Ubuntu 22.04, you can run:
```console ```console
sudo apt-get update -y sudo apt-get update -y
View File
@ -189,13 +189,12 @@ vLLM CPU backend supports the following vLLM features:
- Model Quantization (`INT8 W8A8, AWQ, GPTQ`) - Model Quantization (`INT8 W8A8, AWQ, GPTQ`)
- Chunked-prefill - Chunked-prefill
- Prefix-caching - Prefix-caching
- FP8-E5M2 KV cache - FP8-E5M2 KV-Caching (TODO)
## Related runtime environment variables ## Related runtime environment variables
- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. - `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
- `VLLM_CPU_MOE_PREPACK`: whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).
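As a hedged example, the variables above can also be set from Python before the engine is created; the values shown are illustrative and should match your hardware:

```python
# Hedged sketch: configuring the CPU backend variables before building the engine.
# The values are illustrative only; tune them to your machine.
import os

os.environ["VLLM_CPU_KVCACHE_SPACE"] = "40"       # 40 GB reserved for the KV cache
os.environ["VLLM_CPU_OMP_THREADS_BIND"] = "0-31"  # pin OpenMP threads to cores 0-31
os.environ["VLLM_CPU_MOE_PREPACK"] = "1"          # default; set "0" on unsupported CPUs

from vllm import LLM  # import after the environment is configured

llm = LLM(model="facebook/opt-125m")
print(llm.generate("Hello, my name is")[0].outputs[0].text)
```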
## Performance tips ## Performance tips
View File
@ -131,8 +131,6 @@ Building from source requires a lot of compilation. If you are building from sou
For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` . For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` .
As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
When using `ccache` with `pip install -e .`, you should run `CCACHE_NOHASHDIR="true" pip install --no-build-isolation -e .`. This is because `pip` creates a new folder with a random name for each build, preventing `ccache` from recognizing that the same files are being built.
[sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments. [sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments.
The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.
::: :::
View File
@ -1,6 +1,6 @@
# Installation # Installation
vLLM initially supports basic model inference and serving on Intel GPU platform. vLLM initially supports basic model inferencing and serving on Intel GPU platform.
:::{attention} :::{attention}
There are no pre-built wheels or images for this device, so you must build vLLM from source. There are no pre-built wheels or images for this device, so you must build vLLM from source.
@ -65,7 +65,7 @@ $ docker run -it \
## Supported features ## Supported features
XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution like following: XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following:
```console ```console
python -m vllm.entrypoints.openai.api_server \ python -m vllm.entrypoints.openai.api_server \
@ -78,6 +78,6 @@ python -m vllm.entrypoints.openai.api_server \
-tp=8 -tp=8
``` ```
By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script. By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.
There are some new features coming with ipex-xpu 2.6, e.g. **chunked prefill**, **V1 engine support**, **lora**, **MoE**, etc. There are some new features coming with ipex-xpu 2.6, eg: **chunked prefill**, **V1 engine support**, **lora**, **MoE**, etc.
View File
@ -1,161 +0,0 @@
# vLLM V1 User Guide
V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason!
## Why vLLM V1?
vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.
Building on V0's success, vLLM V1 retains the stable and proven components from V0
(such as the models, GPU kernels, and utilities). At the same time, it significantly
re-architects the core systems, covering the scheduler, KV cache manager, worker,
sampler, and API server, to provide a cohesive, maintainable framework that better
accommodates continued growth and innovation.
Specifically, V1 aims to:
- Provide a **simple, modular, and easy-to-hack codebase**.
- Ensure **high performance** with near-zero CPU overhead.
- **Combine key optimizations** into a unified architecture.
- Require **zero configs** by enabling features/optimizations by default.
We see significant performance improvements from upgrading to the V1 core engine, in
particular for long-context scenarios. Please see the performance benchmark (to be
added).
For more details, check out the vLLM V1 blog post [vLLM V1: A Major
Upgrade to vLLM's Core Architecture](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html) (published Jan 27, 2025).
This living user guide outlines a few known **important changes and limitations** introduced by vLLM V1. The team has been working actively to make V1 the default engine, so this guide will be updated continually as more features gain support on vLLM V1.
### Supports Overview
#### Hardware
| Hardware | Status |
|----------|------------------------------------------|
| **NVIDIA** | <nobr>🚀 Natively Supported</nobr> |
| **AMD** | <nobr>🚧 WIP</nobr> |
| **TPU** | <nobr>🚧 WIP</nobr> |
#### Feature / Model
| Feature / Model | Status |
|-----------------|-----------------------------------------------------------------------------------|
| **Prefix Caching** | <nobr>🚀 Optimized</nobr> |
| **Chunked Prefill** | <nobr>🚀 Optimized</nobr> |
| **Logprobs Calculation** | <nobr>🟢 Functional</nobr> |
| **LoRA** | <nobr>🟢 Functional ([PR #13096](https://github.com/vllm-project/vllm/pull/13096))</nobr>|
| **Multimodal Models** | <nobr>🟢 Functional</nobr> |
| **Spec Decode** | <nobr>🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))</nobr>|
| **Prompt Logprobs with Prefix Caching** | <nobr>🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))</nobr>|
| **FP8 KV Cache** | <nobr>🟡 Planned</nobr> |
| **Structured Output Alternative Backends** | <nobr>🟡 Planned</nobr> |
| **Embedding Models** | <nobr>🟡 Planned ([RFC #12249](https://github.com/vllm-project/vllm/issues/12249))</nobr> |
| **Mamba Models** | <nobr>🟡 Planned</nobr> |
| **Encoder-Decoder Models** | <nobr>🟡 Planned</nobr> |
| **Request-level Structured Output Backend** | <nobr>🔴 Deprecated</nobr> |
| **best_of** | <nobr>🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))</nobr>|
| **Per-Request Logits Processors** | <nobr>🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360))</nobr> |
| **GPU <> CPU KV Cache Swapping** | <nobr>🔴 Deprecated</nobr> |
- **🚀 Optimized**: Nearly fully optimized, with no further work currently planned.
- **🟢 Functional**: Fully operational, with ongoing optimizations.
- **🚧 WIP**: Under active development.
- **🟡 Planned**: Scheduled for future implementation (some may have open PRs/RFCs).
- **🔴 Deprecated**: Not planned for v1 unless there is strong demand.
**Note**: vLLM V1's unified scheduler treats both prompt and output tokens the same
way by using a simple dictionary (e.g., {request_id: num_tokens}) to dynamically
allocate a fixed token budget per request, enabling features like chunked prefills,
prefix caching, and speculative decoding without a strict separation between prefill
and decode phases.
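A purely illustrative sketch of this token-budget idea (the names and numbers are made up; this is not the scheduler's real code):

```python
# Illustrative sketch of a fixed token budget split across requests each step.
# This is NOT the V1 scheduler's actual code; names and numbers are invented.
def schedule_step(num_tokens_needed: dict[str, int], token_budget: int) -> dict[str, int]:
    """Assign each request as many tokens as possible within the step budget."""
    scheduled: dict[str, int] = {}
    for request_id, needed in num_tokens_needed.items():
        if token_budget == 0:
            break
        num_scheduled = min(needed, token_budget)  # a long prefill may be chunked
        scheduled[request_id] = num_scheduled
        token_budget -= num_scheduled
    return scheduled

# Two decoding requests and one prefill-heavy request share a 512-token budget.
print(schedule_step({"req-1": 1, "req-2": 1, "req-0": 1000}, token_budget=512))
# -> {'req-1': 1, 'req-2': 1, 'req-0': 510}
```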
### Semantic Changes and Deprecated Features
#### Logprobs
vLLM V1 supports logprobs and prompt logprobs. However, there are some important semantic
differences compared to V0:
**Logprobs Calculation**
Logprobs in V1 are now returned immediately once computed from the model's raw output (i.e.
before applying any logits post-processing such as temperature scaling or penalty
adjustments). As a result, the returned logprobs do not reflect the final adjusted
probabilities used during sampling.
Support for logprobs with post-sampling adjustments is in progress and will be added in future updates.
**Prompt Logprobs with Prefix Caching**
Currently prompt logprobs are only supported when prefix caching is turned off via `--no-enable-prefix-caching`. In a future release, prompt logprobs will be compatible with prefix caching, but a recomputation will be triggered to recover the full prompt logprobs even upon a prefix cache hit. See details in [RFC #13414](https://github.com/vllm-project/vllm/issues/13414).
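A hedged sketch of requesting logprobs and prompt logprobs on V1 with prefix caching disabled (the model name is illustrative):

```python
# Hedged sketch: logprobs and prompt logprobs on V1.
# Prefix caching is disabled because prompt logprobs currently require it off.
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enable_prefix_caching=False)
params = SamplingParams(max_tokens=16, logprobs=5, prompt_logprobs=5)
(output, ) = llm.generate(["The capital of France is"], params)

print(output.prompt_logprobs)      # per prompt token: top-5 logprobs
print(output.outputs[0].logprobs)  # per generated token: top-5 logprobs
```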
#### Deprecated Features
As part of the major architectural rework in vLLM V1, several legacy features have been deprecated.
**Sampling features**
- **best_of**: This feature has been deprecated due to limited usage. See details at [RFC #13361](https://github.com/vllm-project/vllm/issues/13361).
- **Per-Request Logits Processors**: In V0, users could pass custom
processing functions to adjust logits on a per-request basis. In vLLM V1, this
feature has been deprecated. Instead, the design is moving toward supporting **global logits
processors**, a feature the team is actively working on for future releases. See details at [RFC #13360](https://github.com/vllm-project/vllm/pull/13360).
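For reference, a hedged sketch of what the deprecated V0-style per-request logits processor looked like (the banned token id and model are illustrative):

```python
# Hedged sketch of a V0-style per-request logits processor (deprecated in V1).
# The callable receives the generated token ids and the raw logits for the
# next token, and returns (possibly modified) logits.
import torch
from vllm import LLM, SamplingParams

BANNED_TOKEN_ID = 1000  # arbitrary token id to suppress, for illustration

def ban_token(token_ids: list[int], logits: torch.Tensor) -> torch.Tensor:
    logits[BANNED_TOKEN_ID] = float("-inf")
    return logits

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(max_tokens=16, logits_processors=[ban_token])
print(llm.generate(["Hello, my name is"], params)[0].outputs[0].text)
```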
**KV Cache features**
- **GPU <> CPU KV Cache Swapping**: with the new simplified core architecture, vLLM V1 no longer requires KV cache swapping
to handle request preemptions.
**Structured Output features**
- **Request-level Structured Output Backend**: Deprecated; support for alternative backends
(outlines, guidance) with fallbacks is WIP.
### Feature & Model Support in Progress
Although we have re-implemented and partially optimized many features and models from V0 in vLLM V1, optimization work is still ongoing for some, and others remain unsupported.
#### Features to Be Optimized
These features are already supported in vLLM V1, but their optimization is still
in progress.
- **LoRA**: LoRA is functionally working on vLLM V1 but its performance is
inferior to that of V0. The team is actively working on improving its
performance
(e.g., see [PR #13096](https://github.com/vllm-project/vllm/pull/13096)).
- **Spec Decode**: Currently, only ngram-based spec decode is supported in V1. There
will be follow-up work to support other types of spec decode (e.g., see [PR #13933](https://github.com/vllm-project/vllm/pull/13933)). We will prioritize support for EAGLE and MTP over draft-model-based spec decode.
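A hedged sketch of enabling the currently supported ngram-based spec decode (the argument names follow the current engine args; the model and numbers are illustrative):

```python
# Hedged sketch: ngram-based speculative decoding, the variant supported on V1.
# The model name and speculative settings below are illustrative only.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    speculative_model="[ngram]",      # propose drafts via n-gram prompt lookup
    num_speculative_tokens=5,
    ngram_prompt_lookup_max=4,
)
params = SamplingParams(temperature=0.0, max_tokens=64)
print(llm.generate(["The future of AI is"], params)[0].outputs[0].text)
```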
#### Features to Be Supported
- **FP8 KV Cache**: While vLLM V1 introduces new FP8 kernels for model weight quantization, support for an FP8 key-value cache is not yet available. Users must continue using FP16 (or other supported precisions) for the KV cache.
- **Structured Output Alternative Backends**: Structured output alternative backends (outlines, guidance) support is planned. V1 currently
supports only the `xgrammar:no_fallback` mode, meaning that it will error out if the output schema is unsupported by xgrammar.
Details about the structured outputs can be found
[here](https://docs.vllm.ai/en/latest/features/structured_outputs.html).
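A hedged usage sketch with the xgrammar backend (the schema and model are illustrative; an unsupported schema will raise an error because there is no fallback yet):

```python
# Hedged sketch: structured output on V1 with the xgrammar backend.
# The JSON schema and model name are illustrative only.
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

json_schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
    "required": ["name", "age"],
}

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
          guided_decoding_backend="xgrammar")
params = SamplingParams(
    max_tokens=64,
    guided_decoding=GuidedDecodingParams(json=json_schema),
)
print(llm.generate(["Give me a JSON profile for a person."], params)[0].outputs[0].text)
```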
#### Models to Be Supported
vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol,
and the majority fall into the following categories. V1 support for these models will be added eventually.
**Embedding Models**
Instead of having a separate model runner, hidden states processor [RFC #12249](https://github.com/vllm-project/vllm/issues/12249), which is based on global logits processor [RFC #13360](https://github.com/vllm-project/vllm/pull/13360), has been proposed to enable simultaneous generation and embedding using the same engine instance in V1. It is still in the planning stage.
**Mamba Models**
Models using selective state-space mechanisms (instead of standard transformer attention)
are not yet supported (e.g., `MambaForCausalLM`, `JambaForCausalLM`).
**Encoder-Decoder Models**
vLLM V1 is currently optimized for decoder-only transformers. Models requiring
cross-attention between separate encoder and decoder are not yet supported (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`).
For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html).
## FAQ
TODO
View File
@ -67,8 +67,6 @@ getting_started/quickstart
getting_started/examples/examples_index getting_started/examples/examples_index
getting_started/troubleshooting getting_started/troubleshooting
getting_started/faq getting_started/faq
getting_started/v1_user_guide
::: :::
% What does vLLM support? % What does vLLM support?
View File
@ -101,7 +101,7 @@ class MyAttention(nn.Module):
def forward(self, hidden_states, **kwargs): # <- kwargs are required def forward(self, hidden_states, **kwargs): # <- kwargs are required
... ...
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attention_interface = attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface( attn_output, attn_weights = attention_interface(
self, self,
query_states, query_states,
@ -477,11 +477,6 @@ See [this page](#generative-models) for more information on how to use generativ
* `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. * `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
* ✅︎ * ✅︎
* ✅︎ * ✅︎
- * `Zamba2ForCausalLM`
* Zamba2
* `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc.
*
*
::: :::
:::{note} :::{note}
@ -884,7 +879,7 @@ See [this page](#generative-models) for more information on how to use generativ
- * `PixtralForConditionalGeneration` - * `PixtralForConditionalGeneration`
* Pixtral * Pixtral
* T + I<sup>+</sup> * T + I<sup>+</sup>
* `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. * `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b`, etc.
* *
* ✅︎ * ✅︎
* ✅︎ * ✅︎
@ -951,7 +946,7 @@ V0 correctly implements the model's attention pattern:
V1 currently uses a simplified attention pattern: V1 currently uses a simplified attention pattern:
- Uses causal attention for all tokens, including image tokens - Uses causal attention for all tokens, including image tokens
- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": True}` - Generates reasonable outputs but does not match the original model's attention for text + image inputs
- Will be updated in the future to support the correct behavior - Will be updated in the future to support the correct behavior
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends. This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
View File
@ -20,7 +20,7 @@ There is one edge case: if the model fits in a single node with multiple GPUs, b
## Running vLLM on a single node ## Running vLLM on a single node
vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inference currently requires Ray. vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.
Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured `tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the `LLM` class `distributed_executor_backend` argument or `--distributed-executor-backend` API server argument. Set it to `mp` for multiprocessing or `ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured `tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the `LLM` class `distributed_executor_backend` argument or `--distributed-executor-backend` API server argument. Set it to `mp` for multiprocessing or `ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
@ -29,7 +29,7 @@ To run multi-GPU inference with the `LLM` class, set the `tensor_parallel_size`
```python ```python
from vllm import LLM from vllm import LLM
llm = LLM("facebook/opt-13b", tensor_parallel_size=4) llm = LLM("facebook/opt-13b", tensor_parallel_size=4)
output = llm.generate("San Francisco is a") output = llm.generate("San Franciso is a")
``` ```
To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
View File
@ -39,16 +39,7 @@ The following metrics are exposed:
The following metrics are deprecated and due to be removed in a future version: The following metrics are deprecated and due to be removed in a future version:
- `vllm:num_requests_swapped`, `vllm:cpu_cache_usage_perc`, and - *(No metrics are currently deprecated)*
`vllm:cpu_prefix_cache_hit_rate` because KV cache offloading is not
used in V1.
- `vllm:gpu_prefix_cache_hit_rate` is replaced by queries+hits
counters in V1.
- `vllm:time_in_queue_requests` because it duplicates
`vllm:request_queue_time_seconds`.
- `vllm:model_forward_time_milliseconds` and
`vllm:model_execute_time_milliseconds` because
prefill/decode/inference time metrics should be used instead.
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1` Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch, but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
View File
@ -1,6 +1,6 @@
# Reinforcement Learning from Human Feedback # Reinforcement Learning from Human Feedback
Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors. Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviours.
vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl). vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl).
View File
@ -7,13 +7,11 @@ For most models, the prompt format should follow corresponding examples
on HuggingFace model repository. on HuggingFace model repository.
""" """
import os import os
from dataclasses import asdict
from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm import LLM, EngineArgs, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
@ -25,31 +23,21 @@ question_per_audio_count = {
2: "What sport and what nursery rhyme are referenced?" 2: "What sport and what nursery rhyme are referenced?"
} }
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompt: str
stop_token_ids: Optional[list[int]] = None
lora_requests: Optional[list[LoRARequest]] = None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs. # lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4. # Unless specified, these settings have been tested to work on a single L4.
# MiniCPM-O # MiniCPM-O
def run_minicpmo(question: str, audio_count: int) -> ModelRequestData: def run_minicpmo(question: str, audio_count: int):
model_name = "openbmb/MiniCPM-o-2_6" model_name = "openbmb/MiniCPM-o-2_6"
tokenizer = AutoTokenizer.from_pretrained(model_name, tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True) trust_remote_code=True)
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, trust_remote_code=True,
trust_remote_code=True, max_model_len=4096,
max_model_len=4096, max_num_seqs=5,
max_num_seqs=5, limit_mm_per_prompt={"audio": audio_count})
limit_mm_per_prompt={"audio": audio_count},
)
stop_tokens = ['<|im_end|>', '<|endoftext|>'] stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
@ -64,16 +52,11 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
tokenize=False, tokenize=False,
add_generation_prompt=True, add_generation_prompt=True,
chat_template=audio_chat_template) chat_template=audio_chat_template)
return llm, prompt, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
stop_token_ids=stop_token_ids,
)
# Phi-4-multimodal-instruct # Phi-4-multimodal-instruct
def run_phi4mm(question: str, audio_count: int) -> ModelRequestData: def run_phi4mm(questions: str, audio_count: int):
""" """
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process audio inputs. show how to process audio inputs.
@ -84,35 +67,36 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
speech_lora_path = os.path.join(model_path, "speech-lora") speech_lora_path = os.path.join(model_path, "speech-lora")
placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)]) placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)])
prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>" prompts = f"<|user|>{placeholders}{questions}<|end|><|assistant|>"
engine_args = EngineArgs( llm = LLM(
model=model_path, model=model_path,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
enable_lora=True, enable_lora=True,
max_lora_rank=320, max_lora_rank=320,
lora_extra_vocab_size=0,
limit_mm_per_prompt={"audio": audio_count}, limit_mm_per_prompt={"audio": audio_count},
) )
lora_request = LoRARequest("speech", 1, speech_lora_path)
# To maintain code compatibility in this script, we add LoRA here.
llm.llm_engine.add_lora(lora_request=lora_request)
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
return ModelRequestData( stop_token_ids = None
engine_args=engine_args, return llm, prompts, stop_token_ids
prompt=prompts,
lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
)
# Qwen2-Audio # Qwen2-Audio
def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData: def run_qwen2_audio(question: str, audio_count: int):
model_name = "Qwen/Qwen2-Audio-7B-Instruct" model_name = "Qwen/Qwen2-Audio-7B-Instruct"
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, max_model_len=4096,
max_model_len=4096, max_num_seqs=5,
max_num_seqs=5, limit_mm_per_prompt={"audio": audio_count})
limit_mm_per_prompt={"audio": audio_count},
)
audio_in_prompt = "".join([ audio_in_prompt = "".join([
f"Audio {idx+1}: " f"Audio {idx+1}: "
@ -123,15 +107,12 @@ def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
"<|im_start|>user\n" "<|im_start|>user\n"
f"{audio_in_prompt}{question}<|im_end|>\n" f"{audio_in_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n") "<|im_start|>assistant\n")
stop_token_ids = None
return ModelRequestData( return llm, prompt, stop_token_ids
engine_args=engine_args,
prompt=prompt,
)
# Ultravox 0.5-1B # Ultravox 0.5-1B
def run_ultravox(question: str, audio_count: int) -> ModelRequestData: def run_ultravox(question: str, audio_count: int):
model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b" model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
@ -143,39 +124,29 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, max_model_len=4096,
max_model_len=4096, max_num_seqs=5,
max_num_seqs=5, trust_remote_code=True,
trust_remote_code=True, limit_mm_per_prompt={"audio": audio_count})
limit_mm_per_prompt={"audio": audio_count}, stop_token_ids = None
) return llm, prompt, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
)
# Whisper # Whisper
def run_whisper(question: str, audio_count: int) -> ModelRequestData: def run_whisper(question: str, audio_count: int):
assert audio_count == 1, ( assert audio_count == 1, (
"Whisper only support single audio input per prompt") "Whisper only support single audio input per prompt")
model_name = "openai/whisper-large-v3-turbo" model_name = "openai/whisper-large-v3-turbo"
prompt = "<|startoftranscript|>" prompt = "<|startoftranscript|>"
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, max_model_len=448,
max_model_len=448, max_num_seqs=5,
max_num_seqs=5, limit_mm_per_prompt={"audio": audio_count})
limit_mm_per_prompt={"audio": audio_count}, stop_token_ids = None
) return llm, prompt, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
)
model_example_map = { model_example_map = {
@ -193,24 +164,14 @@ def main(args):
raise ValueError(f"Model type {model} is not supported.") raise ValueError(f"Model type {model} is not supported.")
audio_count = args.num_audios audio_count = args.num_audios
req_data = model_example_map[model](question_per_audio_count[audio_count], llm, prompt, stop_token_ids = model_example_map[model](
audio_count) question_per_audio_count[audio_count], audio_count)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
# We set temperature to 0.2 so that outputs can be different # We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference. # even when all prompts are identical when running batch inference.
sampling_params = SamplingParams(temperature=0.2, sampling_params = SamplingParams(temperature=0.2,
max_tokens=64, max_tokens=64,
stop_token_ids=req_data.stop_token_ids) stop_token_ids=stop_token_ids)
mm_data = {} mm_data = {}
if audio_count > 0: if audio_count > 0:
@ -222,7 +183,7 @@ def main(args):
} }
assert args.num_prompts > 0 assert args.num_prompts > 0
inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data} inputs = {"prompt": prompt, "multi_modal_data": mm_data}
if args.num_prompts > 1: if args.num_prompts > 1:
# Batch inference # Batch inference
inputs = [inputs] * args.num_prompts inputs = [inputs] * args.num_prompts
@ -253,10 +214,6 @@ if __name__ == "__main__":
default=1, default=1,
choices=[0, 1, 2], choices=[0, 1, 2],
help="Number of audio items per prompt.") help="Number of audio items per prompt.")
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
View File
@ -76,10 +76,5 @@ if __name__ == "__main__":
GPUs_per_dp_rank)) GPUs_per_dp_rank))
proc.start() proc.start()
procs.append(proc) procs.append(proc)
exit_code = 0
for proc in procs: for proc in procs:
proc.join() proc.join()
if proc.exitcode:
exit_code = proc.exitcode
exit(exit_code)
View File
@ -1,93 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
import os
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
parser = argparse.ArgumentParser()
parser.add_argument(
"--dataset",
type=str,
default="./examples/data/gsm8k.jsonl",
help="downloaded from the eagle repo " \
"https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/"
)
parser.add_argument("--max_num_seqs", type=int, default=8)
parser.add_argument("--num_prompts", type=int, default=80)
parser.add_argument("--num_spec_tokens", type=int, default=2)
parser.add_argument("--tp", type=int, default=1)
parser.add_argument("--draft_tp", type=int, default=1)
parser.add_argument("--enforce_eager", action='store_true')
parser.add_argument("--enable_chunked_prefill", action='store_true')
parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
parser.add_argument("--temp", type=float, default=0)
args = parser.parse_args()
print(args)
model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"
max_model_len = 2048
tokenizer = AutoTokenizer.from_pretrained(model_dir)
if os.path.exists(args.dataset):
prompts = []
num_prompts = args.num_prompts
with open(args.dataset) as f:
for line in f:
data = json.loads(line)
prompts.append(data["turns"][0])
else:
prompts = ["The future of AI is", "The president of the United States is"]
prompts = prompts[:args.num_prompts]
num_prompts = len(prompts)
prompt_ids = [
tokenizer.apply_chat_template([{
"role": "user",
"content": prompt
}],
add_generation_prompt=True)
for prompt in prompts
]
llm = LLM(
model=model_dir,
trust_remote_code=True,
tensor_parallel_size=args.tp,
enable_chunked_prefill=args.enable_chunked_prefill,
max_num_batched_tokens=args.max_num_batched_tokens,
enforce_eager=args.enforce_eager,
max_model_len=max_model_len,
max_num_seqs=args.max_num_seqs,
gpu_memory_utilization=0.8,
speculative_model=eagle_dir,
num_speculative_tokens=args.num_spec_tokens,
speculative_draft_tensor_parallel_size=args.draft_tp,
speculative_max_model_len=max_model_len,
disable_log_stats=False,
)
sampling_params = SamplingParams(temperature=args.temp, max_tokens=256)
outputs = llm.generate(prompt_token_ids=prompt_ids,
sampling_params=sampling_params)
# calculate the average number of accepted tokens per forward pass, +1 is
# to account for the token from the target model that's always going to be
# accepted
acceptance_counts = [0] * (args.num_spec_tokens + 1)
for output in outputs:
for step, count in enumerate(output.metrics.spec_token_acceptance_counts):
acceptance_counts[step] += count
print(f"mean acceptance length: \
{sum(acceptance_counts) / acceptance_counts[0]:.2f}")
View File
@ -4,23 +4,16 @@ This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation. the explicit/implicit prompt format on enc-dec LMMs for text generation.
""" """
import time import time
from collections.abc import Sequence
from dataclasses import asdict
from typing import NamedTuple
from vllm import LLM, EngineArgs, PromptType, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompts: Sequence[PromptType]
def run_florence2(): def run_florence2():
engine_args = EngineArgs( # Create a Florence-2 encoder/decoder model instance
llm = LLM(
model="microsoft/Florence-2-large", model="microsoft/Florence-2-large",
tokenizer="facebook/bart-large", tokenizer="facebook/bart-large",
max_num_seqs=8, max_num_seqs=8,
@ -46,15 +39,12 @@ def run_florence2():
"decoder_prompt": "", "decoder_prompt": "",
}, },
] ]
return llm, prompts
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
def run_mllama(): def run_mllama():
engine_args = EngineArgs( # Create a Mllama encoder/decoder model instance
llm = LLM(
model="meta-llama/Llama-3.2-11B-Vision-Instruct", model="meta-llama/Llama-3.2-11B-Vision-Instruct",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
@ -79,15 +69,12 @@ def run_mllama():
"decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501 "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501
}, },
] ]
return llm, prompts
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
def run_whisper(): def run_whisper():
engine_args = EngineArgs( # Create a Whisper encoder/decoder model instance
llm = LLM(
model="openai/whisper-large-v3-turbo", model="openai/whisper-large-v3-turbo",
max_model_len=448, max_model_len=448,
max_num_seqs=16, max_num_seqs=16,
@ -112,11 +99,7 @@ def run_whisper():
"decoder_prompt": "<|startoftranscript|>", "decoder_prompt": "<|startoftranscript|>",
} }
] ]
return llm, prompts
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
model_example_map = { model_example_map = {
@ -131,12 +114,7 @@ def main(args):
if model not in model_example_map: if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.") raise ValueError(f"Model type {model} is not supported.")
req_data = model_example_map[model]() llm, prompts = model_example_map[model]()
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)
prompts = req_data.prompts
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams( sampling_params = SamplingParams(
@ -175,10 +153,6 @@ if __name__ == "__main__":
default="mllama", default="mllama",
choices=model_example_map.keys(), choices=model_example_map.keys(),
help='Huggingface "model_type".') help='Huggingface "model_type".')
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
View File
@ -6,14 +6,14 @@ import argparse
from vllm import LLM from vllm import LLM
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
# This script is an offline demo for running Mistral-Small-3 # This script is an offline demo for running Pixtral.
# #
# If you want to run a server/client setup, please follow this code: # If you want to run a server/client setup, please follow this code:
# #
# - Server: # - Server:
# #
# ```bash # ```bash
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384 # vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
# ``` # ```
# #
# - Client: # - Client:
@ -23,7 +23,7 @@ from vllm.sampling_params import SamplingParams
# --header 'Content-Type: application/json' \ # --header 'Content-Type: application/json' \
# --header 'Authorization: Bearer token' \ # --header 'Authorization: Bearer token' \
# --data '{ # --data '{
# "model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503", # "model": "mistralai/Pixtral-12B-2409",
# "messages": [ # "messages": [
# { # {
# "role": "user", # "role": "user",
@ -43,18 +43,12 @@ from vllm.sampling_params import SamplingParams
# python demo.py advanced # python demo.py advanced
def run_simple_demo(args: argparse.Namespace): def run_simple_demo():
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" model_name = "mistralai/Pixtral-12B-2409"
sampling_params = SamplingParams(max_tokens=8192) sampling_params = SamplingParams(max_tokens=8192)
# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs. # Lower max_num_seqs or max_model_len on low-VRAM GPUs.
llm = LLM( llm = LLM(model=model_name, tokenizer_mode="mistral")
model=model_name,
tokenizer_mode="mistral",
max_model_len=4096,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
prompt = "Describe this image in one sentence." prompt = "Describe this image in one sentence."
image_url = "https://picsum.photos/id/237/200/300" image_url = "https://picsum.photos/id/237/200/300"
@ -82,8 +76,8 @@ def run_simple_demo(args: argparse.Namespace):
print(outputs[0].outputs[0].text) print(outputs[0].outputs[0].text)
def run_advanced_demo(args: argparse.Namespace): def run_advanced_demo():
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" model_name = "mistralai/Pixtral-12B-2409"
max_img_per_msg = 5 max_img_per_msg = 5
max_tokens_per_img = 4096 max_tokens_per_img = 4096
@ -93,7 +87,6 @@ def run_advanced_demo(args: argparse.Namespace):
tokenizer_mode="mistral", tokenizer_mode="mistral",
limit_mm_per_prompt={"image": max_img_per_msg}, limit_mm_per_prompt={"image": max_img_per_msg},
max_model_len=max_img_per_msg * max_tokens_per_img, max_model_len=max_img_per_msg * max_tokens_per_img,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
) )
prompt = "Describe the following image." prompt = "Describe the following image."
@ -160,19 +153,14 @@ def main():
help="Specify the demo mode: 'simple' or 'advanced'", help="Specify the demo mode: 'simple' or 'advanced'",
) )
parser.add_argument(
'--disable-mm-preprocessor-cache',
action='store_true',
help='If True, disables caching of multi-modal preprocessor/mapper.')
args = parser.parse_args() args = parser.parse_args()
if args.mode == "simple": if args.mode == "simple":
print("Running simple demo...") print("Running simple demo...")
run_simple_demo(args) run_simple_demo()
elif args.mode == "advanced": elif args.mode == "advanced":
print("Running advanced demo...") print("Running advanced demo...")
run_advanced_demo(args) run_advanced_demo()
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -8,164 +8,122 @@ on HuggingFace model repository.
""" """
import os import os
import random import random
from dataclasses import asdict
from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm import LLM, EngineArgs, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompts: list[str]
stop_token_ids: Optional[list[int]] = None
lora_requests: Optional[list[LoRARequest]] = None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs. # lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4. # Unless specified, these settings have been tested to work on a single L4.
# Aria # Aria
def run_aria(questions: list[str], modality: str) -> ModelRequestData: def run_aria(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "rhymes-ai/Aria" model_name = "rhymes-ai/Aria"
# NOTE: Need L40 (or equivalent) to avoid OOM # NOTE: Need L40 (or equivalent) to avoid OOM
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, max_model_len=4096,
max_model_len=4096, max_num_seqs=2,
max_num_seqs=2, dtype="bfloat16",
dtype="bfloat16", disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}" prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
"<|im_end|>\n<|im_start|>assistant\n") "<|im_end|>\n<|im_start|>assistant\n")
for question in questions] for question in questions]
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# BLIP-2 # BLIP-2
def run_blip2(questions: list[str], modality: str) -> ModelRequestData: def run_blip2(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
# BLIP-2 prompt format is inaccurate on HuggingFace model repository. # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
prompts = [f"Question: {question} Answer:" for question in questions] prompts = [f"Question: {question} Answer:" for question in questions]
engine_args = EngineArgs( llm = LLM(model="Salesforce/blip2-opt-2.7b",
model="Salesforce/blip2-opt-2.7b", disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, stop_token_ids = None
) return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Chameleon # Chameleon
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData: def run_chameleon(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
prompts = [f"{question}<image>" for question in questions] prompts = [f"{question}<image>" for question in questions]
engine_args = EngineArgs( llm = LLM(model="facebook/chameleon-7b",
model="facebook/chameleon-7b", max_model_len=4096,
max_model_len=4096, max_num_seqs=2,
max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, stop_token_ids = None
) return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Deepseek-VL2 # Deepseek-VL2
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData: def run_deepseek_vl2(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "deepseek-ai/deepseek-vl2-tiny" model_name = "deepseek-ai/deepseek-vl2-tiny"
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, max_model_len=4096,
max_model_len=4096, max_num_seqs=2,
max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
)
prompts = [ prompts = [
f"<|User|>: <image>\n{question}\n\n<|Assistant|>:" f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
for question in questions for question in questions
] ]
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# Florence2 # Florence2
def run_florence2(questions: list[str], modality: str) -> ModelRequestData: def run_florence2(question: str, modality: str):
assert modality == "image" assert modality == "image"
engine_args = EngineArgs( llm = LLM(model="microsoft/Florence-2-large",
model="microsoft/Florence-2-large", tokenizer="facebook/bart-large",
tokenizer="facebook/bart-large", max_num_seqs=8,
max_num_seqs=8, trust_remote_code=True,
trust_remote_code=True, dtype="bfloat16",
dtype="bfloat16", disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions] prompt = "<MORE_DETAILED_CAPTION>"
stop_token_ids = None
return ModelRequestData( return llm, prompt, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# Fuyu # Fuyu
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData: def run_fuyu(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
prompts = [f"{question}\n" for question in questions] prompts = [f"{question}\n" for question in questions]
engine_args = EngineArgs( llm = LLM(model="adept/fuyu-8b",
model="adept/fuyu-8b", max_model_len=2048,
max_model_len=2048, max_num_seqs=2,
max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, stop_token_ids = None
) return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Gemma 3 # Gemma 3
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData: def run_gemma3(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "google/gemma-3-4b-it" model_name = "google/gemma-3-4b-it"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=2048, max_model_len=2048,
max_num_seqs=2, max_num_seqs=2,
@ -177,27 +135,22 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
prompts = [("<bos><start_of_turn>user\n" prompts = [("<bos><start_of_turn>user\n"
f"<start_of_image>{question}<end_of_turn>\n" f"<start_of_image>{question}<end_of_turn>\n"
"<start_of_turn>model\n") for question in questions] "<start_of_turn>model\n") for question in questions]
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# GLM-4v # GLM-4v
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: def run_glm4v(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "THUDM/glm-4v-9b" model_name = "THUDM/glm-4v-9b"
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, max_model_len=2048,
max_model_len=2048, max_num_seqs=2,
max_num_seqs=2, trust_remote_code=True,
trust_remote_code=True, enforce_eager=True,
enforce_eager=True, hf_overrides={"architectures": ["GLM4VForCausalLM"]},
hf_overrides={"architectures": ["GLM4VForCausalLM"]}, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
prompts = [ prompts = [
f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
@ -205,21 +158,16 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
] ]
stop_token_ids = [151329, 151336, 151338] stop_token_ids = [151329, 151336, 151338]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# H2OVL-Mississippi # H2OVL-Mississippi
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData: def run_h2ovl(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "h2oai/h2ovl-mississippi-800m" model_name = "h2oai/h2ovl-mississippi-800m"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
@ -239,20 +187,15 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
# Stop tokens for H2OVL-Mississippi # Stop tokens for H2OVL-Mississippi
# https://huggingface.co/h2oai/h2ovl-mississippi-800m # https://huggingface.co/h2oai/h2ovl-mississippi-800m
stop_token_ids = [tokenizer.eos_token_id] stop_token_ids = [tokenizer.eos_token_id]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# Idefics3-8B-Llama3 # Idefics3-8B-Llama3
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: def run_idefics3(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "HuggingFaceM4/Idefics3-8B-Llama3" model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
@ -269,20 +212,17 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
prompts = [( prompts = [(
f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:" f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
) for question in questions] ) for question in questions]
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# InternVL # InternVL
def run_internvl(questions: list[str], modality: str) -> ModelRequestData: def run_internvl(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "OpenGVLab/InternVL2-2B" model_name = "OpenGVLab/InternVL2-2B"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
@ -305,75 +245,53 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# LLaVA-1.5 # LLaVA-1.5
def run_llava(questions: list[str], modality: str) -> ModelRequestData: def run_llava(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
prompts = [ prompts = [
f"USER: <image>\n{question}\nASSISTANT:" for question in questions f"USER: <image>\n{question}\nASSISTANT:" for question in questions
] ]
engine_args = EngineArgs( llm = LLM(model="llava-hf/llava-1.5-7b-hf",
model="llava-hf/llava-1.5-7b-hf", max_model_len=4096,
max_model_len=4096, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, stop_token_ids = None
) return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# LLaVA-1.6/LLaVA-NeXT # LLaVA-1.6/LLaVA-NeXT
def run_llava_next(questions: list[str], modality: str) -> ModelRequestData: def run_llava_next(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
prompts = [f"[INST] <image>\n{question} [/INST]" for question in questions] prompts = [f"[INST] <image>\n{question} [/INST]" for question in questions]
engine_args = EngineArgs( llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192,
max_model_len=8192, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, stop_token_ids = None
) return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# LlaVA-NeXT-Video # LlaVA-NeXT-Video
# Currently only support for video input # Currently only support for video input
def run_llava_next_video(questions: list[str], def run_llava_next_video(questions: list[str], modality: str):
modality: str) -> ModelRequestData:
assert modality == "video" assert modality == "video"
prompts = [ prompts = [
f"USER: <video>\n{question} ASSISTANT:" for question in questions f"USER: <video>\n{question} ASSISTANT:" for question in questions
] ]
engine_args = EngineArgs( llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf",
model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192,
max_model_len=8192, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, stop_token_ids = None
) return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# LLaVA-OneVision # LLaVA-OneVision
def run_llava_onevision(questions: list[str], def run_llava_onevision(questions: list[str], modality: str):
modality: str) -> ModelRequestData:
if modality == "video": if modality == "video":
prompts = [ prompts = [
@ -387,20 +305,15 @@ def run_llava_onevision(questions: list[str],
<|im_start|>assistant\n" for question in questions <|im_start|>assistant\n" for question in questions
] ]
engine_args = EngineArgs( llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
model="llava-hf/llava-onevision-qwen2-7b-ov-hf", max_model_len=16384,
max_model_len=16384, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, stop_token_ids = None
) return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Mantis # Mantis
def run_mantis(questions: list[str], modality: str) -> ModelRequestData: def run_mantis(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501 llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501
@ -409,19 +322,14 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
for question in questions for question in questions
] ]
engine_args = EngineArgs( llm = LLM(
model="TIGER-Lab/Mantis-8B-siglip-llama3", model="TIGER-Lab/Mantis-8B-siglip-llama3",
max_model_len=4096, max_model_len=4096,
hf_overrides={"architectures": ["MantisForConditionalGeneration"]}, hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
) )
stop_token_ids = [128009] stop_token_ids = [128009]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# MiniCPM-V # MiniCPM-V
@ -449,7 +357,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
# model_name = "openbmb/MiniCPM-o-2_6" # model_name = "openbmb/MiniCPM-o-2_6"
tokenizer = AutoTokenizer.from_pretrained(model_name, tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True) trust_remote_code=True)
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
@ -481,24 +389,19 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
tokenize=False, tokenize=False,
add_generation_prompt=True) for question in questions add_generation_prompt=True) for question in questions
] ]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData: def run_minicpmo(questions: list[str], modality: str):
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-o-2_6") return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-o-2_6")
def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData: def run_minicpmv(questions: list[str], modality: str):
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6") return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6")
# LLama 3.2 # LLama 3.2
def run_mllama(questions: list[str], modality: str) -> ModelRequestData: def run_mllama(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
@ -508,7 +411,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
# You may lower either to run this example on lower-end GPUs. # You may lower either to run this example on lower-end GPUs.
# The configuration below has been confirmed to launch on a single L40 GPU. # The configuration below has been confirmed to launch on a single L40 GPU.
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=16, max_num_seqs=16,
@ -529,20 +432,17 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
prompts = tokenizer.apply_chat_template(messages, prompts = tokenizer.apply_chat_template(messages,
add_generation_prompt=True, add_generation_prompt=True,
tokenize=False) tokenize=False)
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# Molmo # Molmo
def run_molmo(questions: list[str], modality: str) -> ModelRequestData: def run_molmo(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "allenai/Molmo-7B-D-0924" model_name = "allenai/Molmo-7B-D-0924"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
dtype="bfloat16", dtype="bfloat16",
@ -553,21 +453,18 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
f"<|im_start|>user <image>\n{question}<|im_end|> \ f"<|im_start|>user <image>\n{question}<|im_end|> \
<|im_start|>assistant\n" for question in questions <|im_start|>assistant\n" for question in questions
] ]
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# NVLM-D # NVLM-D
def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData: def run_nvlm_d(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "nvidia/NVLM-D-72B" model_name = "nvidia/NVLM-D-72B"
# Adjust this as necessary to fit in GPU # Adjust this as necessary to fit in GPU
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
@ -584,47 +481,36 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
prompts = tokenizer.apply_chat_template(messages, prompts = tokenizer.apply_chat_template(messages,
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# PaliGemma # PaliGemma
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData: def run_paligemma(question: str, modality: str):
assert modality == "image" assert modality == "image"
# PaliGemma has special prompt format for VQA # PaliGemma has special prompt format for VQA
prompts = ["caption en" for _ in questions] prompt = ["caption en"]
engine_args = EngineArgs( llm = LLM(model="google/paligemma-3b-mix-224",
model="google/paligemma-3b-mix-224", disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None
return llm, prompt, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# PaliGemma 2 # PaliGemma 2
def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData: def run_paligemma2(question: str, modality: str):
assert modality == "image" assert modality == "image"
# PaliGemma 2 has special prompt format for VQA # PaliGemma 2 has special prompt format for VQA
prompts = ["caption en" for _ in questions] prompt = ["caption en"]
engine_args = EngineArgs( llm = LLM(model="google/paligemma2-3b-ft-docci-448",
model="google/paligemma2-3b-ft-docci-448", disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None
return llm, prompt, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Phi-3-Vision # Phi-3-Vision
def run_phi3v(questions: list[str], modality: str) -> ModelRequestData: def run_phi3v(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
prompts = [ prompts = [
@ -644,7 +530,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
# #
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194 # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
engine_args = EngineArgs( llm = LLM(
model="microsoft/Phi-3.5-vision-instruct", model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
@ -653,15 +539,12 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
mm_processor_kwargs={"num_crops": 16}, mm_processor_kwargs={"num_crops": 16},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
) )
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# Phi-4-multimodal-instruct # Phi-4-multimodal-instruct
def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData: def run_phi4mm(questions: list[str], modality: str):
""" """
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process image inputs. show how to process image inputs.
@ -675,30 +558,33 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
f"<|user|><|image_1|>{question}<|end|><|assistant|>" f"<|user|><|image_1|>{question}<|end|><|assistant|>"
for question in questions for question in questions
] ]
engine_args = EngineArgs( llm = LLM(
model=model_path, model=model_path,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
enable_lora=True, enable_lora=True,
max_lora_rank=320, max_lora_rank=320,
lora_extra_vocab_size=0,
) )
lora_request = LoRARequest("vision", 1, vision_lora_path)
# To maintain code compatibility in this script, we add LoRA here.
llm.llm_engine.add_lora(lora_request=lora_request)
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
return ModelRequestData( stop_token_ids = None
engine_args=engine_args, return llm, prompts, stop_token_ids
prompts=prompts,
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
)
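The comments on both sides of this hunk point at the same two ways of attaching the Phi-4-multimodal vision LoRA adapter: register it on the engine once, or pass it along with each request. A self-contained sketch of both variants follows; the snapshot_download call and the text prompt are illustrative assumptions, while the engine arguments and the LoRARequest mirror what is shown above.

import os

from huggingface_hub import snapshot_download

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Assumed checkpoint location; the full script resolves it the same way.
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
vision_lora_path = os.path.join(model_path, "vision-lora")

llm = LLM(model=model_path,
          trust_remote_code=True,
          max_model_len=4096,
          max_num_seqs=2,
          enable_lora=True,
          max_lora_rank=320)
lora_request = LoRARequest("vision", 1, vision_lora_path)
prompts = ["<|user|>What does the vision LoRA adapter add?<|end|><|assistant|>"]

# Variant 1: register the adapter on the engine once, then generate as usual.
llm.llm_engine.add_lora(lora_request=lora_request)
outputs = llm.generate(prompts, SamplingParams(max_tokens=64))

# Variant 2: pass the adapter along with the request instead.
outputs = llm.generate(prompts, SamplingParams(max_tokens=64),
                       lora_request=lora_request)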
# Pixtral HF-format # Pixtral HF-format
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData: def run_pixtral_hf(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "mistral-community/pixtral-12b" model_name = "mistral-community/pixtral-12b"
# NOTE: Need L40 (or equivalent) to avoid OOM # NOTE: Need L40 (or equivalent) to avoid OOM
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
@ -706,18 +592,15 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
) )
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions] prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# Qwen # Qwen
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData: def run_qwen_vl(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
engine_args = EngineArgs( llm = LLM(
model="Qwen/Qwen-VL", model="Qwen/Qwen-VL",
trust_remote_code=True, trust_remote_code=True,
max_model_len=1024, max_model_len=1024,
@ -727,19 +610,16 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
) )
prompts = [f"{question}Picture 1: <img></img>\n" for question in questions] prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# Qwen2-VL # Qwen2-VL
def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData: def run_qwen2_vl(questions: list[str], modality: str):
model_name = "Qwen/Qwen2-VL-7B-Instruct" model_name = "Qwen/Qwen2-VL-7B-Instruct"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=5, max_num_seqs=5,
@ -762,19 +642,16 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
f"{question}<|im_end|>\n" f"{question}<|im_end|>\n"
"<|im_start|>assistant\n") for question in questions "<|im_start|>assistant\n") for question in questions
] ]
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# Qwen2.5-VL # Qwen2.5-VL
def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: def run_qwen2_5_vl(questions: list[str], modality: str):
model_name = "Qwen/Qwen2.5-VL-3B-Instruct" model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=5, max_num_seqs=5,
@ -797,11 +674,8 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
f"{question}<|im_end|>\n" f"{question}<|im_end|>\n"
"<|im_start|>assistant\n") for question in questions "<|im_start|>assistant\n") for question in questions
] ]
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
model_example_map = { model_example_map = {
@ -915,28 +789,18 @@ def main(args):
data = mm_input["data"] data = mm_input["data"]
questions = mm_input["questions"] questions = mm_input["questions"]
req_data = model_example_map[model](questions, modality) llm, prompts, stop_token_ids = model_example_map[model](questions,
modality)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
# Don't want to check the flag multiple times, so just hijack `prompts`. # Don't want to check the flag multiple times, so just hijack `prompts`.
prompts = req_data.prompts if args.use_different_prompt_per_request else [ prompts = prompts if args.use_different_prompt_per_request else [
req_data.prompts[0] prompts[0]
] ]
# We set temperature to 0.2 so that outputs can be different # We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference. # even when all prompts are identical when running batch inference.
sampling_params = SamplingParams(temperature=0.2, sampling_params = SamplingParams(temperature=0.2,
max_tokens=64, max_tokens=64,
stop_token_ids=req_data.stop_token_ids) stop_token_ids=stop_token_ids)
assert args.num_prompts > 0 assert args.num_prompts > 0
if args.num_prompts == 1: if args.num_prompts == 1:
@ -1001,10 +865,6 @@ if __name__ == "__main__":
type=int, type=int,
default=16, default=16,
help='Number of frames to extract from the video.') help='Number of frames to extract from the video.')
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
parser.add_argument( parser.add_argument(
'--image-repeat-prob', '--image-repeat-prob',
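The two columns of this file differ mainly in where the engine is built: one constructs LLM(...) inside every run_* helper, the other has the helpers return EngineArgs wrapped in a ModelRequestData and lets main() instantiate the engine once, merging in CLI-level options such as --seed. A condensed sketch of the deferred variant, with the ModelRequestData fields inferred from the calls above (a text-only illustration, not the full script, which also attaches image inputs):

from dataclasses import asdict
from typing import NamedTuple, Optional

from vllm import LLM, EngineArgs, SamplingParams
from vllm.lora.request import LoRARequest


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompts: list[str]
    stop_token_ids: Optional[list[int]] = None
    lora_requests: Optional[list[LoRARequest]] = None


def run_llava(questions: list[str]) -> ModelRequestData:
    # Helpers only describe the engine; nothing heavy is loaded here.
    engine_args = EngineArgs(model="llava-hf/llava-1.5-7b-hf",
                             max_model_len=4096)
    prompts = [f"USER: {q}\nASSISTANT:" for q in questions]
    return ModelRequestData(engine_args=engine_args, prompts=prompts)


def main(questions: list[str], seed: Optional[int] = None) -> None:
    req_data = run_llava(questions)
    # The caller owns engine construction and can layer on CLI overrides.
    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)
    if req_data.lora_requests:
        for lora_request in req_data.lora_requests:
            llm.llm_engine.add_lora(lora_request=lora_request)
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=req_data.stop_token_ids)
    for output in llm.generate(req_data.prompts, sampling_params):
        print(output.outputs[0].text)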

View File

@ -7,12 +7,11 @@ For most models, the prompt format should follow corresponding examples
on HuggingFace model repository. on HuggingFace model repository.
""" """
from argparse import Namespace from argparse import Namespace
from dataclasses import asdict
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
from PIL.Image import Image from PIL.Image import Image
from vllm import LLM, EngineArgs from vllm import LLM
from vllm.multimodal.utils import fetch_image from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
@ -38,12 +37,12 @@ Query = Union[TextQuery, ImageQuery, TextImageQuery]
class ModelRequestData(NamedTuple): class ModelRequestData(NamedTuple):
engine_args: EngineArgs llm: LLM
prompt: str prompt: str
image: Optional[Image] image: Optional[Image]
def run_e5_v(query: Query) -> ModelRequestData: def run_e5_v(query: Query):
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
if query["modality"] == "text": if query["modality"] == "text":
@ -59,20 +58,20 @@ def run_e5_v(query: Query) -> ModelRequestData:
modality = query['modality'] modality = query['modality']
raise ValueError(f"Unsupported query modality: '{modality}'") raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs( llm = LLM(
model="royokong/e5-v", model="royokong/e5-v",
task="embed", task="embed",
max_model_len=4096, max_model_len=4096,
) )
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
image=image, image=image,
) )
def run_vlm2vec(query: Query) -> ModelRequestData: def run_vlm2vec(query: Query):
if query["modality"] == "text": if query["modality"] == "text":
text = query["text"] text = query["text"]
prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501 prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501
@ -88,7 +87,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
modality = query['modality'] modality = query['modality']
raise ValueError(f"Unsupported query modality: '{modality}'") raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs( llm = LLM(
model="TIGER-Lab/VLM2Vec-Full", model="TIGER-Lab/VLM2Vec-Full",
task="embed", task="embed",
trust_remote_code=True, trust_remote_code=True,
@ -96,7 +95,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
) )
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
image=image, image=image,
) )
@ -127,18 +126,15 @@ def get_query(modality: QueryModality):
raise ValueError(msg) raise ValueError(msg)
def run_encode(model: str, modality: QueryModality, seed: Optional[int]): def run_encode(model: str, modality: QueryModality):
query = get_query(modality) query = get_query(modality)
req_data = model_example_map[model](query) req_data = model_example_map[model](query)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
mm_data = {} mm_data = {}
if req_data.image is not None: if req_data.image is not None:
mm_data["image"] = req_data.image mm_data["image"] = req_data.image
outputs = llm.embed({ outputs = req_data.llm.embed({
"prompt": req_data.prompt, "prompt": req_data.prompt,
"multi_modal_data": mm_data, "multi_modal_data": mm_data,
}) })
@ -148,7 +144,7 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
def main(args: Namespace): def main(args: Namespace):
run_encode(args.model_name, args.modality, args.seed) run_encode(args.model_name, args.modality)
model_example_map = { model_example_map = {
@ -171,10 +167,5 @@ if __name__ == "__main__":
default="image", default="image",
choices=get_args(QueryModality), choices=get_args(QueryModality),
help='Modality of the input.') help='Modality of the input.')
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
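In the embedding script, both sides end in the same embed() call: build a prompt in the model's template, attach an image (if any) under multi_modal_data, and read back one pooled vector per prompt. A minimal text-only sketch using the e5-v settings shown above; the exact wording of the text prompt is an assumption in the style the script uses.

from vllm import LLM

llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501

llm = LLM(model="royokong/e5-v", task="embed", max_model_len=4096)

# Text query; for an image query the prompt keeps an <image> placeholder and
# mm_data["image"] is set to a PIL image (e.g. via fetch_image) instead.
prompt = llama3_template.format(
    "A cat standing in the snow.\nSummary above sentence in one word: ")
mm_data = {}

outputs = llm.embed({"prompt": prompt, "multi_modal_data": mm_data})
# One result per input; each carries the pooled embedding vector.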

View File

@ -6,14 +6,13 @@ using the chat template defined by the model.
""" """
import os import os
from argparse import Namespace from argparse import Namespace
from dataclasses import asdict
from typing import NamedTuple, Optional from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from PIL.Image import Image from PIL.Image import Image
from transformers import AutoProcessor, AutoTokenizer from transformers import AutoProcessor, AutoTokenizer
from vllm import LLM, EngineArgs, SamplingParams from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.multimodal.utils import fetch_image from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
@ -26,12 +25,11 @@ IMAGE_URLS = [
class ModelRequestData(NamedTuple): class ModelRequestData(NamedTuple):
engine_args: EngineArgs llm: LLM
prompt: str prompt: str
stop_token_ids: Optional[list[int]]
image_data: list[Image] image_data: list[Image]
stop_token_ids: Optional[list[int]] = None chat_template: Optional[str]
chat_template: Optional[str] = None
lora_requests: Optional[list[LoRARequest]] = None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@ -39,55 +37,53 @@ class ModelRequestData(NamedTuple):
# Unless specified, these settings have been tested to work on a single L4. # Unless specified, these settings have been tested to work on a single L4.
def load_aria(question: str, image_urls: list[str]) -> ModelRequestData: def load_aria(question, image_urls: list[str]) -> ModelRequestData:
model_name = "rhymes-ai/Aria" model_name = "rhymes-ai/Aria"
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, tokenizer_mode="slow",
tokenizer_mode="slow", trust_remote_code=True,
trust_remote_code=True, dtype="bfloat16",
dtype="bfloat16", limit_mm_per_prompt={"image": len(image_urls)})
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls) placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n" prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n"
"<|im_start|>assistant\n") "<|im_start|>assistant\n")
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_deepseek_vl2(question: str, def load_deepseek_vl2(question: str, image_urls: list[str]):
image_urls: list[str]) -> ModelRequestData:
model_name = "deepseek-ai/deepseek-vl2-tiny" model_name = "deepseek-ai/deepseek-vl2-tiny"
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, max_model_len=4096,
max_model_len=4096, max_num_seqs=2,
max_num_seqs=2, hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}, limit_mm_per_prompt={"image": len(image_urls)})
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholder = "".join(f"image_{i}:<image>\n" placeholder = "".join(f"image_{i}:<image>\n"
for i, _ in enumerate(image_urls, start=1)) for i, _ in enumerate(image_urls, start=1))
prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:" prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData: def load_gemma3(question, image_urls: list[str]) -> ModelRequestData:
model_name = "google/gemma-3-4b-it" model_name = "google/gemma-3-4b-it"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
@ -116,16 +112,18 @@ def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
add_generation_prompt=True) add_generation_prompt=True)
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData: def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "h2oai/h2ovl-mississippi-800m" model_name = "h2oai/h2ovl-mississippi-800m"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
@ -148,18 +146,19 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
stop_token_ids = [tokenizer.eos_token_id] stop_token_ids = [tokenizer.eos_token_id]
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData: def load_idefics3(question, image_urls: list[str]) -> ModelRequestData:
model_name = "HuggingFaceM4/Idefics3-8B-Llama3" model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
# The configuration below has been confirmed to launch on a single L40 GPU. # The configuration below has been confirmed to launch on a single L40 GPU.
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=16, max_num_seqs=16,
@ -178,16 +177,18 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
for i, _ in enumerate(image_urls, start=1)) for i, _ in enumerate(image_urls, start=1))
prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501 prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "OpenGVLab/InternVL2-2B" model_name = "OpenGVLab/InternVL2-2B"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
@ -213,18 +214,19 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData: def load_mllama(question, image_urls: list[str]) -> ModelRequestData:
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# The configuration below has been confirmed to launch on a single L40 GPU. # The configuration below has been confirmed to launch on a single L40 GPU.
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=16, max_num_seqs=16,
@ -234,17 +236,19 @@ def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
placeholders = "<|image|>" * len(image_urls) placeholders = "<|image|>" * len(image_urls)
prompt = f"{placeholders}<|begin_of_text|>{question}" prompt = f"{placeholders}<|begin_of_text|>{question}"
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData: def load_nvlm_d(question: str, image_urls: list[str]):
model_name = "nvidia/NVLM-D-72B" model_name = "nvidia/NVLM-D-72B"
# Adjust this as necessary to fit in GPU # Adjust this as necessary to fit in GPU
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
@ -262,11 +266,14 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
prompt = tokenizer.apply_chat_template(messages, prompt = tokenizer.apply_chat_template(messages,
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
stop_token_ids = None
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
@ -274,7 +281,7 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "mistral-community/pixtral-12b" model_name = "mistral-community/pixtral-12b"
# Adjust this as necessary to fit in GPU # Adjust this as necessary to fit in GPU
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
@ -284,11 +291,14 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
placeholders = "[IMG]" * len(image_urls) placeholders = "[IMG]" * len(image_urls)
prompt = f"<s>[INST]{question}\n{placeholders}[/INST]" prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"
stop_token_ids = None
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
@ -305,7 +315,7 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
# #
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194 # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
engine_args = EngineArgs( llm = LLM(
model="microsoft/Phi-3.5-vision-instruct", model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
@ -316,11 +326,14 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
placeholders = "\n".join(f"<|image_{i}|>" placeholders = "\n".join(f"<|image_{i}|>"
for i, _ in enumerate(image_urls, start=1)) for i, _ in enumerate(image_urls, start=1))
prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n" prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
stop_token_ids = None
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
@ -334,7 +347,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
# Since the vision-lora and speech-lora co-exist with the base model, # Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights. # we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora") vision_lora_path = os.path.join(model_path, "vision-lora")
engine_args = EngineArgs( llm = LLM(
model=model_path, model=model_path,
trust_remote_code=True, trust_remote_code=True,
max_model_len=10000, max_model_len=10000,
@ -342,24 +355,32 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
enable_lora=True, enable_lora=True,
max_lora_rank=320, max_lora_rank=320,
lora_extra_vocab_size=0,
) )
lora_request = LoRARequest("vision", 1, vision_lora_path)
# To maintain code compatibility in this script, we add LoRA here.
llm.llm_engine.add_lora(lora_request=lora_request)
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
placeholders = "".join(f"<|image_{i}|>" placeholders = "".join(f"<|image_{i}|>"
for i, _ in enumerate(image_urls, start=1)) for i, _ in enumerate(image_urls, start=1))
prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>" prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
stop_token_ids = None
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
lora_requests=[LoRARequest("vision", 1, vision_lora_path)], chat_template=None,
) )
def load_qwen_vl_chat(question: str, def load_qwen_vl_chat(question: str,
image_urls: list[str]) -> ModelRequestData: image_urls: list[str]) -> ModelRequestData:
model_name = "Qwen/Qwen-VL-Chat" model_name = "Qwen/Qwen-VL-Chat"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=1024, max_model_len=1024,
@ -390,7 +411,7 @@ def load_qwen_vl_chat(question: str,
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
@ -398,7 +419,7 @@ def load_qwen_vl_chat(question: str,
) )
def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
try: try:
from qwen_vl_utils import process_vision_info from qwen_vl_utils import process_vision_info
except ModuleNotFoundError: except ModuleNotFoundError:
@ -410,7 +431,7 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "Qwen/Qwen2-VL-7B-Instruct" model_name = "Qwen/Qwen2-VL-7B-Instruct"
# Tested on L40 # Tested on L40
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=32768 if process_vision_info is None else 4096, max_model_len=32768 if process_vision_info is None else 4096,
max_num_seqs=5, max_num_seqs=5,
@ -439,19 +460,23 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
stop_token_ids = None
if process_vision_info is None: if process_vision_info is None:
image_data = [fetch_image(url) for url in image_urls] image_data = [fetch_image(url) for url in image_urls]
else: else:
image_data, _ = process_vision_info(messages) image_data, _ = process_vision_info(messages)
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=image_data, image_data=image_data,
chat_template=None,
) )
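Both Qwen2-VL loaders follow the same optional-dependency pattern: use qwen_vl_utils.process_vision_info for image preprocessing when it is installed, and fall back to fetch_image (with a larger max_model_len) when it is not. Stripped of the model specifics, the pattern looks roughly like the sketch below; the URLs and the message content format are placeholders based on the qwen_vl_utils conventions.

from vllm.multimodal.utils import fetch_image

try:
    from qwen_vl_utils import process_vision_info
except ModuleNotFoundError:
    # Without the optional package, images are fetched as-is (no smart resize).
    process_vision_info = None

image_urls = ["https://example.com/a.jpg", "https://example.com/b.jpg"]
messages = [{
    "role": "user",
    "content": [{"type": "image", "image": url} for url in image_urls] +
               [{"type": "text", "text": "Describe each image."}],
}]

if process_vision_info is None:
    image_data = [fetch_image(url) for url in image_urls]
else:
    image_data, _ = process_vision_info(messages)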
def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
try: try:
from qwen_vl_utils import process_vision_info from qwen_vl_utils import process_vision_info
except ModuleNotFoundError: except ModuleNotFoundError:
@ -462,7 +487,7 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "Qwen/Qwen2.5-VL-3B-Instruct" model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=32768 if process_vision_info is None else 4096, max_model_len=32768 if process_vision_info is None else 4096,
max_num_seqs=5, max_num_seqs=5,
@ -491,6 +516,8 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
stop_token_ids = None
if process_vision_info is None: if process_vision_info is None:
image_data = [fetch_image(url) for url in image_urls] image_data = [fetch_image(url) for url in image_urls]
else: else:
@ -498,9 +525,11 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
return_video_kwargs=False) return_video_kwargs=False)
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=image_data, image_data=image_data,
chat_template=None,
) )
@ -522,25 +551,14 @@ model_example_map = {
} }
def run_generate(model, question: str, image_urls: list[str], def run_generate(model, question: str, image_urls: list[str]):
seed: Optional[int]):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
sampling_params = SamplingParams(temperature=0.0, sampling_params = SamplingParams(temperature=0.0,
max_tokens=128, max_tokens=128,
stop_token_ids=req_data.stop_token_ids) stop_token_ids=req_data.stop_token_ids)
outputs = llm.generate( outputs = req_data.llm.generate(
{ {
"prompt": req_data.prompt, "prompt": req_data.prompt,
"multi_modal_data": { "multi_modal_data": {
@ -554,24 +572,13 @@ def run_generate(model, question: str, image_urls: list[str],
print(generated_text) print(generated_text)
def run_chat(model: str, question: str, image_urls: list[str], def run_chat(model: str, question: str, image_urls: list[str]):
seed: Optional[int]):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
sampling_params = SamplingParams(temperature=0.0, sampling_params = SamplingParams(temperature=0.0,
max_tokens=128, max_tokens=128,
stop_token_ids=req_data.stop_token_ids) stop_token_ids=req_data.stop_token_ids)
outputs = llm.chat( outputs = req_data.llm.chat(
[{ [{
"role": "role":
"user", "user",
@ -600,12 +607,11 @@ def run_chat(model: str, question: str, image_urls: list[str],
def main(args: Namespace): def main(args: Namespace):
model = args.model_type model = args.model_type
method = args.method method = args.method
seed = args.seed
if method == "generate": if method == "generate":
run_generate(model, QUESTION, IMAGE_URLS, seed) run_generate(model, QUESTION, IMAGE_URLS)
elif method == "chat": elif method == "chat":
run_chat(model, QUESTION, IMAGE_URLS, seed) run_chat(model, QUESTION, IMAGE_URLS)
else: else:
raise ValueError(f"Invalid method: {method}") raise ValueError(f"Invalid method: {method}")
@ -626,10 +632,6 @@ if __name__ == "__main__":
default="generate", default="generate",
choices=["generate", "chat"], choices=["generate", "chat"],
help="The method to run in `vllm.LLM`.") help="The method to run in `vllm.LLM`.")
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
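run_generate and run_chat above differ only in how the images reach the model: generate() takes the raw prompt plus multi_modal_data, while chat() takes an OpenAI-style message whose content list mixes a text part with one image_url part per image. A trimmed-down sketch of the chat path, assuming the Phi-3.5-vision loader shown earlier; the image URLs are placeholders.

from vllm import LLM, SamplingParams

IMAGE_URLS = [
    "https://example.com/duck.jpg",
    "https://example.com/lion.jpg",
]

llm = LLM(model="microsoft/Phi-3.5-vision-instruct",
          trust_remote_code=True,
          max_model_len=4096,
          limit_mm_per_prompt={"image": len(IMAGE_URLS)})

sampling_params = SamplingParams(temperature=0.0, max_tokens=128)

outputs = llm.chat(
    [{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is the content of each image?"},
            *[{"type": "image_url", "image_url": {"url": url}}
              for url in IMAGE_URLS],
        ],
    }],
    sampling_params=sampling_params,
)

for o in outputs:
    print(o.outputs[0].text)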

View File

@ -42,7 +42,7 @@ def post_http_request(prompt: str,
def get_streaming_response(response: requests.Response) -> Iterable[list[str]]: def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
for chunk in response.iter_lines(chunk_size=8192, for chunk in response.iter_lines(chunk_size=8192,
decode_unicode=False, decode_unicode=False,
delimiter=b"\n"): delimiter=b"\0"):
if chunk: if chunk:
data = json.loads(chunk.decode("utf-8")) data = json.loads(chunk.decode("utf-8"))
output = data["text"] output = data["text"]
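The only change in this client is the byte that separates streamed JSON chunks (b"\0" versus b"\n"). Written with the delimiter as a parameter, the helper looks like the sketch below; pass whichever delimiter the server you are running actually emits.

import json
from collections.abc import Iterable

import requests


def get_streaming_response(response: requests.Response,
                           delimiter: bytes = b"\n") -> Iterable[list[str]]:
    # Each delimited chunk is a standalone JSON object with a "text" field.
    for chunk in response.iter_lines(chunk_size=8192,
                                     decode_unicode=False,
                                     delimiter=delimiter):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            yield data["text"]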

View File

@ -21,7 +21,7 @@ def http_bot(prompt):
for chunk in response.iter_lines(chunk_size=8192, for chunk in response.iter_lines(chunk_size=8192,
decode_unicode=False, decode_unicode=False,
delimiter=b"\n"): delimiter=b"\0"):
if chunk: if chunk:
data = json.loads(chunk.decode("utf-8")) data = json.loads(chunk.decode("utf-8"))
output = data["text"][0] output = data["text"][0]

View File

@ -127,7 +127,7 @@ configuration for the root vLLM logger and for the logger you wish to silence:
"vllm": { "vllm": {
"handlers": ["vllm"], "handlers": ["vllm"],
"level": "DEBUG", "level": "DEBUG",
"propagate": false "propagage": false
}, },
"vllm.example_noisy_logger": { "vllm.example_noisy_logger": {
"propagate": false "propagate": false

View File

@ -6,7 +6,7 @@ requires = [
"packaging", "packaging",
"setuptools>=61", "setuptools>=61",
"setuptools-scm>=8.0", "setuptools-scm>=8.0",
"torch == 2.6.0", "torch == 2.5.1",
"wheel", "wheel",
"jinja2", "jinja2",
] ]

View File

@ -4,6 +4,6 @@ ninja
packaging packaging
setuptools>=61 setuptools>=61
setuptools-scm>=8 setuptools-scm>=8
torch==2.6.0 torch==2.5.1
wheel wheel
jinja2>=3.1.6 jinja2

View File

@ -1,4 +1,3 @@
cachetools
psutil psutil
sentencepiece # Required for LLaMA tokenizer. sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0 numpy < 2.0.0
@ -20,7 +19,7 @@ tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer >= 0.10.11, < 0.11 lm-format-enforcer >= 0.10.11, < 0.11
outlines == 0.1.11 outlines == 0.1.11
lark == 1.2.2 lark == 1.2.2
xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64" xgrammar == 0.1.15; platform_machine == "x86_64" or platform_machine == "aarch64"
typing_extensions >= 4.10 typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs partial-json-parser # used for parsing partial JSON outputs
@ -28,7 +27,7 @@ pyzmq
msgspec msgspec
gguf == 0.10.0 gguf == 0.10.0
importlib_metadata importlib_metadata
mistral_common[opencv] >= 1.5.4 mistral_common[opencv] >= 1.5.0
pyyaml pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
@ -39,4 +38,3 @@ cloudpickle # allows pickling lambda functions in model_executor/models/registry
watchfiles # required for http server to monitor the updates of TLS files watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/other/logging_configuration.md python-json-logger # Used by logging as per examples/other/logging_configuration.md
scipy # Required for phi-4-multimodal-instruct scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu

View File

@ -4,9 +4,9 @@
numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
# Dependencies for NVIDIA GPUs # Dependencies for NVIDIA GPUs
ray[cgraph]>=2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1. ray[cgraph] >= 2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
torch==2.6.0 torch == 2.5.1
torchaudio==2.6.0 torchaudio==2.5.1
# These must be updated alongside torch # These must be updated alongside torch
torchvision==0.21.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers==0.0.29.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.6.0 xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1

View File

@ -9,13 +9,12 @@ msgspec
cloudpickle cloudpickle
# packages to install to build the documentation # packages to install to build the documentation
cachetools
pydantic >= 2.8 pydantic >= 2.8
-f https://download.pytorch.org/whl/cpu -f https://download.pytorch.org/whl/cpu
torch torch
py-cpuinfo py-cpuinfo
transformers transformers
mistral_common >= 1.5.4 mistral_common >= 1.5.0
aiohttp aiohttp
starlette starlette
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args

View File

@ -7,9 +7,10 @@ torchvision==0.20.1
torchaudio==2.5.1 torchaudio==2.5.1
cmake>=3.26 cmake>=3.26
ninja
packaging packaging
setuptools>=61 setuptools>=61
setuptools-scm>=8 setuptools-scm>=8
wheel wheel
jinja2>=3.1.6 jinja2
amdsmi==6.2.4 amdsmi==6.2.4

View File

@ -1,23 +0,0 @@
# entrypoints test
# librosa==0.10.2.post1 # required by audio tests in entrypoints/openai
audioread==3.0.1
cffi==1.17.1
decorator==5.2.1
lazy-loader==0.4
platformdirs==4.3.6
pooch==1.8.2
#pycparse==2.22
soundfile==0.13.1
soxr==0.5.0.post1
librosa==0.10.2.post1
# entrypoints test
#vllm[video] # required by entrypoints/openai/test_video.py
decord==0.6.0
# entrypoints test
#sentence-transformers # required by entrypoints/openai/test_score.py
sentence-transformers==3.4.1

View File

@ -8,7 +8,6 @@ pytest-shard
# testing utils # testing utils
awscli awscli
backoff # required for phi4mm test
decord # required for video tests decord # required for video tests
einops # required for MPT, qwen-vl and Mamba einops # required for MPT, qwen-vl and Mamba
httpx httpx
@ -22,17 +21,16 @@ sentence-transformers # required for embedding tests
soundfile # required for audio tests soundfile # required for audio tests
jiwer # required for audio tests jiwer # required for audio tests
timm # required for internvl test timm # required for internvl test
torch==2.6.0 torch==2.5.1
torchaudio==2.6.0 torchaudio==2.5.1
torchvision==0.21.0
transformers_stream_generator # required for qwen-vl test transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.5.4 # required for pixtral test mistral_common[opencv] >= 1.5.0 # required for pixtral test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.4 # required for model evaluation test lm-eval[api]==0.4.4 # required for model evaluation test
transformers==4.48.2 transformers==4.48.2
# quantization # quantization
bitsandbytes>=0.45.3 bitsandbytes>=0.45.0
buildkite-test-collector==0.1.9 buildkite-test-collector==0.1.9
genai_perf==0.0.8 genai_perf==0.0.8
@ -40,4 +38,4 @@ tritonclient==2.51.0
numpy < 2.0.0 numpy < 2.0.0
runai-model-streamer==0.11.0 runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0 runai-model-streamer-s3==0.11.0

View File

@ -8,7 +8,7 @@ accelerate==1.0.1
# peft # peft
aiohappyeyeballs==2.4.3 aiohappyeyeballs==2.4.3
# via aiohttp # via aiohttp
aiohttp==3.10.11 aiohttp==3.10.10
# via # via
# datasets # datasets
# fsspec # fsspec
@ -33,9 +33,7 @@ audioread==3.0.1
# via librosa # via librosa
awscli==1.35.23 awscli==1.35.23
# via -r requirements/test.in # via -r requirements/test.in
backoff==2.2.1 bitsandbytes==0.45.0
# via -r requirements/test.in
bitsandbytes==0.45.3
# via -r requirements/test.in # via -r requirements/test.in
black==24.10.0 black==24.10.0
# via datamodel-code-generator # via datamodel-code-generator
@ -129,6 +127,7 @@ filelock==3.16.1
# ray # ray
# torch # torch
# transformers # transformers
# triton
fonttools==4.54.1 fonttools==4.54.1
# via matplotlib # via matplotlib
frozendict==2.4.6 frozendict==2.4.6
@ -183,7 +182,7 @@ iniconfig==2.0.0
# via pytest # via pytest
isort==5.13.2 isort==5.13.2
# via datamodel-code-generator # via datamodel-code-generator
jinja2==3.1.6 jinja2==3.1.4
# via # via
# datamodel-code-generator # datamodel-code-generator
# torch # torch
@ -235,7 +234,7 @@ mbstrdecoder==1.1.3
# typepy # typepy
mdurl==0.1.2 mdurl==0.1.2
# via markdown-it-py # via markdown-it-py
mistral-common==1.5.4 mistral-common==1.5.1
# via -r requirements/test.in # via -r requirements/test.in
more-itertools==10.5.0 more-itertools==10.5.0
# via lm-eval # via lm-eval
@ -321,8 +320,6 @@ nvidia-cusparse-cu12==12.3.1.170
# via # via
# nvidia-cusolver-cu12 # nvidia-cusolver-cu12
# torch # torch
nvidia-cusparselt-cu12==0.6.2
# via torch
nvidia-nccl-cu12==2.21.5 nvidia-nccl-cu12==2.21.5
# via torch # via torch
nvidia-nvjitlink-cu12==12.4.127 nvidia-nvjitlink-cu12==12.4.127
@ -594,7 +591,7 @@ timm==1.0.11
# via -r requirements/test.in # via -r requirements/test.in
tokenizers==0.21.0 tokenizers==0.21.0
# via transformers # via transformers
torch==2.6.0 torch==2.5.1
# via # via
# -r requirements/test.in # -r requirements/test.in
# accelerate # accelerate
@ -610,15 +607,13 @@ torch==2.6.0
# torchvision # torchvision
# vector-quantize-pytorch # vector-quantize-pytorch
# vocos # vocos
torchaudio==2.6.0 torchaudio==2.5.1
# via # via
# -r requirements/test.in # -r requirements/test.in
# encodec # encodec
# vocos # vocos
torchvision==0.21.0 torchvision==0.20.1
# via # via timm
# -r requirements/test.in
# timm
tqdm==4.66.6 tqdm==4.66.6
# via # via
# datasets # datasets
@ -643,7 +638,7 @@ transformers==4.48.2
# transformers-stream-generator # transformers-stream-generator
transformers-stream-generator==0.0.5 transformers-stream-generator==0.0.5
# via -r requirements/test.in # via -r requirements/test.in
triton==3.2.0 triton==3.1.0
# via torch # via torch
tritonclient==2.51.0 tritonclient==2.51.0
# via # via
@ -656,6 +651,7 @@ typepy==1.3.2
# tabledata # tabledata
typing-extensions==4.12.2 typing-extensions==4.12.2
# via # via
# bitsandbytes
# huggingface-hub # huggingface-hub
# librosa # librosa
# mistral-common # mistral-common

View File

@ -3,10 +3,11 @@
# Dependencies for TPU # Dependencies for TPU
cmake>=3.26 cmake>=3.26
ninja
packaging packaging
setuptools-scm>=8 setuptools-scm>=8
wheel wheel
jinja2>=3.1.6 jinja2
ray[default] ray[default]
ray[data] ray[data]
@ -17,9 +18,9 @@ ray[data]
--find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"

View File

@ -3,11 +3,12 @@
ray>=2.9 ray>=2.9
cmake>=3.26 cmake>=3.26
ninja
packaging packaging
setuptools-scm>=8 setuptools-scm>=8
setuptools>=75.8.0 setuptools>=75.8.0
wheel wheel
jinja2>=3.1.6 jinja2
datasets # for benchmark scripts datasets # for benchmark scripts
torch==2.6.0+xpu torch==2.6.0+xpu
@ -20,4 +21,4 @@ pytorch-triton-xpu
# FIXME: This will be fixed in ipex 2.7. Just leave this here for awareness. # FIXME: This will be fixed in ipex 2.7. Just leave this here for awareness.
# intel-extension-for-pytorch==2.6.10+xpu # intel-extension-for-pytorch==2.6.10+xpu
oneccl_bind_pt==2.6.0+xpu oneccl_bind_pt==2.6.0+xpu
--extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/

View File

@ -294,28 +294,26 @@ class repackage_wheel(build_ext):
]).decode("utf-8") ]).decode("utf-8")
upstream_main_commit = json.loads(resp_json)["sha"] upstream_main_commit = json.loads(resp_json)["sha"]
# Check if the upstream_main_commit exists in the local repo # Check if the local main branch is up-to-date. This is to ensure
try: # the base commit we found is the most recent commit on the main
subprocess.check_output( # branch.
["git", "cat-file", "-e", f"{upstream_main_commit}"]) local_main_commit = subprocess.check_output(
except subprocess.CalledProcessError: ["git", "rev-parse", "main"]).decode("utf-8").strip()
# If not present, fetch it from the remote repository. if local_main_commit != upstream_main_commit:
# Note that this does not update any local branches, raise ValueError(
# but ensures that this commit ref and its history are f"Local main branch ({local_main_commit}) is not "
# available in our local repo. "up-to-date with upstream main branch "
subprocess.check_call([ f"({upstream_main_commit}). Please pull the latest "
"git", "fetch", "https://github.com/vllm-project/vllm", "changes from upstream main branch first.")
"main"
])
# Then get the commit hash of the current branch that is the same as # Then get the commit hash of the current branch that is the same as
# the upstream main commit. # the upstream main commit.
current_branch = subprocess.check_output( current_branch = subprocess.check_output(
["git", "branch", "--show-current"]).decode("utf-8").strip() ["git", "branch", "--show-current"]).decode("utf-8").strip()
base_commit = subprocess.check_output([ base_commit = subprocess.check_output(
"git", "merge-base", f"{upstream_main_commit}", current_branch ["git", "merge-base", "main",
]).decode("utf-8").strip() current_branch]).decode("utf-8").strip()
return base_commit return base_commit
except ValueError as err: except ValueError as err:
raise ValueError(err) from None raise ValueError(err) from None

View File

@ -1,11 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import subprocess import subprocess
import sys import sys
import time import time
@ -45,10 +44,7 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
distributed_executor_backend, distributed_executor_backend,
] ]
# API Server Test Requires V0. uvicorn_process = subprocess.Popen(commands)
my_env = os.environ.copy()
my_env["VLLM_USE_V1"] = "0"
uvicorn_process = subprocess.Popen(commands, env=my_env)
yield yield
uvicorn_process.terminate() uvicorn_process.terminate()

View File

@ -151,10 +151,6 @@ def uid() -> str:
@pytest_asyncio.fixture(scope="module") @pytest_asyncio.fixture(scope="module")
async def async_engine(): async def async_engine():
# We cannot use monkeypatch since this is a module
# scoped fixture and monkeypatch is function scoped.
previous_value = os.getenv("VLLM_USE_V1", None)
os.environ["VLLM_USE_V1"] = "0"
engine = await asyncio.get_event_loop().run_in_executor(executor=None, engine = await asyncio.get_event_loop().run_in_executor(executor=None,
func=start_engine) func=start_engine)
try: try:
@ -165,11 +161,6 @@ async def async_engine():
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
if previous_value:
os.environ["VLLM_USE_V1"] = previous_value
else:
del os.environ["VLLM_USE_V1"]
@pytest.fixture() @pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool: def should_do_global_cleanup_after_test(request) -> bool:

View File

@ -47,7 +47,6 @@ def test_vllm_gc_ed():
@pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("enforce_eager", [False])
def test_models( def test_models(
monkeypatch: pytest.MonkeyPatch,
hf_runner, hf_runner,
model: str, model: str,
backend: str, backend: str,
@ -64,33 +63,31 @@ def test_models(
pytest.skip( pytest.skip(
f"{backend} does not support gemma2 with full context length.") f"{backend} does not support gemma2 with full context length.")
with monkeypatch.context() as m: os.environ["VLLM_ATTENTION_BACKEND"] = backend
m.setenv("VLLM_ATTENTION_BACKEND", backend)
# 5042 tokens for gemma2 # 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096 # gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window # we need a prompt with more than 4096 tokens to test the sliding window
prompt = "The following numbers of the sequence " + ", ".join( prompt = "The following numbers of the sequence " + ", ".join(
str(i) for i in range(1024)) + " are:" str(i) for i in range(1024)) + " are:"
example_prompts = [prompt] example_prompts = [prompt]
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(model, with VllmRunner(model,
max_model_len=8192, max_model_len=8192,
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model: gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
@ -107,7 +104,6 @@ def test_models(
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"), ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
]) ])
def test_models_distributed( def test_models_distributed(
monkeypatch: pytest.MonkeyPatch,
hf_runner, hf_runner,
vllm_runner, vllm_runner,
example_prompts, example_prompts,
@ -120,41 +116,34 @@ def test_models_distributed(
if test_suite != TARGET_TEST_SUITE: if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}") pytest.skip(f"Skip test for {test_suite}")
with monkeypatch.context() as monkeypatch_context: if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa # test Ray Compiled Graph
# test Ray Compiled Graph os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1") os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
if attention_backend: if attention_backend:
monkeypatch_context.setenv( os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
"VLLM_ATTENTION_BACKEND",
attention_backend,
)
dtype = "half" dtype = "half"
max_tokens = 5 max_tokens = 5
# NOTE: take care of the order. run vLLM first, and then run HF. # NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization. # vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method # will hurt multiprocessing backend with fork method (the default method).
# (the default method). with vllm_runner(model,
with vllm_runner( dtype=dtype,
model, tensor_parallel_size=2,
dtype=dtype, distributed_executor_backend=distributed_executor_backend
tensor_parallel_size=2, ) as vllm_model:
distributed_executor_backend=distributed_executor_backend, vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )

View File

@ -7,39 +7,22 @@ prefill requests are chunked.
Run `pytest tests/models/test_chunked_prefill.py`. Run `pytest tests/models/test_chunked_prefill.py`.
""" """
import os
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest import pytest
from tests.kernels.utils import override_backend_env_variable
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close, check_outputs_equal from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test from ..utils import multi_gpu_test
if TYPE_CHECKING:
from .conftest import HfRunner, VllmRunner
MODELS = [ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",
"meta-llama/Llama-3.2-1B-Instruct", "meta-llama/Llama-3.2-1B-Instruct",
] ]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the file.
"""
with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
@ -50,8 +33,8 @@ def use_v0_only(monkeypatch: pytest.MonkeyPatch):
@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models( def test_models(
hf_runner: HfRunner, hf_runner,
vllm_runner: VllmRunner, vllm_runner,
example_prompts, example_prompts,
model: str, model: str,
dtype: str, dtype: str,
@ -60,39 +43,37 @@ def test_models(
enforce_eager: bool, enforce_eager: bool,
tensor_parallel_size: int, tensor_parallel_size: int,
attention_backend: str, attention_backend: str,
monkeypatch: pytest.MonkeyPatch, monkeypatch,
) -> None: ) -> None:
""" """
Checks exact match decode between huggingface model and vllm runner with Checks exact match decode between huggingface model and vllm runner with
chunked prefill. chunked prefill.
""" """
with monkeypatch.context() as m: override_backend_env_variable(monkeypatch, attention_backend)
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
max_num_seqs = chunked_prefill_token_size max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True, enable_chunked_prefill=True,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
@ -100,61 +81,57 @@ def test_models(
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models_distributed( def test_models_distributed(
hf_runner: HfRunner, hf_runner,
vllm_runner: VllmRunner, vllm_runner,
example_prompts, example_prompts,
model: str, model: str,
distributed_executor_backend: str, distributed_executor_backend: str,
attention_backend: str, attention_backend: str,
monkeypatch: pytest.MonkeyPatch, monkeypatch,
) -> None: ) -> None:
with monkeypatch.context() as m: override_backend_env_variable(monkeypatch, attention_backend)
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
if (model == "meta-llama/Llama-3.2-1B-Instruct"
and distributed_executor_backend == "ray"):
# test Ray Compiled Graph
m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
dtype = "half" if (model == "meta-llama/Llama-3.2-1B-Instruct"
max_tokens = 5 and distributed_executor_backend == "ray"):
chunked_prefill_token_size = 16 # test Ray Compiled Graph
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
# Add a chunked prefill config. dtype = "half"
max_num_seqs = min(chunked_prefill_token_size, 256) max_tokens = 5
assert chunked_prefill_token_size != -1 chunked_prefill_token_size = 16
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size
# NOTE: take care of the order. run vLLM first, and then run HF. # Add a chunked prefill config.
# vLLM needs a fresh new process without cuda initialization. max_num_seqs = min(chunked_prefill_token_size, 256)
# if we run HF first, the cuda initialization will be done and it assert chunked_prefill_token_size != -1
# will hurt multiprocessing backend with enable_chunked_prefill = True
# fork method (the default method). max_num_batched_tokens = chunked_prefill_token_size
with vllm_runner( # NOTE: take care of the order. run vLLM first, and then run HF.
model, # vLLM needs a fresh new process without cuda initialization.
dtype=dtype, # if we run HF first, the cuda initialization will be done and it
tensor_parallel_size=2, # will hurt multiprocessing backend with fork method (the default method).
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(
example_prompts,
max_tokens,
)
with hf_runner(model, dtype=dtype) as hf_model: with vllm_runner(
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) model,
dtype=dtype,
tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal( with hf_runner(model, dtype=dtype) as hf_model:
outputs_0_lst=hf_outputs, hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
outputs_1_lst=vllm_outputs,
name_0="hf", check_outputs_equal(
name_1="vllm", outputs_0_lst=hf_outputs,
) outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -172,7 +149,7 @@ def test_models_distributed(
# the async postprocessor # the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True]) @pytest.mark.parametrize("disable_async_output_proc", [True])
def test_models_with_fp8_kv_cache( def test_models_with_fp8_kv_cache(
vllm_runner: VllmRunner, vllm_runner,
example_prompts, example_prompts,
kv_cache_dtype: str, kv_cache_dtype: str,
model: str, model: str,
@ -232,7 +209,7 @@ def test_models_with_fp8_kv_cache(
@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
def test_with_prefix_caching( def test_with_prefix_caching(
vllm_runner: VllmRunner, vllm_runner,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
chunk_size: int, chunk_size: int,
@ -268,10 +245,8 @@ def test_with_prefix_caching(
) as vllm_model: ) as vllm_model:
outputs[enable] = [] outputs[enable] = []
for prompt in full_prompts: for prompt in full_prompts:
outputs[enable] += vllm_model.generate_greedy( outputs[enable] += vllm_model.generate_greedy([prompt],
[prompt], max_tokens)
max_tokens,
)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=outputs[False], outputs_0_lst=outputs[False],
@ -282,7 +257,7 @@ def test_with_prefix_caching(
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("dtype", ["bfloat16", "half"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16]) @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("enforce_eager", [False])
@ -290,8 +265,8 @@ def test_with_prefix_caching(
@pytest.mark.cpu_model @pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_models_cpu( def test_models_cpu(
hf_runner: HfRunner, hf_runner,
vllm_runner: VllmRunner, vllm_runner,
example_prompts, example_prompts,
model: str, model: str,
dtype: str, dtype: str,
@ -299,7 +274,7 @@ def test_models_cpu(
chunked_prefill_token_size: int, chunked_prefill_token_size: int,
enforce_eager: bool, enforce_eager: bool,
attention_backend: str, attention_backend: str,
monkeypatch: pytest.MonkeyPatch, monkeypatch,
) -> None: ) -> None:
test_models( test_models(
hf_runner, hf_runner,
@ -319,11 +294,11 @@ def test_models_cpu(
@pytest.mark.parametrize("max_tokens", [16]) @pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32]) @pytest.mark.parametrize("chunk_size", [30, 32])
@pytest.mark.parametrize("dtype", ["bfloat16", "half"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.cpu_model @pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_with_prefix_caching_cpu( def test_with_prefix_caching_cpu(
vllm_runner: VllmRunner, vllm_runner,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
chunk_size: int, chunk_size: int,

View File

@ -1,15 +1,8 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import pytest
from ..utils import compare_two_settings from ..utils import compare_two_settings
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
monkeypatch.setenv('VLLM_USE_V1', '0')
def test_cpu_offload(): def test_cpu_offload():
compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [], compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
["--cpu-offload-gb", "1"]) ["--cpu-offload-gb", "1"])

View File

@ -7,10 +7,10 @@ from vllm import LLM, SamplingParams
from vllm.device_allocator.cumem import CuMemAllocator from vllm.device_allocator.cumem import CuMemAllocator
from vllm.utils import GiB_bytes from vllm.utils import GiB_bytes
from ..utils import create_new_process_for_each_test from ..utils import fork_new_process_for_each_test
@create_new_process_for_each_test() @fork_new_process_for_each_test
def test_python_error(): def test_python_error():
""" """
Test if Python error occurs when there's low-level Test if Python error occurs when there's low-level
@ -36,7 +36,7 @@ def test_python_error():
allocator.wake_up() allocator.wake_up()
@create_new_process_for_each_test() @fork_new_process_for_each_test
def test_basic_cumem(): def test_basic_cumem():
# some tensors from default memory pool # some tensors from default memory pool
shape = (1024, 1024) shape = (1024, 1024)
@ -69,7 +69,7 @@ def test_basic_cumem():
assert torch.allclose(output, torch.ones_like(output) * 3) assert torch.allclose(output, torch.ones_like(output) * 3)
@create_new_process_for_each_test() @fork_new_process_for_each_test
def test_cumem_with_cudagraph(): def test_cumem_with_cudagraph():
allocator = CuMemAllocator.get_instance() allocator = CuMemAllocator.get_instance()
with allocator.use_memory_pool(): with allocator.use_memory_pool():
@ -114,7 +114,7 @@ def test_cumem_with_cudagraph():
assert torch.allclose(y, x + 1) assert torch.allclose(y, x + 1)
@create_new_process_for_each_test() @fork_new_process_for_each_test
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model, use_v1", "model, use_v1",
[ [
@ -123,38 +123,40 @@ def test_cumem_with_cudagraph():
# sleep mode with pytorch checkpoint # sleep mode with pytorch checkpoint
("facebook/opt-125m", False), ("facebook/opt-125m", False),
]) ])
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): def test_end_to_end(model: str, use_v1: bool):
with monkeypatch.context() as m: import os
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
free, total = torch.cuda.mem_get_info() free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True) llm = LLM(model, enable_sleep_mode=True)
prompt = "How are you?" prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10) sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate(prompt, sampling_params) output = llm.generate(prompt, sampling_params)
# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
# which is difficult to measure in the test. therefore, we only # which is difficult to measure in the test. therefore, we only
# test sleep level 1 here. # test sleep level 1 here.
llm.sleep(level=1) llm.sleep(level=1)
free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
# now the memory usage is mostly cudagraph memory pool, # now the memory usage is mostly cudagraph memory pool,
# and it should be less than the model weights (1B model, 2GiB weights) # and it should be less than the model weights (1B model, 2GiB weights)
# NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size) # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
# is captured but cannot be released from PyTorch due to a known bug, # is captured but cannot be released from PyTorch due to a known bug,
# therefore high memory usage after `llm.sleep` is called is expected. # therefore high memory usage after `llm.sleep` is called is expected.
# FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
# in V1. # in V1.
if use_v1: if use_v1:
assert used_bytes < 7 * GiB_bytes assert used_bytes < 7 * GiB_bytes
else: else:
assert used_bytes < 2 * GiB_bytes assert used_bytes < 2 * GiB_bytes
llm.wake_up() llm.wake_up()
output2 = llm.generate(prompt, sampling_params) output2 = llm.generate(prompt, sampling_params)
# cmp output # cmp output
assert output[0].outputs[0].text == output2[0].outputs[0].text assert output[0].outputs[0].text == output2[0].outputs[0].text
del os.environ["VLLM_USE_V1"]

View File

@ -21,15 +21,6 @@ MODELS = [
] ]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT is not yet supported there,
so use VLLM_USE_V1=0 for all tests in the file.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
@pytest.fixture(scope="module", autouse=True) @pytest.fixture(scope="module", autouse=True)
def check_settings(): def check_settings():
assert ENABLE_ARTIFICIAL_PREEMPT is True, ( assert ENABLE_ARTIFICIAL_PREEMPT is True, (

View File

@ -6,7 +6,6 @@ from typing import Callable, Union
from torch import fx from torch import fx
from vllm.compilation.inductor_pass import InductorPass from vllm.compilation.inductor_pass import InductorPass
from vllm.config import get_current_vllm_config
class TestBackend: class TestBackend:
@ -18,14 +17,13 @@ class TestBackend:
Inductor config can be modified directly by editing the inductor_config Inductor config can be modified directly by editing the inductor_config
property. This can be helpful for adding passes like the property. This can be helpful for adding passes like the
'pre_grad_custom_pass' and the 'post_grad_custom_pre_pass'. 'pre_grad_custom_pass' and the 'post_grad_custom_pre_pass'.
Inductor config is default-initialized from VllmConfig.CompilationConfig.
""" """
def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph], def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph],
None]]): None]]):
self.custom_passes = list(passes) self.custom_passes = list(passes)
compile_config = get_current_vllm_config().compilation_config from torch._inductor import config
self.inductor_config = compile_config.inductor_compile_config self.inductor_config = config.shallow_copy_dict()
self.inductor_config['force_disable_caches'] = True self.inductor_config['force_disable_caches'] = True
self.inductor_config['post_grad_custom_post_pass'] = self.post_pass self.inductor_config['post_grad_custom_post_pass'] = self.post_pass

View File

@ -1,14 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
# TEST V1: this should be removed. Right now V1 overrides
# all the torch compile logic. We should re-enable this
# as we add torch compile support back to V1.
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import dataclasses import dataclasses
from typing import Optional
import pytest import pytest
@ -22,76 +22,75 @@ class TestSetting:
fullgraph: bool fullgraph: bool
# representative settings for testing
test_settings = [
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct",
model_args=[],
pp_size=2,
tp_size=2,
attn_backend="FLASHINFER",
method="generate",
fullgraph=True,
),
# llama model with quantization
TestSetting(
model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
model_args=["--quantization", "gptq"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# MoE model
TestSetting(
model="ibm/PowerMoE-3b",
model_args=[],
pp_size=1,
tp_size=2,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# embedding model
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
fullgraph=True,
),
# encoder-based embedding model (BERT)
TestSetting(
model="BAAI/bge-base-en-v1.5",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="XFORMERS",
method="encode",
fullgraph=True,
),
# vision language model
TestSetting(
model="microsoft/Phi-3.5-vision-instruct",
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate_with_image",
fullgraph=False,
),
]
# we cannot afford testing the full Cartesian product # we cannot afford testing the full Cartesian product
# of all models and all levels # of all models and all levels
@pytest.mark.parametrize( @pytest.mark.parametrize("test_setting", test_settings)
"test_setting", def test_compile_correctness(test_setting: TestSetting):
[
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct",
model_args=[],
pp_size=2,
tp_size=2,
attn_backend="FLASHINFER",
method="generate",
fullgraph=True,
),
# llama model with quantization
TestSetting(
model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
model_args=["--quantization", "gptq"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# MoE model
TestSetting(
model="ibm/PowerMoE-3b",
model_args=[],
pp_size=1,
tp_size=2,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# embedding model
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model_args=["--task", "embed", "--dtype", "bfloat16"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
fullgraph=True,
),
# encoder-based embedding model (BERT)
TestSetting(
model="BAAI/bge-base-en-v1.5",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="XFORMERS",
method="encode",
fullgraph=True,
),
# vision language model
TestSetting(
model="microsoft/Phi-3.5-vision-instruct",
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate_with_image",
fullgraph=False,
),
])
def test_compile_correctness(
monkeypatch: pytest.MonkeyPatch,
test_setting: TestSetting,
):
# this test is run under multiple suits, with different GPUs. # this test is run under multiple suits, with different GPUs.
# make sure we only run the test with correct CUDA devices. # make sure we only run the test with correct CUDA devices.
# don't use "<", as it will duplicate the tests. # don't use "<", as it will duplicate the tests.
@ -104,45 +103,41 @@ def test_compile_correctness(
fullgraph = test_setting.fullgraph fullgraph = test_setting.fullgraph
if cuda_device_count_stateless() != pp_size * tp_size: if cuda_device_count_stateless() != pp_size * tp_size:
pytest.skip("Not correct CUDA devices for the test.") pytest.skip("Not correct CUDA devices for the test.")
import os
os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
["-tp", str(tp_size)]
with monkeypatch.context() as m: all_args: list[list[str]] = []
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) all_envs: list[Optional[dict[str, str]]] = []
final_args = [
"--enforce-eager", *model_args, "-pp",
str(pp_size), "-tp",
str(tp_size)
]
all_args: list[list[str]] = [] for level in [
all_envs: list[dict[str, str] | None] = [] CompilationLevel.NO_COMPILATION,
CompilationLevel.PIECEWISE,
]:
all_args.append(final_args + [f"-O{level}"])
all_envs.append({})
for level in [ # inductor will change the output, so we only compare if the output
CompilationLevel.NO_COMPILATION, # is close, not exactly the same.
CompilationLevel.PIECEWISE, compare_all_settings(
]: model,
all_args.append(final_args + [f"-O{level}"]) all_args,
all_envs.append({}) all_envs,
method=method if method != "generate" else "generate_close")
all_envs.clear()
all_args.clear()
# inductor will change the output, so we only compare if the output for level in [
# is close, not exactly the same. CompilationLevel.NO_COMPILATION,
compare_all_settings( CompilationLevel.DYNAMO_AS_IS,
model, CompilationLevel.DYNAMO_ONCE,
all_args, ]:
all_envs, all_args.append(final_args + [f"-O{level}"])
method=method if method != "generate" else "generate_close") all_envs.append({})
all_envs.clear() if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
all_args.clear() # "DYNAMO_ONCE" will always use fullgraph
all_envs[-1][
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
for level in [ compare_all_settings(model, all_args * 3, all_envs, method=method)
CompilationLevel.NO_COMPILATION,
CompilationLevel.DYNAMO_AS_IS,
CompilationLevel.DYNAMO_ONCE,
]:
all_args.append(final_args + [f"-O{level}"])
all_envs.append({})
if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
# "DYNAMO_ONCE" will always use fullgraph
all_envs[-1][
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
compare_all_settings(model, all_args * 3, all_envs, method=method)

View File

@ -1,115 +1,22 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
from typing import Any
import pytest import pytest
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.config import CompilationLevel from vllm.config import CompilationLevel
from vllm.platforms import current_platform
from ..utils import create_new_process_for_each_test from ..utils import fork_new_process_for_each_test
from .utils import TEST_MODELS, check_full_graph_support
@pytest.fixture(params=None, name="model_info")
def models_list_fixture(request):
TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
("facebook/opt-125m", {}),
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
"quantization": "compressed-tensors"
}),
("meta-llama/Llama-3.2-1B-Instruct", {}),
]
if is_quant_method_supported("aqlm"):
TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
"quantization": "aqlm"
}))
# TODO: figure out why this fails.
if False and is_quant_method_supported("gguf"): # noqa: SIM223
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
"quantization": "gguf"
}))
if is_quant_method_supported("gptq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
"quantization": "gptq"
}))
if is_quant_method_supported("gptq_marlin"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
"quantization": "gptq_marlin"
}))
if is_quant_method_supported("gptq_marlin_24"):
TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
"quantization": "gptq_marlin_24"
}))
if is_quant_method_supported("marlin"):
TEST_MODELS.append(
("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
"quantization": "marlin"
}))
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
"quantization": "AWQ"
}))
return TEST_MODELS
@pytest.mark.parametrize("model_info", TEST_MODELS)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"optimization_level", "optimization_level",
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE], [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE])
) @fork_new_process_for_each_test
@pytest.mark.parametrize("model_info", "", indirect=True) def test_full_graph(model_info, optimization_level):
@create_new_process_for_each_test() model = model_info[0]
def test_full_graph( model_kwargs = model_info[1]
monkeypatch: pytest.MonkeyPatch, check_full_graph_support(model,
model_info: tuple[str, dict[str, Any]], model_kwargs,
optimization_level: int, optimization_level,
): tp_size=1)
model, model_kwargs = model_info
with monkeypatch.context() as m:
# make sure these models can be captured in full graph mode
m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
print(f"MODEL={model}")
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(
model=model,
enforce_eager=True,
tensor_parallel_size=1,
disable_custom_all_reduce=True,
compilation_config=optimization_level,
**model_kwargs,
)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

tests/compile/utils.py Normal file
View File

@ -0,0 +1,93 @@
# SPDX-License-Identifier: Apache-2.0
import os
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
TEST_MODELS = [
("facebook/opt-125m", {}),
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
"quantization": "compressed-tensors"
}),
("meta-llama/Llama-3.2-1B-Instruct", {}),
]
if is_quant_method_supported("aqlm"):
TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
"quantization": "aqlm"
}))
# TODO: figure out why this fails.
if False and is_quant_method_supported("gguf"): # noqa: SIM223
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
"quantization": "gguf"
}))
if is_quant_method_supported("gptq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
"quantization": "gptq"
}))
if is_quant_method_supported("gptq_marlin"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
"quantization": "gptq_marlin"
}))
if is_quant_method_supported("gptq_marlin_24"):
TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
"quantization": "gptq_marlin_24"
}))
if is_quant_method_supported("marlin"):
TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
"quantization": "marlin"
}))
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
"quantization": "AWQ"
}))
def check_full_graph_support(model,
model_kwargs,
optimization_level,
tp_size=1):
# make sure these models can be captured in full graph mode
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
print(f"MODEL={model}")
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(model=model,
enforce_eager=True,
tensor_parallel_size=tp_size,
disable_custom_all_reduce=True,
compilation_config=optimization_level,
**model_kwargs)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@ -14,8 +14,8 @@ import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from PIL import Image from PIL import Image
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer, from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
BatchEncoding, BatchFeature) BatchFeature)
from transformers.models.auto.auto_factory import _BaseAutoModelClass from transformers.models.auto.auto_factory import _BaseAutoModelClass
from tests.models.utils import (TokensTextLogprobs, from tests.models.utils import (TokensTextLogprobs,
@ -23,7 +23,7 @@ from tests.models.utils import (TokensTextLogprobs,
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype from vllm.config import TaskOption, TokenizerPoolConfig
from vllm.connections import global_http_connection from vllm.connections import global_http_connection
from vllm.distributed import (cleanup_dist_env_and_memory, from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment, init_distributed_environment,
@ -34,7 +34,8 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams from vllm.sampling_params import BeamSearchParams
from vllm.utils import cuda_device_count_stateless, is_list_of from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
identity, is_list_of)
logger = init_logger(__name__) logger = init_logger(__name__)
@ -110,26 +111,6 @@ VIDEO_ASSETS = _VideoAssets()
"""Singleton instance of :class:`_VideoAssets`.""" """Singleton instance of :class:`_VideoAssets`."""
@pytest.fixture(scope="function", autouse=True)
def cleanup_VLLM_USE_V1(monkeypatch):
"""
The V1 oracle sets "VLLM_USE_V1" during loading. This means
that each invocation of a test changes the env variable.
If we touch "VLLM_USE_V1" with monkeypatch, then any changes
made during the test run by vLLM will be cleaned up.
This fixture is used by every test.
"""
# If VLLM_USE_V1 is not set, set then delete. This will
# cause monkeypatch to clean up VLLM_USE_V1 upon exit
# if VLLM modifies the value of envs.VLLM_USE_V1.
if "VLLM_USE_V1" not in os.environ:
monkeypatch.setenv("VLLM_USE_V1", "")
monkeypatch.delenv("VLLM_USE_V1")
@pytest.fixture(params=[True, False]) @pytest.fixture(params=[True, False])
def run_with_both_engines(request, monkeypatch): def run_with_both_engines(request, monkeypatch):
# Automatically runs tests twice, once with V1 and once without # Automatically runs tests twice, once with V1 and once without
@ -270,18 +251,14 @@ _R = TypeVar("_R")
class HfRunner: class HfRunner:
def get_default_device(self):
from vllm.platforms import current_platform
return ("cpu" if current_platform.is_cpu()
or current_platform.is_openvino() else "cuda")
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
from vllm.platforms import current_platform
if x is None or isinstance(x, (bool, )): if x is None or isinstance(x, (bool, )):
return x return x
if device is None: if device is None:
device = self.device device = "cpu" if current_platform.is_cpu(
) or current_platform.is_openvino() else "cuda"
if isinstance(x, dict): if isinstance(x, dict):
return {k: self.wrap_device(v, device) for k, v in x.items()} return {k: self.wrap_device(v, device) for k, v in x.items()}
@ -294,59 +271,45 @@ class HfRunner:
def __init__( def __init__(
self, self,
model_name: str, model_name: str,
dtype: str = "auto", dtype: str = "half",
*, *,
model_kwargs: Optional[dict[str, Any]] = None, model_kwargs: Optional[dict[str, Any]] = None,
is_sentence_transformer: bool = False, is_sentence_transformer: bool = False,
is_cross_encoder: bool = False, is_cross_encoder: bool = False,
skip_tokenizer_init: bool = False, skip_tokenizer_init: bool = False,
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM, auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
postprocess_inputs: Callable[..., BatchEncoding] = identity,
) -> None: ) -> None:
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
self.model_name = model_name self.model_name = model_name
self.config = AutoConfig.from_pretrained(
model_name,
trust_remote_code=True,
)
self.device = self.get_default_device()
self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)
model_kwargs = model_kwargs if model_kwargs is not None else {}
model_kwargs.setdefault("torch_dtype", torch_dtype)
if is_sentence_transformer: if is_sentence_transformer:
# Lazy init required for AMD CI # Lazy init required for AMD CI
from sentence_transformers import SentenceTransformer from sentence_transformers import SentenceTransformer
self.model = self.wrap_device(
self.model = SentenceTransformer( SentenceTransformer(
model_name, model_name,
device=self.device, device="cpu",
model_kwargs=model_kwargs, trust_remote_code=True,
trust_remote_code=True, ).to(dtype=torch_dtype))
)
elif is_cross_encoder: elif is_cross_encoder:
# Lazy init required for AMD CI # Lazy init required for AMD CI
from sentence_transformers import CrossEncoder from sentence_transformers import CrossEncoder
self.model = CrossEncoder(model_name,
self.model = CrossEncoder( device="cpu",
model_name, trust_remote_code=True)
device=self.device, self.model.model = self.wrap_device(self.model.model)\
automodel_args=model_kwargs, .to(dtype=torch_dtype)
trust_remote_code=True,
)
else: else:
model = auto_cls.from_pretrained( model_kwargs = model_kwargs if model_kwargs is not None else {}
model_name, self.model = self.wrap_device(
trust_remote_code=True, auto_cls.from_pretrained(
**model_kwargs, model_name,
) torch_dtype=torch_dtype,
trust_remote_code=True,
if (getattr(model, "quantization_method", None) != "bitsandbytes" **model_kwargs,
and len({p.device ))
for p in model.parameters()}) < 2):
model = model.to(self.device)
self.model = model
if not skip_tokenizer_init: if not skip_tokenizer_init:
self.tokenizer = AutoTokenizer.from_pretrained( self.tokenizer = AutoTokenizer.from_pretrained(
@ -366,13 +329,16 @@ class HfRunner:
if skip_tokenizer_init: if skip_tokenizer_init:
self.tokenizer = self.processor.tokenizer self.tokenizer = self.processor.tokenizer
self.dtype = dtype
self.postprocess_inputs = postprocess_inputs
def get_inputs( def get_inputs(
self, self,
prompts: list[str], prompts: list[str],
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None, videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
) -> list[Union[BatchFeature, BatchEncoding]]: ) -> list[BatchEncoding]:
if images is not None: if images is not None:
assert len(prompts) == len(images) assert len(prompts) == len(images)
@ -382,7 +348,7 @@ class HfRunner:
if audios is not None: if audios is not None:
assert len(prompts) == len(audios) assert len(prompts) == len(audios)
all_inputs: list[Union[BatchFeature, BatchEncoding]] = [] all_inputs: list[BatchEncoding] = []
for i, prompt in enumerate(prompts): for i, prompt in enumerate(prompts):
processor_kwargs: dict[str, Any] = { processor_kwargs: dict[str, Any] = {
"text": prompt, "text": prompt,
@ -398,8 +364,7 @@ class HfRunner:
processor_kwargs["sampling_rate"] = sr processor_kwargs["sampling_rate"] = sr
inputs = self.processor(**processor_kwargs) inputs = self.processor(**processor_kwargs)
if isinstance(inputs, BatchFeature): inputs = self.postprocess_inputs(inputs, dtype=self.dtype)
inputs = inputs.to(dtype=self.dtype)
all_inputs.append(inputs) all_inputs.append(inputs)
@ -432,7 +397,7 @@ class HfRunner:
outputs: list[tuple[list[list[int]], list[str]]] = [] outputs: list[tuple[list[list[int]], list[str]]] = []
for inputs in all_inputs: for inputs in all_inputs:
output_ids = self.model.generate( output_ids = self.model.generate(
**self.wrap_device(inputs), **self.wrap_device(inputs, device=self.model.device.type),
use_cache=True, use_cache=True,
**kwargs, **kwargs,
) )
@ -503,7 +468,7 @@ class HfRunner:
all_logprobs: list[list[torch.Tensor]] = [] all_logprobs: list[list[torch.Tensor]] = []
for inputs in all_inputs: for inputs in all_inputs:
output = self.model.generate( output = self.model.generate(
**self.wrap_device(inputs), **self.wrap_device(inputs, device=self.model.device.type),
use_cache=True, use_cache=True,
do_sample=False, do_sample=False,
max_new_tokens=max_tokens, max_new_tokens=max_tokens,
@ -584,7 +549,7 @@ class HfRunner:
for inputs in all_inputs: for inputs in all_inputs:
output = self.model.generate( output = self.model.generate(
**self.wrap_device(inputs), **self.wrap_device(inputs, device=self.model.device.type),
use_cache=True, use_cache=True,
do_sample=False, do_sample=False,
max_new_tokens=max_tokens, max_new_tokens=max_tokens,
@ -635,15 +600,19 @@ class HfRunner:
if images is not None and images[i] is not None: if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i] processor_kwargs["images"] = images[i]
encoder_inputs = self.processor(**processor_kwargs) encoder_inputs = self.wrap_device(
encoder_inputs = self.wrap_device(encoder_inputs) self.processor(**processor_kwargs),
device=self.model.device.type,
)
if decoder_prompt is None: if decoder_prompt is None:
decoder_input_ids = None decoder_input_ids = None
else: else:
decoder_inputs = self.tokenizer(decoder_prompt, decoder_input_ids = self.wrap_device(
return_tensors="pt") self.tokenizer(decoder_prompt,
decoder_input_ids = self.wrap_device(decoder_inputs.input_ids) return_tensors="pt").input_ids,
device=self.model.device.type,
)
output = self.model.generate( output = self.model.generate(
decoder_input_ids=decoder_input_ids, decoder_input_ids=decoder_input_ids,
@ -692,18 +661,6 @@ def hf_runner():
class VllmRunner: class VllmRunner:
"""
The default values of some arguments have been modified from
:class:`~vllm.LLM` as follows:
- `trust_remote_code`: Set to `True` instead of `False` for convenience.
- `seed`: Set to `0` instead of `None` for test reproducibility.
- `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
- `block_size`: Set to `16` instead of `None` to reduce memory usage.
- `enable_chunked_prefill`: Set to `False` instead of `None` for
test reproducibility.
- `enforce_eager`: Set to `False` instead of `None` to test CUDA graph.
"""
def __init__( def __init__(
self, self,
@ -711,14 +668,14 @@ class VllmRunner:
task: TaskOption = "auto", task: TaskOption = "auto",
tokenizer_name: Optional[str] = None, tokenizer_name: Optional[str] = None,
tokenizer_mode: str = "auto", tokenizer_mode: str = "auto",
trust_remote_code: bool = True, # Use smaller max model length, otherwise bigger model cannot run due
seed: Optional[int] = 0, # to kv cache size limit.
max_model_len: int = 1024, max_model_len: int = 1024,
dtype: str = "auto", dtype: str = "half",
disable_log_stats: bool = True, disable_log_stats: bool = True,
tensor_parallel_size: int = 1, tensor_parallel_size: int = 1,
block_size: int = 16, block_size: int = 16,
enable_chunked_prefill: Optional[bool] = False, enable_chunked_prefill: bool = False,
swap_space: int = 4, swap_space: int = 4,
enforce_eager: Optional[bool] = False, enforce_eager: Optional[bool] = False,
**kwargs, **kwargs,
@ -728,9 +685,8 @@ class VllmRunner:
task=task, task=task,
tokenizer=tokenizer_name, tokenizer=tokenizer_name,
tokenizer_mode=tokenizer_mode, tokenizer_mode=tokenizer_mode,
trust_remote_code=trust_remote_code, trust_remote_code=True,
dtype=dtype, dtype=dtype,
seed=seed,
swap_space=swap_space, swap_space=swap_space,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats, disable_log_stats=disable_log_stats,

View File

@ -1,11 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')

View File

@ -1,10 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass

View File

@ -1,141 +0,0 @@
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Optional

import pytest

from vllm import LLM, SamplingParams, envs

MODEL = "meta-llama/llama-2-7b-hf"
MAX_TOKENS = 200


def _test_stopping(llm: LLM,
                   expected_output: str,
                   expected_reason: Any,
                   stop: Optional[list[str]] = None,
                   stop_token_ids: Optional[list[int]] = None,
                   include_in_output: bool = False) -> None:
    output = llm.generate(
        "A story about vLLM:\n",
        SamplingParams(
            temperature=0.0,
            max_tokens=MAX_TOKENS,
            stop=stop,
            stop_token_ids=stop_token_ids,
            include_stop_str_in_output=include_in_output,
        ))[0].outputs[0]

    assert output is not None
    assert output.text == expected_output
    assert output.stop_reason == expected_reason


def _set_async_mode(llm, is_async):
    llm.llm_engine.scheduler[0].use_async_output_proc = is_async


def _stop_basic(llm):
    _test_stopping(llm,
                   stop=["."],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer organization",
                   expected_reason=".")

    _test_stopping(llm,
                   stop=["."],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organization.",
                   expected_reason=".")


def _stop_multi_tokens(llm):
    _test_stopping(
        llm,
        stop=["group of peo", "short"],
        include_in_output=False,
        expected_output="VLLM is a 100% volunteer organization. We are a ",
        expected_reason="group of peo")

    _test_stopping(
        llm,
        stop=["group of peo", "short"],
        include_in_output=True,
        expected_output=
        "VLLM is a 100% volunteer organization. We are a group of peo",
        expected_reason="group of peo")


def _stop_partial_token(llm):
    _test_stopping(llm,
                   stop=["gani"],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer or",
                   expected_reason="gani")

    _test_stopping(llm,
                   stop=["gani"],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organi",
                   expected_reason="gani")


def _stop_token_id(llm):
    # token id 13013 => " organization"
    _test_stopping(llm,
                   stop_token_ids=[13013],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer",
                   expected_reason=13013)

    _test_stopping(llm,
                   stop_token_ids=[13013],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organization",
                   expected_reason=13013)


@pytest.mark.skip_global_cleanup
def test_stop_strings():
    # If V0, must set enforce_eager=False since we use
    # async output processing below.
    vllm_model = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)

    if envs.VLLM_USE_V1:
        _stop_basic(vllm_model)
    else:
        _set_async_mode(vllm_model, True)
        _stop_basic(vllm_model)

        _set_async_mode(vllm_model, False)
        _stop_basic(vllm_model)

    if envs.VLLM_USE_V1:
        _stop_multi_tokens(vllm_model)
    else:
        _set_async_mode(vllm_model, True)
        _stop_multi_tokens(vllm_model)

        _set_async_mode(vllm_model, False)
        _stop_multi_tokens(vllm_model)

    if envs.VLLM_USE_V1:
        _stop_partial_token(vllm_model)
    else:
        _set_async_mode(vllm_model, True)
        _stop_partial_token(vllm_model)

        _set_async_mode(vllm_model, False)
        _stop_partial_token(vllm_model)

    if envs.VLLM_USE_V1:
        # FIXME: this does not respect include_in_output=False
        # _stop_token_id(vllm_model)
        pass
    else:
        _set_async_mode(vllm_model, True)
        _stop_token_id(vllm_model)

        _set_async_mode(vllm_model, False)
        _stop_token_id(vllm_model)
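
For orientation, the removed test above exercises vLLM's offline stop-string API (SamplingParams.stop, stop_token_ids, include_stop_str_in_output). A minimal, hedged usage sketch follows; the model name and prompt are placeholders, not taken from the diff:

from vllm import LLM, SamplingParams

# Placeholder model; any text-generation model exposes the same interface.
llm = LLM(model="facebook/opt-125m")

params = SamplingParams(
    temperature=0.0,
    max_tokens=64,
    stop=["."],                       # stop at the first period
    include_stop_str_in_output=True,  # keep the matched stop string in .text
)

out = llm.generate("A story about vLLM:\n", params)[0].outputs[0]
# stop_reason holds the matched stop string (or stop token id); it is None
# when generation ends for another reason such as max_tokens.
print(out.text, out.stop_reason)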

View File

@ -3,10 +3,7 @@
Run `pytest tests/distributed/test_comm_ops.py`.
"""
- from __future__ import annotations
- from typing import Any, Callable
+ import os
import pytest
import ray
@ -20,18 +17,12 @@ from ..utils import init_test_distributed_environment, multi_process_parallel
@ray.remote(num_gpus=1, max_calls=1)
- def all_reduce_test_worker(
- monkeypatch: pytest.MonkeyPatch,
- tp_size: int,
- pp_size: int,
- rank: int,
- distributed_init_port: str,
- ):
+ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
+ distributed_init_port: str):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
- monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+ os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@ -48,17 +39,12 @@ def all_reduce_test_worker(
@ray.remote(num_gpus=1, max_calls=1)
- def all_gather_test_worker(
- monkeypatch: pytest.MonkeyPatch,
- tp_size: int,
- pp_size: int,
- rank: int,
- distributed_init_port: str,
- ):
+ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
+ distributed_init_port: str):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
- monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+ os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@ -81,17 +67,12 @@ def all_gather_test_worker(
@ray.remote(num_gpus=1, max_calls=1)
- def broadcast_tensor_dict_test_worker(
- monkeypatch: pytest.MonkeyPatch,
- tp_size: int,
- pp_size: int,
- rank: int,
- distributed_init_port: str,
- ):
+ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
+ distributed_init_port: str):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
- monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+ os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@ -125,14 +106,9 @@ def broadcast_tensor_dict_test_worker(
@ray.remote(num_gpus=1, max_calls=1)
- def send_recv_tensor_dict_test_worker(
- monkeypatch: pytest.MonkeyPatch,
- tp_size: int,
- pp_size: int,
- rank: int,
- distributed_init_port: str,
- ):
- monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
+ distributed_init_port: str):
+ os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@ -170,14 +146,9 @@ def send_recv_tensor_dict_test_worker(
@ray.remote(num_gpus=1, max_calls=1)
- def send_recv_test_worker(
- monkeypatch: pytest.MonkeyPatch,
- tp_size: int,
- pp_size: int,
- rank: int,
- distributed_init_port: str,
- ):
- monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
+ distributed_init_port: str):
+ os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@ -203,12 +174,8 @@ def send_recv_test_worker(
all_reduce_test_worker, all_gather_test_worker,
broadcast_tensor_dict_test_worker
])
- def test_multi_process_tensor_parallel(
- monkeypatch: pytest.MonkeyPatch,
- tp_size: int,
- test_target: Callable[..., Any],
- ):
- multi_process_parallel(monkeypatch, tp_size, 1, test_target)
+ def test_multi_process_tensor_parallel(tp_size, test_target):
+ multi_process_parallel(tp_size, 1, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
@ -216,12 +183,8 @@ def test_multi_process_tensor_parallel(
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize(
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
- def test_multi_process_pipeline_parallel(
- monkeypatch: pytest.MonkeyPatch,
- pp_size: int,
- test_target: Callable[..., Any],
- ):
- multi_process_parallel(monkeypatch, 1, pp_size, test_target)
+ def test_multi_process_pipeline_parallel(pp_size, test_target):
+ multi_process_parallel(1, pp_size, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 4,
@ -234,9 +197,5 @@ def test_multi_process_pipeline_parallel(
broadcast_tensor_dict_test_worker
])
def test_multi_process_tensor_parallel_pipeline_parallel(
- tp_size: int,
- pp_size: int,
- test_target: Callable[..., Any],
- monkeypatch: pytest.MonkeyPatch,
- ):
- multi_process_parallel(monkeypatch, tp_size, pp_size, test_target)
+ tp_size, pp_size, test_target):
+ multi_process_parallel(tp_size, pp_size, test_target)
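
As background for the change above: the left-hand variant scopes the CUDA_VISIBLE_DEVICES cleanup through pytest's monkeypatch fixture instead of popping os.environ directly, so the variable is restored automatically when the test ends. A small, self-contained sketch of that pattern (the test function name here is illustrative, not from the diff):

import os
import pytest

def test_env_is_scoped(monkeypatch: pytest.MonkeyPatch):
    # Remove the variable only for the duration of this test;
    # raising=False makes the call a no-op when it is not set.
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    assert "CUDA_VISIBLE_DEVICES" not in os.environ

With a plain os.environ.pop(...), by contrast, the caller is responsible for restoring any previous value once the worker or test finishes.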

View File

@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
+ import os
import random
import pytest
@ -22,115 +23,95 @@ for i, v in enumerate(test_sizes):
@ray.remote(num_gpus=1, max_calls=1)
- def graph_allreduce(
- monkeypatch: pytest.MonkeyPatch,
- tp_size,
- pp_size,
- rank,
- distributed_init_port,
- ):
- with monkeypatch.context() as m:
- m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
- device = torch.device(f"cuda:{rank}")
- torch.cuda.set_device(device)
- init_test_distributed_environment(tp_size, pp_size, rank,
- distributed_init_port)
- ensure_model_parallel_initialized(tp_size, pp_size)
- group = get_tensor_model_parallel_group().device_group
+ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
+ os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+ device = torch.device(f"cuda:{rank}")
+ torch.cuda.set_device(device)
+ init_test_distributed_environment(tp_size, pp_size, rank,
+ distributed_init_port)
+ ensure_model_parallel_initialized(tp_size, pp_size)
+ group = get_tensor_model_parallel_group().device_group
# A small all_reduce for warmup.
# this is needed because device communicators might be created lazily
# (e.g. NCCL). This will ensure that the communicator is initialized
# before any communication happens, so that this group can be used for
# graph capture immediately.
data = torch.zeros(1)
data = data.to(device=device)
torch.distributed.all_reduce(data, group=group)
torch.cuda.synchronize()
del data
# we use the first group to communicate once
# and the second group to communicate twice
# and so on
# this is used to demonstrate that each group can
# communicate independently
num_communication = rank // tp_size + 1
for sz in test_sizes:
for dtype in [torch.float32, torch.float16, torch.bfloat16]:
with graph_capture(device=device) as graph_capture_context:
# use integers so result matches NCCL exactly
inp1 = torch.randint(1,
16, (sz, ),
dtype=dtype,
device=torch.cuda.current_device())
inp2 = torch.randint(1,
16, (sz, ),
dtype=dtype,
device=torch.cuda.current_device())
torch.cuda.synchronize()
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph,
stream=graph_capture_context.stream):
for i in range(num_communication):
out1 = tensor_model_parallel_all_reduce(inp1)
# the input buffer is immediately modified to test
# synchronization
dist.all_reduce(inp1, group=group)
out2 = tensor_model_parallel_all_reduce(inp2)
dist.all_reduce(inp2, group=group)
graph.replay()
torch.testing.assert_close(out1, inp1)
torch.testing.assert_close(out2, inp2)
@ray.remote(num_gpus=1, max_calls=1)
- def eager_allreduce(
- monkeypatch: pytest.MonkeyPatch,
- tp_size,
- pp_size,
- rank,
- distributed_init_port,
- ):
- with monkeypatch.context() as m:
- m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
- device = torch.device(f"cuda:{rank}")
- torch.cuda.set_device(device)
- init_test_distributed_environment(tp_size, pp_size, rank,
- distributed_init_port)
+ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
+ os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+ device = torch.device(f"cuda:{rank}")
+ torch.cuda.set_device(device)
+ init_test_distributed_environment(tp_size, pp_size, rank,
+ distributed_init_port)
# we use the first group to communicate once
# and the second group to communicate twice
# and so on
# this is used to demonstrate that each group can
# communicate independently
num_communication = rank // tp_size + 1
sz = 1024
fa = get_tp_group().ca_comm
inp = torch.ones(sz, dtype=torch.float32, device=device)
out = inp
for _ in range(num_communication):
out = fa.all_reduce(out, registered=False)
torch.testing.assert_close(out, inp * (tp_size**num_communication))
inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
out = inp
for _ in range(num_communication):
out = fa.all_reduce(out, registered=False)
torch.testing.assert_close(out, inp * (tp_size**num_communication))
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
- def test_custom_allreduce(
- monkeypatch: pytest.MonkeyPatch,
- tp_size,
- pipeline_parallel_size,
- test_target,
- ):
+ def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
world_size = tp_size * pipeline_parallel_size
if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.")
- multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
- test_target)
+ multi_process_parallel(tp_size, pipeline_parallel_size, test_target)

View File

@ -8,7 +8,7 @@ import pytest
from vllm.config import TaskOption
from vllm.logger import init_logger
- from ..utils import compare_two_settings, create_new_process_for_each_test
+ from ..utils import compare_two_settings, fork_new_process_for_each_test
logger = init_logger("test_expert_parallel")
@ -209,7 +209,7 @@ def _compare_tp(
for params in settings.iter_params(model_name)
],
)
- @create_new_process_for_each_test()
+ @fork_new_process_for_each_test
def test_ep(
model_name: str,
parallel_setup: ParallelSetup,

View File

@ -17,25 +17,13 @@ from vllm.config import TaskOption
from vllm.logger import init_logger
from ..models.registry import HF_EXAMPLE_MODELS
- from ..utils import compare_two_settings, create_new_process_for_each_test
+ from ..utils import compare_two_settings, fork_new_process_for_each_test
logger = init_logger("test_pipeline_parallel")
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
- @pytest.fixture(scope="function", autouse=True)
- def use_v0_only(monkeypatch):
- """
- For PP, we fall back to V0 by default. This means
- that the TP baseline runs with V1 while the PP engine
- runs with V0. This gives divergent results with dummy
- weights. Once we enable V1 by default for PP, we can
- remove this.
- """
- monkeypatch.setenv('VLLM_USE_V1', '0')
class ParallelSetup(NamedTuple):
tp_size: int
pp_size: int
@ -226,7 +214,7 @@ MULTIMODAL_MODELS = {
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
"openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
"allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
- "microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(),
+ "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(),
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"),
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
"Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
@ -249,7 +237,7 @@ TEST_MODELS = [
"BAAI/bge-multilingual-gemma2",
# [MULTIMODAL GENERATION]
"OpenGVLab/InternVL2-1B",
- "microsoft/Phi-3.5-vision-instruct",
+ "microsoft/Phi-3-vision-128k-instruct",
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
# [LANGUAGE GENERATION - HYBRID ARCH]
"ai21labs/Jamba-tiny-dev",
@ -350,10 +338,6 @@ def _compare_tp(
else:
pp_env = None
- tp_env = {
- "VLLM_USE_V1": vllm_major_version,
- }
pp_args = [
*common_args,
"--pipeline-parallel-size",
@ -378,20 +362,14 @@ def _compare_tp(
]
try:
- compare_two_settings(model_id,
- pp_args,
- tp_args,
- pp_env,
- tp_env,
- method=method)
+ compare_two_settings(model_id, pp_args, tp_args, pp_env, method=method)
except Exception:
- testing_ray_compiled_graph = pp_env is not None
- if testing_ray_compiled_graph and vllm_major_version == "0":
- # Ray Compiled Graph tests are flaky for V0,
+ if pp_env is None:
+ raise
+ else:
+ # Ray Compiled Graph tests are flaky,
# so we don't want to fail the test
logger.exception("Ray Compiled Graph tests failed")
- else:
- raise
@pytest.mark.parametrize(
@ -402,7 +380,7 @@ def _compare_tp(
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
],
)
- @create_new_process_for_each_test()
+ @fork_new_process_for_each_test
def test_tp_language_generation(
model_id: str,
parallel_setup: ParallelSetup,
@ -431,7 +409,7 @@ def test_tp_language_generation(
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
],
)
- @create_new_process_for_each_test()
+ @fork_new_process_for_each_test
def test_tp_language_embedding(
model_id: str,
parallel_setup: ParallelSetup,
@ -460,7 +438,7 @@ def test_tp_language_embedding(
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
],
)
- @create_new_process_for_each_test()
+ @fork_new_process_for_each_test
def test_tp_multimodal_generation(
model_id: str,
parallel_setup: ParallelSetup,

Some files were not shown because too many files have changed in this diff.