Remove temporary test file that was accidentally committed

Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Add comprehensive documentation and examples for CPU weight loading
2025-09-18 00:41:19 +00:00 · 2025-09-18 00:41:05 +00:00 · 2025-09-18 00:34:31 +00:00
182 changed files with 6968 additions and 3947 deletions
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@ -167,6 +167,12 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi

+#Obsolete currently
+##ignore certain Entrypoints/llm tests
+#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
+#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
+#fi
+
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -46,18 +46,22 @@ steps:
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
+  - tests/async_engine
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
  - tests/utils_
+  - tests/worker
  - tests/standalone_tests/lazy_imports.py
  - tests/transformers_utils
  commands:
  - python3 standalone_tests/lazy_imports.py
+  - pytest -v -s async_engine # AsyncLLMEngine
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s multimodal
  - pytest -v -s utils_ # Utils
+  - pytest -v -s worker # Worker
  - pytest -v -s transformers_utils # transformers_utils

 - label: Python-only Installation Test # 10min
@ -78,12 +82,14 @@ steps:
  - vllm/
  - tests/basic_correctness/test_basic_correctness
  - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_preemption
  - tests/basic_correctness/test_cumem.py
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
+  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

 - label: Entrypoints Unit Tests # 5min
  timeout_in_minutes: 10
@ -108,7 +114,8 @@ steps:
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

@ -210,14 +217,16 @@ steps:
  num_gpus: 2
  source_file_dependencies:
  - vllm/
+  - tests/metrics
  - tests/v1/tracing
  commands:
+  - pytest -v -s metrics
  - "pip install \
      'opentelemetry-sdk>=1.26.0' \
      'opentelemetry-api>=1.26.0' \
      'opentelemetry-exporter-otlp>=1.26.0' \
      'opentelemetry-semantic-conventions-ai>=0.4.1'"
-  - pytest -v -s v1/tracing
+  - pytest -v -s tracing

 ##### fast check tests  #####
 #####  1 GPU test  #####
@ -280,7 +289,6 @@ steps:
    # split the test to avoid interference
    - pytest -v -s v1/core
    - pytest -v -s v1/executor
-    - pytest -v -s v1/offloading
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -41,6 +41,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
+/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
@ -49,6 +50,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
+/tests/prefix_caching @comaniac @KuntaiDu
 /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
@ -61,10 +63,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/v1/kv_connector @ApostaC
 /tests/v1/offloading @ApostaC

-# Transformers backend
-/vllm/model_executor/models/transformers.py @hmellor
-/tests/models/test_transformers.py @hmellor
-
 # Docs
 /docs @hmellor
 mkdocs.yaml @hmellor
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -171,7 +171,7 @@ pull_request_rules:
      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
      - files~=^tests/v1/structured_output/
-      - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
+      - files=tests/v1/entrypoints/llm/test_guided_generate.py
      - files~=^vllm/v1/structured_output/
  actions:
    label:
@ -302,20 +302,3 @@ pull_request_rules:
    label:
      remove:
        - needs-rebase
-
- name: label-kv-connector
-  description: Automatically apply kv-connector label
-  conditions:
-    - or:
-      - files~=^examples/online_serving/disaggregated[^/]*/.*
-      - files~=^examples/offline_inference/disaggregated[^/]*/.*
-      - files~=^examples/others/lmcache/
-      - files~=^tests/v1/kv_connector/
-      - files~=^vllm/distributed/kv_transfer/
-      - title~=(?i)\bP/?D\b
-      - title~=(?i)NIXL
-      - title~=(?i)LMCache
-  actions:
-    label:
-      add:
-        - kv-connector
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -164,7 +164,9 @@ repos:
    name: Validate configuration has default values and that each field has a docstring
    entry: python tools/validate_config.py
    language: python
-    additional_dependencies: [regex]
+    types: [python]
+    pass_filenames: true
+    files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
  # Keep `suggestion` last
  - id: suggestion
    name: Suggestion
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -696,11 +696,11 @@ def evaluate(ret, args):
        return re.match(args.regex, actual) is not None

    def _eval_correctness(expected, actual):
-        if args.structure_type == "json":
+        if args.structure_type == "guided_json":
            return _eval_correctness_json(expected, actual)
-        elif args.structure_type == "regex":
+        elif args.structure_type == "guided_regex":
            return _eval_correctness_regex(expected, actual)
-        elif args.structure_type == "choice":
+        elif args.structure_type == "guided_choice":
            return _eval_correctness_choice(expected, actual)
        else:
            return None
@ -780,18 +780,18 @@ def main(args: argparse.Namespace):
    )

    if args.dataset == "grammar":
-        args.structure_type = "grammar"
+        args.structure_type = "guided_grammar"
    elif args.dataset == "regex":
-        args.structure_type = "regex"
+        args.structure_type = "guided_regex"
    elif args.dataset == "choice":
-        args.structure_type = "choice"
+        args.structure_type = "guided_choice"
    else:
-        args.structure_type = "json"
+        args.structure_type = "guided_json"

    if args.no_structured_output:
        args.structured_output_ratio = 0
    if args.save_results:
-        result_file_name = f"{args.structured_output_ratio}so"
+        result_file_name = f"{args.structured_output_ratio}guided"
        result_file_name += f"_{backend}"
        result_file_name += f"_{args.request_rate}qps"
        result_file_name += f"_{args.model.split('/')[-1]}"
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@ -17,8 +17,4 @@
  #warning "unsupported vLLM cpu implementation"
 #endif

-#ifdef _OPENMP
-  #include <omp.h>
-#endif
-
 #endif
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@ -21,7 +21,6 @@
 #include <torch/all.h>
 #include <cuda_fp16.h>
 #include <cuda_bf16.h>
-#include <cuda/std/limits>
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
 namespace cg = cooperative_groups;
@ -29,6 +28,7 @@ namespace cg = cooperative_groups;
 namespace vllm {
 namespace moe {

+constexpr float kNegInfinity = INFINITY * -1;
 constexpr unsigned FULL_WARP_MASK = 0xffffffff;
 constexpr int32_t WARP_SIZE = 32;
 constexpr int32_t BLOCK_SIZE = 512;
@ -411,21 +411,14 @@ __device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
  return __bfloat162float(val);
 }

-template <typename T>
-__device__ inline T neg_inf() {
-  // cuda::std::numeric_limits<T>::infinity() returns `0` for [T=bf16 or fp16]
-  // so we need to cast from fp32
-  return cuda_cast<T, float>(-cuda::std::numeric_limits<float>::infinity());
-}
-
 template <typename T>
 __device__ void topk_with_k2(T* output, T const* input,
                             cg::thread_block_tile<32> const& tile,
                             int32_t const lane_id,
                             int const num_experts_per_group) {
  // Get the top2 per thread
-  T largest = neg_inf<T>();
-  T second_largest = neg_inf<T>();
+  T largest = -INFINITY;
+  T second_largest = -INFINITY;

  if (num_experts_per_group > WARP_SIZE) {
    for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
@ -520,8 +513,8 @@ __global__ void group_idx_and_topk_idx_kernel(
      warp_id * topk;
  s_topk_idx += warp_id * topk;

-  T value = neg_inf<T>();
-  T topk_group_value = neg_inf<T>();
+  T value = kNegInfinity;
+  T topk_group_value = kNegInfinity;
  int32_t num_equalto_topkth_group;

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
@ -532,8 +525,11 @@ __global__ void group_idx_and_topk_idx_kernel(
  if (case_id < num_tokens) {
    // calculate group_idx
    int32_t target_num_min = WARP_SIZE - n_group + topk_group;
-    // The check is necessary to avoid abnormal input
-    if (lane_id < n_group && cuda::std::isfinite(group_scores[lane_id])) {
+    if (lane_id < n_group &&
+        (isfinite(cuda_cast<float, T>(
+            group_scores[lane_id]))))  // The check is necessary to avoid
+                                       // abnormal input
+    {
      value = group_scores[lane_id];
    }

@ -544,11 +540,11 @@ __global__ void group_idx_and_topk_idx_kernel(
      __syncwarp();  // Ensure all threads have valid data before reduction
      topk_group_value = cg::reduce(tile, value, cg::greater<T>());
      if (value == topk_group_value) {
-        value = neg_inf<T>();
+        value = kNegInfinity;
      }
      pre_count_equal_to_top_value = count_equal_to_top_value;
-      count_equal_to_top_value =
-          __popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf<T>())));
+      count_equal_to_top_value = __popc(__ballot_sync(
+          FULL_WARP_MASK, (value == cuda_cast<T, float>(kNegInfinity))));
    }
    num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
  }
@ -556,10 +552,11 @@ __global__ void group_idx_and_topk_idx_kernel(

  warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
                        /* is_stable */ true>
-      queue((int32_t)topk, neg_inf<T>());
+      queue((int32_t)topk, -INFINITY);

  int count_equalto_topkth_group = 0;
-  bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
+  bool if_proceed_next_topk =
+      (topk_group_value != cuda_cast<T, float>(kNegInfinity));
  if (case_id < num_tokens && if_proceed_next_topk) {
    for (int i_group = 0; i_group < n_group; i_group++) {
      if ((group_scores[i_group] > topk_group_value) ||
@ -569,10 +566,10 @@ __global__ void group_idx_and_topk_idx_kernel(
        for (int32_t i = lane_id; i < align_num_experts_per_group;
             i += WARP_SIZE) {
          T candidates =
-              (i < num_experts_per_group) &&
-                      cuda::std::isfinite(scores_with_bias[offset + i])
+              (i < num_experts_per_group) && isfinite(cuda_cast<float, T>(
+                                                 scores_with_bias[offset + i]))
                  ? scores_with_bias[offset + i]
-                  : neg_inf<T>();
+                  : cuda_cast<T, float>(kNegInfinity);
          queue.add(candidates, offset + i);
        }
        if (group_scores[i_group] == topk_group_value) {
@ -601,8 +598,7 @@ __global__ void group_idx_and_topk_idx_kernel(
      if (i < topk) {
        s_topk_value[i] = value;
      }
-      topk_sum +=
-          cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
+      topk_sum += reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
    }
  }

--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@ -365,6 +365,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
  int32_t compute_pipeline_offset_64 = 0;

  for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
+    __nv_bfloat16 y_max_bf16 = EPS;
    __nv_bfloat162 results_bf162[2];

    cp_async_wait<NUM_STAGES - 2>();
@ -404,7 +405,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
    auto _y_max2 =
        __hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));

-    __nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y));
+    y_max_bf16 = __hmax(_y_max2.x, _y_max2.y);

    // An entire group is assigned to a single warp, so a simple warp reduce
    // is used.
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@ -29,10 +29,7 @@ ARG VLLM_BRANCH="main"
 ONBUILD RUN git clone ${VLLM_REPO} \
 	    && cd vllm \
 	    && git fetch -v --prune -- origin ${VLLM_BRANCH} \
-	    && git checkout FETCH_HEAD \
-        && if [ ${VLLM_REPO} != "https://github.com/vllm-project/vllm.git" ] ; then \
-               git remote add upstream "https://github.com/vllm-project/vllm.git" \
-               && git fetch upstream ; fi
+	    && git checkout FETCH_HEAD
 FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm

 # -----------------------
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@ -1,23 +1,25 @@
-ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
-ARG TRITON_BRANCH="f9e5bf54"
-ARG TRITON_REPO="https://github.com/ROCm/triton.git"
-ARG PYTORCH_BRANCH="b2fb6885"
-ARG PYTORCH_VISION_BRANCH="v0.23.0"
+ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.4.1-complete
+ARG HIPBLASLT_BRANCH="aa0bda7b"
+ARG HIPBLAS_COMMON_BRANCH="9b80ba8e"
+ARG LEGACY_HIPBLASLT_OPTION=
+ARG TRITON_BRANCH="e5be006"
+ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
+ARG PYTORCH_BRANCH="f717b2af"
+ARG PYTORCH_VISION_BRANCH="v0.21.0"
 ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
-ARG FA_BRANCH="0e60e394"
+ARG FA_BRANCH="1a7f4dfa"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="2ab9f4cd"
+ARG AITER_BRANCH="4822e675"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"

 FROM ${BASE_IMAGE} AS base

-ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ENV PATH=/opt/rocm/llvm/bin:$PATH
 ENV ROCM_PATH=/opt/rocm
 ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
-ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201
+ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
-ENV AITER_ROCM_ARCH=gfx942;gfx950

 ARG PYTHON_VERSION=3.12

@ -43,6 +45,29 @@ RUN apt-get update -y \

 RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython

+FROM base AS build_hipblaslt
+ARG HIPBLASLT_BRANCH
+ARG HIPBLAS_COMMON_BRANCH
+# Set to "--legacy_hipblas_direct" for ROCm<=6.2
+ARG LEGACY_HIPBLASLT_OPTION
+RUN git clone https://github.com/ROCm/hipBLAS-common.git
+RUN apt-get remove -y hipblaslt && apt-get autoremove -y && apt-get autoclean -y
+RUN cd hipBLAS-common \
+    && git checkout ${HIPBLAS_COMMON_BRANCH} \
+    && mkdir build \
+    && cd build \
+    && cmake .. \
+    && make package \
+    && dpkg -i ./*.deb
+RUN git clone https://github.com/ROCm/hipBLASLt
+RUN cd hipBLASLt \
+    && git checkout ${HIPBLASLT_BRANCH} \
+    && apt-get install -y llvm-dev \
+    && ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
+    && cd build/release \
+    && make package
+RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
+
 FROM base AS build_triton
 ARG TRITON_BRANCH
 ARG TRITON_REPO
@ -96,11 +121,13 @@ RUN cd aiter \
    && git checkout ${AITER_BRANCH} \
    && git submodule update --init --recursive \
    && pip install -r requirements.txt
-RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
+RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
 RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install

 FROM base AS debs
 RUN mkdir /app/debs
+RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
+    cp /install/*.deb /app/debs
 RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
 RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
@ -111,6 +138,11 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs

 FROM base AS final
+RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
+    dpkg -i /install/*deb \
+    && perl -p -i -e 's/, hipblas-common-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
+    && perl -p -i -e 's/, hipblaslt-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
+    && perl -p -i -e 's/, hipblaslt \([^)]*?\), /, /g' /var/lib/dpkg/status
 RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
    pip install /install/*.whl
 RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
@ -121,6 +153,9 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
    pip install /install/*.whl

 ARG BASE_IMAGE
+ARG HIPBLAS_COMMON_BRANCH
+ARG HIPBLASLT_BRANCH
+ARG LEGACY_HIPBLASLT_OPTION
 ARG TRITON_BRANCH
 ARG TRITON_REPO
 ARG PYTORCH_BRANCH
@ -132,6 +167,9 @@ ARG FA_REPO
 ARG AITER_BRANCH
 ARG AITER_REPO
 RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
+    && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
+    && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
+    && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \
    && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
    && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
    && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
@ -139,6 +177,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
    && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
    && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
-    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
    && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
--- a/docs/api/README.md
+++ b/docs/api/README.md
@ -14,7 +14,7 @@ API documentation for vLLM's configuration classes.
 - [vllm.config.LoRAConfig][]
 - [vllm.config.MultiModalConfig][]
 - [vllm.config.PoolerConfig][]
- [vllm.config.StructuredOutputsConfig][]
+- [vllm.config.DecodingConfig][]
 - [vllm.config.ObservabilityConfig][]
 - [vllm.config.KVTransferConfig][]
 - [vllm.config.CompilationConfig][]
@ -46,6 +46,7 @@ Engine classes for offline and online inference.
 Inference parameters for vLLM APIs.

 [](){ #sampling-params }
+[](){ #pooling-params }

 - [vllm.SamplingParams][]
 - [vllm.PoolingParams][]
--- a/docs/configuration/optimization.md
+++ b/docs/configuration/optimization.md
@ -175,7 +175,6 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
 Known supported models:

 - GLM-4.5V GLM-4.1V (<gh-pr:23168>)
- InternVL (<gh-pr:23909>)
 - Kimi-VL (<gh-pr:23817>)
 - Llama4 (<gh-pr:18368>)
 - MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
--- a/docs/contributing/README.md
+++ b/docs/contributing/README.md
@ -26,123 +26,113 @@ See <gh-file:LICENSE>.

 ## Developing

-The first step of contributing to vLLM is to clone the GitHub repository:
+--8<-- "docs/getting_started/installation/python_env_setup.inc.md"
+
+Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
+Check out the [building from source][build-from-source] documentation for details.
+
+For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
+
+### Building the docs with MkDocs
+
+#### Introduction to MkDocs
+
+[MkDocs](https://github.com/mkdocs/mkdocs) is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file.
+
+#### Install MkDocs and Plugins
+
+Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blob/main/mkdocs.yaml) used in the vLLM documentation, as well as required dependencies:
+
+```bash
+uv pip install -r requirements/docs.txt
+```
+
+!!! note
+    Ensure that your Python version is compatible with the plugins (e.g., `mkdocs-awesome-nav` requires Python 3.10+).
+
+#### Verify Installation
+
+Confirm that MkDocs is correctly installed:
+
+```bash
+mkdocs --version
+```
+
+Example output:
+
+```console
+mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.10/site-packages/mkdocs (Python 3.10)
+```
+
+#### Clone the `vLLM` repository

 ```bash
 git clone https://github.com/vllm-project/vllm.git
 cd vllm
 ```

-Then, configure your Python virtual environment.
+#### Start the Development Server

--8<-- "docs/getting_started/installation/python_env_setup.inc.md"
-
-If you are only developing vLLM's Python code, install vLLM using:
+MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it. Make sure you're in the same directory as the `mkdocs.yaml` configuration file (the repository root), and then start the server by running the `mkdocs serve` command:

 ```bash
-VLLM_USE_PRECOMPILED=1 uv pip install -e .
+mkdocs serve
 ```

-If you are developing vLLM's Python and CUDA/C++ code, install vLLM using:
+Example output:

-```bash
-uv pip install -e .
+```console
+INFO    -  Documentation built in 106.83 seconds
+INFO    -  [22:02:02] Watching paths for changes: 'docs', 'mkdocs.yaml'
+INFO    -  [22:02:02] Serving on http://127.0.0.1:8000/
 ```

-For more details about installing from source and installing for other hardware, check out the [installation instructions](../getting_started/installation/README.md) for your hardware and head to the "Build wheel from source" section.
+#### View in Your Browser

-For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
+Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your browser to see a live preview.
+
+#### Learn More
+
+For additional features and advanced configurations, refer to the official [MkDocs Documentation](https://www.mkdocs.org/).
+
+## Testing
+
+??? console "Commands"
+
+    ```bash
+    # These commands are only for Nvidia CUDA platforms.
+    uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
+
+    # Linting, formatting and static type checking
+    pre-commit install
+
+    # You can manually run pre-commit with
+    pre-commit run --all-files --show-diff-on-failure
+
+    # To manually run something from CI that does not run
+    # locally by default, you can run:
+    pre-commit run mypy-3.9 --hook-stage manual --all-files
+
+    # Unit tests
+    pytest tests/
+
+    # Run tests for a single test file with detailed output
+    pytest -s -v tests/test_logger.py
+    ```

 !!! tip
-    vLLM is compatible with Python versions 3.9 to 3.12. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
+    Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.

    Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.

-### Linting
-
-vLLM uses `pre-commit` to lint and format the codebase. See <https://pre-commit.com/#usage> if `pre-commit` is new to you. Setting up `pre-commit` is as easy as:
-
-```bash
-uv pip install pre-commit
-pre-commit install
-```
-
-vLLM's `pre-commit` hooks will now run automatically every time you commit.
-
-!!! tip "Tips"
-    You can manually run the `pre-commit` hooks using:
-
-    ```bash
-    pre-commit run     # runs on staged files
-    pre-commit run -a  # runs on all files (short for --all-files)
-    ```
-
-    ---
-
-    Some `pre-commit` hooks only run in CI. If you need to, you can run them locally with:
-
-    ```bash
-    pre-commit run --hook-stage manual markdownlint
-    pre-commit run --hook-stage manual mypy-3.9
-    ```
-
-### Documentation
-
-MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, <gh-file:mkdocs.yaml>.
-
-Get started with:
-
-```bash
-uv pip install -r requirements/docs.txt
-```
-
-!!! tip
-    Ensure that your Python version is compatible with the plugins
-    (e.g., `mkdocs-awesome-nav` requires Python 3.10+)
-
-MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it.
-From the root of the repository, run:
-
-```bash
-mkdocs serve                           # with API ref (~10 minutes)
-API_AUTONAV_EXCLUDE=vllm mkdocs serve  # API ref off (~15 seconds)
-```
-
-Once you see `Serving on http://127.0.0.1:8000/` in the logs, the live preview is ready!
-Open <http://127.0.0.1:8000/> in your browser to see it.
-
-For additional features and advanced configurations, refer to the:
-
- [MkDocs documentation](https://www.mkdocs.org/)
- [Material for MkDocs documentation](https://squidfunk.github.io/mkdocs-material/) (the MkDocs theme we use)
-
-### Testing
-
-vLLM uses `pytest` to test the codebase.
-
-```bash
-# Install the test dependencies used in CI (CUDA only)
-uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
-
-# Install some common test dependencies (hardware agnostic)
-uv pip install pytest pytest-asyncio
-
-# Run all tests
-pytest tests/
-
-# Run tests for a single test file with detailed output
-pytest -s -v tests/test_logger.py
-```
-
-!!! tip "Install python3-dev if Python.h is missing"
+!!! note "Install python3-dev if Python.h is missing"
    If any of the above commands fails with `Python.h: No such file or directory`, install
    `python3-dev` with `sudo apt install python3-dev`.

-!!! warning "Warnings"
+!!! note
    Currently, the repository is not fully checked by `mypy`.

-    ---
-
+!!! note
    Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU
    platform to run unit tests locally, rely on the continuous integration system to run the tests for
    now.
@ -204,7 +194,8 @@ appropriately to indicate the type of change. Please use one of the following:
 The PR needs to meet the following code quality standards:

 - We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
- Pass all linter checks.
+- Pass all linter checks. Please use `pre-commit` to format your code. See
+  <https://pre-commit.com/#usage> if `pre-commit` is new to you.
 - The code needs to be well-documented to ensure future contributors can easily
  understand the code.
 - Include sufficient tests to ensure the project stays correct and robust. This
--- a/docs/contributing/benchmarks.md
+++ b/docs/contributing/benchmarks.md
@ -156,6 +156,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```bash
 vllm bench serve \
  --backend openai-chat \
+  --endpoint-type openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
@ -229,6 +230,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```bash
 vllm bench serve \
  --backend openai-chat \
+  --endpoint-type openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
@ -243,6 +245,7 @@ vllm bench serve \
 ```bash
 vllm bench serve \
  --backend openai-chat \
+  --endpoint-type openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@ -10,12 +10,12 @@ vLLM currently supports the following reasoning models:

 | Model Series | Parser Name | Structured Output Support | Tool Calling |
 |--------------|-------------|------------------|-------------|
-| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
-| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
+| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ |
+| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ |
 | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
-| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ |
-| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ |
-| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
+| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ |
+| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ |
+| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `guided_json`, `guided_regex` | ✅ |

 !!! note
    IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
--- a/docs/features/structured_outputs.md
+++ b/docs/features/structured_outputs.md
@ -12,23 +12,23 @@ You can generate structured outputs using the OpenAI's [Completions](https://pla

 The following parameters are supported, which must be added as extra parameters:

- `choice`: the output will be exactly one of the choices.
- `regex`: the output will follow the regex pattern.
- `json`: the output will follow the JSON schema.
- `grammar`: the output will follow the context free grammar.
+- `guided_choice`: the output will be exactly one of the choices.
+- `guided_regex`: the output will follow the regex pattern.
+- `guided_json`: the output will follow the JSON schema.
+- `guided_grammar`: the output will follow the context free grammar.
 - `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text.

 You can see the complete list of supported parameters on the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) page.

 Structured outputs are supported by default in the OpenAI-Compatible Server. You
 may choose to specify the backend to use by setting the
-`--structured-outputs-config.backend` flag to `vllm serve`. The default backend is `auto`,
+`--guided-decoding-backend` flag to `vllm serve`. The default backend is `auto`,
 which will try to choose an appropriate backend based on the details of the
 request. You may also choose a specific backend, along with
 some options. A full set of options is available in the `vllm serve --help`
 text.

-Now let´s see an example for each of the cases, starting with the `choice`, as it´s the easiest one:
+Now let's see an example for each of the cases, starting with the `guided_choice`, as it's the easiest one:

 ??? code

@ -45,12 +45,12 @@ Now let´s see an example for each of the cases, starting with the `choice`, as
        messages=[
            {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
        ],
-        extra_body={"structured_outputs": {"choice": ["positive", "negative"]}},
+        extra_body={"guided_choice": ["positive", "negative"]},
    )
    print(completion.choices[0].message.content)
    ```

-The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template:
+The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:

 ??? code

@ -63,18 +63,18 @@ The next example shows how to use the `regex`. The idea is to generate an email
                "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
            }
        ],
-        extra_body={"structured_outputs": {"regex": r"\w+@\w+\.com\n"}, "stop": ["\n"]},
+        extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
    )
    print(completion.choices[0].message.content)
    ```

 One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
-For this we can use the `json` parameter in two different ways:
+For this we can use the `guided_json` parameter in two different ways:

 - Using directly a [JSON Schema](https://json-schema.org/)
 - Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option).

-The next example shows how to use the `response_format` parameter with a Pydantic model:
+The next example shows how to use the `guided_json` parameter with a Pydantic model:

 ??? code

@ -119,7 +119,7 @@ The next example shows how to use the `response_format` parameter with a Pydanti
    JSON schema and how the fields should be populated. This can improve the
    results notably in most cases.

-Finally we have the `grammar` option, which is probably the most
+Finally we have the `guided_grammar` option, which is probably the most
 difficult to use, but it´s really powerful. It allows us to define complete
 languages like SQL queries. It works by using a context free EBNF grammar.
 As an example, we can use to define a specific format of simplified SQL queries:
@ -149,7 +149,7 @@ As an example, we can use to define a specific format of simplified SQL queries:
                "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
            }
        ],
-        extra_body={"structured_outputs": {"grammar": simplified_sql_grammar}},
+        extra_body={"guided_grammar": simplified_sql_grammar},
    )
    print(completion.choices[0].message.content)
    ```
@ -292,8 +292,8 @@ An example of using `structural_tag` can be found here: <gh-file:examples/online
 ## Offline Inference

 Offline inference allows for the same types of structured outputs.
-To use it, we´ll need to configure the structured outputs using the class `StructuredOutputsParams` inside `SamplingParams`.
-The main available options inside `StructuredOutputsParams` are:
+To use it, we'll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`.
+The main available options inside `GuidedDecodingParams` are:

 - `json`
 - `regex`
@ -309,12 +309,12 @@ shown below:

    ```python
    from vllm import LLM, SamplingParams
-    from vllm.sampling_params import StructuredOutputsParams
+    from vllm.sampling_params import GuidedDecodingParams

    llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")

-    structured_outputs_params = StructuredOutputsParams(choice=["Positive", "Negative"])
-    sampling_params = SamplingParams(structured_outputs=structured_outputs_params)
+    guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
+    sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
    outputs = llm.generate(
        prompts="Classify this sentiment: vLLM is wonderful!",
        sampling_params=sampling_params,
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@ -71,7 +71,7 @@ This example demonstrates:
 * Making a request with `tool_choice="auto"`
 * Handling the structured response and executing the corresponding function

-You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the structured outputs backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
+You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.

 Remember that it's the caller's responsibility to:

@ -83,18 +83,19 @@ For more advanced usage, including parallel tool calls and different model-speci

 ## Named Function Calling

-vLLM supports named function calling in the chat completion API by default. This should work with most structured outputs backends supported by vLLM. You are guaranteed a validly-parsable function call - not a
+vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is
+enabled by default and will work with any supported model. You are guaranteed a validly-parsable function call - not a
 high-quality one.

-vLLM will use structured outputs to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
-For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the structured outputs backend.
+vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
+For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend.

 To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
 specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.

 ## Required Function Calling

-vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses structured outputs, so this is enabled by default and will work with any supported model. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
+vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The guided decoding features for `tool_choice='required'` (such as JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends is on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.

 When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter.

--- a/docs/getting_started/installation/python_env_setup.inc.md
+++ b/docs/getting_started/installation/python_env_setup.inc.md
@ -1,4 +1,4 @@
-It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands:
+It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:

 ```bash
 uv venv --python 3.12 --seed
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@ -554,17 +554,6 @@ If your model is not in the above list, we will try to automatically convert the
    For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
    e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.

-#### Token Classification
-
-These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) API.
-
-| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
-|--------------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------|
-| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. |  |  | ✅︎ |
-
-!!! note
-    Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py>, <gh-file:examples/online_serving/pooling/ner.py>.
-
 [](){ #supported-mm-models }

 ## List of Multimodal Language Models
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@ -133,7 +133,7 @@ completion = client.chat.completions.create(
        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
    ],
    extra_body={
-        "structured_outputs": {"choice": ["positive", "negative"]}
+        "guided_choice": ["positive", "negative"]
    }
 )
 ```
@ -317,11 +317,10 @@ Full example: <gh-file:examples/online_serving/pooling/openai_chat_embedding_cli

 #### Extra parameters

-The following [pooling parameters][vllm.PoolingParams] are supported.
+The following [pooling parameters][pooling-params] are supported.

 ```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:embedding-pooling-params"
+--8<-- "vllm/entrypoints/openai/protocol.py:embedding-pooling-params"
 ```

 The following extra parameters are supported by default:
@ -375,7 +374,7 @@ The following extra parameters are supported:
    ```python
    --8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
    ```
-
+  
 [](){ #translations-api }

 ### Translations API
@ -528,11 +527,10 @@ curl -v "http://127.0.0.1:8000/classify" \

 #### Extra parameters

-The following [pooling parameters][vllm.PoolingParams] are supported.
+The following [pooling parameters][pooling-params] are supported.

 ```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:classification-pooling-params"
+--8<-- "vllm/entrypoints/openai/protocol.py:classification-pooling-params"
 ```

 The following extra parameters are supported:
@ -735,11 +733,10 @@ Full example: <gh-file:examples/online_serving/openai_cross_encoder_score_for_mu

 #### Extra parameters

-The following [pooling parameters][vllm.PoolingParams] are supported.
+The following [pooling parameters][pooling-params] are supported.

 ```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:classification-pooling-params"
+--8<-- "vllm/entrypoints/openai/protocol.py:score-pooling-params"
 ```

 The following extra parameters are supported:
@ -818,11 +815,10 @@ Result documents will be sorted by relevance, and the `index` property can be us

 #### Extra parameters

-The following [pooling parameters][vllm.PoolingParams] are supported.
+The following [pooling parameters][pooling-params] are supported.

 ```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:classification-pooling-params"
+--8<-- "vllm/entrypoints/openai/protocol.py:rerank-pooling-params"
 ```

 The following extra parameters are supported:
--- a/docs/usage/faq.md
+++ b/docs/usage/faq.md
@ -33,3 +33,52 @@ different tokens being sampled. Once a different token is sampled, further diver
 - For improved stability and reduced variance, use `float32`. Note that this will require more memory.
 - If using `bfloat16`, switching to `float16` can also help.
 - Using request seeds can aid in achieving more stable generation for temperature > 0, but discrepancies due to precision differences may still occur.
+
+---
+
+> Q: How do you load weights from CPU?
+
+A: vLLM supports loading model weights from CPU using the `pt_load_map_location` parameter. This parameter controls where PyTorch checkpoints are loaded to and is especially useful when:
+
+- You have model weights stored on CPU and want to load them directly
+- You need to manage memory usage by loading weights to CPU first
+- You want to load from specific device mappings
+
+## Usage Examples
+
+### Command Line Interface
+
+```bash
+# Load weights from CPU
+vllm serve meta-llama/Llama-2-7b-hf --pt-load-map-location cpu
+
+# Load from specific device mapping (e.g., CUDA device 1 to device 0)
+vllm serve meta-llama/Llama-2-7b-hf --pt-load-map-location '{"cuda:1": "cuda:0"}'
+```
+
+### Python API
+
+```python
+from vllm import LLM
+
+# Load weights from CPU
+llm = LLM(
+    model="meta-llama/Llama-2-7b-hf",
+    pt_load_map_location="cpu"
+)
+
+# Load with device mapping
+llm = LLM(
+    model="meta-llama/Llama-2-7b-hf", 
+    pt_load_map_location={"cuda:1": "cuda:0"}
+)
+```
+
+The `pt_load_map_location` parameter accepts the same values as PyTorch's [`torch.load(map_location=...)`](https://pytorch.org/docs/stable/generated/torch.load.html) parameter:
+
+- `"cpu"` - Load all weights to CPU
+- `"cuda"` - Load all weights to CUDA (equivalent to `{"": "cuda"}`)
+- `{"cuda:1": "cuda:0"}` - Map weights from CUDA device 1 to device 0
+- Custom device mappings as needed
+
+Note: This parameter defaults to `"cpu"` and primarily affects PyTorch `.pt`/`.bin` checkpoint files. For optimal performance on GPU inference, weights will be moved to the target device after loading.
--- a/examples/offline_inference/basic/README.md
+++ b/examples/offline_inference/basic/README.md
@ -78,3 +78,23 @@ Try it yourself with the following arguments:
 ```bash
 --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
 ```
+
+### CPU weight loading
+
+The `cpu_weight_loading.py` example demonstrates how to control which device PyTorch checkpoint weights are mapped to while they are being loaded, using the `pt_load_map_location` parameter. This is particularly useful for memory management and when working with PyTorch checkpoint files.
+
+Try it yourself:
+
+```bash
+python examples/offline_inference/basic/cpu_weight_loading.py
+```
+
+You can also use this parameter with other scripts that support argument parsing:
+
+```bash
+# Load weights from CPU (default behavior)
+python examples/offline_inference/basic/chat.py --pt-load-map-location cpu
+
+# Use custom device mapping (example syntax)
+python examples/offline_inference/basic/generate.py --pt-load-map-location '{"": "cpu"}'
+```
--- a/examples/offline_inference/basic/cpu_weight_loading.py
+++ b/examples/offline_inference/basic/cpu_weight_loading.py
@ -0,0 +1,71 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Example demonstrating how to load model weights from CPU using pt_load_map_location.
+
+This is useful when:
+- You want to explicitly load PyTorch checkpoints from CPU
+- You need to manage memory usage during model initialization
+- You want to map weights from one device to another
+
+The pt_load_map_location parameter works the same as PyTorch's torch.load(map_location=...)
+and defaults to "cpu" for most efficient loading.
+"""
+
+from vllm import LLM, SamplingParams
+
+# Sample prompts.
+prompts = [
+    "The advantages of loading weights from CPU include",
+    "When should you use CPU weight loading?",
+    "Memory management in machine learning is important because",
+]
+
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)
+
+
+def main():
+    # Example 1: Pass pt_load_map_location="cpu" explicitly (same as the default)
+    print("=== Example 1: Loading weights from CPU ===")
+    llm = LLM(
+        model="facebook/opt-125m",
+        pt_load_map_location="cpu"  # Explicitly specify CPU loading
+    )
+    
+    outputs = llm.generate(prompts[:1], sampling_params)
+    for output in outputs:
+        print(f"Prompt: {output.prompt}")
+        print(f"Output: {output.outputs[0].text}")
+    
+    # Example 2: Dict form of pt_load_map_location (here it maps everything to CPU)
+    print("\n=== Example 2: Device mapping example ===")
+    # Note: Only the dict syntax is shown here; a cross-device mapping such as
+    # {"cuda:1": "cuda:0"} would additionally require multiple CUDA devices
+    try:
+        llm_mapped = LLM(
+            model="facebook/opt-125m",
+            pt_load_map_location={"": "cpu"}  # Alternative syntax for CPU
+        )
+        
+        outputs = llm_mapped.generate(prompts[1:2], sampling_params)
+        for output in outputs:
+            print(f"Prompt: {output.prompt}")
+            print(f"Output: {output.outputs[0].text}")
+            
+    except Exception as e:
+        print(f"Device mapping example failed (this is normal if no CUDA available): {e}")
+    
+    # Example 3: Omit the argument entirely and rely on the default ("cpu")
+    print("\n=== Example 3: Default behavior (CPU loading) ===")
+    llm_default = LLM(model="facebook/opt-125m")  # Uses CPU loading by default
+    
+    outputs = llm_default.generate(prompts[2:3], sampling_params)
+    for output in outputs:
+        print(f"Prompt: {output.prompt}")
+        print(f"Output: {output.outputs[0].text}")
+
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/pooling/README.md
+++ b/examples/offline_inference/pooling/README.md
@ -26,14 +26,8 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
 python examples/offline_inference/pooling/embed_matryoshka_fy.py
 ```

-## Named Entity Recognition (NER) usage
-
-```bash
-python examples/offline_inference/pooling/ner.py
-```
-
 ## Qwen3 reranker usage

 ```bash
-python examples/offline_inference/pooling/qwen3_reranker.py
+python qwen3_reranker.py
 ```
--- a/examples/offline_inference/pooling/ner.py
+++ b/examples/offline_inference/pooling/ner.py
@ -1,54 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER
-
-from argparse import Namespace
-
-from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
-
-
-def parse_args():
-    parser = FlexibleArgumentParser()
-    parser = EngineArgs.add_cli_args(parser)
-    # Set example specific arguments
-    parser.set_defaults(
-        model="boltuix/NeuroBERT-NER",
-        runner="pooling",
-        enforce_eager=True,
-        trust_remote_code=True,
-    )
-    return parser.parse_args()
-
-
-def main(args: Namespace):
-    # Sample prompts.
-    prompts = [
-        "Barack Obama visited Microsoft headquarters in Seattle on January 2025."
-    ]
-
-    # Create an LLM.
-    llm = LLM(**vars(args))
-    tokenizer = llm.get_tokenizer()
-    label_map = llm.llm_engine.vllm_config.model_config.hf_config.id2label
-
-    # Run inference
-    outputs = llm.encode(prompts)
-
-    for prompt, output in zip(prompts, outputs):
-        logits = output.outputs.data
-        predictions = logits.argmax(dim=-1)
-
-        # Map predictions to labels
-        tokens = tokenizer.convert_ids_to_tokens(output.prompt_token_ids)
-        labels = [label_map[p.item()] for p in predictions]
-
-        # Print results
-        for token, label in zip(tokens, labels):
-            if token not in tokenizer.all_special_tokens:
-                print(f"{token:15} → {label}")
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args)
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@ -53,6 +53,7 @@ def parse_args():
        "--method",
        type=str,
        default="eagle",
+        choices=["ngram", "eagle", "eagle3", "mtp"],
    )
    parser.add_argument("--num-spec-tokens", type=int, default=2)
    parser.add_argument("--prompt-lookup-max", type=int, default=5)
@ -117,11 +118,6 @@ def main():
            "prompt_lookup_max": args.prompt_lookup_max,
            "prompt_lookup_min": args.prompt_lookup_min,
        }
-    elif args.method.endswith("mtp"):
-        speculative_config = {
-            "method": args.method,
-            "num_speculative_tokens": args.num_spec_tokens,
-        }
    else:
        raise ValueError(f"unknown method: {args.method}")

--- a/examples/offline_inference/structured_outputs.py
+++ b/examples/offline_inference/structured_outputs.py
@ -1,10 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-This file demonstrates the example usage of structured outputs
-in vLLM. It shows how to apply different constraints such as choice,
-regex, json schema, and grammar to produce structured and formatted
-results based on specific prompts.
+This file demonstrates the example usage of guided decoding
+to generate structured outputs using vLLM. It shows how to apply
+different guided decoding techniques such as Choice, Regex, JSON schema,
+and Grammar to produce structured and formatted results
+based on specific prompts.
 """

 from enum import Enum
@ -12,23 +13,19 @@ from enum import Enum
 from pydantic import BaseModel

 from vllm import LLM, SamplingParams
-from vllm.sampling_params import StructuredOutputsParams
+from vllm.sampling_params import GuidedDecodingParams

 MAX_TOKENS = 50

-# Structured outputs by Choice (list of possible options)
-structured_outputs_params_choice = StructuredOutputsParams(
-    choice=["Positive", "Negative"]
-)
-sampling_params_choice = SamplingParams(
-    structured_outputs=structured_outputs_params_choice
-)
+# Guided decoding by Choice (list of possible options)
+guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"])
+sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice)
 prompt_choice = "Classify this sentiment: vLLM is wonderful!"

-# Structured outputs by Regex
-structured_outputs_params_regex = StructuredOutputsParams(regex=r"\w+@\w+\.com\n")
+# Guided decoding by Regex
+guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
 sampling_params_regex = SamplingParams(
-    structured_outputs=structured_outputs_params_regex,
+    guided_decoding=guided_decoding_params_regex,
    stop=["\n"],
    max_tokens=MAX_TOKENS,
 )
@ -39,7 +36,7 @@ prompt_regex = (
 )


-# Structured outputs by JSON using Pydantic schema
+# Guided decoding by JSON using Pydantic schema
 class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
@ -54,16 +51,17 @@ class CarDescription(BaseModel):


 json_schema = CarDescription.model_json_schema()
-structured_outputs_params_json = StructuredOutputsParams(json=json_schema)
+guided_decoding_params_json = GuidedDecodingParams(json=json_schema)
 sampling_params_json = SamplingParams(
-    structured_outputs=structured_outputs_params_json, max_tokens=MAX_TOKENS
+    guided_decoding=guided_decoding_params_json,
+    max_tokens=MAX_TOKENS,
 )
 prompt_json = (
     "Generate a JSON with the brand, model and car_type of "
     "the most iconic car from the 90's"
 )

-# Structured outputs by Grammar
+# Guided decoding by Grammar
 simplified_sql_grammar = """
 root ::= select_statement
 select_statement ::= "SELECT " column " from " table " where " condition
@ -72,15 +70,13 @@ table ::= "table_1 " | "table_2 "
 condition ::= column "= " number
 number ::= "1 " | "2 "
 """
-structured_outputs_params_grammar = StructuredOutputsParams(
-    grammar=simplified_sql_grammar
-)
+guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar)
 sampling_params_grammar = SamplingParams(
-    structured_outputs=structured_outputs_params_grammar,
+    guided_decoding=guided_decoding_params_grammar,
    max_tokens=MAX_TOKENS,
 )
 prompt_grammar = (
     "Generate an SQL query to show the 'username' and 'email' from the 'users' table."
 )


@ -97,16 +93,16 @@ def main():
    llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)

    choice_output = generate_output(prompt_choice, sampling_params_choice, llm)
-    format_output("Structured outputs by Choice", choice_output)
+    format_output("Guided decoding by Choice", choice_output)

    regex_output = generate_output(prompt_regex, sampling_params_regex, llm)
-    format_output("Structured outputs by Regex", regex_output)
+    format_output("Guided decoding by Regex", regex_output)

    json_output = generate_output(prompt_json, sampling_params_json, llm)
-    format_output("Structured outputs by JSON", json_output)
+    format_output("Guided decoding by JSON", json_output)

    grammar_output = generate_output(prompt_grammar, sampling_params_grammar, llm)
-    format_output("Structured outputs by Grammar", grammar_output)
+    format_output("Guided decoding by Grammar", grammar_output)


 if __name__ == "__main__":
--- a/examples/online_serving/multi_instance_data_parallel.py
+++ b/examples/online_serving/multi_instance_data_parallel.py
@ -1,15 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
-import threading
 from typing import Optional

-from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
-from vllm.v1.metrics.loggers import AggregatedStatLogger, LoggingStatLogger

 """
 To run this example, run the following commands simultaneously with
@ -25,67 +22,37 @@ send a request to the instance with DP rank 1.
 """


-def _do_background_logging(engine, interval, stop_event):
-    try:
-        while not stop_event.is_set():
-            asyncio.run(engine.do_log_stats())
-            stop_event.wait(interval)
-    except Exception as e:
-        print(f"vLLM background logging shutdown: {e}")
-        pass
-
-
 async def main():
    engine_args = AsyncEngineArgs(
        model="ibm-research/PowerMoE-3b",
        data_parallel_size=2,
-        tensor_parallel_size=1,
        dtype="auto",
        max_model_len=2048,
        data_parallel_address="127.0.0.1",
        data_parallel_rpc_port=62300,
        data_parallel_size_local=1,
        enforce_eager=True,
-        enable_log_requests=True,
-        disable_custom_all_reduce=True,
    )

-    def per_engine_logger_factory(config: VllmConfig, rank: int) -> LoggingStatLogger:
-        return LoggingStatLogger(config, rank)
+    engine_client = AsyncLLMEngine.from_engine_args(engine_args)

-    engine_client = AsyncLLMEngine.from_engine_args(
-        engine_args,
-        # Example: Using both regular loggers and aggregated logger
-        stat_loggers=[per_engine_logger_factory, AggregatedStatLogger],
-    )
-    stop_logging_event = threading.Event()
-    logging_thread = threading.Thread(
-        target=_do_background_logging,
-        args=(engine_client, 5, stop_logging_event),
-        daemon=True,
-    )
-    logging_thread.start()
    sampling_params = SamplingParams(
        temperature=0.7,
        top_p=0.9,
        max_tokens=100,
    )
-    num_prompts = 10
-    for i in range(num_prompts):
-        prompt = "Who won the 2004 World Series?"
-        final_output: Optional[RequestOutput] = None
-        async for output in engine_client.generate(
-            prompt=prompt,
-            sampling_params=sampling_params,
-            request_id=f"abcdef-{i}",
-            data_parallel_rank=1,
-        ):
-            final_output = output
-        if final_output:
-            print(final_output.outputs[0].text)

-    stop_logging_event.set()
-    logging_thread.join()
+    prompt = "Who won the 2004 World Series?"
+    final_output: Optional[RequestOutput] = None
+    async for output in engine_client.generate(
+        prompt=prompt,
+        sampling_params=sampling_params,
+        request_id="abcdef",
+        data_parallel_rank=1,
+    ):
+        final_output = output
+    if final_output:
+        print(final_output.outputs[0].text)


 if __name__ == "__main__":
--- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
@ -6,7 +6,7 @@ without any specific flags:

 ```bash
 VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
-    --structured-outputs-config.backend outlines
+    --guided-decoding-backend outlines
 ```

 This example demonstrates how to generate chat completions
--- a/examples/online_serving/pooling/README.md
+++ b/examples/online_serving/pooling/README.md
@ -12,12 +12,6 @@ python examples/online_serving/pooling/cohere_rerank_client.py
 python examples/online_serving/pooling/jinaai_rerank_client.py
 ```

-## Named Entity Recognition (NER) usage
-
-```bash
-python examples/online_serving/pooling/ner.py
-```
-
 ## Openai chat embedding for multimodal usage

 ```bash
--- a/examples/online_serving/pooling/ner.py
+++ b/examples/online_serving/pooling/ner.py
@ -1,71 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER
-
-"""
-Example online usage of Pooling API for Named Entity Recognition (NER).
-
-Run `vllm serve <model> --runner pooling`
-to start up the server in vLLM. e.g.
-
-vllm serve boltuix/NeuroBERT-NER
-"""
-
-import argparse
-
-import requests
-import torch
-
-
-def post_http_request(prompt: dict, api_url: str) -> requests.Response:
-    headers = {"User-Agent": "Test Client"}
-    response = requests.post(api_url, headers=headers, json=prompt)
-    return response
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--host", type=str, default="localhost")
-    parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument("--model", type=str, default="boltuix/NeuroBERT-NER")
-
-    return parser.parse_args()
-
-
-def main(args):
-    from transformers import AutoConfig, AutoTokenizer
-
-    api_url = f"http://{args.host}:{args.port}/pooling"
-    model_name = args.model
-
-    # Load tokenizer and config
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    config = AutoConfig.from_pretrained(model_name)
-    label_map = config.id2label
-
-    # Input text
-    text = "Barack Obama visited Microsoft headquarters in Seattle on January 2025."
-    prompt = {"model": model_name, "input": text}
-
-    pooling_response = post_http_request(prompt=prompt, api_url=api_url)
-
-    # Run inference
-    output = pooling_response.json()["data"][0]
-    logits = torch.tensor(output["data"])
-    predictions = logits.argmax(dim=-1)
-    inputs = tokenizer(text, return_tensors="pt")
-
-    # Map predictions to labels
-    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
-    labels = [label_map[p.item()] for p in predictions]
-    assert len(tokens) == len(predictions)
-
-    # Print results
-    for token, label in zip(tokens, labels):
-        if token not in tokenizer.all_special_tokens:
-            print(f"{token:15} → {label}")
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args)
--- a/examples/online_serving/structured_outputs/structured_outputs.py
+++ b/examples/online_serving/structured_outputs/structured_outputs.py
@ -86,7 +86,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
                "content": "Classify this sentiment: vLLM is wonderful!",
            }
        ],
-        "extra_body": {"structured_outputs": {"choice": ["positive", "negative"]}},
+        "extra_body": {"guided_choice": ["positive", "negative"]},
    },
    "regex": {
        "messages": [
@ -96,7 +96,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
            }
        ],
        "extra_body": {
-            "structured_outputs": {"regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n"},
+            "guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n",
        },
    },
    "json": {
@ -122,8 +122,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
            }
        ],
        "extra_body": {
-            "structured_outputs": {
-                "grammar": """
+            "guided_grammar": """
 root ::= select_statement

 select_statement ::= "SELECT " column " from " table " where " condition
@ -136,7 +135,6 @@ condition ::= column "= " number

 number ::= "1 " | "2 "
 """,
-            }
        },
    },
    "structural_tag": {
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@ -79,7 +79,6 @@ plugins:
        - "re:vllm\\._.*"  # Internal modules
        - "vllm.third_party"
        - "vllm.vllm_flash_attn"
-        - !ENV [API_AUTONAV_EXCLUDE, "re:^$"]  # Match nothing by default
  - mkdocstrings:
      handlers:
        python:
--- a/tests/async_engine/init.py
+++ b/tests/async_engine/init.py
--- a/tests/async_engine/api_server_async_engine.py
+++ b/tests/async_engine/api_server_async_engine.py
@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""vllm.entrypoints.api_server with some extra logging for testing."""
+from collections.abc import Iterable
+from typing import Any
+
+import uvicorn
+from fastapi.responses import JSONResponse, Response
+
+import vllm.entrypoints.api_server
+import vllm.envs as envs
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.utils import FlexibleArgumentParser
+
+app = vllm.entrypoints.api_server.app
+
+
+class AsyncLLMEngineWithStats(AsyncLLMEngine):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._num_aborts = 0
+
+    async def _engine_abort(self, request_ids: Iterable[str]):
+        ids = list(request_ids)
+        self._num_aborts += len(ids)
+        await super()._engine_abort(ids)
+
+    def testing_stats(self) -> dict[str, Any]:
+        return {"num_aborted_requests": self._num_aborts}
+
+
+@app.get("/stats")
+def stats() -> Response:
+    """Get the statistics of the engine."""
+    return JSONResponse(engine.testing_stats())
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser = AsyncEngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+
+    engine_args = AsyncEngineArgs.from_cli_args(args)
+    engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
+    vllm.entrypoints.api_server.engine = engine
+    uvicorn.run(app,
+                host=args.host,
+                port=args.port,
+                log_level="debug",
+                timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE)
--- a/tests/async_engine/conftest.py
+++ b/tests/async_engine/conftest.py
@ -0,0 +1,12 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@ -0,0 +1,139 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import copyreg
+import os
+import subprocess
+import sys
+import time
+from multiprocessing import Pool
+from pathlib import Path
+
+import pytest
+import requests
+import urllib3.exceptions
+
+
+def _pickle_new_connection_error(obj):
+    """Custom pickler for NewConnectionError to fix tblib compatibility."""
+    # Extract the original message by removing the "conn: " prefix
+    full_message = obj.args[0] if obj.args else ""
+    if ': ' in full_message:
+        # Split off the connection part and keep the actual message
+        _, actual_message = full_message.split(': ', 1)
+    else:
+        actual_message = full_message
+    return _unpickle_new_connection_error, (actual_message, )
+
+
+def _unpickle_new_connection_error(message):
+    """Custom unpickler for NewConnectionError."""
+    # Create with None as conn and the actual message
+    return urllib3.exceptions.NewConnectionError(None, message)
+
+
+# Register the custom pickle/unpickle functions for tblib compatibility
+copyreg.pickle(urllib3.exceptions.NewConnectionError,
+               _pickle_new_connection_error)
+
+
+def _query_server(prompt: str, max_tokens: int = 5) -> dict:
+    response = requests.post("http://localhost:8000/generate",
+                             json={
+                                 "prompt": prompt,
+                                 "max_tokens": max_tokens,
+                                 "temperature": 0,
+                                 "ignore_eos": True
+                             })
+    response.raise_for_status()
+    return response.json()
+
+
+def _query_server_long(prompt: str) -> dict:
+    return _query_server(prompt, max_tokens=500)
+
+
+@pytest.fixture
+def api_server(distributed_executor_backend: str):
+    script_path = Path(__file__).parent.joinpath(
+        "api_server_async_engine.py").absolute()
+    commands = [
+        sys.executable,
+        "-u",
+        str(script_path),
+        "--model",
+        "facebook/opt-125m",
+        "--host",
+        "127.0.0.1",
+        "--distributed-executor-backend",
+        distributed_executor_backend,
+    ]
+
+    # API Server Test Requires V0.
+    my_env = os.environ.copy()
+    my_env["VLLM_USE_V1"] = "0"
+    uvicorn_process = subprocess.Popen(commands, env=my_env)
+    yield
+    uvicorn_process.terminate()
+
+
+@pytest.mark.timeout(300)
+@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
+def test_api_server(api_server, distributed_executor_backend: str):
+    """
+    Run the API server and test it.
+
+    We run both the server and requests in separate processes.
+
+    We test that the server can handle incoming requests, including
+    multiple requests at the same time, and that it can handle requests
+    being cancelled without crashing.
+    """
+    with Pool(32) as pool:
+        # Wait until the server is ready
+        prompts = ["warm up"] * 1
+        result = None
+        while not result:
+            try:
+                for r in pool.map(_query_server, prompts):
+                    result = r
+                    break
+            except requests.exceptions.ConnectionError:
+                time.sleep(1)
+
+        # Actual tests start here
+        # Try with 1 prompt
+        for result in pool.map(_query_server, prompts):
+            assert result
+
+        num_aborted_requests = requests.get(
+            "http://localhost:8000/stats").json()["num_aborted_requests"]
+        assert num_aborted_requests == 0
+
+        # Try with 100 prompts
+        prompts = ["test prompt"] * 100
+        for result in pool.map(_query_server, prompts):
+            assert result
+
+    with Pool(32) as pool:
+        # Cancel requests
+        prompts = ["canceled requests"] * 100
+        pool.map_async(_query_server_long, prompts)
+        time.sleep(0.01)
+        pool.terminate()
+        pool.join()
+
+        # check cancellation stats
+        # give it some time to update the stats
+        time.sleep(1)
+
+        num_aborted_requests = requests.get(
+            "http://localhost:8000/stats").json()["num_aborted_requests"]
+        assert num_aborted_requests > 0
+
+    # check that server still runs after cancellations
+    with Pool(32) as pool:
+        # Try with 100 prompts
+        prompts = ["test prompt after canceled"] * 100
+        for result in pool.map(_query_server, prompts):
+            assert result
--- a/tests/async_engine/test_request_tracker.py
+++ b/tests/async_engine/test_request_tracker.py
@ -0,0 +1,71 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm.engine.async_llm_engine import RequestTracker
+from vllm.outputs import RequestOutput
+
+
+@pytest.mark.asyncio
+async def test_request_tracker():
+    tracker = RequestTracker()
+    stream_1 = tracker.add_request("1")
+    assert tracker.new_requests_event.is_set()
+    await tracker.wait_for_new_requests()
+    new, aborted = tracker.get_new_and_aborted_requests()
+    assert not tracker.new_requests_event.is_set()
+    assert len(new) == 1
+    assert new[0]["request_id"] == "1"
+    assert not aborted
+    assert not stream_1.finished
+
+    stream_2 = tracker.add_request("2")
+    stream_3 = tracker.add_request("3")
+    assert tracker.new_requests_event.is_set()
+    await tracker.wait_for_new_requests()
+    new, aborted = tracker.get_new_and_aborted_requests()
+    assert not tracker.new_requests_event.is_set()
+    assert len(new) == 2
+    assert new[0]["request_id"] == "2"
+    assert new[1]["request_id"] == "3"
+    assert not aborted
+    assert not stream_2.finished
+    assert not stream_3.finished
+
+    # request_ids must be unique
+    with pytest.raises(KeyError):
+        tracker.add_request("1")
+    assert not tracker.new_requests_event.is_set()
+
+    tracker.abort_request("1")
+    new, aborted = tracker.get_new_and_aborted_requests()
+    assert len(aborted) == 1
+    assert "1" in aborted
+    assert not new
+    assert stream_1.finished
+
+    stream_4 = tracker.add_request("4")
+    tracker.abort_request("4")
+    assert tracker.new_requests_event.is_set()
+    await tracker.wait_for_new_requests()
+    new, aborted = tracker.get_new_and_aborted_requests()
+    # aborted new requests will cancel each other out -
+    # there's no need for them to propagate into the
+    # engine
+    assert not aborted
+    assert not new
+    assert stream_4.finished
+
+    stream_5 = tracker.add_request("5")
+    assert tracker.new_requests_event.is_set()
+    tracker.process_request_output(
+        RequestOutput("2", "output", [], [], [], finished=True))
+    await tracker.wait_for_new_requests()
+    new, aborted = tracker.get_new_and_aborted_requests()
+    assert not tracker.new_requests_event.is_set()
+    assert not aborted
+    assert len(new) == 1
+    assert new[0]["request_id"] == "5"
+    assert stream_2.finished
+    assert not stream_5.finished
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@ -0,0 +1,189 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Compare the short outputs of HF and vLLM when using greedy sampling.
+
+VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
+
+Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
+pytest tests/basic_correctness/test_preemption.py`.
+"""
+import pytest
+from prometheus_client import REGISTRY
+
+import vllm.envs as envs
+from vllm import SamplingParams
+from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
+                                 ENABLE_ARTIFICIAL_PREEMPT)
+
+from ..models.utils import check_outputs_equal
+
+MODELS = [
+    "distilbert/distilgpt2",
+]
+
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    We should enable this for V1, but these tests depend on
+    VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT, so use VLLM_USE_V1=0 file-wide.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
+@pytest.fixture(scope="module", autouse=True)
+def check_settings():
+    assert ENABLE_ARTIFICIAL_PREEMPT is True, (
+        "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1."
+        "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 "
+        "pytest tests/basic_correctness/test_preemption.py`")
+
+
+@pytest.fixture
+def distributed_executor_backend() -> str:
+    # When SPMD worker is used, use distributed_executor_backend="ray"
+    # to test that delta input optimization works with preemption.
+    return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp"
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [96])
+@pytest.mark.parametrize("chunked_prefill_token_size", [16])
+def test_chunked_prefill_recompute(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    chunked_prefill_token_size: int,
+    distributed_executor_backend: str,
+) -> None:
+    """Ensure that chunked prefill works with preemption."""
+    max_num_seqs = min(chunked_prefill_token_size, 256)
+    enable_chunked_prefill = False
+    max_num_batched_tokens = None
+    if chunked_prefill_token_size != -1:
+        enable_chunked_prefill = True
+        max_num_batched_tokens = chunked_prefill_token_size
+
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            max_num_batched_tokens=max_num_batched_tokens,
+            enable_chunked_prefill=enable_chunked_prefill,
+            max_num_seqs=max_num_seqs,
+            distributed_executor_backend=distributed_executor_backend,
+            disable_log_stats=False,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
+
+    for i in range(len(example_prompts)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [96])
+def test_preemption(
+    caplog_vllm,
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    distributed_executor_backend: str,
+) -> None:
+    """By default, recompute preemption is enabled"""
+
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            disable_log_stats=False,
+            distributed_executor_backend=distributed_executor_backend,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
+        total_preemption = (
+            vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption)
+
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+
+    assert ("is preempted by PreemptionMode.RECOMPUTE mode because there "
+            "is not enough KV cache space." in caplog_vllm.text)
+    # Ensure the total recorded by the vllm:num_preemptions metric matches
+    # the scheduler's cumulative preemption count, as a simple sanity check
+    # that metrics are generated
+    preemption_metrics = None
+    for m in REGISTRY.collect():
+        if m.name == "vllm:num_preemptions":
+            preemption_metrics = m
+    assert preemption_metrics is not None
+    total_recorded_preemption = 0
+    for sample in preemption_metrics.samples:
+        total_recorded_preemption += sample.value
+    assert total_preemption == total_recorded_preemption
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [96])
+def test_preemption_infeasible(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    distributed_executor_backend: str,
+) -> None:
+    """Verify infeasible preemption request will be ignored."""
+    BLOCK_SIZE = 16
+    prefill_blocks = 2
+    decode_blocks = max_tokens // BLOCK_SIZE
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            block_size=BLOCK_SIZE,
+            # Not enough gpu blocks to complete a single sequence.
+            # preemption should happen, and the sequence should be
+            # ignored instead of hanging forever.
+            num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
+            max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
+            distributed_executor_backend=distributed_executor_backend,
+    ) as vllm_model:
+        sampling_params = SamplingParams(max_tokens=max_tokens,
+                                         ignore_eos=True)
+        req_outputs = vllm_model.llm.generate(
+            example_prompts,
+            sampling_params=sampling_params,
+        )
+
+        assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
+
+    # Verify the request is ignored and does not hang.
+    for req_output in req_outputs:
+        outputs = req_output.outputs
+        assert len(outputs) == 1
+        assert outputs[0].finish_reason == "length"
--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@ -68,7 +68,7 @@ def test_bench_serve_chat(server):
        "5",
        "--endpoint",
        "/v1/chat/completions",
-        "--backend",
+        "--endpoint-type",
        "openai-chat",
    ]
    result = subprocess.run(command, capture_output=True, text=True)
--- a/tests/detokenizer/conftest.py
+++ b/tests/detokenizer/conftest.py
@ -0,0 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
--- a/tests/detokenizer/test_stop_checker.py
+++ b/tests/detokenizer/test_stop_checker.py
@ -0,0 +1,83 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.inputs import token_inputs
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import Logprob, Sequence, SequenceStatus
+
+
+def sequence_with_eos(text: str, eos_token: str,
+                      eos_token_id: int) -> Sequence:
+    """
+    Create a Sequence that ends with an EOS token.
+    """
+    seq = Sequence(
+        seq_id=0,
+        inputs=token_inputs([]),
+        block_size=16,
+        eos_token_id=eos_token_id,
+    )
+    seq.output_text = text + eos_token
+
+    offset = eos_token_id + 1
+    for i in range(offset, len(text) + offset):
+        seq.append_token_id(token_id=i, logprobs={i: Logprob(0.0)})
+    seq.append_token_id(token_id=eos_token_id,
+                        logprobs={eos_token_id: Logprob(0.0)})
+
+    seq.status = SequenceStatus.RUNNING
+
+    return seq
+
+
+@pytest.mark.parametrize(["text_wo_eos", "eos_token", "eos_token_id"], [
+    ("This text ends with EOS token", "</s>", 2),
+])
+@pytest.mark.parametrize("ignore_eos", [True, False])
+@pytest.mark.parametrize("include_stop_str_in_output", [True, False])
+@pytest.mark.skip_global_cleanup
+def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int,
+                           ignore_eos: bool, include_stop_str_in_output: bool):
+    """
+    Test the behavior of the StopChecker's maybe_stop_sequence method
+    when an EOS token is encountered.
+
+    This test covers:
+    - When the EOS token should stop the sequence and be removed from the output
+    - When the EOS token should stop the sequence and be included in the output
+    - When the EOS token should be ignored, and the sequence continues
+    """
+
+    stop_checker = StopChecker(max_model_len=1024)
+
+    seq = sequence_with_eos(
+        text=text_wo_eos,
+        eos_token=eos_token,
+        eos_token_id=eos_token_id,
+    )
+    new_char_count = len(eos_token)
+
+    # Note that `stop` and `stop_token_ids` are not specified
+    sampling_params = SamplingParams(
+        min_tokens=1,
+        ignore_eos=ignore_eos,
+        include_stop_str_in_output=include_stop_str_in_output)
+
+    stop_checker.maybe_stop_sequence(
+        seq=seq,
+        new_char_count=new_char_count,
+        sampling_params=sampling_params,
+    )
+
+    if ignore_eos:
+        assert seq.status == SequenceStatus.RUNNING
+        assert seq.output_text == text_wo_eos + eos_token
+    elif include_stop_str_in_output:
+        assert seq.status == SequenceStatus.FINISHED_STOPPED
+        assert seq.output_text == text_wo_eos + eos_token
+    else:
+        assert seq.status == SequenceStatus.FINISHED_STOPPED
+        assert seq.output_text == text_wo_eos
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@ -26,10 +26,23 @@ logger = init_logger("test_pipeline_parallel")
 VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    For PP, we fall back to V0 by default. This means
+    that the TP baseline runs with V1 while the PP engine
+    runs with V0. This gives divergent results with dummy
+    weights. Once we enable V1 by default for PP, we can
+    remove this.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 class ParallelSetup(NamedTuple):
    tp_size: int
    pp_size: int
    eager_mode: bool
+    chunked_prefill: bool


 class PPTestOptions(NamedTuple):
@ -40,10 +53,23 @@ class PPTestOptions(NamedTuple):
@dataclass
 class PPTestSettings:
    parallel_setups: list[ParallelSetup]
+    # NOTE: the length of distributed_backends and
+    # vllm_major_versions should be the same, and they
+    # are first zipped together to iterate over all
+    # test settings.
    distributed_backends: list[str]
+    # vllm major version: "0" for V0, "1" for V1
+    vllm_major_versions: list[str]
    runner: RunnerOption
    test_options: PPTestOptions

+    def __post_init__(self):
+        if len(self.distributed_backends) != len(self.vllm_major_versions):
+            raise ValueError(
+                f"Length mismatch: distributed_backends "
+                f"({len(self.distributed_backends)}) != "
+                f"vllm_major_versions ({len(self.vllm_major_versions)})")
+
    @staticmethod
    def detailed(
        *,
@ -57,21 +83,27 @@ class PPTestSettings:
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
-                              eager_mode=False),
+                              eager_mode=False,
+                              chunked_prefill=False),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
-                              eager_mode=False),
+                              eager_mode=False,
+                              chunked_prefill=True),
                ParallelSetup(tp_size=tp_base,
                              pp_size=2 * pp_base,
-                              eager_mode=True),
+                              eager_mode=True,
+                              chunked_prefill=False),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
-                              eager_mode=False),
+                              eager_mode=False,
+                              chunked_prefill=True),
                ParallelSetup(tp_size=2 * tp_base,
                              pp_size=pp_base,
-                              eager_mode=True),
+                              eager_mode=True,
+                              chunked_prefill=False),
            ],
-            distributed_backends=["mp", "ray"],
+            distributed_backends=["mp", "mp", "ray", "ray"],
+            vllm_major_versions=["0", "1", "0", "1"],
            runner=runner,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       load_format=load_format),
@ -86,14 +118,17 @@ class PPTestSettings:
        multi_node_only: bool = False,
        load_format: Optional[str] = None,
    ):
+        vllm_major_versions = ["1"] if runner == "pooling" else ["0"]

        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
-                              eager_mode=True),
+                              eager_mode=True,
+                              chunked_prefill=False),
            ],
            distributed_backends=["mp"],
+            vllm_major_versions=vllm_major_versions,
            runner=runner,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       load_format=load_format),
@ -103,8 +138,10 @@ class PPTestSettings:
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
-            for backend in self.distributed_backends:
-                yield (model_id, parallel_setup, backend, self.runner, opts)
+            for backend, vllm_major_version in zip(self.distributed_backends,
+                                                   self.vllm_major_versions):
+                yield (model_id, parallel_setup, backend, vllm_major_version,
+                       self.runner, opts)


 # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
@ -232,6 +269,7 @@ def _compare_tp(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
+    vllm_major_version: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
@ -243,6 +281,7 @@ def _compare_tp(
        tp_size,
        pp_size,
        eager_mode,
+        chunked_prefill,
    ) = parallel_setup

    multi_node_only, load_format = test_options
@ -295,6 +334,8 @@ def _compare_tp(
        "--max-num-seqs",
        "8",
    ]
+    if chunked_prefill:
+        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
    if runner != "auto":
@ -312,10 +353,14 @@ def _compare_tp(
    if max_num_seqs:
        common_args.extend(["--max-num-seqs", f"{max_num_seqs}"])

-    if distributed_backend == "ray":
+    specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
+    testing_ray_compiled_graph = False
+    if distributed_backend == "ray" and (vllm_major_version == "1"
+                                         or specific_case):
        # For V1, test Ray Compiled Graph for all the tests
+        # For V0, test Ray Compiled Graph for a subset of the tests
        pp_env = {
-            "VLLM_USE_V1": "1",
+            "VLLM_USE_V1": vllm_major_version,
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
@ -323,15 +368,17 @@ def _compare_tp(
        # Temporary. Currently when zeromq + SPMD is used, it does not properly
        # terminate because of a Ray Compiled Graph issue.
        common_args.append("--disable-frontend-multiprocessing")
+        testing_ray_compiled_graph = True
    elif distributed_backend == "mp":
+        # Both V0/V1 of multiprocessing executor support PP
        pp_env = {
-            "VLLM_USE_V1": "1",
+            "VLLM_USE_V1": vllm_major_version,
        }
    else:
        pp_env = None

    tp_env = {
-        "VLLM_USE_V1": "1",
+        "VLLM_USE_V1": vllm_major_version,
    }

    pp_args = [
@ -357,17 +404,25 @@ def _compare_tp(
        "mp",
    ]

-    compare_two_settings(model_id,
-                         pp_args,
-                         tp_args,
-                         pp_env,
-                         tp_env,
-                         method=method)
+    try:
+        compare_two_settings(model_id,
+                             pp_args,
+                             tp_args,
+                             pp_env,
+                             tp_env,
+                             method=method)
+    except Exception:
+        if testing_ray_compiled_graph and vllm_major_version == "0":
+            # Ray Compiled Graph tests are flaky for V0,
+            # so we don't want to fail the test
+            logger.exception("Ray Compiled Graph tests failed")
+        else:
+            raise


@pytest.mark.parametrize(
-    ("model_id", "parallel_setup", "distributed_backend", "runner",
-     "test_options"),
+    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
+     "runner", "test_options"),
    [
        params for model_id, settings in TEXT_GENERATION_MODELS.items()
        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
@ -378,14 +433,15 @@ def test_tp_language_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
+    vllm_major_version: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available,
 ):
-    pytest.skip("Skipping the test until V1 passes it.")
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
+                vllm_major_version,
                runner,
                test_options,
                num_gpus_available,
@ -394,8 +450,8 @@ def test_tp_language_generation(


@pytest.mark.parametrize(
-    ("model_id", "parallel_setup", "distributed_backend", "runner",
-     "test_options"),
+    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
+     "runner", "test_options"),
    [
        params for model_id, settings in EMBEDDING_MODELS.items()
        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
@ -406,14 +462,15 @@ def test_tp_language_embedding(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
+    vllm_major_version: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available,
 ):
-    pytest.skip("Skipping the test until V1 passes it.")
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
+                vllm_major_version,
                runner,
                test_options,
                num_gpus_available,
@ -422,8 +479,8 @@ def test_tp_language_embedding(


@pytest.mark.parametrize(
-    ("model_id", "parallel_setup", "distributed_backend", "runner",
-     "test_options"),
+    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
+     "runner", "test_options"),
    [
        params for model_id, settings in MULTIMODAL_MODELS.items()
        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
@ -434,14 +491,15 @@ def test_tp_multimodal_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
+    vllm_major_version: str,
    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available,
 ):
-    pytest.skip("Skipping the test until V1 passes it.")
    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
+                vllm_major_version,
                runner,
                test_options,
                num_gpus_available,
--- a/tests/engine/conftest.py
+++ b/tests/engine/conftest.py
@ -0,0 +1,12 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Pin VLLM_USE_V1 to '0' for each test that picks up this fixture,
+    because the tests here are V0 only. The change is made through
+    monkeypatch, so it is reverted when the test finishes.
+    """
+    monkeypatch.setenv(name='VLLM_USE_V1', value='0')
--- a/tests/engine/test_computed_prefix_blocks.py
+++ b/tests/engine/test_computed_prefix_blocks.py
@ -0,0 +1,37 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.llm_engine import LLMEngine
+from vllm.sampling_params import SamplingParams
+
+
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
+@pytest.mark.parametrize("block_size", [16])
+def test_computed_prefix_blocks(model: str, block_size: int):
+    """Run the engine with prefix caching on and expect no assert to fire.
+
+    The scenario: request "0" uses the long prompt (prefix + tail) and is
+    stepped once; request "1" then arrives with only the prefix, so all
+    blocks of its prompt are full and already computed.
+    """
+    shared_prefix = (
+        "You are a helpful assistant. How do I build a car from cardboard and "
+        "paper clips? Is there an easy to follow video tutorial available "
+        "online for free?")
+    extra_tail = (
+        " Please recommend to me some resources where I can learn not only to "
+        "handle technical difficulties of building a car, but also "
+        "decoration.")
+
+    sampling_params = SamplingParams()
+    engine = LLMEngine.from_engine_args(
+        EngineArgs(model=model,
+                   block_size=block_size,
+                   enable_prefix_caching=True))
+
+    # First request: the full prompt.
+    engine.add_request("0", shared_prefix + extra_tail, sampling_params)
+    engine.step()
+    # Second request: only the part the first request already covered.
+    engine.add_request("1", shared_prefix, sampling_params)
+    engine.step()
--- a/tests/engine/test_executor.py
+++ b/tests/engine/test_executor.py
@ -0,0 +1,111 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import asyncio
+import os
+from typing import Any, Callable, Optional, Union
+
+import pytest
+
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.llm_engine import LLMEngine
+from vllm.executor.uniproc_executor import UniProcExecutor
+from vllm.sampling_params import SamplingParams
+
+
+class Mock:
+    """Empty class passed as a bogus executor backend in the tests below."""
+
+
+class CustomUniExecutor(UniProcExecutor):
+    """UniProcExecutor that leaves a ``.marker`` file behind on every RPC."""
+
+    def collective_rpc(self,
+                       method: Union[str, Callable],
+                       timeout: Optional[float] = None,
+                       args: tuple = (),
+                       kwargs: Optional[dict] = None) -> list[Any]:
+        """Create/truncate ``.marker`` in the cwd, then defer to the parent."""
+        # The file's existence is the evidence that this override was run.
+        with open(".marker", "w") as _marker_file:
+            pass
+        rpc_results = super().collective_rpc(method, timeout, args, kwargs)
+        return rpc_results
+
+
+# The async engine test uses the very same executor class under this name.
+CustomUniExecutorAsync = CustomUniExecutor
+
+
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
+def test_custom_executor_type_checking(model):
+    """Both engine flavours must reject ``Mock`` as an executor backend."""
+    engine_flavours = (
+        (EngineArgs, LLMEngine),
+        (AsyncEngineArgs, AsyncLLMEngine),
+    )
+    for args_cls, engine_cls in engine_flavours:
+        # The ValueError may come from building the args or the engine.
+        with pytest.raises(ValueError):
+            bad_args = args_cls(model=model,
+                                distributed_executor_backend=Mock)
+            engine_cls.from_engine_args(bad_args)
+
+
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
+def test_custom_executor(model, tmp_path):
+    """LLMEngine must route work through a user-supplied executor class.
+
+    CustomUniExecutor writes ``.marker`` into the current directory, so the
+    test runs from ``tmp_path`` and restores the previous cwd afterwards.
+    """
+    original_dir = os.path.abspath(".")
+    os.chdir(tmp_path)
+    try:
+        marker_before = os.path.exists(".marker")
+        assert not marker_before
+
+        engine = LLMEngine.from_engine_args(
+            EngineArgs(
+                model=model,
+                distributed_executor_backend=CustomUniExecutor,
+                enforce_eager=True,  # reduce test time
+            ))
+        one_token = SamplingParams(max_tokens=1)
+
+        engine.add_request("0", "foo", one_token)
+        engine.step()
+
+        marker_after = os.path.exists(".marker")
+        assert marker_after
+    finally:
+        os.chdir(original_dir)
+
+
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
+def test_custom_executor_async(model, tmp_path):
+    """AsyncLLMEngine must route work through a user-supplied executor class.
+
+    Runs from ``tmp_path`` because the executor writes ``.marker`` into the
+    current directory; the previous cwd is restored in ``finally``.
+    """
+    original_dir = os.path.abspath(".")
+    os.chdir(tmp_path)
+    try:
+        marker_before = os.path.exists(".marker")
+        assert not marker_before
+
+        engine = AsyncLLMEngine.from_engine_args(
+            AsyncEngineArgs(
+                model=model,
+                distributed_executor_backend=CustomUniExecutorAsync,
+                enforce_eager=True,  # reduce test time
+            ))
+        one_token = SamplingParams(max_tokens=1)
+
+        async def drain_request():
+            # Submit a single request and consume its stream to the end.
+            stream = await engine.add_request("0", "foo", one_token)
+            async for _ in stream:
+                pass
+
+        asyncio.run(drain_request())
+
+        marker_after = os.path.exists(".marker")
+        assert marker_after
+    finally:
+        os.chdir(original_dir)
+
+
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
+def test_respect_ray(model):
+    """An explicit "ray" backend must be honoured even for TP=1 and PP=1.
+
+    Users might ask for ray in that setup because they want ray to manage
+    the resources.
+    """
+    ray_args = EngineArgs(
+        model=model,
+        distributed_executor_backend="ray",
+        enforce_eager=True,  # reduce test time
+    )
+    executor = LLMEngine.from_engine_args(ray_args).model_executor
+    assert executor.uses_ray
--- a/tests/engine/test_multiproc_workers.py
+++ b/tests/engine/test_multiproc_workers.py
@ -0,0 +1,179 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from time import sleep
+from typing import Any
+
+import pytest
+
+from vllm.config import VllmConfig
+from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
+                                                  ResultHandler, WorkerMonitor)
+from vllm.worker.worker_base import WorkerWrapperBase
+
+
+class DummyWorkerWrapper(WorkerWrapperBase):
+    """Dummy version of vllm.worker.worker.Worker"""
+
+    def worker_method(self, worker_input: Any) -> tuple[int, Any]:
+        """Echo ``worker_input`` back together with this wrapper's rpc_rank.
+
+        If ``worker_input`` is an exception instance it is raised instead,
+        which lets the tests simulate a failing task.
+        """
+        sleep(0.05)
+
+        if isinstance(worker_input, Exception):
+            # simulate error case
+            raise worker_input
+
+        # Return the argument itself (not the ``input`` builtin) so callers
+        # can verify that their payload made the round trip.
+        return self.rpc_rank, worker_input
+
+
+def _start_workers() -> tuple[list[ProcessWorkerWrapper], WorkerMonitor]:
+    """Create 8 dummy process workers plus a started monitor for them.
+
+    Returns:
+        The list of workers (index == rank passed at construction) and the
+        WorkerMonitor, which is alive when this function returns.
+    """
+    result_handler = ResultHandler()
+    vllm_config = VllmConfig()
+    workers = [
+        ProcessWorkerWrapper(result_handler, DummyWorkerWrapper, vllm_config,
+                             rank) for rank in range(8)
+    ]
+
+    worker_monitor = WorkerMonitor(workers, result_handler)
+    assert not worker_monitor.is_alive()
+
+    result_handler.start()
+    worker_monitor.start()
+    assert worker_monitor.is_alive()
+
+    return workers, worker_monitor
+
+
+def test_local_workers() -> None:
+    """Test workers with sync task submission"""
+
+    workers, worker_monitor = _start_workers()
+
+    def execute_workers(worker_input: str) -> None:
+        worker_outputs = [
+            worker.execute_method("worker_method", worker_input)
+            for worker in workers
+        ]
+
+        # Each worker must echo back its own rank and the submitted payload.
+        for rank, output in enumerate(worker_outputs):
+            assert output.get() == (rank, worker_input)
+
+    # Test concurrent submission from different threads; the context manager
+    # shuts the pool down once all futures have been collected.
+    with ThreadPoolExecutor(max_workers=4) as executor:
+        futures = [
+            executor.submit(partial(execute_workers, f"thread {thread_num}"))
+            for thread_num in range(4)
+        ]
+
+        for future in futures:
+            future.result()
+
+    # Test error case
+    exception = ValueError("fake error")
+    result = workers[0].execute_method("worker_method", exception)
+    with pytest.raises(ValueError) as exc_info:
+        result.get()
+    assert str(exc_info.value) == "fake error"
+
+    # Test cleanup when a worker fails
+    assert worker_monitor.is_alive()
+    workers[3].process.kill()
+
+    # Other workers should get shut down here
+    worker_monitor.join(20)
+
+    # Ensure everything is stopped
+    assert not worker_monitor.is_alive()
+    assert all(not worker.process.is_alive() for worker in workers)
+
+    # Further attempts to submit tasks should fail
+    with pytest.raises(ChildProcessError):
+        workers[0].execute_method("worker_method", "test")
+
+
+def test_local_workers_clean_shutdown() -> None:
+    """Test clean shutdown"""
+
+    workers, worker_monitor = _start_workers()
+
+    assert worker_monitor.is_alive()
+    assert all(worker.process.is_alive() for worker in workers)
+
+    # Clean shutdown
+    worker_monitor.close()
+
+    worker_monitor.join(20)
+
+    # Ensure everything is stopped
+    assert not worker_monitor.is_alive()
+    assert all(not worker.process.is_alive() for worker in workers)
+
+    # Further attempts to submit tasks should fail
+    with pytest.raises(ChildProcessError):
+        workers[0].execute_method("worker_method", "test")
+
+
+@pytest.mark.asyncio
+async def test_local_workers_async() -> None:
+    """Test local workers with async task submission"""
+
+    workers, worker_monitor = _start_workers()
+
+    async def execute_workers(worker_input: str) -> None:
+        worker_coros = [
+            worker.execute_method_async("worker_method", worker_input)
+            for worker in workers
+        ]
+
+        # Each worker must echo back its own rank and the submitted payload.
+        results = await asyncio.gather(*worker_coros)
+        for rank, result in enumerate(results):
+            assert result == (rank, worker_input)
+
+    tasks = [
+        asyncio.create_task(execute_workers(f"task {task_num}"))
+        for task_num in range(4)
+    ]
+
+    for task in tasks:
+        await task
+
+    # Test error case
+    exception = ValueError("fake error")
+    with pytest.raises(ValueError) as exc_info:
+        await workers[0].execute_method_async("worker_method", exception)
+    assert str(exc_info.value) == "fake error"
+
+    # Test cleanup when a worker fails
+    assert worker_monitor.is_alive()
+    workers[3].process.kill()
+
+    # Other workers should get shut down here
+    worker_monitor.join(20)
+
+    # Ensure everything is stopped
+    assert not worker_monitor.is_alive()
+    assert all(not worker.process.is_alive() for worker in workers)
+
+    # Further attempts to submit tasks should fail
+    with pytest.raises(ChildProcessError):
+        await workers[0].execute_method_async("worker_method", "test")
--- a/tests/engine/test_options.py
+++ b/tests/engine/test_options.py
@ -0,0 +1,58 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from contextlib import nullcontext
+
+import pytest
+
+from vllm.entrypoints.llm import LLM
+from vllm.sampling_params import SamplingParams
+
+
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
+def test_skip_tokenizer_initialization(model: str):
+    """Check the behaviour of an LLM built with ``skip_tokenizer_init=True``.
+
+    The flag is meant to skip tokenizer/detokenizer initialization: text
+    prompts must be rejected, while token-id prompts must produce token ids
+    with empty text.
+    """
+    llm = LLM(model=model, skip_tokenizer_init=True, enforce_eager=True)
+    params = SamplingParams(prompt_logprobs=True, detokenize=True)
+
+    with pytest.raises(ValueError, match="cannot pass text prompts when"):
+        llm.generate("abc", params)
+
+    token_prompt = {"prompt_token_ids": [1, 2, 3]}
+    outputs = llm.generate(token_prompt, sampling_params=params)
+    assert len(outputs) > 0
+    first_request_completions = outputs[0].outputs
+    assert len(first_request_completions) > 0
+    first_completion = first_request_completions[0]
+    assert first_completion.text == ""
+    assert first_completion.token_ids
+
+
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
+@pytest.mark.parametrize("enable_prompt_embeds", [True, False])
+def test_enable_prompt_embeds(hf_runner, model: str,
+                              enable_prompt_embeds: bool):
+    """Prompt embeddings are accepted only when the feature is enabled.
+
+    The embeddings for "abc" are computed with the HF model's input
+    embedding layer; generating from them must raise ValueError when
+    ``enable_prompt_embeds`` is False and succeed when it is True.
+    """
+    prompt = "abc"
+
+    with hf_runner(model) as hf_model:
+        encoded = hf_model.tokenizer(prompt, return_tensors="pt")
+        token_ids = encoded.input_ids.to(hf_model.model.device)
+
+        embedding_layer = hf_model.model.get_input_embeddings()
+        prompt_embeds = embedding_layer(token_ids).squeeze(0)
+
+    if enable_prompt_embeds:
+        expectation = nullcontext()
+    else:
+        expectation = pytest.raises(ValueError,
+                                    match="set `--enable-prompt-embeds`")
+
+    llm = LLM(
+        model=model,
+        enable_prompt_embeds=enable_prompt_embeds,
+        enforce_eager=True,
+    )
+
+    with expectation:
+        llm.generate({"prompt_embeds": prompt_embeds})
--- a/tests/engine/test_short_mm_context.py
+++ b/tests/engine/test_short_mm_context.py
@ -25,7 +25,6 @@ def test_context_length_too_short(vllm_runner, image_assets, model):
            model,
            max_model_len=128,  # LLaVA has a feature size of 576
            enforce_eager=True,
-            load_format="dummy",
        )

        with vllm_model:
--- a/tests/engine/test_stop_checker.py
+++ b/tests/engine/test_stop_checker.py
@ -0,0 +1,225 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+from transformers import AutoTokenizer
+
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.reasoning import ReasoningParser
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import Sequence, SequenceStatus
+
+REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+
+
+class MockReasoningParser(ReasoningParser):
+    """ReasoningParser stub whose reasoning state is a plain attribute."""
+
+    def __init__(self,
+                 tokenizer: AutoTokenizer,
+                 reasoning_active: bool = False):
+        super().__init__(tokenizer)
+        # Tests toggle this flag directly to enter/leave reasoning mode.
+        self.reasoning_active = reasoning_active
+
+    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+        """Report that reasoning has ended unless the flag is truthy."""
+        if self.reasoning_active:
+            return False
+        return True
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        """Return ``input_ids`` untouched."""
+        return input_ids
+
+
+class MockSequence(Sequence):
+    """Lightweight Sequence stand-in backed by a plain list of token ids."""
+
+    def __init__(self, token_ids, output_text="test_output", eos_token_id=0):
+        # Sequence.__init__ is not called; only the attributes set here and
+        # the getters below are provided by this mock.
+        self.token_ids = token_ids
+        self.output_text = output_text
+        self.eos_token_id = eos_token_id
+        self.status = SequenceStatus.RUNNING
+        self.stop_reason = None
+
+    def get_token_ids(self):
+        """Return the stored token id list."""
+        return self.token_ids
+
+    def get_last_token_id(self):
+        """Return the final token id, or None when there are no tokens."""
+        if not self.token_ids:
+            return None
+        return self.token_ids[-1]
+
+    def get_len(self):
+        """Return the number of stored token ids."""
+        return len(self.token_ids)
+
+    def get_output_len(self):
+        """Return the stored length minus one."""
+        # Simulating prompt + outputs
+        return len(self.token_ids) - 1
+
+
+@pytest.fixture
+def deepseek_r1_qwen_tokenizer():
+    """Load the tokenizer named by REASONING_MODEL_NAME."""
+    tokenizer = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+    return tokenizer
+
+
+@pytest.fixture
+def stop_checker():
+    """StopChecker with max_model_len=10 and no reasoner argument."""
+    checker = StopChecker(max_model_len=10)
+    return checker
+
+
+@pytest.fixture
+def stop_checker_with_reasoner(deepseek_r1_qwen_tokenizer):
+    """StopChecker (max_model_len=10) wired to a MockReasoningParser.
+
+    The tokenizer is requested as a fixture argument so that the parser
+    receives the loaded tokenizer rather than the fixture function object.
+    """
+    reasoner = MockReasoningParser(deepseek_r1_qwen_tokenizer)
+    return StopChecker(max_model_len=10, reasoner=reasoner)
+
+
+def test_eos_token_stopping(stop_checker):
+    """A sequence whose last token is the EOS id must be marked stopped."""
+    sequence = MockSequence(token_ids=[1, 2, 0], eos_token_id=0)
+    default_params = SamplingParams()
+
+    stop_checker.maybe_stop_sequence(
+        sequence,
+        new_char_count=1,
+        sampling_params=default_params,
+    )
+
+    assert sequence.status == SequenceStatus.FINISHED_STOPPED
+
+
+def test_ignore_eos(stop_checker):
+    """With ignore_eos=True a trailing EOS id must not stop the sequence."""
+    sequence = MockSequence(token_ids=[1, 2, 0], eos_token_id=0)
+    params = SamplingParams(ignore_eos=True)
+
+    stop_checker.maybe_stop_sequence(
+        sequence,
+        new_char_count=1,
+        sampling_params=params,
+    )
+
+    assert sequence.status == SequenceStatus.RUNNING
+
+
+def test_min_tokens(stop_checker):
+    """min_tokens=3 must keep a sequence ending in EOS running."""
+    sequence = MockSequence(token_ids=[1, 2, 0], eos_token_id=0)
+    params = SamplingParams(min_tokens=3)
+
+    stop_checker.maybe_stop_sequence(
+        sequence,
+        new_char_count=1,
+        sampling_params=params,
+    )
+
+    assert sequence.status == SequenceStatus.RUNNING
+
+
+def test_stop_token_ids(stop_checker):
+    """A custom stop token id must stop the sequence and be the reason."""
+    sequence = MockSequence(token_ids=[1, 2, 3], eos_token_id=0)
+    params = SamplingParams(stop_token_ids=[3])
+
+    stop_checker.maybe_stop_sequence(
+        sequence,
+        new_char_count=1,
+        sampling_params=params,
+    )
+
+    assert sequence.status == SequenceStatus.FINISHED_STOPPED
+    assert sequence.stop_reason == 3
+
+
+def test_stop_strings(stop_checker):
+    """A stop string must stop the sequence and be cut from its output."""
+    sequence = MockSequence(
+        token_ids=[1, 2, 3],
+        output_text="test output with STOP",
+        eos_token_id=0,
+    )
+    params = SamplingParams(stop=["STOP"])
+
+    stop_checker.maybe_stop_sequence(
+        sequence,
+        new_char_count=1,
+        sampling_params=params,
+    )
+
+    assert sequence.status == SequenceStatus.FINISHED_STOPPED
+    assert sequence.stop_reason == "STOP"
+    # Default behavior removes stop string
+    assert "STOP" not in sequence.output_text
+
+
+def test_include_stop_str_in_output(stop_checker):
+    """include_stop_str_in_output=True must leave the stop string in place."""
+    sequence = MockSequence(
+        token_ids=[1, 2, 3],
+        output_text="test output with STOP",
+        eos_token_id=0,
+    )
+    params = SamplingParams(stop=["STOP"], include_stop_str_in_output=True)
+
+    stop_checker.maybe_stop_sequence(
+        sequence,
+        new_char_count=5,
+        sampling_params=params,
+    )
+
+    assert sequence.status == SequenceStatus.FINISHED_STOPPED
+    assert "STOP" in sequence.output_text
+
+
+def test_max_tokens(stop_checker):
+    """Reaching max_tokens must mark the sequence as length capped."""
+    sequence = MockSequence(token_ids=[1, 2, 3], eos_token_id=0)
+    params = SamplingParams(max_tokens=2)
+
+    stop_checker.maybe_stop_sequence(
+        sequence,
+        new_char_count=1,
+        sampling_params=params,
+    )
+
+    assert sequence.status == SequenceStatus.FINISHED_LENGTH_CAPPED
+
+
+def test_max_model_len(stop_checker):
+    """Exceeding max_model_len must mark the sequence as length capped."""
+    # 11 tokens, max is 10
+    eleven_tokens = list(range(11))
+    sequence = MockSequence(token_ids=eleven_tokens, eos_token_id=0)
+    default_params = SamplingParams()
+
+    stop_checker.maybe_stop_sequence(
+        sequence,
+        new_char_count=1,
+        sampling_params=default_params,
+    )
+
+    assert sequence.status == SequenceStatus.FINISHED_LENGTH_CAPPED
+
+
+def test_reasoning_skip_stops(stop_checker_with_reasoner):
+    """While reasoning is active, stop tokens/strings are ignored; EOS is not."""
+    checker = stop_checker_with_reasoner
+    # Simulate being in reasoning mode
+    checker.reasoner.reasoning_active = True
+
+    # Each case: (sequence, sampling params, new_char_count, expected status)
+    cases = [
+        # Stop token is ignored
+        (MockSequence(token_ids=[1, 2, 3], eos_token_id=0),
+         SamplingParams(stop_token_ids=[3]), 1, SequenceStatus.RUNNING),
+        # Stop string is ignored
+        (MockSequence(token_ids=[1, 2, 3], output_text="test STOP"),
+         SamplingParams(stop=["STOP"]), 4, SequenceStatus.RUNNING),
+        # But EOS token still stops the sequence
+        (MockSequence(token_ids=[1, 2, 0], eos_token_id=0),
+         SamplingParams(), 1, SequenceStatus.FINISHED_STOPPED),
+    ]
+
+    for sequence, params, char_count, expected_status in cases:
+        checker.maybe_stop_sequence(sequence,
+                                    new_char_count=char_count,
+                                    sampling_params=params)
+        assert sequence.status == expected_status
+
+
+def test_reasoning_end_enables_stops(stop_checker_with_reasoner):
+    """Once reasoning is over, stop tokens and stop strings apply again."""
+    checker = stop_checker_with_reasoner
+    # Simulate being out of reasoning mode
+    checker.reasoner.reasoning_active = False
+
+    # Each case: (sequence, sampling params, new_char_count)
+    cases = [
+        # Stop token
+        (MockSequence(token_ids=[1, 2, 3], eos_token_id=0),
+         SamplingParams(stop_token_ids=[3]), 1),
+        # Stop string
+        (MockSequence(token_ids=[1, 2, 3], output_text="test STOP"),
+         SamplingParams(stop=["STOP"]), 4),
+    ]
+
+    for sequence, params, char_count in cases:
+        checker.maybe_stop_sequence(sequence,
+                                    new_char_count=char_count,
+                                    sampling_params=params)
+        assert sequence.status == SequenceStatus.FINISHED_STOPPED
--- a/tests/entrypoints/conftest.py
+++ b/tests/entrypoints/conftest.py
@ -184,7 +184,7 @@ def sample_enum_json_schema():


@pytest.fixture
-def sample_structured_outputs_choices():
+def sample_guided_choice():
    return [
        "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript",
        "Ruby", "Swift", "Kotlin"
--- a/tests/entrypoints/llm/test_lazy_outlines.py
+++ b/tests/entrypoints/llm/test_lazy_outlines.py
@ -0,0 +1,82 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import sys
+from contextlib import nullcontext
+
+from vllm_test_utils import BlameResult, blame
+
+from vllm import LLM, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
+from vllm.sampling_params import GuidedDecodingParams
+
+
+def run_normal():
+    """Generate from four fixed prompts without guided decoding (baseline).
+
+    Prints each prompt with its generated text, then deletes the LLM and
+    calls cleanup_dist_env_and_memory().
+    """
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create an LLM without guided decoding as a baseline.
+    baseline_llm = LLM(model="distilbert/distilgpt2",
+                       enforce_eager=True,
+                       gpu_memory_utilization=0.3)
+    for output in baseline_llm.generate(prompts, sampling_params):
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    # Destroy the LLM object and free up the GPU memory.
+    del baseline_llm
+    cleanup_dist_env_and_memory()
+
+
+def run_xgrammar(sample_regex):
+    """Generate twice from a regex-guided prompt using the xgrammar backend.
+
+    Args:
+        sample_regex: regex used both inside the prompt text and as the
+            guided-decoding constraint.
+    """
+    regex_prompt = (
+        f"Give an example IPv4 address with this regex: {sample_regex}")
+    sampling_params = SamplingParams(
+        temperature=0.8,
+        top_p=0.95,
+        guided_decoding=GuidedDecodingParams(regex=sample_regex),
+    )
+
+    # Create an LLM with guided decoding enabled.
+    guided_llm = LLM(model="distilbert/distilgpt2",
+                     enforce_eager=True,
+                     guided_decoding_backend="xgrammar",
+                     gpu_memory_utilization=0.3)
+    outputs = guided_llm.generate(
+        prompts=[regex_prompt] * 2,
+        sampling_params=sampling_params,
+        use_tqdm=True,
+    )
+
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+def test_lazy_outlines(sample_regex):
+    """If users don't use guided decoding, outlines should not be imported.
+    """
+    # make sure outlines is not imported
+    module_name = "outlines"
+
+    # In CI we only check at the very end whether the module got imported.
+    # If it did, rerun with `use_blame=True`: that traces every function
+    # call to find the first import location and so the root cause. It is
+    # off by default because it is slow.
+    use_blame = False
+    if use_blame:
+        context = blame(lambda: module_name in sys.modules)
+    else:
+        context = nullcontext()
+
+    with context as result:
+        run_normal()
+        run_xgrammar(sample_regex)
+
+    if use_blame:
+        assert isinstance(result, BlameResult)
+        print(f"the first import location is:\n{result.trace_stack}")
+
+    assert module_name not in sys.modules, (
+        f"Module {module_name} is imported. To see the first"
+        f" import location, run the test with `use_blame=True`.")
--- a/tests/entrypoints/openai/correctness/test_lmeval.py
+++ b/tests/entrypoints/openai/correctness/test_lmeval.py
@ -81,3 +81,13 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
            more_args = ["--max-num-seqs", "64"]

        run_test(more_args)
+
+
+@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
+def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
+                                    more_args):
+    """Run with the V0 Engine."""
+
+    # VLLM_USE_V1 is set to "0" only for the duration of this context.
+    with monkeypatch.context() as patched_env:
+        patched_env.setenv("VLLM_USE_V1", "0")
+        run_test(more_args)
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# imports for structured outputs tests
+# imports for guided decoding tests
 import json
 from typing import Optional

@ -28,9 +28,11 @@ def monkeypatch_module():
    mpatch.undo()


-@pytest.fixture(scope="module")
-def server(monkeypatch_module, zephyr_lora_files):  #noqa: F811
-    monkeypatch_module.setenv('VLLM_USE_V1', '1')
+@pytest.fixture(scope="module", params=[False, True])
+def server(request, monkeypatch_module, zephyr_lora_files):  #noqa: F811
+
+    use_v1 = request.param
+    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')

    args = [
        # use half precision for speed and memory savings in CI environment
@ -55,6 +57,13 @@ def server(monkeypatch_module, zephyr_lora_files):  #noqa: F811
        yield remote_server


+@pytest.fixture
+def is_v1_server(server):
+    """Return True when VLLM_USE_V1 is '1' in this process's environment.
+
+    Depends on ``server`` so that fixture has run before the variable is
+    read; any value other than '0' or '1' fails the assertion.
+    """
+    import os
+    use_v1_flag = os.environ['VLLM_USE_V1']
+    assert use_v1_flag in ['0', '1']
+    return use_v1_flag == '1'
+
+
@pytest_asyncio.fixture
 async def client(server):
    async with server.get_async_client() as async_client:
@ -471,10 +480,10 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,


@pytest.mark.asyncio
-async def test_structured_outputs_choice_chat(
-    client: openai.AsyncOpenAI,
-    sample_structured_outputs_choices,
-):
+async def test_guided_choice_chat(client: openai.AsyncOpenAI,
+                                  sample_guided_choice, is_v1_server: bool):
+    if not is_v1_server:
+        pytest.skip("Guided decoding is only supported in v1 engine")
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@ -489,10 +498,9 @@ async def test_structured_outputs_choice_chat(
        messages=messages,
        max_completion_tokens=10,
        temperature=0.7,
-        extra_body=dict(
-            structured_outputs={"choice": sample_structured_outputs_choices}))
+        extra_body=dict(guided_choice=sample_guided_choice))
    choice1 = chat_completion.choices[0].message.content
-    assert choice1 in sample_structured_outputs_choices
+    assert choice1 in sample_guided_choice

    messages.append({"role": "assistant", "content": choice1})
    messages.append({
@ -504,18 +512,18 @@ async def test_structured_outputs_choice_chat(
        messages=messages,
        max_completion_tokens=10,
        temperature=0.7,
-        extra_body=dict(
-            structured_outputs={"choice": sample_structured_outputs_choices}))
+        extra_body=dict(guided_choice=sample_guided_choice))
    choice2 = chat_completion.choices[0].message.content
-    assert choice2 in sample_structured_outputs_choices
+    assert choice2 in sample_guided_choice
    assert choice1 != choice2


@pytest.mark.asyncio
-async def test_structured_outputs_json_chat(
-    client: openai.AsyncOpenAI,
-    sample_json_schema,
-):
+async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema,
+                                is_v1_server: bool):
+    if not is_v1_server:
+        pytest.skip("Guided decoding is only supported in v1 engine")
+
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@ -530,7 +538,7 @@ async def test_structured_outputs_json_chat(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
-        extra_body=dict(structured_outputs={"json": sample_json_schema}))
+        extra_body=dict(guided_json=sample_json_schema))
    message = chat_completion.choices[0].message
    assert message.content is not None
    json1 = json.loads(message.content)
@ -547,7 +555,7 @@ async def test_structured_outputs_json_chat(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
-        extra_body=dict(structured_outputs={"json": sample_json_schema}))
+        extra_body=dict(guided_json=sample_json_schema))
    message = chat_completion.choices[0].message
    assert message.content is not None
    json2 = json.loads(message.content)
@ -557,10 +565,10 @@ async def test_structured_outputs_json_chat(


@pytest.mark.asyncio
-async def test_structured_outputs_regex_chat(
-    client: openai.AsyncOpenAI,
-    sample_regex,
-):
+async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex,
+                                 is_v1_server: bool):
+    if not is_v1_server:
+        pytest.skip("Guided decoding is only supported in v1 engine")

    messages = [{
        "role": "system",
@ -575,7 +583,7 @@ async def test_structured_outputs_regex_chat(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=20,
-        extra_body=dict(structured_outputs={"regex": sample_regex}))
+        extra_body=dict(guided_regex=sample_regex))
    ip1 = chat_completion.choices[0].message.content
    assert ip1 is not None
    assert re.fullmatch(sample_regex, ip1) is not None
@ -586,7 +594,7 @@ async def test_structured_outputs_regex_chat(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=20,
-        extra_body=dict(structured_outputs={"regex": sample_regex}))
+        extra_body=dict(guided_regex=sample_regex))
    ip2 = chat_completion.choices[0].message.content
    assert ip2 is not None
    assert re.fullmatch(sample_regex, ip2) is not None
@ -594,7 +602,7 @@ async def test_structured_outputs_regex_chat(


@pytest.mark.asyncio
-async def test_structured_outputs_type_error(client: openai.AsyncOpenAI):
+async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@ -606,19 +614,17 @@ async def test_structured_outputs_type_error(client: openai.AsyncOpenAI):
    }]

    with pytest.raises(openai.BadRequestError):
-        _ = await client.chat.completions.create(
-            model=MODEL_NAME,
-            messages=messages,
-            extra_body=dict(
-                structured_outputs={"regex": {
-                    1: "Python",
-                    2: "C++"
-                }}))
+        _ = await client.chat.completions.create(model=MODEL_NAME,
+                                                 messages=messages,
+                                                 extra_body=dict(guided_regex={
+                                                     1: "Python",
+                                                     2: "C++"
+                                                 }))


@pytest.mark.asyncio
-async def test_structured_outputs_choice_chat_logprobs(
-        client: openai.AsyncOpenAI, sample_structured_outputs_choices):
+async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
+                                           sample_guided_choice):

    messages = [{
        "role": "system",
@ -635,8 +641,7 @@ async def test_structured_outputs_choice_chat_logprobs(
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
-        extra_body=dict(
-            structured_outputs={"choice": sample_structured_outputs_choices}))
+        extra_body=dict(guided_choice=sample_guided_choice))

    assert chat_completion.choices[0].logprobs is not None
    assert chat_completion.choices[0].logprobs.content is not None
@ -648,33 +653,20 @@ async def test_structured_outputs_choice_chat_logprobs(


@pytest.mark.asyncio
-async def test_named_tool_use(
-    client: openai.AsyncOpenAI,
-    sample_json_schema,
-):
+async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema,
+                              is_v1_server: bool):
+    if not is_v1_server:
+        pytest.skip("Tool use is only supported in v1 engine")
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role":
        "user",
-        "content": ("Give an example JSON for an employee "
-                    "profile using the specified tool.")
+        "content":
+        f"Give an example JSON for an employee profile that "
+        f"fits this schema: {sample_json_schema}"
    }]
-    tools = [{
-        "type": "function",
-        "function": {
-            "name": "dummy_function_name",
-            "description": "This is a dummy function",
-            "parameters": sample_json_schema
-        }
-    }]
-    tool_choice = {
-        "type": "function",
-        "function": {
-            "name": "dummy_function_name"
-        }
-    }

    # non-streaming

@ -682,8 +674,20 @@ async def test_named_tool_use(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
-        tools=tools,
-        tool_choice=tool_choice,
+        tools=[{
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name",
+                "description": "This is a dummy function",
+                "parameters": sample_json_schema
+            }
+        }],
+        tool_choice={
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name"
+            }
+        },
    )
    message = chat_completion.choices[0].message
    assert len(message.content) == 0
@ -701,12 +705,25 @@ async def test_named_tool_use(

    # streaming

-    stream = await client.chat.completions.create(model=MODEL_NAME,
-                                                  messages=messages,
-                                                  max_completion_tokens=1000,
-                                                  tools=tools,
-                                                  tool_choice=tool_choice,
-                                                  stream=True)
+    stream = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_completion_tokens=1000,
+        tools=[{
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name",
+                "description": "This is a dummy function",
+                "parameters": sample_json_schema
+            }
+        }],
+        tool_choice={
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name"
+            }
+        },
+        stream=True)

    output = []
    finish_reason_count = 0
@ -809,7 +826,11 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI):


@pytest.mark.asyncio
-async def test_response_format_json_schema(client: openai.AsyncOpenAI):
+async def test_response_format_json_schema(client: openai.AsyncOpenAI,
+                                           is_v1_server: bool):
+    if not is_v1_server:
+        pytest.skip(
+            "JSON schema response format is only supported in v1 engine")
    prompt = 'what is 1+1? The format is "result": 2'
    # Check that this prompt cannot lead to a valid JSON without json_schema
    for _ in range(2):
--- a/tests/entrypoints/openai/test_chat_echo.py
+++ b/tests/entrypoints/openai/test_chat_echo.py
@ -99,26 +99,3 @@ async def test_prompt_logprobs(client: openai.AsyncOpenAI):

    assert completion.prompt_logprobs is not None
    assert len(completion.prompt_logprobs) > 0
-
-
-@pytest.mark.asyncio
-async def test_top_logprobs(client: openai.AsyncOpenAI):
-    messages = [{
-        "role": "system",
-        "content": "You are a helpful assistant."
-    }, {
-        "role": "user",
-        "content": "Beijing is the capital of which country?"
-    }]
-
-    completion = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=messages,
-        extra_body={
-            "top_logprobs": -1,
-            "logprobs": "true",
-        },
-    )
-    assert completion.choices[0].logprobs is not None
-    assert completion.choices[0].logprobs.content is not None
-    assert len(completion.choices[0].logprobs.content) > 0
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@ -0,0 +1,831 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# imports for guided decoding tests
+import json
+import os
+from typing import Optional
+
+import jsonschema
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+import regex as re
+import requests
+# downloading lora to test lora requests
+from openai import BadRequestError
+
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+from ...utils import RemoteOpenAIServer
+
+# any model with a chat template should work here
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+# technically these adapters use a different base model,
+# but we're not testing generation quality here
+
+GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"]
+
+
+@pytest.fixture(scope="module")
+def default_server_args(zephyr_lora_files):
+    """Build the command-line arguments used to launch the test server.
+
+    ``zephyr_lora_files`` is interpolated into the ``--lora-modules`` value
+    so the adapter is registered under the name ``zephyr-lora``.
+    """
+    # use half precision for speed and memory savings in CI environment
+    engine_args = [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--max-num-seqs",
+        "128",
+        "--enforce-eager",
+    ]
+    # lora config
+    lora_args = [
+        "--enable-lora",
+        "--lora-modules",
+        f"zephyr-lora={zephyr_lora_files}",
+        "--max-lora-rank",
+        "64",
+        "--max-cpu-loras",
+        "2",
+    ]
+    return engine_args + lora_args
+
+
+@pytest.fixture(scope="module",
+                params=["", "--disable-frontend-multiprocessing"])
+def server(default_server_args, request):
+    """Launch a ``RemoteOpenAIServer`` with ``VLLM_USE_V1`` forced to ``'0'``.
+
+    Parametrized to run once with the default arguments and once with
+    ``--disable-frontend-multiprocessing`` added. The previous value of
+    ``VLLM_USE_V1`` (or its absence) is restored on teardown.
+    """
+    # Copy before extending: ``default_server_args`` is module-scoped and
+    # shared, so appending in place would leak the extra flag into every
+    # later consumer of that fixture.
+    server_args = list(default_server_args)
+    if request.param:
+        server_args.append(request.param)
+
+    original_value = os.environ.get('VLLM_USE_V1')
+    os.environ['VLLM_USE_V1'] = '0'
+    try:
+        with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server:
+            yield remote_server
+    finally:
+        # Restore original env value
+        if original_value is None:
+            os.environ.pop('VLLM_USE_V1', None)
+        else:
+            os.environ['VLLM_USE_V1'] = original_value
+
+
+@pytest.fixture
+def is_v1_server(server):
+    """Return ``True`` only when ``VLLM_USE_V1`` is set to ``'1'``.
+
+    An unset variable is treated as ``'0'``.
+    """
+    # For completion tests, we assume v0 since there's no explicit v1 setup
+    engine_flag = os.environ.get('VLLM_USE_V1', '0')
+    return engine_flag == '1'
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    """Yield the async client obtained from the running ``server``."""
+    async with server.get_async_client() as api_client:
+        yield api_client
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    # first test base model, then test loras
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
+    """Check a 5-token completion for a text prompt and a token-ID prompt."""
+    text_response = await client.completions.create(
+        model=model_name,
+        prompt="Hello, my name is",
+        max_tokens=5,
+        temperature=0.0,
+    )
+
+    assert text_response.id is not None
+    assert text_response.choices is not None
+    assert len(text_response.choices) == 1
+
+    first = text_response.choices[0]
+    assert len(first.text) >= 5
+    assert first.finish_reason == "length"
+    expected_usage = openai.types.CompletionUsage(completion_tokens=5,
+                                                  prompt_tokens=6,
+                                                  total_tokens=11)
+    assert text_response.usage == expected_usage
+
+    # test using token IDs
+    token_response = await client.completions.create(
+        model=model_name,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    token_choice = token_response.choices[0]
+    assert len(token_choice.text) >= 1
+    assert token_choice.prompt_logprobs is None
+
+
+@pytest.mark.asyncio
+async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
+    """Token IDs 32000-32002 sent to the base model must be rejected."""
+    # test using token IDs
+    request_kwargs = dict(
+        model=MODEL_NAME,
+        prompt=[0, 0, 32000, 32001, 32002],
+        echo=True,
+        max_tokens=5,
+        temperature=0.0,
+    )
+    # Added tokens should be rejected by the base model
+    with pytest.raises(openai.BadRequestError, match="out of vocabulary"):
+        await client.completions.create(**request_kwargs)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    # first test base model, then test loras
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
+    """With ``logprobs=None`` the returned choice carries no logprobs."""
+    # test using token IDs
+    response = await client.completions.create(
+        model=model_name,
+        prompt=[0] * 5,
+        max_tokens=5,
+        temperature=0.0,
+        logprobs=None,
+    )
+    assert response.choices[0].logprobs is None
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    # base model and the lora adapter
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
+    """With ``logprobs=0`` the first position still has exactly one entry."""
+    # test using token IDs
+    response = await client.completions.create(
+        model=model_name,
+        prompt=[0] * 5,
+        max_tokens=5,
+        temperature=0.0,
+        logprobs=0,
+    )
+    returned = response.choices[0].logprobs
+    assert returned is not None
+    assert returned.token_logprobs is not None
+    assert returned.top_logprobs is not None
+    first_position = returned.top_logprobs[0]
+    assert len(first_position) == 1
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
+    """With ``logprobs=5`` the first position has five or six entries."""
+    # test using token IDs
+    response = await client.completions.create(
+        model=model_name,
+        prompt=[0] * 5,
+        max_tokens=5,
+        temperature=0.0,
+        logprobs=5,
+    )
+    returned = response.choices[0].logprobs
+    assert returned is not None
+    assert returned.token_logprobs is not None
+    assert returned.top_logprobs is not None
+    first_position = returned.top_logprobs[0]
+    assert 5 <= len(first_position) <= 6
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
+                                            model_name: str):
+    """Oversized ``logprobs`` requests fail and leave the server usable.
+
+    Sends ``logprobs=21`` without streaming and ``logprobs=30`` with
+    streaming, expecting ``BadRequestError`` or ``APIError`` each time, then
+    issues a plain request to confirm the server still answers.
+    """
+    expected_errors = (openai.BadRequestError, openai.APIError)
+
+    # test using token IDs
+    with pytest.raises(expected_errors):
+        await client.completions.create(
+            model=model_name,
+            prompt=[0, 0, 0, 0, 0],
+            max_tokens=5,
+            temperature=0.0,
+            # vLLM has higher default max_logprobs (20 instead of 5) to support
+            # both Completion API and Chat Completion API
+            logprobs=21,
+        )
+
+    with pytest.raises(expected_errors):
+        stream = await client.completions.create(
+            model=model_name,
+            prompt=[0, 0, 0, 0, 0],
+            max_tokens=5,
+            temperature=0.0,
+            # vLLM has higher default max_logprobs (20 instead of 5) to support
+            # both Completion API and Chat Completion API
+            logprobs=30,
+            stream=True,
+        )
+        # Drain the stream in case the error only surfaces while iterating.
+        async for _ in stream:
+            pass
+
+    # the server should still work afterwards
+    completion = await client.completions.create(
+        model=model_name,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    # ``len(...) >= 0`` can never fail; require real output instead, the same
+    # bound test_single_completion uses for this exact request.
+    assert len(completion.choices[0].text) >= 1
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name, prompt_logprobs", [
+    (MODEL_NAME, -1),
+    (MODEL_NAME, 0),
+    (MODEL_NAME, 1),
+    (MODEL_NAME, None),
+])
+async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI,
+                                          model_name: str,
+                                          prompt_logprobs: Optional[int]):
+    """Exercise the ``prompt_logprobs`` extra-body option on two prompts.
+
+    ``None`` sends no option and expects no prompt logprobs; a negative value
+    expects ``BadRequestError``; any other value expects non-empty prompt
+    logprobs on both returned choices.
+    """
+    request: dict = {
+        "prompt": ["A robot may not injure another robot", "My name is"],
+        "model": model_name,
+    }
+
+    if prompt_logprobs is None:
+        # Option omitted entirely, so nothing should come back.
+        response = await client.completions.create(**request)
+        assert response.choices[0].prompt_logprobs is None
+        return
+
+    request["extra_body"] = {"prompt_logprobs": prompt_logprobs}
+
+    if prompt_logprobs < 0:
+        with pytest.raises(BadRequestError):
+            await client.completions.create(**request)
+        return
+
+    response = await client.completions.create(**request)
+    for idx in (0, 1):
+        returned = response.choices[idx].prompt_logprobs
+        assert returned is not None
+        assert len(returned) > 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_completion_streaming(client: openai.AsyncOpenAI,
+                                    model_name: str):
+    """Streamed chunks, once joined, must equal the non-streamed text."""
+    prompt = "What is an LLM?"
+    shared_kwargs = dict(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+    )
+
+    reference = await client.completions.create(**shared_kwargs)
+    reference_text = reference.choices[0].text
+
+    stream = await client.completions.create(**shared_kwargs, stream=True)
+    pieces: list[str] = []
+    finished = 0
+    async for chunk in stream:
+        head = chunk.choices[0]
+        pieces.append(head.text)
+        if head.finish_reason is not None:
+            finished += 1
+    # finish reason should only return in last block
+    assert finished == 1
+    assert chunk.choices[0].finish_reason == "length"
+    assert chunk.choices[0].text
+    assert "".join(pieces) == reference_text
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
+    """Streaming with ``n`` parallel samples.
+
+    All samples arrive interleaved on one stream; each chunk's ``index``
+    says which sample its text belongs to. Every sample must finish exactly
+    once and contribute ``max_tokens`` chunks.
+    """
+
+    prompt = "What is an LLM?"
+    n = 3
+    max_tokens = 5
+
+    stream = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=max_tokens,
+        n=n,
+        stream=True,
+    )
+    per_sample: list[list[str]] = [[] for _ in range(n)]
+    finished = 0
+    async for chunk in stream:
+        piece = chunk.choices[0]
+        per_sample[piece.index].append(piece.text)
+        if piece.finish_reason is not None:
+            finished += 1
+    assert finished == n
+    for sample in per_sample:
+        assert len(sample) == max_tokens
+        print("".join(sample))
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_completion_stream_options(client: openai.AsyncOpenAI,
+                                         model_name: str):
+    """Check ``stream_options`` handling for streaming and non-streaming.
+
+    Streaming requests cover all four combinations of ``include_usage`` and
+    ``continuous_usage_stats``. Non-streaming requests that carry any
+    ``stream_options`` are expected to raise ``BadRequestError``.
+    """
+    prompt = "What is the capital of France?"
+
+    async def open_stream(include_usage: bool, continuous_usage_stats: bool):
+        # One streaming request with the given stream_options flags.
+        return await client.completions.create(
+            model=model_name,
+            prompt=prompt,
+            max_tokens=5,
+            temperature=0.0,
+            stream=True,
+            stream_options={
+                "include_usage": include_usage,
+                "continuous_usage_stats": continuous_usage_stats,
+            })
+
+    def check_usage(usage):
+        # Usage must be present, positive, and internally consistent.
+        assert usage is not None
+        assert usage.prompt_tokens > 0
+        assert usage.completion_tokens > 0
+        assert usage.total_tokens == (usage.prompt_tokens +
+                                      usage.completion_tokens)
+
+    async def check_trailing_usage_chunk(stream):
+        # Pull the next chunk by hand: it must hold usage and no choices.
+        final_chunk = await stream.__anext__()
+        check_usage(final_chunk.usage)
+        assert final_chunk.choices == []
+
+    # Test stream=True, stream_options=
+    #     {"include_usage": False, "continuous_usage_stats": False}
+    # and then
+    #     {"include_usage": False, "continuous_usage_stats": True}
+    for continuous in (False, True):
+        stream = await open_stream(False, continuous)
+        async for chunk in stream:
+            assert chunk.usage is None
+
+    # Test stream=True, stream_options=
+    #     {"include_usage": True, "continuous_usage_stats": False}
+    stream = await open_stream(True, False)
+    async for chunk in stream:
+        finished = chunk.choices[0].finish_reason is not None
+        assert chunk.usage is None
+        if finished:
+            await check_trailing_usage_chunk(stream)
+
+    # Test stream=True, stream_options=
+    #     {"include_usage": True, "continuous_usage_stats": True}
+    stream = await open_stream(True, True)
+    async for chunk in stream:
+        check_usage(chunk.usage)
+        if chunk.choices[0].finish_reason is not None:
+            await check_trailing_usage_chunk(stream)
+
+    # Test stream=False with each of these stream_options in turn
+    rejected_options = (
+        {"include_usage": None},
+        {"include_usage": True},
+        {"continuous_usage_stats": None},
+        {"continuous_usage_stats": True},
+    )
+    for stream_options in rejected_options:
+        with pytest.raises(BadRequestError):
+            await client.completions.create(model=model_name,
+                                            prompt=prompt,
+                                            max_tokens=5,
+                                            temperature=0.0,
+                                            stream=False,
+                                            stream_options=stream_options)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
+    """Batch two identical prompts: plain, with beam search, and streamed."""
+    text_prompts = ["Hello, my name is"] * 2
+    token_prompts = [[0, 0, 0, 0, 0]] * 2
+
+    # test both text and token IDs
+    for prompts in (text_prompts, token_prompts):
+        common = dict(
+            model=model_name,
+            prompt=prompts,
+            max_tokens=5,
+            temperature=0.0,
+        )
+
+        # test simple list
+        batch = await client.completions.create(**common)
+        assert len(batch.choices) == 2
+        assert batch.choices[0].text == batch.choices[1].text
+
+        # test n = 2
+        batch = await client.completions.create(
+            **common,
+            n=2,
+            extra_body=dict(
+                # NOTE: this has to be true for n > 1 in vLLM, but
+                # not necessary for official client.
+                use_beam_search=True),
+        )
+        assert len(batch.choices) == 4
+        beam_texts = [choice.text for choice in batch.choices]
+        assert beam_texts[0] != beam_texts[1], (
+            "beam search should be different")
+        assert beam_texts[0] == beam_texts[2], (
+            "two copies of the same prompt should be the same")
+        assert beam_texts[1] == beam_texts[3], (
+            "two copies of the same prompt should be the same")
+
+        # test streaming
+        stream = await client.completions.create(**common, stream=True)
+        streamed = ["", ""]
+        async for chunk in stream:
+            assert len(chunk.choices) == 1
+            piece = chunk.choices[0]
+            streamed[piece.index] += piece.text
+        assert streamed[0] == streamed[1]
+
+
+@pytest.mark.asyncio
+async def test_logits_bias(client: openai.AsyncOpenAI):
+    """Check ``logit_bias`` with a large positive and large negative bias."""
+    prompt = "Hello, my name is"
+    max_tokens = 5
+    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+
+    def encode(text):
+        # Token IDs for ``text`` without any special tokens added.
+        return tokenizer(text, add_special_tokens=False)["input_ids"]
+
+    base_request = dict(
+        model=MODEL_NAME,
+        prompt=prompt,
+        max_tokens=max_tokens,
+        temperature=0.0,
+    )
+
+    # Test exclusive selection
+    token_id = 1000
+    biased = await client.completions.create(
+        **base_request,
+        logit_bias={str(token_id): 100},
+        seed=42,
+    )
+    biased_text = biased.choices[0].text
+    assert len(biased_text) >= 5
+    response_tokens = encode(biased_text)
+    expected_tokens = encode(tokenizer.decode([token_id] * 5))
+    assert all(got == want
+               for got, want in zip(response_tokens, expected_tokens))
+
+    # Test ban
+    unbiased = await client.completions.create(**base_request)
+    first_response = unbiased.choices[0].text
+    banned = {str(token): -100 for token in encode(first_response)}
+    rerun = await client.completions.create(**base_request,
+                                            logit_bias=banned)
+    assert first_response != rerun.choices[0].text
+
+
+@pytest.mark.asyncio
+async def test_allowed_token_ids(client: openai.AsyncOpenAI):
+    """With ``allowed_token_ids`` set, the one sampled token is in that list."""
+    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+
+    # Test exclusive selection
+    allowed_ids = [21555, 21557, 21558]
+    response = await client.completions.create(
+        model=MODEL_NAME,
+        prompt="Hello, my name is",
+        max_tokens=1,
+        temperature=0.0,
+        seed=42,
+        extra_body=dict(allowed_token_ids=allowed_ids),
+        logprobs=1,
+    )
+    sampled = response.choices[0].logprobs.tokens
+    assert len(sampled) == 1
+    sampled_id = tokenizer.convert_tokens_to_ids(sampled)[0]
+    assert sampled_id in allowed_ids
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+async def test_guided_json_completion(client: openai.AsyncOpenAI,
+                                      guided_decoding_backend: str,
+                                      sample_json_schema, is_v1_server: bool):
+    """Each of three ``guided_json`` samples must validate against the schema.
+
+    Skipped when ``is_v1_server`` is false.
+    """
+    if not is_v1_server:
+        pytest.skip("Guided decoding is only supported in v1 engine")
+
+    num_samples = 3
+    prompt = (f"Give an example JSON for an employee profile "
+              f"that fits this schema: {sample_json_schema}")
+    guided_options = dict(guided_json=sample_json_schema,
+                          guided_decoding_backend=guided_decoding_backend)
+    response = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=prompt,
+        n=num_samples,
+        temperature=1.0,
+        max_tokens=500,
+        extra_body=guided_options,
+    )
+
+    assert response.id is not None
+    assert len(response.choices) == num_samples
+    for choice in response.choices:
+        parsed = json.loads(choice.text)
+        jsonschema.validate(instance=parsed, schema=sample_json_schema)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+async def test_guided_regex_completion(client: openai.AsyncOpenAI,
+                                       guided_decoding_backend: str,
+                                       sample_regex, is_v1_server: bool):
+    """Each sampled completion must fully match `sample_regex` when
+    `guided_regex` is requested."""
+    if not is_v1_server:
+        pytest.skip("Guided decoding is only supported in v1 engine")
+
+    num_choices = 3
+    guided_params = dict(guided_regex=sample_regex,
+                         guided_decoding_backend=guided_decoding_backend)
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=f"Give an example IPv4 address with this regex: {sample_regex}",
+        n=num_choices,
+        temperature=1.0,
+        max_tokens=20,
+        extra_body=guided_params)
+
+    assert completion.id is not None
+    assert len(completion.choices) == num_choices
+    # The length assertion above guarantees exactly `num_choices` iterations.
+    for choice in completion.choices:
+        assert re.fullmatch(sample_regex, choice.text) is not None
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+async def test_guided_choice_completion(client: openai.AsyncOpenAI,
+                                        guided_decoding_backend: str,
+                                        sample_guided_choice,
+                                        is_v1_server: bool):
+    """Each sampled completion must be one of `sample_guided_choice` when
+    `guided_choice` is requested."""
+    if not is_v1_server:
+        pytest.skip("Guided decoding is only supported in v1 engine")
+
+    num_choices = 2
+    guided_params = dict(guided_choice=sample_guided_choice,
+                         guided_decoding_backend=guided_decoding_backend)
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt="The best language for type-safe systems programming is ",
+        n=num_choices,
+        temperature=1.0,
+        max_tokens=10,
+        extra_body=guided_params)
+
+    assert completion.id is not None
+    assert len(completion.choices) == num_choices
+    # The length assertion above guarantees exactly `num_choices` iterations.
+    for choice in completion.choices:
+        assert choice.text in sample_guided_choice
+
+
+@pytest.mark.asyncio
+async def test_guided_grammar(client: openai.AsyncOpenAI,
+                              sample_sql_statements, is_v1_server: bool):
+    """Generate SQL under `guided_grammar` and check it parses with the same
+    grammar and equals the expected statement (with spaces removed)."""
+    if not is_v1_server:
+        pytest.skip("Guided grammar is only supported in v1 engine")
+
+    sql_prompt = ("Generate a sql state that select col_1 from "
+                  "table_1 where it is equals to 1")
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=sql_prompt,
+        temperature=1.0,
+        max_tokens=500,
+        extra_body=dict(guided_grammar=sample_sql_statements))
+
+    generated_sql = completion.choices[0].text
+
+    # Parsing with Lark raises if the output is not valid for the grammar.
+    from lark import Lark
+    Lark(sample_sql_statements).parse(generated_sql)
+
+    # remove spaces for comparison b/c we removed them in the grammar
+    expected_sql = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")
+
+    assert generated_sql.strip() == expected_sql
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    # first test base model, then test loras
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+@pytest.mark.parametrize("logprobs_arg", [1, 0])
+async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
+                                       model_name: str, logprobs_arg: int):
+    """Check echoed completions report logprobs for prompt and output tokens.
+
+    With `echo=True` the returned text must begin with the prompt, and the
+    logprob arrays must hold more entries than the 5 generated tokens. The
+    first entry of `token_logprobs` and `top_logprobs` is expected to be None.
+
+    Args:
+        client: client used to send the completion requests.
+        model_name: model to query (base model or the LoRA adapter).
+        logprobs_arg: value passed as the `logprobs` request parameter.
+    """
+    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+    # test using text and token IDs
+    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
+        completion = await client.completions.create(model=model_name,
+                                                     prompt=prompt,
+                                                     max_tokens=5,
+                                                     temperature=0.0,
+                                                     echo=True,
+                                                     logprobs=logprobs_arg)
+
+        prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
+                                                             list) else prompt
+        # Compare literally: the prompt is plain text, not a regex pattern,
+        # so metacharacters in it must not be interpreted.
+        assert completion.choices[0].text.startswith(prompt_text)
+        logprobs = completion.choices[0].logprobs
+        assert logprobs is not None
+        assert len(logprobs.text_offset) > 5
+        assert (len(logprobs.token_logprobs) > 5
+                and logprobs.token_logprobs[0] is None)
+        assert (len(logprobs.top_logprobs) > 5
+                and logprobs.top_logprobs[0] is None)
+        # Each position may carry one extra entry beyond the requested count.
+        for top_logprobs in logprobs.top_logprobs[1:]:
+            assert max(logprobs_arg,
+                       1) <= len(top_logprobs) <= logprobs_arg + 1
+        assert len(logprobs.tokens) > 5
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
+                                          guided_decoding_backend: str,
+                                          sample_json_schema, sample_regex,
+                                          is_v1_server: bool):
+    """Invalid guided-decoding requests must raise `openai.BadRequestError`.
+
+    Two requests are sent: one with the integer 42 as `guided_json`, and one
+    that supplies both `guided_regex` and `guided_json`.
+    """
+    if not is_v1_server:
+        pytest.skip("Guided decoding is only supported in v1 engine")
+
+    non_schema_params = dict(guided_json=42,
+                             guided_decoding_backend=guided_decoding_backend)
+    with pytest.raises(openai.BadRequestError):
+        await client.completions.create(
+            model=MODEL_NAME,
+            prompt="Give an example JSON that fits this schema: 42",
+            extra_body=non_schema_params)
+
+    # Presumably rejected because the two guided options are mutually
+    # exclusive - confirm against the server's request validation.
+    conflicting_params = dict(guided_regex=sample_regex,
+                              guided_json=sample_json_schema)
+    with pytest.raises(openai.BadRequestError):
+        await client.completions.create(
+            model=MODEL_NAME,
+            prompt="Give an example string that fits this regex",
+            extra_body=conflicting_params)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name,stream,echo",
+    [
+        (MODEL_NAME, False, False),
+        (MODEL_NAME, False, True),
+        (MODEL_NAME, True, False),
+        (MODEL_NAME, True, True)  # should not raise BadRequestError
+    ],
+)
+async def test_echo_stream_completion(client: openai.AsyncOpenAI,
+                                      model_name: str, stream: bool,
+                                      echo: bool):
+    """Check `echo` across streaming and non-streaming completions.
+
+    The prompt must appear in the returned text exactly when `echo` is set,
+    and the finish reason must be "length" in every combination.
+    """
+    saying: str = "Hello, my name is"
+    expected_finish = "length"
+    result = await client.completions.create(model=model_name,
+                                             prompt=saying,
+                                             max_tokens=10,
+                                             temperature=0.0,
+                                             echo=echo,
+                                             stream=stream)
+
+    if stream:
+        pieces: list[str] = []
+        last_finish_reason = None
+        async for chunk in result:
+            if not chunk.choices:
+                continue
+            head = chunk.choices[0]
+            if head.text:
+                pieces.append(head.text)
+            if head.finish_reason:
+                last_finish_reason = head.finish_reason
+
+        assert last_finish_reason == expected_finish
+        content = "".join(pieces)
+        assert content is not None and (saying in content) == echo
+        return
+
+    assert result.id is not None
+    assert result.choices is not None and len(result.choices) == 1
+
+    choice = result.choices[0]
+    assert len(choice.text) >= 5
+    assert choice.finish_reason == expected_finish
+    assert choice.text is not None and (saying in choice.text) == echo
+
+
+@pytest.mark.asyncio
+async def test_invocations(server: RemoteOpenAIServer,
+                           client: openai.AsyncOpenAI):
+    """The `invocations` endpoint must answer like the completions API.
+
+    The same request is sent through the OpenAI client and as a raw POST to
+    the `invocations` URL; the top-level keys and the `choices` must match.
+    """
+    payload = dict(
+        model=MODEL_NAME,
+        prompt="Hello, my name is",
+        max_tokens=5,
+        temperature=0.0,
+        logprobs=None,
+    )
+
+    sdk_completion = await client.completions.create(**payload)
+
+    raw_response = requests.post(server.url_for("invocations"), json=payload)
+    raw_response.raise_for_status()
+
+    expected = sdk_completion.model_dump()
+    actual = raw_response.json()
+
+    assert expected.keys() == actual.keys()
+    assert expected["choices"] == actual["choices"]
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@ -142,7 +142,7 @@ def server():  # noqa: F811
        "--dtype",
        "half",
        "--enable-auto-tool-choice",
-        "--structured-outputs-config.backend",
+        "--guided-decoding-backend",
        "xgrammar",
        "--tool-call-parser",
        "hermes",
@ -225,7 +225,7 @@ def k2_server():  # noqa: F811
        "--dtype",
        "half",
        "--enable-auto-tool-choice",
-        "--structured-outputs-config.backend",
+        "--guided-decoding-backend",
        "xgrammar",
        "--tool-call-parser",
        "hermes",
--- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
+++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@ -14,9 +14,6 @@ from transformers import AutoConfig

 from ...utils import RemoteOpenAIServer

-pytest.skip("Skipping prompt_embeds test until V1 supports it.",
-            allow_module_level=True)
-
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

@ -231,20 +228,3 @@ async def test_completions_with_logprobs_and_prompt_embeds(
            assert max(logprobs_arg,
                       1) <= len(top_logprobs) <= logprobs_arg + 1
        assert len(logprobs.tokens) == 5
-
-
-@pytest.mark.asyncio
-async def test_prompt_logprobs_raises_error(
-        client_with_prompt_embeds: openai.AsyncOpenAI):
-    with pytest.raises(BadRequestError, match="not compatible"):
-        encoded_embeds = create_dummy_embeds()
-        await client_with_prompt_embeds.completions.create(
-            model=MODEL_NAME,
-            prompt="",
-            max_tokens=5,
-            temperature=0.0,
-            extra_body={
-                "prompt_embeds": encoded_embeds,
-                "prompt_logprobs": True
-            },
-        )
--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@ -53,13 +53,12 @@ def monkeypatch_module():
    mpatch.undo()


-@pytest.fixture(scope="module", params=[True])
+@pytest.fixture(scope="module", params=[False, True])
 def server_with_lora_modules_json(request, monkeypatch_module,
                                  zephyr_lora_files):

    use_v1 = request.param
-    assert use_v1
-    monkeypatch_module.setenv('VLLM_USE_V1', '1')
+    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')

    # Define the json format LoRA module configurations
    lora_module_1 = {
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@ -22,7 +22,7 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 PREV_MINOR_VERSION = version._prev_minor_version()


-@pytest.fixture(scope="module", params=[True])
+@pytest.fixture(scope="module", params=[True, False])
 def use_v1(request):
    # Module-scoped variant of run_with_both_engines
    #
--- a/tests/entrypoints/openai/test_openai_schema.py
+++ b/tests/entrypoints/openai/test_openai_schema.py
@ -102,14 +102,12 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
                                if "custom" in tool_call:
                                    return False

-            # Sometimes structured_outputs.grammar is generated to be empty
+            # Sometimes guided_grammar is generated to be empty
            # Causing a server error in EBNF grammar parsing
            # https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421
-            structured_outputs = case.body.get("structured_outputs", {})
-            grammar = structured_outputs.get("grammar") if isinstance(
-                structured_outputs, dict) else None
+            guided_grammar = case.body.get("guided_grammar")

-            if grammar == '':
+            if guided_grammar == '':
                # Allow None (will be handled as no grammar)
                # But skip empty strings
                return False
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
@ -3,7 +3,7 @@

 import io

-# imports for structured outputs tests
+# imports for guided decoding tests
 import openai
 import pybase64
 import pytest
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@ -10,30 +10,8 @@ import pytest
 from vllm.transformers_utils.tokenizer import get_tokenizer

 from ...utils import RemoteOpenAIServer
-
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-
-
-@pytest.fixture(scope="module")
-def default_server_args(zephyr_lora_files):
-    return [
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--max-num-seqs",
-        "128",
-        "--enforce-eager",
-        # lora config
-        "--enable-lora",
-        "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
-        "--max-lora-rank",
-        "64",
-        "--max-cpu-loras",
-        "2",
-    ]
+from .test_completion import default_server_args  # noqa: F401
+from .test_completion import MODEL_NAME


@pytest.fixture(scope="module")
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@ -333,6 +333,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
            "role": "user",
            "content": "what is 1+1?"
        }],
+        guided_decoding_backend="outlines",
    )

    with suppress(Exception):
@ -377,6 +378,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
            "role": "user",
            "content": "what is 1+1?"
        }],
+        guided_decoding_backend="outlines",
    )

    with suppress(Exception):
@ -431,6 +433,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
            "role": "user",
            "content": "what is 1+1?"
        }],
+        guided_decoding_backend="outlines",
    )

    with suppress(Exception):
@ -486,6 +489,7 @@ async def test_serving_chat_could_load_correct_generation_config():
            "role": "user",
            "content": "what is 1+1?"
        }],
+        guided_decoding_backend="outlines",
    )

    with suppress(Exception):
--- a/tests/entrypoints/openai/test_skip_tokenizer.py
+++ b/tests/entrypoints/openai/test_skip_tokenizer.py
@ -15,6 +15,14 @@ MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
 DTYPE = "float16"


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """Autouse wrapper that requests `run_with_both_engines` for every test.
+
+    This can be promoted up to conftest.py to run for every test in a
+    package.
+    """
+
+
@pytest.fixture(scope="module")
 def server():
    args = [
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# imports for structured outputs tests
+# imports for guided decoding tests
 import io
 import json

--- a/tests/entrypoints/openai/test_translation_validation.py
+++ b/tests/entrypoints/openai/test_translation_validation.py
@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import io
-# imports for structured outputs tests
+# imports for guided decoding tests
 import json

 import httpx
--- a/tests/kernels/attention/test_triton_unified_attention.py
+++ b/tests/kernels/attention/test_triton_unified_attention.py
@ -102,6 +102,9 @@ def test_triton_unified_attn(
 ) -> None:
    torch.set_default_device("cuda")

+    if q_dtype is not None and q_dtype.itemsize < 2 and block_size < 32:
+        pytest.skip("block size must be at least 32 for fp8")
+
    current_platform.seed_everything(0)
    num_seqs = len(seq_lens)
    query_lens = [x[0] for x in seq_lens]
--- a/tests/metrics/__init__.py
+++ b/tests/metrics/__init__.py
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@ -0,0 +1,268 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import ray
+from prometheus_client import REGISTRY
+
+import vllm.envs as envs
+from vllm import EngineArgs, LLMEngine
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.metrics import RayPrometheusStatLogger
+from vllm.sampling_params import SamplingParams
+from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET
+
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """Set VLLM_USE_V1=0 for each test, since this module tests V0 internals."""
+    monkeypatch.setenv("VLLM_USE_V1", "0")
+
+
+# Model names used to parametrize the `model` argument of the tests below.
+MODELS = [
+    "distilbert/distilgpt2",
+]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [128])
+def test_metric_counter_prompt_tokens(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    """The prompt-token counter must equal the summed tokenized prompt
+    lengths after one greedy generation pass."""
+    runner_kwargs = dict(dtype=dtype,
+                         disable_log_stats=False,
+                         gpu_memory_utilization=0.4)
+    with vllm_runner(model, **runner_kwargs) as vllm_model:
+        tokenizer = vllm_model.llm.get_tokenizer()
+        lengths = [len(tokenizer.encode(text)) for text in example_prompts]
+        # At least 2 prompts of different lengths are needed in the batch so
+        # the token count is verified despite padding.
+        assert len(example_prompts) > 1, "at least 2 prompts are required"
+        assert lengths[0] != lengths[1], (
+            "prompts of different lengths are required")
+        expected_total = sum(lengths)
+
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+        prom_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
+        counter = prom_logger.metrics.counter_prompt_tokens
+        metric_count = counter.labels(**prom_logger.labels)._value.get()
+
+    assert expected_total == metric_count, (
+        f"prompt token count: {expected_total!r}\n"
+        f"metric: {metric_count!r}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [128])
+def test_metric_counter_generation_tokens(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    """The generation-token counter must equal the number of generated
+    tokens, computed as output length minus prompt length per request."""
+    runner_kwargs = dict(dtype=dtype,
+                         disable_log_stats=False,
+                         gpu_memory_utilization=0.4)
+    with vllm_runner(model, **runner_kwargs) as vllm_model:
+        outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        tokenizer = vllm_model.llm.get_tokenizer()
+        prom_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
+        counter = prom_logger.metrics.counter_generation_tokens
+        metric_count = counter.labels(**prom_logger.labels)._value.get()
+        expected_total = 0
+        for idx, prompt in enumerate(example_prompts):
+            output_ids, _output_str = outputs[idx]
+            # output_ids holds prompt tokens followed by generated tokens;
+            # only the generated part is counted.
+            expected_total += len(output_ids) - len(tokenizer.encode(prompt))
+
+    assert expected_total == metric_count, (
+        f"generation token count: {expected_total!r}\n"
+        f"metric: {metric_count!r}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize(
+    "served_model_name",
+    [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
+def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
+                                   served_model_name: list[str]) -> None:
+    """The `model_name` metrics label must be the first served model name,
+    or the model itself when no served name is given."""
+    with vllm_runner(model,
+                     dtype=dtype,
+                     disable_log_stats=False,
+                     gpu_memory_utilization=0.3,
+                     served_model_name=served_model_name) as vllm_model:
+        prom_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
+        metrics_tag_content = prom_logger.labels["model_name"]
+
+    if envs.VLLM_CI_USE_S3:
+        model = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}"
+    # None and [] both mean "no served name": fall back to the model name.
+    expected_tag = served_model_name[0] if served_model_name else model
+    assert metrics_tag_content == expected_tag, (
+        f"Metrics tag model_name is wrong! expect: {expected_tag!r}\n"
+        f"actual: {metrics_tag_content!r}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [4])
+@pytest.mark.parametrize("disable_log_stats", [True, False])
+@pytest.mark.asyncio
+async def test_async_engine_log_metrics_regression(
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    disable_log_stats: bool,
+) -> None:
+    """
+    Regression test ensuring async engine generates metrics
+    when disable_log_stats=False
+    (see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678)
+    """
+    async_engine = AsyncLLMEngine.from_engine_args(
+        AsyncEngineArgs(
+            model=model,
+            dtype=dtype,
+            disable_log_stats=disable_log_stats,
+        ))
+    for idx, prompt in enumerate(example_prompts):
+        stream = async_engine.generate(
+            prompt,
+            SamplingParams(max_tokens=max_tokens),
+            f"request-id-{idx}",
+        )
+        # Drain the async iterator so the engine actually runs the request.
+        async for _ in stream:
+            pass
+
+    # NOTE(review): unlike test_engine_log_metrics_regression, the model name
+    # is not prefixed with MODEL_WEIGHTS_S3_BUCKET when envs.VLLM_CI_USE_S3 is
+    # set - confirm whether that is intended.
+    num_requests = len(example_prompts)
+    assert_metrics(model, async_engine.engine, disable_log_stats,
+                   num_requests)
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [4])
+@pytest.mark.parametrize("disable_log_stats", [True, False])
+def test_engine_log_metrics_regression(
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    disable_log_stats: bool,
+) -> None:
+    """Run every example prompt through a synchronous LLMEngine and check
+    the resulting metrics with `assert_metrics`."""
+    engine = LLMEngine.from_engine_args(
+        EngineArgs(
+            model=model,
+            dtype=dtype,
+            disable_log_stats=disable_log_stats,
+        ))
+    for idx, prompt in enumerate(example_prompts):
+        engine.add_request(
+            f"request-id-{idx}",
+            prompt,
+            SamplingParams(max_tokens=max_tokens),
+        )
+    # Step until every queued request has finished.
+    while engine.has_unfinished_requests():
+        engine.step()
+
+    # The model name checked in the metrics label carries the S3 bucket
+    # prefix when VLLM_CI_USE_S3 is set.
+    label_model = (f"{MODEL_WEIGHTS_S3_BUCKET}/{model}"
+                   if envs.VLLM_CI_USE_S3 else model)
+    assert_metrics(label_model, engine, disable_log_stats,
+                   len(example_prompts))
+
+
+def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool,
+                   num_requests: int) -> None:
+    """Check the engine's metrics state against `disable_log_stats`.
+
+    With stats disabled, reading `engine.stat_loggers` must raise
+    AttributeError. Otherwise each request-level histogram's `_count` sample
+    for the `model` label must equal `num_requests`.
+    """
+    if disable_log_stats:
+        with pytest.raises(AttributeError):
+            _ = engine.stat_loggers
+        return
+
+    assert (engine.stat_loggers
+            is not None), "engine.stat_loggers should be set"
+    # Comparing the count bucket of each request-level histogram with the
+    # number of requests is a simple sanity check that metrics are generated.
+    labels = {'model_name': model}
+    for metric_name in (
+            "vllm:e2e_request_latency_seconds",
+            "vllm:request_prompt_tokens",
+            "vllm:request_generation_tokens",
+            "vllm:request_params_n",
+            "vllm:request_params_max_tokens",
+    ):
+        observed = REGISTRY.get_sample_value(f"{metric_name}_count", labels)
+        assert (observed == num_requests), "Metrics should be collected"
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [16])
+def test_engine_log_metrics_ray(
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    """Smoke-test RayPrometheusStatLogger inside a Ray task.
+
+    An engine is built in a remote task, a counting subclass of
+    RayPrometheusStatLogger is attached, all example prompts are run, and
+    the test asserts that `log` was called at least once.
+    """
+    # This test is quite weak - it only checks that we can use
+    # RayPrometheusStatLogger without exceptions.
+    # Checking whether the metrics are actually emitted is unfortunately
+    # non-trivial.
+
+    # We have to run in a Ray task for Ray metrics to be emitted correctly
+    @ray.remote(num_gpus=1)
+    def _inner():
+
+        # Subclass that counts how many times `log` is invoked.
+        class _RayPrometheusStatLogger(RayPrometheusStatLogger):
+
+            def __init__(self, *args, **kwargs):
+                self._i = 0
+                super().__init__(*args, **kwargs)
+
+            def log(self, *args, **kwargs):
+                self._i += 1
+                return super().log(*args, **kwargs)
+
+        engine_args = EngineArgs(
+            model=model,
+            dtype=dtype,
+            disable_log_stats=False,
+        )
+        engine = LLMEngine.from_engine_args(engine_args)
+        logger = _RayPrometheusStatLogger(
+            local_interval=0.5,
+            labels=dict(model_name=engine.model_config.served_model_name),
+            vllm_config=engine.vllm_config)
+        engine.add_logger("ray", logger)
+        for i, prompt in enumerate(example_prompts):
+            engine.add_request(
+                f"request-id-{i}",
+                prompt,
+                SamplingParams(max_tokens=max_tokens),
+            )
+        # Step until every queued request has finished.
+        while engine.has_unfinished_requests():
+            engine.step()
+        assert logger._i > 0, ".log must be called at least once"
+
+    # Block until the remote task finishes; a failure inside it is raised here.
+    ray.get(_inner.remote())
--- a/tests/model_executor/test_logits_processor.py
+++ b/tests/model_executor/test_logits_processor.py
@ -0,0 +1,98 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import random
+from unittest.mock import patch
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.model_executor.utils import set_random_seed
+from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
+from vllm.utils import is_pin_memory_available
+
+
+class MockLogitsProcessor(LogitsProcessor):
+    """LogitsProcessor test double that feeds preset logits to `forward`.
+
+    `forward` runs the parent implementation with `_prune_hidden_states`
+    replaced by a function returning its first argument and `_get_logits`
+    replaced by a function returning `fake_logits`.
+    """
+
+    def __init__(self, vocab_size: int, scale: float,
+                 fake_logits: torch.Tensor):
+        super().__init__(vocab_size=vocab_size, scale=scale)
+        # Store a copy rather than the caller's tensor.
+        self.fake_logits = fake_logits.clone()
+
+    def forward(self, *args, **kwargs):
+        # Both patches are active only while the parent forward runs.
+        with patch(
+                "vllm.model_executor.layers.logits_processor._prune_hidden_states",
+                lambda x, y: x
+        ), patch(
+                "vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits",
+                lambda *args, **kwargs: self.fake_logits):
+            return super().forward(*args, **kwargs)
+
+
+def _prepare_test(
+        batch_size: int
+) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]:
+    """Build the inputs and mock logits processor for one batch.
+
+    Args:
+        batch_size: number of rows in the generated tensors.
+
+    Returns:
+        A tuple of the random float16 input tensor of shape
+        (batch_size, 1024), the fake logits of shape (batch_size, vocab_size)
+        filled with 1e-2, and a MockLogitsProcessor created with scale 0.5
+        and those fake logits.
+    """
+    vocab_size = 32000
+    input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16)
+    fake_logits = torch.full((batch_size, vocab_size),
+                             1e-2,
+                             dtype=input_tensor.dtype)
+    # Reuse vocab_size so the processor and the fake logits cannot drift apart.
+    logits_processor = MockLogitsProcessor(vocab_size, 0.5, fake_logits)
+    return input_tensor, fake_logits, logits_processor
+
+
+# Seeds 0..127 used to parametrize test_logits_processors.
+RANDOM_SEEDS = list(range(128))
+# "cuda:0" only when exactly one CUDA device is reported, otherwise "cuda:0"
+# and "cuda:1".
+# NOTE(review): a device count of 0 also yields two entries - confirm intended.
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+
+
+@pytest.mark.parametrize("seed", RANDOM_SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_logits_processors(seed: int, device: str):
+    """Check that per-request `logits_processors` are applied to the logits.
+
+    A random-sized batch of prompt sequence groups is built, each with a
+    processor that sets one logit to infinity. The output must be infinite in
+    column 0 and equal to the scaled fake logits in column 1.
+    """
+    set_random_seed(seed)
+    torch.set_default_device(device)
+    batch_size = random.randint(1, 256)
+    input_tensor, fake_logits, logits_processor = _prepare_test(batch_size)
+
+    # This sample logits processor gives infinite score to the i-th token,
+    # where i is the length of the input sequence.
+    # We therefore expect the output token sequence to be [0, 1, 2, ...]
+    # NOTE(review): the assertion below checks column 0, so token_ids is
+    # presumably the (still empty) list of generated tokens - confirm.
+    def pick_ith(token_ids, logits):
+        logits[len(token_ids)] = float("inf")
+        return logits
+
+    seq_group_metadata_list = []
+    seq_lens = []
+    for i in range(batch_size):
+        seq_group_metadata_list.append(
+            SequenceGroupMetadata(
+                request_id=f"test_{i}",
+                is_prompt=True,
+                seq_data={0: SequenceData.from_seqs([1, 2, 3])},
+                sampling_params=SamplingParams(temperature=0,
+                                               logits_processors=[pick_ith]),
+                block_tables={0: [1]},
+            ))
+        # Record the length of the sequence that was just appended.
+        seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
+
+    sampling_metadata = SamplingMetadata.prepare(
+        seq_group_metadata_list,
+        seq_lens,
+        query_lens=seq_lens,
+        device=device,
+        pin_memory=is_pin_memory_available())
+    logits_processor_output = logits_processor(
+        lm_head=None,
+        hidden_states=input_tensor,
+        sampling_metadata=sampling_metadata)
+
+    # Column 0 is the logit pick_ith set to infinity for every row.
+    assert torch.isinf(logits_processor_output[:, 0]).all()
+
+    # Column 1 was left alone by pick_ith, so it must equal the fake logits
+    # multiplied by the processor's scale.
+    fake_logits *= logits_processor.scale
+    torch.testing.assert_close(logits_processor_output[:, 1],
+                               fake_logits[:, 1],
+                               rtol=1e-4,
+                               atol=0.0)
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@ -418,9 +418,7 @@ def test_full_cuda_graph(
@pytest.mark.parametrize("model", FP32_STATE_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("cache_dtype_param",
-                         ["mamba_ssm_cache_dtype", "mamba_cache_dtype"])
-def test_fp32_cache_state(
+def test_fp32_state(
    hf_runner,
    vllm_runner,
    example_prompts,
@ -428,7 +426,6 @@ def test_fp32_cache_state(
    model: str,
    max_tokens: int,
    num_logprobs: int,
-    cache_dtype_param: str,
 ) -> None:

    try:
@ -446,13 +443,13 @@ def test_fp32_cache_state(
        m.setenv("VLLM_USE_V1", "0")
        with vllm_runner(model,
                         max_num_seqs=MAX_NUM_SEQS,
-                         **{cache_dtype_param: "float32"}) as vllm_model:
+                         mamba_ssm_cache_dtype="float32") as vllm_model:
            vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
                example_prompts, max_tokens, num_logprobs)

    with vllm_runner(model,
                     max_num_seqs=MAX_NUM_SEQS,
-                     **{cache_dtype_param: "float32"}) as vllm_model:
+                     mamba_ssm_cache_dtype="float32") as vllm_model:
        vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

--- a/tests/models/language/pooling/test_token_classification.py
+++ b/tests/models/language/pooling/test_token_classification.py
@ -1,39 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-import torch
-from transformers import AutoModelForTokenClassification
-
-from tests.models.utils import softmax
-
-
-@pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"])
-# The float32 is required for this tiny model to pass the test.
-@pytest.mark.parametrize("dtype", ["float"])
-@torch.inference_mode
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-) -> None:
-    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
-
-    with hf_runner(model,
-                   dtype=dtype,
-                   auto_cls=AutoModelForTokenClassification) as hf_model:
-        tokenizer = hf_model.tokenizer
-        hf_outputs = []
-        for prompt in example_prompts:
-            inputs = tokenizer([prompt], return_tensors="pt")
-            inputs = hf_model.wrap_device(inputs)
-            output = hf_model.model(**inputs)
-            hf_outputs.append(softmax(output.logits[0]))
-
-    # check logits difference
-    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
-        hf_output = torch.tensor(hf_output).cpu().float()
-        vllm_output = torch.tensor(vllm_output).cpu().float()
-        assert torch.allclose(hf_output, vllm_output, 1e-2)
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@ -414,7 +414,6 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {

    # [Cross-encoder]
    "BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2"),  # noqa: E501
-    "BertForTokenClassification": _HfExamplesInfo("boltuix/NeuroBERT-NER"),
    "GteNewForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-multilingual-reranker-base",  # noqa: E501
                                                       trust_remote_code=True,
                                                       hf_overrides={
--- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py
+++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py
@ -22,7 +22,7 @@ class DataModuleConfig(TypedDict):

 class ImagePrompt(BaseModel):

-    data_format: Literal["b64_json", "bytes", "url", "path"]
+    data_format: Literal["b64_json", "bytes", "url"]
    """
    This is the data type for the input image
    """
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@ -0,0 +1,182 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from vllm import SamplingParams
+
+from ..conftest import VllmRunner
+
+MODELS = ["distilbert/distilgpt2"]
+
+
+@pytest.fixture(autouse=True, scope="function")
+def use_v0_only(monkeypatch):
+    """Pin every test in this module to the V0 engine.
+
+    The tests here use dtype=float, which is why the module is V0 only;
+    this autouse fixture exports VLLM_USE_V1=0 before each test.
+    """
+    env_name, env_value = "VLLM_USE_V1", "0"
+    monkeypatch.setenv(env_name, env_value)
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype",
+                         ["float"])  # needed for comparing logprobs with HF
+@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1])
+@pytest.mark.parametrize("num_top_logprobs", [0, 6])  # top logprobs requested
+@pytest.mark.parametrize("detokenize", [True, False])
+def test_get_prompt_logprobs(
+    hf_runner,
+    vllm_runner,
+    model,
+    dtype,
+    chunked_prefill_token_size: int,
+    num_top_logprobs: int,
+    detokenize: bool,
+    example_prompts,
+):
+    """Check vLLM prompt and sample logprobs against the HF reference.
+
+    The test runs with and without chunked prefill (a chunk size of -1
+    leaves it disabled) and verifies that the returned logprobs have the
+    expected structure, agree with the HF values within tolerance, and
+    contain every prompt token.
+    """
+    max_num_seqs = 256
+    enable_chunked_prefill = False
+    max_num_batched_tokens = None
+    if chunked_prefill_token_size != -1:
+        # Chunked prefill: cap both the batch size and the token budget by
+        # the chunk size.
+        enable_chunked_prefill = True
+        max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
+        max_num_batched_tokens = chunked_prefill_token_size
+
+    max_tokens = 5
+    # Reference logprobs from the HF implementation.
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_logprobs = hf_model.generate_greedy_logprobs(
+            example_prompts,
+            max_tokens=max_tokens,
+        )
+
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            max_logprobs=num_top_logprobs,
+            enable_chunked_prefill=enable_chunked_prefill,
+            max_num_batched_tokens=max_num_batched_tokens,
+            max_num_seqs=max_num_seqs,
+    ) as vllm_model:
+        vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
+                                              logprobs=num_top_logprobs,
+                                              prompt_logprobs=num_top_logprobs,
+                                              temperature=0.0,
+                                              detokenize=detokenize)
+        vllm_results = vllm_model.llm.generate(
+            example_prompts, sampling_params=vllm_sampling_params)
+
+    # Test whether logprobs are included in the results.
+    for result in vllm_results:
+        assert result.prompt_logprobs is not None
+        assert result.outputs[0].logprobs is not None
+        assert len(result.outputs[0].logprobs) == max_tokens
+        for logprobs in result.outputs[0].logprobs:
+            # If the output token is not included in the top X
+            # logprob, it can return 1 more data
+            assert (len(logprobs) == num_top_logprobs
+                    or len(logprobs) == num_top_logprobs + 1)
+        output_text = result.outputs[0].text
+        output_string_from_most_likely_tokens_lst: list[str] = []
+        for top_logprobs in result.outputs[0].logprobs:
+            # Take the first entry of each position's logprob dict.
+            top_logprob = next(iter(top_logprobs.values()))
+            output_string_from_most_likely_tokens_lst.append(
+                top_logprob.decoded_token)
+
+        if detokenize:
+            output_string_from_most_likely_tokens = "".join(
+                output_string_from_most_likely_tokens_lst)
+            assert output_text == output_string_from_most_likely_tokens, (
+                "The output text from the top logprob for each token position "
+                "should be the same as the output text in the result.")
+        else:
+            # Without detokenization there is no text and no decoded tokens.
+            assert output_text == ''
+            assert output_string_from_most_likely_tokens_lst == ([None] *
+                                                                 max_tokens)
+
+        # The first prompt logprob is always None
+        assert result.prompt_logprobs[0] is None
+        for prompt_logprobs in result.prompt_logprobs[1:]:
+            # If the prompt token is not included in the top X
+            # logprob, it can return 1 more data
+            assert (len(prompt_logprobs) == num_top_logprobs
+                    or len(prompt_logprobs) == num_top_logprobs + 1)
+
+    # Test whether prompt logprobs are consistent with HF
+    for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs):
+        # Check prompt logprobs
+        # The first prompt logprob is always None, so we compare it from 1:.
+        vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:]
+        for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs):
+            for token_id, logprob in vllm_prompt_logprob_dict.items():
+                torch.testing.assert_close(logprob.logprob,
+                                           hf_logprob[0][i][token_id].item(),
+                                           atol=1e-2,
+                                           rtol=1e-2)
+        # Check the logprobs of the generated tokens against HF as well.
+        vllm_sample_logprobs = vllm_result.outputs[0].logprobs
+        for i, top_logprobs in enumerate(vllm_sample_logprobs):
+            for token_id, sample_logprob in top_logprobs.items():
+                logprob = sample_logprob.logprob
+                torch.testing.assert_close(logprob,
+                                           hf_logprob[i][-1][token_id].item(),
+                                           atol=1e-2,
+                                           rtol=1e-2)
+                if detokenize:
+                    assert isinstance(sample_logprob.decoded_token, str), (
+                        "The token should be decoded by the time it is returned"
+                        " to the user.")
+
+    # Test if prompt logprobs are correctly set.
+    for vllm_result in vllm_results:
+        token_ids = vllm_result.prompt_token_ids
+        prompt_logprobs = vllm_result.prompt_logprobs
+
+        # The first token doesn't have logprob.
+        assert prompt_logprobs[0] is None
+
+        # Every later prompt token must appear in its position's logprob dict.
+        for token_id, logprob_dict in zip(token_ids[1:], prompt_logprobs[1:]):
+            assert token_id in logprob_dict
+
+
+def test_max_logprobs():
+    """Requests asking for more logprobs than ``max_logprobs`` are rejected.
+
+    A runner created with ``max_logprobs=1`` must accept ``logprobs=1`` and
+    raise ``ValueError`` for ``logprobs=2``.
+    """
+    # Enter the runner as a context manager (like the ``vllm_runner`` based
+    # tests in this module) so the engine is released when the test ends,
+    # even if one of the calls below fails.
+    with VllmRunner("facebook/opt-125m", max_logprobs=1) as runner:
+        vllm_sampling_params = SamplingParams(logprobs=1)
+        # should pass
+        runner.generate(["Hello world"],
+                        sampling_params=vllm_sampling_params)
+
+        bad_sampling_params = SamplingParams(logprobs=2)
+        with pytest.raises(ValueError):
+            runner.generate(["Hello world"],
+                            sampling_params=bad_sampling_params)
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1])
+@pytest.mark.parametrize("detokenize", [True, False])
+def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int,
+                       detokenize: bool, example_prompts):
+    """With ``logprobs=None`` no logprob data is attached to the outputs.
+
+    A chunk size of -1 leaves chunked prefill disabled; any other value
+    enables it and caps the batch size and token budget by that value.
+    """
+    use_chunked_prefill = chunked_prefill_token_size != -1
+    if use_chunked_prefill:
+        seq_limit = min(chunked_prefill_token_size, 256)
+        batched_token_limit = chunked_prefill_token_size
+    else:
+        seq_limit = 256
+        batched_token_limit = None
+    token_budget = 5
+
+    with vllm_runner(
+            model,
+            enable_chunked_prefill=use_chunked_prefill,
+            max_num_batched_tokens=batched_token_limit,
+            max_num_seqs=seq_limit,
+    ) as vllm_model:
+        params_without_logprobs = SamplingParams(max_tokens=token_budget,
+                                                 logprobs=None,
+                                                 temperature=0.0,
+                                                 detokenize=detokenize)
+        results = vllm_model.llm.generate(
+            example_prompts, sampling_params=params_without_logprobs)
+
+    for request_output in results:
+        first_completion = request_output.outputs[0]
+        assert first_completion.logprobs is None
+        assert first_completion.cumulative_logprob is None
--- a/tests/test_cache_block_hashing.py
+++ b/tests/test_cache_block_hashing.py
@ -0,0 +1,92 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Test hashing of cache blocks.
+
+Run `pytest tests/test_cache_block_hashing.py`.
+"""
+from typing import Optional
+
+import pytest
+
+from vllm.inputs import token_inputs
+from vllm.lora.request import LoRARequest
+from vllm.sequence import Sequence
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+# Two prefixes whose first blocks differ; the rest of the text is shared.
+prefix_start = ["You are an expert", "You are a"]
+prefix_common = (
+    " school principal, skilled in effectively managing "
+    "faculty and staff. Draft 10-15 questions for a potential first grade "
+    "Head Teacher for my K-12, all-girls', independent school that emphasizes "
+    "community, joyful discovery, and life-long learning. The candidate is "
+    "coming in for a first-round panel interview for a 8th grade Math "
+    "teaching role. They have 5 years of previous teaching experience "
+    "as an assistant teacher at a co-ed, public school with experience "
+    "in middle school math teaching. Based on this, fulfill "
+    "the following: ")
+prefixes = [f"{start}{prefix_common}" for start in prefix_start]
+
+# Prompts appended to each prefix.
+sample_prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+
+# Helper: concatenate the inner sequences of a 2-D list, keeping their order.
+def flatten_2d(li):
+    flat = []
+    for inner in li:
+        flat.extend(inner)
+    return flat
+
+
+@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("max_num_seqs", [256])
+@pytest.mark.parametrize("concurrent_lora_int_ids",
+                         [[None], [1], [None, 1], [None, 1, 2], [1, 2]])
+def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
+                             concurrent_lora_int_ids: list[Optional[int]]):
+    """Check block hashes of prompts that share a prefix.
+
+    For every (prefix, LoRA id) combination each sample prompt is tokenized
+    and the hash of every full block is recorded, giving
+    ``hashes[combination][prompt][block]``.
+    """
+    # Load the tokenizer of the parametrized model instead of repeating the
+    # model name, so extending the "model" list really changes what is tested.
+    tokenizer = get_tokenizer(model)
+
+    hashes: list[list[list[int]]] = []
+
+    for prefix in prefixes:
+        for lora_int_id in concurrent_lora_int_ids:
+            lora_request = None
+
+            if lora_int_id is not None:
+                lora_request = LoRARequest(
+                    f"example_lora_{lora_int_id}",
+                    lora_int_id,
+                    f"example/path/to/lora_{lora_int_id}",
+                )
+
+            hashes.append([])
+            prompts = [prefix + prompt for prompt in sample_prompts]
+            for seq_id, prompt in enumerate(prompts):
+                hashes[-1].append([])
+                prompt_token_ids = tokenizer.encode(prompt)
+                seq = Sequence(seq_id,
+                               inputs=token_inputs(prompt_token_ids,
+                                                   prompt=prompt),
+                               block_size=block_size,
+                               eos_token_id=tokenizer.eos_token_id,
+                               lora_request=lora_request)
+
+                # Only whole blocks are hashed; a trailing partial block is
+                # left out by the integer division.
+                num_blocks = len(prompt_token_ids) // block_size
+                for idx in range(num_blocks):
+                    hashes[-1][-1].append(seq.hash_of_block(idx))
+
+    # hashes[0] and hashes[1] are the first two (prefix, LoRA id)
+    # combinations: they differ in prefix when a single LoRA id is given and
+    # in LoRA id otherwise. Either way their hashes must differ everywhere.
+    for hash0, hash1 in zip(flatten_2d(hashes[0]), flatten_2d(hashes[1])):
+        assert hash0 != hash1
+
+    # Check that hashes of different prompts made with the same prefix are the
+    # same until the hashes that contain the prompt.
+    for hash_pref in hashes:
+        same_hashes = [tuple(h[:-1]) for h in hash_pref]
+        different_hashes = [h[-1] for h in hash_pref]
+        assert len(set(same_hashes)) == 1
+        assert len(set(different_hashes)) == len(different_hashes)
--- a/tests/test_sampling_params.py
+++ b/tests/test_sampling_params.py
@ -0,0 +1,84 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for the SamplingParams class.
+"""
+
+import pytest
+
+from vllm import SamplingParams
+from vllm.config import ModelConfig
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+
+MODEL_NAME = "Qwen/Qwen1.5-7B"
+
+
+def test_max_tokens_none():
+    """Building SamplingParams with max_tokens=None must not raise."""
+    SamplingParams(max_tokens=None, temperature=0.01, top_p=0.1)
+
+
+@pytest.fixture(scope="module")
+def model_config():
+    """Module-scoped ModelConfig for MODEL_NAME (seed 0, dtype float16)."""
+    config = ModelConfig(MODEL_NAME, dtype="float16", seed=0)
+    return config
+
+
+@pytest.fixture(scope="module")
+def default_max_tokens():
+    """Default max_tokens passed to to_sampling_params by these tests."""
+    max_tokens = 4096
+    return max_tokens
+
+
+def test_sampling_params_from_request_with_no_guided_decoding_backend(
+        model_config, default_max_tokens):
+    """A request that names no guided decoding backend yields backend None."""
+    # guided_decoding_backend is deliberately left out of the payload
+    payload = {
+        'messages': [{
+            'role': 'user',
+            'content': 'Hello'
+        }],
+        'model': MODEL_NAME,
+        'response_format': {
+            'type': 'json_object',
+        },
+    }
+    request = ChatCompletionRequest.model_validate(payload)
+
+    pattern = model_config.logits_processor_pattern
+    sampling_params = request.to_sampling_params(default_max_tokens, pattern)
+
+    # No backend is expected here; the default guided_decoding_backend at
+    # engine level will be used.
+    backend = sampling_params.guided_decoding.backend
+    assert backend is None
+
+
+@pytest.mark.parametrize("request_level_guided_decoding_backend,expected",
+                         [("xgrammar", "xgrammar"), ("guidance", "guidance"),
+                          ("outlines", "outlines")])
+def test_sampling_params_from_request_with_guided_decoding_backend(
+        request_level_guided_decoding_backend: str, expected: str,
+        model_config, default_max_tokens):
+    """A backend named in the request is carried into the sampling params."""
+    payload = {
+        'messages': [{
+            'role': 'user',
+            'content': 'Hello'
+        }],
+        'model': MODEL_NAME,
+        'response_format': {
+            'type': 'json_object',
+        },
+        'guided_decoding_backend': request_level_guided_decoding_backend,
+    }
+    request = ChatCompletionRequest.model_validate(payload)
+
+    pattern = model_config.logits_processor_pattern
+    sampling_params = request.to_sampling_params(default_max_tokens, pattern)
+
+    # the request-level backend must show up on the resulting sampling_params
+    backend = sampling_params.guided_decoding.backend
+    assert backend == expected
--- a/tests/tool_use/test_tool_choice_required.py
+++ b/tests/tool_use/test_tool_choice_required.py
@ -68,7 +68,7 @@ EXAMPLE_TOOLS = [
 def _compile_and_check(tools: list[ChatCompletionToolsParam], sample_output,
                       should_match: bool):
    self = MagicMock(tool_choice="required", tools=tools)
-    schema = ChatCompletionRequest._get_json_schema_from_tool(self)
+    schema = ChatCompletionRequest._get_guided_json_from_tool(self)
    assert isinstance(schema, dict)

    # use build_regex_from_schema used in JSONLogitsProcessor to create Guide
@ -218,7 +218,7 @@ VALID_TOOLS = [t[0] for t in VALID_TOOL_OUTPUTS]
                }
            }, {}], False),
    ])
-def test_structured_outputs_json(sample_output, should_match):
+def test_guided_json(sample_output, should_match):
    _compile_and_check(tools=TypeAdapter(
        list[ChatCompletionToolsParam]).validate_python(EXAMPLE_TOOLS),
                       sample_output=sample_output,
@ -273,9 +273,8 @@ def update_parameters_empty_dict(
@pytest.mark.parametrize(
    "update_parameters",
    [update_parameters_none, update_parameters_empty_dict])
-def test_structured_outputs_json_without_parameters(sample_output,
-                                                    should_match,
-                                                    update_parameters):
+def test_guided_json_without_parameters(sample_output, should_match,
+                                        update_parameters):
    updated_tools = [deepcopy(EXAMPLE_TOOLS[0])]
    tools = TypeAdapter(
        list[ChatCompletionToolsParam]).validate_python(updated_tools)
@ -335,4 +334,4 @@ def test_streaming_output_valid(output, empty_params, delta_len):
            combined_messages += message.tool_calls[0].function.arguments
    combined_messages += "}]"
    assert json.loads(combined_messages) == output
-    assert json.dumps(json.loads(combined_messages)) == output_json
+    assert json.dumps(json.loads(combined_messages)) == output_json
--- a/tests/tracing/init.py
+++ b/tests/tracing/init.py
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@ -0,0 +1,237 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa
+# type: ignore
+from __future__ import annotations
+
+import threading
+from collections.abc import Iterable
+from concurrent import futures
+from typing import Callable, Generator, Literal
+
+import grpc
+import pytest
+from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import (
+    ExportTraceServiceResponse)
+from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import (
+    TraceServiceServicer, add_TraceServiceServicer_to_server)
+from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue
+from opentelemetry.sdk.environment_variables import (
+    OTEL_EXPORTER_OTLP_TRACES_INSECURE)
+
+from vllm import LLM, SamplingParams
+from vllm.tracing import SpanAttributes
+
+
+@pytest.fixture(autouse=True, scope="function")
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
+    """Run every test of this V0-only module with VLLM_USE_V1=0."""
+    with monkeypatch.context() as patch_ctx:
+        patch_ctx.setenv("VLLM_USE_V1", "0")
+        yield
+
+
+# Address the fake trace service binds to and the LLM exports traces to.
+FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
+
+# Names of the AnyValue fields that decode_value knows how to decode.
+FieldName = Literal[
+    'bool_value',
+    'string_value',
+    'int_value',
+    'double_value',
+    'array_value',
+]
+
+
+def decode_value(value: AnyValue):
+    """Return the Python value stored in whichever field of ``value`` is set.
+
+    Fields are probed in the order bool, string, int, double, array; an
+    array is decoded recursively into a list. Raises ValueError when none
+    of these fields is set.
+    """
+    scalar_fields: tuple[FieldName, ...] = (
+        "bool_value",
+        "string_value",
+        "int_value",
+        "double_value",
+    )
+    for name in scalar_fields:
+        if value.HasField(name):
+            return getattr(value, name)
+    if value.HasField("array_value"):
+        return [decode_value(item) for item in value.array_value.values]
+    raise ValueError(f"Couldn't decode value: {value}")
+
+
+def decode_attributes(attributes: Iterable[KeyValue]):
+    """Build a dict mapping each entry's key to its decoded value."""
+    decoded = {}
+    for entry in attributes:
+        decoded[entry.key] = decode_value(entry.value)
+    return decoded
+
+
+class FakeTraceService(TraceServiceServicer):
+    """Trace servicer for tests that keeps the last export request."""
+
+    def __init__(self):
+        # Set by Export once a request has been stored.
+        self.evt = threading.Event()
+        # Most recent export request; None until Export is called.
+        self.request = None
+
+    def Export(self, request, context):
+        """Store the request, signal the event, return an empty response."""
+        self.request = request
+        self.evt.set()
+        response = ExportTraceServiceResponse()
+        return response
+
+
+@pytest.fixture
+def trace_service() -> Generator[FakeTraceService, None, None]:
+    """Start a fake gRPC trace service for a test and stop it afterwards."""
+    fake_service = FakeTraceService()
+    executor = futures.ThreadPoolExecutor(max_workers=1)
+    grpc_server = grpc.server(executor)
+    add_TraceServiceServicer_to_server(fake_service, grpc_server)
+    grpc_server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS)
+    grpc_server.start()
+
+    yield fake_service
+
+    grpc_server.stop(None)
+
+
+def test_traces(
+    monkeypatch: pytest.MonkeyPatch,
+    trace_service: FakeTraceService,
+):
+    """Generate once with tracing enabled and check the exported span.
+
+    The LLM is pointed at the fake trace service; the test waits for one
+    export request and compares the span attributes with the sampling
+    parameters and the metrics of the request output.
+    """
+    with monkeypatch.context() as m:
+        m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
+
+        sampling_params = SamplingParams(
+            temperature=0.01,
+            top_p=0.1,
+            max_tokens=256,
+        )
+        model = "facebook/opt-125m"
+        llm = LLM(
+            model=model,
+            otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
+        )
+        prompts = ["This is a short prompt"]
+        outputs = llm.generate(prompts, sampling_params=sampling_params)
+
+        # Block until the fake service has received an export request.
+        timeout = 5
+        if not trace_service.evt.wait(timeout):
+            raise TimeoutError(
+                f"The fake trace service didn't receive a trace within "
+                f"the {timeout} seconds timeout")
+
+        # Exactly one resource span / scope span / span is expected.
+        request = trace_service.request
+        assert len(request.resource_spans) == 1, (
+            f"Expected 1 resource span, "
+            f"but got {len(request.resource_spans)}")
+        assert len(request.resource_spans[0].scope_spans) == 1, (
+            f"Expected 1 scope span, "
+            f"but got {len(request.resource_spans[0].scope_spans)}")
+        assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
+            f"Expected 1 span, "
+            f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
+
+        # Request-level attributes must mirror the sampling parameters.
+        attributes = decode_attributes(
+            request.resource_spans[0].scope_spans[0].spans[0].attributes)
+        assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+                              ) == sampling_params.temperature
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
+                              ) == sampling_params.max_tokens
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+        # Token usage attributes must match the generated output.
+        assert attributes.get(
+            SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
+                outputs[0].prompt_token_ids)
+        completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
+        assert attributes.get(
+            SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
+        # Latency attributes are derived from the request metrics.
+        metrics = outputs[0].metrics
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
+                              ) == metrics.time_in_queue
+        ttft = metrics.first_token_time - metrics.arrival_time
+        assert attributes.get(
+            SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+        e2e_time = metrics.finished_time - metrics.arrival_time
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
+        assert metrics.scheduler_time > 0
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
+                              ) == metrics.scheduler_time
+        # Model forward and model execute times should be None, since
+        # detailed traces are not enabled.
+        assert metrics.model_forward_time is None
+        assert metrics.model_execute_time is None
+
+
+def test_traces_with_detailed_steps(
+    monkeypatch: pytest.MonkeyPatch,
+    trace_service: FakeTraceService,
+):
+    """Same checks as the basic trace test, with detailed traces collected.
+
+    The LLM is created with ``collect_detailed_traces=["all"]``; in addition
+    to the request, usage and latency attributes, the model forward and
+    model execute timings must be set and exported.
+    """
+    with monkeypatch.context() as m:
+        m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
+
+        sampling_params = SamplingParams(
+            temperature=0.01,
+            top_p=0.1,
+            max_tokens=256,
+        )
+        model = "facebook/opt-125m"
+        llm = LLM(
+            model=model,
+            otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
+            collect_detailed_traces=["all"],
+        )
+        prompts = ["This is a short prompt"]
+        outputs = llm.generate(prompts, sampling_params=sampling_params)
+
+        # Block until the fake service has received an export request.
+        timeout = 5
+        if not trace_service.evt.wait(timeout):
+            raise TimeoutError(
+                f"The fake trace service didn't receive a trace within "
+                f"the {timeout} seconds timeout")
+
+        # Exactly one resource span / scope span / span is expected.
+        request = trace_service.request
+        assert len(request.resource_spans) == 1, (
+            f"Expected 1 resource span, "
+            f"but got {len(request.resource_spans)}")
+        assert len(request.resource_spans[0].scope_spans) == 1, (
+            f"Expected 1 scope span, "
+            f"but got {len(request.resource_spans[0].scope_spans)}")
+        assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
+            f"Expected 1 span, "
+            f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
+
+        # Request-level attributes must mirror the sampling parameters.
+        attributes = decode_attributes(
+            request.resource_spans[0].scope_spans[0].spans[0].attributes)
+        assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+                              ) == sampling_params.temperature
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
+                              ) == sampling_params.max_tokens
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+        # Token usage attributes must match the generated output.
+        assert attributes.get(
+            SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
+                outputs[0].prompt_token_ids)
+        completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
+        assert attributes.get(
+            SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
+        # Latency attributes are derived from the request metrics.
+        metrics = outputs[0].metrics
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
+                              ) == metrics.time_in_queue
+        ttft = metrics.first_token_time - metrics.arrival_time
+        assert attributes.get(
+            SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+        e2e_time = metrics.finished_time - metrics.arrival_time
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
+        assert metrics.scheduler_time > 0
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
+                              ) == metrics.scheduler_time
+        # With detailed traces enabled both model timings must be populated.
+        assert metrics.model_forward_time > 0
+        # The exported forward time is the metric divided by 1000.
+        # NOTE(review): presumably a ms -> s conversion; confirm the units.
+        assert attributes.get(
+            SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD
+        ) == pytest.approx(metrics.model_forward_time / 1000)
+        assert metrics.model_execute_time > 0
+        assert attributes.get(
+            SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
+        ) == metrics.model_execute_time
+        # Forward time is compared against 1000x the execute time, using the
+        # same factor of 1000 as above.
+        assert metrics.model_forward_time < 1000 * metrics.model_execute_time
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@ -10,7 +10,7 @@ from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
                         SchedulerConfig, SpeculativeConfig, VllmConfig)
 from vllm.multimodal.inputs import (MultiModalFeatureSpec,
                                    MultiModalKwargsItem, PlaceholderRange)
-from vllm.sampling_params import SamplingParams, StructuredOutputsParams
+from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
 from vllm.v1.core.sched.scheduler import Scheduler
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
@ -1796,11 +1796,11 @@ def test_schedule_skip_tokenizer_init():

 def test_schedule_skip_tokenizer_init_structured_output_request():
    scheduler = create_scheduler(skip_tokenizer_init=True)
-    structured_outputs_params = StructuredOutputsParams(regex="[0-9]+")
+    guided_params = GuidedDecodingParams(regex="[0-9]+")
    sampling_params = SamplingParams(
        ignore_eos=False,
        max_tokens=16,
-        structured_outputs=structured_outputs_params,
+        guided_decoding=guided_params,
    )
    request = Request(
        request_id="0",
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@ -18,7 +18,7 @@ from vllm.platforms import current_platform
 from vllm.sampling_params import RequestOutputKind
 from vllm.utils import set_default_torch_num_threads
 from vllm.v1.engine.async_llm import AsyncLLM
-from vllm.v1.metrics.loggers import AggregatedStatLogger, LoggingStatLogger
+from vllm.v1.metrics.loggers import LoggingStatLogger

 if not current_platform.is_cuda():
    pytest.skip(reason="V1 currently only supported on CUDA.",
@ -389,15 +389,6 @@ class MockLoggingStatLogger(LoggingStatLogger):
        self.log = MagicMock()


-class MockAggregatedStatLogger(AggregatedStatLogger):
-
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 engine_indexes: Optional[list[int]] = None):
-        super().__init__(vllm_config, engine_indexes)
-        self.log = MagicMock()
-
-
@pytest.mark.asyncio
 async def test_customize_loggers(monkeypatch):
    """Test that we can customize the loggers.
@ -424,35 +415,6 @@ async def test_customize_loggers(monkeypatch):
        stat_loggers[0][0].log.assert_called_once()


-@pytest.mark.asyncio
-async def test_customize_aggregated_loggers(monkeypatch):
-    """Test that we can customize the aggregated loggers.
-    If a customized logger is provided at the init, it should
-    be added to the default loggers.
-    """
-
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with set_default_torch_num_threads(1):
-            engine = AsyncLLM.from_engine_args(
-                TEXT_ENGINE_ARGS,
-                stat_loggers=[MockLoggingStatLogger, MockAggregatedStatLogger],
-            )
-        after.callback(engine.shutdown)
-
-        await engine.do_log_stats()
-
-        stat_loggers = engine.logger_manager.per_engine_logger_dict
-        assert len(stat_loggers) == 1
-        assert len(
-            stat_loggers[0]) == 2  # LoggingStatLogger + MockLoggingStatLogger
-        aggregated_loggers = engine.logger_manager.aggregated_loggers
-        assert len(aggregated_loggers) == 1
-        aggregated_loggers[0].log.assert_called_once()
-        stat_loggers[0][0].log.assert_called_once()
-
-
@pytest.mark.asyncio(scope="module")
 async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m, ExitStack() as after:
--- a/tests/v1/engine/test_llm_engine.py
+++ b/tests/v1/engine/test_llm_engine.py
@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Optional
 import pytest

 from vllm import LLM
-from vllm.sampling_params import SamplingParams, StructuredOutputsParams
+from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector

 if TYPE_CHECKING:
@ -97,7 +97,7 @@ def _get_test_sampling_params(
            top_p=0.95,
            n=n,
            seed=seed,
-            structured_outputs=StructuredOutputsParams(
+            guided_decoding=GuidedDecodingParams(
                regex="[0-9]+") if structured_outputs else None,
        ) for n in n_list
    ], n_list
--- a/tests/v1/entrypoints/conftest.py
+++ b/tests/v1/entrypoints/conftest.py
@ -151,7 +151,7 @@ def sample_definition_json_schema():


@pytest.fixture
-def sample_structured_outputs_choices():
+def sample_guided_choice():
    return [
        "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript",
        "Ruby", "Swift", "Kotlin"
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@ -15,13 +15,12 @@ import torch
 from pydantic import BaseModel

 from tests.reasoning.utils import run_reasoning_extraction
-from vllm.config import StructuredOutputsConfig
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
-from vllm.sampling_params import SamplingParams, StructuredOutputsParams
+from vllm.sampling_params import GuidedDecodingParams, SamplingParams

 if TYPE_CHECKING:
    from vllm.config import TokenizerMode
@ -91,7 +90,7 @@ def _load_json(s: str, backend: str) -> str:

@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize(
-    "model_name, backend, tokenizer_mode, speculative_config",
+    "model_name, guided_decoding_backend, tokenizer_mode, speculative_config",
    PARAMS_MODELS_BACKENDS_TOKENIZER_MODE)
 def test_structured_output(
    monkeypatch: pytest.MonkeyPatch,
@ -100,8 +99,8 @@ def test_structured_output(
    sample_sql_ebnf: str,
    sample_sql_lark: str,
    sample_regex: str,
-    sample_structured_outputs_choices: str,
-    backend: str,
+    sample_guided_choice: str,
+    guided_decoding_backend: str,
    tokenizer_mode: str,
    model_name: str,
    speculative_config: dict[str, Any],
@ -116,15 +115,16 @@ def test_structured_output(
    enforce_eager = bool(not current_platform.is_tpu())
    # Use a single LLM instance for several scenarios to
    # speed up the test suite.
-    llm = LLM(model=model_name,
-              enforce_eager=enforce_eager,
-              max_model_len=1024,
-              structured_outputs_config=dict(backend=backend,
-                                             disable_any_whitespace=backend
-                                             in {"xgrammar", "guidance"}),
-              seed=120,
-              tokenizer_mode=tokenizer_mode,
-              speculative_config=speculative_config)
+    llm = LLM(
+        model=model_name,
+        enforce_eager=enforce_eager,
+        max_model_len=1024,
+        guided_decoding_backend=guided_decoding_backend,
+        guided_decoding_disable_any_whitespace=(guided_decoding_backend
+                                                in {"xgrammar", "guidance"}),
+        seed=120,
+        tokenizer_mode=tokenizer_mode,
+        speculative_config=speculative_config)

    #
    # Test 1: Generate JSON output based on a provided schema
@ -132,7 +132,7 @@ def test_structured_output(
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=4096,
-        structured_outputs=StructuredOutputsParams(json=sample_json_schema))
+        guided_decoding=GuidedDecodingParams(json=sample_json_schema))

    prompt = ("Give an example JSON for an employee profile that fits this "
              "schema. Make the response as short as possible. Schema: "
@ -152,7 +152,7 @@ def test_structured_output(

        generated_text = output.outputs[0].text
        assert generated_text is not None
-        if backend != 'lm-format-enforcer':
+        if guided_decoding_backend != 'lm-format-enforcer':
            assert "\n" not in generated_text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        output_json = json.loads(generated_text)
@ -161,12 +161,12 @@ def test_structured_output(
    #
    # Test 2: Generate JSON object without a schema
    #
-    if backend != "outlines":
+    if guided_decoding_backend != "outlines":
        sampling_params = SamplingParams(
            temperature=1.0,
            max_tokens=4096,
            n=2,
-            structured_outputs=StructuredOutputsParams(json_object=True))
+            guided_decoding=GuidedDecodingParams(json_object=True))

        outputs = llm.generate(prompts=(
            "Generate a JSON object with curly braces for a person with "
@ -195,9 +195,8 @@ def test_structured_output(
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=4096,
-        structured_outputs=StructuredOutputsParams(
-            json=unsupported_json_schema))
-    if backend.startswith("xgrammar"):
+        guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))
+    if guided_decoding_backend.startswith("xgrammar"):
        with pytest.raises(ValueError,
                           match="The provided JSON schema contains features "
                           "not supported by xgrammar."):
@ -231,7 +230,7 @@ def test_structured_output(
            parsed_json = json.loads(generated_text)
            assert isinstance(parsed_json, dict)

-    if backend not in ["outlines", "lm-format-enforcer"]:
+    if guided_decoding_backend not in ["outlines", "lm-format-enforcer"]:
        #
        # Test 4: Generate SQL statement using EBNF grammar
        #
@ -239,8 +238,7 @@ def test_structured_output(
            temperature=0.8,
            top_p=0.95,
            max_tokens=1000,
-            structured_outputs=StructuredOutputsParams(
-                grammar=sample_sql_ebnf))
+            guided_decoding=GuidedDecodingParams(grammar=sample_sql_ebnf))
        outputs = llm.generate(
            ("Generate a sql statement that selects col_1 from "
             "table_1 where it is equal to 1. Make the response as short as "
@ -273,8 +271,7 @@ def test_structured_output(
            temperature=0.8,
            top_p=0.95,
            max_tokens=1000,
-            structured_outputs=StructuredOutputsParams(
-                grammar=sample_sql_lark))
+            guided_decoding=GuidedDecodingParams(grammar=sample_sql_lark))
        outputs = llm.generate(
            ("Generate a sql statement that selects col_1 from "
             "table_1 where it is equal to 1. Make the response as short as "
@ -312,8 +309,7 @@ def test_structured_output(
            temperature=0.8,
            top_p=0.95,
            max_tokens=1000,
-            structured_outputs=StructuredOutputsParams(
-                grammar="not a grammar"))
+            guided_decoding=GuidedDecodingParams(grammar="not a grammar"))
        with pytest.raises(ValueError, match="Failed to convert the grammar "):
            llm.generate(
                ("Generate a sql statement that selects col_1 from "
@ -329,7 +325,7 @@ def test_structured_output(
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
-        structured_outputs=StructuredOutputsParams(regex=sample_regex))
+        guided_decoding=GuidedDecodingParams(regex=sample_regex))

    prompt = (f"Give an example IPv4 address with this regex: {sample_regex}. "
              f"Make the response as short as possible.")
@ -356,8 +352,7 @@ def test_structured_output(
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
-        structured_outputs=StructuredOutputsParams(
-            choice=sample_structured_outputs_choices))
+        guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))

    outputs = llm.generate(
        ("The best language for type-safe systems programming is "
@ -373,7 +368,7 @@ def test_structured_output(
        generated_text = output.outputs[0].text
        print(generated_text)
        assert generated_text is not None
-        assert generated_text in sample_structured_outputs_choices
+        assert generated_text in sample_guided_choice
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    #
@ -383,7 +378,7 @@ def test_structured_output(
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=1000,
-        structured_outputs=StructuredOutputsParams(json=json_schema))
+        guided_decoding=GuidedDecodingParams(json=json_schema))

    outputs = llm.generate(
        ("Generate a JSON with the brand, model and car_type of the most "
@ -427,7 +422,7 @@ def test_structured_output(
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=4096,
-        structured_outputs=StructuredOutputsParams(json=json_schema))
+        guided_decoding=GuidedDecodingParams(json=json_schema))

    outputs = llm.generate(
        ("Generate a description of a frog using 50 characters. "
@ -449,7 +444,7 @@ def test_structured_output(
        output_json = json.loads(generated_text)
        jsonschema.validate(instance=output_json, schema=json_schema)

-    if backend not in ["outlines", "lm-format-enforcer"]:
+    if guided_decoding_backend not in ["outlines", "lm-format-enforcer"]:
        #
        # Test 11: Generate structured output using structural_tag format
        #
@ -475,7 +470,7 @@ def test_structured_output(
        sampling_params = SamplingParams(
            temperature=0.0,
            max_tokens=4096,
-            structured_outputs=StructuredOutputsParams(
+            guided_decoding=GuidedDecodingParams(
                structural_tag=json.dumps(structural_tag_config)))

        prompt = """
@ -552,7 +547,7 @@ Make the response as short as possible.

@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize(
-    "model_name, backend, tokenizer_mode, reasoning_parser, speculative_config",  # noqa: E501
+    "model_name, guided_decoding_backend, tokenizer_mode, reasoning_parser, speculative_config",  # noqa: E501
    [
        ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "xgrammar", "auto",
         "deepseek_r1", NGRAM_SPEC_CONFIG),
@ -561,7 +556,7 @@ Make the response as short as possible.
 )
 def test_structured_output_with_reasoning_matrices(
    monkeypatch: pytest.MonkeyPatch,
-    backend: str,
+    guided_decoding_backend: str,
    tokenizer_mode: TokenizerMode,
    reasoning_parser: str,
    model_name: str,
@ -581,11 +576,10 @@ def test_structured_output_with_reasoning_matrices(
        enforce_eager=bool(not current_platform.is_tpu()),
        max_model_len=1024,
        max_num_seqs=16,
-        structured_outputs_config=dict(backend=backend,
-                                       disable_any_whitespace=backend
-                                       in {"xgrammar", "guidance"},
-                                       reasoning_parser=reasoning_parser),
+        guided_decoding_backend=guided_decoding_backend,
+        guided_decoding_disable_any_whitespace=True,
        tokenizer_mode=tokenizer_mode,
+        reasoning_parser=reasoning_parser,
        speculative_config=speculative_config,
    )
    tokenizer = llm.get_tokenizer()
@ -609,7 +603,7 @@ def test_structured_output_with_reasoning_matrices(
    sampling_params = SamplingParams(
        temperature=0.1,
        max_tokens=8192,
-        structured_outputs=StructuredOutputsParams(json=reasoning_schema),
+        guided_decoding=GuidedDecodingParams(json=reasoning_schema),
    )
    outputs = llm.generate(
        [reasoning_prompt],
@ -646,14 +640,13 @@ def test_structured_output_auto_mode(

    llm = LLM(model=model_name,
              max_model_len=1024,
-              structured_outputs_config=dict(backend="auto"),
+              guided_decoding_backend="auto",
              tokenizer_mode=tokenizer_mode)

    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=1000,
-        structured_outputs=StructuredOutputsParams(
-            json=unsupported_json_schema))
+        guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))

    prompts = (
        "Give an example JSON object for a grade "
@ -688,10 +681,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):

    llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
              max_model_len=1024,
-              structured_outputs_config=dict(
-                  backend="guidance",
-                  disable_any_whitespace=True,
-                  disable_additional_properties=True))
+              guided_decoding_backend="guidance",
+              guided_decoding_disable_any_whitespace=True,
+              guided_decoding_disable_additional_properties=True)

    schema = {
        'type': 'object',
@ -717,15 +709,14 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
        "<|im_end|>\n<|im_start|>assistant\n")

    def generate_with_backend(backend):
-        structured_outputs_params = StructuredOutputsParams(
+        guided_params = GuidedDecodingParams(
            json=schema,
            backend=backend,
            disable_any_whitespace=True,
            disable_additional_properties=True)
-        sampling_params = SamplingParams(
-            temperature=0,
-            max_tokens=256,
-            structured_outputs=structured_outputs_params)
+        sampling_params = SamplingParams(temperature=0,
+                                         max_tokens=256,
+                                         guided_decoding=guided_params)

        outputs = llm.generate(prompt, sampling_params=sampling_params)
        assert outputs is not None
@ -745,11 +736,12 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
    assert "a6" not in generated


-@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
-def test_structured_output_batched_with_non_structured_outputs_requests(
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["guidance", "xgrammar", "outlines"])
+def test_structured_output_batched_with_non_guided_requests(
    monkeypatch: pytest.MonkeyPatch,
    sample_json_schema: dict[str, Any],
-    backend: str,
+    guided_decoding_backend: str,
 ):
    monkeypatch.setenv("VLLM_USE_V1", "1")

@ -761,25 +753,24 @@ def test_structured_output_batched_with_non_structured_outputs_requests(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        enforce_eager=enforce_eager,
        max_model_len=1024,
-        structured_outputs_config=StructuredOutputsConfig(
-            backend=backend,
-            disable_any_whitespace=backend in {"xgrammar", "guidance"},
-        ),
+        guided_decoding_backend=guided_decoding_backend,
+        guided_decoding_disable_any_whitespace=(guided_decoding_backend
+                                                in {"xgrammar", "guidance"}),
    )

-    structured_outputs_prompt = (
+    guided_prompt = (
        "Give an example JSON for an employee profile that fits this "
        "schema. Make the response as short as possible. Schema: "
        f"{sample_json_schema}")

-    non_structured_outputs_prompt = "The diameter of the Earth in kilometers is "
+    non_guided_prompt = "The diameter of the Earth in kilometers is "

-    prompts = [structured_outputs_prompt, non_structured_outputs_prompt]
+    prompts = [guided_prompt, non_guided_prompt]
    sampling_params = [
-        SamplingParams(temperature=1.0,
-                       max_tokens=400,
-                       structured_outputs=StructuredOutputsParams(
-                           json=sample_json_schema)),
+        SamplingParams(
+            temperature=1.0,
+            max_tokens=400,
+            guided_decoding=GuidedDecodingParams(json=sample_json_schema)),
        # No max tokens, temp=0 to assert on contents
        SamplingParams(
            seed=42,
@ -810,16 +801,16 @@ def test_structured_output_batched_with_non_structured_outputs_requests(
        print(f"Prompt:\n{prompt!r}\nGenerated text:\n{generated_text!r}")

        if index == 0:
-            # First prompt is structured outputs, expect valid JSON
+            # First prompt is guided, expect valid JSON
            assert "\n" not in generated_text
            output_json = json.loads(generated_text)
            jsonschema.validate(instance=output_json,
                                schema=sample_json_schema)
        else:
-            # Second prompt is not structured outputs, expect valid output
+            # Second prompt is not guided, expect valid output
            # Cannot assert on exact output, but we can expect it to be factual
            assert "12,742" in generated_text

-            # non-structured outputs requests should not return a valid JSON here
+            # non-guided requests should not return a valid JSON here
            with pytest.raises(ValueError):
                output_json = json.loads(generated_text)
--- a/tests/v1/entrypoints/openai/test_chat_completion.py
+++ b/tests/v1/entrypoints/openai/test_chat_completion.py
@ -77,9 +77,7 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI,
                "role": "user",
                "content": prompt,
            }],
-            extra_body={"structured_outputs": {
-                "json": invalid_json_schema
-            }},
+            extra_body={"guided_json": invalid_json_schema},
        )


@ -101,9 +99,7 @@ async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str):
                "content": prompt,
            }],
            extra_body={
-                "structured_outputs": {
-                    "regex": r"[.*"
-                },
+                "guided_regex": r"[.*",
                "stop": ["\n"]
            },
        )
@ -138,9 +134,5 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
                "role": "user",
                "content": prompt,
            }],
-            extra_body={
-                "structured_outputs": {
-                    "grammar": invalid_simplified_sql_grammar
-                }
-            },
+            extra_body={"guided_grammar": invalid_simplified_sql_grammar},
        )
--- a/tests/v1/entrypoints/openai/test_completion.py
+++ b/tests/v1/entrypoints/openai/test_completion.py
@ -627,9 +627,7 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI,
        await client.completions.create(
            model=model_name,
            prompt=prompt,
-            extra_body={"structured_outputs": {
-                "json": invalid_json_schema
-            }},
+            extra_body={"guided_json": invalid_json_schema},
        )


@ -648,9 +646,7 @@ async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str):
            model=model_name,
            prompt=prompt,
            extra_body={
-                "structured_outputs": {
-                    "regex": r"[.*"
-                },
+                "guided_regex": r"[.*",
                "stop": ["\n"]
            },
        )
@ -682,11 +678,7 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
        await client.completions.create(
            model=model_name,
            prompt=prompt,
-            extra_body={
-                "structured_outputs": {
-                    "grammar": invalid_simplified_sql_grammar
-                }
-            },
+            extra_body={"guided_grammar": invalid_simplified_sql_grammar},
        )


--- a/tests/v1/offloading/test_worker.py
+++ b/tests/v1/offloading/test_worker.py
@ -1,152 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from vllm.v1.offloading.abstract import LoadStoreSpec
-from vllm.v1.offloading.worker.worker import (OffloadingHandler,
-                                              OffloadingWorker, TransferResult,
-                                              TransferSpec)
-
-
-class LoadStoreSpec1(LoadStoreSpec):
-
-    def __init__(self,
-                 submit_success: bool = True,
-                 async_success: bool = True,
-                 exception: bool = False):
-        self.finished = False
-        self.submit_success = submit_success
-        self.async_success = async_success
-        self.exception = exception
-
-    @staticmethod
-    def medium() -> str:
-        return "1"
-
-    def __repr__(self):
-        return f"{self.medium()}: {id(self)}"
-
-
-class LoadStoreSpec2(LoadStoreSpec):
-
-    @staticmethod
-    def medium() -> str:
-        return "2"
-
-    def __repr__(self):
-        return f"{self.medium()}: {id(self)}"
-
-
-class OffloadingHandler1To2(OffloadingHandler):
-
-    def __init__(self):
-        self.transfers: dict[int, LoadStoreSpec1] = {}
-
-    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
-        src, dst = spec
-        assert isinstance(src, LoadStoreSpec1)
-        assert isinstance(dst, LoadStoreSpec2)
-
-        if src.exception:
-            raise Exception("An expected exception. Don't worry!")
-        if not src.submit_success:
-            return False
-
-        self.transfers[job_id] = src
-        return True
-
-    def get_finished(self) -> list[TransferResult]:
-        finished = []
-        for job_id, spec in list(self.transfers.items()):
-            if spec.finished:
-                finished.append((job_id, spec.async_success))
-                del self.transfers[job_id]
-        return finished
-
-
-class OffloadingHandler2To1(OffloadingHandler):
-
-    def __init__(self):
-        self.transfers: dict[int, LoadStoreSpec1] = {}
-
-    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
-        src, dst = spec
-        assert isinstance(src, LoadStoreSpec2)
-        assert isinstance(dst, LoadStoreSpec1)
-
-        self.transfers[job_id] = dst
-        return True
-
-    def get_finished(self) -> list[TransferResult]:
-        finished = []
-        for job_id, spec in list(self.transfers.items()):
-            if spec.finished:
-                finished.append((job_id, spec.async_success))
-                del self.transfers[job_id]
-        return finished
-
-
-def test_offloading_worker():
-    """
-    Tests OffloadingWorker with 2 handlers.
-    One handler performs 1->2 transfers, and the other handles 2->1.
-    """
-    worker = OffloadingWorker()
-    handler1to2 = OffloadingHandler1To2()
-    handler2to1 = OffloadingHandler2To1()
-    worker.register_handler(LoadStoreSpec1, LoadStoreSpec2, handler1to2)
-    worker.register_handler(LoadStoreSpec2, LoadStoreSpec1, handler2to1)
-
-    # 1st transfer 1->2 (exception)
-    src1 = LoadStoreSpec1(exception=True)
-    dst1 = LoadStoreSpec2()
-    assert not worker.transfer_async(1, (src1, dst1))
-
-    # 2ed transfer 1->2 (failure to submit)
-    src2 = LoadStoreSpec1(submit_success=False)
-    dst2 = LoadStoreSpec2()
-    assert not worker.transfer_async(2, (src2, dst2))
-
-    # 3rd transfer 1->2 (failure)
-    src3 = LoadStoreSpec1(async_success=False)
-    dst3 = LoadStoreSpec2()
-    assert worker.transfer_async(3, (src3, dst3))
-
-    # 4th transfer 1->2 (success)
-    src4 = LoadStoreSpec1()
-    dst4 = LoadStoreSpec2()
-    worker.transfer_async(4, (src4, dst4))
-    assert set(handler1to2.transfers.keys()) == {3, 4}
-
-    # 5th transfer 2->1
-    src5 = LoadStoreSpec2()
-    dst5 = LoadStoreSpec1()
-    worker.transfer_async(5, (src5, dst5))
-    assert set(handler2to1.transfers.keys()) == {5}
-
-    # no transfer completed yet
-    assert worker.get_finished() == []
-
-    # complete 3rd, 4th
-    src3.finished = True
-    src4.finished = True
-
-    # 6th transfer 1->2
-    src6 = LoadStoreSpec1()
-    dst6 = LoadStoreSpec2()
-    worker.transfer_async(6, (src6, dst6))
-
-    # 7th transfer 2->1
-    src7 = LoadStoreSpec2()
-    dst7 = LoadStoreSpec1()
-    worker.transfer_async(7, (src7, dst7))
-
-    # 6th and 7th transfers started
-    assert 6 in handler1to2.transfers
-    assert 7 in handler2to1.transfers
-
-    # verify result of 3rd and 4th transfers
-    assert (sorted(worker.get_finished()) == [(3, False), (4, True)])
-
-    # complete 6th and 7th transfers
-    src6.finished = True
-    dst7.finished = True
-    assert (sorted(worker.get_finished()) == [(6, True), (7, True)])
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@ -19,8 +19,6 @@ from vllm.config.load import LoadConfig
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.platforms import current_platform
 from vllm.v1.spec_decode.eagle import EagleProposer
-from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
-from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch

 model_dir = "meta-llama/Llama-3.1-8B-Instruct"
 eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
@ -66,86 +64,6 @@ def _create_proposer(
                         device=current_platform.device_type)


-def test_prepare_next_token_ids():
-    """
-    Test for prepare_next_token_ids_cpu and prepare_next_token_ids_padded.
-    Each will produce a device tensor of next_token_ids, taking as input
-    either the GPU tensor of sampled_token_ids with -1 for rejected tokens,
-    or the CPU python list[list[int]] with the rejected tokens removed.
-    """
-    device = torch.device(current_platform.device_type)
-
-    num_requests = 4
-    num_speculative_tokens = 4
-    batch_spec = BatchSpec(
-        seq_lens=[num_speculative_tokens + 1] * num_requests,
-        query_lens=[num_speculative_tokens + 1] * num_requests,
-    )
-
-    req_ids = [f"req_{i+1}" for i in range(num_requests)]
-    mock_input_batch = mock.MagicMock(spec=InputBatch)
-    mock_input_batch.req_ids = req_ids
-    mock_input_batch.num_reqs = num_requests
-    mock_input_batch.vocab_size = 100
-
-    mock_num_scheduled_tokens = {req_id: 0 for req_id in req_ids}
-    mock_requests = {}
-    for req_id in req_ids:
-        mock_request = mock.MagicMock(spec=CachedRequestState)
-        # Each request will have a backup next token id of 10, 20, 30, 40
-        mock_request.get_token_id.return_value = int(req_id.split("_")[1]) * 10
-        mock_request.num_computed_tokens = 0
-        mock_requests[req_id] = mock_request
-
-    sampled_token_ids = [
-        [0, 1, -1, -1, -1],  # 1 accepted, 3 rejected, "1" sampled
-        [0, 1, 2, 3, 4],  # all accepted, "4" sampled
-        [-1, -1, -1, -1, -1],  # sampling skipped, use backup token "30"
-        [-1, -1, -1, -1, -1]  # this request will be discarded
-    ]
-    sampled_token_ids_tensor = torch.tensor(sampled_token_ids,
-                                            dtype=torch.int32,
-                                            device=device)
-    sampled_token_ids_cpu = [[i for i in seq if i != -1]
-                             for seq in sampled_token_ids]
-
-    expected_next_token_ids_cpu = [1, 4, 30, 40]
-    expected_next_token_ids_tensor = torch.tensor(expected_next_token_ids_cpu,
-                                                  dtype=torch.int32,
-                                                  device=device)
-
-    proposer = _create_proposer("eagle", num_speculative_tokens)
-
-    next_token_ids_from_cpu = proposer.prepare_next_token_ids_cpu(
-        sampled_token_ids_cpu, mock_requests, mock_input_batch,
-        mock_num_scheduled_tokens)
-
-    assert torch.equal(next_token_ids_from_cpu, expected_next_token_ids_tensor)
-
-    common_attn_metadata = create_common_attn_metadata(
-        batch_spec,
-        block_size=16,
-        device=device,
-    )
-
-    discarded_req_indices = torch.tensor([3], dtype=torch.int64, device=device)
-    num_discarded_reqs = 1
-
-    expected_valid_sampled_tokens_count = torch.tensor([2, 5, 0, 0],
-                                                       dtype=torch.int32,
-                                                       device=device)
-
-    next_token_ids_from_padded, valid_sampled_tokens_count = \
-        proposer.prepare_next_token_ids_padded(
-            common_attn_metadata, sampled_token_ids_tensor, mock_requests,
-            mock_input_batch, discarded_req_indices, num_discarded_reqs)
-
-    assert torch.equal(next_token_ids_from_padded,
-                       expected_next_token_ids_tensor)
-    assert torch.equal(valid_sampled_tokens_count,
-                       expected_valid_sampled_tokens_count)
-
-
 def test_prepare_inputs():
    """
    cu_target_query_lens: [0, a, a + b, a + b + c]
@ -172,24 +90,10 @@ def test_prepare_inputs():
        device=device,
    )

-    # If there are `k` sampled tokens, then `k-1` tokens are draft tokens
-    # from the previous iteration, and the last token is the bonus token sampled
-    # from the base model.
-    num_draft_tokens = [3, 6, 4]  # one less than query_lens
-    # num rejected tokens is [1, 3, 2]
-    ACCEPT_TOKEN = 0
-    BONUS_TOKEN = 1
-    REJECT_TOKEN = -1
-    sampled_token_ids = [
-        [ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN, BONUS_TOKEN],
-        [
-            ACCEPT_TOKEN, ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN,
-            REJECT_TOKEN, REJECT_TOKEN, BONUS_TOKEN
-        ],
-        [ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN, REJECT_TOKEN, BONUS_TOKEN]
-    ]
-    sampled_token_ids = [[i for i in seq if i != REJECT_TOKEN]
-                         for seq in sampled_token_ids]
+    # Rejected tokens per request: [1, 3, 2]
+    num_rejected_tokens = torch.tensor([1, 3, 2],
+                                       dtype=torch.int32,
+                                       device=device)

    # Expected calculations:
    # query_len_per_req = [4, 7, 5]
@ -221,7 +125,7 @@ def test_prepare_inputs():
    proposer = _create_proposer("eagle", 1)

    updated_metadata, token_indices = proposer.prepare_inputs(
-        common_attn_metadata, sampled_token_ids, num_draft_tokens)
+        common_attn_metadata, num_rejected_tokens.cpu())

    assert torch.equal(updated_metadata.query_start_loc,
                       expected_cu_num_tokens)
@ -229,77 +133,6 @@ def test_prepare_inputs():
    assert torch.equal(token_indices, expected_token_indices)


-def test_prepare_inputs_padded():
-    """
-    Input scenario is 3 requests with num_speculative_tokens == 2 and:
-    - Request 1: query_len = 3, rejected = 1
-    - Request 2: query_len = 3, rejected = 0
-    - Request 3: query_len = 3, rejected = 2
-
-    Expected outputs:
-    token_indices: [0, 1, 2,
-                    3, 4, 5,
-                    6, 7, 8]
-    Reason: Deferred computation should not disturb the original indices.
-
-    token_indices_to_sample: [1, 5, 6]
-    Reason: After accounting for rejections, these are the valid token positions
-            from the original indices to sample from.
-    """
-
-    device = torch.device(current_platform.device_type)
-
-    expected_token_indices = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8],
-                                          dtype=torch.int32,
-                                          device=device)
-    expected_token_indices_to_sample = torch.tensor([1, 5, 6],
-                                                    dtype=torch.int32,
-                                                    device=device)
-
-    num_speculative_tokens = 2
-    batch_spec = BatchSpec(
-        seq_lens=[3, 3, 3],
-        query_lens=[3, 3, 3],
-    )
-
-    common_attn_metadata = create_common_attn_metadata(
-        batch_spec,
-        block_size=16,
-        device=device,
-    )
-
-    # Needed for cu_num_draft_tokens, which is expected to be [3, 6, 9]
-    expected_query_start_loc = torch.tensor([0, 3, 6, 9],
-                                            dtype=torch.int32,
-                                            device=device)
-    spec_decode_metadata = SpecDecodeMetadata.make_dummy(
-        draft_token_ids=[[0] * num_speculative_tokens] * 3,
-        device=device,
-    )
-
-    # num_rejected_tokens = [1, 0, 2]
-    # num_draft_tokens = [2, 2, 2]
-    # valid_sampled_tokens_count = num_draft_tokens + 1 - num_rejected_tokens
-    valid_sampled_tokens_count = torch.tensor([2, 3, 1],
-                                              dtype=torch.int32,
-                                              device=device)
-
-    proposer = _create_proposer("eagle", num_speculative_tokens)
-
-    output_metadata, token_indices, token_indices_to_sample = \
-        proposer.prepare_inputs_padded(
-            common_attn_metadata,
-            spec_decode_metadata,
-            valid_sampled_tokens_count)
-
-    assert output_metadata.max_query_len == 3
-    assert torch.equal(output_metadata.query_start_loc,
-                       expected_query_start_loc)
-    assert torch.equal(token_indices, expected_token_indices)
-    assert torch.equal(token_indices_to_sample,
-                       expected_token_indices_to_sample)
-
-
@pytest.mark.parametrize("method", ["eagle", "eagle3"])
@pytest.mark.parametrize("attn_backend",
                         get_attn_backend_list_based_on_platform())
@ -540,7 +373,6 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
                              target_positions=target_positions,
                              target_hidden_states=target_hidden_states,
                              next_token_ids=next_token_ids,
-                              last_token_indices=None,
                              common_attn_metadata=common_attn_metadata,
                              sampling_metadata=sampling_metadata)

@ -694,7 +526,6 @@ def test_propose_tree(spec_token_tree):
                              target_positions=target_positions,
                              target_hidden_states=target_hidden_states,
                              next_token_ids=next_token_ids,
-                              last_token_indices=None,
                              common_attn_metadata=common_attn_metadata,
                              sampling_metadata=sampling_metadata)
    assert result.shape == (batch_size, num_speculative_tokens)
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@ -7,6 +7,7 @@ import pytest
 import vllm.envs as envs
 from vllm import LLM
 from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine

 MODEL = "meta-llama/Llama-3.2-1B-Instruct"

@ -95,3 +96,20 @@ def test_v1_attn_backend(monkeypatch):
        _ = AsyncEngineArgs(model=MODEL).create_engine_config()
        assert envs.VLLM_USE_V1
        m.delenv("VLLM_USE_V1")
+
+
+def test_reject_using_constructor_directly(monkeypatch):
+    """Calling the V0 ``AsyncLLMEngine`` constructor directly with a config
+    produced by ``create_engine_config`` is expected to raise ValueError."""
+    with monkeypatch.context() as m:
+        # Drop any pre-set value so the config creation below decides it.
+        if os.getenv("VLLM_USE_V1", None):
+            m.delenv("VLLM_USE_V1")
+
+        # Creating the engine config sets VLLM_USE_V1=1.
+        engine_args = AsyncEngineArgs(model=MODEL)
+        vllm_config = engine_args.create_engine_config()
+
+        # Go through the V0 constructor directly; this must be rejected.
+        with pytest.raises(ValueError):
+            executor_cls = AsyncLLMEngine._get_executor_cls(vllm_config)
+            AsyncLLMEngine(vllm_config, executor_cls, log_stats=True)
+
+        m.delenv("VLLM_USE_V1")
--- a/tests/worker/__init__.py
+++ b/tests/worker/__init__.py
--- a/tests/worker/conftest.py
+++ b/tests/worker/conftest.py
@ -0,0 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def use_v0_only(monkeypatch):
+    """Set ``VLLM_USE_V1=0`` around every test, because the tests here
+    exercise V0 internals."""
+    monkeypatch.setenv("VLLM_USE_V1", "0")
--- a/tests/worker/test_model_input.py
+++ b/tests/worker/test_model_input.py
@ -0,0 +1,113 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import dataclasses
+
+import torch
+
+from vllm.attention import AttentionMetadata, AttentionMetadataBuilder
+from vllm.attention.backends.abstract import AttentionBackend
+from vllm.attention.backends.utils import CommonAttentionState
+from vllm.model_executor import SamplingMetadata
+from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
+
+
+class MockAttentionBackend(AttentionBackend):
+    """Stub attention backend for the round-trip test below.
+
+    Only the metadata, builder and state class getters return a value; the
+    remaining hooks either raise ``NotImplementedError`` or do nothing.
+    """
+
+    @staticmethod
+    def get_name() -> str:
+        """Not provided by this stub."""
+        raise NotImplementedError()
+
+    @staticmethod
+    def get_impl_cls():
+        """Not provided by this stub."""
+        raise NotImplementedError()
+
+    @staticmethod
+    def get_metadata_cls() -> type["AttentionMetadata"]:
+        """Return the base ``AttentionMetadata`` class."""
+        return AttentionMetadata
+
+    @staticmethod
+    def get_builder_cls() -> type["AttentionMetadataBuilder"]:
+        """Return the base ``AttentionMetadataBuilder`` class."""
+        return AttentionMetadataBuilder
+
+    @staticmethod
+    def get_state_cls() -> type["CommonAttentionState"]:
+        """Return ``CommonAttentionState``."""
+        return CommonAttentionState
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> tuple[int, ...]:
+        """Not provided by this stub."""
+        raise NotImplementedError()
+
+    @staticmethod
+    def swap_blocks(
+        src_kv_cache: torch.Tensor,
+        dst_kv_cache: torch.Tensor,
+        src_to_dst: torch.Tensor,
+    ) -> None:
+        """No-op."""
+        return None
+
+    @staticmethod
+    def copy_blocks(
+        kv_caches: list[torch.Tensor],
+        src_to_dists: torch.Tensor,
+    ) -> None:
+        """No-op."""
+        return None
+
+
+def test_model_runner_input():
+    """Round-trip a ``ModelInputForGPUWithSamplingMetadata`` through its
+    broadcastable tensor-dict form and compare the received copy."""
+    attn_metadata = AttentionMetadata(
+        num_prefills=1,
+        num_prefill_tokens=2,
+        num_decode_tokens=3,
+        slot_mapping=torch.zeros(1),
+        multi_modal_placeholder_index_maps=None,
+        enable_kv_scales_calculation=True,
+    )
+    sampling_metadata = SamplingMetadata(
+        ["seq_group"],
+        "selected_token_indices",
+        "categorized_sample_indices",
+        "num_prompts",
+    )
+    model_input = ModelInputForGPUWithSamplingMetadata(
+        input_tokens=torch.ones(10),
+        input_positions=torch.ones(10),
+        sampling_metadata=sampling_metadata,
+        attn_metadata=attn_metadata)
+
+    assert isinstance(model_input, ModelInputForGPUWithSamplingMetadata)
+
+    # Serialize to a tensor dict, then rebuild it with the mock backend.
+    model_input_cls = ModelInputForGPUWithSamplingMetadata
+    received = model_input_cls.from_broadcasted_tensor_dict(
+        model_input.as_broadcastable_tensor_dict(),
+        attn_backend=MockAttentionBackend())
+    assert isinstance(received, ModelInputForGPUWithSamplingMetadata)
+
+    # Tensor-valued inputs must be present and element-wise equal.
+    for tensor_name in ("input_tokens", "input_positions"):
+        got = getattr(received, tensor_name)
+        assert got is not None
+        assert (got == getattr(model_input, tensor_name)).all()
+
+    # Fields never set on the original stay None on the received copy.
+    for unset_name in ("multi_modal_kwargs", "lora_requests",
+                       "lora_mapping"):
+        got = getattr(received, unset_name)
+        assert got is None
+        assert got == getattr(model_input, unset_name)
+
+    # Every AttentionMetadata dataclass field must match the original.
+    for field in dataclasses.fields(AttentionMetadata):
+        received_value = getattr(received.attn_metadata, field.name, None)
+        assert received_value == getattr(attn_metadata, field.name, None)
+
+    # For sampling metadata, only selected_token_indices is copied.
+    received_sampling = received.sampling_metadata
+    assert (received_sampling.selected_token_indices ==
+            sampling_metadata.selected_token_indices)
+    assert received_sampling.seq_groups is None
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@ -0,0 +1,462 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
+                                             init_distributed_environment)
+from vllm.engine.arg_utils import EngineArgs
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
+from vllm.utils import get_open_port
+from vllm.worker.model_runner import ModelRunner
+
+
+def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
+    """Build a driver-worker ``ModelRunner`` for ``model``.
+
+    Extra positional and keyword arguments are forwarded to ``EngineArgs``.
+    """
+    vllm_config = EngineArgs(model, *args, **kwargs).create_engine_config()
+    return ModelRunner(vllm_config=vllm_config, is_driver_worker=True)
+
+
+def test_deepseek_mla_attn_backend_module():
+    """The runner for DeepSeek-Coder-V2-Lite-Instruct is expected to select
+    the ``TritonMLABackend`` attention backend."""
+    runner = _create_model_runner(
+        "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
+        trust_remote_code=True,
+        enable_chunked_prefill=False,
+    )
+    backend_name = runner.attn_backend.__name__
+    assert backend_name == "TritonMLABackend"
+
+
+@pytest.mark.parametrize("batch_size", list(range(1, 257, 3)))
+@pytest.mark.parametrize("use_prompt_embeds", [True, False])
+def test_prepare_prompt(batch_size, use_prompt_embeds, monkeypatch):
+    """Check the model input and sampling metadata built for a prefill-only
+    batch of ``batch_size`` prompts, with and without prompt embeddings."""
+    if use_prompt_embeds:
+        # Prompt Embeddings is only currently supported on V0
+        monkeypatch.setenv("VLLM_USE_V1", "0")
+
+    model_runner = _create_model_runner(
+        "facebook/opt-125m",
+        max_num_batched_tokens=100000,
+        max_num_seqs=100000,
+        enable_chunked_prefill=False,
+        enable_prompt_embeds=True,
+    )
+
+    seq_lens: list[int] = []
+    seq_group_metadata_list: list[SequenceGroupMetadata] = []
+    block_tables = {0: [1]}
+    expected_input_embeds_len = 0
+    for i in range(batch_size):
+        # make sure all tokens fit into one block
+        seq_len = i % (model_runner.block_size - 1) + 1
+        seq_lens.append(seq_len)
+        if use_prompt_embeds:
+            seq_data = SequenceData.from_seqs(
+                prompt_token_ids=[0] * seq_len,
+                prompt_embeds=torch.rand(seq_len, 10),
+            )
+            expected_input_embeds_len += seq_len
+        else:
+            seq_data = SequenceData.from_seqs(prompt_token_ids=range(seq_len))
+
+        seq_group_metadata = SequenceGroupMetadata(
+            request_id=f"test_{i}",
+            is_prompt=True,
+            seq_data={0: seq_data},
+            sampling_params=SamplingParams(temperature=0),
+            block_tables=block_tables,
+        )
+        assert seq_group_metadata.token_chunk_size == seq_data.get_len()
+        seq_group_metadata_list.append(seq_group_metadata)
+
+    # The sampled position of each prompt is its last token in the
+    # flattened batch.
+    expected_selected_token_indices = []
+    selected_token_start_idx = 0
+    for seq_len in seq_lens:
+        expected_selected_token_indices.append(selected_token_start_idx +
+                                               seq_len - 1)
+        selected_token_start_idx += seq_len
+    model_input = model_runner._prepare_model_input_tensors(
+        seq_group_metadata_list)
+    input_tokens = model_input.input_tokens
+    input_positions = model_input.input_positions
+    input_embeds = model_input.inputs_embeds
+    attn_metadata = model_input.attn_metadata
+    return_seq_lens = model_input.seq_lens
+    slot_mapping = attn_metadata.slot_mapping
+    assert return_seq_lens == seq_lens
+    assert len(slot_mapping) == len(input_tokens)
+
+    # Verify input metadata is correct for prompts.
+    device = model_runner.device
+    assert attn_metadata.num_prefills > 0
+    assert attn_metadata.num_decode_tokens == 0
+    torch.testing.assert_close(
+        attn_metadata.seq_lens_tensor,
+        torch.tensor(seq_lens, device=device, dtype=torch.int))
+    assert attn_metadata.seq_lens == seq_lens
+    assert attn_metadata.max_prefill_seq_len == max(seq_lens)
+    assert attn_metadata.max_decode_seq_len == 0
+
+    # Test subquery start locs.
+    start_idx = 0
+    start_loc = [start_idx]
+    for seq_len in seq_lens:
+        start_idx += seq_len
+        start_loc.append(start_idx)
+    torch.testing.assert_close(
+        attn_metadata.query_start_loc,
+        torch.tensor(start_loc, dtype=torch.int32, device=device))
+
+    # Test seq start locs. Note that for normal prefill it is
+    # equivalent to query_start_loc.
+    start_idx = 0
+    seq_start_loc = [start_idx]
+    for seq_len in seq_lens:
+        start_idx += seq_len
+        seq_start_loc.append(start_idx)
+
+    # Compare against the list built for this check, not start_loc.
+    torch.testing.assert_close(
+        attn_metadata.seq_start_loc,
+        torch.tensor(seq_start_loc, dtype=torch.int32, device=device))
+    torch.testing.assert_close(
+        attn_metadata.context_lens_tensor,
+        torch.zeros(attn_metadata.context_lens_tensor.shape[0],
+                    dtype=torch.int,
+                    device=device))
+
+    expected = torch.tensor([[] for _ in range(len(seq_group_metadata_list))],
+                            dtype=torch.int32,
+                            device=model_runner.device)
+    torch.testing.assert_close(attn_metadata.block_tables, expected)
+    # Cuda graph should not be used for prefill.
+    assert attn_metadata.use_cuda_graph is False
+
+    assert len(input_tokens) == sum(seq_lens)
+    assert len(input_positions) == sum(seq_lens)
+    if expected_input_embeds_len == 0:
+        # Without prompt embeds the token ids are range(seq_len), so they
+        # coincide with the positions.
+        torch.testing.assert_close(input_tokens, input_positions)
+        assert input_embeds is None
+    else:
+        assert len(input_embeds) == expected_input_embeds_len
+
+    # Verify sampling: one selected index per prompt (its last token).
+    sampling_metadata = SamplingMetadata.prepare(
+        seq_group_metadata_list,
+        seq_lens,
+        query_lens=seq_lens,
+        device=model_runner.device,
+        pin_memory=model_runner.pin_memory)
+    actual = sampling_metadata.selected_token_indices
+    expected = torch.tensor(expected_selected_token_indices,
+                            device=actual.device,
+                            dtype=actual.dtype)
+    torch.testing.assert_close(actual, expected)
+
+
+@pytest.mark.parametrize("batch_size", list(range(1, 257, 3)))
+@pytest.mark.parametrize("use_prompt_embeds", [True, False])
+def test_prepare_decode_cuda_graph(batch_size, use_prompt_embeds, monkeypatch):
+    """Check the model input and sampling metadata built for a decode-only
+    batch with ``enforce_eager=False``.
+
+    The batch is expected to be padded to
+    ``vllm_config.pad_for_cudagraph(batch_size)`` entries and to report
+    ``use_cuda_graph`` as True.
+    """
+    if use_prompt_embeds:
+        # Prompt Embeddings is only currently supported on V0
+        monkeypatch.setenv("VLLM_USE_V1", "0")
+
+    model_runner = _create_model_runner(
+        "facebook/opt-125m",
+        seed=0,
+        dtype="float16",
+        enforce_eager=False,
+        max_num_batched_tokens=100000,
+        max_num_seqs=100000,
+        enable_chunked_prefill=False,
+        enable_prompt_embeds=True,
+    )
+
+    context_lens: list[int] = []
+    seq_group_metadata_list: list[SequenceGroupMetadata] = []
+    # Assume each seq group finishes prefill.
+    for i in range(batch_size):
+        # make sure all tokens fit into one block
+        context_len = i % (model_runner.block_size - 1) + 1
+        context_lens.append(context_len)
+        if use_prompt_embeds:
+            seq_data = SequenceData.from_seqs(
+                prompt_token_ids=[0] * context_len,
+                prompt_embeds=torch.rand(context_len, 10),
+            )
+            output_embed = torch.rand(10)
+        else:
+            seq_data = SequenceData.from_seqs(
+                prompt_token_ids=range(context_len))
+            output_embed = None
+        seq_data.update_num_computed_tokens(context_len)
+        # Append one token ID since prefill is finished.
+        seq_data.append_token_id(1, 0, output_embed)
+        seq_group_metadata = SequenceGroupMetadata(
+            request_id=f"test_{i}",
+            is_prompt=False,
+            seq_data={0: seq_data},
+            sampling_params=SamplingParams(temperature=0),
+            block_tables={0: [1]},
+        )
+        assert seq_group_metadata.token_chunk_size == 1
+        seq_group_metadata_list.append(seq_group_metadata)
+
+    model_input = model_runner._prepare_model_input_tensors(
+        seq_group_metadata_list)
+    input_tokens = model_input.input_tokens
+    input_positions = model_input.input_positions
+    input_embeds = model_input.inputs_embeds
+    attn_metadata = model_input.attn_metadata
+    slot_mapping = attn_metadata.slot_mapping
+
+    assert len(slot_mapping) == len(input_tokens)
+
+    # Batch size after padding for CUDA graph capture.
+    expected_bs = model_runner.vllm_config.pad_for_cudagraph(
+        len(seq_group_metadata_list))
+    # Verify input metadata is correct for decodes.
+    device = model_runner.device
+    assert attn_metadata.num_prefills == 0
+    assert attn_metadata.num_prefill_tokens == 0
+    seq_lens = [context_len + 1 for context_len in context_lens]
+    # seq_lens are padded to expected_bs
+    for _ in range(expected_bs - len(seq_lens)):
+        seq_lens.append(1)
+    assert attn_metadata.seq_lens == seq_lens
+    assert attn_metadata.num_decode_tokens == len(seq_lens)
+    # query_start_loc covers only the real (unpadded) requests.
+    start_idx = 0
+    start_loc = [start_idx]
+    for _ in context_lens:
+        # decode has only 1 token for query.
+        start_idx += 1
+        start_loc.append(start_idx)
+    torch.testing.assert_close(
+        attn_metadata.query_start_loc,
+        torch.tensor(start_loc, dtype=torch.int32, device=device))
+
+    # seq_start_loc is built from the padded seq_lens.
+    start_idx = 0
+    seq_start_loc = [start_idx]
+    for seq_len in seq_lens:
+        start_idx += seq_len
+        seq_start_loc.append(start_idx)
+    torch.testing.assert_close(
+        attn_metadata.seq_start_loc,
+        torch.tensor(seq_start_loc, dtype=torch.int32, device=device))
+
+    torch.testing.assert_close(
+        attn_metadata.context_lens_tensor,
+        torch.tensor(context_lens, dtype=torch.int, device=device))
+    assert attn_metadata.max_decode_seq_len == max(seq_lens)
+    torch.testing.assert_close(
+        attn_metadata.seq_lens_tensor[:len(seq_lens)],
+        torch.tensor(seq_lens, dtype=torch.int, device=device))
+
+    # block table's first index corresponds to each batch, meaning in
+    # decoding it is each token.
+    assert attn_metadata.block_tables.shape[0] == len(input_tokens)
+    # Block table's second dim corresponds to each token's block number.
+    # It is padded up to model_runner.get_max_block_per_batch().
+    assert attn_metadata.block_tables.shape[1] == (
+        model_runner.get_max_block_per_batch())
+    assert attn_metadata.use_cuda_graph is True
+
+    assert len(input_tokens) == expected_bs
+    assert len(input_positions) == expected_bs
+    if use_prompt_embeds:
+        # start_loc[-1] is the number of real decode requests.
+        expected_input_embeds_length = start_loc[-1]
+        assert len(input_embeds) == expected_input_embeds_length
+        assert expected_input_embeds_length <= expected_bs
+    else:
+        assert input_embeds is None
+
+    # Verify Sampling
+    expected_selected_token_indices = []
+    for selected_token_start_idx, _ in enumerate(context_lens):
+        expected_selected_token_indices.append(selected_token_start_idx)
+    sampling_metadata = SamplingMetadata.prepare(
+        seq_group_metadata_list,
+        seq_lens,
+        # query lens is all 1 for decode.
+        query_lens=[1 for _ in range(len(context_lens))],
+        device=model_runner.device,
+        pin_memory=model_runner.pin_memory)
+    actual = sampling_metadata.selected_token_indices
+    expected = torch.tensor(expected_selected_token_indices,
+                            device=actual.device,
+                            dtype=actual.dtype)
+    torch.testing.assert_close(actual, expected)
+
+
+def test_empty_seq_group():
+    """Preparing model input from an empty seq-group list yields no tensors
+    and no attention metadata, on two consecutive calls."""
+    runner = _create_model_runner(
+        "facebook/opt-125m",
+        seed=0,
+        dtype="float16",
+        enforce_eager=False,
+    )
+    no_seq_groups: list[SequenceGroupMetadata] = []
+
+    # First call: tokens, positions and attention metadata are absent.
+    first = runner._prepare_model_input_tensors(no_seq_groups)
+    for attr in ("input_tokens", "input_positions", "attn_metadata"):
+        assert getattr(first, attr) is None
+
+    # Second call on the same list: also check embeds and seq_lens.
+    second = runner._prepare_model_input_tensors(no_seq_groups)
+    tokens = second.input_tokens
+    positions = second.input_positions
+    embeds = second.inputs_embeds
+    metadata = second.attn_metadata
+    returned_seq_lens = second.seq_lens
+    for value in (tokens, positions, embeds, metadata, returned_seq_lens):
+        assert value is None
+
+
+@pytest.fixture
+def distributed_init():
+    """Initialize a world-size-1 distributed environment on a free local TCP
+    port, then call ``ensure_model_parallel_initialized(1, 1)``."""
+    init_method = f"tcp://127.0.0.1:{get_open_port()}"
+    init_distributed_environment(world_size=1,
+                                 rank=0,
+                                 distributed_init_method=init_method,
+                                 local_rank=0)
+    ensure_model_parallel_initialized(1, 1)
+
+
+@pytest.mark.parametrize("batch_size", list(range(2, 128, 3)))
+@pytest.mark.parametrize("enforce_eager", [True, False])
+@pytest.mark.parametrize('use_prompt_embeds', [True, False])
+def test_hybrid_batches(batch_size, enforce_eager, use_prompt_embeds,
+                        distributed_init, monkeypatch):
+    """Check model input preparation for a mixed batch (chunked prefill on).
+
+    The first ``batch_size // 2`` requests are prefills and the remainder
+    are decodes. The prefill/decode attention metadata obtained through
+    ``prepare_model_input`` must also match, field by field, the metadata
+    obtained through ``_prepare_model_input_tensors``.
+    """
+    if use_prompt_embeds:
+        # Prompt Embeddings is only currently supported on V0
+        monkeypatch.setenv("VLLM_USE_V1", "0")
+
+    model_runner = _create_model_runner(
+        "facebook/opt-125m",
+        seed=0,
+        dtype="float16",
+        enforce_eager=enforce_eager,
+        max_num_batched_tokens=100000,
+        max_num_seqs=100000,
+        enable_chunked_prefill=True,
+        enable_prompt_embeds=True,
+    )
+
+    # Add prefill requests.
+    seq_lens: list[int] = []
+    seq_group_metadata_list: list[SequenceGroupMetadata] = []
+    prefill_metadata_list: list[SequenceGroupMetadata] = []
+    decode_metadata_list: list[SequenceGroupMetadata] = []
+    block_tables = {0: [1]}
+    prefill_batch_size = batch_size // 2
+    decode_batch_size = batch_size - prefill_batch_size
+    expected_input_embeds_len = 0
+    for i in range(prefill_batch_size):
+        # make sure all tokens fit into one block
+        seq_len = i % (model_runner.block_size - 1) + 1
+        seq_lens.append(seq_len)
+        if use_prompt_embeds:
+            seq_data = SequenceData.from_seqs(
+                prompt_token_ids=[0] * seq_len,
+                prompt_embeds=torch.rand(seq_len, 10),
+            )
+            expected_input_embeds_len += seq_len
+        else:
+            seq_data = SequenceData.from_seqs(
+                prompt_token_ids=range(seq_len), )
+        seq_group_metadata = SequenceGroupMetadata(
+            request_id=f"test_{i}",
+            is_prompt=True,
+            seq_data={0: seq_data},
+            sampling_params=SamplingParams(temperature=0),
+            block_tables=block_tables,
+        )
+        assert seq_group_metadata.token_chunk_size == seq_data.get_len()
+        seq_group_metadata_list.append(seq_group_metadata)
+        prefill_metadata_list.append(seq_group_metadata)
+
+    # Add decode requests
+    for i in range(prefill_batch_size, batch_size):
+        # make sure all tokens fit into one block
+        context_len = i % (model_runner.block_size - 1) + 1
+        if use_prompt_embeds:
+            seq_data = SequenceData.from_seqs(
+                prompt_token_ids=[0] * context_len,
+                prompt_embeds=torch.rand(context_len, 10),
+            )
+            output_embed = torch.rand(10)
+            # This also iterates the expected input_embeds, because the model
+            # needs both the input and output embeddings passed into together
+            expected_input_embeds_len += 1
+        else:
+            seq_data = SequenceData.from_seqs(
+                prompt_token_ids=range(context_len), )
+            output_embed = None
+        assert len(seq_data.prompt_token_ids) == context_len
+        seq_data.append_token_id(1, 0, output_embed)
+        seq_data.update_num_computed_tokens(context_len)
+        seq_group_metadata = SequenceGroupMetadata(
+            request_id=f"test_{i}",
+            is_prompt=False,
+            seq_data={0: seq_data},
+            sampling_params=SamplingParams(temperature=0),
+            block_tables={0: [1]},
+        )
+        assert seq_group_metadata.token_chunk_size == 1
+        seq_group_metadata_list.append(seq_group_metadata)
+        decode_metadata_list.append(seq_group_metadata)
+
+    model_input = model_runner.prepare_model_input(seq_group_metadata_list)
+
+    input_tokens = model_input.input_tokens
+    input_positions = model_input.input_positions
+    input_embeds = model_input.inputs_embeds
+    attn_metadata = model_input.attn_metadata
+
+    prefill_meta_actual = attn_metadata.prefill_metadata
+    decode_meta_actual = attn_metadata.decode_metadata
+
+    assert len(attn_metadata.slot_mapping) == len(input_tokens)
+    assert len(input_positions) == len(input_tokens)
+    assert attn_metadata.num_prefills == prefill_batch_size
+    assert attn_metadata.num_decode_tokens == decode_batch_size
+    assert attn_metadata.num_prefill_tokens == sum(seq_lens)
+    if expected_input_embeds_len == 0:
+        assert input_embeds is None
+    else:
+        assert len(input_embeds) == expected_input_embeds_len
+
+    # Verify attn metadata is consistent. We don't need to test individual
+    # values here because they are tested above.
+    attn_metadata = model_runner._prepare_model_input_tensors(
+        seq_group_metadata_list).attn_metadata
+
+    # Compare attribute *values*. Iterating vars(obj) directly yields only
+    # the attribute names, which would make this check vacuous.
+    for expected_meta, actual_meta in (
+        (attn_metadata.prefill_metadata, prefill_meta_actual),
+        (attn_metadata.decode_metadata, decode_meta_actual),
+    ):
+        expected_fields = vars(expected_meta)
+        actual_fields = vars(actual_meta)
+        assert expected_fields.keys() == actual_fields.keys()
+        for name, expected_value in expected_fields.items():
+            actual_value = actual_fields[name]
+            if isinstance(expected_value, torch.Tensor):
+                # `==` on tensors is element-wise and cannot be used as a
+                # single truth value, so compare them with torch.equal.
+                assert torch.equal(expected_value, actual_value), name
+            else:
+                assert expected_value == actual_value, name
--- a/tests/worker/test_profile.py
+++ b/tests/worker/test_profile.py
@ -0,0 +1,68 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.worker.cache_engine import CacheEngine
+from vllm.worker.worker import Worker
+
+
+def test_gpu_memory_profiling():
+    """Check the KV-cache block count produced by worker memory profiling.
+
+    The reported GPU memory is mocked as a 10 GiB device so the test can
+    run on any GPU setup.
+    """
+    from unittest.mock import patch
+
+    gib = 1024**3
+
+    # Engine config used to build the worker.
+    engine_config = EngineArgs(model="facebook/opt-125m",
+                               dtype="half",
+                               load_format="dummy").create_engine_config()
+    engine_config.cache_config.num_gpu_blocks = 1000
+    engine_config.cache_config.num_cpu_blocks = 1000
+
+    # Create the worker.
+    init_method = get_distributed_init_method(get_ip(), get_open_port())
+    worker = Worker(
+        vllm_config=engine_config,
+        local_rank=0,
+        rank=0,
+        distributed_init_method=init_method,
+        is_driver_worker=True,
+    )
+
+    def fake_mem_info():
+        # Pretend the device has 10GiB in total; free memory is that total
+        # minus what torch currently has allocated.
+        in_use = torch.cuda.memory_stats()["allocated_bytes.all.current"]
+        total_bytes = 10 * gib
+        return (total_bytes - in_use, total_bytes)
+
+    with patch("torch.cuda.mem_get_info", side_effect=fake_mem_info):
+        # Load the model so we can profile it
+        worker.init_device()
+        worker.load_model()
+        gpu_blocks, _ = worker.determine_num_available_blocks()
+
+    # Peak vram usage by torch should be 0.47 GiB
+    # Model weights take 0.25 GiB
+    # No memory should be allocated outside of torch
+    # 9.0 GiB should be the utilization target
+    # 8.28 GiB should be available for the KV cache
+    block_size = CacheEngine.get_cache_block_size(
+        engine_config.cache_config, engine_config.model_config,
+        engine_config.parallel_config)
+    expected_blocks = (8.28 * gib) // block_size
+
+    # Allow a small tolerance for portability: hardware, kernel, or
+    # dependency changes could all affect memory utilization.
+    # A 100 block tolerance here should be about 60MB of wiggle room.
+    block_delta = abs(gpu_blocks - expected_blocks)
+    assert block_delta < 100
--- a/tests/worker/test_swap.py
+++ b/tests/worker/test_swap.py
@ -0,0 +1,87 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm.engine.arg_utils import EngineArgs
+from vllm.sequence import ExecuteModelRequest
+from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.worker.worker import Worker
+
+
+def test_swap() -> None:
+    """Check that Worker copies KV-cache blocks between GPU and CPU."""
+
+    # Engine config: distilgpt2, half precision, "dummy" load format.
+    config = EngineArgs(model="distilbert/distilgpt2",
+                        dtype="half",
+                        load_format="dummy").create_engine_config()
+    config.cache_config.num_gpu_blocks = 1000
+    config.cache_config.num_cpu_blocks = 1000
+
+    # Stand up a single driver worker at rank 0.
+    worker = Worker(
+        vllm_config=config,
+        local_rank=0,
+        rank=0,
+        distributed_init_method=get_distributed_init_method(
+            get_ip(), get_open_port()),
+        is_driver_worker=True,
+    )
+
+    # Bring the worker up and allocate both caches.
+    worker.init_device()
+    worker.load_model()
+    worker.initialize_cache(
+        num_gpu_blocks=config.cache_config.num_gpu_blocks,
+        num_cpu_blocks=config.cache_config.num_cpu_blocks)
+
+    # Fill every layer's key and value caches with random data.
+    gpu_cache = worker.cache_engine[0].gpu_cache
+    cpu_cache = worker.cache_engine[0].cpu_cache
+    num_layers = len(gpu_cache)
+    for layer in range(num_layers):
+        gpu_keys, gpu_values = gpu_cache[layer]
+        cpu_keys, cpu_values = cpu_cache[layer]
+        for tensor in (gpu_keys, gpu_values, cpu_keys, cpu_values):
+            tensor.random_()
+
+    def identical(a, b):
+        # Zero tolerances, so the two blocks must hold equal values.
+        return torch.allclose(a.cuda(), b.cuda(), rtol=0.0, atol=0.0)
+
+    def assert_blocks_match(gpu_cpu_pairs):
+        # Each pair is (GPU block index, CPU block index).
+        for layer in range(num_layers):
+            gpu_keys, gpu_values = gpu_cache[layer]
+            cpu_keys, cpu_values = cpu_cache[layer]
+            for gpu_block, cpu_block in gpu_cpu_pairs:
+                assert identical(gpu_keys[gpu_block], cpu_keys[cpu_block])
+                assert identical(gpu_values[gpu_block],
+                                 cpu_values[cpu_block])
+
+    # Swap out: (GPU source, CPU destination) pairs.
+    blocks_to_swap_out = [(3, 72), (56, 35), (84, 34)]
+    request = ExecuteModelRequest(
+        seq_group_metadata_list=[],
+        blocks_to_swap_in=[],
+        blocks_to_swap_out=blocks_to_swap_out,
+        blocks_to_copy=[],
+    )
+    worker.execute_model(execute_model_req=request)
+    assert_blocks_match(blocks_to_swap_out)
+
+    # Swap in: (CPU source, GPU destination) pairs, reusing the request.
+    request.blocks_to_swap_out = []
+    request.blocks_to_swap_in = [
+        (19, 45),
+        (67, 23),
+        (12, 78),
+        (40, 99),
+        (1, 71),
+    ]
+    worker.execute_model(execute_model_req=request)
+    # Read the pairs back from the request and flip them to (GPU, CPU).
+    swapped_in = request.blocks_to_swap_in
+    assert_blocks_match([(gpu_block, cpu_block)
+                         for cpu_block, gpu_block in swapped_in])
--- a/tools/validate_config.py
+++ b/tools/validate_config.py
@ -9,8 +9,6 @@ import ast
 import inspect
 import sys

-import regex as re
-

 def get_attr_docs(cls_node: ast.ClassDef) -> dict[str, str]:
    """
@ -90,12 +88,11 @@ def validate_class(class_node: ast.ClassDef):
    for stmt in class_node.body:
        # A field is defined as a class variable that has a type annotation.
        if isinstance(stmt, ast.AnnAssign):
-            # Skip ClassVar and InitVar
+            # Skip ClassVar
            # see https://docs.python.org/3/library/dataclasses.html#class-variables
-            # and https://docs.python.org/3/library/dataclasses.html#init-only-variables
-            if (isinstance(stmt.annotation, ast.Subscript)
-                    and isinstance(stmt.annotation.value, ast.Name)
-                    and stmt.annotation.value.id in {"ClassVar", "InitVar"}):
+            if isinstance(stmt.annotation, ast.Subscript) and isinstance(
+                    stmt.annotation.value,
+                    ast.Name) and stmt.annotation.value.id == "ClassVar":
                continue

            if isinstance(stmt.target, ast.Name):
@ -135,7 +132,7 @@ def validate_ast(tree: ast.stmt):

 def validate_file(file_path: str):
    try:
-        print(f"Validating {file_path} config dataclasses ", end="")
+        print(f"validating {file_path} config dataclasses ", end="")
        with open(file_path, encoding="utf-8") as f:
            source = f.read()

@ -143,7 +140,7 @@ def validate_file(file_path: str):
        validate_ast(tree)
    except ValueError as e:
        print(e)
-        raise SystemExit(1) from e
+        raise SystemExit(2) from e  # must be raised to set the exit status
    else:
        print("✅")

@ -154,13 +151,7 @@ def fail(message: str, node: ast.stmt):

 def main():
    for filename in sys.argv[1:]:
-        # Only run for Python files in vllm/ or tests/
-        if not re.match(r"^(vllm|tests)/.*\.py$", filename):
-            continue
-        # Only run if the file contains @config
-        with open(filename, encoding="utf-8") as f:
-            if "@config" in f.read():
-                validate_file(filename)
+        validate_file(filename)  # every argument is validated, unfiltered


 if __name__ == "__main__":
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@ -391,8 +391,8 @@ class MultiHeadAttention(nn.Module):
            backend = _Backend.FLASH_ATTN
            use_upstream_fa = True

-        if current_platform.is_rocm() or current_platform.is_xpu():
-            # currently, only torch_sdpa is supported on rocm/xpu
+        if current_platform.is_rocm():
+            # currently, only torch_sdpa is supported on rocm
            self.attn_backend = _Backend.TORCH_SDPA
        else:

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
copilot-swe-agent[bot]	6b814cf1e8	Remove temporary test file that was accidentally committed Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>	2025-09-18 00:41:19 +00:00
copilot-swe-agent[bot]	bf253afc1c	Add comprehensive documentation and examples for CPU weight loading Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>	2025-09-18 00:41:05 +00:00
copilot-swe-agent[bot]	8c64cf87f0	Initial plan	2025-09-18 00:34:31 +00:00