Compare commits

...

58 Commits

Author SHA1 Message Date
44d638a896 merge
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-25 10:26:20 -07:00
0a049c7d86 [CI/Build] Add tests for the V1 tpu_model_runner. (#14843)
Signed-off-by: Yarong Mu <ymu@google.com>
2025-03-25 12:27:16 -04:00
d0cfec7ab9 [bugfix] fix inductor cache on max_position_embeddings (#15436)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-03-25 07:05:39 -07:00
a608160027 [Kernel] Fix conflicting macro names for gguf kernels (#15456)
Signed-off-by: SzymonOzog <szymon.ozog@gmail.com>
2025-03-25 13:50:49 +00:00
3f04a7fbf2 [Doc] Update V1 user guide for multi-modality (#15460)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-25 11:01:58 +00:00
5994430b84 [Misc] Remove redundant num_embeds (#15443)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-25 18:27:57 +08:00
a9e879b316 [Misc] Clean up MiniCPM-V/O code (#15337)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-25 10:22:52 +00:00
3e2f37a69a Dockerfile.ppc64le changes to move to UBI (#15402)
Signed-off-by: Md. Shafi Hussain <Md.Shafi.Hussain@ibm.com>
2025-03-25 10:15:14 +00:00
4f044b1d67 [Kernel][CPU] CPU MLA (#14744)
Signed-off-by: Thien Tran <gau.nernst@yahoo.com.sg>
2025-03-25 09:34:59 +00:00
4157f563b4 [Hardware][TPU][Bugfix] Fix v1 mp profiler (#15409)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-03-25 01:43:00 -07:00
051da7efe3 Fix CUDA kernel index data type in vllm/csrc/quantization/gptq_marlin/awq_marlin_repack.cu +10 (#15160)
Signed-off-by: Lu Fang <lufang@fb.com>
Co-authored-by: Richard Barnes <rbarnes@meta.com>
2025-03-25 15:36:45 +08:00
25f560a62c [V1][Spec Decode] Update target_logits in place for rejection sampling (#15427)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-24 21:04:41 -07:00
a09ad90a72 [V1] guidance backend for structured output + auto fallback mode (#14779)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Loc Huynh <jc1da.3011@gmail.com>
Co-authored-by: Michal Moskal <michal@moskal.me>
2025-03-24 21:02:33 -07:00
10b34e36b9 [Bugfix] Fixed the issue of not being able to input video and image simultaneously (#15387)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-03-25 03:48:08 +00:00
b5269db959 Revert "Fix non-contiguous input passed to Marlin kernel (#15319)" (#15398) 2025-03-24 20:43:51 -07:00
6db94571d7 [Misc] Remove LoRA log (#15388)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-24 20:43:48 -07:00
97cfa65df7 Add pipeline parallel support to TransformersModel (#12832)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-03-25 10:41:45 +08:00
911c8eb000 [Minor][Spec Decode] Remove compiled_softmax (#15416)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-24 19:09:04 -07:00
ebcebeeb6b [V1][Spec Decode] Enable spec decode for top-p & top-k sampling (#15063)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-24 17:16:46 -07:00
f533b5837f [ROCm][Kernel] MoE weights padding (#14454)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Signed-off-by: charlifu <charlifu@amd.com>
Co-authored-by: charlifu <charlifu@amd.com>
2025-03-24 23:45:30 +00:00
8279201ce6 [Build] Cython compilation support fix (#14296)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-03-24 23:37:54 +00:00
23fdab00a8 [Hardware][TPU] Skip failed compilation test (#15421)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-03-24 23:28:57 +00:00
623e2ed29f [BugFix][V1] Quick fix for min_tokens with multiple EOS (#15407)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-24 15:58:59 -07:00
9d72daf4ce [V1][Perf] Simpler request output queues (#15156)
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Co-authored-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-24 22:44:08 +00:00
6dd55af6c9 [Doc] Update docs on handling OOM (#15357)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2025-03-24 14:29:34 -07:00
3eb08ed9b1 [DOC] Add Kubernetes deployment guide with CPUs (#14865) 2025-03-24 10:48:43 -07:00
5eeadc2642 [Hardware][Gaudi][Feature] Enable Dynamic MoE for Mixtral (#12303)
Signed-off-by: zhenwei <zhenweiliu@habana.ai>
2025-03-24 09:48:40 -07:00
3aee6573dc [V1] Aggregate chunked prompt logprobs in model runner (#14875)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-24 12:27:57 -04:00
9cc645141d [MISC] Refine no available block debug msg (#15076)
Signed-off-by: Yi Liu <yiliu4@habana.ai>
Signed-off-by: yiliu30 <yi4.liu@intel.com>
Co-authored-by: Yi Liu <yiliu4@habana.ai>
2025-03-25 00:01:10 +08:00
0893567db9 [V1][Minor] fix comments (#15392)
Signed-off-by: chenjincong <chenjincong@baidu.com>
Signed-off-by: Chen-0210 <chenjincong11@gmail.com>
Co-authored-by: chenjincong <chenjincong@baidu.com>
2025-03-24 08:45:32 -07:00
8abe69b499 [Core] Don't force uppercase for VLLM_LOGGING_LEVEL (#15306)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-24 08:27:30 -07:00
761702fd19 [Core] Integrate fastsafetensors loader for loading model weights (#10647)
Signed-off-by: Manish Sethi <Manish.sethi1@ibm.com>
2025-03-24 08:08:02 -07:00
9606d572ed [distributed] fix dp group (#15355)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-03-24 14:54:27 +00:00
cbcdf2c609 [Bugfix] Fix chat template loading (#15143)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: chaunceyjiang <chaunceyjiang@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2025-03-24 13:50:09 +00:00
038de04d7b Fix zmq IPv6 URL format error (#15341)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-24 09:30:41 -04:00
6b3cc75be0 [Kernel] allow non-contiguous input for marlin kernel (#14658)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
2025-03-24 09:21:33 -04:00
7ffcccfa5c Revert "[CI/Build] Use uv python for docker rather than ppa:deadsnakess/ppa (#13569)" (#15377)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-03-24 05:53:10 -07:00
cc8accfd53 [Misc] Update guided decoding logs to debug (#15310)
Signed-off-by: Benjamin Merkel <benjamin.merkel@tngtech.com>
Co-authored-by: Benjamin Merkel <benjamin.merkel@tngtech.com>
2025-03-24 04:25:20 -07:00
948ab03e7e [Bugfix][V1] Avoid importing PreTrainedModel (#15366)
Signed-off-by: Hollow Man <hollowman@opensuse.org>
2025-03-24 10:33:12 +00:00
5797fb97e9 [Misc] Remove ignore_reinit_error for ray.init() (#15373) 2025-03-24 07:41:53 +00:00
3892e58ad7 [Misc] Upgrade BNB version (#15183) 2025-03-24 05:51:42 +00:00
d20e261199 Fix non-contiguous input passed to Marlin kernel (#15319) 2025-03-24 03:09:44 +00:00
f622dbcf39 [Fix] [torch.compile] Improve UUID system for custom passes (#15249)
Signed-off-by: luka <luka@neuralmagic.com>
2025-03-24 01:54:07 +00:00
dccf535f8e [V1] Enable V1 Fp8 cache for FA3 in the oracle (#15191)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-03-23 15:07:04 -07:00
9c5c81b0da [Misc][Doc] Add note regarding loading generation_config by default (#15281)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-03-23 14:00:55 -07:00
d6cd59f122 [Frontend] Support tool calling and reasoning parser (#14511)
Signed-off-by: WangErXiao <863579016@qq.com>
2025-03-23 14:00:07 -07:00
bc8ed3c4ba [V1][Spec Decode] Use better defaults for N-gram (#15358)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-23 10:52:30 -07:00
b9bd76ca14 [V1][Spec Decode] Respect prompt_lookup_max (#15348)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-23 10:41:44 -07:00
6ebaf9ac71 [Bugfix] consider related env vars for torch.compiled cache hash (#14953)
Signed-off-by: DefTruth <31974251+DefTruth@users.noreply.github.com>
2025-03-23 15:53:09 +00:00
f90d34b498 [Misc] Add tuned R1 w8a8 and MoE configs for NVIDIA L20 (#15322)
Signed-off-by: DefTruth <qiustudent_r@163.com>
2025-03-23 01:10:10 -07:00
f68cce8e64 [ci/build] fix broken tests in LLM.collective_rpc (#15350)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-03-23 14:49:48 +08:00
09b6a95551 [ci/build] update torch nightly version for GH200 (#15135)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-03-23 14:04:13 +08:00
50c9636d87 [V1][Usage] Refactor speculative decoding configuration and tests (#14434)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
2025-03-22 19:28:10 -10:00
0661cfef7a Fix v1 supported oracle for worker-cls and worker-extension-cls (#15324)
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-03-23 10:23:35 +08:00
a827aa815d [doc] Add back previous news (#15331)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-03-22 17:38:33 -07:00
caacd1ddfb Merge branch 'main' into v1-block-table-opt 2025-01-23 22:59:55 -08:00
e68f63ef83 Simplify
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-15 02:31:16 -08:00
223e17424c working
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-15 01:24:45 -08:00
181 changed files with 7427 additions and 2553 deletions

View File

@@ -38,6 +38,8 @@ function cpu_tests() {
set -e
pip install -r vllm/requirements/test.txt
pip install -r vllm/requirements/cpu.txt
pytest -v -s tests/kernels/test_cache.py -m cpu_model
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model

View File

@@ -14,6 +14,7 @@ DOCKER_BUILDKIT=1 docker build . \
-t gh200-test \
--build-arg max_jobs=66 \
--build-arg nvcc_threads=2 \
--build-arg RUN_WHEEL_CHECK=false \
--build-arg torch_cuda_arch_list="9.0+PTX" \
--build-arg vllm_fa_cmake_gpu_arches="90-real"
@@ -23,6 +24,6 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and test offline inference
docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
'

View File

@@ -22,7 +22,7 @@ docker run --privileged --net host --shm-size=16G -it \
&& export VLLM_USE_V1=1 \
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo TEST_1 \
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
&& pytest /workspace/vllm/tests/tpu/test_compilation.py \
&& echo TEST_2 \
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
&& echo TEST_3 \
@@ -30,7 +30,9 @@ docker run --privileged --net host --shm-size=16G -it \
&& echo TEST_4 \
&& pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
&& echo TEST_5 \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py" \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py \
&& echo TEST_6 \
&& pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py" \
# TODO: This test fails because it uses RANDOM_SEED sampling

View File

@@ -118,7 +118,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
- pytest -v -s entrypoints/test_chat_utils.py
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
@@ -148,8 +148,8 @@ steps:
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- pushd ../examples/offline_inference
- python3 rlhf.py
- RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py
- VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
- label: Metrics, Tracing Test # 10min
@@ -515,7 +515,7 @@ steps:
- vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py
commands:
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'

3
.gitignore vendored
View File

@@ -2,7 +2,8 @@
/vllm/_version.py
# vllm-flash-attn built from source
vllm/vllm_flash_attn/
vllm/vllm_flash_attn/*
!vllm/vllm_flash_attn/fa_utils.py
# Byte-compiled / optimized / DLL files
__pycache__/

View File

@@ -228,6 +228,7 @@ endif()
set(VLLM_EXT_SRC
"csrc/cache_kernels.cu"
"csrc/block_table.cu"
"csrc/attention/paged_attention_v1.cu"
"csrc/attention/paged_attention_v2.cu"
"csrc/pos_encoding_kernels.cu"

View File

@@ -14,17 +14,22 @@ ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
# Install minimal dependencies and uv
RUN apt-get update -y \
&& apt-get install -y ccache git curl wget sudo \
&& curl -LsSf https://astral.sh/uv/install.sh | sh
# Add uv to PATH
ENV PATH="/root/.local/bin:$PATH"
# Create venv with specified Python and activate by placing at the front of path
ENV VIRTUAL_ENV="/opt/venv"
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
@@ -46,19 +51,22 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
WORKDIR /workspace
# install build and runtime dependencies
# arm64 (GH200) build follows the practice of "use existing pytorch" build,
# we need to install torch and torchvision from the nightly builds first,
# pytorch will not appear as a vLLM dependency in all of the following steps
# after this step
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
fi
COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/cuda.txt
uv pip install --system -r requirements/cuda.txt
# cuda arch list used by torch
# can be useful for both `dev` and `test`
@@ -83,7 +91,7 @@ COPY requirements/build.txt requirements/build.txt
ENV UV_HTTP_TIMEOUT=500
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/build.txt
uv pip install --system -r requirements/build.txt
COPY . .
ARG GIT_REPO_CHECK=0
@@ -155,7 +163,7 @@ COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/dev.txt
uv pip install --system -r requirements/dev.txt
#################### DEV IMAGE ####################
#################### vLLM installation IMAGE ####################
@@ -171,18 +179,23 @@ ARG TARGETPLATFORM
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install minimal dependencies and uv
RUN apt-get update -y \
&& apt-get install -y ccache git curl wget sudo vim \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 libibverbs-dev \
&& curl -LsSf https://astral.sh/uv/install.sh | sh
# Add uv to PATH
ENV PATH="/root/.local/bin:$PATH"
# Create venv with specified Python and activate by placing at the front of path
ENV VIRTUAL_ENV="/opt/venv"
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
@@ -200,13 +213,14 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# after this step
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
fi
# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install dist/*.whl --verbose
uv pip install --system dist/*.whl --verbose
# If we need to build FlashInfer wheel before its release:
# $ export FLASHINFER_ENABLE_AOT=1
@@ -221,8 +235,9 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
fi
COPY examples examples
@@ -232,7 +247,7 @@ COPY examples examples
# TODO: Remove this once FlashInfer AOT wheel is fixed
COPY requirements/build.txt requirements/build.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/build.txt
uv pip install --system -r requirements/build.txt
#################### vLLM installation IMAGE ####################
@@ -249,15 +264,15 @@ ENV UV_HTTP_TIMEOUT=500
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/dev.txt
uv pip install --system -r requirements/dev.txt
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils
uv pip install --system -e tests/vllm_test_utils
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install hf_transfer
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1
# Copy in the v1 package for testing (it isn't distributed yet)
@@ -282,9 +297,9 @@ ENV UV_HTTP_TIMEOUT=500
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
else \
uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
fi
ENV VLLM_USAGE_SOURCE production-docker-image

View File

@@ -1,37 +1,267 @@
FROM mambaorg/micromamba
ARG MAMBA_DOCKERFILE_ACTIVATE=1
USER root
ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
###############################################################
# base stage with basic dependencies
###############################################################
RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base-builder
# Some packages in requirements/cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available for venv or pip directly
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes
ARG PYTHON_VERSION=3.12
ARG OPENBLAS_VERSION=0.3.29
# Set Environment Variables for venv, cargo & openblas
ENV VIRTUAL_ENV=/opt/vllm
ENV PATH=${VIRTUAL_ENV}/bin:/root/.cargo/bin:$PATH
ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig/
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64:/usr/local/lib:/usr/lib64:/usr/lib
ENV UV_LINK_MODE=copy
# install gcc-13, python, rust, openblas
# Note: A symlink for libatomic.so is created for gcc-13 (linker fails to find libatomic otherwise - reqd. for sentencepiece)
# Note: A dummy file 'control' is created in /tmp/ to artificially create dependencies between stages when building stages in parallel
# when `--jobs=<N>` is passed with podman build command
RUN microdnf install -y openssl-devel dnf \
&& dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \
https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \
https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
&& dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os \
&& dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/AppStream/`arch`/os \
&& dnf config-manager --set-enabled crb \
&& dnf install -y \
git tar gcc-toolset-13 automake libtool numactl-devel lapack-devel \
pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
libtiff-devel libjpeg-devel openjpeg2-devel zlib-devel \
freetype-devel lcms2-devel libwebp-devel tcl-devel tk-devel \
harfbuzz-devel fribidi-devel libraqm-devel libimagequant-devel libxcb-devel \
python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
&& dnf clean all \
&& ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
&& python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
&& python -m pip install -U pip uv \
&& uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python cmake ninja cython scikit_build_core scikit_build \
&& curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
&& curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
&& cd /tmp && touch control
###############################################################
# Stage to build torch family
###############################################################
FROM base-builder AS torch-builder
ARG MAX_JOBS
ARG TORCH_VERSION=2.6.0
ARG _GLIBCXX_USE_CXX11_ABI=1
RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \
git clone --recursive https://github.com/pytorch/pytorch.git -b v${TORCH_VERSION} && \
cd pytorch && \
uv pip install -r requirements.txt && \
python setup.py develop && \
rm -f dist/torch*+git*whl && \
MAX_JOBS=${MAX_JOBS:-$(nproc)} \
PYTORCH_BUILD_VERSION=${TORCH_VERSION} PYTORCH_BUILD_NUMBER=1 uv build --wheel --out-dir /torchwheels/
ARG TORCHVISION_VERSION=0.21.0
ARG TORCHVISION_USE_NVJPEG=0
ARG TORCHVISION_USE_FFMPEG=0
RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \
git clone --recursive https://github.com/pytorch/vision.git -b v${TORCHVISION_VERSION} && \
cd vision && \
MAX_JOBS=${MAX_JOBS:-$(nproc)} \
BUILD_VERSION=${TORCHVISION_VERSION} \
uv build --wheel --out-dir /torchwheels/ --no-build-isolation
ARG TORCHAUDIO_VERSION=2.6.0
ARG BUILD_SOX=1
ARG BUILD_KALDI=1
ARG BUILD_RNNT=1
ARG USE_FFMPEG=0
ARG USE_ROCM=0
ARG USE_CUDA=0
ARG TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_FFMPEG=1
RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \
git clone --recursive https://github.com/pytorch/audio.git -b v${TORCHAUDIO_VERSION} && \
cd audio && \
MAX_JOBS=${MAX_JOBS:-$(nproc)} \
BUILD_VERSION=${TORCHAUDIO_VERSION} \
uv build --wheel --out-dir /torchwheels/ --no-build-isolation
###############################################################
# Stage to build pyarrow
###############################################################
FROM base-builder AS arrow-builder
ARG MAX_JOBS
ARG PYARROW_PARALLEL
ARG PYARROW_VERSION=19.0.1
RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \
git clone --recursive https://github.com/apache/arrow.git -b apache-arrow-${PYARROW_VERSION} && \
cd arrow/cpp && \
mkdir build && cd build && \
cmake -DCMAKE_BUILD_TYPE=release \
-DCMAKE_INSTALL_PREFIX=/usr/local \
-DARROW_PYTHON=ON \
-DARROW_BUILD_TESTS=OFF \
-DARROW_JEMALLOC=ON \
-DARROW_BUILD_STATIC="OFF" \
-DARROW_PARQUET=ON \
.. && \
make install -j ${MAX_JOBS:-$(nproc)} && \
cd ../../python/ && \
uv pip install -v -r requirements-wheel-build.txt && \
PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \
python setup.py build_ext \
--build-type=release --bundle-arrow-cpp \
bdist_wheel --dist-dir /arrowwheels/
###############################################################
# Stage to build opencv
###############################################################
FROM base-builder AS cv-builder
ARG MAX_JOBS
ARG OPENCV_VERSION=84
ARG ENABLE_HEADLESS=1
RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \
git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \
cd opencv-python && \
sed -i 's/"setuptools==59.2.0",/"setuptools<70.0",/g' pyproject.toml && \
python -m build --wheel --installer=uv --outdir /opencvwheels/
###############################################################
# Stage to build vllm - this stage builds and installs
# vllm, tensorizer and vllm-tgis-adapter and builds uv cache
# for transitive dependencies - eg. grpcio
###############################################################
FROM base-builder AS vllmcache-builder
COPY --from=torch-builder /tmp/control /dev/null
COPY --from=arrow-builder /tmp/control /dev/null
COPY --from=cv-builder /tmp/control /dev/null
ARG VLLM_TARGET_DEVICE=cpu
# this step installs vllm and populates uv cache
# with all the transitive dependencies
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
--mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
--mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
--mount=type=bind,src=.,dst=/src/,rw \
source /opt/rh/gcc-toolset-13/enable && \
uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
uv pip install pandas pythran pybind11 && \
# sentencepiece.pc is in some pkgconfig inside uv cache
export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
cd /src/ && \
uv build --wheel --out-dir /vllmwheel/ --no-build-isolation && \
uv pip install /vllmwheel/*.whl
###############################################################
# Stage to build numactl
###############################################################
FROM base-builder AS numa-builder
# Note: Building numactl with gcc-11. Compiling with gcc-13 in this builder stage will
# trigger recompilation with gcc-11 (and require libtool) in the final stage where we do not have gcc-13
ARG MAX_JOBS
ARG NUMACTL_VERSION=2.0.19
RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_VERSION} \
&& cd numactl \
&& autoreconf -i && ./configure \
&& make -j ${MAX_JOBS:-$(nproc)}
###############################################################
# Stage to build lapack
###############################################################
FROM base-builder AS lapack-builder
ARG MAX_JOBS
ARG LAPACK_VERSION=3.12.1
RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${LAPACK_VERSION} \
&& cd lapack && source /opt/rh/gcc-toolset-13/enable \
&& cmake -B build -S . \
&& cmake --build build -j ${MAX_JOBS:-$(nproc)}
###############################################################
# FINAL VLLM IMAGE STAGE #
###############################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS vllm-openai
ARG PYTHON_VERSION=3.12
ARG OPENBLAS_VERSION=0.3.29
# Set Environment Variables for venv & openblas
ENV VIRTUAL_ENV=/opt/vllm
ENV PATH=${VIRTUAL_ENV}/bin:$PATH
ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig/
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64:/usr/local/lib:/usr/lib64:/usr/lib
ENV UV_LINK_MODE=copy
# create artificial dependencies between stages for independent stages to build in parallel
COPY --from=torch-builder /tmp/control /dev/null
COPY --from=arrow-builder /tmp/control /dev/null
COPY --from=cv-builder /tmp/control /dev/null
COPY --from=vllmcache-builder /tmp/control /dev/null
COPY --from=numa-builder /tmp/control /dev/null
COPY --from=lapack-builder /tmp/control /dev/null
# install gcc-11, python, openblas, numactl, lapack
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
--mount=type=bind,from=lapack-builder,source=/lapack/,target=/lapack/,rw \
rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
microdnf install --nodocs -y \
tar findutils openssl \
pkgconfig xsimd g++ gcc-fortran libsndfile \
libtiff libjpeg openjpeg2 zlib zeromq \
freetype lcms2 libwebp tcl tk utf8proc \
harfbuzz fribidi libraqm libimagequant libxcb \
python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
&& microdnf clean all \
&& python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
&& python -m pip install -U pip uv --no-cache \
&& curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
&& make -C /numactl install \
&& uv pip install cmake \
&& cmake --install /lapack/build \
&& uv pip uninstall cmake
# consume previously built wheels (including vllm)
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
--mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
--mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
--mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \
HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /vllmwheel/*.whl
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
RUN --mount=type=cache,target=/root/.cache/pip \
RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-r requirements/cpu.txt \
xformers uvloop==0.20.0
RUN --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py install
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]

View File

@@ -40,7 +40,7 @@ ARG USE_CYTHON
RUN cd vllm \
&& python3 -m pip install -r requirements/rocm.txt \
&& python3 setup.py clean --all \
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \
&& python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch AS export_vllm
ARG COMMON_WORKDIR

View File

@@ -28,10 +28,27 @@ Easy, fast, and cheap LLM serving for everyone
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
- [2024/12] vLLM joins [PyTorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
<details>
<summary>Previous News</summary>
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
</details>
---
## About
vLLM is a fast and easy-to-use library for LLM inference and serving.

View File

@@ -0,0 +1,420 @@
# SPDX-License-Identifier: Apache-2.0
# Adapted from sglang quantization/tuning_block_wise_kernel.py
import argparse
import json
import multiprocessing as mp
import os
import time
from datetime import datetime
from typing import Any
import torch
from tqdm import tqdm
import triton
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_w8a8_block_fp8_matmul)
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
mp.set_start_method("spawn", force=True)
assert current_platform.is_cuda(
), "Only support tune w8a8 block fp8 kernel on CUDA device."
DTYPE_MAP = {
"float32": torch.float32,
"float16": torch.float16,
"half": torch.half,
"bfloat16": torch.bfloat16,
}
def w8a8_block_matmul(
A: torch.Tensor,
B: torch.Tensor,
As: torch.Tensor,
Bs: torch.Tensor,
block_size: list[int],
config: dict[str, Any],
output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
"""This function performs matrix multiplication with
block-wise quantization.
It takes two input tensors `A` and `B` with scales `As` and `Bs`.
The output is returned in the specified `output_dtype`.
Args:
A: The input tensor, e.g., activation.
B: The input tensor, e.g., weight.
As: The per-token-group quantization scale for `A`.
Bs: The per-block quantization scale for `B`.
block_size: The block size for per-block quantization.
It should be 2-dim, e.g., [128, 128].
output_dtype: The dtype of the returned tensor.
Returns:
torch.Tensor: The result of matmul.
"""
assert len(block_size) == 2
block_n, block_k = block_size[0], block_size[1]
assert A.shape[-1] == B.shape[-1]
assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
M = A.numel() // A.shape[-1]
assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
N, K = B.shape
assert triton.cdiv(N, block_n) == Bs.shape[0]
assert triton.cdiv(K, block_k) == Bs.shape[1]
C_shape = A.shape[:-1] + (N, )
C = A.new_empty(C_shape, dtype=output_dtype)
def grid(META):
return (triton.cdiv(M, META["BLOCK_SIZE_M"]) *
triton.cdiv(N, META["BLOCK_SIZE_N"]), )
if A.dtype == torch.float8_e4m3fn:
kernel = _w8a8_block_fp8_matmul
else:
raise RuntimeError(
"Currently, only support tune w8a8 block fp8 kernel.")
kernel[grid](
A,
B,
C,
As,
Bs,
M,
N,
K,
block_n,
block_k,
A.stride(-2),
A.stride(-1),
B.stride(1),
B.stride(0),
C.stride(-2),
C.stride(-1),
As.stride(-2),
As.stride(-1),
Bs.stride(1),
Bs.stride(0),
**config,
)
return C
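For reference, the block-wise quantized matmul above can be sanity-checked against a plain dequantize-then-matmul. The sketch below assumes 2-D inputs and uses an illustrative helper name (`w8a8_block_matmul_ref`) that is not part of this benchmark:

```python
import torch

def w8a8_block_matmul_ref(A, B, As, Bs, block_size, output_dtype=torch.float16):
    # A: [M, K] quantized activations, As: [M, ceil(K/block_k)] per-token-group scales
    # B: [N, K] quantized weights,     Bs: [ceil(N/block_n), ceil(K/block_k)] per-block scales
    block_n, block_k = block_size
    A_f = A.to(torch.float32)
    B_f = B.to(torch.float32)
    # Broadcast the scales back to element granularity, dequantize, then matmul.
    As_full = As.repeat_interleave(block_k, dim=-1)[..., :A.shape[-1]]
    Bs_full = Bs.repeat_interleave(block_n, dim=0)[:B.shape[0]]
    Bs_full = Bs_full.repeat_interleave(block_k, dim=1)[:, :B.shape[1]]
    return ((A_f * As_full) @ (B_f * Bs_full).t()).to(output_dtype)
```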
def get_configs_compute_bound():
configs = []
for num_stages in [2, 3, 4, 5]:
for block_m in [16, 32, 64, 128, 256]:
for block_k in [64, 128]:
for block_n in [32, 64, 128, 256]:
for num_warps in [4, 8]:
for group_size in [1, 16, 32, 64]:
configs.append({
"BLOCK_SIZE_M": block_m,
"BLOCK_SIZE_N": block_n,
"BLOCK_SIZE_K": block_k,
"GROUP_SIZE_M": group_size,
"num_warps": num_warps,
"num_stages": num_stages,
})
return configs
def get_weight_shapes(tp_size):
# NOTE(HandH1998): The weight shapes only work for DeepSeek-V3.
# Modify them if you tune for a different model.
# cannot TP
total = [
(512 + 64, 7168),
((128 + 64) * 128, 7168),
(128 * (128 + 128), 512),
(7168, 16384),
(7168, 18432),
]
# N can TP
n_tp = [
(18432 * 2, 7168),
((128 + 64) * 128, 7168),
(128 * (128 + 128), 512),
(24576, 1536),
(12288, 7168),
(4096, 7168),
]
# K can TP
k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)]
weight_shapes = []
for t in total:
weight_shapes.append(t)
for n_t in n_tp:
new_t = (n_t[0] // tp_size, n_t[1])
weight_shapes.append(new_t)
for k_t in k_tp:
new_t = (k_t[0], k_t[1] // tp_size)
weight_shapes.append(new_t)
return weight_shapes
def benchmark_config(A,
B,
As,
Bs,
block_size,
config,
out_dtype=torch.float16,
num_iters=10):
def run():
w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
torch.cuda.synchronize()
# JIT compilation & warmup
for _ in range(5):
run()
torch.cuda.synchronize()
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
latencies: list[float] = []
for i in range(num_iters):
torch.cuda.synchronize()
start_event.record()
run()
end_event.record()
end_event.synchronize()
latencies.append(start_event.elapsed_time(end_event))
avg = sum(latencies) / (num_iters * 10) * 1000 # us
return avg
def tune(M, N, K, block_size, out_dtype, search_space, input_type):
factor_for_scale = 1e-2
if input_type == "fp8":
fp8_info = torch.finfo(torch.float8_e4m3fn)
fp8_max, fp8_min = fp8_info.max, fp8_info.min
A_fp32 = (
(torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 *
fp8_max)
A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
B_fp32 = (
(torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 *
fp8_max)
B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
else:
raise RuntimeError(
"Currently, only support tune w8a8 block fp8 kernel.")
block_n, block_k = block_size[0], block_size[1]
n_tiles = (N + block_n - 1) // block_n
k_tiles = (K + block_k - 1) // block_k
As = torch.rand(M, k_tiles, dtype=torch.float32,
device="cuda") * factor_for_scale
Bs = (torch.rand(n_tiles, k_tiles, dtype=torch.float32, device="cuda") *
factor_for_scale)
best_config = None
best_time = float("inf")
for config in tqdm(search_space):
try:
kernel_time = benchmark_config(
A,
B,
As,
Bs,
block_size,
config,
out_dtype,
num_iters=10,
)
except triton.runtime.autotuner.OutOfResources:
# Some configurations may be invalid and fail to compile.
continue
if kernel_time < best_time:
best_time = kernel_time
best_config = config
now = datetime.now()
print(f"{now.ctime()}] Completed tuning for batch_size={M}")
assert best_config is not None
return best_config
def save_configs(
N,
K,
block_n,
block_k,
configs,
save_path,
input_type="fp8",
) -> None:
os.makedirs(save_path, exist_ok=True)
device_name = current_platform.get_device_name().replace(" ", "_")
json_file_name = (
f"N={N},K={K},device_name={device_name},dtype={input_type}_w8a8,"
f"block_shape=[{block_n},{block_k}].json")
config_file_path = os.path.join(save_path, json_file_name)
print(f"Writing best config to {config_file_path}...")
with open(config_file_path, "w") as f:
json.dump(configs, f, indent=4)
f.write("\n")
def tune_on_gpu(args_dict):
"""Run tuning on a specific GPU."""
gpu_id = args_dict["gpu_id"]
batch_sizes = args_dict["batch_sizes"]
weight_shapes = args_dict["weight_shapes"]
args = args_dict["args"]
torch.cuda.set_device(gpu_id)
print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
block_n = args.block_n
block_k = args.block_k
out_dtype = DTYPE_MAP[args.out_dtype]
save_path = args.save_path
input_type = args.input_type
search_space = get_configs_compute_bound()
search_space = [
config for config in search_space
if block_k % config["BLOCK_SIZE_K"] == 0
]
start = time.time()
for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
N, K = shape[0], shape[1]
print(f"[GPU {gpu_id}] Tune for weight shape of `N: {N}, K: {K}`")
benchmark_results = [
tune(
batch_size,
N,
K,
[block_n, block_k],
out_dtype,
search_space,
input_type,
) for batch_size in tqdm(batch_sizes,
desc=f"GPU {gpu_id} - Batch sizes")
]
best_configs = {
M: config
for M, config in zip(batch_sizes, benchmark_results)
}
save_configs(N, K, block_n, block_k, best_configs, save_path,
input_type)
end = time.time()
print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
def distribute_batch_sizes(batch_sizes, num_gpus):
"""Distribute batch sizes across available GPUs."""
batches_per_gpu = []
for i in range(num_gpus):
start_idx = i * len(batch_sizes) // num_gpus
end_idx = (i + 1) * len(batch_sizes) // num_gpus
batches_per_gpu.append(batch_sizes[start_idx:end_idx])
return batches_per_gpu
def main(args):
print(args)
num_gpus = torch.cuda.device_count()
if num_gpus == 0:
raise RuntimeError("No GPU available for tuning")
print(f"Found {num_gpus} GPUs for parallel tuning")
torch.cuda.init()
if args.batch_size is None:
batch_sizes = [
1,
2,
4,
8,
16,
24,
32,
48,
64,
96,
128,
256,
512,
1024,
1536,
2048,
3072,
4096,
]
else:
batch_sizes = [args.batch_size]
num_gpus = 1 # If only one batch size, use only one GPU
weight_shapes = get_weight_shapes(args.tp_size)
batches_per_gpu = distribute_batch_sizes(batch_sizes, num_gpus)
process_args = []
for gpu_id in range(num_gpus):
process_args.append({
"gpu_id": gpu_id,
"batch_sizes": batches_per_gpu[gpu_id],
"weight_shapes":
weight_shapes, # Each GPU processes all weight shapes
"args": args,
})
ctx = mp.get_context("spawn")
with ctx.Pool(num_gpus) as pool:
pool.map(tune_on_gpu, process_args)
print("Multi-GPU tuning completed")
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="""
Tune triton w8a8 block fp8 for DeepSeek-V3/DeepSeek-R1:
python3 benchmark_w8a8_block_fp8.py --tp-size 8 --input-type fp8
Then copy to model_executor/layers/quantization/utils/configs
""",
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("--tp-size", "-tp", type=int, default=8)
parser.add_argument("--input-type",
type=str,
choices=["fp8"],
default="fp8")
parser.add_argument(
"--out-dtype",
type=str,
choices=["float32", "float16", "bfloat16", "half"],
default="float16",
)
parser.add_argument("--block-n", type=int, default=128)
parser.add_argument("--block-k", type=int, default=128)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--save-path", type=str, default="./")
args = parser.parse_args()
main(args)

View File

@@ -190,6 +190,7 @@ set(VLLM_EXT_SRC
"csrc/cpu/cache.cpp"
"csrc/cpu/utils.cpp"
"csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp"
"csrc/cpu/torch_bindings.cpp")

92
csrc/block_table.cu Normal file
View File

@@ -0,0 +1,92 @@
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
namespace vllm {
__global__ void append_kernel(const int* __restrict__ row_indices,
const int* __restrict__ cu_num_appends,
const int* __restrict__ block_ids,
int* __restrict__ block_table,
int max_num_blocks_per_row) {
int bid = blockIdx.x;
int tgt_row = row_indices[2 * bid];
int tgt_offset = row_indices[2 * bid + 1];
int start = cu_num_appends[bid];
int end = cu_num_appends[bid + 1];
int length = end - start;
int tid = threadIdx.x;
int64_t offset = tgt_row * max_num_blocks_per_row + tgt_offset;
for (int i = tid; i < length; i += blockDim.x) {
block_table[offset + i] = block_ids[start + i];
}
}
__global__ void move_kernel(const int* __restrict__ src_dst_n,
int* __restrict__ block_table,
int max_num_blocks_per_row) {
int bid = blockIdx.x;
int src_row = src_dst_n[3 * bid];
int tgt_row = src_dst_n[3 * bid + 1];
int num_blocks = src_dst_n[3 * bid + 2];
int tid = threadIdx.x;
for (int i = tid; i < num_blocks; i += blockDim.x) {
block_table[tgt_row * max_num_blocks_per_row + i] =
block_table[src_row * max_num_blocks_per_row + i];
}
}
} // namespace vllm
void block_table_appends(
torch::Tensor& append_row_indices,
torch::Tensor& append_row_indices_cpu,
torch::Tensor& append_cumsums,
torch::Tensor& append_cumsums_cpu,
torch::Tensor& append_block_ids,
torch::Tensor& append_block_ids_cpu,
torch::Tensor& block_table,
int64_t num_appends,
int64_t total_num_append_blocks) {
int* append_row_indices_ptr = append_row_indices.data_ptr<int>();
const int* append_row_indices_cpu_ptr = append_row_indices_cpu.data_ptr<int>();
int* append_cumsums_ptr = append_cumsums.data_ptr<int>();
const int* append_cumsums_cpu_ptr = append_cumsums_cpu.data_ptr<int>();
int* append_block_ids_ptr = append_block_ids.data_ptr<int>();
const int* append_block_ids_cpu_ptr = append_block_ids_cpu.data_ptr<int>();
int* block_table_ptr = block_table.data_ptr<int>();
const at::cuda::OptionalCUDAGuard device_guard(device_of(block_table));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
cudaMemcpyAsync(append_row_indices_ptr, append_row_indices_cpu_ptr,
num_appends * 2 * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(append_cumsums_ptr, append_cumsums_cpu_ptr,
(num_appends + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(append_block_ids_ptr, append_block_ids_cpu_ptr,
total_num_append_blocks * sizeof(int), cudaMemcpyHostToDevice, stream);
int64_t max_num_blocks_per_row = block_table.size(1);
vllm::append_kernel<<<num_appends, 1024, 0, stream>>>(
append_row_indices_ptr, append_cumsums_ptr, append_block_ids_ptr,
block_table_ptr, max_num_blocks_per_row);
}
void block_table_moves(
torch::Tensor& src_dst_n,
torch::Tensor& src_dst_n_cpu,
torch::Tensor& block_table,
int64_t num_moves) {
int* src_dst_n_ptr = src_dst_n.data_ptr<int>();
const int* src_dst_n_cpu_ptr = src_dst_n_cpu.data_ptr<int>();
int* block_table_ptr = block_table.data_ptr<int>();
const at::cuda::OptionalCUDAGuard device_guard(device_of(block_table));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
cudaMemcpyAsync(src_dst_n_ptr, src_dst_n_cpu_ptr,
num_moves * 3 * sizeof(int), cudaMemcpyHostToDevice, stream);
int64_t max_num_blocks_per_row = block_table.size(1);
vllm::move_kernel<<<num_moves, 1024, 0, stream>>>(
src_dst_n_ptr, block_table_ptr, max_num_blocks_per_row);
}
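As a rough guide to what these kernels compute, here is an equivalent Python sketch operating on a 2-D integer block table (rows = requests, columns = block slots); the function and argument names are illustrative only:

```python
import torch

def block_table_appends_ref(row_indices, cu_num_appends, block_ids, block_table):
    # row_indices: flat [2 * num_appends] tensor of (target_row, target_offset) pairs.
    # cu_num_appends: [num_appends + 1] prefix sums indexing into block_ids.
    num_appends = cu_num_appends.numel() - 1
    for i in range(num_appends):
        row, offset = int(row_indices[2 * i]), int(row_indices[2 * i + 1])
        start, end = int(cu_num_appends[i]), int(cu_num_appends[i + 1])
        block_table[row, offset:offset + (end - start)] = block_ids[start:end]

def block_table_moves_ref(src_dst_n, block_table):
    # src_dst_n: flat [3 * num_moves] tensor of (src_row, dst_row, num_blocks) triples.
    num_moves = src_dst_n.numel() // 3
    for i in range(num_moves):
        src = int(src_dst_n[3 * i])
        dst = int(src_dst_n[3 * i + 1])
        n = int(src_dst_n[3 * i + 2])
        block_table[dst, :n] = block_table[src, :n]
```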

View File

@@ -88,6 +88,48 @@ void reshape_and_cache_cpu_impl(
}
}; // namespace
template <typename scalar_t>
void concat_and_cache_mla_cpu_impl(
const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank]
const scalar_t* __restrict__ k_pe, // [num_tokens, pe_dim]
scalar_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank
// + pe_dim)]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int num_tokens, //
const int block_stride, //
const int entry_stride, //
const int kv_c_stride, //
const int k_pe_stride, //
const int kv_lora_rank, //
const int pe_dim, //
const int block_size //
) {
#pragma omp parallel for
for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
const int64_t slot_idx = slot_mapping[token_idx];
// NOTE: slot_idx can be -1 if the token is padded
if (slot_idx < 0) {
continue;
}
const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size;
auto copy = [&](const scalar_t* __restrict__ src,
scalar_t* __restrict__ dst, int src_stride, int dst_stride,
int size, int offset) {
for (int i = 0; i < size; i++) {
const int64_t src_idx = token_idx * src_stride + i;
const int64_t dst_idx =
block_idx * block_stride + block_offset * entry_stride + i + offset;
dst[dst_idx] = src[src_idx];
}
};
copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
}
}
// Note: the key_caches and value_caches vectors are constant but
// not the Tensors they contain. The vectors need to be const refs
// in order to satisfy pytorch's C++ operator registration code.
@@ -134,6 +176,38 @@ void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
});
}
void concat_and_cache_mla(
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
torch::Tensor& k_pe, // [num_tokens, pe_dim]
torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank +
// pe_dim)]
torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
const std::string& kv_cache_dtype, torch::Tensor& scale) {
int num_tokens = slot_mapping.size(0);
int kv_lora_rank = kv_c.size(1);
int pe_dim = k_pe.size(1);
int block_size = kv_cache.size(1);
TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
TORCH_CHECK(kv_cache_dtype != "fp8");
int kv_c_stride = kv_c.stride(0);
int k_pe_stride = k_pe.stride(0);
int block_stride = kv_cache.stride(0);
int entry_stride = kv_cache.stride(1);
VLLM_DISPATCH_FLOATING_TYPES(
kv_c.scalar_type(), "concat_and_cache_mla_cpu_impl", [&] {
CPU_KERNEL_GUARD_IN(concat_and_cache_mla_cpu_impl)
concat_and_cache_mla_cpu_impl<scalar_t>(
kv_c.data_ptr<scalar_t>(), k_pe.data_ptr<scalar_t>(),
kv_cache.data_ptr<scalar_t>(), slot_mapping.data_ptr<int64_t>(),
num_tokens, block_stride, entry_stride, kv_c_stride, k_pe_stride,
kv_lora_rank, pe_dim, block_size);
CPU_KERNEL_GUARD_OUT(concat_and_cache_mla_cpu_impl)
});
}
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
const torch::Tensor& block_mapping) {
TORCH_CHECK(false, "swap_blocks is unsupported on CPU.")
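The semantics mirrored by concat_and_cache_mla_cpu_impl can be stated compactly in Python (a sketch with illustrative names; tokens whose slot_mapping entry is -1 are padding and are skipped):

```python
import torch

def concat_and_cache_mla_ref(kv_c, k_pe, kv_cache, slot_mapping):
    # kv_c: [num_tokens, kv_lora_rank], k_pe: [num_tokens, pe_dim]
    # kv_cache: [num_blocks, block_size, kv_lora_rank + pe_dim]
    block_size = kv_cache.size(1)
    kv_lora_rank = kv_c.size(1)
    for token_idx, slot in enumerate(slot_mapping.tolist()):
        if slot < 0:  # padded token
            continue
        block_idx, block_offset = divmod(slot, block_size)
        kv_cache[block_idx, block_offset, :kv_lora_rank] = kv_c[token_idx]
        kv_cache[block_idx, block_offset, kv_lora_rank:] = k_pe[token_idx]
```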

View File

@@ -130,6 +130,8 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
__m512i reg;
explicit BF16Vec32() : reg(_mm512_setzero_si512()) {}
explicit BF16Vec32(const void* ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
explicit BF16Vec32(__m512i data) : reg(data) {}

393
csrc/cpu/mla_decode.cpp Normal file
View File

@@ -0,0 +1,393 @@
#include "cpu_types.hpp"
#include <float.h>
namespace {
template <typename scalar_t>
struct KernelVecType {
using qk_load_vec_type = void;
using qk_vec_type = void;
using v_load_vec_type = void;
};
template <>
struct KernelVecType<float> {
using qk_load_vec_type = vec_op::FP32Vec16;
using qk_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::FP32Vec16;
};
template <>
struct KernelVecType<c10::Half> {
#if defined(__powerpc64__) || defined(__s390x__)
// Power and s390x architecture-specific vector types
using qk_load_vec_type = vec_op::FP32Vec16;
using qk_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::FP32Vec16;
#else
// Fallback for other architectures, including x86
using qk_load_vec_type = vec_op::FP16Vec16;
using qk_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::FP16Vec16;
#endif
};
#ifdef __AVX512BF16__
template <>
struct KernelVecType<c10::BFloat16> {
using qk_load_vec_type = vec_op::BF16Vec32;
using qk_vec_type = vec_op::BF16Vec32;
using v_load_vec_type = vec_op::BF16Vec16;
};
#elif defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
// pass
#else
template <>
struct KernelVecType<c10::BFloat16> {
using qk_load_vec_type = vec_op::BF16Vec16;
using qk_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::BF16Vec16;
};
#endif
template <int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE, int HEAD_UNROLL,
typename qk_vec_type>
void mla_decode_block_head(
const qk_vec_type* __restrict__ q_vecs, // [HEAD_UNROLL, head_dim]
const qk_vec_type* __restrict__ k_vecs, // [block_size, head_dim]
const vec_op::FP32Vec16* __restrict v_vecs_f32, // [block_size, v_head_dim]
float* __restrict__ acc_out, // [HEAD_UNROLL, v_head_dim]
float* __restrict__ acc_lse, // [HEAD_UNROLL]
const float scale, const int num_tokens) {
using f32_vec_type = vec_op::FP32Vec16;
constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
constexpr int V_NUM_ELEM = f32_vec_type::VEC_ELEM_NUM;
float logits[BLOCK_SIZE][HEAD_UNROLL] = {}; // initialize to zeros
float max_val[HEAD_UNROLL];
std::fill(max_val, max_val + HEAD_UNROLL, -FLT_MAX);
f32_vec_type acc_vec[BLOCK_SIZE][HEAD_UNROLL];
for (int i = 0; i < HEAD_DIM; i += QK_NUM_ELEM) {
// load to registers
qk_vec_type q_vec[HEAD_UNROLL];
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
q_vec[unroll] =
qk_vec_type{q_vecs[(i + unroll * HEAD_DIM) / QK_NUM_ELEM]};
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
qk_vec_type k_vec(k_vecs[(block_offset * HEAD_DIM + i) / QK_NUM_ELEM]);
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
vec_op::fma(acc_vec[block_offset][unroll], q_vec[unroll], k_vec);
}
}
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
const float acc = acc_vec[block_offset][unroll].reduce_sum() * scale;
logits[block_offset][unroll] = acc;
max_val[unroll] = std::max(max_val[unroll], acc);
}
}
float sum_exp[HEAD_UNROLL] = {};
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
const float val =
std::exp(logits[block_offset][unroll] - max_val[unroll]);
logits[block_offset][unroll] = val;
sum_exp[unroll] += val;
}
}
f32_vec_type this_out[V_HEAD_DIM / V_NUM_ELEM][HEAD_UNROLL];
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
// load to registers
f32_vec_type scale_[HEAD_UNROLL];
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
scale_[unroll] =
f32_vec_type{logits[block_offset][unroll] / sum_exp[unroll]};
for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) {
f32_vec_type v_vec(
v_vecs_f32[(block_offset * HEAD_DIM + i) / V_NUM_ELEM]);
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
vec_op::fma(this_out[i / V_NUM_ELEM][unroll], v_vec, scale_[unroll]);
}
}
// merge attention state
// section 2.2 in https://arxiv.org/pdf/2501.01005
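// In other words: given the running output o_prev with log-sum-exp lse_prev and
// this block's output o_curr with lse_curr, the merged state is
//   lse_new = log(exp(lse_prev) + exp(lse_curr))
//   o_new   = exp(lse_prev - lse_new) * o_prev + exp(lse_curr - lse_new) * o_curr
// which is what the code below computes, subtracting max(lse_prev, lse_curr)
// before exponentiation for numerical stability.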
f32_vec_type prev_scale[HEAD_UNROLL];
f32_vec_type curr_scale[HEAD_UNROLL];
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
const float prev_lse = acc_lse[unroll];
const float curr_lse = std::log(sum_exp[unroll]) +
max_val[unroll]; // add back max_val to get true lse
// softmax trick
const float max_lse = std::max(prev_lse, curr_lse);
const float prev_sum_exp = std::exp(prev_lse - max_lse);
const float curr_sum_exp = std::exp(curr_lse - max_lse);
const float new_sum_exp = prev_sum_exp + curr_sum_exp;
acc_lse[unroll] = std::log(new_sum_exp) + max_lse;
prev_scale[unroll] = f32_vec_type{prev_sum_exp / new_sum_exp};
curr_scale[unroll] = f32_vec_type{curr_sum_exp / new_sum_exp};
}
for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) {
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
f32_vec_type o_vec(acc_out + i + V_HEAD_DIM * unroll);
o_vec = o_vec * prev_scale[unroll] +
this_out[i / V_NUM_ELEM][unroll] * curr_scale[unroll];
o_vec.save(acc_out + i + V_HEAD_DIM * unroll);
}
}
q_vecs += HEAD_DIM / QK_NUM_ELEM * HEAD_UNROLL;
acc_out += V_HEAD_DIM * HEAD_UNROLL;
}
template <typename scalar_t, int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE,
typename qk_vec_type>
void mla_decode_block(
const qk_vec_type* __restrict__ q_vecs, // [num_heads, head_dim]
const scalar_t* __restrict__ kv_cache, // [block_size, head_dim]
float* __restrict__ acc_out, // [num_heads, v_head_dim]
float* __restrict__ acc_lse, // [num_heads]
const int num_heads, const float scale, const int num_tokens) {
using qk_load_vec_type = typename KernelVecType<scalar_t>::qk_load_vec_type;
static_assert(
std::is_same<qk_vec_type,
typename KernelVecType<scalar_t>::qk_vec_type>::value);
using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
using f32_vec_type = vec_op::FP32Vec16;
static_assert(qk_load_vec_type::VEC_ELEM_NUM == qk_vec_type::VEC_ELEM_NUM);
static_assert(v_load_vec_type::VEC_ELEM_NUM == f32_vec_type::VEC_ELEM_NUM);
constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
constexpr int V_NUM_ELEM = v_load_vec_type::VEC_ELEM_NUM;
const qk_vec_type* k_vecs;
const f32_vec_type* v_vecs_f32;
float* kv_cache_f32 = nullptr;
if constexpr (!std::is_same<scalar_t, float>::value) {
// convert KV cache block to FP32 to reuse it across query heads and
// attn @ V computation, since FP16/BF16->FP32 is expensive.
// TODO: move malloc outside of this fn to reuse across iterations.
const int nbytes = BLOCK_SIZE * HEAD_DIM * sizeof(float);
kv_cache_f32 = static_cast<float*>(std::aligned_alloc(64, nbytes));
for (int block_offset = 0; block_offset < num_tokens; ++block_offset)
for (int i = 0; i < HEAD_DIM; i += V_NUM_ELEM) {
v_load_vec_type kv_load_vec(kv_cache + block_offset * HEAD_DIM + i);
f32_vec_type kv_vec_f32(kv_load_vec);
kv_vec_f32.save(kv_cache_f32 + block_offset * HEAD_DIM + i);
}
if constexpr (std::is_same<qk_load_vec_type, qk_vec_type>::value) {
// for AVX512_BF16, Q @ K.T uses BF16 for K (no conversion)
// NOTE: in this case, we only need to convert the V section to FP32.
// But for simplicity, we will convert the whole KV block to FP32.
k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache);
} else {
k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache_f32);
}
// attn @ V always use FP32 for V, since attn is FP32.
v_vecs_f32 = reinterpret_cast<const f32_vec_type*>(kv_cache_f32);
} else {
// KV cache is FP32. don't need to do anything.
k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache);
v_vecs_f32 = reinterpret_cast<const f32_vec_type*>(kv_cache);
}
// compute 2 heads at the same time to improve ILP and
// take advantage of register cache for K and V.
constexpr int HEAD_UNROLL = 2;
for (int iter = 0; iter < num_heads / HEAD_UNROLL; ++iter) {
mla_decode_block_head<HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE, HEAD_UNROLL>(
q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens);
q_vecs += HEAD_UNROLL * HEAD_DIM / QK_NUM_ELEM;
acc_out += HEAD_UNROLL * V_HEAD_DIM;
acc_lse += HEAD_UNROLL;
}
// take care of the remaining heads
for (int iter = 0; iter < num_heads % HEAD_UNROLL; ++iter) {
mla_decode_block_head<HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE, 1>(
q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens);
q_vecs += HEAD_DIM / QK_NUM_ELEM;
acc_out += V_HEAD_DIM;
acc_lse += 1;
}
if (kv_cache_f32 != nullptr) {
std::free(kv_cache_f32);
}
}
} // namespace
template <typename scalar_t, int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE>
void mla_decode_kvcache_cpu_impl(
scalar_t* __restrict__ out, // [num_seqs, num_heads, v_head_dim]
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_dim]
const scalar_t* __restrict__ kv_cache, // [num_blocks, block_size,
// head_dim]
const int num_heads, const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ seq_lens, // [num_seqs]
const int max_num_blocks_per_seq, const int o_stride, const int q_stride,
const int kv_stride, const int num_seqs) {
using qk_load_vec_type = typename KernelVecType<scalar_t>::qk_load_vec_type;
using qk_vec_type = typename KernelVecType<scalar_t>::qk_vec_type;
constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
// shared across threads
const int max_threads = omp_get_max_threads();
const int acc_out_nbytes =
max_threads * num_heads * V_HEAD_DIM * sizeof(float);
float* acc_out = static_cast<float*>(std::aligned_alloc(64, acc_out_nbytes));
std::vector<float> acc_lse(max_threads * num_heads);
// allocate memory to pre-convert query to FP32 later
float* q_f32;
constexpr bool PRE_CONVERT_QUERY =
!std::is_same<scalar_t, float>::value &&
std::is_same<qk_vec_type, vec_op::FP32Vec16>::value;
if constexpr (PRE_CONVERT_QUERY) {
const int q_f32_nbytes = num_heads * HEAD_DIM * sizeof(float);
q_f32 = static_cast<float*>(std::aligned_alloc(64, q_f32_nbytes));
}
#pragma omp parallel
{
const int num_threads = omp_get_num_threads();
const int thread_id = omp_get_thread_num();
float* __restrict__ acc_out_thread =
acc_out + thread_id * num_heads * V_HEAD_DIM;
float* __restrict__ acc_lse_thread = acc_lse.data() + thread_id * num_heads;
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
// reset accumulator
std::fill(acc_out_thread, acc_out_thread + num_heads * V_HEAD_DIM, 0.0f);
std::fill(acc_lse_thread, acc_lse_thread + num_heads, -FLT_MAX);
const int seq_len = seq_lens[seq_idx];
const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
const int last_block_size = seq_len - (block_num - 1) * BLOCK_SIZE;
const qk_vec_type* q_vecs;
if constexpr (PRE_CONVERT_QUERY) {
// pre-convert query to FP32 since FP16/BF16->FP32 is slow.
#pragma omp for
for (int i = 0; i < num_heads * HEAD_DIM; i += QK_NUM_ELEM) {
qk_load_vec_type q_load_vec(q + seq_idx * q_stride + i);
qk_vec_type q_vec(q_load_vec);
q_vec.save(q_f32 + i);
}
q_vecs = reinterpret_cast<const qk_vec_type*>(q_f32);
} else {
q_vecs = reinterpret_cast<const qk_vec_type*>(q + seq_idx * q_stride);
}
#pragma omp for
for (int block_idx = 0; block_idx < block_num; ++block_idx) {
const int physical_block_idx =
block_tables[seq_idx * max_num_blocks_per_seq + block_idx];
const int num_tokens =
block_idx < block_num - 1 ? BLOCK_SIZE : last_block_size;
mla_decode_block<scalar_t, HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE>(
q_vecs, kv_cache + physical_block_idx * kv_stride, acc_out_thread,
acc_lse_thread, num_heads, scale, num_tokens);
}
// merge attention states across threads
// section 2.2 in https://arxiv.org/pdf/2501.01005
// each thread is responsible for 1 head
#pragma omp for
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
float* acc_lse_head = acc_lse.data() + head_idx;
float* acc_out_head = acc_out + head_idx * V_HEAD_DIM;
float max_val = -FLT_MAX;
for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
max_val = std::max(max_val, acc_lse_head[thread_id_ * num_heads]);
}
float sum_exp = 0.0f;
for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
float val = std::exp(acc_lse_head[thread_id_ * num_heads] - max_val);
acc_lse_head[thread_id_ * num_heads] = val;
sum_exp += val;
}
float inv_sum = 1.0f / sum_exp;
float out_head[V_HEAD_DIM] = {};
for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
float scale_ = acc_lse_head[thread_id_ * num_heads] * inv_sum;
for (int i = 0; i < V_HEAD_DIM; ++i) {
out_head[i] +=
acc_out_head[thread_id_ * num_heads * V_HEAD_DIM + i] * scale_;
}
}
for (int i = 0; i < V_HEAD_DIM; ++i) {
vec_op::storeFP32(out_head[i], out + seq_idx * o_stride +
head_idx * V_HEAD_DIM + i);
}
}
}
}
if (PRE_CONVERT_QUERY) {
std::free(q_f32);
}
std::free(acc_out);
}
void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
torch::Tensor& kv_cache, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens) {
const int num_seqs = query.size(0);
const int num_heads = query.size(1);
const int head_dim = query.size(2);
const int block_size = kv_cache.size(1);
const int v_head_dim = out.size(2);
const int max_num_blocks_per_seq = block_tables.size(1);
const int o_stride = out.stride(0);
const int q_stride = query.stride(0);
const int kv_stride = kv_cache.stride(0);
VLLM_DISPATCH_FLOATING_TYPES(
query.scalar_type(), "mla_decode_kvcache_cpu_impl", [&] {
CPU_KERNEL_GUARD_IN(mla_decode_kvcache_cpu_impl)
if (head_dim == 576 && v_head_dim == 512 && block_size == 16)
mla_decode_kvcache_cpu_impl<scalar_t, 576, 512, 16>(
out.data_ptr<scalar_t>(), query.data_ptr<scalar_t>(),
kv_cache.data_ptr<scalar_t>(), num_heads, scale,
block_tables.data_ptr<int>(), seq_lens.data_ptr<int>(),
max_num_blocks_per_seq, o_stride, q_stride, kv_stride, num_seqs);
else
TORCH_CHECK(false, "Unsupported block size: ", block_size);
CPU_KERNEL_GUARD_OUT(mla_decode_kvcache_cpu_impl)
});
}


@ -18,6 +18,10 @@ void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a,
const std::optional<torch::Tensor>& azp,
const std::optional<torch::Tensor>& bias);
void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
torch::Tensor& kv_cache, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens);
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// vLLM custom ops
@ -150,6 +154,14 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
" str kv_cache_dtype,"
" Tensor k_scale, Tensor v_scale) -> ()");
cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
cache_ops.def(
"concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
" Tensor! kv_cache,"
" Tensor slot_mapping,"
" str kv_cache_dtype,"
" Tensor scale) -> ()");
cache_ops.impl("concat_and_cache_mla", torch::kCPU, &concat_and_cache_mla);
}
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
@ -157,4 +169,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
}
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cpu), cpu_ops) {
cpu_ops.def(
"mla_decode_kvcache("
" Tensor! out, Tensor query, Tensor kv_cache,"
" float scale, Tensor block_tables, Tensor seq_lens) -> ()");
cpu_ops.impl("mla_decode_kvcache", torch::kCPU, &mla_decode_kvcache);
}
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)


@ -119,6 +119,18 @@ void advance_step_flashinfer(
torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);
void block_table_appends(torch::Tensor& append_row_indices,
torch::Tensor& append_row_indices_cpu,
torch::Tensor& append_cumsums,
torch::Tensor& append_cumsums_cpu,
torch::Tensor& append_block_ids,
torch::Tensor& append_block_ids_cpu,
torch::Tensor& block_table, int64_t num_appends,
int64_t total_num_append_blocks);
void block_table_moves(torch::Tensor& src_dst_n, torch::Tensor& src_dst_n_cpu,
torch::Tensor& block_table, int64_t num_moves);
#ifndef USE_ROCM
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
const torch::Tensor& codebooks,


@ -375,25 +375,25 @@ torch::Tensor ggml_moe_a8(torch::Tensor X, // input
int64_t ggml_moe_get_block_size(int64_t type) {
switch (type) {
case 2:
return MMQ_X_Q4_0;
return MOE_X_Q4_0;
case 3:
return MMQ_X_Q4_1;
return MOE_X_Q4_1;
case 6:
return MMQ_X_Q5_0;
return MOE_X_Q5_0;
case 7:
return MMQ_X_Q5_1;
return MOE_X_Q5_1;
case 8:
return MMQ_X_Q8_0;
return MOE_X_Q8_0;
case 10:
return MMQ_X_Q2_K;
return MOE_X_Q2_K;
case 11:
return MMQ_X_Q3_K;
return MOE_X_Q3_K;
case 12:
return MMQ_X_Q4_K;
return MOE_X_Q4_K;
case 13:
return MMQ_X_Q5_K;
return MOE_X_Q5_K;
case 14:
return MMQ_X_Q6_K;
return MOE_X_Q6_K;
}
return 0;
}


@ -129,12 +129,12 @@ static __device__ __forceinline__ void moe_q(
}
#if defined(USE_ROCM)
#define MMQ_X_Q4_0 64
#define MMQ_Y_Q4_0 128
#define MOE_X_Q4_0 64
#define MOE_Y_Q4_0 128
#define NWARPS_Q4_0 8
#else
#define MMQ_X_Q4_0 4
#define MMQ_Y_Q4_0 32
#define MOE_X_Q4_0 4
#define MOE_Y_Q4_0 32
#define NWARPS_Q4_0 4
#endif
@ -149,8 +149,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_0, 2)
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst,
const int top_k) {
const int mmq_x = MMQ_X_Q4_0;
const int mmq_y = MMQ_Y_Q4_0;
const int mmq_x = MOE_X_Q4_0;
const int mmq_y = MOE_Y_Q4_0;
const int nwarps = NWARPS_Q4_0;
moe_q<scalar_t, QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
@ -167,8 +167,8 @@ static void ggml_moe_q4_0_q8_1_cuda(
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
const int tokens_post_padded, cudaStream_t stream) {
int mmq_x = MMQ_X_Q4_0;
int mmq_y = MMQ_Y_Q4_0;
int mmq_x = MOE_X_Q4_0;
int mmq_y = MOE_Y_Q4_0;
int nwarps = NWARPS_Q4_0;
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -190,12 +190,12 @@ static void ggml_moe_q4_0_q8_1_cuda(
}
#if defined(USE_ROCM)
#define MMQ_X_Q4_1 64
#define MMQ_Y_Q4_1 128
#define MOE_X_Q4_1 64
#define MOE_Y_Q4_1 128
#define NWARPS_Q4_1 8
#else
#define MMQ_X_Q4_1 4
#define MMQ_Y_Q4_1 32
#define MOE_X_Q4_1 4
#define MOE_Y_Q4_1 32
#define NWARPS_Q4_1 4
#endif
@ -210,8 +210,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_1, 2)
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst,
const int top_k) {
const int mmq_x = MMQ_X_Q4_1;
const int mmq_y = MMQ_Y_Q4_1;
const int mmq_x = MOE_X_Q4_1;
const int mmq_y = MOE_Y_Q4_1;
const int nwarps = NWARPS_Q4_1;
moe_q<scalar_t, QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
@ -228,8 +228,8 @@ static void ggml_moe_q4_1_q8_1_cuda(
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
const int tokens_post_padded, cudaStream_t stream) {
int mmq_x = MMQ_X_Q4_1;
int mmq_y = MMQ_Y_Q4_1;
int mmq_x = MOE_X_Q4_1;
int mmq_y = MOE_Y_Q4_1;
int nwarps = NWARPS_Q4_1;
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -251,12 +251,12 @@ static void ggml_moe_q4_1_q8_1_cuda(
}
#if defined(USE_ROCM)
#define MMQ_X_Q5_0 64
#define MMQ_Y_Q5_0 128
#define MOE_X_Q5_0 64
#define MOE_Y_Q5_0 128
#define NWARPS_Q5_0 8
#else
#define MMQ_X_Q5_0 4
#define MMQ_Y_Q5_0 32
#define MOE_X_Q5_0 4
#define MOE_Y_Q5_0 32
#define NWARPS_Q5_0 4
#endif
@ -271,8 +271,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_0, 2)
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst,
const int top_k) {
const int mmq_x = MMQ_X_Q5_0;
const int mmq_y = MMQ_Y_Q5_0;
const int mmq_x = MOE_X_Q5_0;
const int mmq_y = MOE_Y_Q5_0;
const int nwarps = NWARPS_Q5_0;
moe_q<scalar_t, QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
@ -289,8 +289,8 @@ static void ggml_moe_q5_0_q8_1_cuda(
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
const int tokens_post_padded, cudaStream_t stream) {
const int mmq_x = MMQ_X_Q5_0;
const int mmq_y = MMQ_Y_Q5_0;
const int mmq_x = MOE_X_Q5_0;
const int mmq_y = MOE_Y_Q5_0;
const int nwarps = NWARPS_Q5_0;
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -312,12 +312,12 @@ static void ggml_moe_q5_0_q8_1_cuda(
}
#if defined(USE_ROCM)
#define MMQ_X_Q5_1 64
#define MMQ_Y_Q5_1 128
#define MOE_X_Q5_1 64
#define MOE_Y_Q5_1 128
#define NWARPS_Q5_1 8
#else
#define MMQ_X_Q5_1 4
#define MMQ_Y_Q5_1 32
#define MOE_X_Q5_1 4
#define MOE_Y_Q5_1 32
#define NWARPS_Q5_1 4
#endif
@ -332,8 +332,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_1, 2)
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst,
const int top_k) {
const int mmq_x = MMQ_X_Q5_1;
const int mmq_y = MMQ_Y_Q5_1;
const int mmq_x = MOE_X_Q5_1;
const int mmq_y = MOE_Y_Q5_1;
const int nwarps = NWARPS_Q5_1;
moe_q<scalar_t, QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
@ -350,8 +350,8 @@ static void ggml_moe_q5_1_q8_1_cuda(
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
const int tokens_post_padded, cudaStream_t stream) {
const int mmq_x = MMQ_X_Q5_1;
const int mmq_y = MMQ_Y_Q5_1;
const int mmq_x = MOE_X_Q5_1;
const int mmq_y = MOE_Y_Q5_1;
const int nwarps = NWARPS_Q5_1;
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -373,12 +373,12 @@ static void ggml_moe_q5_1_q8_1_cuda(
}
#if defined(USE_ROCM)
#define MMQ_X_Q8_0 64
#define MMQ_Y_Q8_0 128
#define MOE_X_Q8_0 64
#define MOE_Y_Q8_0 128
#define NWARPS_Q8_0 8
#else
#define MMQ_X_Q8_0 4
#define MMQ_Y_Q8_0 32
#define MOE_X_Q8_0 4
#define MOE_Y_Q8_0 32
#define NWARPS_Q8_0 4
#endif
@ -393,8 +393,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q8_0, 2)
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst,
const int top_k) {
const int mmq_x = MMQ_X_Q8_0;
const int mmq_y = MMQ_Y_Q8_0;
const int mmq_x = MOE_X_Q8_0;
const int mmq_y = MOE_Y_Q8_0;
const int nwarps = NWARPS_Q8_0;
moe_q<scalar_t, QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
@ -411,8 +411,8 @@ static void ggml_moe_q8_0_q8_1_cuda(
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
const int tokens_post_padded, cudaStream_t stream) {
const int mmq_x = MMQ_X_Q8_0;
const int mmq_y = MMQ_Y_Q8_0;
const int mmq_x = MOE_X_Q8_0;
const int mmq_y = MOE_Y_Q8_0;
const int nwarps = NWARPS_Q8_0;
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -434,12 +434,12 @@ static void ggml_moe_q8_0_q8_1_cuda(
}
#if defined(USE_ROCM)
#define MMQ_X_Q2_K 64
#define MMQ_Y_Q2_K 128
#define MOE_X_Q2_K 64
#define MOE_Y_Q2_K 128
#define NWARPS_Q2_K 8
#else
#define MMQ_X_Q2_K 4
#define MMQ_Y_Q2_K 32
#define MOE_X_Q2_K 4
#define MOE_Y_Q2_K 32
#define NWARPS_Q2_K 4
#endif
@ -454,8 +454,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q2_K, 2)
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst,
const int top_k) {
const int mmq_x = MMQ_X_Q2_K;
const int mmq_y = MMQ_Y_Q2_K;
const int mmq_x = MOE_X_Q2_K;
const int mmq_y = MOE_Y_Q2_K;
const int nwarps = NWARPS_Q2_K;
moe_q<scalar_t, QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
@ -472,8 +472,8 @@ static void ggml_moe_q2_K_q8_1_cuda(
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
const int tokens_post_padded, cudaStream_t stream) {
const int mmq_x = MMQ_X_Q2_K;
const int mmq_y = MMQ_Y_Q2_K;
const int mmq_x = MOE_X_Q2_K;
const int mmq_y = MOE_Y_Q2_K;
const int nwarps = NWARPS_Q2_K;
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -495,12 +495,12 @@ static void ggml_moe_q2_K_q8_1_cuda(
}
#if defined(USE_ROCM)
#define MMQ_X_Q3_K 64
#define MMQ_Y_Q3_K 128
#define MOE_X_Q3_K 64
#define MOE_Y_Q3_K 128
#define NWARPS_Q3_K 8
#else
#define MMQ_X_Q3_K 4
#define MMQ_Y_Q3_K 32
#define MOE_X_Q3_K 4
#define MOE_Y_Q3_K 32
#define NWARPS_Q3_K 4
#endif
@ -516,8 +516,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q3_K, 2)
const int ncols_y, const int nrows_y, const int nrows_dst,
const int top_k) {
const int mmq_x = MMQ_X_Q3_K;
const int mmq_y = MMQ_Y_Q3_K;
const int mmq_x = MOE_X_Q3_K;
const int mmq_y = MOE_Y_Q3_K;
const int nwarps = NWARPS_Q3_K;
moe_q<scalar_t, QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
@ -533,8 +533,8 @@ static void ggml_moe_q3_K_q8_1_cuda(
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
const int tokens_post_padded, cudaStream_t stream) {
const int mmq_x = MMQ_X_Q3_K;
const int mmq_y = MMQ_Y_Q3_K;
const int mmq_x = MOE_X_Q3_K;
const int mmq_y = MOE_Y_Q3_K;
const int nwarps = NWARPS_Q3_K;
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -556,12 +556,12 @@ static void ggml_moe_q3_K_q8_1_cuda(
}
#if defined(USE_ROCM)
#define MMQ_X_Q4_K 64
#define MMQ_Y_Q4_K 128
#define MOE_X_Q4_K 64
#define MOE_Y_Q4_K 128
#define NWARPS_Q4_K 8
#else
#define MMQ_X_Q4_K 4
#define MMQ_Y_Q4_K 32
#define MOE_X_Q4_K 4
#define MOE_Y_Q4_K 32
#define NWARPS_Q4_K 4
#endif
@ -576,8 +576,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_K, 2)
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst,
const int top_k) {
const int mmq_x = MMQ_X_Q4_K;
const int mmq_y = MMQ_Y_Q4_K;
const int mmq_x = MOE_X_Q4_K;
const int mmq_y = MOE_Y_Q4_K;
const int nwarps = NWARPS_Q4_K;
moe_q<scalar_t, QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
@ -594,8 +594,8 @@ static void ggml_moe_q4_K_q8_1_cuda(
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
const int tokens_post_padded, cudaStream_t stream) {
const int mmq_x = MMQ_X_Q4_K;
const int mmq_y = MMQ_Y_Q4_K;
const int mmq_x = MOE_X_Q4_K;
const int mmq_y = MOE_Y_Q4_K;
const int nwarps = NWARPS_Q4_K;
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -617,12 +617,12 @@ static void ggml_moe_q4_K_q8_1_cuda(
}
#if defined(USE_ROCM)
#define MMQ_X_Q5_K 64
#define MMQ_Y_Q5_K 128
#define MOE_X_Q5_K 64
#define MOE_Y_Q5_K 128
#define NWARPS_Q5_K 8
#else
#define MMQ_X_Q5_K 4
#define MMQ_Y_Q5_K 32
#define MOE_X_Q5_K 4
#define MOE_Y_Q5_K 32
#define NWARPS_Q5_K 4
#endif
@ -637,8 +637,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_K, 2)
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst,
const int top_k) {
const int mmq_x = MMQ_X_Q5_K;
const int mmq_y = MMQ_Y_Q5_K;
const int mmq_x = MOE_X_Q5_K;
const int mmq_y = MOE_Y_Q5_K;
const int nwarps = NWARPS_Q5_K;
moe_q<scalar_t, QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
@ -655,8 +655,8 @@ static void ggml_moe_q5_K_q8_1_cuda(
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
const int tokens_post_padded, cudaStream_t stream) {
const int mmq_x = MMQ_X_Q5_K;
const int mmq_y = MMQ_Y_Q5_K;
const int mmq_x = MOE_X_Q5_K;
const int mmq_y = MOE_Y_Q5_K;
const int nwarps = NWARPS_Q5_K;
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -678,12 +678,12 @@ static void ggml_moe_q5_K_q8_1_cuda(
}
#if defined(USE_ROCM)
#define MMQ_X_Q6_K 64
#define MMQ_Y_Q6_K 128
#define MOE_X_Q6_K 64
#define MOE_Y_Q6_K 128
#define NWARPS_Q6_K 8
#else
#define MMQ_X_Q6_K 4
#define MMQ_Y_Q6_K 32
#define MOE_X_Q6_K 4
#define MOE_Y_Q6_K 32
#define NWARPS_Q6_K 4
#endif
@ -698,8 +698,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q6_K, 2)
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst,
const int top_k) {
const int mmq_x = MMQ_X_Q6_K;
const int mmq_y = MMQ_Y_Q6_K;
const int mmq_x = MOE_X_Q6_K;
const int mmq_y = MOE_Y_Q6_K;
const int nwarps = NWARPS_Q6_K;
moe_q<scalar_t, QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
@ -716,8 +716,8 @@ static void ggml_moe_q6_K_q8_1_cuda(
const int exp_stride, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
const int tokens_post_padded, cudaStream_t stream) {
const int mmq_x = MMQ_X_Q6_K;
const int mmq_y = MMQ_Y_Q6_K;
const int mmq_x = MOE_X_Q6_K;
const int mmq_y = MOE_Y_Q6_K;
const int nwarps = NWARPS_Q6_K;
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;


@ -14,7 +14,7 @@ __global__ void awq_marlin_repack_kernel(
int n_tiles = size_n / tile_n_size;
int block_k_tiles = div_ceil(k_tiles, gridDim.x);
int start_k_tile = blockIdx.x * block_k_tiles;
auto start_k_tile = blockIdx.x * block_k_tiles;
if (start_k_tile >= k_tiles) {
return;
}
@ -51,8 +51,8 @@ __global__ void awq_marlin_repack_kernel(
int4* sh_ptr = sh + stage_size * pipe;
if (threadIdx.x < stage_size) {
int k_id = threadIdx.x / stage_n_threads;
int n_id = threadIdx.x % stage_n_threads;
auto k_id = threadIdx.x / stage_n_threads;
auto n_id = threadIdx.x % stage_n_threads;
int first_k = k_tile_id * tile_k_size;
@ -70,8 +70,8 @@ __global__ void awq_marlin_repack_kernel(
return;
}
int warp_id = threadIdx.x / 32;
int th_id = threadIdx.x % 32;
auto warp_id = threadIdx.x / 32;
auto th_id = threadIdx.x % 32;
if (warp_id >= 4) {
return;
@ -265,4 +265,4 @@ TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) {
m.impl("awq_marlin_repack", &awq_marlin_repack_meta);
}
}


@ -42,7 +42,7 @@ namespace marlin {
__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
int const* __restrict__ perm_int_ptr,
int4* __restrict__ out_int4_ptr, int size_m,
int size_k, int block_rows) {}
int size_k, int lda, int block_rows) {}
template <typename scalar_t, // compute dtype, half or nv_float16
const vllm::ScalarTypeId w_type_id, // weight ScalarType id
@ -459,29 +459,32 @@ __device__ inline void barrier_release(int* lock, bool reset = false) {
__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
int const* __restrict__ perm_int_ptr,
int4* __restrict__ out_int4_ptr, int size_m,
int size_k, int block_rows) {
int start_row = block_rows * blockIdx.x;
int size_k, int lda, int block_rows) {
auto start_row = block_rows * blockIdx.x;
int finish_row = start_row + block_rows;
if (finish_row > size_m) {
finish_row = size_m;
}
int cur_block_rows = finish_row - start_row;
int row_stride = size_k * sizeof(half) / 16;
int input_row_stride = lda * sizeof(half) / 16;
int output_row_stride = size_k * sizeof(half) / 16;
auto permute_row = [&](int row) {
int iters = size_k / default_threads;
int rest = size_k % default_threads;
int offset = row * row_stride;
int input_offset = row * input_row_stride;
int output_offset = row * output_row_stride;
half const* a_row_half = reinterpret_cast<half const*>(a_int4_ptr + offset);
half* out_half = reinterpret_cast<half*>(out_int4_ptr + offset);
half const* a_row_half =
reinterpret_cast<half const*>(a_int4_ptr + input_offset);
half* out_half = reinterpret_cast<half*>(out_int4_ptr + output_offset);
int base_k = 0;
for (int i = 0; i < iters; i++) {
int cur_k = base_k + threadIdx.x;
auto cur_k = base_k + threadIdx.x;
int src_pos = perm_int_ptr[cur_k];
out_half[cur_k] = a_row_half[src_pos];
@ -491,7 +494,7 @@ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
if (rest) {
if (threadIdx.x < rest) {
int cur_k = base_k + threadIdx.x;
auto cur_k = base_k + threadIdx.x;
int src_pos = perm_int_ptr[cur_k];
out_half[cur_k] = a_row_half[src_pos];
@ -537,6 +540,7 @@ __global__ void Marlin(
int prob_m, // batch dimension m
int prob_n, // output dimension n
int prob_k, // reduction dimension k
int lda, // A.stride(0), equal to prob_k if A is contiguous
int* locks, // extra global storage for barrier synchronization
bool use_atomic_add, // whether to use atomic add to reduce
bool use_fp32_reduce // whether to use fp32 global reduce
@ -600,7 +604,7 @@ __global__ void Marlin(
// We can easily implement parallel problem execution by just remapping
// indices and advancing global pointers
if (slice_col_par >= n_tiles) {
A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8;
A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * lda / 8;
C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8;
locks += (slice_col_par / n_tiles) * n_tiles;
slice_col = slice_col_par % n_tiles;
@ -631,7 +635,7 @@ __global__ void Marlin(
}
}
if (slice_col == n_tiles) {
A += 16 * thread_m_blocks * prob_k / 8;
A += 16 * thread_m_blocks * lda / 8;
C += 16 * thread_m_blocks * prob_n / 8;
locks += n_tiles;
slice_col = 0;
@ -643,7 +647,7 @@ __global__ void Marlin(
// A sizes/strides
// stride of the A matrix in global memory
int a_gl_stride = prob_k / 8;
int a_gl_stride = lda / 8;
// stride of an A matrix tile in shared memory
constexpr int a_sh_stride = 16 * thread_k_blocks / 8;
// delta between subsequent A tiles in global memory
@ -719,8 +723,8 @@ __global__ void Marlin(
(threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
b_gl_rd += b_sh_stride * slice_col;
b_gl_rd += b_gl_rd_delta_o * slice_row;
int b_sh_wr = threadIdx.x * b_thread_vecs;
int b_sh_rd = threadIdx.x * b_thread_vecs;
auto b_sh_wr = threadIdx.x * b_thread_vecs;
auto b_sh_rd = threadIdx.x * b_thread_vecs;
// For act_order
constexpr int k_iter_size = tb_k / b_sh_wr_iters;
@ -739,7 +743,7 @@ __global__ void Marlin(
s_sh_stride * slice_col + threadIdx.x;
}
}
int s_sh_wr = threadIdx.x;
auto s_sh_wr = threadIdx.x;
bool s_sh_wr_pred = threadIdx.x < s_sh_stride;
// Zero-points
@ -752,7 +756,7 @@ __global__ void Marlin(
zp_sh_stride * slice_col + threadIdx.x;
}
}
int zp_sh_wr = threadIdx.x;
auto zp_sh_wr = threadIdx.x;
bool zp_sh_wr_pred = threadIdx.x < zp_sh_stride;
// We use a different scale layout for grouped and column-wise quantization as
@ -1043,7 +1047,7 @@ __global__ void Marlin(
int4* sh_s_stage = sh_s + s_sh_stage * pipe;
reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
} else {
int warp_id = threadIdx.x / 32;
auto warp_id = threadIdx.x / 32;
int n_warps = thread_n_blocks / 4;
int warp_row = warp_id / n_warps;
@ -1081,7 +1085,7 @@ __global__ void Marlin(
// Determine "position" inside the thread-block (based on warp and
// thread-id)
int warp_id = threadIdx.x / 32;
auto warp_id = threadIdx.x / 32;
int n_warps =
thread_n_blocks / 4; // Each warp processes 4 16-size tiles over N
@ -1090,7 +1094,7 @@ __global__ void Marlin(
cur_k += warp_row * 16;
int th_id = threadIdx.x % 32;
auto th_id = threadIdx.x % 32;
cur_k += (th_id % 4) * 2; // Due to tensor-core layout for fp16 B matrix
int s_col_shift =
@ -1155,7 +1159,7 @@ __global__ void Marlin(
(reinterpret_cast<int*>(sh_zp_stage))[zp_sh_rd + i];
}
} else {
int warp_id = threadIdx.x / 32;
auto warp_id = threadIdx.x / 32;
int n_warps = thread_n_blocks / 4;
int warp_row = warp_id / n_warps;
@ -1193,7 +1197,7 @@ __global__ void Marlin(
(pipe / (group_blocks / thread_k_blocks)));
reinterpret_cast<int4*>(&frag_zpf[k % 2])[0] = sh_zp_stage[zp_sh_rd];
} else {
int warp_id = threadIdx.x / 32;
auto warp_id = threadIdx.x / 32;
int n_warps = thread_n_blocks / 4;
int warp_row = warp_id / n_warps;
@ -1319,7 +1323,7 @@ __global__ void Marlin(
auto thread_block_reduce = [&]() {
constexpr int red_off = threads / b_sh_stride_threads / 2;
if (red_off >= 1) {
int red_idx = threadIdx.x / b_sh_stride_threads;
auto red_idx = threadIdx.x / b_sh_stride_threads;
constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
constexpr int red_sh_delta = b_sh_stride_threads;
int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
@ -1386,7 +1390,7 @@ __global__ void Marlin(
4 * (threadIdx.x / 32) + threadIdx.x % 4;
c_gl_wr += (2 * thread_n_blocks) * slice_col;
constexpr int c_sh_wr_delta = active_threads;
int c_sh_wr = threadIdx.x;
auto c_sh_wr = threadIdx.x;
int row = (threadIdx.x % 32) / 4;
@ -1780,8 +1784,8 @@ __global__ void Marlin(
HAS_ZP, GROUP_BLOCKS, IS_ZP_FLOAT> \
<<<blocks, NUM_THREADS, max_shared_mem, stream>>>( \
A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \
num_groups, prob_m, prob_n, prob_k, locks, use_atomic_add, \
use_fp32_reduce); \
num_groups, prob_m, prob_n, prob_k, lda, locks, \
use_atomic_add, use_fp32_reduce); \
} \
}
@ -2071,7 +2075,7 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
template <typename scalar_t>
void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
void* zp, void* g_idx, void* perm, void* a_tmp, int prob_m,
int prob_n, int prob_k, void* workspace,
int prob_n, int prob_k, int lda, void* workspace,
vllm::ScalarType const& q_type, bool has_act_order,
bool is_k_full, bool has_zp, int num_groups, int group_size,
int dev, cudaStream_t stream, int thread_k, int thread_n,
@ -2184,8 +2188,9 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
// Permute A columns
int block_rows = div_ceil(prob_m, blocks);
permute_cols_kernel<<<blocks, default_threads, 0, stream>>>(
A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, block_rows);
A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, lda, block_rows);
A_ptr = a_tmp_ptr;
lda = prob_k;
}
// If we have a full K, then we can run the non-act-order version of Marlin
@ -2244,7 +2249,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
", num_bits = ", num_bits);
}
A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
A_ptr += 16 * thread_m_blocks * (lda / 8) * par;
C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par;
}
}
@ -2300,7 +2305,10 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
// Verify device and strides
TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
TORCH_CHECK(a.is_contiguous(), "A is not contiguous");
TORCH_CHECK(a.stride(1) == 1, "A.stride(1) is not 1");
// We use int4 (16 bytes) to load A, so A must be aligned to 16 bytes
TORCH_CHECK(a.stride(0) % 8 == 0, "A.stride(0) must be divisible by 8");
TORCH_CHECK(((uint64_t)a.data_ptr()) % 16 == 0, "A must be aligned to 16 bytes");
TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
@ -2432,7 +2440,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
c_tmp.data_ptr<float>(), b_scales.data_ptr<at::Half>(),
b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(),
a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k,
a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k, a.stride(0),
workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
thread_k, thread_n, sms, marlin::max_par, use_atomic_add,
@ -2443,10 +2451,10 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(),
b_scales.data_ptr<at::BFloat16>(), b_zeros.data_ptr(), g_idx.data_ptr(),
perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(), size_m, size_n, size_k,
workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
thread_k, thread_n, sms, marlin::max_par, use_atomic_add,
use_fp32_reduce, is_zp_float);
a.stride(0), workspace.data_ptr(), b_q_type, has_act_order, is_k_full,
has_zp, num_groups, group_size, dev,
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
marlin::max_par, use_atomic_add, use_fp32_reduce, is_zp_float);
} else {
TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16");
}


@ -15,7 +15,7 @@ __global__ void gptq_marlin_repack_kernel(
int n_tiles = size_n / tile_n_size;
int block_k_tiles = div_ceil(k_tiles, gridDim.x);
int start_k_tile = blockIdx.x * block_k_tiles;
auto start_k_tile = blockIdx.x * block_k_tiles;
if (start_k_tile >= k_tiles) {
return;
}
@ -71,8 +71,8 @@ __global__ void gptq_marlin_repack_kernel(
if constexpr (has_perm) {
if (threadIdx.x < stage_size) {
int k_id = threadIdx.x / stage_n_threads;
int n_id = threadIdx.x % stage_n_threads;
auto k_id = threadIdx.x / stage_n_threads;
auto n_id = threadIdx.x % stage_n_threads;
uint32_t const* sh_perm_int_ptr =
reinterpret_cast<uint32_t const*>(sh_perm_ptr);
@ -88,8 +88,8 @@ __global__ void gptq_marlin_repack_kernel(
} else {
if (threadIdx.x < stage_size) {
int k_id = threadIdx.x / stage_n_threads;
int n_id = threadIdx.x % stage_n_threads;
auto k_id = threadIdx.x / stage_n_threads;
auto n_id = threadIdx.x % stage_n_threads;
int first_k = k_tile_id * tile_k_size;
int first_k_packed = first_k / pack_factor;
@ -109,8 +109,8 @@ __global__ void gptq_marlin_repack_kernel(
return;
}
int warp_id = threadIdx.x / 32;
int th_id = threadIdx.x % 32;
auto warp_id = threadIdx.x / 32;
auto th_id = threadIdx.x % 32;
if (warp_id >= 4) {
return;
@ -339,4 +339,4 @@ TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) {
m.impl("gptq_marlin_repack", &gptq_marlin_repack_meta);
}
}


@ -277,12 +277,12 @@ __global__ void Marlin(
b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride);
b_gl_rd += b_sh_stride * slice_col;
b_gl_rd += b_gl_rd_delta_o * slice_row;
int b_sh_wr = threadIdx.x;
int b_sh_rd = threadIdx.x;
auto b_sh_wr = threadIdx.x;
auto b_sh_rd = threadIdx.x;
int s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
s_sh_stride * slice_col + threadIdx.x;
int s_sh_wr = threadIdx.x;
auto s_sh_wr = threadIdx.x;
int s_sh_rd;
// We use a different scale layout for grouped and column-wise quantization as
// we scale a `half2` tile in column-major layout in the former and in
@ -455,7 +455,7 @@ __global__ void Marlin(
auto thread_block_reduce = [&]() {
constexpr int red_off = threads / b_sh_stride / 2;
if (red_off >= 1) {
int red_idx = threadIdx.x / b_sh_stride;
auto red_idx = threadIdx.x / b_sh_stride;
constexpr int red_sh_stride = b_sh_stride * 4 * 2;
constexpr int red_sh_delta = b_sh_stride;
int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) +
@ -522,7 +522,7 @@ __global__ void Marlin(
4 * (threadIdx.x / 32) + threadIdx.x % 4;
c_gl_wr += (2 * thread_n_blocks) * slice_col;
constexpr int c_sh_wr_delta = active_threads;
int c_sh_wr = threadIdx.x;
auto c_sh_wr = threadIdx.x;
int row = (threadIdx.x % 32) / 4;


@ -353,10 +353,10 @@ __global__ void Marlin(
b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride);
b_gl_rd += b_sh_stride * slice_col;
b_gl_rd += b_gl_rd_delta_o * slice_row;
int b_sh_wr = threadIdx.x;
int b_sh_rd = threadIdx.x;
auto b_sh_wr = threadIdx.x;
auto b_sh_rd = threadIdx.x;
int s_tok_gl_rd = threadIdx.x;
auto s_tok_gl_rd = threadIdx.x;
// NOTE(HandH1998): activation scale s_tok need shuffle to [0, 8, 1, 9, 2, 10,
// 3, 11, 4, 12, 5, 13, 6, 14, 7, 15] for example, 0, 8 row scales serve for
// thread 0, 1, 2, 3. For more details, refer to mma operand A layout as
@ -368,8 +368,8 @@ __global__ void Marlin(
int s_tok_sh_rd = (threadIdx.x % 32) / 4;
bool s_tok_sh_wr_pred = threadIdx.x < prob_m;
int s_ch_gl_rd = s_ch_sh_stride * slice_col + threadIdx.x;
int s_ch_sh_wr = threadIdx.x;
auto s_ch_gl_rd = s_ch_sh_stride * slice_col + threadIdx.x;
auto s_ch_sh_wr = threadIdx.x;
int s_ch_sh_rd = 16 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
2 * ((threadIdx.x % 32) % 4);
bool s_ch_sh_wr_pred = threadIdx.x < s_ch_sh_stride;
@ -558,7 +558,7 @@ __global__ void Marlin(
auto thread_block_reduce = [&]() {
constexpr int red_off = threads / b_sh_stride / 2;
if (red_off >= 1) {
int red_idx = threadIdx.x / b_sh_stride;
auto red_idx = threadIdx.x / b_sh_stride;
constexpr int red_sh_stride = b_sh_stride * 4 * 2;
constexpr int red_sh_delta = b_sh_stride;
int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) +
@ -628,7 +628,7 @@ __global__ void Marlin(
8 * (threadIdx.x / 32) + (threadIdx.x % 4) * 2;
c_gl_wr += (4 * thread_n_blocks) * slice_col;
constexpr int c_sh_wr_delta = active_threads * 2;
int c_sh_wr = 2 * threadIdx.x;
auto c_sh_wr = 2 * threadIdx.x;
int row = (threadIdx.x % 32) / 4;


@ -273,15 +273,15 @@ __global__ void Marlin_24(
(threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
b_gl_rd += b_sh_stride * slice_col;
b_gl_rd += b_gl_rd_delta_o * slice_row;
int b_sh_wr = threadIdx.x * b_thread_vecs;
int b_sh_rd = threadIdx.x * b_thread_vecs;
auto b_sh_wr = threadIdx.x * b_thread_vecs;
auto b_sh_rd = threadIdx.x * b_thread_vecs;
int m_gl_rd = m_gl_stride * (threadIdx.x / (m_sh_stride)) +
(threadIdx.x % (m_sh_stride));
m_gl_rd += (m_sh_stride)*slice_col;
m_gl_rd += m_gl_rd_delta_o * slice_row;
int m_sh_wr = threadIdx.x;
int m_sh_rd = threadIdx.x % 16 + (threadIdx.x / 32) * 16;
auto m_sh_wr = threadIdx.x;
auto m_sh_rd = threadIdx.x % 16 + (threadIdx.x / 32) * 16;
int s_gl_rd;
if constexpr (group_blocks == -1) {
@ -291,7 +291,7 @@ __global__ void Marlin_24(
s_sh_stride * slice_col + threadIdx.x;
}
int s_sh_wr = threadIdx.x;
auto s_sh_wr = threadIdx.x;
int s_sh_rd;
// We use a different scale layout for grouped and column-wise quantization as
// we scale a `half2` tile in column-major layout in the former and in
@ -516,7 +516,7 @@ __global__ void Marlin_24(
auto thread_block_reduce = [&]() {
constexpr int red_off = threads / b_sh_stride_threads / 2;
if (red_off >= 1) {
int red_idx = threadIdx.x / b_sh_stride_threads;
auto red_idx = threadIdx.x / b_sh_stride_threads;
constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
constexpr int red_sh_delta = b_sh_stride_threads;
int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
@ -583,7 +583,7 @@ __global__ void Marlin_24(
8 * (threadIdx.x / 32) + (threadIdx.x % 32) / 4;
c_gl_wr += (2 * thread_n_blocks) * slice_col;
constexpr int c_sh_wr_delta = active_threads;
int c_sh_wr = threadIdx.x;
auto c_sh_wr = threadIdx.x;
int col = 2 * ((threadIdx.x % 32) % 4);


@ -284,18 +284,18 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
int max_ctx_blocks, const float* k_scale, const float* v_scale) {
// clang-format on
constexpr int NWARPS = NUM_THREADS / WARP_SIZE;
const int warpid = threadIdx.x / WARP_SIZE;
const int laneid = threadIdx.x % WARP_SIZE;
const auto warpid = threadIdx.x / WARP_SIZE;
const auto laneid = threadIdx.x % WARP_SIZE;
const int lane4id = laneid % 4;
const int lane16id = laneid % 16;
const int rowid = laneid / 16;
const int seq_idx = blockIdx.x;
const int partition_idx = blockIdx.y;
const auto seq_idx = blockIdx.x;
const auto partition_idx = blockIdx.y;
constexpr int T_PAR_SIZE = 256; // token partition size set to 256
const int max_num_partitions = gridDim.y;
const auto max_num_partitions = gridDim.y;
const int context_len = context_lens[seq_idx];
@ -346,9 +346,9 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
// can be interpreted as B8x16 for 8 bit types
_B16x8 Klocal[TLOOP][QKHELOOP];
const int wg_start_head_idx = blockIdx.z * GQA_RATIO;
const int wg_start_kv_head_idx = blockIdx.z;
const int total_num_heads = gridDim.z * GQA_RATIO;
const auto wg_start_head_idx = blockIdx.z * GQA_RATIO;
const auto wg_start_kv_head_idx = blockIdx.z;
const auto total_num_heads = gridDim.z * GQA_RATIO;
// for QK mfma, tokens in multiples of TOKENS_PER_WARP are spread across warps
// each mfma takes QH16xT16x16HE across warp
@ -789,14 +789,14 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
int max_ctx_blocks, const float* k_scale, const float* v_scale) {
// clang-format on
constexpr int NWARPS = NUM_THREADS / WARP_SIZE;
const int warpid = threadIdx.x / WARP_SIZE;
const int laneid = threadIdx.x % WARP_SIZE;
const auto warpid = threadIdx.x / WARP_SIZE;
const auto laneid = threadIdx.x % WARP_SIZE;
const int lane4id = laneid % 4;
const int seq_idx = blockIdx.x;
const int partition_idx = blockIdx.y;
const int partition_size = blockDim.x;
const int max_num_partitions = gridDim.y;
const auto seq_idx = blockIdx.x;
const auto partition_idx = blockIdx.y;
const auto partition_size = blockDim.x;
const auto max_num_partitions = gridDim.y;
const int context_len = context_lens[seq_idx];
const int partition_start_token_idx = partition_idx * partition_size;
@ -838,8 +838,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
qk_max[h] = -FLT_MAX;
}
const int wg_start_head_idx = blockIdx.z * GQA_RATIO;
const int wg_start_kv_head_idx = blockIdx.z;
const auto wg_start_head_idx = blockIdx.z * GQA_RATIO;
const auto wg_start_kv_head_idx = blockIdx.z;
const int warp_start_token_idx =
partition_start_token_idx + warpid * WARP_SIZE;
@ -857,7 +857,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
// token id within partition
const int local_token_idx = threadIdx.x;
const auto local_token_idx = threadIdx.x;
// token id within sequence
const int global_token_idx = partition_start_token_idx + local_token_idx;
@ -1126,7 +1126,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
__syncthreads();
const int num_heads = gridDim.z * GQA_RATIO;
const auto num_heads = gridDim.z * GQA_RATIO;
float* max_logits_ptr =
max_logits + seq_idx * num_heads * max_num_partitions + partition_idx;
float* exp_sums_ptr =
@ -1268,14 +1268,14 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
// max_num_partitions, head_size]
const int* __restrict__ context_lens, // [num_seqs]
const int max_num_partitions) {
const int num_heads = gridDim.x;
const int head_idx = blockIdx.x;
const int seq_idx = blockIdx.y;
const auto num_heads = gridDim.x;
const auto head_idx = blockIdx.x;
const auto seq_idx = blockIdx.y;
const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
[[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const int warpid = threadIdx.x / WARP_SIZE;
[[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;
const auto warpid = threadIdx.x / WARP_SIZE;
[[maybe_unused]] const auto laneid = threadIdx.x % WARP_SIZE;
__shared__ float shared_global_exp_sum;
// max num partitions supported is warp_size * NPAR_LOOPS
@ -1294,7 +1294,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
#pragma unroll
for (int i = 0; i < NPAR_LOOPS; i++) {
const int partition_no = i * WARP_SIZE + threadIdx.x;
const auto partition_no = i * WARP_SIZE + threadIdx.x;
valid_partition[i] =
(partition_no < num_partitions) ? partition_no : last_valid_partition;
}
@ -1324,7 +1324,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
}
#pragma unroll
for (int i = 0; i < NPAR_LOOPS; i++) {
const int partition_no = i * WARP_SIZE + threadIdx.x;
const auto partition_no = i * WARP_SIZE + threadIdx.x;
rescaled_exp_sum[i] *= (partition_no < num_partitions)
? expf(reg_max_logit[i] - max_logit)
: 0.0f;
@ -1336,7 +1336,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
}
#pragma unroll
for (int i = 0; i < NPAR_LOOPS; i++) {
const int partition_no = i * WARP_SIZE + threadIdx.x;
const auto partition_no = i * WARP_SIZE + threadIdx.x;
shared_exp_sums[partition_no] = rescaled_exp_sum[i];
}


@ -111,6 +111,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
") -> ()");
ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer);
ops.def(
"block_table_appends(Tensor append_row_indices, "
"Tensor append_row_indices_cpu, Tensor append_cumsums, "
"Tensor append_cumsums_cpu, Tensor append_block_ids, "
"Tensor append_block_ids_cpu, Tensor! block_table, int num_appends, "
"int total_num_append_blocks) -> ()");
ops.impl("block_table_appends", torch::kCUDA, &block_table_appends);
ops.def(
"block_table_moves(Tensor src_dst_n, Tensor src_dst_n_cpu, "
"Tensor! block_table, int num_moves) -> ()");
ops.impl("block_table_moves", torch::kCUDA, &block_table_moves);
// Layernorm
// Apply Root Mean Square (RMS) Normalization to the input tensor.
ops.def(


@ -85,6 +85,7 @@ html_static_path = ["_static"]
html_js_files = ["custom.js"]
html_css_files = ["custom.css"]
myst_heading_anchors = 2
myst_url_schemes = {
'http': None,
'https': None,


@ -4,6 +4,9 @@
Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes.
* [Deployment with CPUs](#deployment-with-cpus)
* [Deployment with GPUs](#deployment-with-gpus)
Alternatively, you can deploy vLLM to Kubernetes using any of the following:
* [Helm](frameworks/helm.md)
* [InftyAI/llmaz](integrations/llmaz.md)
@ -14,11 +17,107 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
* [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
* [vllm-project/production-stack](integrations/production-stack.md)
## Pre-requisite
## Deployment with CPUs
Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).
:::{note}
The CPU deployment here is intended for demonstration and testing purposes only; its performance will not be on par with a GPU deployment.
:::
## Deployment using native K8s
First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model:
```bash
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: vllm-models
spec:
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  resources:
    requests:
      storage: 50Gi
---
apiVersion: v1
kind: Secret
metadata:
  name: hf-token-secret
type: Opaque
data:
  token: $(HF_TOKEN)
EOF
```
Next, start the vLLM server as a Kubernetes Deployment and Service:
```bash
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-server
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: vllm
  template:
    metadata:
      labels:
        app.kubernetes.io/name: vllm
    spec:
      containers:
      - name: vllm
        image: vllm/vllm-openai:latest
        command: ["/bin/sh", "-c"]
        args: [
          "vllm serve meta-llama/Llama-3.2-1B-Instruct"
        ]
        env:
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token-secret
              key: token
        ports:
        - containerPort: 8000
        volumeMounts:
        - name: llama-storage
          mountPath: /root/.cache/huggingface
      volumes:
      - name: llama-storage
        persistentVolumeClaim:
          claimName: vllm-models
---
apiVersion: v1
kind: Service
metadata:
  name: vllm-server
spec:
  selector:
    app.kubernetes.io/name: vllm
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 8000
  type: ClusterIP
EOF
```
We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
```console
kubectl logs -l app.kubernetes.io/name=vllm
...
INFO: Started server process [1]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
```
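To send a test request from outside the cluster, one option is to port-forward the Service (for example `kubectl port-forward service/vllm-server 8000:8000`) and query the OpenAI-compatible API. The snippet below is only a sketch and assumes the `meta-llama/Llama-3.2-1B-Instruct` deployment shown above:
```python
from openai import OpenAI

# Assumes `kubectl port-forward service/vllm-server 8000:8000` is running, so the
# in-cluster Service is reachable at localhost:8000.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

completion = client.chat.completions.create(
    model="meta-llama/Llama-3.2-1B-Instruct",
    messages=[{"role": "user", "content": "Say hello from Kubernetes!"}],
)
print(completion.choices[0].message.content)
```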
## Deployment with GPUs
**Pre-requisite**: Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).
1. Create a PVC, Secret and Deployment for vLLM


@ -9,7 +9,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal
Below are the steps to utilize BitsAndBytes with vLLM.
```console
pip install bitsandbytes>=0.45.0
pip install bitsandbytes>=0.45.3
```
vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
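As a quick illustration of the in-flight path, a minimal sketch using the Python API might look like the following (the model name is only an example, and the exact arguments can vary between vLLM versions):
```python
import torch
from vllm import LLM

# Minimal sketch of in-flight BitsAndBytes quantization; "huggyllama/llama-7b" is
# only an example model, and argument names may vary between vLLM versions.
llm = LLM(
    model="huggyllama/llama-7b",
    dtype=torch.bfloat16,
    quantization="bitsandbytes",
    load_format="bitsandbytes",
)
print(llm.generate("Hello, my name is")[0].outputs[0].text)
```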


@ -10,10 +10,10 @@ Reasoning models return an additional `reasoning_content` field in their outputs,
vLLM currently supports the following reasoning models:
| Model Series | Parser Name | Structured Output Support |
|--------------|-------------|------------------|
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` |
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` |
| Model Series | Parser Name | Structured Output Support | Tool Calling |
|--------------|-------------|------------------|-------------|
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ |
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ |
## Quickstart
@ -170,10 +170,51 @@ print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
```
## Tool Calling
The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
```python
from openai import OpenAI
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
tools = [{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
},
"required": ["location", "unit"]
}
}
}]
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools,
tool_choice="auto"
)
print(response)
tool_call = response.choices[0].message.tool_calls[0].function
print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}")
```
For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>.
## Limitations
- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).
- It is not compatible with [`tool_calling`](#tool_calling).
## How to support a new reasoning model

View File

@ -30,8 +30,10 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model="facebook/opt-6.7b",
tensor_parallel_size=1,
speculative_model="facebook/opt-125m",
num_speculative_tokens=5,
speculative_config={
"model": "facebook/opt-125m",
"num_speculative_tokens": 5,
},
)
outputs = llm.generate(prompts, sampling_params)
@ -45,10 +47,14 @@ To perform the same in online mode, launch the server:
```bash
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \
--seed 42 -tp 1 --speculative_model facebook/opt-125m \
--num_speculative_tokens 5 --gpu_memory_utilization 0.8
--seed 42 -tp 1 --gpu_memory_utilization 0.8 \
--speculative_config '{"model": "facebook/opt-125m", "num_speculative_tokens": 5}'
```
:::{warning}
Please use `--speculative_config` to set all configuration options related to speculative decoding. The previous method of specifying the model with `--speculative_model` and passing related parameters (e.g., `--num_speculative_tokens`) separately will be deprecated in the next release.
:::
Then use a client:
```python
@ -101,9 +107,11 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model="facebook/opt-6.7b",
tensor_parallel_size=1,
speculative_model="[ngram]",
num_speculative_tokens=5,
ngram_prompt_lookup_max=4,
speculative_config={
"method": "ngram",
"num_speculative_tokens": 5,
"prompt_lookup_max": 4,
},
)
outputs = llm.generate(prompts, sampling_params)
@ -131,8 +139,10 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model="meta-llama/Meta-Llama-3.1-70B-Instruct",
tensor_parallel_size=4,
speculative_model="ibm-ai-platform/llama3-70b-accelerator",
speculative_draft_tensor_parallel_size=1,
speculative_config={
"model": "ibm-ai-platform/llama3-70b-accelerator",
"draft_tensor_parallel_size": 1,
},
)
outputs = llm.generate(prompts, sampling_params)
@ -175,8 +185,10 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model="meta-llama/Meta-Llama-3-8B-Instruct",
tensor_parallel_size=4,
speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B",
speculative_draft_tensor_parallel_size=1,
speculative_config={
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"draft_tensor_parallel_size": 1,
},
)
outputs = llm.generate(prompts, sampling_params)
@ -194,11 +206,10 @@ A few important things to consider when using the EAGLE based draft models:
be able to be loaded and used directly by vLLM after [PR 12304](https://github.com/vllm-project/vllm/pull/12304).
If you are using a vLLM version from before [PR 12304](https://github.com/vllm-project/vllm/pull/12304), please use the
[script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model,
and specify `speculative_model="path/to/modified/eagle/model"`. If weight-loading problems still occur when using
the latest version of vLLM, please leave a comment or raise an issue.
and specify `"model": "path/to/modified/eagle/model"` in `speculative_config`. If weight-loading problems still occur when using the latest version of vLLM, please leave a comment or raise an issue.
2. The EAGLE based draft models need to be run without tensor parallelism
(i.e. speculative_draft_tensor_parallel_size is set to 1), although
(i.e. draft_tensor_parallel_size is set to 1 in `speculative_config`), although
it is possible to run the main model using tensor parallelism (see example above).
3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is

View File

@ -193,7 +193,7 @@ vLLM CPU backend supports the following vLLM features:
## Related runtime environment variables
- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
- `VLLM_CPU_KVCACHE_SPACE`: specify the KV cache size (e.g., `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for the KV cache); a larger setting allows vLLM to run more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
- `VLLM_CPU_MOE_PREPACK`: whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).
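For illustration, these variables can be combined when launching the server on CPU (a sketch; size the values to your machine):
```console
VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND=0-31 \
    vllm serve meta-llama/Llama-3.2-1B-Instruct
```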

View File

@ -58,6 +58,11 @@ from vllm import LLM, SamplingParams
```
The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](#sampling-params).
:::{important}
By default, vLLM will use the sampling parameters recommended by the model creator, by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results if {class}`~vllm.SamplingParams` is not specified.
However, if vLLM's default sampling parameters are preferred, please set `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance.
:::
```python
prompts = [
@ -76,7 +81,7 @@ llm = LLM(model="facebook/opt-125m")
```
:::{note}
By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
By default, vLLM downloads models from [Hugging Face](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
:::
Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens.
@ -107,6 +112,11 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct
By default, the server uses a predefined chat template stored in the tokenizer.
You can learn about overriding it [here](#chat-template).
:::
:::{important}
By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
To disable this behavior, please pass `--generation-config vllm` when launching the server.
:::
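For example, to fall back to vLLM's own defaults instead of the model's `generation_config.json` (a sketch reusing the model served above):
```console
vllm serve Qwen/Qwen2.5-1.5B-Instruct --generation-config vllm
```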
This server can be queried in the same format as OpenAI API. For example, to list the models:

View File

@ -129,6 +129,9 @@ in progress.
- **Spec Decode**: Currently, only ngram-based spec decode is supported in V1. There
will be follow-up work to support other types of spec decode (e.g., see [PR #13933](https://github.com/vllm-project/vllm/pull/13933)). We will prioritize support for EAGLE and MTP over draft-model-based spec decode.
- **Multimodal Models**: V1 is almost fully compatible with V0 except that interleaved modality input is not supported yet.
See [here](https://github.com/orgs/vllm-project/projects/8) for the status of upcoming features and optimizations.
#### Features to Be Supported
- **FP8 KV Cache**: While vLLM V1 introduces new FP8 kernels for model weight quantization, support for an FP8 key-value cache is not yet available. Users must continue using FP16 (or other supported precisions) for the KV cache.
@ -156,6 +159,9 @@ vLLM V1 is currently optimized for decoder-only transformers. Models requiring
For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html).
## FAQ
## Frequently Asked Questions
TODO
**I'm using vLLM V1 and I'm getting CUDA OOM errors. What should I do?**
The default `max_num_seqs` has been raised from `256` in V0 to `1024` in V1. If you encounter CUDA OOM only when using the V1 engine, try setting a lower value of `max_num_seqs` or `gpu_memory_utilization`.
On the other hand, if you get an error about insufficient memory for the cache blocks, you should increase `gpu_memory_utilization` as this indicates that your GPU has sufficient memory but you're not allocating enough to vLLM for KV cache blocks.
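For instance, when hitting CUDA OOM on V1, both knobs can be lowered when constructing the engine (a sketch; tune the values for your GPU):
```python
from vllm import LLM

# Cap concurrent sequences and the GPU memory fraction to avoid CUDA OOM.
llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    max_num_seqs=256,            # the V0 default, lower than V1's 1024
    gpu_memory_utilization=0.8,
)
```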

View File

@ -0,0 +1,5 @@
Loading model weights with fastsafetensors
===================================================================
Using the fastsafetensors library enables loading model weights to GPU memory by leveraging GPU Direct Storage. See https://github.com/foundation-model-stack/fastsafetensors for more details.
To enable this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true``.
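For example, a model can then be served with fastsafetensors loading enabled (a sketch using the environment variable documented above):

.. code-block:: console

    USE_FASTSAFETENSOR=true vllm serve openai-community/gpt2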

View File

@ -5,4 +5,5 @@
runai_model_streamer
tensorizer
fastsafetensor
:::

View File

@ -46,6 +46,11 @@ for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
:::{important}
By default, vLLM will use the sampling parameters recommended by the model creator, by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results if {class}`~vllm.SamplingParams` is not specified.
However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance.
:::
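A minimal sketch of opting out of the model's `generation_config.json` defaults (any text model works; `facebook/opt-125m` is used here only for illustration):
```python
from vllm import LLM

# Use vLLM's own default sampling parameters instead of generation_config.json.
llm = LLM(model="facebook/opt-125m", generation_config="vllm")
```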
A code example can be found here: <gh-file:examples/offline_inference/basic/basic.py>
### `LLM.beam_search`

View File

@ -73,7 +73,7 @@ The Transformers fallback explicitly supports the following features:
- <project:#quantization-index> (except GGUF)
- <project:#lora-adapter>
- <project:#distributed-serving> (pipeline parallel coming soon <gh-pr:12832>!)
- <project:#distributed-serving> (requires `transformers>=4.49.0`)
#### Remote code

View File

@ -2,7 +2,12 @@
# Engine Arguments
Below, you can find an explanation of every engine argument for vLLM:
Engine arguments control the behavior of the vLLM engine.
- For [offline inference](#offline-inference), they are part of the arguments to the `LLM` class.
- For [online serving](#openai-compatible-server), they are part of the arguments to `vllm serve`.
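For instance, the same engine argument can be set either way (a sketch using `max_model_len` as an illustrative choice):
```python
from vllm import LLM

# Offline inference: engine arguments are keyword arguments to the LLM class.
llm = LLM(model="facebook/opt-125m", max_model_len=2048)

# Online serving: the same argument becomes a CLI flag, e.g.
#   vllm serve facebook/opt-125m --max-model-len 2048
```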
Below, you can find an explanation of every engine argument:
<!--- pyml disable-num-lines 7 no-space-in-emphasis -->
```{eval-rst}
@ -15,7 +20,7 @@ Below, you can find an explanation of every engine argument for vLLM:
## Async Engine Arguments
Below are the additional arguments related to the asynchronous engine:
Additional arguments are available for the asynchronous engine, which is used for online serving:
<!--- pyml disable-num-lines 7 no-space-in-emphasis -->
```{eval-rst}

View File

@ -97,6 +97,13 @@ llm = LLM(model="adept/fuyu-8b",
max_num_seqs=2)
```
#### Adjust cache size
If you run out of CPU RAM, try the following options:
- (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB).
- (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB).
### Performance optimization and tuning
You can potentially improve the performance of vLLM by finetuning various options.

View File

@ -33,7 +33,11 @@ print(completion.choices[0].message)
vLLM supports some parameters that are not part of the OpenAI API, such as `top_k`.
You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, e.g. `extra_body={"top_k": 50}` for `top_k`.
:::
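For example (a minimal sketch; the server address and model follow the quickstart above):
```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=[{"role": "user", "content": "Hello!"}],
    extra_body={"top_k": 50},  # vLLM-specific sampling parameter
)
print(completion.choices[0].message)
```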
:::{important}
By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
To disable this behavior, please pass `--generation-config vllm` when launching the server.
:::
## Supported APIs
We currently support the following OpenAI APIs:

View File

@ -50,7 +50,9 @@ if __name__ == "__main__":
# Create an LLM with spec decoding
llm = LLM(
model="meta-llama/Llama-2-13b-chat-hf",
speculative_model="ibm-ai-platform/llama-13b-accelerator",
speculative_config={
"model": "ibm-ai-platform/llama-13b-accelerator",
},
)
print("With speculation")

View File

@ -361,6 +361,7 @@ def run_llava_next_video(questions: list[str],
engine_args = EngineArgs(
model="llava-hf/LLaVA-NeXT-Video-7B-hf",
max_model_len=8192,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)

View File

@ -0,0 +1,177 @@
# SPDX-License-Identifier: Apache-2.0
"""
An example demonstrating how to use tool calling with reasoning models
like QwQ-32B. The reasoning_content is not parsed by the tool
calling process; only the final output is parsed.
To run this example, you need to start the vLLM server with both
the reasoning parser and tool calling enabled.
```bash
vllm serve Qwen/QwQ-32B \
--enable-reasoning --reasoning-parser deepseek_r1 \
--enable-auto-tool-choice --tool-call-parser hermes
```
"""
from openai import OpenAI
# Now, simulate a tool call
def get_current_weather(city: str, state: str, unit: str):
return ("The weather in Dallas, Texas is 85 degrees Fahrenheit. It is "
"partly cloudy, with highs in the 90's.")
available_tools = {"get_current_weather": get_current_weather}
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
tools = [{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type":
"string",
"description":
"The city to find the weather for, e.g. 'San Francisco'"
},
"state": {
"type":
"string",
"description":
"the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'"
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"]
}
},
"required": ["city", "state", "unit"]
}
}
}]
messages = [{
"role": "user",
"content": "Hi! How are you doing today?"
}, {
"role": "assistant",
"content": "I'm doing well! How can I help you?"
}, {
"role":
"user",
"content":
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
}]
def extract_reasoning_and_calls(chunks: list):
reasoning_content = ""
tool_call_idx = -1
arguments = []
function_names = []
for chunk in chunks:
if chunk.choices[0].delta.tool_calls:
tool_call = chunk.choices[0].delta.tool_calls[0]
if tool_call.index != tool_call_idx:
tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
arguments.append("")
function_names.append("")
if tool_call.function:
if tool_call.function.name:
function_names[tool_call_idx] = tool_call.function.name
if tool_call.function.arguments:
arguments[tool_call_idx] += tool_call.function.arguments
else:
if hasattr(chunk.choices[0].delta, "reasoning_content"):
reasoning_content += chunk.choices[0].delta.reasoning_content
return reasoning_content, arguments, function_names
print("---------Full Generate With Automatic Function Calling-------------")
tool_calls = client.chat.completions.create(messages=messages,
model=model,
tools=tools)
print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
print(f"function name: "
f"{tool_calls.choices[0].message.tool_calls[0].function.name}")
print(f"function arguments: "
f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}")
print("----------Stream Generate With Automatic Function Calling-----------")
tool_calls_stream = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
stream=True)
chunks = []
for chunk in tool_calls_stream:
chunks.append(chunk)
reasoning_content, arguments, function_names = extract_reasoning_and_calls(
chunks)
print(f"reasoning_content: {reasoning_content}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")
print("----------Full Generate With Named Function Calling-----------------")
tool_calls = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
tool_choice={
"type": "function",
"function": {
"name":
"get_current_weather"
}
})
tool_call = tool_calls.choices[0].message.tool_calls[0].function
print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
print(f"function name: {tool_call.name}")
print(f"function arguments: {tool_call.arguments}")
print("----------Stream Generate With Named Function Calling--------------")
tool_calls_stream = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice={
"type": "function",
"function": {
"name": "get_current_weather"
}
},
stream=True)
chunks = []
for chunk in tool_calls_stream:
chunks.append(chunk)
reasoning_content, arguments, function_names = extract_reasoning_and_calls(
chunks)
print(f"reasoning_content: {reasoning_content}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")
print("\n\n")

View File

@ -86,6 +86,7 @@ exclude = [
"vllm/triton_utils/**/*.py" = ["UP006", "UP035"]
"vllm/vllm_flash_attn/**/*.py" = ["UP006", "UP035"]
"vllm/worker/**/*.py" = ["UP006", "UP035"]
"vllm/utils.py" = ["UP006", "UP035"]
[tool.ruff.lint]
select = [

View File

@ -18,7 +18,7 @@ pillow # Required for image processing
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer >= 0.10.11, < 0.11
llguidance >= 0.7.2, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
outlines == 0.1.11
lark == 1.2.2
xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64"

View File

@ -4,14 +4,14 @@
# Dependencies for CPUs
torch==2.6.0+cpu; platform_machine == "x86_64"
torch==2.6.0; platform_system == "Darwin"
torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64"
torch==2.6.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
torch==2.7.0.dev20250304; platform_machine == "s390x"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
torchaudio==2.5.1; platform_machine == "ppc64le"
torchaudio==2.6.0; platform_machine == "ppc64le"
# required for the image processor of phi3v, this must be updated alongside torch
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
torchvision==0.20.1; platform_machine == "ppc64le"
torchvision==0.21.0; platform_machine == "ppc64le"
datasets # for benchmark scripts

View File

@ -41,3 +41,4 @@ tritonclient==2.51.0
numpy < 2.0.0
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
fastsafetensors>=0.1.10

View File

@ -67,6 +67,7 @@ click==8.1.7
# jiwer
# nltk
# ray
# typer
colorama==0.4.6
# via
# awscli
@ -122,6 +123,8 @@ fastparquet==2024.11.0
# via genai-perf
fastrlock==0.8.2
# via cupy-cuda12x
fastsafetensors==0.1.10
# via -r requirements/test.in
filelock==3.16.1
# via
# datasets
@ -505,7 +508,9 @@ requests==2.32.3
responses==0.25.3
# via genai-perf
rich==13.9.4
# via genai-perf
# via
# genai-perf
# typer
rouge-score==0.1.2
# via lm-eval
rpds-py==0.20.1
@ -550,6 +555,8 @@ setuptools==75.8.0
# via
# pytablewriter
# torch
shellingham==1.5.4
# via typer
six==1.16.0
# via
# python-dateutil
@ -600,6 +607,7 @@ torch==2.6.0
# accelerate
# bitsandbytes
# encodec
# fastsafetensors
# lm-eval
# peft
# runai-model-streamer
@ -654,6 +662,8 @@ typepy==1.3.2
# dataproperty
# pytablewriter
# tabledata
typer==0.15.2
# via fastsafetensors
typing-extensions==4.12.2
# via
# huggingface-hub
@ -663,6 +673,7 @@ typing-extensions==4.12.2
# pydantic
# pydantic-core
# torch
# typer
tzdata==2024.2
# via pandas
urllib3==2.2.3

View File

@ -680,6 +680,7 @@ setup(
install_requires=get_requirements(),
extras_require={
"tensorizer": ["tensorizer>=2.9.0"],
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
"runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
"audio": ["librosa", "soundfile"], # Required for audio processing
"video": ["decord"] # Required for video processing

tests/build_cython.py Normal file
View File

@ -0,0 +1,38 @@
# SPDX-License-Identifier: Apache-2.0
import Cython.Compiler.Options
from Cython.Build import cythonize
from setuptools import setup
Cython.Compiler.Options.annotate = True
infiles = []
infiles += [
"vllm/engine/llm_engine.py",
"vllm/transformers_utils/detokenizer.py",
"vllm/engine/output_processor/single_step.py",
"vllm/outputs.py",
"vllm/engine/output_processor/stop_checker.py",
]
infiles += [
"vllm/core/scheduler.py",
"vllm/sequence.py",
"vllm/core/block_manager.py",
]
infiles += [
"vllm/model_executor/layers/sampler.py",
"vllm/sampling_params.py",
"vllm/utils.py",
]
setup(ext_modules=cythonize(infiles,
annotate=False,
force=True,
compiler_directives={
'language_level': "3",
'infer_types': True
}))
# example usage: python3 build_cython.py build_ext --inplace

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import pickle
import copy
import pytest
import torch
@ -10,32 +9,63 @@ from vllm.compilation.pass_manager import PostGradPassManager
from vllm.config import CompilationConfig
# dummy custom pass that doesn't inherit
def simple_callable(graph: torch.fx.Graph):
pass
callable_uuid = CallableInductorPass(simple_callable,
InductorPass.hash_source(__file__))
@pytest.mark.parametrize(
"works, callable",
[
(False, simple_callable),
(True, callable_uuid),
(True, CallableInductorPass(simple_callable)),
],
)
def test_pass_manager(works: bool, callable):
# Should fail to add directly to the pass manager
def test_bad_callable():
config = CompilationConfig().pass_config
pass_manager = PostGradPassManager()
pass_manager.configure(config)
# Try to add the callable to the pass manager
if works:
pass_manager.add(callable)
pickle.dumps(pass_manager)
else:
with pytest.raises(AssertionError):
pass_manager.add(callable)
with pytest.raises(AssertionError):
pass_manager.add(simple_callable) # noqa, type wrong on purpose
# Pass that inherits from InductorPass
class ProperPass(InductorPass):
def __call__(self, graph: torch.fx.graph.Graph) -> None:
pass
@pytest.mark.parametrize(
"callable",
[
ProperPass(),
# Can also wrap callables in CallableInductorPass for compliance
CallableInductorPass(simple_callable),
CallableInductorPass(simple_callable,
InductorPass.hash_source(__file__))
],
)
def test_pass_manager_uuid(callable):
config = CompilationConfig().pass_config
pass_manager = PostGradPassManager()
pass_manager.configure(config)
# Check that UUID is different if the same pass is added 2x
pass_manager.add(callable)
uuid1 = pass_manager.uuid()
pass_manager.add(callable)
uuid2 = pass_manager.uuid()
assert uuid1 != uuid2
# UUID should be the same as the original one,
# as we constructed in the same way.
pass_manager2 = PostGradPassManager()
pass_manager2.configure(config)
pass_manager2.add(callable)
assert uuid1 == pass_manager2.uuid()
# UUID should be different due to config change
config2 = copy.deepcopy(config)
config2.enable_fusion = not config2.enable_fusion
pass_manager3 = PostGradPassManager()
pass_manager3.configure(config2)
pass_manager3.add(callable)
assert uuid1 != pass_manager3.uuid()

View File

@ -175,6 +175,8 @@ TEXT_GENERATION_MODELS = {
"inceptionai/jais-13b-chat": PPTestSettings.fast(),
"ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
"meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
# Tests TransformersModel
"ArthurZ/Ilama-3.2-1B": PPTestSettings.fast(),
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
"openbmb/MiniCPM3-4B": PPTestSettings.fast(),
# Uses Llama
@ -243,6 +245,7 @@ TEST_MODELS = [
# [LANGUAGE GENERATION]
"microsoft/Phi-3.5-MoE-instruct",
"meta-llama/Llama-3.2-1B-Instruct",
# "ArthurZ/Ilama-3.2-1B", NOTE: Uncomment after #13905
"ibm/PowerLM-3b",
# [LANGUAGE EMBEDDING]
"intfloat/e5-mistral-7b-instruct",

View File

@ -21,18 +21,9 @@ def test_collective_rpc(tp_size, backend):
def echo_rank(self):
return self.rank
from vllm.worker.worker import Worker
class MyWorker(Worker):
def echo_rank(self):
return self.rank
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
enforce_eager=True,
load_format="dummy",
tensor_parallel_size=tp_size,
distributed_executor_backend=backend,
worker_cls=MyWorker)
for method in ["echo_rank", echo_rank]:
assert llm.collective_rpc(method) == list(range(tp_size))
distributed_executor_backend=backend)
assert llm.collective_rpc(echo_rank) == list(range(tp_size))

View File

@ -107,8 +107,10 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
# Call the function and get the result
result = apply_hf_chat_template(
tokenizer,
trust_remote_code=True,
conversation=mock_request.messages,
chat_template=mock_request.chat_template or template_content,
tools=None,
add_generation_prompt=mock_request.add_generation_prompt,
continue_final_message=mock_request.continue_final_message,
)

View File

@ -0,0 +1,145 @@
# SPDX-License-Identifier: Apache-2.0
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# a reasoning and tool calling model
MODEL_NAME = "Qwen/QwQ-32B"
@pytest.fixture(scope="module")
def server(): # noqa: F811
args = [
"--max-model-len", "8192", "--enforce-eager", "--enable-reasoning",
"--reasoning-parser", "deepseek_r1", "--enable-auto-tool-choice",
"--tool-call-parser", "hermes"
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
TOOLS = [{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type":
"string",
"description":
"The city to find the weather for, e.g. 'San Francisco'"
},
"state": {
"type":
"string",
"description":
"the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'"
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"]
}
},
"required": ["city", "state", "unit"]
}
}
}]
MESSAGES = [{
"role": "user",
"content": "Hi! How are you doing today?"
}, {
"role": "assistant",
"content": "I'm doing well! How can I help you?"
}, {
"role":
"user",
"content":
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
}]
FUNC_NAME = "get_current_weather"
FUNC_ARGS = """{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}"""
def extract_reasoning_and_calls(chunks: list):
reasoning_content = ""
tool_call_idx = -1
arguments = []
function_names = []
for chunk in chunks:
if chunk.choices[0].delta.tool_calls:
tool_call = chunk.choices[0].delta.tool_calls[0]
if tool_call.index != tool_call_idx:
tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
arguments.append("")
function_names.append("")
if tool_call.function:
if tool_call.function.name:
function_names[tool_call_idx] = tool_call.function.name
if tool_call.function.arguments:
arguments[tool_call_idx] += tool_call.function.arguments
else:
if hasattr(chunk.choices[0].delta, "reasoning_content"):
reasoning_content += chunk.choices[0].delta.reasoning_content
return reasoning_content, arguments, function_names
# test streaming
@pytest.mark.asyncio
async def test_chat_streaming_of_tool_and_reasoning(
client: openai.AsyncOpenAI):
stream = await client.chat.completions.create(
model=MODEL_NAME,
messages=MESSAGES,
tools=TOOLS,
temperature=0.0,
stream=True,
)
chunks = []
async for chunk in stream:
chunks.append(chunk)
reasoning_content, arguments, function_names = extract_reasoning_and_calls(
chunks)
assert len(reasoning_content) > 0
assert len(function_names) > 0 and function_names[0] == FUNC_NAME
assert len(arguments) > 0 and arguments[0] == FUNC_ARGS
# test full generate
@pytest.mark.asyncio
async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
tool_calls = await client.chat.completions.create(
model=MODEL_NAME,
messages=MESSAGES,
tools=TOOLS,
temperature=0.0,
stream=False,
)
assert len(tool_calls.choices[0].message.reasoning_content) > 0
assert tool_calls.choices[0].message.tool_calls[0].function.name \
== FUNC_NAME
assert tool_calls.choices[0].message.tool_calls[0].function.arguments \
== FUNC_ARGS

View File

@ -87,7 +87,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=6299, total_tokens=6309)
completion_tokens=10, prompt_tokens=6287, total_tokens=6297)
message = choice.message
message = chat_completion.choices[0].message
@ -180,7 +180,7 @@ async def test_single_chat_session_video_base64encoded(
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=6299, total_tokens=6309)
completion_tokens=10, prompt_tokens=6287, total_tokens=6297)
message = choice.message
message = chat_completion.choices[0].message

View File

@ -4,10 +4,13 @@ import warnings
from typing import Optional
import pytest
from packaging.version import Version
from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.assets.image import ImageAsset
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
from vllm.entrypoints.chat_utils import (_resolve_hf_chat_template,
_try_extract_ast, load_chat_template,
parse_chat_messages,
parse_chat_messages_futures,
resolve_chat_template_content_format)
@ -23,8 +26,10 @@ EXAMPLES_DIR = VLLM_PATH / "examples"
PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
ULTRAVOX_MODEL_ID = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
QWEN25VL_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"
LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B"
HERMES_MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"
@pytest.fixture(scope="function")
@ -703,25 +708,27 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
vllm_result = apply_hf_chat_template(
tokenizer,
trust_remote_code=model_config.trust_remote_code,
conversation=conversation,
chat_template=None,
tools=None,
add_generation_prompt=True,
)
assert hf_result == vllm_result
# yapf: disable
@pytest.mark.parametrize(
("model", "expected_format"),
[(PHI3V_MODEL_ID, "string"),
(QWEN2VL_MODEL_ID, "openai"),
(ULTRAVOX_MODEL_ID, "string"),
(MLLAMA_MODEL_ID, "openai"),
(LLAMA_GUARD_MODEL_ID, "openai")],
)
# yapf: enable
def test_resolve_content_format_hf_defined(model, expected_format):
"model",
[
QWEN2VL_MODEL_ID, # tokenizer.chat_template is of type str
HERMES_MODEL_ID, # tokenizer.chat_template is of type dict
])
@pytest.mark.parametrize("use_tools", [True, False])
def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
"""checks that chat_template is a dict type for HF models."""
# Build the tokenizer group and grab the underlying tokenizer
tokenizer_group = TokenizerGroup(
model,
enable_lora=False,
@ -730,7 +737,56 @@ def test_resolve_content_format_hf_defined(model, expected_format):
)
tokenizer = tokenizer_group.tokenizer
chat_template = tokenizer.chat_template
tools = [{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": sample_json_schema
}
}] if use_tools else None
# Test detecting the tokenizer's chat_template
chat_template = _resolve_hf_chat_template(
tokenizer,
chat_template=None,
tools=tools,
trust_remote_code=True,
)
assert isinstance(chat_template, str)
# yapf: disable
@pytest.mark.parametrize(
("model", "expected_format"),
[(PHI3V_MODEL_ID, "string"),
(QWEN2VL_MODEL_ID, "openai"),
(QWEN25VL_MODEL_ID, "openai"),
(ULTRAVOX_MODEL_ID, "string"),
(MLLAMA_MODEL_ID, "openai"),
(LLAMA_GUARD_MODEL_ID, "openai")],
)
# yapf: enable
def test_resolve_content_format_hf_defined(model, expected_format):
if model == QWEN25VL_MODEL_ID and Version(TRANSFORMERS_VERSION) < Version(
"4.49.0"):
pytest.skip("Qwen2.5-VL requires transformers>=4.49.0")
tokenizer_group = TokenizerGroup(
model,
enable_lora=False,
max_num_seqs=5,
max_input_length=None,
)
tokenizer = tokenizer_group.tokenizer
# Test detecting the tokenizer's chat_template
chat_template = _resolve_hf_chat_template(
tokenizer,
chat_template=None,
tools=None,
trust_remote_code=True,
)
assert isinstance(chat_template, str)
print("[TEXT]")
@ -740,8 +796,10 @@ def test_resolve_content_format_hf_defined(model, expected_format):
resolved_format = resolve_chat_template_content_format(
None, # Test detecting the tokenizer's chat_template
None,
"auto",
tokenizer,
trust_remote_code=True,
)
assert resolved_format == expected_format
@ -791,8 +849,10 @@ def test_resolve_content_format_examples(template_path, expected_format):
resolved_format = resolve_chat_template_content_format(
chat_template,
None,
"auto",
dummy_tokenizer,
trust_remote_code=True,
)
assert resolved_format == expected_format

View File


@ -0,0 +1,22 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import SamplingParams
from vllm.config import LoadFormat
test_model = "openai-community/gpt2"
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
def test_model_loader_download_files(vllm_runner):
with vllm_runner(test_model,
load_format=LoadFormat.FASTSAFETENSORS) as llm:
deserialized_outputs = llm.generate(prompts, sampling_params)
assert deserialized_outputs

View File

@ -0,0 +1,46 @@
# SPDX-License-Identifier: Apache-2.0
import glob
import tempfile
import huggingface_hub.constants
import torch
from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf, fastsafetensors_weights_iterator,
safetensors_weights_iterator)
def test_fastsafetensors_model_loader():
with tempfile.TemporaryDirectory() as tmpdir:
huggingface_hub.constants.HF_HUB_OFFLINE = False
download_weights_from_hf("openai-community/gpt2",
allow_patterns=["*.safetensors"],
cache_dir=tmpdir)
safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True)
assert len(safetensors) > 0
fastsafetensors_tensors = {}
hf_safetensors_tensors = {}
for name, tensor in fastsafetensors_weights_iterator(
safetensors, True):
fastsafetensors_tensors[name] = tensor
for name, tensor in safetensors_weights_iterator(safetensors, True):
hf_safetensors_tensors[name] = tensor
assert len(fastsafetensors_tensors) == len(hf_safetensors_tensors)
for name, fastsafetensors_tensor in fastsafetensors_tensors.items():
fastsafetensors_tensor = fastsafetensors_tensor.to('cpu')
assert fastsafetensors_tensor.dtype == hf_safetensors_tensors[
name].dtype
assert fastsafetensors_tensor.shape == hf_safetensors_tensors[
name].shape
assert torch.all(
fastsafetensors_tensor.eq(hf_safetensors_tensors[name]))
if __name__ == "__main__":
test_fastsafetensors_model_loader()

View File

@ -749,3 +749,72 @@ def test_gather_cache_mla(kv_lora_rank, qk_rope_head_dim, block_size,
ops.gather_cache(src_cache, dst, block_table, cu_seq_lens, batch_size)
torch.testing.assert_close(dst, expected)
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS_MLA)
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
@torch.inference_mode()
def test_concat_and_cache_mla_cpu(
kv_lora_rank: int,
qk_rope_head_dim: int,
num_tokens: int,
block_size: int,
num_blocks: int,
dtype: torch.dtype,
seed: int,
) -> None:
device = "cpu"
kv_cache_dtype = "auto"
current_platform.seed_everything(seed)
torch.set_default_device(device)
total_slots = num_blocks * block_size
slot_mapping_lst = random.sample(range(total_slots), num_tokens)
slot_mapping = torch.tensor(slot_mapping_lst,
dtype=torch.long,
device=device)
kv_c = torch.randn(num_tokens, kv_lora_rank, dtype=dtype, device=device)
k_pe = torch.randn(num_tokens,
qk_rope_head_dim,
dtype=dtype,
device=device)
entry_size = kv_lora_rank + qk_rope_head_dim
scale = torch.tensor(0.1, dtype=torch.float32, device=device)
kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype,
kv_cache_dtype, device)
ref_temp = torch.zeros(*kv_cache.shape, dtype=dtype, device=device)
for i in range(num_tokens):
slot = slot_mapping[i].item()
block_idx = slot // block_size
block_offset = slot % block_size
ref_temp[block_idx, block_offset, :kv_lora_rank] = kv_c[i]
ref_temp[block_idx, block_offset, kv_lora_rank:] = k_pe[i]
if kv_cache_dtype == "fp8":
ref_kv_cache = torch.empty_like(ref_temp, dtype=kv_cache.dtype)
ops.convert_fp8(ref_kv_cache,
ref_temp,
scale.item(),
kv_dtype=kv_cache_dtype)
else:
ref_kv_cache = ref_temp
opcheck(
torch.ops._C_cache_ops.concat_and_cache_mla,
(kv_c, k_pe, kv_cache, slot_mapping, kv_cache_dtype, scale),
test_utils=DEFAULT_OPCHECK_TEST_UTILS,
)
ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping,
kv_cache_dtype, scale)
torch.testing.assert_close(kv_cache, ref_kv_cache)

View File

@ -606,6 +606,51 @@ def test_marlin_qqq_gemm(
assert max_diff < 0.04
def test_marlin_gemm_subset_input():
quant_type = scalar_types.uint4b8
group_size = 128
size_m, size_k, size_n = 32, 1024, 2048
big_m = size_m * 2
big_k = size_k * 2
a_input = rand_data((big_m, big_k))[8:size_m + 8, 8:size_k + 8]
b_weight = rand_data((size_k, size_n))
w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
b_weight, quant_type, group_size, False)
marlin_zp = marlin_make_empty_g_idx(marlin_s.device)
workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_MAX_PARALLEL)
output = ops.gptq_marlin_gemm(
a_input,
marlin_q_w,
marlin_s,
marlin_zp,
g_idx,
sort_indices,
workspace.scratch,
quant_type,
a_input.shape[0],
b_weight.shape[1],
a_input.shape[1],
is_k_full=True,
has_zp=False,
use_atomic_add=False,
use_fp32_reduce=True,
is_zp_float=False,
)
output_ref = torch.matmul(a_input, w_ref)
torch.cuda.synchronize()
max_diff = compute_max_diff(output, output_ref)
assert max_diff < 0.04
def test_marlin_gemm_opcheck():
size_m = 2048
size_n = 4096

View File

@ -0,0 +1,94 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
import torch
import torch.nn.functional as F
from torch import Tensor
import vllm._custom_ops as ops
from vllm.platforms import current_platform
def cdiv(a, b):
return (a + b - 1) // b
def ref_mla(
out: Tensor, # (bs, num_heads, v_head_dim)
query: Tensor, # (bs, num_heads, head_dim)
kv_cache: Tensor, # (num_blocks, block_size, head_dim)
scale: float,
block_tables: Tensor, # (bs, max_num_blocks)
seq_lens: Tensor, # (bs,)
):
bs, num_heads, v_head_dim = out.shape
head_dim = query.shape[2]
for i in range(bs):
# gather and flatten KV-cache
kv = kv_cache[
block_tables[i]] # (max_num_blocks, block_size, head_dim)
kv = kv.view(1, -1,
head_dim)[:, :seq_lens[i]] # (1, seq_len, head_dim)
v = kv[:, :, :v_head_dim]
q = query[i].view(num_heads, 1, head_dim)
o = F.scaled_dot_product_attention(q,
kv,
v,
scale=scale,
enable_gqa=True)
out[i] = o.view(num_heads, v_head_dim)
return out
@pytest.mark.parametrize("bs", [4])
@pytest.mark.parametrize("mean_seq_len", [256])
@pytest.mark.parametrize("h_q", [16])
@pytest.mark.parametrize("d", [576])
@pytest.mark.parametrize("dv", [512])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("dtype", [torch.float, torch.half, torch.bfloat16])
@pytest.mark.parametrize("varlen", [False, True])
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_mla_decode_cpu(
bs: int,
mean_seq_len: int,
h_q: int,
d: int,
dv: int,
block_size: int,
dtype: torch.dtype,
varlen: bool,
):
torch.set_default_dtype(dtype)
torch.manual_seed(0)
scale = d**(-0.5)
if varlen:
seq_lens = torch.empty(bs).normal_(mean_seq_len, mean_seq_len / 2)
seq_lens = seq_lens.clip(2).to(torch.int32)
else:
seq_lens = torch.full((bs, ), mean_seq_len, dtype=torch.int32)
max_seq_len = seq_lens.max().item()
seqlen_pad = cdiv(max_seq_len, 256) * 256 # is this necessary?
q = torch.randn(bs, h_q, d)
block_table = torch.arange(bs * seqlen_pad // block_size,
dtype=torch.int32)
block_table = block_table.view(bs, seqlen_pad // block_size)
kv_cache = torch.randn(block_table.numel(), block_size, d)
for i, seq_len in enumerate(seq_lens.tolist()):
kv_cache.view(bs, seqlen_pad, d)[i, seq_len:] = float("nan")
out_mla = q.new_zeros(bs, h_q, dv)
ops.mla_decode_kvcache_cpu(out_mla, q, kv_cache, scale, block_table,
seq_lens)
out_ref = q.new_zeros(bs, h_q, dv)
ref_mla(out_ref, q, kv_cache, scale, block_table, seq_lens)
assert not out_mla.isnan().any(), "Likely read out of bounds"
torch.testing.assert_close(out_mla, out_ref)

View File

@ -3,8 +3,11 @@
Run `pytest tests/kernels/test_moe.py`.
"""
import pytest
import torch
from torch.nn import Parameter
from torch.nn import functional as F
from transformers import MixtralConfig
from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
@ -37,6 +40,7 @@ TOP_KS = [2, 6]
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("ep_size", EP_SIZE)
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("padding", [True, False])
def test_fused_moe(
m: int,
n: int,
@ -45,6 +49,7 @@ def test_fused_moe(
topk: int,
ep_size: int,
dtype: torch.dtype,
padding: bool,
):
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
@ -65,16 +70,7 @@ def test_fused_moe(
else:
e_map = None
triton_output = fused_moe(a,
w1,
w2,
score,
topk,
global_num_experts=e,
expert_map=e_map,
renormalize=False)
torch_output = torch_moe(a, w1, w2, score, topk, e_map)
torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
iterative_output = iterative_moe(a,
w1,
w2,
@ -83,6 +79,23 @@ def test_fused_moe(
global_num_experts=e,
expert_map=e_map,
renormalize=False)
# Pad the weight if moe padding is enabled
if padding:
w1 = F.pad(w1, (0, 128), "constant", 0)[..., 0:-128]
torch.cuda.empty_cache()
w2 = F.pad(w2, (0, 128), "constant", 0)[..., 0:-128]
torch.cuda.empty_cache()
triton_output = fused_moe(a,
w1,
w2,
score,
topk,
global_num_experts=e,
expert_map=e_map,
renormalize=False)
torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
torch.testing.assert_close(iterative_output,
torch_output,
atol=2e-2,
@ -202,8 +215,9 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
@pytest.mark.parametrize("dtype",
[torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("padding", [True, False])
@torch.inference_mode()
def test_mixtral_moe(dtype: torch.dtype):
def test_mixtral_moe(dtype: torch.dtype, padding: bool):
"""Make sure our Mixtral MoE implementation agrees with the one from
huggingface."""
@ -233,6 +247,17 @@ def test_mixtral_moe(dtype: torch.dtype):
# vLLM uses 1D query [num_tokens, hidden_dim]
vllm_inputs = hf_inputs.flatten(0, 1)
# Pad the weight if moe padding is enabled
if padding:
vllm_moe.experts.w13_weight = Parameter(F.pad(
vllm_moe.experts.w13_weight, (0, 128), "constant", 0)[..., 0:-128],
requires_grad=False)
torch.cuda.empty_cache()
vllm_moe.experts.w2_weight = Parameter(F.pad(
vllm_moe.experts.w2_weight, (0, 128), "constant", 0)[..., 0:-128],
requires_grad=False)
torch.cuda.empty_cache()
# Run forward passes for both MoE blocks
hf_states, _ = hf_moe.forward(hf_inputs)
vllm_states = vllm_moe.forward(vllm_inputs)

View File

@ -163,24 +163,24 @@ VLM_TEST_SETTINGS = {
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
#### Extended model tests
# "aria": VLMTestInfo(
# models=["rhymes-ai/Aria"],
# test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
# prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
# img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
# max_model_len=4096,
# max_num_seqs=2,
# auto_cls=AutoModelForImageTextToText,
# single_image_prompts=IMAGE_ASSETS.prompts({
# "stop_sign": "<vlm_image>Please describe the image shortly.",
# "cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
# }),
# multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
# stop_str=["<|im_end|>"],
# image_size_factors=[(0.10, 0.15)],
# max_tokens=64,
# marks=[large_gpu_mark(min_gb=64)],
# ),
"aria": VLMTestInfo(
models=["rhymes-ai/Aria"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<vlm_image>Please describe the image shortly.",
"cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
}),
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
stop_str=["<|im_end|>"],
image_size_factors=[(0.10, 0.15)],
max_tokens=64,
marks=[large_gpu_mark(min_gb=64)],
),
"blip2": VLMTestInfo(
models=["Salesforce/blip2-opt-2.7b"],
test_type=VLMTestType.IMAGE,
@ -352,6 +352,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda vid_prompt: f"USER: {vid_prompt} ASSISTANT:",
num_video_frames=16,
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
),
@ -384,7 +385,7 @@ VLM_TEST_SETTINGS = {
),
"minicpmo_26": VLMTestInfo(
models=["openbmb/MiniCPM-o-2_6"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
test_type=(VLMTestType.IMAGE),
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
max_model_len=4096,
@ -393,9 +394,21 @@ VLM_TEST_SETTINGS = {
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
),
"minicpmo_26_multi_image": VLMTestInfo(
models=["openbmb/MiniCPM-o-2_6"],
test_type=(VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
max_model_len=4096,
max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
marks=[large_gpu_mark(min_gb=32)],
),
"minicpmv_26": VLMTestInfo(
models=["openbmb/MiniCPM-V-2_6"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
test_type=(VLMTestType.IMAGE),
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
max_model_len=4096,
@ -404,6 +417,18 @@ VLM_TEST_SETTINGS = {
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
),
"minicpmv_26_multi_image": VLMTestInfo(
models=["openbmb/MiniCPM-V-2_6"],
test_type=(VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
max_model_len=4096,
max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
marks=[large_gpu_mark(min_gb=32)],
),
"molmo": VLMTestInfo(
models=["allenai/Molmo-7B-D-0924"],
test_type=(VLMTestType.IMAGE),

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import copy
from functools import partial
from typing import Optional, Union
@ -29,7 +28,7 @@ def _test_processing_correctness(
hit_rate: float,
num_batches: int,
simplify_rate: float,
ignore_mm_keys: Optional[list[str]] = None,
ignore_mm_keys: Optional[set[str]] = None,
):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_available_online(on_fail="skip")
@ -145,7 +144,7 @@ def _test_processing_correctness_hf(
baseline_processor: BaseMultiModalProcessor,
cached_processor: BaseMultiModalProcessor,
batch_idx: int,
ignore_mm_keys: Optional[list[str]] = None,
ignore_mm_keys: Optional[set[str]] = None,
):
if model_config.hf_config.model_type in ("mllama", "whisper", "ultravox"):
# For some multimodal models, tokenizer will always add bos_token
@ -167,11 +166,12 @@ def _test_processing_correctness_hf(
hf_processor_mm_kwargs={},
)
assert _inputs_equal(
_assert_inputs_equal(
baseline_result,
cached_result,
ignore_mm_keys,
), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
)
baseline_tokenized_result = baseline_processor.apply(
token_prompt,
@ -179,11 +179,12 @@ def _test_processing_correctness_hf(
hf_processor_mm_kwargs={},
)
assert _inputs_equal(
_assert_inputs_equal(
baseline_result,
baseline_tokenized_result,
ignore_mm_keys,
), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
)
cached_tokenized_result = cached_processor.apply(
token_prompt,
@ -191,11 +192,12 @@ def _test_processing_correctness_hf(
hf_processor_mm_kwargs={},
)
assert _inputs_equal(
_assert_inputs_equal(
cached_result,
cached_tokenized_result,
ignore_mm_keys,
), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
)
def _test_processing_correctness_mistral(
@ -206,7 +208,7 @@ def _test_processing_correctness_mistral(
baseline_processor: BaseMultiModalProcessor,
cached_processor: BaseMultiModalProcessor,
batch_idx: int,
ignore_mm_keys: Optional[list[str]] = None,
ignore_mm_keys: Optional[set[str]] = None,
):
images = mm_data.get("image", [])
if not isinstance(images, list):
@ -233,11 +235,12 @@ def _test_processing_correctness_mistral(
hf_processor_mm_kwargs={},
)
assert _inputs_equal(
_assert_inputs_equal(
baseline_tokenized_result,
cached_tokenized_result,
ignore_mm_keys,
), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
)
# yapf: disable
@ -261,6 +264,7 @@ def _test_processing_correctness_mistral(
"TIGER-Lab/Mantis-8B-siglip-llama3",
"mistralai/Pixtral-12B-2409",
"mistral-community/pixtral-12b",
"openbmb/MiniCPM-Llama3-V-2_5",
"openbmb/MiniCPM-o-2_6",
"openbmb/MiniCPM-V-2_6",
"allenai/Molmo-7B-D-0924",
@ -290,7 +294,7 @@ def test_processing_correctness(
# In Ultravox, the audio_features can be different depending on padding
# The slight difference should not be a problem though, since
# attention_mask lets us ignore the difference.
ignore_mm_keys = ['audio_features']
ignore_mm_keys = {"audio_features"}
_test_processing_correctness(
model_id,
@ -328,38 +332,26 @@ def test_processing_correctness_phi3v(
)
def _inputs_equal(
def _assert_inputs_equal(
a: MultiModalInputs,
b: MultiModalInputs,
ignore_mm_keys: Optional[list[str]] = None,
*,
ignore_mm_keys: Optional[set[str]] = None,
msg: str = "",
):
return _drop_mm_kwargs_keys(a, ignore_mm_keys) == _drop_mm_kwargs_keys(
b, ignore_mm_keys)
if ignore_mm_keys is None:
ignore_mm_keys = set()
if msg is None:
assert "mm_kwargs" in a and "mm_kwargs" in b
else:
assert "mm_kwargs" in a and "mm_kwargs" in b, msg
def _drop_mm_kwargs_keys(
result: MultiModalInputs,
ignore_mm_keys: Optional[list[str]] = None,
) -> MultiModalInputs:
"""Drop specified keys from result['mm_kwargs'].
for key in ignore_mm_keys:
a["mm_kwargs"].pop(key, None)
b["mm_kwargs"].pop(key, None)
This is mainly to avoid doing exact match of audio_features in ultravox.
Args:
result: Result to drop keys from
ignore_mm_keys: List of keys to ignore, e.g. ['audio_features']
"""
if not ignore_mm_keys:
return result
if 'mm_kwargs' in result:
result = copy.deepcopy(result)
mm_kwargs = result['mm_kwargs']
for key in ignore_mm_keys:
mm_kwargs.pop(key, None)
for items in mm_kwargs._items_by_modality.values():
for item in items:
for key in ignore_mm_keys:
item.pop(key, None)
return result
if msg is None:
assert a == b
else:
assert a == b, msg

View File

@ -56,7 +56,7 @@ def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
def maybe_assert_ngram_worker(llm):
# Verify the proposer worker is ngram if ngram is specified.
if (llm.llm_engine.speculative_config is not None
and llm.llm_engine.speculative_config.ngram_prompt_lookup_max > 0):
and llm.llm_engine.speculative_config.method == "ngram"):
from vllm.spec_decode.ngram_worker import NGramWorker
assert isinstance(
llm.llm_engine.model_executor.driver_worker.proposer_worker,

View File

@ -7,28 +7,39 @@ from vllm import SamplingParams
from .conftest import get_output_from_llm_generator
@pytest.mark.parametrize("common_llm_kwargs", [{
"model": "meta-llama/Llama-3.2-1B-Instruct",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
}])
@pytest.mark.parametrize("common_llm_kwargs",
[{
"model": "meta-llama/Llama-3.2-1B-Instruct",
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
[
{
# Speculative max model len > overridden max model len should raise.
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"max_model_len": 129,
},
"max_model_len": 128,
"speculative_max_model_len": 129,
},
{
# Speculative max model len > draft max model len should raise.
# https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
"speculative_max_model_len": 2048 + 1,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"max_model_len": 2048 + 1,
},
},
{
# Speculative max model len > target max model len should raise.
# https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
"speculative_max_model_len": 131072 + 1,
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"max_model_len": 131072 + 1,
},
},
])
@pytest.mark.parametrize("test_llm_kwargs", [{}])

View File

@ -57,8 +57,10 @@ PRECISION = "float32"
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize("output_len", [
@ -95,18 +97,19 @@ def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": SPEC_MODEL,
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs_during_spec_decoding": False,
"disable_logprobs": False,
},
{
"speculative_model": SPEC_MODEL,
}, {
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs_during_spec_decoding": True,
"disable_logprobs": True,
},
])
}])
@pytest.mark.parametrize("output_len", [
128,
])
@ -119,18 +122,19 @@ def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
batch_size: int, output_len: int, seed: int,
logprobs: int):
run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
logprobs=logprobs,
prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs[
'disable_logprobs_during_spec_decoding'])
run_equality_correctness_test(
vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
logprobs=logprobs,
prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs["speculative_config"]
["disable_logprobs"])
@pytest.mark.parametrize(
@ -151,8 +155,10 @@ def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize("output_len", [
@ -193,8 +199,10 @@ def test_eagle_e2e_greedy_correctness_cuda_graph(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize(
@ -236,8 +244,10 @@ def test_eagle_e2e_greedy_correctness_with_preemption(
"test_llm_kwargs",
[
{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": k,
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": k,
},
}
# Try a range of num. speculative tokens
for k in range(1, 1 + MAX_SPEC_TOKENS)
@ -277,12 +287,13 @@ def test_eagle_different_k(vllm_runner, common_llm_kwargs,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_disable_by_batch_size": 4
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_by_batch_size": 4,
},
}])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
"output_len",
@ -324,8 +335,10 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "yuhuili/EAGLE-llama2-chat-7B",
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"model": "yuhuili/EAGLE-llama2-chat-7B",
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize(
@ -372,8 +385,10 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize(
@ -420,8 +435,10 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "yuhuili/EAGLE-Qwen2-7B-Instruct",
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"model": "yuhuili/EAGLE-Qwen2-7B-Instruct",
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize(

View File

@ -23,8 +23,10 @@ MAIN_MODEL = "JackFram/llama-68m"
[
{
# Identical models.
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
},
},
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@ -57,26 +59,33 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
"speculative_model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
"num_speculative_tokens": 5,
},
])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [])
@pytest.mark.parametrize(
"test_llm_kwargs",
[
# Explicitly specify draft model quantization
{
"speculative_model_quantization": "gptq",
"speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
"num_speculative_tokens": 5,
"quantization": "gptq",
},
},
# Explicitly specify GPTQ-based draft model to use marlin quantization
{
"speculative_model_quantization": "marlin",
"speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
"num_speculative_tokens": 5,
"quantization": "marlin",
},
},
# Do not explicitly specify draft model quantization
{
"speculative_model_quantization": None,
"speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
"num_speculative_tokens": 5,
"quantization": None,
},
},
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@ -107,15 +116,16 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_disable_mqa_scorer": True,
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"disable_mqa_scorer": True,
},
}])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
"output_len",
@ -127,7 +137,7 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
output_len: int, seed: int):
"""Verify that ngram speculative decoding generates the same output
"""Verify that speculative decoding generates the same output
with batch expansion scorer and mqa scorer.
"""
run_equality_correctness_test(vllm_runner,

View File
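
For reference, the three draft-quantization cases above differ only in the nested "quantization" value; passing None presumably defers to the draft checkpoint's own quantization config (that reading is an assumption, the hunk only names the cases):

# The same three cases, written as a comprehension over the quantization value.
draft_quant_cases = [
    {
        "speculative_config": {
            "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
            "num_speculative_tokens": 5,
            "quantization": quant,
        },
    }
    for quant in ("gptq", "marlin", None)
]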

@ -27,18 +27,19 @@ from .conftest import run_equality_correctness_test_tp
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("test_llm_kwargs", [
[
"--speculative-model",
"JackFram/llama-68m",
"--num-speculative-tokens",
"3",
"--speculative_config",
str({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
}),
],
[
"--speculative-model",
"[ngram]",
"--num-speculative-tokens",
"5",
"--ngram-prompt-lookup-max",
"3",
"--speculative_config",
str({
"model": "ngram",
"num_speculative_tokens": 5,
"prompt_lookup_max": 3,
}),
],
])
@pytest.mark.parametrize("batch_size", [2])
@ -83,23 +84,24 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
]])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative-model",
"JackFram/llama-68m",
"--num_speculative-tokens",
"5",
"--speculative-draft-tensor-parallel-size",
"1",
]),
("ibm-granite/granite-3b-code-instruct", [
"--speculative-model",
"ibm-granite/granite-3b-code-instruct",
"--num_speculative-tokens",
"5",
"--speculative-draft-tensor-parallel-size",
"1",
])])
@pytest.mark.parametrize(
"model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative_config",
str({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
}),
]),
("ibm-granite/granite-3b-code-instruct", [
"--speculative_config",
str({
"model": "ibm-granite/granite-3b-code-instruct",
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
}),
])])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
@ -144,18 +146,19 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative-model",
"JackFram/llama-68m",
"--num_speculative-tokens",
"3",
"--speculative_config",
str({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
}),
]),
("JackFram/llama-68m", [
"--speculative-model",
"JackFram/llama-68m",
"--num_speculative-tokens",
"3",
"--speculative-draft-tensor-parallel-size",
"1",
"--speculative_config",
str({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"draft_tensor_parallel_size": 1,
}),
])])
@pytest.mark.parametrize("logprobs", [None, 2])
@pytest.mark.parametrize("batch_size", [2])

View File
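
The tensor-parallel tests above now pass the whole nested config as a single CLI value, using str() on the dict exactly as shown; a stand-alone sketch of that arg-building pattern:

# One flag with a stringified dict replaces several --speculative-* flags.
spec_cli_args = [
    "--speculative_config",
    str({
        "model": "JackFram/llama-68m",
        "num_speculative_tokens": 3,
        "draft_tensor_parallel_size": 1,
    }),
]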

@ -24,12 +24,7 @@ SPEC_MODEL = "JackFram/llama-68m"
"4",
]])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
[
"--speculative-model",
f"{SPEC_MODEL}",
"--num-speculative-tokens",
"5",
],
[],
])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize(
@ -37,8 +32,12 @@ SPEC_MODEL = "JackFram/llama-68m"
[
#TODO(wooyeon): add spec_draft_dp=2 case
[
"--speculative-draft-tensor-parallel-size",
"1",
"--speculative_config",
str({
"model": f"{SPEC_MODEL}",
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
}),
],
])
@pytest.mark.parametrize("batch_size", [2])
@ -78,15 +77,14 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
"test_llm_kwargs",
[
[
"--speculative-model",
f"{SPEC_MODEL}",
"--num-speculative-tokens",
"5",
# Artificially limit the draft model max model len; this forces vLLM
# to skip speculation once the sequences grow beyond 32-k tokens.
"--speculative-max-model-len",
"32",
"--speculative_config",
str({
"model": f"{SPEC_MODEL}",
"num_speculative_tokens": 5,
"max_model_len": 32,
}),
],
])
@pytest.mark.parametrize("batch_size", [8])

View File

@ -20,16 +20,19 @@ from .conftest import run_equality_correctness_test
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False,
}, {
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": True,
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"disable_logprobs": False,
},
}, {
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"disable_logprobs": True,
},
}])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize(
"output_len",
@ -48,19 +51,20 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
as well as with and without chunked prefill.
"""
maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs)
run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
temperature=0.0,
logprobs=logprobs,
prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs[
'disable_logprobs_during_spec_decoding'])
run_equality_correctness_test(
vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
temperature=0.0,
logprobs=logprobs,
prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs["speculative_config"]
["disable_logprobs"])
@pytest.mark.parametrize(
@ -73,16 +77,19 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_model": "JackFram/llama-160m",
"num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False,
}, {
"speculative_model": "JackFram/llama-160m",
"num_speculative_tokens": 6,
"disable_logprobs_during_spec_decoding": False,
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": "JackFram/llama-160m",
"num_speculative_tokens": 3,
"disable_logprobs": False,
},
}, {
"speculative_config": {
"model": "JackFram/llama-160m",
"num_speculative_tokens": 6,
"disable_logprobs": False,
},
}])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize(
"output_len",
@ -98,18 +105,19 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
output_len: int, seed: int, logprobs: int):
"""Veriy logprob greedy equality with different speculation lens.
"""
run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
temperature=0.0,
logprobs=logprobs,
disable_logprobs=test_llm_kwargs[
'disable_logprobs_during_spec_decoding'])
run_equality_correctness_test(
vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
temperature=0.0,
logprobs=logprobs,
disable_logprobs=test_llm_kwargs["speculative_config"]
["disable_logprobs"])
@pytest.mark.parametrize(
@ -125,13 +133,15 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"test_llm_kwargs",
[{
"speculative_model": "JackFram/llama-160m",
"num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False,
# Artificially limit the draft model max model len; this forces vLLM
# to skip speculation once the sequences grow beyond 32-k tokens.
"speculative_max_model_len": 32,
"speculative_config": {
"model": "JackFram/llama-160m",
"num_speculative_tokens": 3,
"disable_logprobs": False,
# Artificially limit the draft model max model len; this forces
# vLLM to skip speculation once the sequences grow beyond 32-k
# tokens.
"max_model_len": 32,
},
}])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize(
@ -149,18 +159,19 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
seed: int, logprobs: int):
"""Verify logprobs greedy equality when some sequences skip speculation.
"""
run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
temperature=0.0,
logprobs=logprobs,
disable_logprobs=test_llm_kwargs[
'disable_logprobs_during_spec_decoding'])
run_equality_correctness_test(
vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
temperature=0.0,
logprobs=logprobs,
disable_logprobs=test_llm_kwargs["speculative_config"]
["disable_logprobs"])
@pytest.mark.parametrize(
@ -173,12 +184,13 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_model": "JackFram/llama-160m",
"num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False,
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": "JackFram/llama-160m",
"num_speculative_tokens": 3,
"disable_logprobs": False,
},
}])
@pytest.mark.parametrize("batch_size", [1])
@pytest.mark.parametrize(
"output_len",
@ -248,12 +260,13 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": True,
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"disable_logprobs": True,
},
}])
@pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize(
@ -270,15 +283,16 @@ def test_logprobs_disabled(vllm_runner, common_llm_kwargs,
"""Check the behavior when logprobs are disabled.
Token choices should match with the base model.
"""
run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
temperature=0.0,
logprobs=logprobs,
disable_logprobs=test_llm_kwargs[
'disable_logprobs_during_spec_decoding'])
run_equality_correctness_test(
vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
temperature=0.0,
logprobs=logprobs,
disable_logprobs=test_llm_kwargs["speculative_config"]
["disable_logprobs"])

View File
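
The updated call sites above index test_llm_kwargs["speculative_config"]["disable_logprobs"] directly, which is fine because every parametrization sets the key. A small hypothetical helper in the same spirit for cases where it may be absent:

def get_spec_flag(llm_kwargs: dict, key: str, default=False):
    """Read a flag from the nested speculative_config, with a default."""
    return llm_kwargs.get("speculative_config", {}).get(key, default)

# Same shape of kwargs as the parametrizations above.
sample_kwargs = {
    "speculative_config": {
        "model": "JackFram/llama-68m",
        "num_speculative_tokens": 3,
        "disable_logprobs": True,
    },
}
assert get_spec_flag(sample_kwargs, "disable_logprobs") is True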

@ -60,8 +60,10 @@ PRECISION = "float32"
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize("output_len", [
@ -107,14 +109,18 @@ def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs_during_spec_decoding": False,
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs": False,
},
},
{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs_during_spec_decoding": True,
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs": True,
},
},
])
@pytest.mark.parametrize("output_len", [
@ -132,19 +138,20 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
prefill_chunk_size: int):
"""Verify greedy equality with different batch size."""
maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0,
logprobs=logprobs,
prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs[
'disable_logprobs_during_spec_decoding'])
run_equality_correctness_test(
vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0,
logprobs=logprobs,
prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs["speculative_config"]
["disable_logprobs"])
@pytest.mark.parametrize(
@ -165,8 +172,10 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize("output_len", [
@ -214,8 +223,10 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize(
@ -264,8 +275,10 @@ def test_medusa_e2e_greedy_correctness_with_preemption(
"test_llm_kwargs",
[
{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": k,
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": k,
},
}
# Try a range of num. speculative tokens
for k in range(1, 1 + MAX_SPEC_TOKENS)
@ -312,12 +325,13 @@ def test_medusa_different_k(vllm_runner, common_llm_kwargs,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_disable_by_batch_size": 4
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_by_batch_size": 4,
},
}])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
"output_len",
@ -359,16 +373,17 @@ def test_medusa_disable_queue(vllm_runner, common_llm_kwargs,
# Main model
"model_name": MAIN_MODEL,
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_disable_by_batch_size": 4
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_disable_mqa_scorer": True,
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_by_batch_size": 4,
"disable_mqa_scorer": True,
},
}])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
"output_len",

View File

@ -62,7 +62,9 @@ PRECISION = "float32"
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": SPEC_MODEL,
"speculative_config": {
"model": SPEC_MODEL,
},
},
])
@pytest.mark.parametrize("output_len", [
@ -108,12 +110,16 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": SPEC_MODEL,
"disable_logprobs_during_spec_decoding": False,
"speculative_config": {
"model": SPEC_MODEL,
"disable_logprobs": False,
},
},
{
"speculative_model": SPEC_MODEL,
"disable_logprobs_during_spec_decoding": True,
"speculative_config": {
"model": SPEC_MODEL,
"disable_logprobs": True,
},
},
])
@pytest.mark.parametrize("output_len", [8])
@ -133,19 +139,20 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
# up sampling different tokens at the tail (i.e. top tokens don't change).
# TL;DR: sd+cp == org+cp but sd+cp != org ... is this expected?
maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs)
run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0,
logprobs=logprobs,
prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs[
'disable_logprobs_during_spec_decoding'])
run_equality_correctness_test(
vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0,
logprobs=logprobs,
prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs["speculative_config"]
["disable_logprobs"])
@pytest.mark.parametrize(
@ -167,7 +174,9 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": SPEC_MODEL,
"speculative_config": {
"model": SPEC_MODEL,
},
},
])
@pytest.mark.parametrize("output_len", [2048])
@ -209,8 +218,10 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs,
# Main model
"model_name": MAIN_MODEL,
# Speculative model
"speculative_model": SPEC_MODEL,
# Speculative config
"speculative_config": {
"model": SPEC_MODEL,
},
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
@ -274,7 +285,9 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": SPEC_MODEL,
"speculative_config": {
"model": SPEC_MODEL,
},
},
])
@pytest.mark.parametrize(
@ -326,7 +339,9 @@ def test_mlp_e2e_greedy_correctness_with_preemption(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": SPEC_MODEL,
"speculative_config": {
"model": SPEC_MODEL,
},
},
])
@pytest.mark.parametrize(
@ -382,8 +397,10 @@ def test_mlp_e2e_greedy_correctness_with_padding(
"test_llm_kwargs",
[
{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": k,
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": k,
},
}
# Try a range of num. speculative tokens
for k in range(1, 1 + MAX_SPEC_TOKENS)
@ -430,11 +447,12 @@ def test_mlp_different_k(vllm_runner, common_llm_kwargs,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_model": SPEC_MODEL,
"speculative_disable_by_batch_size": 4
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": SPEC_MODEL,
"disable_by_batch_size": 4,
},
}])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
"output_len",
@ -475,14 +493,15 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
"speculative_model": SPEC_MODEL,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_disable_mqa_scorer": True,
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": SPEC_MODEL,
"disable_mqa_scorer": True,
},
}])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
"output_len",

View File

@ -57,7 +57,9 @@ PRECISION = "bfloat16"
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize("output_len", [
@ -99,12 +101,16 @@ def test_mtp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs_during_spec_decoding": False,
"speculative_config": {
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs": False,
},
},
{
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs_during_spec_decoding": True,
"speculative_config": {
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs": True,
},
},
])
@pytest.mark.parametrize("output_len", [
@ -119,18 +125,19 @@ def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
batch_size: int, output_len: int, seed: int,
logprobs: int):
run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
logprobs=logprobs,
prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs[
'disable_logprobs_during_spec_decoding'])
run_equality_correctness_test(
vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
logprobs=logprobs,
prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs["speculative_config"]
["disable_logprobs"])
@pytest.mark.parametrize(
@ -152,7 +159,9 @@ def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize("output_len", [
@ -198,7 +207,9 @@ def test_mtp_e2e_greedy_correctness_cuda_graph(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize(
@ -243,7 +254,9 @@ def test_mtp_e2e_greedy_correctness_with_preemption(
"test_llm_kwargs",
[
{
"num_speculative_tokens": k,
"speculative_config": {
"num_speculative_tokens": k,
},
}
# Try a range of num. speculative tokens
for k in range(1, 1 + MAX_SPEC_TOKENS)
@ -286,11 +299,12 @@ def test_mtp_different_k(vllm_runner, common_llm_kwargs,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_disable_by_batch_size": 4
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_by_batch_size": 4
},
}])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
"output_len",

View File

@ -61,15 +61,19 @@ from .conftest import (get_output_from_llm_generator,
"per_test_common_llm_kwargs",
[
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
},
"enable_chunked_prefill": False,
},
{
# Chunked prefill enabled with small value
# to make sure we get mixed batches.
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
},
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
"max_num_seqs": 4
@ -148,20 +152,23 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
},
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": False,
"disable_logprobs_during_spec_decoding": False
}, {
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
"max_num_seqs": 4,
"disable_logprobs_during_spec_decoding": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"disable_logprobs": False,
},
"enable_chunked_prefill": False,
}, {
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"disable_logprobs": False,
},
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
"max_num_seqs": 4,
}])
@pytest.mark.parametrize(
"output_len",
[
@ -184,7 +191,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
whether all speculative tokens are accepted.
"""
ensure_all_accepted = per_test_common_llm_kwargs.get(
"model_name") == test_llm_kwargs.get("speculative_model")
"model_name") == test_llm_kwargs.get("speculative_config")["model"]
run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
@ -224,13 +231,17 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
},
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
},
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
"max_num_seqs": 4
@ -283,13 +294,17 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
},
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
},
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
"max_num_seqs": 4
@ -336,13 +351,17 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
},
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
},
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
"max_num_seqs": 4
@ -391,13 +410,17 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
},
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
},
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
"max_num_seqs": 4
@ -449,13 +472,17 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
},
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
},
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
"max_num_seqs": 4
@ -514,13 +541,17 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
},
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
},
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
"max_num_seqs": 4
@ -567,21 +598,25 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
"test_llm_kwargs",
[
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
# Artificially limit the draft model max model len; this forces vLLM
# to skip speculation once the sequences grow beyond 32-k tokens.
"speculative_max_model_len": 32,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"max_model_len": 32,
},
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"max_model_len": 32,
},
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
"max_num_seqs": 4,
"speculative_max_model_len": 32,
},
])
@pytest.mark.parametrize("batch_size", [8])
@ -627,15 +662,19 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_disable_by_batch_size": 2,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"disable_by_batch_size": 2,
},
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_disable_by_batch_size": 2,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"disable_by_batch_size": 2,
},
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
"max_num_seqs": 4,
@ -676,15 +715,19 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
"test_llm_kwargs",
[
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": k,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": k,
},
"enable_chunked_prefill": False,
}
# Try a range of common k, as well as large speculation.
for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63]
] + [{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": k,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": k,
},
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
"max_num_seqs": 4,
@ -729,17 +772,21 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
"test_llm_kwargs",
[
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": k,
"spec_decoding_acceptance_method": "typical_acceptance_sampler",
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": k,
"acceptance_method": "typical_acceptance_sampler",
},
"enable_chunked_prefill": False
}
# Try a range of common k.
for k in [1, 2, 3]
] + [{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": k,
"spec_decoding_acceptance_method": "typical_acceptance_sampler",
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": k,
"acceptance_method": "typical_acceptance_sampler",
},
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
"max_num_seqs": 4

View File

@ -48,16 +48,20 @@ from .conftest import run_equality_correctness_test
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "[ngram]",
"num_speculative_tokens": 5,
"ngram_prompt_lookup_max": 3,
"speculative_disable_mqa_scorer": False,
"speculative_config": {
"method": "ngram",
"num_speculative_tokens": 5,
"prompt_lookup_max": 3,
"disable_mqa_scorer": False,
},
},
{
"speculative_model": "[ngram]",
"num_speculative_tokens": 5,
"ngram_prompt_lookup_max": 3,
"speculative_disable_mqa_scorer": True,
"speculative_config": {
"method": "ngram",
"num_speculative_tokens": 5,
"prompt_lookup_max": 3,
"disable_mqa_scorer": True,
},
},
])
@pytest.mark.parametrize("output_len", [
@ -101,16 +105,20 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "[ngram]",
"num_speculative_tokens": 5,
"ngram_prompt_lookup_max": 3,
"disable_logprobs_during_spec_decoding": False,
"speculative_config": {
"method": "ngram",
"num_speculative_tokens": 5,
"prompt_lookup_max": 3,
"disable_logprobs": False,
},
},
{
"speculative_model": "[ngram]",
"num_speculative_tokens": 5,
"ngram_prompt_lookup_max": 3,
"disable_logprobs_during_spec_decoding": True,
"speculative_config": {
"method": "ngram",
"num_speculative_tokens": 5,
"prompt_lookup_max": 3,
"disable_logprobs": True,
},
},
])
@pytest.mark.parametrize("output_len", [
@ -125,19 +133,20 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
batch_size: int, output_len: int, seed: int,
logprobs: int):
"""Verify greedy equality on a tiny model with different batch size."""
run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0,
logprobs=logprobs,
prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs[
'disable_logprobs_during_spec_decoding'])
run_equality_correctness_test(
vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0,
logprobs=logprobs,
prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs["speculative_config"]
["disable_logprobs"])
@pytest.mark.parametrize(
@ -159,17 +168,21 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "[ngram]",
"num_speculative_tokens": 5,
"ngram_prompt_lookup_max": 3,
"speculative_config": {
"method": "ngram",
"num_speculative_tokens": 5,
"prompt_lookup_max": 3,
},
"enable_chunked_prefill": False,
},
{
"speculative_model": "[ngram]",
"num_speculative_tokens": 5,
"ngram_prompt_lookup_max": 3,
"speculative_config": {
"method": "ngram",
"num_speculative_tokens": 5,
"prompt_lookup_max": 3,
"disable_mqa_scorer": True,
},
"enable_chunked_prefill": True,
"speculative_disable_mqa_scorer": True,
"max_num_batched_tokens": 4,
"max_num_seqs": 4
},
@ -214,17 +227,21 @@ def test_ngram_e2e_greedy_correctness_with_preemption(
"test_llm_kwargs",
[
{
"speculative_model": "[ngram]",
"num_speculative_tokens": k,
"ngram_prompt_lookup_max": 3,
"speculative_config": {
"method": "ngram",
"num_speculative_tokens": k,
"prompt_lookup_max": 3,
},
}
# Try a range of common k, as well as large speculation.
for k in [1, 3, 5]
] + [
{
"speculative_model": "[ngram]",
"num_speculative_tokens": k,
"ngram_prompt_lookup_max": 1,
"speculative_config": {
"method": "ngram",
"num_speculative_tokens": k,
"prompt_lookup_max": 1,
},
}
# Try a range of common k, as well as large speculation.
for k in [1, 3, 5]
@ -243,7 +260,7 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs,
seed: int):
"""Verify that ngram speculative decoding produces exact equality
to without spec decode with many different values of k and
different ngram_prompt_lookup_max.
different ngram prompt_lookup_max.
"""
run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
@ -266,22 +283,25 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_model": "[ngram]",
"num_speculative_tokens": 5,
"ngram_prompt_lookup_max": 3,
"speculative_disable_by_batch_size": 4
}, {
"speculative_model": "[ngram]",
"num_speculative_tokens": 5,
"ngram_prompt_lookup_max": 3,
"speculative_disable_by_batch_size": 4,
"enable_chunked_prefill": True,
"speculative_disable_mqa_scorer": True,
"max_num_batched_tokens": 4,
"max_num_seqs": 4
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"method": "ngram",
"num_speculative_tokens": 5,
"prompt_lookup_max": 3,
"disable_by_batch_size": 4
},
}, {
"speculative_config": {
"method": "ngram",
"num_speculative_tokens": 5,
"prompt_lookup_max": 3,
"disable_by_batch_size": 4,
"disable_mqa_scorer": True,
},
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
"max_num_seqs": 4
}])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
"output_len",
@ -296,7 +316,7 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
seed: int):
"""Verify that ngram speculative decoding produces exact equality
to without spec decode with many different values of k and
different ngram_prompt_lookup_max.
different ngram prompt_lookup_max.
"""
run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
@ -316,18 +336,17 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Required for spec decode.
"speculative_model": "[ngram]",
"num_speculative_tokens": 5,
"ngram_prompt_lookup_max": 3,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_disable_mqa_scorer": True,
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"method": "ngram",
"num_speculative_tokens": 5,
"prompt_lookup_max": 3,
"disable_mqa_scorer": True,
},
}])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
"output_len",

View File
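
Taken together, the hunks above fold a set of flat engine kwargs into one nested speculative_config dict. An illustrative mapping compiled only from the names that appear in these diffs (not an exhaustive list of the config's fields):

# Old flat kwarg -> key inside speculative_config, as seen in the hunks above.
OLD_TO_NEW = {
    "speculative_model": "model",
    "speculative_model_quantization": "quantization",
    "speculative_max_model_len": "max_model_len",
    "speculative_draft_tensor_parallel_size": "draft_tensor_parallel_size",
    "speculative_disable_by_batch_size": "disable_by_batch_size",
    "speculative_disable_mqa_scorer": "disable_mqa_scorer",
    "ngram_prompt_lookup_max": "prompt_lookup_max",
    "num_speculative_tokens": "num_speculative_tokens",
    "disable_logprobs_during_spec_decoding": "disable_logprobs",
    "spec_decoding_acceptance_method": "acceptance_method",
}
# Note: the old sentinel speculative_model="[ngram]" now becomes
# "method": "ngram" rather than a "model" entry.

def to_speculative_config(old_kwargs: dict) -> dict:
    """Collect legacy flat kwargs into a single nested speculative_config."""
    return {OLD_TO_NEW[k]: v for k, v in old_kwargs.items() if k in OLD_TO_NEW}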

@ -19,11 +19,11 @@ SPEC_MODEL = "JackFram/llama-160m"
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# speculative model
"speculative_model": "JackFram/llama-160m",
# num speculative tokens
"num_speculative_tokens": 3,
# speculative config
"speculative_config": {
"model": "JackFram/llama-160m",
"num_speculative_tokens": 3,
},
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])

View File

@ -39,7 +39,10 @@ def ensure_system_prompt(messages: list[dict[str, Any]],
# universal args for all models go here. also good if you need to test locally
# and change type or KV cache quantization or something.
ARGS: list[str] = ["--enable-auto-tool-choice", "--max-model-len", "1024"]
ARGS: list[str] = [
"--enable-auto-tool-choice", "--max-model-len", "1024", "--max-num-seqs",
"256"
]
CONFIGS: dict[str, ServerConfig] = {
"hermes": {

View File

@ -5,92 +5,96 @@ import os
import tempfile
import depyf
import pytest
from vllm.config import CompilationLevel
temp_dir = tempfile.mkdtemp()
with depyf.prepare_debug(temp_dir):
from vllm import LLM, SamplingParams
prompts = [
"A robot may not injure a human being",
"It is only with the heart that one can see rightly;",
"The greatest glory in living lies not in never falling,",
]
answers = [
" or, through inaction, allow a human being to come to harm.",
" what is essential is invisible to the eye.",
" but in rising every time we fall.",
]
N = 1
# Currently, top-p sampling is disabled. `top_p` should be 1.0.
sampling_params = SamplingParams(temperature=0.7,
top_p=1.0,
n=N,
max_tokens=16)
@pytest.mark.skip(reason="Not working; needs investigation.")
def test_tpu_compilation():
temp_dir = tempfile.mkdtemp()
with depyf.prepare_debug(temp_dir):
from vllm import LLM, SamplingParams
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforce_eager` should be `False`.
prompts = [
"A robot may not injure a human being",
"It is only with the heart that one can see rightly;",
"The greatest glory in living lies not in never falling,",
]
answers = [
" or, through inaction, allow a human being to come to harm.",
" what is essential is invisible to the eye.",
" but in rising every time we fall.",
]
N = 1
# Currently, top-p sampling is disabled. `top_p` should be 1.0.
sampling_params = SamplingParams(temperature=0.7,
top_p=1.0,
n=N,
max_tokens=16)
# disable custom dispatcher, let Dynamo take over
# all the control
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
max_model_len=512,
max_num_seqs=64,
enforce_eager=True,
compilation_config={"level": CompilationLevel.DYNAMO_AS_IS})
outputs = llm.generate(prompts, sampling_params)
for output, answer in zip(outputs, answers):
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
assert generated_text.startswith(answer)
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforce_eager` should be `False`.
compiled_codes = sorted(
glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
# disable custom dispatcher, let Dynamo take over
# all the control
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
max_model_len=512,
max_num_seqs=64,
enforce_eager=True,
compilation_config={"level": CompilationLevel.DYNAMO_AS_IS})
outputs = llm.generate(prompts, sampling_params)
for output, answer in zip(outputs, answers):
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
assert generated_text.startswith(answer)
for i, compiled_code in enumerate(compiled_codes):
print("{} file: {}".format(i + 1, compiled_code))
compiled_codes = sorted(
glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
# We should only trigger Dynamo compilation 4 times:
# 1. forward pass (symbolic)
# 2. compute_logits (symbolic)
# 3. forward pass (shape 16)
# 4. forward pass (shape 32)
# and later calls should not trigger Dynamo compilation again.
# NOTE: It might still trigger XLA compilation.
for i, compiled_code in enumerate(compiled_codes):
print("{} file: {}".format(i + 1, compiled_code))
# Check we have 4 compiled codes
assert len(compiled_codes) == 4
# We should only trigger Dynamo compilation 4 times:
# 1. forward pass (symbolic)
# 2. compute_logits (symbolic)
# 3. forward pass (shape 16)
# 4. forward pass (shape 32)
# and later calls should not trigger Dynamo compilation again.
# NOTE: It might still trigger XLA compilation.
kv_cache_prefix = "kv_cache"
attn_prefix = "ragged_paged_attention"
# Check we have 4 compiled codes
assert len(compiled_codes) == 4
# Check all the compilations are as expected
compiled_fns = sorted(
glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
kv_cache_prefix = "kv_cache"
attn_prefix = "ragged_paged_attention"
for i, compiled_fn in enumerate(compiled_fns):
print("{} file: {}".format(i + 1, compiled_fn))
# Check all the compilations are as expected
compiled_fns = sorted(
glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
# The first compilation is symbolic, so it should not have any kv_caches
with open(compiled_fns[0]) as f:
content = f.read()
assert kv_cache_prefix not in content
for i, compiled_fn in enumerate(compiled_fns):
print("{} file: {}".format(i + 1, compiled_fn))
# The second compilation is symbolic, so it should not have any kv_caches
with open(compiled_fns[1]) as f:
content = f.read()
assert kv_cache_prefix not in content
# The first compilation is symbolic, so it should not have any kv_caches
with open(compiled_fns[0]) as f:
content = f.read()
assert kv_cache_prefix not in content
# The third compilation is shape 16, so it should have kv_caches and the
# ragged_paged_attention
with open(compiled_fns[2]) as f:
content = f.read()
assert (kv_cache_prefix in content and attn_prefix in content)
# The second compilation is symbolic, so it should not have any kv_caches
with open(compiled_fns[1]) as f:
content = f.read()
assert kv_cache_prefix not in content
# The fourth compilation is shape 32, so it should have kv_caches and the
# ragged_paged_attention
with open(compiled_fns[3]) as f:
content = f.read()
assert (kv_cache_prefix in content and attn_prefix in content)
# The third compilation is shape 16, so it should have kv_caches and the
# ragged_paged_attention
with open(compiled_fns[2]) as f:
content = f.read()
assert (kv_cache_prefix in content and attn_prefix in content)
# The fourth compilation is shape 32, so it should have kv_caches and the
# ragged_paged_attention
with open(compiled_fns[3]) as f:
content = f.read()
assert (kv_cache_prefix in content and attn_prefix in content)

View File
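
A stand-alone sketch of the counting trick the TPU test above relies on: depyf dumps each Dynamo-transformed function as a __transformed_code*.py file (and each captured graph as __compiled_fn*Captured*.py) under the debug directory, so counting and reading those files is how the assertions work. The file-name patterns are taken from the test itself, not from depyf documentation:

import glob
import os

def count_dynamo_compilations(debug_dir: str) -> int:
    """Count the files depyf wrote for Dynamo-transformed code."""
    return len(glob.glob(os.path.join(debug_dir, "__transformed_code*.py")))

def captured_graph_sources(debug_dir: str) -> list[str]:
    """Return the captured-graph sources in sorted (compilation) order."""
    paths = sorted(
        glob.glob(os.path.join(debug_dir, "__compiled_fn*Captured*.py")))
    return [open(path).read() for path in paths]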

@ -70,12 +70,16 @@ def test_ngram_correctness(
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
del ref_llm
spec_llm = LLM(model=model_name,
speculative_model='[ngram]',
ngram_prompt_lookup_max=5,
ngram_prompt_lookup_min=3,
num_speculative_tokens=3,
max_model_len=1024)
spec_llm = LLM(
model=model_name,
speculative_config={
"method": "ngram",
"prompt_lookup_max": 5,
"prompt_lookup_min": 3,
"num_speculative_tokens": 3,
},
max_model_len=1024,
)
spec_outputs = spec_llm.chat(test_prompts, sampling_config)
matches = 0
misses = 0

View File

@ -11,11 +11,13 @@ from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST,
STOP_STRINGS,
DummyOutputProcessorTestVectors,
MockEngineCore)
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.sequence import PromptLogprobs, SampleLogprobs
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.output_processor import OutputProcessor
from vllm.v1.engine.output_processor import (OutputProcessor,
RequestOutputCollector)
from vllm.v1.metrics.stats import IterationStats
@ -834,3 +836,88 @@ def test_iteration_stats(dummy_test_vectors):
assert iteration_stats.num_prompt_tokens == 0
assert iteration_stats.num_generation_tokens == num_active
@pytest.mark.asyncio
async def test_request_output_collector():
NUM_REQS = 3
TEXT = "a"
def make_outputs() -> list[RequestOutput]:
return [
RequestOutput(
request_id="my-request-id",
prompt=None,
prompt_token_ids=[1, 2, 3],
prompt_logprobs=None,
outputs=[
CompletionOutput(
index=0,
text=TEXT,
token_ids=[idx],
cumulative_logprob=(idx + 1 * 1.0),
logprobs=[{
"a": idx,
"b": idx
}],
finish_reason="length" if
(idx == NUM_REQS - 1) else None,
)
],
finished=(idx == NUM_REQS - 1),
) for idx in range(NUM_REQS)
]
collector = RequestOutputCollector(RequestOutputKind.DELTA)
# CASE 1: Put then get.
outputs = make_outputs()
collector.put(outputs[0])
output = await collector.get()
assert not collector.ready.is_set()
assert collector.output is None
assert output.outputs[0].text == "a"
assert output.outputs[0].token_ids == [0]
# CASE 2: 2 puts then get.
num_to_put = 2
outputs = make_outputs()
for i in range(num_to_put):
collector.put(outputs[i])
output = await collector.get()
assert not collector.ready.is_set()
assert collector.output is None
assert not output.finished
# Text, token_ids, and logprobs should get merged.
assert output.outputs[0].text == TEXT * num_to_put
for tok_0, tok_1 in zip(output.outputs[0].token_ids,
list(range(num_to_put))):
assert tok_0 == tok_1
assert len(output.outputs[0].logprobs) == num_to_put
# Cumulative logprobs should be the last one.
cumulative_logprob_expected = 1.0 * num_to_put
assert output.outputs[0].cumulative_logprob == cumulative_logprob_expected
# CASE 3: Put all 3 (including a finished).
num_to_put = 3
outputs = make_outputs()
for i in range(num_to_put):
collector.put(outputs[i])
output = await collector.get()
assert not collector.ready.is_set()
assert collector.output is None
assert output.finished
assert output.outputs[0].finish_reason == "length"
# Text, token_ids, and logprobs should get merged.
assert output.outputs[0].text == TEXT * num_to_put
for tok_0, tok_1 in zip(output.outputs[0].token_ids,
list(range(num_to_put))):
assert tok_0 == tok_1
assert len(output.outputs[0].logprobs) == num_to_put
# Cumulative logprobs should be the last one.
cumulative_logprob_expected = 1.0 * num_to_put
assert output.outputs[0].cumulative_logprob == cumulative_logprob_expected

View File
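
A hedged consumption sketch built only from the collector behavior the test above exercises: put() queues and merges DELTA outputs, and get() awaits and returns the merged result. The producer/consumer split is illustrative, not taken from other vLLM code:

import asyncio

from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.output_processor import RequestOutputCollector

async def drain_request(collector: RequestOutputCollector) -> str:
    """Accumulate streamed text until the request reports finished."""
    text = ""
    while True:
        output = await collector.get()  # returns merged deltas since last get()
        text += output.outputs[0].text
        if output.finished:
            return text

# A producer elsewhere calls collector.put(request_output) per engine step;
# the consumer above would then run under asyncio, e.g.
#   asyncio.run(drain_request(RequestOutputCollector(RequestOutputKind.DELTA)))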

@ -13,7 +13,7 @@ from vllm.entrypoints.llm import LLM
from vllm.outputs import RequestOutput
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
GUIDED_DECODING_BACKENDS_V1 = ["xgrammar"]
GUIDED_DECODING_BACKENDS_V1 = ["xgrammar", "guidance"]
MODELS_TO_TEST = [
"Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410"
]
@ -30,12 +30,13 @@ def test_guided_json_completion(
model_name: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM(model=model_name, max_model_len=1024)
sampling_params = SamplingParams(temperature=1.0,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(
json=sample_json_schema,
backend=guided_decoding_backend))
llm = LLM(model=model_name,
max_model_len=1024,
guided_decoding_backend=guided_decoding_backend)
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
outputs = llm.generate(prompts=[
f"Give an example JSON for an employee profile "
f"that fits this schema: {sample_json_schema}"
@ -111,13 +112,14 @@ def test_guided_json_object(
model_name: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM(model=model_name, max_model_len=1024)
sampling_params = SamplingParams(temperature=1.0,
max_tokens=100,
n=2,
guided_decoding=GuidedDecodingParams(
json_object=True,
backend=guided_decoding_backend))
llm = LLM(model=model_name,
max_model_len=1024,
guided_decoding_backend=guided_decoding_backend)
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=100,
n=2,
guided_decoding=GuidedDecodingParams(json_object=True))
outputs = llm.generate(
prompts=("Generate a JSON object with curly braces for a person with "
@ -137,12 +139,20 @@ def test_guided_json_object(
# Parse to verify it is valid JSON
parsed_json = json.loads(generated_text)
assert isinstance(parsed_json, dict)
allowed_types: tuple[type, ...] = (dict, )
if guided_decoding_backend == "xgrammar":
# TODO - we are currently too permissive with xgrammar and
# allow any valid json (typically comes back as a list or
# object). We can fix this by specifying a jsonschema of
# {"type": "object"}, but we need this fix in a release
# first: https://github.com/mlc-ai/xgrammar/pull/264
allowed_types = (dict, list)
assert isinstance(parsed_json, allowed_types)
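To make the permissiveness noted in the TODO above concrete: a backend that only guarantees "any valid JSON" may legally emit a top-level list, while a schema of {"type": "object"} pins the result to a dict. A small standard-library check, purely illustrative:

import json

def is_json_object(text: str) -> bool:
    try:
        return isinstance(json.loads(text), dict)
    except json.JSONDecodeError:
        return False

assert is_json_object('{"name": "Ada"}')
assert not is_json_object('[1, 2, 3]')  # valid JSON, but not a JSON object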
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend",
GUIDED_DECODING_BACKENDS_V1)
GUIDED_DECODING_BACKENDS_V1 + ["auto"])
@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
def test_guided_json_unsupported_schema(
monkeypatch: pytest.MonkeyPatch,
@ -151,21 +161,43 @@ def test_guided_json_unsupported_schema(
model_name: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM(model=model_name, max_model_len=1024)
sampling_params = SamplingParams(temperature=1.0,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(
json=unsupported_json_schema,
backend=guided_decoding_backend))
with pytest.raises(ValueError,
match="The provided JSON schema contains features "
"not supported by xgrammar."):
llm.generate(prompts=[
f"Give an example JSON for an employee profile "
f"that fits this schema: {unsupported_json_schema}"
] * 2,
sampling_params=sampling_params,
use_tqdm=True)
llm = LLM(model=model_name,
max_model_len=1024,
guided_decoding_backend=guided_decoding_backend)
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))
if guided_decoding_backend == "xgrammar":
with pytest.raises(ValueError,
match="The provided JSON schema contains features "
"not supported by xgrammar."):
llm.generate(prompts=[
f"Give an example JSON for an employee profile "
f"that fits this schema: {unsupported_json_schema}"
] * 2,
sampling_params=sampling_params,
use_tqdm=True)
else:
# This should work for both "guidance" and "auto".
outputs = llm.generate(
prompts=("Give an example JSON object for a grade "
"that fits this schema: "
f"{unsupported_json_schema}"),
sampling_params=sampling_params,
use_tqdm=True)
assert outputs is not None
for output in outputs:
assert output is not None
assert isinstance(output, RequestOutput)
generated_text = output.outputs[0].text
assert generated_text is not None
print(generated_text)
# Parse to verify it is valid JSON
parsed_json = json.loads(generated_text)
assert isinstance(parsed_json, dict)
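Putting the updated test setup together, here is a hedged end-to-end usage sketch of the new configuration style, using only the constructs that appear in the tests above; the model name and schema are illustrative:

from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
          max_model_len=1024,
          guided_decoding_backend="auto")
sampling_params = SamplingParams(
    temperature=1.0,
    max_tokens=256,
    guided_decoding=GuidedDecodingParams(json={"type": "object"}))
outputs = llm.generate(
    prompts=["Give an example JSON object describing a book."],
    sampling_params=sampling_params)
print(outputs[0].outputs[0].text)

With "auto", V1 starts from xgrammar and falls back to guidance for schemas xgrammar cannot handle, which is exactly what the unsupported-schema test exercises.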
@pytest.mark.skip_global_cleanup
@ -179,13 +211,14 @@ def test_guided_grammar_ebnf(
model_name: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM(model=model_name, max_model_len=1024)
sampling_params = SamplingParams(temperature=0.8,
top_p=0.95,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(
grammar=sample_sql_ebnf,
backend=guided_decoding_backend))
llm = LLM(model=model_name,
max_model_len=1024,
guided_decoding_backend=guided_decoding_backend)
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(grammar=sample_sql_ebnf))
outputs = llm.generate(
prompts=("Generate a sql statement that selects col_1 from "
"table_1 where it is equal to 1"),
@ -222,13 +255,14 @@ def test_guided_grammar_lark(
model_name: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM(model=model_name, max_model_len=1024)
sampling_params = SamplingParams(temperature=0.8,
top_p=0.95,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(
grammar=sample_sql_lark,
backend=guided_decoding_backend))
llm = LLM(model=model_name,
max_model_len=1024,
guided_decoding_backend=guided_decoding_backend)
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(grammar=sample_sql_lark))
outputs = llm.generate(
prompts=("Generate a sql statement that selects col_1 from "
"table_1 where it is equal to 1"),
@ -269,16 +303,15 @@ def test_guided_grammar_ebnf_invalid(
model_name: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM(model=model_name, max_model_len=1024)
sampling_params = SamplingParams(temperature=0.8,
top_p=0.95,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(
grammar="not a grammar",
backend=guided_decoding_backend))
with pytest.raises(ValueError,
match="Failed to convert the grammar "
"from Lark to EBNF."):
llm = LLM(model=model_name,
max_model_len=1024,
guided_decoding_backend=guided_decoding_backend)
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(grammar="not a grammar"))
with pytest.raises(ValueError, match="Failed to convert the grammar "):
llm.generate(
prompts=("Generate a sql statement that selects col_1 from "
"table_1 where it is equal to 1"),
@ -298,12 +331,13 @@ def test_guided_regex(
model_name: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM(model=model_name, max_model_len=1024)
sampling_params = SamplingParams(temperature=0.8,
top_p=0.95,
guided_decoding=GuidedDecodingParams(
regex=sample_regex,
backend=guided_decoding_backend))
llm = LLM(model=model_name,
max_model_len=1024,
guided_decoding_backend=guided_decoding_backend)
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
guided_decoding=GuidedDecodingParams(regex=sample_regex))
outputs = llm.generate(
prompts=[
f"Give an example IPv4 address with this regex: {sample_regex}"
@ -335,12 +369,13 @@ def test_guided_choice_completion(
model_name: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM(model=model_name, max_model_len=1024)
sampling_params = SamplingParams(temperature=0.8,
top_p=0.95,
guided_decoding=GuidedDecodingParams(
choice=sample_guided_choice,
backend=guided_decoding_backend))
llm = LLM(model=model_name,
max_model_len=1024,
guided_decoding_backend=guided_decoding_backend)
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
outputs = llm.generate(
prompts="The best language for type-safe systems programming is ",
sampling_params=sampling_params,

View File

@ -36,6 +36,8 @@ def create_logits_tensor(output_token_ids: list[list[int]],
def create_sampling_metadata(
all_greedy: bool,
temperature: Optional[torch.Tensor] = None,
top_k: Optional[torch.Tensor] = None,
top_p: Optional[torch.Tensor] = None,
generators: Optional[dict[int, Any]] = None,
) -> SamplingMetadata:
"""Create a v1 sampling metadata object with all_greedy set
@ -52,8 +54,8 @@ def create_sampling_metadata(
temperature=temperature,
all_greedy=all_greedy,
all_random=not all_greedy,
top_p=None,
top_k=None,
top_p=top_p,
top_k=top_k,
min_p=torch.empty(1, ),
generators=generators,
max_num_logprobs=0,
@ -462,3 +464,147 @@ def estimate_rejection_sampling_pdf(
density=True)
return hist.hist
def _test_masked_logits(
rejection_sampler,
batch_size: int,
num_draft_tokens: int,
vocab_size: int,
target_logits: torch.Tensor,
unmasked_indices: torch.Tensor,
sampling_metadata: SamplingMetadata,
):
# Set up test parameters
num_tokens = batch_size * num_draft_tokens
# Create random draft probabilities.
draft_probs = torch.rand((num_tokens, vocab_size),
dtype=torch.float32,
device=DEVICE)
draft_probs = F.softmax(draft_probs, dim=-1)
# Randomly sample draft token ids from draft probs
draft_token_ids = torch.multinomial(draft_probs, num_samples=1)
draft_token_ids = draft_token_ids.reshape(batch_size, num_draft_tokens)
draft_token_ids = draft_token_ids.tolist()
# Bonus tokens not used but required
bonus_token_ids = torch.zeros((batch_size, 1),
dtype=torch.int64,
device=DEVICE)
# Create spec decode metadata
spec_decode_metadata = SpecDecodeMetadata.make_dummy(
draft_token_ids,
device=DEVICE,
)
# Run rejection sampling
output_token_ids = rejection_sampler(
spec_decode_metadata,
draft_probs=draft_probs,
target_logits=target_logits,
bonus_token_ids=bonus_token_ids,
sampling_metadata=sampling_metadata,
)
# Remove bonus tokens and reshape
output_token_ids = output_token_ids[:, :-1].flatten().tolist()
# Check that all sampled tokens are within the unmasked indices.
for i in range(num_tokens):
token_id = output_token_ids[i]
if token_id == PLACEHOLDER_TOKEN_ID:
continue
assert token_id in unmasked_indices[i]
@pytest.mark.parametrize("top_k", [1, 5, 99])
def test_top_k(rejection_sampler, top_k):
"""Test rejection sampling with top-k sampling"""
vocab_size = 100
batch_size = 100
num_draft_tokens = 3
num_tokens = batch_size * num_draft_tokens
# Randomly create top-k indices.
top_k_indices = [
torch.randperm(vocab_size, device=DEVICE)[:top_k]
for _ in range(num_tokens)
]
top_k_indices = torch.stack(top_k_indices)
# Create logits with the uniform distribution.
target_logits = torch.zeros((num_tokens, vocab_size), device=DEVICE)
# Bump the logits at the top-k indices so they are slightly larger than
# the rest. If the masking is effective, the non-top-k indices will never
# be sampled despite the small difference in logits.
for i in range(num_tokens):
target_logits[i, top_k_indices[i]] += 0.1
# Create sampling metadata
temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE)
sampling_metadata = create_sampling_metadata(
all_greedy=False,
temperature=temperature,
top_k=torch.tensor([top_k] * batch_size,
device=DEVICE,
dtype=torch.int64),
)
_test_masked_logits(
rejection_sampler,
batch_size=batch_size,
num_draft_tokens=num_draft_tokens,
vocab_size=vocab_size,
target_logits=target_logits,
unmasked_indices=top_k_indices,
sampling_metadata=sampling_metadata,
)
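For reference, the masking that this test verifies indirectly through rejection sampling can be sketched in generic PyTorch as follows (illustrative only, not vLLM's sampler code): every logit outside the k largest per row is pushed to -inf so it can never be sampled.

import torch

def apply_top_k(logits: torch.Tensor, k: int) -> torch.Tensor:
    # The k-th largest value in each row is the cutoff; everything strictly
    # below it is sent to -inf.
    kth_vals = torch.topk(logits, k, dim=-1).values[..., -1, None]
    return logits.masked_fill(logits < kth_vals, float("-inf"))

masked = apply_top_k(torch.randn(2, 10), k=3)
print((masked > float("-inf")).sum(dim=-1))  # tensor([3, 3]) for untied logits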
@pytest.mark.parametrize("top_p", [0.5, 0.9, 0.99])
def test_top_p(rejection_sampler, top_p):
"""Test rejection sampling with top-p sampling"""
vocab_size = 100
batch_size = 100
num_draft_tokens = 3
num_tokens = batch_size * num_draft_tokens
# Create random logits.
target_logits = torch.randn((num_tokens, vocab_size), device=DEVICE)
temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE)
rescaled_logits = target_logits / temperature
logits_sort, logits_idx = rescaled_logits.sort(dim=-1, descending=False)
probs_sort = logits_sort.softmax(dim=-1)
probs_sum = probs_sort.cumsum(dim=-1)
top_p_mask = probs_sum <= 1 - top_p
# Always keep at least one token unmasked.
top_p_mask[:, -1] = False
# Get the top-p indices.
top_p_indices = []
for i in range(num_tokens):
top_p_indices.append(logits_idx[i][~top_p_mask[i]].tolist())
# Create sampling metadata
sampling_metadata = create_sampling_metadata(
all_greedy=False,
temperature=temperature,
top_p=torch.tensor([top_p] * batch_size,
device=DEVICE,
dtype=torch.float32),
)
_test_masked_logits(
rejection_sampler,
batch_size=batch_size,
num_draft_tokens=num_draft_tokens,
vocab_size=vocab_size,
target_logits=target_logits,
unmasked_indices=top_p_indices,
sampling_metadata=sampling_metadata,
)
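A tiny worked example of the same top-p mask construction used above, on a four-token vocabulary with p = 0.9: only the low-probability tail whose cumulative mass stays below 1 - p is dropped, and the most likely token is always kept.

import torch

probs = torch.tensor([0.05, 0.10, 0.15, 0.70])
p = 0.9
sorted_probs, idx = probs.sort(descending=False)
drop = sorted_probs.cumsum(dim=-1) <= 1 - p  # drop the low-probability tail
drop[-1] = False                             # always keep the top token
kept = idx[~drop]
print(sorted(kept.tolist()))  # [1, 2, 3]; only the 0.05 token is dropped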

View File

@ -2,7 +2,8 @@
import numpy as np
from vllm.v1.spec_decode.ngram_proposer import (_find_subarray_kmp,
from vllm.v1.spec_decode.ngram_proposer import (NgramProposer,
_find_subarray_kmp,
_kmp_lps_array)
@ -35,3 +36,53 @@ def test_find_subarray_kmp():
# Return on the first match
np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3),
np.array([6, 2, 3]))
def test_ngram_proposer():
proposer = NgramProposer()
# No match.
result = proposer.propose(
context_token_ids=np.array([1, 2, 3, 4, 5]),
min_n=2,
max_n=2,
k=2,
)
assert result is None
# No match for 4-gram.
result = proposer.propose(
context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]),
min_n=4,
max_n=4,
k=2,
)
assert result is None
# No match for 4-gram but match for 3-gram.
result = proposer.propose(
context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]),
min_n=3,
max_n=4,
k=2,
)
assert np.array_equal(result, np.array([4, 1]))
# Match for both 4-gram and 3-gram.
# In this case, the proposer should return the 4-gram match.
result = proposer.propose(
context_token_ids=np.array([2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4]),
min_n=3,
max_n=4,
k=2,
)
assert np.array_equal(result, np.array([1, 2])) # Not [5, 1]
# Match for 2-gram and 3-gram, but not 4-gram.
result = proposer.propose(
context_token_ids=np.array([3, 4, 5, 2, 3, 4, 1, 2, 3, 4]),
min_n=2,
max_n=4,
k=2,
)
assert np.array_equal(result, np.array([1, 2])) # Not [5, 2]
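The behaviour exercised above can be summarized with a minimal pure-NumPy sketch of n-gram proposal (illustrative only, and much simpler than the KMP-based implementation): search for the most recent n-token suffix earlier in the context, preferring larger n, and propose the k tokens that followed the first match.

import numpy as np

def propose_ngram(context: np.ndarray, min_n: int, max_n: int, k: int):
    for n in range(max_n, min_n - 1, -1):
        if len(context) < n:
            continue
        suffix = context[-n:]
        # Scan earlier positions; the earliest match wins.
        for start in range(len(context) - n):
            if np.array_equal(context[start:start + n], suffix):
                follow = context[start + n:start + n + k]
                if len(follow) > 0:
                    return follow
    return None

print(propose_ngram(np.array([1, 2, 3, 4, 1, 2, 3]), 3, 4, 2))  # [4 1]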

View File

@ -0,0 +1,307 @@
# SPDX-License-Identifier: Apache-2.0
import unittest.mock as mock
import pytest
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
from vllm.sampling_params import SamplingParams
from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
SchedulerOutput)
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.worker.tpu_model_runner import TPUModelRunner
# Mock torch_xla module since it may not be available in the test environments
torch_xla_patcher = mock.patch.dict(
"sys.modules", {
"torch_xla": mock.MagicMock(),
"torch_xla.core.xla_model": mock.MagicMock(),
"torch_xla.runtime": mock.MagicMock(),
})
torch_xla_patcher.start()
# Mock the PallasAttentionBackend
pallas_attention_backend_patcher = mock.patch(
"vllm.v1.worker.tpu_model_runner.PallasAttentionBackend", )
pallas_attention_backend_patcher.start()
@pytest.fixture
def model_runner():
# Patchers have already been started at module level.
scheduler_config = SchedulerConfig(
max_num_seqs=10,
max_num_batched_tokens=512,
max_model_len=512,
)
model_config = ModelConfig(
model="facebook/opt-125m",
task="generate",
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16", # TPUs typically use bfloat16
seed=42,
)
cache_config = CacheConfig(
block_size=16,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
)
vllm_config = VllmConfig(
model_config=model_config,
cache_config=cache_config,
scheduler_config=scheduler_config,
)
device = "xla:0" # Mocking TPU device
with mock.patch("vllm.v1.worker.tpu_model_runner.torch"), \
mock.patch("vllm.v1.worker.tpu_model_runner.xm"), \
mock.patch("vllm.v1.worker.tpu_model_runner.xr"):
return TPUModelRunner(vllm_config, device)
@pytest.fixture(autouse=True, scope="session")
def cleanup_patches():
yield
torch_xla_patcher.stop()
pallas_attention_backend_patcher.stop()
def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
new_reqs = []
num_scheduled_tokens = {}
total_num_scheduled_tokens = 0
for req_id in req_ids:
new_reqs.append(
NewRequestData(
req_id=req_id,
prompt_token_ids=[1, 2, 3],
prompt="test",
mm_inputs=[],
mm_hashes=[],
mm_positions=[],
sampling_params=SamplingParams(),
block_ids=[0],
num_computed_tokens=0,
lora_request=None,
))
num_scheduled_tokens[req_id] = 3
total_num_scheduled_tokens += num_scheduled_tokens[req_id]
return SchedulerOutput(
scheduled_new_reqs=new_reqs,
scheduled_cached_reqs=[],
num_scheduled_tokens=num_scheduled_tokens,
total_num_scheduled_tokens=total_num_scheduled_tokens,
scheduled_spec_decode_tokens={},
scheduled_encoder_inputs={},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_input_ids=[],
structured_output_request_ids={},
grammar_bitmask=None,
)
def _is_req_scheduled(model_runner, req_id: str) -> bool:
return req_id in model_runner.input_batch.req_id_to_index
def _is_req_added(model_runner, req_id: str) -> bool:
return req_id in model_runner.requests
def _is_sampling_metadata_changed(model_runner,
sampling_metadata_before: SamplingMetadata):
return model_runner.input_batch.sampling_metadata is not (
sampling_metadata_before)
def _is_req_state_block_table_match(model_runner, req_id: str) -> bool:
req_index = model_runner.input_batch.req_id_to_index[req_id]
block_table = model_runner.input_batch.block_table
req_state = model_runner.requests[req_id]
if block_table.num_blocks_per_row[req_index] != len(req_state.block_ids):
return False
num_blocks = block_table.num_blocks_per_row[req_index]
return (block_table.block_table_np[req_index, :num_blocks] ==
req_state.block_ids).all()
def test_update_states_new_request(model_runner):
req_id = "req_0"
# new req
scheduler_output = _schedule_new_request(req_id)
metadata_before = model_runner.input_batch.sampling_metadata
model_runner._update_states(scheduler_output)
assert _is_sampling_metadata_changed(model_runner, metadata_before)
assert _is_req_added(model_runner, req_id)
assert _is_req_scheduled(model_runner, req_id)
assert _is_req_state_block_table_match(model_runner, req_id)
def test_update_states_request_finished(model_runner):
req_id = "req_0"
# new req
scheduler_output = _schedule_new_request(req_id)
model_runner._update_states(scheduler_output)
assert _is_req_added(model_runner, req_id)
assert _is_req_scheduled(model_runner, req_id)
# finish req
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={},
total_num_scheduled_tokens=0,
scheduled_spec_decode_tokens={},
scheduled_encoder_inputs={},
num_common_prefix_blocks=0,
finished_req_ids={req_id},
free_encoder_input_ids=[],
structured_output_request_ids={},
grammar_bitmask=None,
)
metadata_before = model_runner.input_batch.sampling_metadata
model_runner._update_states(scheduler_output)
assert _is_sampling_metadata_changed(model_runner, metadata_before)
assert not _is_req_added(model_runner, req_id)
assert not _is_req_scheduled(model_runner, req_id)
def test_update_states_request_resumed(model_runner):
req_id = "req_0"
# new req
scheduler_output = _schedule_new_request(req_id)
model_runner._update_states(scheduler_output)
assert _is_req_added(model_runner, req_id)
assert _is_req_scheduled(model_runner, req_id)
# unschedule req
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={},
total_num_scheduled_tokens=0,
scheduled_spec_decode_tokens={},
scheduled_encoder_inputs={},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_input_ids=[],
structured_output_request_ids={},
grammar_bitmask=None,
)
model_runner._update_states(scheduler_output)
assert _is_req_added(model_runner, req_id)
assert not _is_req_scheduled(model_runner, req_id)
# resume req
cached_req_data = CachedRequestData(
req_id=req_id,
resumed_from_preemption=False,
new_token_ids=[],
new_block_ids=[],
num_computed_tokens=0,
)
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[cached_req_data],
num_scheduled_tokens={req_id: 1},
total_num_scheduled_tokens=1,
scheduled_spec_decode_tokens={},
scheduled_encoder_inputs={},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_input_ids=[],
structured_output_request_ids={},
grammar_bitmask=None,
)
metadata_before = model_runner.input_batch.sampling_metadata
model_runner._update_states(scheduler_output)
assert _is_sampling_metadata_changed(model_runner, metadata_before)
assert _is_req_added(model_runner, req_id)
assert _is_req_scheduled(model_runner, req_id)
assert _is_req_state_block_table_match(model_runner, req_id)
def test_update_states_no_changes(model_runner):
req_id = "req_0"
# new req
scheduler_output = _schedule_new_request(req_id)
model_runner._update_states(scheduler_output)
assert _is_req_added(model_runner, req_id)
assert _is_req_scheduled(model_runner, req_id)
# schedule req
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={req_id: 1},
total_num_scheduled_tokens=1,
scheduled_spec_decode_tokens={},
scheduled_encoder_inputs={},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_input_ids=[],
structured_output_request_ids={},
grammar_bitmask=None,
)
metadata_before = model_runner.input_batch.sampling_metadata
model_runner._update_states(scheduler_output)
assert not _is_sampling_metadata_changed(model_runner, metadata_before)
assert _is_req_added(model_runner, req_id)
assert _is_req_scheduled(model_runner, req_id)
assert _is_req_state_block_table_match(model_runner, req_id)
def test_update_states_request_unscheduled(model_runner):
req_ids = ("req_0", "req_1")
# new reqs
scheduler_output = _schedule_new_request(*req_ids)
model_runner._update_states(scheduler_output)
assert _is_req_added(model_runner, req_ids[0])
assert _is_req_scheduled(model_runner, req_ids[0])
assert _is_req_added(model_runner, req_ids[1])
assert _is_req_scheduled(model_runner, req_ids[1])
# unschedule req_1
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={req_ids[0]: 1},
total_num_scheduled_tokens=1,
scheduled_spec_decode_tokens={},
scheduled_encoder_inputs={},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_input_ids=[],
structured_output_request_ids={},
grammar_bitmask=None,
)
metadata_before = model_runner.input_batch.sampling_metadata
model_runner._update_states(scheduler_output)
assert _is_sampling_metadata_changed(model_runner, metadata_before)
assert _is_req_added(model_runner, req_ids[0])
assert _is_req_scheduled(model_runner, req_ids[0])
assert _is_req_added(model_runner, req_ids[1])
assert not _is_req_scheduled(model_runner, req_ids[1])

View File

@ -124,6 +124,18 @@ def paged_attention_rocm(
kv_cache_dtype, k_scale, v_scale)
def mla_decode_kvcache_cpu(
out: torch.Tensor,
query: torch.Tensor,
kv_cache: torch.Tensor,
scale: float,
block_tables: torch.Tensor,
seq_lens: torch.Tensor,
) -> None:
torch.ops._C_cpu.mla_decode_kvcache(out, query, kv_cache, scale,
block_tables, seq_lens)
# pos encoding ops
def rotary_embedding(
positions: torch.Tensor,
@ -190,6 +202,33 @@ def advance_step_flashinfer(num_seqs: int, num_queries: int, block_size: int,
block_table_bound)
def block_table_appends(
append_row_indices: torch.Tensor,
append_row_indices_cpu: torch.Tensor,
append_cumsums: torch.Tensor,
append_cumsums_cpu: torch.Tensor,
append_block_ids: torch.Tensor,
append_block_ids_cpu: torch.Tensor,
block_table: torch.Tensor,
num_appends: int,
total_num_append_blocks: int,
) -> None:
torch.ops._C.block_table_appends.default(
append_row_indices, append_row_indices_cpu, append_cumsums,
append_cumsums_cpu, append_block_ids, append_block_ids_cpu,
block_table, num_appends, total_num_append_blocks)
def block_table_moves(
src_dst_n: torch.Tensor,
src_dst_n_cpu: torch.Tensor,
block_table: torch.Tensor,
num_moves: int,
) -> None:
torch.ops._C.block_table_moves.default(src_dst_n, src_dst_n_cpu,
block_table, num_moves)
# fused quant layer norm ops
def rms_norm_dynamic_per_token_quant(
input: torch.Tensor,

View File

@ -187,15 +187,28 @@ class ipex_ops:
gen_: torch.Generator,
logits_soft_cap: float,
) -> None:
ipex.llm.functional.varlen_attention(query.contiguous(),
key.contiguous(),
value.contiguous(), out,
seqlen_q.int(), seqlen_k.int(),
max_seqlen_q, max_seqlen_k,
pdropout, softmax_scale,
zero_tensors, is_causal,
return_softmax, gen_,
logits_soft_cap)
if ipex.__version__.endswith("cpu"):
if logits_soft_cap != 0.0:
raise ValueError("IPEX CPU does not support logits_soft_cap")
ipex.llm.functional.varlen_attention(query.contiguous(),
key.contiguous(),
value.contiguous(), out,
seqlen_q.int(),
seqlen_k.int(), max_seqlen_q,
max_seqlen_k, pdropout,
softmax_scale, zero_tensors,
is_causal, return_softmax,
gen_)
else: # XPU build
ipex.llm.functional.varlen_attention(query.contiguous(),
key.contiguous(),
value.contiguous(), out,
seqlen_q.int(),
seqlen_k.int(), max_seqlen_q,
max_seqlen_k, pdropout,
softmax_scale, zero_tensors,
is_causal, return_softmax,
gen_, logits_soft_cap)
@staticmethod
def reshape_and_cache(

View File

@ -0,0 +1,303 @@
# SPDX-License-Identifier: Apache-2.0
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Type
import torch
import vllm._custom_ops as ops
from vllm._ipex_ops import ipex_ops
from vllm.attention.backends.abstract import (AttentionBackend,
AttentionMetadataBuilder,
AttentionType,
is_quantized_kv_cache)
from vllm.attention.backends.mla.common import MLACommonImpl, MLACommonState
from vllm.attention.backends.torch_sdpa import TorchSDPAMetadata
from vllm.utils import make_tensor_with_pad
from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder
class CPUMLABackend(AttentionBackend):
@staticmethod
def get_name() -> str:
return "CPU_MLA"
@staticmethod
def get_metadata_cls() -> Type["CPUMLAMetadata"]:
return CPUMLAMetadata
@staticmethod
def get_builder_cls() -> Type["CPUMLAMetadataBuilder"]:
return CPUMLAMetadataBuilder
@staticmethod
def get_state_cls() -> Type["MLACommonState"]:
return MLACommonState
@staticmethod
def get_impl_cls() -> Type["CPUMLAImpl"]:
return CPUMLAImpl
@staticmethod
def get_kv_cache_shape(
num_blocks: int,
block_size: int,
num_kv_heads: int, # assumed to be 1 for MLA
head_size: int,
) -> Tuple[int, ...]:
return (num_blocks, block_size, head_size)
@staticmethod
def swap_blocks(
src_kv_cache: torch.Tensor,
dst_kv_cache: torch.Tensor,
src_to_dst: torch.Tensor,
) -> None:
ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
@staticmethod
def copy_blocks(
kv_caches: List[torch.Tensor],
src_to_dists: torch.Tensor,
) -> None:
ops.copy_blocks_mla(kv_caches, src_to_dists)
@staticmethod
def get_supported_head_sizes() -> List[int]:
return [576]
@dataclass
class CPUMLAMetadata(TorchSDPAMetadata):
# New for MLA
# Input positions for rotary embeddings, since for MLA the rotary
# position embeddings are applied inside the attention backend
input_positions: torch.Tensor = None
# required by MLACommonImpl
is_profile_run: bool = False
class CPUMLAMetadataBuilder(AttentionMetadataBuilder[CPUMLAMetadata]):
def __init__(self, input_builder: ModelInputForCPUBuilder) -> None:
self.chunked_prefill = input_builder.chunked_prefill
self.input_builder = input_builder
assert not self.chunked_prefill, \
"chunked prefill is currently not supported"
def prepare(self):
self.input_data = self.input_builder.input_data
def build(self, seq_lens, query_lens, cuda_graph_pad_size, batch_size):
input_data = self.input_data
prefill_seq_lens = seq_lens[0:input_data.num_prefills]
prefill_query_lens = query_lens[0:input_data.num_prefills]
slot_mapping = torch.tensor(input_data.slot_mapping,
dtype=torch.long,
device="cpu")
# metadata for prefill
if input_data.num_prefills > 0:
query_lens_tensor = torch.tensor(prefill_query_lens,
dtype=torch.int32,
device="cpu")
kv_lens_tensor = torch.tensor(prefill_seq_lens,
dtype=torch.int32,
device="cpu")
query_start_loc = torch.zeros(input_data.num_prefills + 1,
dtype=torch.int32,
device="cpu")
kv_start_loc = torch.zeros(input_data.num_prefills + 1,
dtype=torch.int32,
device="cpu")
torch.cumsum(query_lens_tensor,
dim=0,
dtype=torch.int32,
out=query_start_loc[1:])
torch.cumsum(kv_lens_tensor,
dim=0,
dtype=torch.int32,
out=kv_start_loc[1:])
max_query_len = max(prefill_query_lens)
max_kv_len = max(prefill_seq_lens)
# for chunked-prefill
if self.chunked_prefill:
prefill_block_tables = make_tensor_with_pad(
self.input_data.prefill_block_tables,
pad=0,
dtype=torch.int32,
device="cpu",
)
else:
prefill_block_tables = None
else:
query_start_loc = None
kv_start_loc = None
max_query_len = None
max_kv_len = None
prefill_block_tables = None
# metadata for decode
if input_data.num_decode_tokens != 0:
seq_lens_tensor = torch.tensor(
input_data.seq_lens[input_data.num_prefills:],
dtype=torch.int32,
device="cpu",
)
block_tables = make_tensor_with_pad(
self.input_data.decode_block_tables,
pad=0,
dtype=torch.int32,
device="cpu",
)
else:
block_tables = torch.tensor([])
seq_lens_tensor = torch.tensor(
input_data.seq_lens[:input_data.num_prefills],
dtype=torch.int32,
device="cpu",
)
# For multi-modal models
placeholder_index_maps = None
if len(input_data.multi_modal_inputs_list) != 0:
placeholder_index_maps = {
modality: placeholder_map.index_map()
for modality, placeholder_map in
input_data.multi_modal_placeholder_maps.items()
}
return CPUMLAMetadata(
chunked_prefill=self.chunked_prefill,
seq_lens=prefill_seq_lens,
seq_lens_tensor=seq_lens_tensor,
max_query_len=max_query_len,
max_kv_len=max_kv_len,
query_start_loc=query_start_loc,
kv_start_loc=kv_start_loc,
max_decode_seq_len=input_data.max_decode_seq_len,
num_prefills=input_data.num_prefills,
num_prefill_tokens=input_data.num_prefill_tokens,
num_decode_tokens=input_data.num_decode_tokens,
block_tables=block_tables,
prefill_block_tables=prefill_block_tables,
slot_mapping=slot_mapping,
multi_modal_placeholder_index_maps=placeholder_index_maps,
enable_kv_scales_calculation=False,
input_positions=torch.tensor([self.input_data.input_positions]))
class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]):
def __init__(
self,
num_heads: int,
head_size: int,
scale: float,
num_kv_heads: int,
alibi_slopes: Optional[List[float]],
sliding_window: Optional[int],
kv_cache_dtype: str,
blocksparse_params: Optional[Dict[str, Any]],
logits_soft_cap: Optional[float],
attn_type: str,
# MLA Specific Arguments
**mla_args) -> None:
super().__init__(num_heads, head_size, scale, num_kv_heads,
alibi_slopes, sliding_window, kv_cache_dtype,
blocksparse_params, logits_soft_cap, attn_type,
**mla_args)
unsupported_features = [
alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
]
if any(unsupported_features):
raise NotImplementedError(
"CPUMLAImpl does not support one of the following: "
"alibi_slopes, sliding_window, blocksparse_params, "
"logits_soft_cap")
if attn_type != AttentionType.DECODER:
raise NotImplementedError("Encoder self-attention and "
"encoder/decoder cross-attention "
"are not implemented for "
"CPUMLAImpl")
if is_quantized_kv_cache(self.kv_cache_dtype):
raise NotImplementedError(
"CPUMLAImpl with FP8 KV cache not yet supported")
def _forward_prefill(
self,
q: torch.Tensor,
kv_c_normed: torch.Tensor,
k_pe: torch.Tensor,
kv_c_and_k_pe_cache: torch.Tensor,
attn_metadata: CPUMLAMetadata, # type: ignore[override]
) -> torch.Tensor:
prefill_metadata = attn_metadata.prefill_metadata
assert prefill_metadata is not None
kv_nope = self.kv_b_proj(kv_c_normed)[0].view(\
-1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
k_nope, v = kv_nope\
.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
# For MLA the v head dim is smaller than qk head dim so we pad out
# v with 0s to match the qk head dim
v_padded = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]],
value=0)
output = torch.empty_like(q)
ipex_ops.varlen_attention(
query=q,
key=k,
value=v_padded,
out=output,
seqlen_q=prefill_metadata.query_start_loc,
seqlen_k=prefill_metadata.query_start_loc,
max_seqlen_q=prefill_metadata.max_query_len,
max_seqlen_k=prefill_metadata.max_query_len,
pdropout=0.0,
softmax_scale=self.scale,
zero_tensors=False,
is_causal=True,
return_softmax=False,
gen_=None,
logits_soft_cap=0.0,
)
# remove padding
output = output.view(-1, self.num_heads,
q.shape[-1])[..., :v.shape[-1]]
output = output.reshape(-1, self.num_heads * v.shape[-1])
return self.o_proj(output)[0]
def _forward_decode(
self,
q_nope: torch.Tensor,
q_pe: torch.Tensor,
kv_c_and_k_pe_cache: torch.Tensor,
attn_metadata: CPUMLAMetadata, # type: ignore[override]
) -> torch.Tensor:
assert kv_c_and_k_pe_cache.numel() > 0
decode_meta = attn_metadata.decode_metadata
assert decode_meta is not None
q = torch.cat([q_nope, q_pe], dim=-1)
o = q.new_empty(q.shape[0], self.num_heads, self.kv_lora_rank)
# Run MQA
ops.mla_decode_kvcache_cpu(o, q, kv_c_and_k_pe_cache, self.scale,
decode_meta.block_tables,
decode_meta.seq_lens_tensor)
return self._v_up_proj_and_o_proj(o)
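One detail worth calling out from _forward_prefill above: because the MLA value head dim is smaller than the qk head dim, v is zero-padded so a standard varlen attention kernel can be reused, and the padding is sliced off again afterwards. A generic PyTorch illustration of that pad-then-slice trick (not tied to the backend class):

import torch
import torch.nn.functional as F

num_tokens, num_heads, qk_dim, v_dim = 4, 2, 8, 6
v = torch.randn(num_tokens, num_heads, v_dim)
v_padded = F.pad(v, [0, qk_dim - v_dim], value=0)  # pad last dim up to qk_dim
attn_out = v_padded                                # stand-in for the attention output
unpadded = attn_out[..., :v_dim]                   # slice the zero padding back off
assert v_padded.shape[-1] == qk_dim and unpadded.shape == v.shape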

View File

@ -22,12 +22,13 @@ from vllm.attention.backends.utils import (
compute_slot_mapping_start_idx, get_num_prefill_decode_query_kv_tokens,
get_seq_len_block_table_args, is_all_cross_attn_metadata_set,
is_all_encoder_attn_metadata_set, is_block_tables_empty)
from vllm.fa_utils import get_flash_attn_version
from vllm.logger import init_logger
from vllm.multimodal import MultiModalPlaceholderMap
from vllm.utils import async_tensor_h2d, make_tensor_with_pad
from vllm.vllm_flash_attn import (flash_attn_varlen_func,
flash_attn_with_kvcache)
from vllm.vllm_flash_attn.fa_utils import (flash_attn_supports_fp8,
get_flash_attn_version)
if TYPE_CHECKING:
from vllm.worker.model_runner import (ModelInputForGPUBuilder,
@ -632,10 +633,13 @@ class FlashAttentionImpl(AttentionImpl):
self.kv_cache_dtype = kv_cache_dtype
self.vllm_flash_attn_version = get_flash_attn_version(
requires_alibi=self.alibi_slopes is not None)
if (is_quantized_kv_cache(self.kv_cache_dtype)
and self.vllm_flash_attn_version != 3):
if is_quantized_kv_cache(self.kv_cache_dtype) and (
not self.kv_cache_dtype.startswith("fp8")
or not flash_attn_supports_fp8()):
raise NotImplementedError(
"Only FlashAttention3 supports FP8 KV cache")
f"FlashAttention does not support {self.kv_cache_dtype} "
"kv-cache on this device "
f"(FA supports fp8 = {flash_attn_supports_fp8()}).")
if logits_soft_cap is None:
# In flash-attn, setting logits_soft_cap as 0 means no soft cap.
logits_soft_cap = 0
@ -704,6 +708,10 @@ class FlashAttentionImpl(AttentionImpl):
logits_soft_cap: Optional[float] = self.logits_soft_cap
fp8_attention = kv_cache_dtype.startswith("fp8")
if fp8_attention and not flash_attn_supports_fp8():
raise NotImplementedError(
"FlashAttention does not support FP8 kv-cache on this device.")
if kv_cache.numel() > 0:
key_cache = kv_cache[0]
value_cache = kv_cache[1]

View File

@ -204,8 +204,6 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
compute_slot_mapping_start_idx,
is_block_tables_empty)
from vllm.attention.ops.triton_merge_attn_states import merge_attn_states
from vllm.fa_utils import get_flash_attn_version
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
LinearBase, RowParallelLinear,
UnquantizedLinearMethod)
@ -213,17 +211,27 @@ from vllm.model_executor.layers.rotary_embedding import (
DeepseekScalingRotaryEmbedding, RotaryEmbedding)
from vllm.multimodal import MultiModalPlaceholderMap
from vllm.platforms import current_platform
from vllm.triton_utils import HAS_TRITON
from vllm.utils import async_tensor_h2d, cdiv, make_tensor_with_pad, round_down
from vllm.vllm_flash_attn.fa_utils import get_flash_attn_version
if HAS_TRITON:
from vllm.attention.ops.triton_flash_attention import triton_attention
from vllm.attention.ops.triton_merge_attn_states import merge_attn_states
else:
merge_attn_states = None
triton_attention = None
try:
from vllm.vllm_flash_attn import flash_attn_varlen_func
is_vllm_fa = True
except ImportError:
# For rocm use upstream flash attention
from flash_attn import flash_attn_varlen_func
is_vllm_fa = False
from vllm.attention.ops.triton_flash_attention import triton_attention
try:
# For rocm use upstream flash attention
from flash_attn import flash_attn_varlen_func
except ImportError:
flash_attn_varlen_func = None
if TYPE_CHECKING:
from vllm.worker.model_runner import (ModelInputForGPUBuilder,

View File

@ -357,6 +357,11 @@ class VllmBackend:
# graph.
factors = []
# 0. factors come from the env, for example, the values of
# VLLM_PP_LAYER_PARTITION will affect the computation graph.
env_hash = envs.compute_hash()
factors.append(env_hash)
# 1. factors come from the vllm_config (it mainly summarizes how the
# model is created)
config_hash = vllm_config.compute_hash()
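The comment above explains why environment variables now participate in the compilation cache key: the key is built from independent factors (env hash, config hash, ...), so a change in any one of them invalidates the cache. A hedged sketch of that idea, with illustrative names rather than vLLM's exact hashing code:

import hashlib

def combine_factors(factors: list[str]) -> str:
    hasher = hashlib.sha256()
    for factor in factors:
        hasher.update(factor.encode("utf-8"))
    return hasher.hexdigest()

key_a = combine_factors(["env:<env hash>", "config:<config hash>"])
key_b = combine_factors(["env:<different env hash>", "config:<config hash>"])
assert key_a != key_b  # changing any factor changes the cache key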

View File

@ -1,26 +1,29 @@
# SPDX-License-Identifier: Apache-2.0
import hashlib
import importlib.metadata
import inspect
import json
import types
from abc import ABC, abstractmethod
from typing import Any, Callable, Optional, Union
from typing import Any, Callable, Dict, Optional, Union
import torch
from packaging.version import Version
from torch import fx
if Version(importlib.metadata.version('torch')) >= Version("2.6"):
from torch._inductor.custom_graph_pass import CustomGraphPass
else:
# CustomGraphPass is not present in 2.5 or lower, import our version
from .torch25_custom_graph_pass import ( # noqa: yapf
Torch25CustomGraphPass as CustomGraphPass)
class InductorPass(ABC):
"""
General custom inductor pass interface.
"""
@abstractmethod
def __call__(self, graph: torch.fx.Graph):
"""
Execute the pass on the given graph.
"""
raise NotImplementedError
class InductorPass(CustomGraphPass):
"""
A custom graph pass that uses a hash of its source as the UUID.
This is defined as a convenience and should work in most cases.
"""
def uuid(self) -> Any:
"""
@ -48,7 +51,16 @@ class InductorPass(ABC):
else:
src_str = inspect.getsource(src.__class__)
hasher.update(src_str.encode("utf-8"))
return hasher.digest()
return hasher.hexdigest()
@staticmethod
def hash_dict(dict_: Dict[Any, Any]):
"""
Utility method to hash a dictionary; it can alternatively be used as the uuid.
:return: A sha256 hash of the JSON representation of the dictionary.
"""
encoded = json.dumps(dict_, sort_keys=True).encode("utf-8")
return hashlib.sha256(encoded).hexdigest()
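A quick note on the sort_keys=True above: it canonicalizes the JSON so the resulting uuid does not depend on dict insertion order, for example:

import json

# Two dicts with the same contents but different insertion order serialize
# identically once the keys are sorted, so they hash to the same uuid.
assert (json.dumps({"a": 1, "b": 2}, sort_keys=True) ==
        json.dumps({"b": 2, "a": 1}, sort_keys=True))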
class CallableInductorPass(InductorPass):
@ -61,25 +73,10 @@ class CallableInductorPass(InductorPass):
callable: Callable[[fx.Graph], None],
uuid: Optional[Any] = None):
self.callable = callable
if uuid is None:
uuid = InductorPass.hash_source(callable)
self._uuid = uuid
self._uuid = self.hash_source(callable) if uuid is None else uuid
def __call__(self, graph: torch.fx.Graph):
self.callable(graph)
def uuid(self) -> Any:
return self._uuid
def __getstate__(self):
"""
Pickling occurs in the Inductor code cache if a pass is not given to
the pass manager but is instead directly added to config as a pass.
See PostGradPassManager for more.
TODO(torch==2.6), use the `uuid` method in CustomGraphPass instead.
"""
return self._uuid
def __setstate__(self, state):
raise ValueError("Cannot unpickle CallableInductorPass")

View File

@ -1,8 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List
from typing import List
import torch
from torch import fx as fx
from vllm.config import CompilationConfig
@ -10,29 +9,18 @@ from vllm.logger import init_logger
from .fix_functionalization import FixFunctionalizationPass
from .fusion import FusionPass
from .inductor_pass import InductorPass
from .inductor_pass import CustomGraphPass, InductorPass
from .noop_elimination import NoOpEliminationPass
logger = init_logger(__name__)
class PlaceHolder:
pass
if torch.__version__ < "2.6":
Parent = PlaceHolder # type: ignore
else:
Parent = torch._inductor.custom_graph_pass.CustomGraphPass # type: ignore
class PostGradPassManager(Parent):
class PostGradPassManager(CustomGraphPass):
"""
The pass manager for post-grad passes.
It handles configuration, adding custom passes, and running passes.
It also supports pickling, which is used by the Inductor code cache.
TODO(torch==2.6), use CustomGraphPass
(torch._inductor.custom_graph_pass.CustomGraphPass)
It supports uuid for the Inductor code cache. That includes torch<2.6
support using pickling (in .inductor_pass.CustomGraphPass).
The order of the post-grad post-passes is:
1. passes (constructor parameter)
@ -67,27 +55,13 @@ class PostGradPassManager(Parent):
self.passes.append(pass_)
def uuid(self):
return self.__getstate__()
def __getstate__(self) -> Dict[str, List[Any]]:
"""
Custom pickling for the pass manager, as some passes cannot be pickled.
Pickling occurs because the pass manager is set as the value of
`config["post_grad_custom_post_pass"]` in the Inductor config.
The config is pickled to act as a key in the Inductor code cache.
Any other passes in the config are pickled as well.
TODO(torch==2.6), use the `uuid` method in CustomGraphPass instead.
The PostGradPassManager is set as a custom pass in the Inductor and
affects compilation caching. Its uuid depends on the UUIDs of all
dependent passes and the pass config. See InductorPass for more info.
"""
state = {"pass_config": self.pass_config.uuid(), "passes": []}
for pass_ in self.passes:
state["passes"].append(pass_.uuid())
state["passes"].append(self.fix_functionalization.uuid())
return state
def __setstate__(self, state):
"""
Do not allow unpickling of the pass manager.
If this is needed in the future, it should properly pickle the passes.
"""
raise ValueError("Cannot unpickle PostGradPassManager")
return InductorPass.hash_dict(state)

View File

@ -0,0 +1,41 @@
# SPDX-License-Identifier: Apache-2.0
from abc import ABC, abstractmethod
from typing import Any, Optional
import torch
class Torch25CustomGraphPass(ABC): # noqa (redefinition)
"""
This class replaces CustomGraphPass from torch==2.6 when using torch<2.6.
It conforms to the 2.6 interface but also supports pickling, as that's what
the inductor code cache uses to determine the cache key before 2.6.
(in 2.6 and above, uuid() is used.)
Subclasses can just "pretend" that uuid is used.
"""
@abstractmethod
def __call__(self, graph: torch.fx.graph.Graph) -> None:
"""
Implementation of the custom pass.
"""
@abstractmethod
def uuid(self) -> Optional[Any]:
"""
Return an ID to uniquely identify your custom pass implementation.
Return None to skip inductor code caching entirely.
"""
def __getstate__(self):
"""
Pickling is used instead of uuid() in torch<2.6. Just return uuid()
to enable subclasses to only have to implement uuid.
"""
return self.uuid()
def __setstate__(self, state):
raise ValueError("Cannot unpickle CustomGraphPass because pickling"
" is used for cache key uuid. Use torch>=2.6 with"
" native uuid support for custom passes.")

Some files were not shown because too many files have changed in this diff.