Compare commits
50 Commits
v1-block-t
...
rob-fixes
| Author | SHA1 | Date | |
|---|---|---|---|
| 220d694080 | |||
| 70e06dd574 | |||
| 7954461d4c | |||
| a10da86677 | |||
| 284d5df45b | |||
| d5b0db449e | |||
| 66349c33a1 | |||
| 28d0396ff1 | |||
| 2f29ae383a | |||
| cf64b0e6a7 | |||
| f51f182d64 | |||
| 79e465f557 | |||
| 2ba687d39f | |||
| 5d57896e2c | |||
| f6f008ca1d | |||
| 24cbbe4778 | |||
| 2fec6e0b5c | |||
| 47a3f26b2a | |||
| 144162fc8c | |||
| 522279ebb9 | |||
| 85687b43e7 | |||
| 120bbdfd82 | |||
| 2ceb7bc534 | |||
| 9f7fb5ec84 | |||
| a8a621e419 | |||
| b89d89f456 | |||
| 8355358fb3 | |||
| c0b1443345 | |||
| d35dace985 | |||
| 912031ceb5 | |||
| 4f13e89143 | |||
| b9a7dbe769 | |||
| 0cb2e05256 | |||
| d6945ecdf0 | |||
| 298298f97d | |||
| 6c8fae82dd | |||
| 16ed827378 | |||
| 8fa9df7987 | |||
| 27c1afe88b | |||
| ee6607332e | |||
| 7fbf70db57 | |||
| 2c31e4c3ea | |||
| 187f112ccd | |||
| 897db7b93d | |||
| b7ffb43792 | |||
| 6e1fba8a73 | |||
| bfde1688e7 | |||
| 905424ed65 | |||
| 5d20f389d6 | |||
| 2a0cb78016 |
@ -38,8 +38,6 @@ function cpu_tests() {
|
||||
set -e
|
||||
pip install -r vllm/requirements/test.txt
|
||||
pip install -r vllm/requirements/cpu.txt
|
||||
pytest -v -s tests/kernels/test_cache.py -m cpu_model
|
||||
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
|
||||
pytest -v -s tests/models/decoder_only/language -m cpu_model
|
||||
pytest -v -s tests/models/embedding/language -m cpu_model
|
||||
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
|
||||
|
||||
@ -14,7 +14,6 @@ DOCKER_BUILDKIT=1 docker build . \
|
||||
-t gh200-test \
|
||||
--build-arg max_jobs=66 \
|
||||
--build-arg nvcc_threads=2 \
|
||||
--build-arg RUN_WHEEL_CHECK=false \
|
||||
--build-arg torch_cuda_arch_list="9.0+PTX" \
|
||||
--build-arg vllm_fa_cmake_gpu_arches="90-real"
|
||||
|
||||
@ -24,6 +23,6 @@ trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image and test offline inference
|
||||
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
||||
docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
||||
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
|
||||
'
|
||||
|
||||
@ -22,7 +22,7 @@ docker run --privileged --net host --shm-size=16G -it \
|
||||
&& export VLLM_USE_V1=1 \
|
||||
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
|
||||
&& echo TEST_1 \
|
||||
&& pytest /workspace/vllm/tests/tpu/test_compilation.py \
|
||||
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
|
||||
&& echo TEST_2 \
|
||||
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
|
||||
&& echo TEST_3 \
|
||||
@ -30,9 +30,7 @@ docker run --privileged --net host --shm-size=16G -it \
|
||||
&& echo TEST_4 \
|
||||
&& pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
||||
&& echo TEST_5 \
|
||||
&& python3 /workspace/vllm/examples/offline_inference/tpu.py \
|
||||
&& echo TEST_6 \
|
||||
&& pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py" \
|
||||
&& python3 /workspace/vllm/examples/offline_inference/tpu.py" \
|
||||
|
||||
|
||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||
|
||||
@ -118,7 +118,7 @@ steps:
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
@ -148,8 +148,8 @@ steps:
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
- pushd ../examples/offline_inference
|
||||
- VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py
|
||||
- VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
- python3 rlhf.py
|
||||
- RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
- popd
|
||||
|
||||
- label: Metrics, Tracing Test # 10min
|
||||
@ -515,7 +515,7 @@ steps:
|
||||
- vllm/worker/model_runner.py
|
||||
- entrypoints/llm/test_collective_rpc.py
|
||||
commands:
|
||||
- VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@ -2,8 +2,7 @@
|
||||
/vllm/_version.py
|
||||
|
||||
# vllm-flash-attn built from source
|
||||
vllm/vllm_flash_attn/*
|
||||
!vllm/vllm_flash_attn/fa_utils.py
|
||||
vllm/vllm_flash_attn/
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
|
||||
@ -228,7 +228,6 @@ endif()
|
||||
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/cache_kernels.cu"
|
||||
"csrc/block_table.cu"
|
||||
"csrc/attention/paged_attention_v1.cu"
|
||||
"csrc/attention/paged_attention_v2.cu"
|
||||
"csrc/pos_encoding_kernels.cu"
|
||||
|
||||
87
Dockerfile
87
Dockerfile
@ -14,22 +14,17 @@ ARG PYTHON_VERSION=3.12
|
||||
ARG TARGETPLATFORM
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# Install Python and other dependencies
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common git curl sudo \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
|
||||
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
|
||||
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
||||
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
|
||||
&& python3 --version && python3 -m pip --version
|
||||
# Install uv for faster pip installs
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
python3 -m pip install uv
|
||||
# Install minimal dependencies and uv
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y ccache git curl wget sudo \
|
||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
|
||||
# Add uv to PATH
|
||||
ENV PATH="/root/.local/bin:$PATH"
|
||||
# Create venv with specified Python and activate by placing at the front of path
|
||||
ENV VIRTUAL_ENV="/opt/venv"
|
||||
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
|
||||
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
@ -51,22 +46,19 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
# install build and runtime dependencies
|
||||
|
||||
# arm64 (GH200) build follows the practice of "use existing pytorch" build,
|
||||
# we need to install torch and torchvision from the nightly builds first,
|
||||
# pytorch will not appear as a vLLM dependency in all of the following steps
|
||||
# after this step
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
|
||||
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
|
||||
uv pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
|
||||
fi
|
||||
|
||||
COPY requirements/common.txt requirements/common.txt
|
||||
COPY requirements/cuda.txt requirements/cuda.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/cuda.txt
|
||||
uv pip install -r requirements/cuda.txt
|
||||
|
||||
# cuda arch list used by torch
|
||||
# can be useful for both `dev` and `test`
|
||||
@ -91,7 +83,7 @@ COPY requirements/build.txt requirements/build.txt
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/build.txt
|
||||
uv pip install -r requirements/build.txt
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
@ -163,7 +155,7 @@ COPY requirements/lint.txt requirements/lint.txt
|
||||
COPY requirements/test.txt requirements/test.txt
|
||||
COPY requirements/dev.txt requirements/dev.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/dev.txt
|
||||
uv pip install -r requirements/dev.txt
|
||||
#################### DEV IMAGE ####################
|
||||
|
||||
#################### vLLM installation IMAGE ####################
|
||||
@ -179,23 +171,18 @@ ARG TARGETPLATFORM
|
||||
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
|
||||
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
|
||||
|
||||
# Install Python and other dependencies
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
|
||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
|
||||
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
|
||||
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
||||
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
|
||||
&& python3 --version && python3 -m pip --version
|
||||
# Install uv for faster pip installs
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
python3 -m pip install uv
|
||||
# Install minimal dependencies and uv
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y ccache git curl wget sudo vim \
|
||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 libibverbs-dev \
|
||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
|
||||
# Add uv to PATH
|
||||
ENV PATH="/root/.local/bin:$PATH"
|
||||
# Create venv with specified Python and activate by placing at the front of path
|
||||
ENV VIRTUAL_ENV="/opt/venv"
|
||||
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
|
||||
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
@ -213,14 +200,13 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||
# after this step
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
|
||||
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
|
||||
uv pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
|
||||
fi
|
||||
|
||||
# Install vllm wheel first, so that torch etc will be installed.
|
||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system dist/*.whl --verbose
|
||||
uv pip install dist/*.whl --verbose
|
||||
|
||||
# If we need to build FlashInfer wheel before its release:
|
||||
# $ export FLASHINFER_ENABLE_AOT=1
|
||||
@ -235,9 +221,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
||||
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
. /etc/environment && \
|
||||
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
|
||||
uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
|
||||
uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
|
||||
fi
|
||||
COPY examples examples
|
||||
|
||||
@ -247,7 +232,7 @@ COPY examples examples
|
||||
# TODO: Remove this once FlashInfer AOT wheel is fixed
|
||||
COPY requirements/build.txt requirements/build.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/build.txt
|
||||
uv pip install -r requirements/build.txt
|
||||
|
||||
#################### vLLM installation IMAGE ####################
|
||||
|
||||
@ -264,15 +249,15 @@ ENV UV_HTTP_TIMEOUT=500
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/dev.txt
|
||||
uv pip install -r requirements/dev.txt
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -e tests/vllm_test_utils
|
||||
uv pip install -e tests/vllm_test_utils
|
||||
|
||||
# enable fast downloads from hf (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system hf_transfer
|
||||
uv pip install hf_transfer
|
||||
ENV HF_HUB_ENABLE_HF_TRANSFER 1
|
||||
|
||||
# Copy in the v1 package for testing (it isn't distributed yet)
|
||||
@ -297,9 +282,9 @@ ENV UV_HTTP_TIMEOUT=500
|
||||
# install additional dependencies for openai api server
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
||||
uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
||||
else \
|
||||
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
||||
uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
||||
fi
|
||||
|
||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||
|
||||
@ -1,267 +1,37 @@
|
||||
ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
|
||||
FROM mambaorg/micromamba
|
||||
ARG MAMBA_DOCKERFILE_ACTIVATE=1
|
||||
USER root
|
||||
|
||||
###############################################################
|
||||
# base stage with basic dependencies
|
||||
###############################################################
|
||||
ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
|
||||
|
||||
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base-builder
|
||||
RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev
|
||||
|
||||
ARG PYTHON_VERSION=3.12
|
||||
ARG OPENBLAS_VERSION=0.3.29
|
||||
|
||||
# Set Environment Variables for venv, cargo & openblas
|
||||
ENV VIRTUAL_ENV=/opt/vllm
|
||||
ENV PATH=${VIRTUAL_ENV}/bin:/root/.cargo/bin:$PATH
|
||||
ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig/
|
||||
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64:/usr/local/lib:/usr/lib64:/usr/lib
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
# install gcc-13, python, rust, openblas
|
||||
# Note: A symlink for libatomic.so is created for gcc-13 (linker fails to find libatomic otherwise - reqd. for sentencepiece)
|
||||
# Note: A dummy file 'control' is created in /tmp/ to artificially create dependencies between stages when building stages in parallel
|
||||
# when `--jobs=<N>` is passed with podman build command
|
||||
RUN microdnf install -y openssl-devel dnf \
|
||||
&& dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \
|
||||
https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \
|
||||
https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
|
||||
&& dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os \
|
||||
&& dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/AppStream/`arch`/os \
|
||||
&& dnf config-manager --set-enabled crb \
|
||||
&& dnf install -y \
|
||||
git tar gcc-toolset-13 automake libtool numactl-devel lapack-devel \
|
||||
pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
|
||||
libtiff-devel libjpeg-devel openjpeg2-devel zlib-devel \
|
||||
freetype-devel lcms2-devel libwebp-devel tcl-devel tk-devel \
|
||||
harfbuzz-devel fribidi-devel libraqm-devel libimagequant-devel libxcb-devel \
|
||||
python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
|
||||
&& dnf clean all \
|
||||
&& ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
|
||||
&& python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
|
||||
&& python -m pip install -U pip uv \
|
||||
&& uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python cmake ninja cython scikit_build_core scikit_build \
|
||||
&& curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
|
||||
&& curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
|
||||
&& cd /tmp && touch control
|
||||
|
||||
###############################################################
|
||||
# Stage to build torch family
|
||||
###############################################################
|
||||
|
||||
FROM base-builder AS torch-builder
|
||||
|
||||
ARG MAX_JOBS
|
||||
ARG TORCH_VERSION=2.6.0
|
||||
ARG _GLIBCXX_USE_CXX11_ABI=1
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
git clone --recursive https://github.com/pytorch/pytorch.git -b v${TORCH_VERSION} && \
|
||||
cd pytorch && \
|
||||
uv pip install -r requirements.txt && \
|
||||
python setup.py develop && \
|
||||
rm -f dist/torch*+git*whl && \
|
||||
MAX_JOBS=${MAX_JOBS:-$(nproc)} \
|
||||
PYTORCH_BUILD_VERSION=${TORCH_VERSION} PYTORCH_BUILD_NUMBER=1 uv build --wheel --out-dir /torchwheels/
|
||||
|
||||
ARG TORCHVISION_VERSION=0.21.0
|
||||
ARG TORCHVISION_USE_NVJPEG=0
|
||||
ARG TORCHVISION_USE_FFMPEG=0
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
git clone --recursive https://github.com/pytorch/vision.git -b v${TORCHVISION_VERSION} && \
|
||||
cd vision && \
|
||||
MAX_JOBS=${MAX_JOBS:-$(nproc)} \
|
||||
BUILD_VERSION=${TORCHVISION_VERSION} \
|
||||
uv build --wheel --out-dir /torchwheels/ --no-build-isolation
|
||||
|
||||
ARG TORCHAUDIO_VERSION=2.6.0
|
||||
ARG BUILD_SOX=1
|
||||
ARG BUILD_KALDI=1
|
||||
ARG BUILD_RNNT=1
|
||||
ARG USE_FFMPEG=0
|
||||
ARG USE_ROCM=0
|
||||
ARG USE_CUDA=0
|
||||
ARG TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_FFMPEG=1
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
git clone --recursive https://github.com/pytorch/audio.git -b v${TORCHAUDIO_VERSION} && \
|
||||
cd audio && \
|
||||
MAX_JOBS=${MAX_JOBS:-$(nproc)} \
|
||||
BUILD_VERSION=${TORCHAUDIO_VERSION} \
|
||||
uv build --wheel --out-dir /torchwheels/ --no-build-isolation
|
||||
|
||||
###############################################################
|
||||
# Stage to build pyarrow
|
||||
###############################################################
|
||||
|
||||
FROM base-builder AS arrow-builder
|
||||
|
||||
ARG MAX_JOBS
|
||||
ARG PYARROW_PARALLEL
|
||||
ARG PYARROW_VERSION=19.0.1
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
git clone --recursive https://github.com/apache/arrow.git -b apache-arrow-${PYARROW_VERSION} && \
|
||||
cd arrow/cpp && \
|
||||
mkdir build && cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=release \
|
||||
-DCMAKE_INSTALL_PREFIX=/usr/local \
|
||||
-DARROW_PYTHON=ON \
|
||||
-DARROW_BUILD_TESTS=OFF \
|
||||
-DARROW_JEMALLOC=ON \
|
||||
-DARROW_BUILD_STATIC="OFF" \
|
||||
-DARROW_PARQUET=ON \
|
||||
.. && \
|
||||
make install -j ${MAX_JOBS:-$(nproc)} && \
|
||||
cd ../../python/ && \
|
||||
uv pip install -v -r requirements-wheel-build.txt && \
|
||||
PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \
|
||||
python setup.py build_ext \
|
||||
--build-type=release --bundle-arrow-cpp \
|
||||
bdist_wheel --dist-dir /arrowwheels/
|
||||
|
||||
###############################################################
|
||||
# Stage to build opencv
|
||||
###############################################################
|
||||
|
||||
FROM base-builder AS cv-builder
|
||||
|
||||
ARG MAX_JOBS
|
||||
ARG OPENCV_VERSION=84
|
||||
ARG ENABLE_HEADLESS=1
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \
|
||||
cd opencv-python && \
|
||||
sed -i 's/"setuptools==59.2.0",/"setuptools<70.0",/g' pyproject.toml && \
|
||||
python -m build --wheel --installer=uv --outdir /opencvwheels/
|
||||
|
||||
###############################################################
|
||||
# Stage to build vllm - this stage builds and installs
|
||||
# vllm, tensorizer and vllm-tgis-adapter and builds uv cache
|
||||
# for transitive dependencies - eg. grpcio
|
||||
###############################################################
|
||||
|
||||
FROM base-builder AS vllmcache-builder
|
||||
|
||||
COPY --from=torch-builder /tmp/control /dev/null
|
||||
COPY --from=arrow-builder /tmp/control /dev/null
|
||||
COPY --from=cv-builder /tmp/control /dev/null
|
||||
|
||||
ARG VLLM_TARGET_DEVICE=cpu
|
||||
|
||||
# this step installs vllm and populates uv cache
|
||||
# with all the transitive dependencies
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
|
||||
--mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
|
||||
--mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
|
||||
--mount=type=bind,src=.,dst=/src/,rw \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
|
||||
sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
|
||||
uv pip install pandas pythran pybind11 && \
|
||||
# sentencepiece.pc is in some pkgconfig inside uv cache
|
||||
export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
|
||||
uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
|
||||
cd /src/ && \
|
||||
uv build --wheel --out-dir /vllmwheel/ --no-build-isolation && \
|
||||
uv pip install /vllmwheel/*.whl
|
||||
|
||||
|
||||
###############################################################
|
||||
# Stage to build numactl
|
||||
###############################################################
|
||||
|
||||
FROM base-builder AS numa-builder
|
||||
|
||||
# Note: Building numactl with gcc-11. Compiling with gcc-13 in this builder stage will
|
||||
# trigger recompilation with gcc-11 (and require libtool) in the final stage where we do not have gcc-13
|
||||
ARG MAX_JOBS
|
||||
ARG NUMACTL_VERSION=2.0.19
|
||||
RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_VERSION} \
|
||||
&& cd numactl \
|
||||
&& autoreconf -i && ./configure \
|
||||
&& make -j ${MAX_JOBS:-$(nproc)}
|
||||
|
||||
###############################################################
|
||||
# Stage to build lapack
|
||||
###############################################################
|
||||
|
||||
FROM base-builder AS lapack-builder
|
||||
|
||||
ARG MAX_JOBS
|
||||
ARG LAPACK_VERSION=3.12.1
|
||||
RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${LAPACK_VERSION} \
|
||||
&& cd lapack && source /opt/rh/gcc-toolset-13/enable \
|
||||
&& cmake -B build -S . \
|
||||
&& cmake --build build -j ${MAX_JOBS:-$(nproc)}
|
||||
|
||||
|
||||
###############################################################
|
||||
# FINAL VLLM IMAGE STAGE #
|
||||
###############################################################
|
||||
|
||||
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS vllm-openai
|
||||
|
||||
ARG PYTHON_VERSION=3.12
|
||||
ARG OPENBLAS_VERSION=0.3.29
|
||||
|
||||
# Set Environment Variables for venv & openblas
|
||||
ENV VIRTUAL_ENV=/opt/vllm
|
||||
ENV PATH=${VIRTUAL_ENV}/bin:$PATH
|
||||
ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig/
|
||||
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64:/usr/local/lib:/usr/lib64:/usr/lib
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
# create artificial dependencies between stages for independent stages to build in parallel
|
||||
COPY --from=torch-builder /tmp/control /dev/null
|
||||
COPY --from=arrow-builder /tmp/control /dev/null
|
||||
COPY --from=cv-builder /tmp/control /dev/null
|
||||
COPY --from=vllmcache-builder /tmp/control /dev/null
|
||||
COPY --from=numa-builder /tmp/control /dev/null
|
||||
COPY --from=lapack-builder /tmp/control /dev/null
|
||||
|
||||
# install gcc-11, python, openblas, numactl, lapack
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
|
||||
--mount=type=bind,from=lapack-builder,source=/lapack/,target=/lapack/,rw \
|
||||
rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
|
||||
microdnf install --nodocs -y \
|
||||
tar findutils openssl \
|
||||
pkgconfig xsimd g++ gcc-fortran libsndfile \
|
||||
libtiff libjpeg openjpeg2 zlib zeromq \
|
||||
freetype lcms2 libwebp tcl tk utf8proc \
|
||||
harfbuzz fribidi libraqm libimagequant libxcb \
|
||||
python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
|
||||
&& microdnf clean all \
|
||||
&& python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
|
||||
&& python -m pip install -U pip uv --no-cache \
|
||||
&& curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
|
||||
&& make -C /numactl install \
|
||||
&& uv pip install cmake \
|
||||
&& cmake --install /lapack/build \
|
||||
&& uv pip uninstall cmake
|
||||
|
||||
# consume previously built wheels (including vllm)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
|
||||
--mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
|
||||
--mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
|
||||
--mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \
|
||||
HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /vllmwheel/*.whl
|
||||
# Some packages in requirements/cpu are installed here
|
||||
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
|
||||
# Currently these may not be available for venv or pip directly
|
||||
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes
|
||||
|
||||
COPY ./ /workspace/vllm
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
|
||||
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
|
||||
-r requirements/cpu.txt \
|
||||
xformers uvloop==0.20.0
|
||||
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install -e tests/vllm_test_utils
|
||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
|
||||
WORKDIR /workspace/
|
||||
|
||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||
|
||||
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
|
||||
@ -40,7 +40,7 @@ ARG USE_CYTHON
|
||||
RUN cd vllm \
|
||||
&& python3 -m pip install -r requirements/rocm.txt \
|
||||
&& python3 setup.py clean --all \
|
||||
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \
|
||||
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist
|
||||
FROM scratch AS export_vllm
|
||||
ARG COMMON_WORKDIR
|
||||
|
||||
21
README.md
21
README.md
@ -28,27 +28,10 @@ Easy, fast, and cheap LLM serving for everyone
|
||||
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
|
||||
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
|
||||
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
|
||||
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
|
||||
|
||||
<details>
|
||||
<summary>Previous News</summary>
|
||||
|
||||
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
|
||||
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
|
||||
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
|
||||
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
|
||||
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
|
||||
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
|
||||
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
|
||||
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
|
||||
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
|
||||
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
|
||||
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
|
||||
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
|
||||
|
||||
</details>
|
||||
- [2024/12] vLLM joins [PyTorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
|
||||
|
||||
---
|
||||
|
||||
## About
|
||||
|
||||
vLLM is a fast and easy-to-use library for LLM inference and serving.
|
||||
|
||||
@ -1,420 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# Adapted from sglang quantization/tuning_block_wise_kernel.py
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import tqdm
|
||||
import triton
|
||||
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
_w8a8_block_fp8_matmul)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
mp.set_start_method("spawn", force=True)
|
||||
|
||||
assert current_platform.is_cuda(
|
||||
), "Only support tune w8a8 block fp8 kernel on CUDA device."
|
||||
|
||||
DTYPE_MAP = {
|
||||
"float32": torch.float32,
|
||||
"float16": torch.float16,
|
||||
"half": torch.half,
|
||||
"bfloat16": torch.bfloat16,
|
||||
}
|
||||
|
||||
|
||||
def w8a8_block_matmul(
|
||||
A: torch.Tensor,
|
||||
B: torch.Tensor,
|
||||
As: torch.Tensor,
|
||||
Bs: torch.Tensor,
|
||||
block_size: list[int],
|
||||
config: dict[str, Any],
|
||||
output_dtype: torch.dtype = torch.float16,
|
||||
) -> torch.Tensor:
|
||||
"""This function performs matrix multiplication with
|
||||
block-wise quantization.
|
||||
|
||||
It takes two input tensors `A` and `B` with scales `As` and `Bs`.
|
||||
The output is returned in the specified `output_dtype`.
|
||||
|
||||
Args:
|
||||
A: The input tensor, e.g., activation.
|
||||
B: The input tensor, e.g., weight.
|
||||
As: The per-token-group quantization scale for `A`.
|
||||
Bs: The per-block quantization scale for `B`.
|
||||
block_size: The block size for per-block quantization.
|
||||
It should be 2-dim, e.g., [128, 128].
|
||||
output_dytpe: The dtype of the returned tensor.
|
||||
|
||||
Returns:
|
||||
torch.Tensor: The result of matmul.
|
||||
"""
|
||||
assert len(block_size) == 2
|
||||
block_n, block_k = block_size[0], block_size[1]
|
||||
|
||||
assert A.shape[-1] == B.shape[-1]
|
||||
assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
|
||||
assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
|
||||
M = A.numel() // A.shape[-1]
|
||||
|
||||
assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
|
||||
N, K = B.shape
|
||||
assert triton.cdiv(N, block_n) == Bs.shape[0]
|
||||
assert triton.cdiv(K, block_k) == Bs.shape[1]
|
||||
|
||||
C_shape = A.shape[:-1] + (N, )
|
||||
C = A.new_empty(C_shape, dtype=output_dtype)
|
||||
|
||||
def grid(META):
|
||||
return (triton.cdiv(M, META["BLOCK_SIZE_M"]) *
|
||||
triton.cdiv(N, META["BLOCK_SIZE_N"]), )
|
||||
|
||||
if A.dtype == torch.float8_e4m3fn:
|
||||
kernel = _w8a8_block_fp8_matmul
|
||||
else:
|
||||
raise RuntimeError(
|
||||
"Currently, only support tune w8a8 block fp8 kernel.")
|
||||
|
||||
kernel[grid](
|
||||
A,
|
||||
B,
|
||||
C,
|
||||
As,
|
||||
Bs,
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
block_n,
|
||||
block_k,
|
||||
A.stride(-2),
|
||||
A.stride(-1),
|
||||
B.stride(1),
|
||||
B.stride(0),
|
||||
C.stride(-2),
|
||||
C.stride(-1),
|
||||
As.stride(-2),
|
||||
As.stride(-1),
|
||||
Bs.stride(1),
|
||||
Bs.stride(0),
|
||||
**config,
|
||||
)
|
||||
|
||||
return C
|
||||
|
||||
|
||||
def get_configs_compute_bound():
|
||||
configs = []
|
||||
for num_stages in [2, 3, 4, 5]:
|
||||
for block_m in [16, 32, 64, 128, 256]:
|
||||
for block_k in [64, 128]:
|
||||
for block_n in [32, 64, 128, 256]:
|
||||
for num_warps in [4, 8]:
|
||||
for group_size in [1, 16, 32, 64]:
|
||||
configs.append({
|
||||
"BLOCK_SIZE_M": block_m,
|
||||
"BLOCK_SIZE_N": block_n,
|
||||
"BLOCK_SIZE_K": block_k,
|
||||
"GROUP_SIZE_M": group_size,
|
||||
"num_warps": num_warps,
|
||||
"num_stages": num_stages,
|
||||
})
|
||||
return configs
|
||||
|
||||
|
||||
def get_weight_shapes(tp_size):
|
||||
# NOTE(HandH1998): The weight shapes only works for DeepSeek-V3.
|
||||
# Modify them, if you tune for another different model.
|
||||
# cannot TP
|
||||
total = [
|
||||
(512 + 64, 7168),
|
||||
((128 + 64) * 128, 7168),
|
||||
(128 * (128 + 128), 512),
|
||||
(7168, 16384),
|
||||
(7168, 18432),
|
||||
]
|
||||
# N can TP
|
||||
n_tp = [
|
||||
(18432 * 2, 7168),
|
||||
((128 + 64) * 128, 7168),
|
||||
(128 * (128 + 128), 512),
|
||||
(24576, 1536),
|
||||
(12288, 7168),
|
||||
(4096, 7168),
|
||||
]
|
||||
# K can TP
|
||||
k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)]
|
||||
|
||||
weight_shapes = []
|
||||
for t in total:
|
||||
weight_shapes.append(t)
|
||||
for n_t in n_tp:
|
||||
new_t = (n_t[0] // tp_size, n_t[1])
|
||||
weight_shapes.append(new_t)
|
||||
for k_t in k_tp:
|
||||
new_t = (k_t[0], k_t[1] // tp_size)
|
||||
weight_shapes.append(new_t)
|
||||
return weight_shapes
|
||||
|
||||
|
||||
def benchmark_config(A,
|
||||
B,
|
||||
As,
|
||||
Bs,
|
||||
block_size,
|
||||
config,
|
||||
out_dtype=torch.float16,
|
||||
num_iters=10):
|
||||
|
||||
def run():
|
||||
w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
# JIT complication & warmup
|
||||
for _ in range(5):
|
||||
run()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
start_event = torch.cuda.Event(enable_timing=True)
|
||||
end_event = torch.cuda.Event(enable_timing=True)
|
||||
|
||||
latencies: list[float] = []
|
||||
for i in range(num_iters):
|
||||
torch.cuda.synchronize()
|
||||
start_event.record()
|
||||
run()
|
||||
end_event.record()
|
||||
end_event.synchronize()
|
||||
latencies.append(start_event.elapsed_time(end_event))
|
||||
avg = sum(latencies) / (num_iters * 10) * 1000 # us
|
||||
return avg
|
||||
|
||||
|
||||
def tune(M, N, K, block_size, out_dtype, search_space, input_type):
|
||||
factor_for_scale = 1e-2
|
||||
|
||||
if input_type == "fp8":
|
||||
fp8_info = torch.finfo(torch.float8_e4m3fn)
|
||||
fp8_max, fp8_min = fp8_info.max, fp8_info.min
|
||||
|
||||
A_fp32 = (
|
||||
(torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 *
|
||||
fp8_max)
|
||||
A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
|
||||
|
||||
B_fp32 = (
|
||||
(torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 *
|
||||
fp8_max)
|
||||
B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
"Currently, only support tune w8a8 block fp8 kernel.")
|
||||
|
||||
block_n, block_k = block_size[0], block_size[1]
|
||||
n_tiles = (N + block_n - 1) // block_n
|
||||
k_tiles = (K + block_k - 1) // block_k
|
||||
|
||||
As = torch.rand(M, k_tiles, dtype=torch.float32,
|
||||
device="cuda") * factor_for_scale
|
||||
Bs = (torch.rand(n_tiles, k_tiles, dtype=torch.float32, device="cuda") *
|
||||
factor_for_scale)
|
||||
|
||||
best_config = None
|
||||
best_time = float("inf")
|
||||
for config in tqdm(search_space):
|
||||
try:
|
||||
kernel_time = benchmark_config(
|
||||
A,
|
||||
B,
|
||||
As,
|
||||
Bs,
|
||||
block_size,
|
||||
config,
|
||||
out_dtype,
|
||||
num_iters=10,
|
||||
)
|
||||
except triton.runtime.autotuner.OutOfResources:
|
||||
# Some configurations may be invalid and fail to compile.
|
||||
continue
|
||||
|
||||
if kernel_time < best_time:
|
||||
best_time = kernel_time
|
||||
best_config = config
|
||||
now = datetime.now()
|
||||
print(f"{now.ctime()}] Completed tuning for batch_size={M}")
|
||||
assert best_config is not None
|
||||
return best_config
|
||||
|
||||
|
||||
def save_configs(
|
||||
N,
|
||||
K,
|
||||
block_n,
|
||||
block_k,
|
||||
configs,
|
||||
save_path,
|
||||
input_type="fp8",
|
||||
) -> None:
|
||||
os.makedirs(save_path, exist_ok=True)
|
||||
device_name = current_platform.get_device_name().replace(" ", "_")
|
||||
json_file_name = (
|
||||
f"N={N},K={K},device_name={device_name},dtype={input_type}_w8a8,"
|
||||
f"block_shape=[{block_n},{block_k}].json")
|
||||
|
||||
config_file_path = os.path.join(save_path, json_file_name)
|
||||
print(f"Writing best config to {config_file_path}...")
|
||||
|
||||
with open(config_file_path, "w") as f:
|
||||
json.dump(configs, f, indent=4)
|
||||
f.write("\n")
|
||||
|
||||
|
||||
def tune_on_gpu(args_dict):
|
||||
"""Run tuning on a specific GPU."""
|
||||
gpu_id = args_dict["gpu_id"]
|
||||
batch_sizes = args_dict["batch_sizes"]
|
||||
weight_shapes = args_dict["weight_shapes"]
|
||||
args = args_dict["args"]
|
||||
|
||||
torch.cuda.set_device(gpu_id)
|
||||
print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
|
||||
|
||||
block_n = args.block_n
|
||||
block_k = args.block_k
|
||||
out_dtype = DTYPE_MAP[args.out_dtype]
|
||||
save_path = args.save_path
|
||||
input_type = args.input_type
|
||||
|
||||
search_space = get_configs_compute_bound()
|
||||
search_space = [
|
||||
config for config in search_space
|
||||
if block_k % config["BLOCK_SIZE_K"] == 0
|
||||
]
|
||||
|
||||
start = time.time()
|
||||
for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
|
||||
N, K = shape[0], shape[1]
|
||||
print(f"[GPU {gpu_id}] Tune for weight shape of `N: {N}, K: {K}`")
|
||||
benchmark_results = [
|
||||
tune(
|
||||
batch_size,
|
||||
N,
|
||||
K,
|
||||
[block_n, block_k],
|
||||
out_dtype,
|
||||
search_space,
|
||||
input_type,
|
||||
) for batch_size in tqdm(batch_sizes,
|
||||
desc=f"GPU {gpu_id} - Batch sizes")
|
||||
]
|
||||
best_configs = {
|
||||
M: config
|
||||
for M, config in zip(batch_sizes, benchmark_results)
|
||||
}
|
||||
save_configs(N, K, block_n, block_k, best_configs, save_path,
|
||||
input_type)
|
||||
|
||||
end = time.time()
|
||||
print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
|
||||
|
||||
|
||||
def distribute_batch_sizes(batch_sizes, num_gpus):
|
||||
"""Distribute batch sizes across available GPUs."""
|
||||
batches_per_gpu = []
|
||||
for i in range(num_gpus):
|
||||
start_idx = i * len(batch_sizes) // num_gpus
|
||||
end_idx = (i + 1) * len(batch_sizes) // num_gpus
|
||||
batches_per_gpu.append(batch_sizes[start_idx:end_idx])
|
||||
return batches_per_gpu
|
||||
|
||||
|
||||
def main(args):
|
||||
print(args)
|
||||
num_gpus = torch.cuda.device_count()
|
||||
if num_gpus == 0:
|
||||
raise RuntimeError("No GPU available for tuning")
|
||||
print(f"Found {num_gpus} GPUs for parallel tuning")
|
||||
|
||||
torch.cuda.init()
|
||||
|
||||
if args.batch_size is None:
|
||||
batch_sizes = [
|
||||
1,
|
||||
2,
|
||||
4,
|
||||
8,
|
||||
16,
|
||||
24,
|
||||
32,
|
||||
48,
|
||||
64,
|
||||
96,
|
||||
128,
|
||||
256,
|
||||
512,
|
||||
1024,
|
||||
1536,
|
||||
2048,
|
||||
3072,
|
||||
4096,
|
||||
]
|
||||
else:
|
||||
batch_sizes = [args.batch_size]
|
||||
num_gpus = 1 # If only one batch size, use only one GPU
|
||||
|
||||
weight_shapes = get_weight_shapes(args.tp_size)
|
||||
|
||||
batches_per_gpu = distribute_batch_sizes(batch_sizes, num_gpus)
|
||||
|
||||
process_args = []
|
||||
for gpu_id in range(num_gpus):
|
||||
process_args.append({
|
||||
"gpu_id": gpu_id,
|
||||
"batch_sizes": batches_per_gpu[gpu_id],
|
||||
"weight_shapes":
|
||||
weight_shapes, # Each GPU processes all weight shapes
|
||||
"args": args,
|
||||
})
|
||||
|
||||
ctx = mp.get_context("spawn")
|
||||
with ctx.Pool(num_gpus) as pool:
|
||||
pool.map(tune_on_gpu, process_args)
|
||||
|
||||
print("Multi-GPU tuning completed")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(
|
||||
description="""
|
||||
Tune triton w8a8 block fp8 for DeepSeek-V3/DeepSeek-R1:
|
||||
python3 benchmark_w8a8_block_fp8.py --tp-size 8 --input-type fp8
|
||||
Then copy to model_executor/layers/quantization/utils/configs
|
||||
""",
|
||||
formatter_class=argparse.RawTextHelpFormatter)
|
||||
|
||||
parser.add_argument("--tp-size", "-tp", type=int, default=8)
|
||||
parser.add_argument("--input-type",
|
||||
type=str,
|
||||
choices=["fp8"],
|
||||
default="fp8")
|
||||
parser.add_argument(
|
||||
"--out-dtype",
|
||||
type=str,
|
||||
choices=["float32", "float16", "bfloat16", "half"],
|
||||
default="float16",
|
||||
)
|
||||
parser.add_argument("--block-n", type=int, default=128)
|
||||
parser.add_argument("--block-k", type=int, default=128)
|
||||
parser.add_argument("--batch-size", type=int, required=False)
|
||||
parser.add_argument("--save-path", type=str, default="./")
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args)
|
||||
@ -190,7 +190,6 @@ set(VLLM_EXT_SRC
|
||||
"csrc/cpu/cache.cpp"
|
||||
"csrc/cpu/utils.cpp"
|
||||
"csrc/cpu/layernorm.cpp"
|
||||
"csrc/cpu/mla_decode.cpp"
|
||||
"csrc/cpu/pos_encoding.cpp"
|
||||
"csrc/cpu/torch_bindings.cpp")
|
||||
|
||||
|
||||
@ -1,92 +0,0 @@
|
||||
#include <torch/all.h>
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
|
||||
namespace vllm {
|
||||
__global__ void append_kernel(const int* __restrict__ row_indices,
|
||||
const int* __restrict__ cu_num_appends,
|
||||
const int* __restrict__ block_ids,
|
||||
int* __restrict__ block_table,
|
||||
int max_num_blocks_per_row) {
|
||||
int bid = blockIdx.x;
|
||||
int tgt_row = row_indices[2 * bid];
|
||||
int tgt_offset = row_indices[2 * bid + 1];
|
||||
|
||||
int start = cu_num_appends[bid];
|
||||
int end = cu_num_appends[bid + 1];
|
||||
int length = end - start;
|
||||
int tid = threadIdx.x;
|
||||
int64_t offset = tgt_row * max_num_blocks_per_row + tgt_offset;
|
||||
for (int i = tid; i < length; i += blockDim.x) {
|
||||
block_table[offset + i] = block_ids[start + i];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void move_kernel(const int* __restrict__ src_dst_n,
|
||||
int* __restrict__ block_table,
|
||||
int max_num_blocks_per_row) {
|
||||
int bid = blockIdx.x;
|
||||
int src_row = src_dst_n[3 * bid];
|
||||
int tgt_row = src_dst_n[3 * bid + 1];
|
||||
int num_blocks = src_dst_n[3 * bid + 2];
|
||||
|
||||
int tid = threadIdx.x;
|
||||
for (int i = tid; i < num_blocks; i += blockDim.x) {
|
||||
block_table[tgt_row * max_num_blocks_per_row + i] =
|
||||
block_table[src_row * max_num_blocks_per_row + i];
|
||||
}
|
||||
}
|
||||
} // namespace vllm
|
||||
|
||||
void block_table_appends(
|
||||
torch::Tensor& append_row_indices,
|
||||
torch::Tensor& append_row_indices_cpu,
|
||||
torch::Tensor& append_cumsums,
|
||||
torch::Tensor& append_cumsums_cpu,
|
||||
torch::Tensor& append_block_ids,
|
||||
torch::Tensor& append_block_ids_cpu,
|
||||
torch::Tensor& block_table,
|
||||
int64_t num_appends,
|
||||
int64_t total_num_append_blocks) {
|
||||
int* append_row_indices_ptr = append_row_indices.data_ptr<int>();
|
||||
const int* append_row_indices_cpu_ptr = append_row_indices_cpu.data_ptr<int>();
|
||||
int* append_cumsums_ptr = append_cumsums.data_ptr<int>();
|
||||
const int* append_cumsums_cpu_ptr = append_cumsums_cpu.data_ptr<int>();
|
||||
int* append_block_ids_ptr = append_block_ids.data_ptr<int>();
|
||||
const int* append_block_ids_cpu_ptr = append_block_ids_cpu.data_ptr<int>();
|
||||
int* block_table_ptr = block_table.data_ptr<int>();
|
||||
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(block_table));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
cudaMemcpyAsync(append_row_indices_ptr, append_row_indices_cpu_ptr,
|
||||
num_appends * 2 * sizeof(int), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemcpyAsync(append_cumsums_ptr, append_cumsums_cpu_ptr,
|
||||
(num_appends + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemcpyAsync(append_block_ids_ptr, append_block_ids_cpu_ptr,
|
||||
total_num_append_blocks * sizeof(int), cudaMemcpyHostToDevice, stream);
|
||||
|
||||
int64_t max_num_blocks_per_row = block_table.size(1);
|
||||
vllm::append_kernel<<<num_appends, 1024, 0, stream>>>(
|
||||
append_row_indices_ptr, append_cumsums_ptr, append_block_ids_ptr,
|
||||
block_table_ptr, max_num_blocks_per_row);
|
||||
}
|
||||
|
||||
void block_table_moves(
|
||||
torch::Tensor& src_dst_n,
|
||||
torch::Tensor& src_dst_n_cpu,
|
||||
torch::Tensor& block_table,
|
||||
int64_t num_moves) {
|
||||
int* src_dst_n_ptr = src_dst_n.data_ptr<int>();
|
||||
const int* src_dst_n_cpu_ptr = src_dst_n_cpu.data_ptr<int>();
|
||||
int* block_table_ptr = block_table.data_ptr<int>();
|
||||
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(block_table));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
cudaMemcpyAsync(src_dst_n_ptr, src_dst_n_cpu_ptr,
|
||||
num_moves * 3 * sizeof(int), cudaMemcpyHostToDevice, stream);
|
||||
|
||||
int64_t max_num_blocks_per_row = block_table.size(1);
|
||||
vllm::move_kernel<<<num_moves, 1024, 0, stream>>>(
|
||||
src_dst_n_ptr, block_table_ptr, max_num_blocks_per_row);
|
||||
}
|
||||
@ -88,48 +88,6 @@ void reshape_and_cache_cpu_impl(
|
||||
}
|
||||
}; // namespace
|
||||
|
||||
template <typename scalar_t>
|
||||
void concat_and_cache_mla_cpu_impl(
|
||||
const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank]
|
||||
const scalar_t* __restrict__ k_pe, // [num_tokens, pe_dim]
|
||||
scalar_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank
|
||||
// + pe_dim)]
|
||||
const int64_t* __restrict__ slot_mapping, // [num_tokens]
|
||||
const int num_tokens, //
|
||||
const int block_stride, //
|
||||
const int entry_stride, //
|
||||
const int kv_c_stride, //
|
||||
const int k_pe_stride, //
|
||||
const int kv_lora_rank, //
|
||||
const int pe_dim, //
|
||||
const int block_size //
|
||||
) {
|
||||
#pragma omp parallel for
|
||||
for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
|
||||
const int64_t slot_idx = slot_mapping[token_idx];
|
||||
// NOTE: slot_idx can be -1 if the token is padded
|
||||
if (slot_idx < 0) {
|
||||
continue;
|
||||
}
|
||||
const int64_t block_idx = slot_idx / block_size;
|
||||
const int64_t block_offset = slot_idx % block_size;
|
||||
|
||||
auto copy = [&](const scalar_t* __restrict__ src,
|
||||
scalar_t* __restrict__ dst, int src_stride, int dst_stride,
|
||||
int size, int offset) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
const int64_t src_idx = token_idx * src_stride + i;
|
||||
const int64_t dst_idx =
|
||||
block_idx * block_stride + block_offset * entry_stride + i + offset;
|
||||
dst[dst_idx] = src[src_idx];
|
||||
}
|
||||
};
|
||||
|
||||
copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
|
||||
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
|
||||
}
|
||||
}
|
||||
|
||||
// Note: the key_caches and value_caches vectors are constant but
|
||||
// not the Tensors they contain. The vectors need to be const refs
|
||||
// in order to satisfy pytorch's C++ operator registration code.
|
||||
@ -176,38 +134,6 @@ void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
||||
});
|
||||
}
|
||||
|
||||
void concat_and_cache_mla(
|
||||
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
|
||||
torch::Tensor& k_pe, // [num_tokens, pe_dim]
|
||||
torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank +
|
||||
// pe_dim)]
|
||||
torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
|
||||
const std::string& kv_cache_dtype, torch::Tensor& scale) {
|
||||
int num_tokens = slot_mapping.size(0);
|
||||
int kv_lora_rank = kv_c.size(1);
|
||||
int pe_dim = k_pe.size(1);
|
||||
int block_size = kv_cache.size(1);
|
||||
|
||||
TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
|
||||
TORCH_CHECK(kv_cache_dtype != "fp8");
|
||||
|
||||
int kv_c_stride = kv_c.stride(0);
|
||||
int k_pe_stride = k_pe.stride(0);
|
||||
int block_stride = kv_cache.stride(0);
|
||||
int entry_stride = kv_cache.stride(1);
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
kv_c.scalar_type(), "concat_and_cache_mla_cpu_impl", [&] {
|
||||
CPU_KERNEL_GUARD_IN(concat_and_cache_mla_cpu_impl)
|
||||
concat_and_cache_mla_cpu_impl<scalar_t>(
|
||||
kv_c.data_ptr<scalar_t>(), k_pe.data_ptr<scalar_t>(),
|
||||
kv_cache.data_ptr<scalar_t>(), slot_mapping.data_ptr<int64_t>(),
|
||||
num_tokens, block_stride, entry_stride, kv_c_stride, k_pe_stride,
|
||||
kv_lora_rank, pe_dim, block_size);
|
||||
CPU_KERNEL_GUARD_OUT(concat_and_cache_mla_cpu_impl)
|
||||
});
|
||||
}
|
||||
|
||||
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
const torch::Tensor& block_mapping) {
|
||||
TORCH_CHECK(false, "swap_blocks is unsupported on CPU.")
|
||||
|
||||
@ -130,8 +130,6 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
|
||||
|
||||
__m512i reg;
|
||||
|
||||
explicit BF16Vec32() : reg(_mm512_setzero_si512()) {}
|
||||
|
||||
explicit BF16Vec32(const void* ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
|
||||
|
||||
explicit BF16Vec32(__m512i data) : reg(data) {}
|
||||
|
||||
@ -1,393 +0,0 @@
|
||||
#include "cpu_types.hpp"
|
||||
#include <float.h>
|
||||
|
||||
namespace {
|
||||
template <typename scalar_t>
|
||||
struct KernelVecType {
|
||||
using qk_load_vec_type = void;
|
||||
using qk_vec_type = void;
|
||||
using v_load_vec_type = void;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct KernelVecType<float> {
|
||||
using qk_load_vec_type = vec_op::FP32Vec16;
|
||||
using qk_vec_type = vec_op::FP32Vec16;
|
||||
using v_load_vec_type = vec_op::FP32Vec16;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct KernelVecType<c10::Half> {
|
||||
#if defined(__powerpc64__) || defined(__s390x__)
|
||||
// Power and s390x architecture-specific vector types
|
||||
using qk_load_vec_type = vec_op::FP32Vec16;
|
||||
using qk_vec_type = vec_op::FP32Vec16;
|
||||
using v_load_vec_type = vec_op::FP32Vec16;
|
||||
#else
|
||||
// Fallback for other architectures, including x86
|
||||
using qk_load_vec_type = vec_op::FP16Vec16;
|
||||
using qk_vec_type = vec_op::FP32Vec16;
|
||||
using v_load_vec_type = vec_op::FP16Vec16;
|
||||
#endif
|
||||
};
|
||||
|
||||
#ifdef __AVX512BF16__
|
||||
template <>
|
||||
struct KernelVecType<c10::BFloat16> {
|
||||
using qk_load_vec_type = vec_op::BF16Vec32;
|
||||
using qk_vec_type = vec_op::BF16Vec32;
|
||||
using v_load_vec_type = vec_op::BF16Vec16;
|
||||
};
|
||||
#elif defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
|
||||
// pass
|
||||
#else
|
||||
template <>
|
||||
struct KernelVecType<c10::BFloat16> {
|
||||
using qk_load_vec_type = vec_op::BF16Vec16;
|
||||
using qk_vec_type = vec_op::FP32Vec16;
|
||||
using v_load_vec_type = vec_op::BF16Vec16;
|
||||
};
|
||||
#endif
|
||||
|
||||
template <int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE, int HEAD_UNROLL,
|
||||
typename qk_vec_type>
|
||||
void mla_decode_block_head(
|
||||
const qk_vec_type* __restrict__ q_vecs, // [HEAD_UNROLL, head_dim]
|
||||
const qk_vec_type* __restrict__ k_vecs, // [block_size, head_dim]
|
||||
const vec_op::FP32Vec16* __restrict v_vecs_f32, // [block_size, v_head_dim]
|
||||
float* __restrict__ acc_out, // [HEAD_UNROLL, v_head_dim]
|
||||
float* __restrict__ acc_lse, // [HEAD_UNROLL]
|
||||
const float scale, const int num_tokens) {
|
||||
using f32_vec_type = vec_op::FP32Vec16;
|
||||
constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
|
||||
constexpr int V_NUM_ELEM = f32_vec_type::VEC_ELEM_NUM;
|
||||
|
||||
float logits[BLOCK_SIZE][HEAD_UNROLL] = {}; // initialize to zeros
|
||||
float max_val[HEAD_UNROLL];
|
||||
std::fill(max_val, max_val + HEAD_UNROLL, -FLT_MAX);
|
||||
|
||||
f32_vec_type acc_vec[BLOCK_SIZE][HEAD_UNROLL];
|
||||
for (int i = 0; i < HEAD_DIM; i += QK_NUM_ELEM) {
|
||||
// load to registers
|
||||
qk_vec_type q_vec[HEAD_UNROLL];
|
||||
|
||||
#pragma unroll
|
||||
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
|
||||
q_vec[unroll] =
|
||||
qk_vec_type{q_vecs[(i + unroll * HEAD_DIM) / QK_NUM_ELEM]};
|
||||
|
||||
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
|
||||
qk_vec_type k_vec(k_vecs[(block_offset * HEAD_DIM + i) / QK_NUM_ELEM]);
|
||||
|
||||
#pragma unroll
|
||||
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
|
||||
vec_op::fma(acc_vec[block_offset][unroll], q_vec[unroll], k_vec);
|
||||
}
|
||||
}
|
||||
|
||||
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
|
||||
#pragma unroll
|
||||
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
|
||||
const float acc = acc_vec[block_offset][unroll].reduce_sum() * scale;
|
||||
logits[block_offset][unroll] = acc;
|
||||
max_val[unroll] = std::max(max_val[unroll], acc);
|
||||
}
|
||||
}
|
||||
|
||||
float sum_exp[HEAD_UNROLL] = {};
|
||||
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
|
||||
#pragma unroll
|
||||
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
|
||||
const float val =
|
||||
std::exp(logits[block_offset][unroll] - max_val[unroll]);
|
||||
logits[block_offset][unroll] = val;
|
||||
sum_exp[unroll] += val;
|
||||
}
|
||||
}
|
||||
|
||||
f32_vec_type this_out[V_HEAD_DIM / V_NUM_ELEM][HEAD_UNROLL];
|
||||
|
||||
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
|
||||
// load to registers
|
||||
f32_vec_type scale_[HEAD_UNROLL];
|
||||
|
||||
#pragma unroll
|
||||
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
|
||||
scale_[unroll] =
|
||||
f32_vec_type{logits[block_offset][unroll] / sum_exp[unroll]};
|
||||
|
||||
for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) {
|
||||
f32_vec_type v_vec(
|
||||
v_vecs_f32[(block_offset * HEAD_DIM + i) / V_NUM_ELEM]);
|
||||
|
||||
#pragma unroll
|
||||
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
|
||||
vec_op::fma(this_out[i / V_NUM_ELEM][unroll], v_vec, scale_[unroll]);
|
||||
}
|
||||
}
|
||||
|
||||
// merge attention state
|
||||
// section 2.2 in https://arxiv.org/pdf/2501.01005
|
||||
f32_vec_type prev_scale[HEAD_UNROLL];
|
||||
f32_vec_type curr_scale[HEAD_UNROLL];
|
||||
|
||||
#pragma unroll
|
||||
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
|
||||
const float prev_lse = acc_lse[unroll];
|
||||
const float curr_lse = std::log(sum_exp[unroll]) +
|
||||
max_val[unroll]; // add back max_val to get true lse
|
||||
// softmax trick
|
||||
const float max_lse = std::max(prev_lse, curr_lse);
|
||||
const float prev_sum_exp = std::exp(prev_lse - max_lse);
|
||||
const float curr_sum_exp = std::exp(curr_lse - max_lse);
|
||||
|
||||
const float new_sum_exp = prev_sum_exp + curr_sum_exp;
|
||||
acc_lse[unroll] = std::log(new_sum_exp) + max_lse;
|
||||
|
||||
prev_scale[unroll] = f32_vec_type{prev_sum_exp / new_sum_exp};
|
||||
curr_scale[unroll] = f32_vec_type{curr_sum_exp / new_sum_exp};
|
||||
}
|
||||
|
||||
for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) {
|
||||
#pragma unroll
|
||||
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
|
||||
f32_vec_type o_vec(acc_out + i + V_HEAD_DIM * unroll);
|
||||
o_vec = o_vec * prev_scale[unroll] +
|
||||
this_out[i / V_NUM_ELEM][unroll] * curr_scale[unroll];
|
||||
o_vec.save(acc_out + i + V_HEAD_DIM * unroll);
|
||||
}
|
||||
}
|
||||
|
||||
q_vecs += HEAD_DIM / QK_NUM_ELEM * HEAD_UNROLL;
|
||||
acc_out += V_HEAD_DIM * HEAD_UNROLL;
|
||||
}
|
||||
|
||||
template <typename scalar_t, int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE,
|
||||
typename qk_vec_type>
|
||||
void mla_decode_block(
|
||||
const qk_vec_type* __restrict__ q_vecs, // [num_heads, head_dim]
|
||||
const scalar_t* __restrict__ kv_cache, // [block_size, head_dim]
|
||||
float* __restrict__ acc_out, // [num_heads, v_head_dim]
|
||||
float* __restrict__ acc_lse, // [num_heads]
|
||||
const int num_heads, const float scale, const int num_tokens) {
|
||||
using qk_load_vec_type = typename KernelVecType<scalar_t>::qk_load_vec_type;
|
||||
static_assert(
|
||||
std::is_same<qk_vec_type,
|
||||
typename KernelVecType<scalar_t>::qk_vec_type>::value);
|
||||
using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
|
||||
using f32_vec_type = vec_op::FP32Vec16;
|
||||
static_assert(qk_load_vec_type::VEC_ELEM_NUM == qk_vec_type::VEC_ELEM_NUM);
|
||||
static_assert(v_load_vec_type::VEC_ELEM_NUM == f32_vec_type::VEC_ELEM_NUM);
|
||||
constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
|
||||
constexpr int V_NUM_ELEM = v_load_vec_type::VEC_ELEM_NUM;
|
||||
|
||||
const qk_vec_type* k_vecs;
|
||||
const f32_vec_type* v_vecs_f32;
|
||||
float* kv_cache_f32 = nullptr;
|
||||
|
||||
if constexpr (!std::is_same<scalar_t, float>::value) {
|
||||
// convert KV cache block to FP32 to reuse it across query heads and
|
||||
// attn @ V computation, since FP16/BF16->FP32 is expensive.
|
||||
// TODO: move malloc outside of this fn to reuse across iterations.
|
||||
const int nbytes = BLOCK_SIZE * HEAD_DIM * sizeof(float);
|
||||
kv_cache_f32 = static_cast<float*>(std::aligned_alloc(64, nbytes));
|
||||
|
||||
for (int block_offset = 0; block_offset < num_tokens; ++block_offset)
|
||||
for (int i = 0; i < HEAD_DIM; i += V_NUM_ELEM) {
|
||||
v_load_vec_type kv_load_vec(kv_cache + block_offset * HEAD_DIM + i);
|
||||
f32_vec_type kv_vec_f32(kv_load_vec);
|
||||
kv_vec_f32.save(kv_cache_f32 + block_offset * HEAD_DIM + i);
|
||||
}
|
||||
|
||||
if constexpr (std::is_same<qk_load_vec_type, qk_vec_type>::value) {
|
||||
// for AVX512_BF16, Q @ K.T uses BF16 for K (no conversion)
|
||||
// NOTE: in this case, we only need to convert the V section to FP32.
|
||||
// But for simplicity, we will convert the whole KV block to FP32.
|
||||
k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache);
|
||||
} else {
|
||||
k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache_f32);
|
||||
}
|
||||
|
||||
// attn @ V always use FP32 for V, since attn is FP32.
|
||||
v_vecs_f32 = reinterpret_cast<const f32_vec_type*>(kv_cache_f32);
|
||||
|
||||
} else {
|
||||
// KV cache is FP32. don't need to do anything.
|
||||
k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache);
|
||||
v_vecs_f32 = reinterpret_cast<const f32_vec_type*>(kv_cache);
|
||||
}
|
||||
|
||||
// compute 2 heads at the same time to improve ILP and
|
||||
// take advantage of register cache for K and V.
|
||||
constexpr int HEAD_UNROLL = 2;
|
||||
for (int iter = 0; iter < num_heads / HEAD_UNROLL; ++iter) {
|
||||
mla_decode_block_head<HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE, HEAD_UNROLL>(
|
||||
q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens);
|
||||
|
||||
q_vecs += HEAD_UNROLL * HEAD_DIM / QK_NUM_ELEM;
|
||||
acc_out += HEAD_UNROLL * V_HEAD_DIM;
|
||||
acc_lse += HEAD_UNROLL;
|
||||
}
|
||||
|
||||
// take care of the remaining heads
|
||||
for (int iter = 0; iter < num_heads % HEAD_UNROLL; ++iter) {
|
||||
mla_decode_block_head<HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE, 1>(
|
||||
q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens);
|
||||
|
||||
q_vecs += HEAD_DIM / QK_NUM_ELEM;
|
||||
acc_out += V_HEAD_DIM;
|
||||
acc_lse += 1;
|
||||
}
|
||||
|
||||
if (kv_cache_f32 != nullptr) {
|
||||
std::free(kv_cache_f32);
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template <typename scalar_t, int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE>
|
||||
void mla_decode_kvcache_cpu_impl(
|
||||
scalar_t* __restrict__ out, // [num_seqs, num_heads, v_head_dim]
|
||||
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_dim]
|
||||
const scalar_t* __restrict__ kv_cache, // [num_blocks, block_size,
|
||||
// head_dim]
|
||||
const int num_heads, const float scale,
|
||||
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
const int* __restrict__ seq_lens, // [num_seqs]
|
||||
const int max_num_blocks_per_seq, const int o_stride, const int q_stride,
|
||||
const int kv_stride, const int num_seqs) {
|
||||
using qk_load_vec_type = typename KernelVecType<scalar_t>::qk_load_vec_type;
|
||||
using qk_vec_type = typename KernelVecType<scalar_t>::qk_vec_type;
|
||||
constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
|
||||
|
||||
// shared across threads
|
||||
const int max_threads = omp_get_max_threads();
|
||||
const int acc_out_nbytes =
|
||||
max_threads * num_heads * V_HEAD_DIM * sizeof(float);
|
||||
float* acc_out = static_cast<float*>(std::aligned_alloc(64, acc_out_nbytes));
|
||||
std::vector<float> acc_lse(max_threads * num_heads);
|
||||
|
||||
// allocate memory to pre-convert query to FP32 later
|
||||
float* q_f32;
|
||||
constexpr bool PRE_CONVERT_QUERY =
|
||||
!std::is_same<scalar_t, float>::value &&
|
||||
std::is_same<qk_vec_type, vec_op::FP32Vec16>::value;
|
||||
if constexpr (PRE_CONVERT_QUERY) {
|
||||
const int q_f32_nbytes = num_heads * HEAD_DIM * sizeof(float);
|
||||
q_f32 = static_cast<float*>(std::aligned_alloc(64, q_f32_nbytes));
|
||||
}
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
const int num_threads = omp_get_num_threads();
|
||||
const int thread_id = omp_get_thread_num();
|
||||
float* __restrict__ acc_out_thread =
|
||||
acc_out + thread_id * num_heads * V_HEAD_DIM;
|
||||
float* __restrict__ acc_lse_thread = acc_lse.data() + thread_id * num_heads;
|
||||
|
||||
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
|
||||
// reset accumulator
|
||||
std::fill(acc_out_thread, acc_out_thread + num_heads * V_HEAD_DIM, 0.0f);
|
||||
std::fill(acc_lse_thread, acc_lse_thread + num_heads, -FLT_MAX);
|
||||
|
||||
const int seq_len = seq_lens[seq_idx];
|
||||
const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
const int last_block_size = seq_len - (block_num - 1) * BLOCK_SIZE;
|
||||
|
||||
const qk_vec_type* q_vecs;
|
||||
if constexpr (PRE_CONVERT_QUERY) {
|
||||
// pre-convert query to FP32 since FP16/BF16->FP32 is slow.
|
||||
#pragma omp for
|
||||
for (int i = 0; i < num_heads * HEAD_DIM; i += QK_NUM_ELEM) {
|
||||
qk_load_vec_type q_load_vec(q + seq_idx * q_stride + i);
|
||||
qk_vec_type q_vec(q_load_vec);
|
||||
q_vec.save(q_f32 + i);
|
||||
}
|
||||
q_vecs = reinterpret_cast<const qk_vec_type*>(q_f32);
|
||||
} else {
|
||||
q_vecs = reinterpret_cast<const qk_vec_type*>(q + seq_idx * q_stride);
|
||||
}
|
||||
|
||||
#pragma omp for
|
||||
for (int block_idx = 0; block_idx < block_num; ++block_idx) {
|
||||
const int physical_block_idx =
|
||||
block_tables[seq_idx * max_num_blocks_per_seq + block_idx];
|
||||
const int num_tokens =
|
||||
block_idx < block_num - 1 ? BLOCK_SIZE : last_block_size;
|
||||
|
||||
mla_decode_block<scalar_t, HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE>(
|
||||
q_vecs, kv_cache + physical_block_idx * kv_stride, acc_out_thread,
|
||||
acc_lse_thread, num_heads, scale, num_tokens);
|
||||
}
|
||||
|
||||
// merge attention states across threads
|
||||
// section 2.2 in https://arxiv.org/pdf/2501.01005
|
||||
// each thread is responsible for 1 head
|
||||
#pragma omp for
|
||||
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
|
||||
float* acc_lse_head = acc_lse.data() + head_idx;
|
||||
float* acc_out_head = acc_out + head_idx * V_HEAD_DIM;
|
||||
|
||||
float max_val = -FLT_MAX;
|
||||
for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
|
||||
max_val = std::max(max_val, acc_lse_head[thread_id_ * num_heads]);
|
||||
}
|
||||
|
||||
float sum_exp = 0.0f;
|
||||
for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
|
||||
float val = std::exp(acc_lse_head[thread_id_ * num_heads] - max_val);
|
||||
acc_lse_head[thread_id_ * num_heads] = val;
|
||||
sum_exp += val;
|
||||
}
|
||||
|
||||
float inv_sum = 1.0f / sum_exp;
|
||||
float out_head[V_HEAD_DIM] = {};
|
||||
for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
|
||||
float scale_ = acc_lse_head[thread_id_ * num_heads] * inv_sum;
|
||||
for (int i = 0; i < V_HEAD_DIM; ++i) {
|
||||
out_head[i] +=
|
||||
acc_out_head[thread_id_ * num_heads * V_HEAD_DIM + i] * scale_;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < V_HEAD_DIM; ++i) {
|
||||
vec_op::storeFP32(out_head[i], out + seq_idx * o_stride +
|
||||
head_idx * V_HEAD_DIM + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (PRE_CONVERT_QUERY) {
|
||||
std::free(q_f32);
|
||||
}
|
||||
std::free(acc_out);
|
||||
}
|
||||
|
||||
void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
|
||||
torch::Tensor& kv_cache, double scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens) {
|
||||
const int num_seqs = query.size(0);
|
||||
const int num_heads = query.size(1);
|
||||
const int head_dim = query.size(2);
|
||||
const int block_size = kv_cache.size(1);
|
||||
const int v_head_dim = out.size(2);
|
||||
|
||||
const int max_num_blocks_per_seq = block_tables.size(1);
|
||||
const int o_stride = out.stride(0);
|
||||
const int q_stride = query.stride(0);
|
||||
const int kv_stride = kv_cache.stride(0);
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
query.scalar_type(), "mla_decode_kvcache_cpu_impl", [&] {
|
||||
CPU_KERNEL_GUARD_IN(mla_decode_kvcache_cpu_impl)
|
||||
if (head_dim == 576 && v_head_dim == 512 && block_size == 16)
|
||||
mla_decode_kvcache_cpu_impl<scalar_t, 576, 512, 16>(
|
||||
out.data_ptr<scalar_t>(), query.data_ptr<scalar_t>(),
|
||||
kv_cache.data_ptr<scalar_t>(), num_heads, scale,
|
||||
block_tables.data_ptr<int>(), seq_lens.data_ptr<int>(),
|
||||
max_num_blocks_per_seq, o_stride, q_stride, kv_stride, num_seqs);
|
||||
else
|
||||
TORCH_CHECK(false, "Unsupported block size: ", block_size);
|
||||
CPU_KERNEL_GUARD_OUT(mla_decode_kvcache_cpu_impl)
|
||||
});
|
||||
}
|
||||
@ -18,10 +18,6 @@ void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a,
|
||||
const std::optional<torch::Tensor>& azp,
|
||||
const std::optional<torch::Tensor>& bias);
|
||||
|
||||
void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
|
||||
torch::Tensor& kv_cache, double scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens);
|
||||
|
||||
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
// vLLM custom ops
|
||||
|
||||
@ -154,14 +150,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
||||
" str kv_cache_dtype,"
|
||||
" Tensor k_scale, Tensor v_scale) -> ()");
|
||||
cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
|
||||
|
||||
cache_ops.def(
|
||||
"concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
|
||||
" Tensor! kv_cache,"
|
||||
" Tensor slot_mapping,"
|
||||
" str kv_cache_dtype,"
|
||||
" Tensor scale) -> ()");
|
||||
cache_ops.impl("concat_and_cache_mla", torch::kCPU, &concat_and_cache_mla);
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
|
||||
@ -169,12 +157,4 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
|
||||
utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cpu), cpu_ops) {
|
||||
cpu_ops.def(
|
||||
"mla_decode_kvcache("
|
||||
" Tensor! out, Tensor query, Tensor kv_cache,"
|
||||
" float scale, Tensor block_tables, Tensor seq_lens) -> ()");
|
||||
cpu_ops.impl("mla_decode_kvcache", torch::kCPU, &mla_decode_kvcache);
|
||||
}
|
||||
|
||||
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
|
||||
|
||||
12
csrc/ops.h
12
csrc/ops.h
@ -119,18 +119,6 @@ void advance_step_flashinfer(
|
||||
torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
|
||||
torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);
|
||||
|
||||
void block_table_appends(torch::Tensor& append_row_indices,
|
||||
torch::Tensor& append_row_indices_cpu,
|
||||
torch::Tensor& append_cumsums,
|
||||
torch::Tensor& append_cumsums_cpu,
|
||||
torch::Tensor& append_block_ids,
|
||||
torch::Tensor& append_block_ids_cpu,
|
||||
torch::Tensor& block_table, int64_t num_appends,
|
||||
int64_t total_num_append_blocks);
|
||||
|
||||
void block_table_moves(torch::Tensor& src_dst_n, torch::Tensor& src_dst_n_cpu,
|
||||
torch::Tensor& block_table, int64_t num_moves);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
|
||||
const torch::Tensor& codebooks,
|
||||
|
||||
@ -375,25 +375,25 @@ torch::Tensor ggml_moe_a8(torch::Tensor X, // input
|
||||
int64_t ggml_moe_get_block_size(int64_t type) {
|
||||
switch (type) {
|
||||
case 2:
|
||||
return MOE_X_Q4_0;
|
||||
return MMQ_X_Q4_0;
|
||||
case 3:
|
||||
return MOE_X_Q4_1;
|
||||
return MMQ_X_Q4_1;
|
||||
case 6:
|
||||
return MOE_X_Q5_0;
|
||||
return MMQ_X_Q5_0;
|
||||
case 7:
|
||||
return MOE_X_Q5_1;
|
||||
return MMQ_X_Q5_1;
|
||||
case 8:
|
||||
return MOE_X_Q8_0;
|
||||
return MMQ_X_Q8_0;
|
||||
case 10:
|
||||
return MOE_X_Q2_K;
|
||||
return MMQ_X_Q2_K;
|
||||
case 11:
|
||||
return MOE_X_Q3_K;
|
||||
return MMQ_X_Q3_K;
|
||||
case 12:
|
||||
return MOE_X_Q4_K;
|
||||
return MMQ_X_Q4_K;
|
||||
case 13:
|
||||
return MOE_X_Q5_K;
|
||||
return MMQ_X_Q5_K;
|
||||
case 14:
|
||||
return MOE_X_Q6_K;
|
||||
return MMQ_X_Q6_K;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -129,12 +129,12 @@ static __device__ __forceinline__ void moe_q(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q4_0 64
|
||||
#define MOE_Y_Q4_0 128
|
||||
#define MMQ_X_Q4_0 64
|
||||
#define MMQ_Y_Q4_0 128
|
||||
#define NWARPS_Q4_0 8
|
||||
#else
|
||||
#define MOE_X_Q4_0 4
|
||||
#define MOE_Y_Q4_0 32
|
||||
#define MMQ_X_Q4_0 4
|
||||
#define MMQ_Y_Q4_0 32
|
||||
#define NWARPS_Q4_0 4
|
||||
#endif
|
||||
|
||||
@ -149,8 +149,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_0, 2)
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst,
|
||||
const int top_k) {
|
||||
const int mmq_x = MOE_X_Q4_0;
|
||||
const int mmq_y = MOE_Y_Q4_0;
|
||||
const int mmq_x = MMQ_X_Q4_0;
|
||||
const int mmq_y = MMQ_Y_Q4_0;
|
||||
const int nwarps = NWARPS_Q4_0;
|
||||
|
||||
moe_q<scalar_t, QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
|
||||
@ -167,8 +167,8 @@ static void ggml_moe_q4_0_q8_1_cuda(
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
|
||||
const int tokens_post_padded, cudaStream_t stream) {
|
||||
int mmq_x = MOE_X_Q4_0;
|
||||
int mmq_y = MOE_Y_Q4_0;
|
||||
int mmq_x = MMQ_X_Q4_0;
|
||||
int mmq_y = MMQ_Y_Q4_0;
|
||||
int nwarps = NWARPS_Q4_0;
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -190,12 +190,12 @@ static void ggml_moe_q4_0_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q4_1 64
|
||||
#define MOE_Y_Q4_1 128
|
||||
#define MMQ_X_Q4_1 64
|
||||
#define MMQ_Y_Q4_1 128
|
||||
#define NWARPS_Q4_1 8
|
||||
#else
|
||||
#define MOE_X_Q4_1 4
|
||||
#define MOE_Y_Q4_1 32
|
||||
#define MMQ_X_Q4_1 4
|
||||
#define MMQ_Y_Q4_1 32
|
||||
#define NWARPS_Q4_1 4
|
||||
#endif
|
||||
|
||||
@ -210,8 +210,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_1, 2)
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst,
|
||||
const int top_k) {
|
||||
const int mmq_x = MOE_X_Q4_1;
|
||||
const int mmq_y = MOE_Y_Q4_1;
|
||||
const int mmq_x = MMQ_X_Q4_1;
|
||||
const int mmq_y = MMQ_Y_Q4_1;
|
||||
const int nwarps = NWARPS_Q4_1;
|
||||
|
||||
moe_q<scalar_t, QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
|
||||
@ -228,8 +228,8 @@ static void ggml_moe_q4_1_q8_1_cuda(
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
|
||||
const int tokens_post_padded, cudaStream_t stream) {
|
||||
int mmq_x = MOE_X_Q4_1;
|
||||
int mmq_y = MOE_Y_Q4_1;
|
||||
int mmq_x = MMQ_X_Q4_1;
|
||||
int mmq_y = MMQ_Y_Q4_1;
|
||||
int nwarps = NWARPS_Q4_1;
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -251,12 +251,12 @@ static void ggml_moe_q4_1_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q5_0 64
|
||||
#define MOE_Y_Q5_0 128
|
||||
#define MMQ_X_Q5_0 64
|
||||
#define MMQ_Y_Q5_0 128
|
||||
#define NWARPS_Q5_0 8
|
||||
#else
|
||||
#define MOE_X_Q5_0 4
|
||||
#define MOE_Y_Q5_0 32
|
||||
#define MMQ_X_Q5_0 4
|
||||
#define MMQ_Y_Q5_0 32
|
||||
#define NWARPS_Q5_0 4
|
||||
#endif
|
||||
|
||||
@ -271,8 +271,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_0, 2)
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst,
|
||||
const int top_k) {
|
||||
const int mmq_x = MOE_X_Q5_0;
|
||||
const int mmq_y = MOE_Y_Q5_0;
|
||||
const int mmq_x = MMQ_X_Q5_0;
|
||||
const int mmq_y = MMQ_Y_Q5_0;
|
||||
const int nwarps = NWARPS_Q5_0;
|
||||
|
||||
moe_q<scalar_t, QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
|
||||
@ -289,8 +289,8 @@ static void ggml_moe_q5_0_q8_1_cuda(
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
|
||||
const int tokens_post_padded, cudaStream_t stream) {
|
||||
const int mmq_x = MOE_X_Q5_0;
|
||||
const int mmq_y = MOE_Y_Q5_0;
|
||||
const int mmq_x = MMQ_X_Q5_0;
|
||||
const int mmq_y = MMQ_Y_Q5_0;
|
||||
const int nwarps = NWARPS_Q5_0;
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -312,12 +312,12 @@ static void ggml_moe_q5_0_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q5_1 64
|
||||
#define MOE_Y_Q5_1 128
|
||||
#define MMQ_X_Q5_1 64
|
||||
#define MMQ_Y_Q5_1 128
|
||||
#define NWARPS_Q5_1 8
|
||||
#else
|
||||
#define MOE_X_Q5_1 4
|
||||
#define MOE_Y_Q5_1 32
|
||||
#define MMQ_X_Q5_1 4
|
||||
#define MMQ_Y_Q5_1 32
|
||||
#define NWARPS_Q5_1 4
|
||||
#endif
|
||||
|
||||
@ -332,8 +332,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_1, 2)
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst,
|
||||
const int top_k) {
|
||||
const int mmq_x = MOE_X_Q5_1;
|
||||
const int mmq_y = MOE_Y_Q5_1;
|
||||
const int mmq_x = MMQ_X_Q5_1;
|
||||
const int mmq_y = MMQ_Y_Q5_1;
|
||||
const int nwarps = NWARPS_Q5_1;
|
||||
|
||||
moe_q<scalar_t, QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
|
||||
@ -350,8 +350,8 @@ static void ggml_moe_q5_1_q8_1_cuda(
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
|
||||
const int tokens_post_padded, cudaStream_t stream) {
|
||||
const int mmq_x = MOE_X_Q5_1;
|
||||
const int mmq_y = MOE_Y_Q5_1;
|
||||
const int mmq_x = MMQ_X_Q5_1;
|
||||
const int mmq_y = MMQ_Y_Q5_1;
|
||||
const int nwarps = NWARPS_Q5_1;
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -373,12 +373,12 @@ static void ggml_moe_q5_1_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q8_0 64
|
||||
#define MOE_Y_Q8_0 128
|
||||
#define MMQ_X_Q8_0 64
|
||||
#define MMQ_Y_Q8_0 128
|
||||
#define NWARPS_Q8_0 8
|
||||
#else
|
||||
#define MOE_X_Q8_0 4
|
||||
#define MOE_Y_Q8_0 32
|
||||
#define MMQ_X_Q8_0 4
|
||||
#define MMQ_Y_Q8_0 32
|
||||
#define NWARPS_Q8_0 4
|
||||
#endif
|
||||
|
||||
@ -393,8 +393,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q8_0, 2)
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst,
|
||||
const int top_k) {
|
||||
const int mmq_x = MOE_X_Q8_0;
|
||||
const int mmq_y = MOE_Y_Q8_0;
|
||||
const int mmq_x = MMQ_X_Q8_0;
|
||||
const int mmq_y = MMQ_Y_Q8_0;
|
||||
const int nwarps = NWARPS_Q8_0;
|
||||
|
||||
moe_q<scalar_t, QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
|
||||
@ -411,8 +411,8 @@ static void ggml_moe_q8_0_q8_1_cuda(
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
|
||||
const int tokens_post_padded, cudaStream_t stream) {
|
||||
const int mmq_x = MOE_X_Q8_0;
|
||||
const int mmq_y = MOE_Y_Q8_0;
|
||||
const int mmq_x = MMQ_X_Q8_0;
|
||||
const int mmq_y = MMQ_Y_Q8_0;
|
||||
const int nwarps = NWARPS_Q8_0;
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -434,12 +434,12 @@ static void ggml_moe_q8_0_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q2_K 64
|
||||
#define MOE_Y_Q2_K 128
|
||||
#define MMQ_X_Q2_K 64
|
||||
#define MMQ_Y_Q2_K 128
|
||||
#define NWARPS_Q2_K 8
|
||||
#else
|
||||
#define MOE_X_Q2_K 4
|
||||
#define MOE_Y_Q2_K 32
|
||||
#define MMQ_X_Q2_K 4
|
||||
#define MMQ_Y_Q2_K 32
|
||||
#define NWARPS_Q2_K 4
|
||||
#endif
|
||||
|
||||
@ -454,8 +454,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q2_K, 2)
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst,
|
||||
const int top_k) {
|
||||
const int mmq_x = MOE_X_Q2_K;
|
||||
const int mmq_y = MOE_Y_Q2_K;
|
||||
const int mmq_x = MMQ_X_Q2_K;
|
||||
const int mmq_y = MMQ_Y_Q2_K;
|
||||
const int nwarps = NWARPS_Q2_K;
|
||||
|
||||
moe_q<scalar_t, QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
|
||||
@ -472,8 +472,8 @@ static void ggml_moe_q2_K_q8_1_cuda(
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
|
||||
const int tokens_post_padded, cudaStream_t stream) {
|
||||
const int mmq_x = MOE_X_Q2_K;
|
||||
const int mmq_y = MOE_Y_Q2_K;
|
||||
const int mmq_x = MMQ_X_Q2_K;
|
||||
const int mmq_y = MMQ_Y_Q2_K;
|
||||
const int nwarps = NWARPS_Q2_K;
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -495,12 +495,12 @@ static void ggml_moe_q2_K_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q3_K 64
|
||||
#define MOE_Y_Q3_K 128
|
||||
#define MMQ_X_Q3_K 64
|
||||
#define MMQ_Y_Q3_K 128
|
||||
#define NWARPS_Q3_K 8
|
||||
#else
|
||||
#define MOE_X_Q3_K 4
|
||||
#define MOE_Y_Q3_K 32
|
||||
#define MMQ_X_Q3_K 4
|
||||
#define MMQ_Y_Q3_K 32
|
||||
#define NWARPS_Q3_K 4
|
||||
#endif
|
||||
|
||||
@ -516,8 +516,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q3_K, 2)
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst,
|
||||
const int top_k) {
|
||||
|
||||
const int mmq_x = MOE_X_Q3_K;
|
||||
const int mmq_y = MOE_Y_Q3_K;
|
||||
const int mmq_x = MMQ_X_Q3_K;
|
||||
const int mmq_y = MMQ_Y_Q3_K;
|
||||
const int nwarps = NWARPS_Q3_K;
|
||||
|
||||
moe_q<scalar_t, QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
|
||||
@ -533,8 +533,8 @@ static void ggml_moe_q3_K_q8_1_cuda(
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
|
||||
const int tokens_post_padded, cudaStream_t stream) {
|
||||
const int mmq_x = MOE_X_Q3_K;
|
||||
const int mmq_y = MOE_Y_Q3_K;
|
||||
const int mmq_x = MMQ_X_Q3_K;
|
||||
const int mmq_y = MMQ_Y_Q3_K;
|
||||
const int nwarps = NWARPS_Q3_K;
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -556,12 +556,12 @@ static void ggml_moe_q3_K_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q4_K 64
|
||||
#define MOE_Y_Q4_K 128
|
||||
#define MMQ_X_Q4_K 64
|
||||
#define MMQ_Y_Q4_K 128
|
||||
#define NWARPS_Q4_K 8
|
||||
#else
|
||||
#define MOE_X_Q4_K 4
|
||||
#define MOE_Y_Q4_K 32
|
||||
#define MMQ_X_Q4_K 4
|
||||
#define MMQ_Y_Q4_K 32
|
||||
#define NWARPS_Q4_K 4
|
||||
#endif
|
||||
|
||||
@ -576,8 +576,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_K, 2)
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst,
|
||||
const int top_k) {
|
||||
const int mmq_x = MOE_X_Q4_K;
|
||||
const int mmq_y = MOE_Y_Q4_K;
|
||||
const int mmq_x = MMQ_X_Q4_K;
|
||||
const int mmq_y = MMQ_Y_Q4_K;
|
||||
const int nwarps = NWARPS_Q4_K;
|
||||
|
||||
moe_q<scalar_t, QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
|
||||
@ -594,8 +594,8 @@ static void ggml_moe_q4_K_q8_1_cuda(
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
|
||||
const int tokens_post_padded, cudaStream_t stream) {
|
||||
const int mmq_x = MOE_X_Q4_K;
|
||||
const int mmq_y = MOE_Y_Q4_K;
|
||||
const int mmq_x = MMQ_X_Q4_K;
|
||||
const int mmq_y = MMQ_Y_Q4_K;
|
||||
const int nwarps = NWARPS_Q4_K;
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -617,12 +617,12 @@ static void ggml_moe_q4_K_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q5_K 64
|
||||
#define MOE_Y_Q5_K 128
|
||||
#define MMQ_X_Q5_K 64
|
||||
#define MMQ_Y_Q5_K 128
|
||||
#define NWARPS_Q5_K 8
|
||||
#else
|
||||
#define MOE_X_Q5_K 4
|
||||
#define MOE_Y_Q5_K 32
|
||||
#define MMQ_X_Q5_K 4
|
||||
#define MMQ_Y_Q5_K 32
|
||||
#define NWARPS_Q5_K 4
|
||||
#endif
|
||||
|
||||
@ -637,8 +637,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_K, 2)
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst,
|
||||
const int top_k) {
|
||||
const int mmq_x = MOE_X_Q5_K;
|
||||
const int mmq_y = MOE_Y_Q5_K;
|
||||
const int mmq_x = MMQ_X_Q5_K;
|
||||
const int mmq_y = MMQ_Y_Q5_K;
|
||||
const int nwarps = NWARPS_Q5_K;
|
||||
|
||||
moe_q<scalar_t, QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
|
||||
@ -655,8 +655,8 @@ static void ggml_moe_q5_K_q8_1_cuda(
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
|
||||
const int tokens_post_padded, cudaStream_t stream) {
|
||||
const int mmq_x = MOE_X_Q5_K;
|
||||
const int mmq_y = MOE_Y_Q5_K;
|
||||
const int mmq_x = MMQ_X_Q5_K;
|
||||
const int mmq_y = MMQ_Y_Q5_K;
|
||||
const int nwarps = NWARPS_Q5_K;
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -678,12 +678,12 @@ static void ggml_moe_q5_K_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q6_K 64
|
||||
#define MOE_Y_Q6_K 128
|
||||
#define MMQ_X_Q6_K 64
|
||||
#define MMQ_Y_Q6_K 128
|
||||
#define NWARPS_Q6_K 8
|
||||
#else
|
||||
#define MOE_X_Q6_K 4
|
||||
#define MOE_Y_Q6_K 32
|
||||
#define MMQ_X_Q6_K 4
|
||||
#define MMQ_Y_Q6_K 32
|
||||
#define NWARPS_Q6_K 4
|
||||
#endif
|
||||
|
||||
@ -698,8 +698,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q6_K, 2)
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst,
|
||||
const int top_k) {
|
||||
const int mmq_x = MOE_X_Q6_K;
|
||||
const int mmq_y = MOE_Y_Q6_K;
|
||||
const int mmq_x = MMQ_X_Q6_K;
|
||||
const int mmq_y = MMQ_Y_Q6_K;
|
||||
const int nwarps = NWARPS_Q6_K;
|
||||
|
||||
moe_q<scalar_t, QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
|
||||
@ -716,8 +716,8 @@ static void ggml_moe_q6_K_q8_1_cuda(
|
||||
const int exp_stride, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
|
||||
const int tokens_post_padded, cudaStream_t stream) {
|
||||
const int mmq_x = MOE_X_Q6_K;
|
||||
const int mmq_y = MOE_Y_Q6_K;
|
||||
const int mmq_x = MMQ_X_Q6_K;
|
||||
const int mmq_y = MMQ_Y_Q6_K;
|
||||
const int nwarps = NWARPS_Q6_K;
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
|
||||
@ -14,7 +14,7 @@ __global__ void awq_marlin_repack_kernel(
|
||||
int n_tiles = size_n / tile_n_size;
|
||||
int block_k_tiles = div_ceil(k_tiles, gridDim.x);
|
||||
|
||||
auto start_k_tile = blockIdx.x * block_k_tiles;
|
||||
int start_k_tile = blockIdx.x * block_k_tiles;
|
||||
if (start_k_tile >= k_tiles) {
|
||||
return;
|
||||
}
|
||||
@ -51,8 +51,8 @@ __global__ void awq_marlin_repack_kernel(
|
||||
int4* sh_ptr = sh + stage_size * pipe;
|
||||
|
||||
if (threadIdx.x < stage_size) {
|
||||
auto k_id = threadIdx.x / stage_n_threads;
|
||||
auto n_id = threadIdx.x % stage_n_threads;
|
||||
int k_id = threadIdx.x / stage_n_threads;
|
||||
int n_id = threadIdx.x % stage_n_threads;
|
||||
|
||||
int first_k = k_tile_id * tile_k_size;
|
||||
|
||||
@ -70,8 +70,8 @@ __global__ void awq_marlin_repack_kernel(
|
||||
return;
|
||||
}
|
||||
|
||||
auto warp_id = threadIdx.x / 32;
|
||||
auto th_id = threadIdx.x % 32;
|
||||
int warp_id = threadIdx.x / 32;
|
||||
int th_id = threadIdx.x % 32;
|
||||
|
||||
if (warp_id >= 4) {
|
||||
return;
|
||||
@ -265,4 +265,4 @@ TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) {
|
||||
m.impl("awq_marlin_repack", &awq_marlin_repack_meta);
|
||||
}
|
||||
}
|
||||
@ -42,7 +42,7 @@ namespace marlin {
|
||||
__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
|
||||
int const* __restrict__ perm_int_ptr,
|
||||
int4* __restrict__ out_int4_ptr, int size_m,
|
||||
int size_k, int lda, int block_rows) {}
|
||||
int size_k, int block_rows) {}
|
||||
|
||||
template <typename scalar_t, // compute dtype, half or nv_float16
|
||||
const vllm::ScalarTypeId w_type_id, // weight ScalarType id
|
||||
@ -459,32 +459,29 @@ __device__ inline void barrier_release(int* lock, bool reset = false) {
|
||||
__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
|
||||
int const* __restrict__ perm_int_ptr,
|
||||
int4* __restrict__ out_int4_ptr, int size_m,
|
||||
int size_k, int lda, int block_rows) {
|
||||
auto start_row = block_rows * blockIdx.x;
|
||||
int size_k, int block_rows) {
|
||||
int start_row = block_rows * blockIdx.x;
|
||||
int finish_row = start_row + block_rows;
|
||||
if (finish_row > size_m) {
|
||||
finish_row = size_m;
|
||||
}
|
||||
int cur_block_rows = finish_row - start_row;
|
||||
|
||||
int input_row_stride = lda * sizeof(half) / 16;
|
||||
int output_row_stride = size_k * sizeof(half) / 16;
|
||||
int row_stride = size_k * sizeof(half) / 16;
|
||||
|
||||
auto permute_row = [&](int row) {
|
||||
int iters = size_k / default_threads;
|
||||
int rest = size_k % default_threads;
|
||||
|
||||
int input_offset = row * input_row_stride;
|
||||
int output_offset = row * output_row_stride;
|
||||
int offset = row * row_stride;
|
||||
|
||||
half const* a_row_half =
|
||||
reinterpret_cast<half const*>(a_int4_ptr + input_offset);
|
||||
half* out_half = reinterpret_cast<half*>(out_int4_ptr + output_offset);
|
||||
half const* a_row_half = reinterpret_cast<half const*>(a_int4_ptr + offset);
|
||||
half* out_half = reinterpret_cast<half*>(out_int4_ptr + offset);
|
||||
|
||||
int base_k = 0;
|
||||
|
||||
for (int i = 0; i < iters; i++) {
|
||||
auto cur_k = base_k + threadIdx.x;
|
||||
int cur_k = base_k + threadIdx.x;
|
||||
int src_pos = perm_int_ptr[cur_k];
|
||||
|
||||
out_half[cur_k] = a_row_half[src_pos];
|
||||
@ -494,7 +491,7 @@ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
|
||||
|
||||
if (rest) {
|
||||
if (threadIdx.x < rest) {
|
||||
auto cur_k = base_k + threadIdx.x;
|
||||
int cur_k = base_k + threadIdx.x;
|
||||
int src_pos = perm_int_ptr[cur_k];
|
||||
|
||||
out_half[cur_k] = a_row_half[src_pos];
|
||||
@ -540,7 +537,6 @@ __global__ void Marlin(
|
||||
int prob_m, // batch dimension m
|
||||
int prob_n, // output dimension n
|
||||
int prob_k, // reduction dimension k
|
||||
int lda, // A.stride(0), equal to prob_k is A is contiguous
|
||||
int* locks, // extra global storage for barrier synchronization
|
||||
bool use_atomic_add, // whether to use atomic add to reduce
|
||||
bool use_fp32_reduce // whether to use fp32 global reduce
|
||||
@ -604,7 +600,7 @@ __global__ void Marlin(
|
||||
// We can easily implement parallel problem execution by just remapping
|
||||
// indices and advancing global pointers
|
||||
if (slice_col_par >= n_tiles) {
|
||||
A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * lda / 8;
|
||||
A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8;
|
||||
C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8;
|
||||
locks += (slice_col_par / n_tiles) * n_tiles;
|
||||
slice_col = slice_col_par % n_tiles;
|
||||
@ -635,7 +631,7 @@ __global__ void Marlin(
|
||||
}
|
||||
}
|
||||
if (slice_col == n_tiles) {
|
||||
A += 16 * thread_m_blocks * lda / 8;
|
||||
A += 16 * thread_m_blocks * prob_k / 8;
|
||||
C += 16 * thread_m_blocks * prob_n / 8;
|
||||
locks += n_tiles;
|
||||
slice_col = 0;
|
||||
@ -647,7 +643,7 @@ __global__ void Marlin(
|
||||
// A sizes/strides
|
||||
|
||||
// stride of the A matrix in global memory
|
||||
int a_gl_stride = lda / 8;
|
||||
int a_gl_stride = prob_k / 8;
|
||||
// stride of an A matrix tile in shared memory
|
||||
constexpr int a_sh_stride = 16 * thread_k_blocks / 8;
|
||||
// delta between subsequent A tiles in global memory
|
||||
@ -723,8 +719,8 @@ __global__ void Marlin(
|
||||
(threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
|
||||
b_gl_rd += b_sh_stride * slice_col;
|
||||
b_gl_rd += b_gl_rd_delta_o * slice_row;
|
||||
auto b_sh_wr = threadIdx.x * b_thread_vecs;
|
||||
auto b_sh_rd = threadIdx.x * b_thread_vecs;
|
||||
int b_sh_wr = threadIdx.x * b_thread_vecs;
|
||||
int b_sh_rd = threadIdx.x * b_thread_vecs;
|
||||
|
||||
// For act_order
|
||||
constexpr int k_iter_size = tb_k / b_sh_wr_iters;
|
||||
@ -743,7 +739,7 @@ __global__ void Marlin(
|
||||
s_sh_stride * slice_col + threadIdx.x;
|
||||
}
|
||||
}
|
||||
auto s_sh_wr = threadIdx.x;
|
||||
int s_sh_wr = threadIdx.x;
|
||||
bool s_sh_wr_pred = threadIdx.x < s_sh_stride;
|
||||
|
||||
// Zero-points
|
||||
@ -756,7 +752,7 @@ __global__ void Marlin(
|
||||
zp_sh_stride * slice_col + threadIdx.x;
|
||||
}
|
||||
}
|
||||
auto zp_sh_wr = threadIdx.x;
|
||||
int zp_sh_wr = threadIdx.x;
|
||||
bool zp_sh_wr_pred = threadIdx.x < zp_sh_stride;
|
||||
|
||||
// We use a different scale layout for grouped and column-wise quantization as
|
||||
@ -1047,7 +1043,7 @@ __global__ void Marlin(
|
||||
int4* sh_s_stage = sh_s + s_sh_stage * pipe;
|
||||
reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
|
||||
} else {
|
||||
auto warp_id = threadIdx.x / 32;
|
||||
int warp_id = threadIdx.x / 32;
|
||||
int n_warps = thread_n_blocks / 4;
|
||||
|
||||
int warp_row = warp_id / n_warps;
|
||||
@ -1085,7 +1081,7 @@ __global__ void Marlin(
|
||||
|
||||
// Determine "position" inside the thread-block (based on warp and
|
||||
// thread-id)
|
||||
auto warp_id = threadIdx.x / 32;
|
||||
int warp_id = threadIdx.x / 32;
|
||||
int n_warps =
|
||||
thread_n_blocks / 4; // Each warp processes 4 16-size tiles over N
|
||||
|
||||
@ -1094,7 +1090,7 @@ __global__ void Marlin(
|
||||
|
||||
cur_k += warp_row * 16;
|
||||
|
||||
auto th_id = threadIdx.x % 32;
|
||||
int th_id = threadIdx.x % 32;
|
||||
cur_k += (th_id % 4) * 2; // Due to tensor-core layout for fp16 B matrix
|
||||
|
||||
int s_col_shift =
|
||||
@ -1159,7 +1155,7 @@ __global__ void Marlin(
|
||||
(reinterpret_cast<int*>(sh_zp_stage))[zp_sh_rd + i];
|
||||
}
|
||||
} else {
|
||||
auto warp_id = threadIdx.x / 32;
|
||||
int warp_id = threadIdx.x / 32;
|
||||
int n_warps = thread_n_blocks / 4;
|
||||
|
||||
int warp_row = warp_id / n_warps;
|
||||
@ -1197,7 +1193,7 @@ __global__ void Marlin(
|
||||
(pipe / (group_blocks / thread_k_blocks)));
|
||||
reinterpret_cast<int4*>(&frag_zpf[k % 2])[0] = sh_zp_stage[zp_sh_rd];
|
||||
} else {
|
||||
auto warp_id = threadIdx.x / 32;
|
||||
int warp_id = threadIdx.x / 32;
|
||||
int n_warps = thread_n_blocks / 4;
|
||||
|
||||
int warp_row = warp_id / n_warps;
|
||||
@ -1323,7 +1319,7 @@ __global__ void Marlin(
|
||||
auto thread_block_reduce = [&]() {
|
||||
constexpr int red_off = threads / b_sh_stride_threads / 2;
|
||||
if (red_off >= 1) {
|
||||
auto red_idx = threadIdx.x / b_sh_stride_threads;
|
||||
int red_idx = threadIdx.x / b_sh_stride_threads;
|
||||
constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
|
||||
constexpr int red_sh_delta = b_sh_stride_threads;
|
||||
int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
|
||||
@ -1390,7 +1386,7 @@ __global__ void Marlin(
|
||||
4 * (threadIdx.x / 32) + threadIdx.x % 4;
|
||||
c_gl_wr += (2 * thread_n_blocks) * slice_col;
|
||||
constexpr int c_sh_wr_delta = active_threads;
|
||||
auto c_sh_wr = threadIdx.x;
|
||||
int c_sh_wr = threadIdx.x;
|
||||
|
||||
int row = (threadIdx.x % 32) / 4;
|
||||
|
||||
@ -1784,8 +1780,8 @@ __global__ void Marlin(
|
||||
HAS_ZP, GROUP_BLOCKS, IS_ZP_FLOAT> \
|
||||
<<<blocks, NUM_THREADS, max_shared_mem, stream>>>( \
|
||||
A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \
|
||||
num_groups, prob_m, prob_n, prob_k, lda, locks, \
|
||||
use_atomic_add, use_fp32_reduce); \
|
||||
num_groups, prob_m, prob_n, prob_k, locks, use_atomic_add, \
|
||||
use_fp32_reduce); \
|
||||
} \
|
||||
}
|
||||
|
||||
@ -2075,7 +2071,7 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
|
||||
template <typename scalar_t>
|
||||
void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
|
||||
void* zp, void* g_idx, void* perm, void* a_tmp, int prob_m,
|
||||
int prob_n, int prob_k, int lda, void* workspace,
|
||||
int prob_n, int prob_k, void* workspace,
|
||||
vllm::ScalarType const& q_type, bool has_act_order,
|
||||
bool is_k_full, bool has_zp, int num_groups, int group_size,
|
||||
int dev, cudaStream_t stream, int thread_k, int thread_n,
|
||||
@ -2188,9 +2184,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
|
||||
// Permute A columns
|
||||
int block_rows = div_ceil(prob_m, blocks);
|
||||
permute_cols_kernel<<<blocks, default_threads, 0, stream>>>(
|
||||
A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, lda, block_rows);
|
||||
A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, block_rows);
|
||||
A_ptr = a_tmp_ptr;
|
||||
lda = prob_k;
|
||||
}
|
||||
|
||||
// If we have a full K, then we can run the non-act-order version of Marlin
|
||||
@ -2249,7 +2244,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
|
||||
", num_bits = ", num_bits);
|
||||
}
|
||||
|
||||
A_ptr += 16 * thread_m_blocks * (lda / 8) * par;
|
||||
A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
|
||||
C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par;
|
||||
}
|
||||
}
|
||||
@ -2305,10 +2300,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
||||
|
||||
// Verify device and strides
|
||||
TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
|
||||
TORCH_CHECK(a.stride(1) == 1, "A.stride(1) is not 1");
|
||||
// We use int4 (16 bytes) to load A, so A must aligned to 16 bytes
|
||||
TORCH_CHECK(a.stride(0) % 8 == 0, "A.stride(0) must divisible by 8");
|
||||
TORCH_CHECK(((uint64_t)a.data_ptr()) % 16 == 0, "A must aligned to 16 bytes");
|
||||
TORCH_CHECK(a.is_contiguous(), "A is not contiguous");
|
||||
|
||||
TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
|
||||
TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
|
||||
@ -2440,7 +2432,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
||||
a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
|
||||
c_tmp.data_ptr<float>(), b_scales.data_ptr<at::Half>(),
|
||||
b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(),
|
||||
a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k, a.stride(0),
|
||||
a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k,
|
||||
workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
|
||||
num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
|
||||
thread_k, thread_n, sms, marlin::max_par, use_atomic_add,
|
||||
@ -2451,10 +2443,10 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
||||
c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(),
|
||||
b_scales.data_ptr<at::BFloat16>(), b_zeros.data_ptr(), g_idx.data_ptr(),
|
||||
perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(), size_m, size_n, size_k,
|
||||
a.stride(0), workspace.data_ptr(), b_q_type, has_act_order, is_k_full,
|
||||
has_zp, num_groups, group_size, dev,
|
||||
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
|
||||
marlin::max_par, use_atomic_add, use_fp32_reduce, is_zp_float);
|
||||
workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
|
||||
num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
|
||||
thread_k, thread_n, sms, marlin::max_par, use_atomic_add,
|
||||
use_fp32_reduce, is_zp_float);
|
||||
} else {
|
||||
TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16");
|
||||
}
|
||||
|
||||
@ -15,7 +15,7 @@ __global__ void gptq_marlin_repack_kernel(
|
||||
int n_tiles = size_n / tile_n_size;
|
||||
int block_k_tiles = div_ceil(k_tiles, gridDim.x);
|
||||
|
||||
auto start_k_tile = blockIdx.x * block_k_tiles;
|
||||
int start_k_tile = blockIdx.x * block_k_tiles;
|
||||
if (start_k_tile >= k_tiles) {
|
||||
return;
|
||||
}
|
||||
@ -71,8 +71,8 @@ __global__ void gptq_marlin_repack_kernel(
|
||||
|
||||
if constexpr (has_perm) {
|
||||
if (threadIdx.x < stage_size) {
|
||||
auto k_id = threadIdx.x / stage_n_threads;
|
||||
auto n_id = threadIdx.x % stage_n_threads;
|
||||
int k_id = threadIdx.x / stage_n_threads;
|
||||
int n_id = threadIdx.x % stage_n_threads;
|
||||
|
||||
uint32_t const* sh_perm_int_ptr =
|
||||
reinterpret_cast<uint32_t const*>(sh_perm_ptr);
|
||||
@ -88,8 +88,8 @@ __global__ void gptq_marlin_repack_kernel(
|
||||
|
||||
} else {
|
||||
if (threadIdx.x < stage_size) {
|
||||
auto k_id = threadIdx.x / stage_n_threads;
|
||||
auto n_id = threadIdx.x % stage_n_threads;
|
||||
int k_id = threadIdx.x / stage_n_threads;
|
||||
int n_id = threadIdx.x % stage_n_threads;
|
||||
|
||||
int first_k = k_tile_id * tile_k_size;
|
||||
int first_k_packed = first_k / pack_factor;
|
||||
@ -109,8 +109,8 @@ __global__ void gptq_marlin_repack_kernel(
|
||||
return;
|
||||
}
|
||||
|
||||
auto warp_id = threadIdx.x / 32;
|
||||
auto th_id = threadIdx.x % 32;
|
||||
int warp_id = threadIdx.x / 32;
|
||||
int th_id = threadIdx.x % 32;
|
||||
|
||||
if (warp_id >= 4) {
|
||||
return;
|
||||
@ -339,4 +339,4 @@ TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) {
|
||||
m.impl("gptq_marlin_repack", &gptq_marlin_repack_meta);
|
||||
}
|
||||
}
|
||||
@ -277,12 +277,12 @@ __global__ void Marlin(
|
||||
b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride);
|
||||
b_gl_rd += b_sh_stride * slice_col;
|
||||
b_gl_rd += b_gl_rd_delta_o * slice_row;
|
||||
auto b_sh_wr = threadIdx.x;
|
||||
auto b_sh_rd = threadIdx.x;
|
||||
int b_sh_wr = threadIdx.x;
|
||||
int b_sh_rd = threadIdx.x;
|
||||
|
||||
int s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
|
||||
s_sh_stride * slice_col + threadIdx.x;
|
||||
auto s_sh_wr = threadIdx.x;
|
||||
int s_sh_wr = threadIdx.x;
|
||||
int s_sh_rd;
|
||||
// We use a different scale layout for grouped and column-wise quantization as
|
||||
// we scale a `half2` tile in column-major layout in the former and in
|
||||
@ -455,7 +455,7 @@ __global__ void Marlin(
|
||||
auto thread_block_reduce = [&]() {
|
||||
constexpr int red_off = threads / b_sh_stride / 2;
|
||||
if (red_off >= 1) {
|
||||
auto red_idx = threadIdx.x / b_sh_stride;
|
||||
int red_idx = threadIdx.x / b_sh_stride;
|
||||
constexpr int red_sh_stride = b_sh_stride * 4 * 2;
|
||||
constexpr int red_sh_delta = b_sh_stride;
|
||||
int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) +
|
||||
@ -522,7 +522,7 @@ __global__ void Marlin(
|
||||
4 * (threadIdx.x / 32) + threadIdx.x % 4;
|
||||
c_gl_wr += (2 * thread_n_blocks) * slice_col;
|
||||
constexpr int c_sh_wr_delta = active_threads;
|
||||
auto c_sh_wr = threadIdx.x;
|
||||
int c_sh_wr = threadIdx.x;
|
||||
|
||||
int row = (threadIdx.x % 32) / 4;
|
||||
|
||||
|
||||
@ -353,10 +353,10 @@ __global__ void Marlin(
|
||||
b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride);
|
||||
b_gl_rd += b_sh_stride * slice_col;
|
||||
b_gl_rd += b_gl_rd_delta_o * slice_row;
|
||||
auto b_sh_wr = threadIdx.x;
|
||||
auto b_sh_rd = threadIdx.x;
|
||||
int b_sh_wr = threadIdx.x;
|
||||
int b_sh_rd = threadIdx.x;
|
||||
|
||||
auto s_tok_gl_rd = threadIdx.x;
|
||||
int s_tok_gl_rd = threadIdx.x;
|
||||
// NOTE(HandH1998): activation scale s_tok need shuffle to [0, 8, 1, 9, 2, 10,
|
||||
// 3, 11, 4, 12, 5, 13, 6, 14, 7, 15] for example, 0, 8 row scales serve for
|
||||
// thread 0, 1, 2, 3. For more details, refer to mma operand A layout as
|
||||
@ -368,8 +368,8 @@ __global__ void Marlin(
|
||||
int s_tok_sh_rd = (threadIdx.x % 32) / 4;
|
||||
bool s_tok_sh_wr_pred = threadIdx.x < prob_m;
|
||||
|
||||
auto s_ch_gl_rd = s_ch_sh_stride * slice_col + threadIdx.x;
|
||||
auto s_ch_sh_wr = threadIdx.x;
|
||||
int s_ch_gl_rd = s_ch_sh_stride * slice_col + threadIdx.x;
|
||||
int s_ch_sh_wr = threadIdx.x;
|
||||
int s_ch_sh_rd = 16 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
|
||||
2 * ((threadIdx.x % 32) % 4);
|
||||
bool s_ch_sh_wr_pred = threadIdx.x < s_ch_sh_stride;
|
||||
@ -558,7 +558,7 @@ __global__ void Marlin(
|
||||
auto thread_block_reduce = [&]() {
|
||||
constexpr int red_off = threads / b_sh_stride / 2;
|
||||
if (red_off >= 1) {
|
||||
auto red_idx = threadIdx.x / b_sh_stride;
|
||||
int red_idx = threadIdx.x / b_sh_stride;
|
||||
constexpr int red_sh_stride = b_sh_stride * 4 * 2;
|
||||
constexpr int red_sh_delta = b_sh_stride;
|
||||
int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) +
|
||||
@ -628,7 +628,7 @@ __global__ void Marlin(
|
||||
8 * (threadIdx.x / 32) + (threadIdx.x % 4) * 2;
|
||||
c_gl_wr += (4 * thread_n_blocks) * slice_col;
|
||||
constexpr int c_sh_wr_delta = active_threads * 2;
|
||||
auto c_sh_wr = 2 * threadIdx.x;
|
||||
int c_sh_wr = 2 * threadIdx.x;
|
||||
|
||||
int row = (threadIdx.x % 32) / 4;
|
||||
|
||||
|
||||
@ -273,15 +273,15 @@ __global__ void Marlin_24(
|
||||
(threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
|
||||
b_gl_rd += b_sh_stride * slice_col;
|
||||
b_gl_rd += b_gl_rd_delta_o * slice_row;
|
||||
auto b_sh_wr = threadIdx.x * b_thread_vecs;
|
||||
auto b_sh_rd = threadIdx.x * b_thread_vecs;
|
||||
int b_sh_wr = threadIdx.x * b_thread_vecs;
|
||||
int b_sh_rd = threadIdx.x * b_thread_vecs;
|
||||
|
||||
int m_gl_rd = m_gl_stride * (threadIdx.x / (m_sh_stride)) +
|
||||
(threadIdx.x % (m_sh_stride));
|
||||
m_gl_rd += (m_sh_stride)*slice_col;
|
||||
m_gl_rd += m_gl_rd_delta_o * slice_row;
|
||||
auto m_sh_wr = threadIdx.x;
|
||||
auto m_sh_rd = threadIdx.x % 16 + (threadIdx.x / 32) * 16;
|
||||
int m_sh_wr = threadIdx.x;
|
||||
int m_sh_rd = threadIdx.x % 16 + (threadIdx.x / 32) * 16;
|
||||
|
||||
int s_gl_rd;
|
||||
if constexpr (group_blocks == -1) {
|
||||
@ -291,7 +291,7 @@ __global__ void Marlin_24(
|
||||
s_sh_stride * slice_col + threadIdx.x;
|
||||
}
|
||||
|
||||
auto s_sh_wr = threadIdx.x;
|
||||
int s_sh_wr = threadIdx.x;
|
||||
int s_sh_rd;
|
||||
// We use a different scale layout for grouped and column-wise quantization as
|
||||
// we scale a `half2` tile in column-major layout in the former and in
|
||||
@ -516,7 +516,7 @@ __global__ void Marlin_24(
|
||||
auto thread_block_reduce = [&]() {
|
||||
constexpr int red_off = threads / b_sh_stride_threads / 2;
|
||||
if (red_off >= 1) {
|
||||
auto red_idx = threadIdx.x / b_sh_stride_threads;
|
||||
int red_idx = threadIdx.x / b_sh_stride_threads;
|
||||
constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
|
||||
constexpr int red_sh_delta = b_sh_stride_threads;
|
||||
int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
|
||||
@ -583,7 +583,7 @@ __global__ void Marlin_24(
|
||||
8 * (threadIdx.x / 32) + (threadIdx.x % 32) / 4;
|
||||
c_gl_wr += (2 * thread_n_blocks) * slice_col;
|
||||
constexpr int c_sh_wr_delta = active_threads;
|
||||
auto c_sh_wr = threadIdx.x;
|
||||
int c_sh_wr = threadIdx.x;
|
||||
|
||||
int col = 2 * ((threadIdx.x % 32) % 4);
|
||||
|
||||
|
||||
@ -284,18 +284,18 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
int max_ctx_blocks, const float* k_scale, const float* v_scale) {
|
||||
// clang-format on
|
||||
constexpr int NWARPS = NUM_THREADS / WARP_SIZE;
|
||||
const auto warpid = threadIdx.x / WARP_SIZE;
|
||||
const auto laneid = threadIdx.x % WARP_SIZE;
|
||||
const int warpid = threadIdx.x / WARP_SIZE;
|
||||
const int laneid = threadIdx.x % WARP_SIZE;
|
||||
const int lane4id = laneid % 4;
|
||||
const int lane16id = laneid % 16;
|
||||
const int rowid = laneid / 16;
|
||||
|
||||
const auto seq_idx = blockIdx.x;
|
||||
const auto partition_idx = blockIdx.y;
|
||||
const int seq_idx = blockIdx.x;
|
||||
const int partition_idx = blockIdx.y;
|
||||
|
||||
constexpr int T_PAR_SIZE = 256; // token partition size set to 256
|
||||
|
||||
const auto max_num_partitions = gridDim.y;
|
||||
const int max_num_partitions = gridDim.y;
|
||||
|
||||
const int context_len = context_lens[seq_idx];
|
||||
|
||||
@ -346,9 +346,9 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
// can be interpreted as B8x16 for 8 bit types
|
||||
_B16x8 Klocal[TLOOP][QKHELOOP];
|
||||
|
||||
const auto wg_start_head_idx = blockIdx.z * GQA_RATIO;
|
||||
const auto wg_start_kv_head_idx = blockIdx.z;
|
||||
const auto total_num_heads = gridDim.z * GQA_RATIO;
|
||||
const int wg_start_head_idx = blockIdx.z * GQA_RATIO;
|
||||
const int wg_start_kv_head_idx = blockIdx.z;
|
||||
const int total_num_heads = gridDim.z * GQA_RATIO;
|
||||
|
||||
// for QK mfma, tokens in multiples of TOKENS_PER_WARP are spread across warps
|
||||
// each mfma takes QH16xT16x16HE across warp
|
||||
@ -789,14 +789,14 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
int max_ctx_blocks, const float* k_scale, const float* v_scale) {
|
||||
// clang-format on
|
||||
constexpr int NWARPS = NUM_THREADS / WARP_SIZE;
|
||||
const auto warpid = threadIdx.x / WARP_SIZE;
|
||||
const auto laneid = threadIdx.x % WARP_SIZE;
|
||||
const int warpid = threadIdx.x / WARP_SIZE;
|
||||
const int laneid = threadIdx.x % WARP_SIZE;
|
||||
const int lane4id = laneid % 4;
|
||||
|
||||
const auto seq_idx = blockIdx.x;
|
||||
const auto partition_idx = blockIdx.y;
|
||||
const auto partition_size = blockDim.x;
|
||||
const auto max_num_partitions = gridDim.y;
|
||||
const int seq_idx = blockIdx.x;
|
||||
const int partition_idx = blockIdx.y;
|
||||
const int partition_size = blockDim.x;
|
||||
const int max_num_partitions = gridDim.y;
|
||||
|
||||
const int context_len = context_lens[seq_idx];
|
||||
const int partition_start_token_idx = partition_idx * partition_size;
|
||||
@ -838,8 +838,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
qk_max[h] = -FLT_MAX;
|
||||
}
|
||||
|
||||
const auto wg_start_head_idx = blockIdx.z * GQA_RATIO;
|
||||
const auto wg_start_kv_head_idx = blockIdx.z;
|
||||
const int wg_start_head_idx = blockIdx.z * GQA_RATIO;
|
||||
const int wg_start_kv_head_idx = blockIdx.z;
|
||||
|
||||
const int warp_start_token_idx =
|
||||
partition_start_token_idx + warpid * WARP_SIZE;
|
||||
@ -857,7 +857,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
|
||||
const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
|
||||
// token id within partition
|
||||
const auto local_token_idx = threadIdx.x;
|
||||
const int local_token_idx = threadIdx.x;
|
||||
// token id within sequence
|
||||
const int global_token_idx = partition_start_token_idx + local_token_idx;
|
||||
|
||||
@ -1126,7 +1126,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const auto num_heads = gridDim.z * GQA_RATIO;
|
||||
const int num_heads = gridDim.z * GQA_RATIO;
|
||||
float* max_logits_ptr =
|
||||
max_logits + seq_idx * num_heads * max_num_partitions + partition_idx;
|
||||
float* exp_sums_ptr =
|
||||
@ -1268,14 +1268,14 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
// max_num_partitions, head_size]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int max_num_partitions) {
|
||||
const auto num_heads = gridDim.x;
|
||||
const auto head_idx = blockIdx.x;
|
||||
const auto seq_idx = blockIdx.y;
|
||||
const int num_heads = gridDim.x;
|
||||
const int head_idx = blockIdx.x;
|
||||
const int seq_idx = blockIdx.y;
|
||||
const int context_len = context_lens[seq_idx];
|
||||
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
|
||||
[[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
|
||||
const auto warpid = threadIdx.x / WARP_SIZE;
|
||||
[[maybe_unused]] const auto laneid = threadIdx.x % WARP_SIZE;
|
||||
const int warpid = threadIdx.x / WARP_SIZE;
|
||||
[[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;
|
||||
|
||||
__shared__ float shared_global_exp_sum;
|
||||
// max num partitions supported is warp_size * NPAR_LOOPS
|
||||
@ -1294,7 +1294,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < NPAR_LOOPS; i++) {
|
||||
const auto partition_no = i * WARP_SIZE + threadIdx.x;
|
||||
const int partition_no = i * WARP_SIZE + threadIdx.x;
|
||||
valid_partition[i] =
|
||||
(partition_no < num_partitions) ? partition_no : last_valid_partition;
|
||||
}
|
||||
@ -1324,7 +1324,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i = 0; i < NPAR_LOOPS; i++) {
|
||||
const auto partition_no = i * WARP_SIZE + threadIdx.x;
|
||||
const int partition_no = i * WARP_SIZE + threadIdx.x;
|
||||
rescaled_exp_sum[i] *= (partition_no < num_partitions)
|
||||
? expf(reg_max_logit[i] - max_logit)
|
||||
: 0.0f;
|
||||
@ -1336,7 +1336,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i = 0; i < NPAR_LOOPS; i++) {
|
||||
const auto partition_no = i * WARP_SIZE + threadIdx.x;
|
||||
const int partition_no = i * WARP_SIZE + threadIdx.x;
|
||||
shared_exp_sums[partition_no] = rescaled_exp_sum[i];
|
||||
}
|
||||
|
||||
|
||||
@ -111,19 +111,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
") -> ()");
|
||||
ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer);
|
||||
|
||||
ops.def(
|
||||
"block_table_appends(Tensor append_row_indices, "
|
||||
"Tensor append_row_indices_cpu, Tensor append_cumsums, "
|
||||
"Tensor append_cumsums_cpu, Tensor append_block_ids, "
|
||||
"Tensor append_block_ids_cpu, Tensor! block_table, int num_appends, "
|
||||
"int total_num_append_blocks) -> ()");
|
||||
ops.impl("block_table_appends", torch::kCUDA, &block_table_appends);
|
||||
|
||||
ops.def(
|
||||
"block_table_moves(Tensor src_dst_n, Tensor src_dst_n_cpu, "
|
||||
"Tensor! block_table, int num_moves) -> ()");
|
||||
ops.impl("block_table_moves", torch::kCUDA, &block_table_moves);
|
||||
|
||||
// Layernorm
|
||||
// Apply Root Mean Square (RMS) Normalization to the input tensor.
|
||||
ops.def(
|
||||
|
||||
@ -85,7 +85,6 @@ html_static_path = ["_static"]
|
||||
html_js_files = ["custom.js"]
|
||||
html_css_files = ["custom.css"]
|
||||
|
||||
myst_heading_anchors = 2
|
||||
myst_url_schemes = {
|
||||
'http': None,
|
||||
'https': None,
|
||||
|
||||
@ -4,9 +4,6 @@
|
||||
|
||||
Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes.
|
||||
|
||||
* [Deployment with CPUs](#deployment-with-cpus)
|
||||
* [Deployment with GPUs](#deployment-with-gpus)
|
||||
|
||||
Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
||||
* [Helm](frameworks/helm.md)
|
||||
* [InftyAI/llmaz](integrations/llmaz.md)
|
||||
@ -17,107 +14,11 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
||||
* [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
|
||||
* [vllm-project/production-stack](integrations/production-stack.md)
|
||||
|
||||
## Deployment with CPUs
|
||||
## Pre-requisite
|
||||
|
||||
:::{note}
|
||||
The use of CPUs here is for demonstration and testing purposes only and its performance will not be on par with GPUs.
|
||||
:::
|
||||
Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).
|
||||
|
||||
First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model:
|
||||
|
||||
```bash
|
||||
cat <<EOF |kubectl apply -f -
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: vllm-models
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
volumeMode: Filesystem
|
||||
resources:
|
||||
requests:
|
||||
storage: 50Gi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: hf-token-secret
|
||||
type: Opaque
|
||||
data:
|
||||
token: $(HF_TOKEN)
|
||||
```
|
||||
|
||||
Next, start the vLLM server as a Kubernetes Deployment and Service:
|
||||
|
||||
```bash
|
||||
cat <<EOF |kubectl apply -f -
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: vllm-server
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: vllm
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: vllm
|
||||
spec:
|
||||
containers:
|
||||
- name: vllm
|
||||
image: vllm/vllm-openai:latest
|
||||
command: ["/bin/sh", "-c"]
|
||||
args: [
|
||||
"vllm serve meta-llama/Llama-3.2-1B-Instruct"
|
||||
]
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
volumeMounts:
|
||||
- name: llama-storage
|
||||
mountPath: /root/.cache/huggingface
|
||||
volumes:
|
||||
- name: llama-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: vllm-models
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: vllm-server
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/name: vllm
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000
|
||||
targetPort: 8000
|
||||
type: ClusterIP
|
||||
EOF
|
||||
```
|
||||
|
||||
We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
|
||||
|
||||
```console
|
||||
kubectl logs -l app.kubernetes.io/name=vllm
|
||||
...
|
||||
INFO: Started server process [1]
|
||||
INFO: Waiting for application startup.
|
||||
INFO: Application startup complete.
|
||||
INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
|
||||
```
|
||||
|
||||
## Deployment with GPUs
|
||||
|
||||
**Pre-requisite**: Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).
|
||||
## Deployment using native K8s
|
||||
|
||||
1. Create a PVC, Secret and Deployment for vLLM
|
||||
|
||||
|
||||
@ -9,7 +9,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal
|
||||
Below are the steps to utilize BitsAndBytes with vLLM.
|
||||
|
||||
```console
|
||||
pip install bitsandbytes>=0.45.3
|
||||
pip install bitsandbytes>=0.45.0
|
||||
```
|
||||
|
||||
vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
|
||||
|
||||
@ -10,10 +10,10 @@ Reasoning models return a additional `reasoning_content` field in their outputs,
|
||||
|
||||
vLLM currently supports the following reasoning models:
|
||||
|
||||
| Model Series | Parser Name | Structured Output Support | Tool Calling |
|
||||
|--------------|-------------|------------------|-------------|
|
||||
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ |
|
||||
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ |
|
||||
| Model Series | Parser Name | Structured Output Support |
|
||||
|--------------|-------------|------------------|
|
||||
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` |
|
||||
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` |
|
||||
|
||||
## Quickstart
|
||||
|
||||
@ -170,51 +170,10 @@ print("reasoning_content: ", completion.choices[0].message.reasoning_content)
|
||||
print("content: ", completion.choices[0].message.content)
|
||||
```
|
||||
|
||||
## Tool Calling
|
||||
|
||||
The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
|
||||
|
||||
tools = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
|
||||
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
|
||||
},
|
||||
"required": ["location", "unit"]
|
||||
}
|
||||
}
|
||||
}]
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model=client.models.list().data[0].id,
|
||||
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
|
||||
tools=tools,
|
||||
tool_choice="auto"
|
||||
)
|
||||
|
||||
print(response)
|
||||
tool_call = response.choices[0].message.tool_calls[0].function
|
||||
|
||||
print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
|
||||
print(f"Function called: {tool_call.name}")
|
||||
print(f"Arguments: {tool_call.arguments}")
|
||||
```
|
||||
|
||||
For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py> .
|
||||
|
||||
## Limitations
|
||||
|
||||
- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).
|
||||
- It is not compatible with [`tool_calling`](#tool_calling).
|
||||
|
||||
## How to support a new reasoning model
|
||||
|
||||
|
||||
@ -30,10 +30,8 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
llm = LLM(
|
||||
model="facebook/opt-6.7b",
|
||||
tensor_parallel_size=1,
|
||||
speculative_config={
|
||||
"model": "facebook/opt-125m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
speculative_model="facebook/opt-125m",
|
||||
num_speculative_tokens=5,
|
||||
)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
@ -47,14 +45,10 @@ To perform the same with an online mode launch the server:
|
||||
|
||||
```bash
|
||||
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \
|
||||
--seed 42 -tp 1 --gpu_memory_utilization 0.8 \
|
||||
--speculative_config '{"model": "facebook/opt-125m", "num_speculative_tokens": 5}'
|
||||
--seed 42 -tp 1 --speculative_model facebook/opt-125m \
|
||||
--num_speculative_tokens 5 --gpu_memory_utilization 0.8
|
||||
```
|
||||
|
||||
:::{warning}
|
||||
Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately will be deprecated in the next release.
|
||||
:::
|
||||
|
||||
Then use a client:
|
||||
|
||||
```python
|
||||
@ -107,11 +101,9 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
llm = LLM(
|
||||
model="facebook/opt-6.7b",
|
||||
tensor_parallel_size=1,
|
||||
speculative_config={
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 4,
|
||||
},
|
||||
speculative_model="[ngram]",
|
||||
num_speculative_tokens=5,
|
||||
ngram_prompt_lookup_max=4,
|
||||
)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
@ -139,10 +131,8 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
llm = LLM(
|
||||
model="meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
tensor_parallel_size=4,
|
||||
speculative_config={
|
||||
"model": "ibm-ai-platform/llama3-70b-accelerator",
|
||||
"draft_tensor_parallel_size": 1,
|
||||
},
|
||||
speculative_model="ibm-ai-platform/llama3-70b-accelerator",
|
||||
speculative_draft_tensor_parallel_size=1,
|
||||
)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
@ -185,10 +175,8 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
llm = LLM(
|
||||
model="meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
tensor_parallel_size=4,
|
||||
speculative_config={
|
||||
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
|
||||
"draft_tensor_parallel_size": 1,
|
||||
},
|
||||
speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B",
|
||||
speculative_draft_tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
@ -206,10 +194,11 @@ A few important things to consider when using the EAGLE based draft models:
|
||||
be able to be loaded and used directly by vLLM after [PR 12304](https://github.com/vllm-project/vllm/pull/12304).
|
||||
If you are using vllm version before [PR 12304](https://github.com/vllm-project/vllm/pull/12304), please use the
|
||||
[script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model,
|
||||
and specify `"model": "path/to/modified/eagle/model"` in `speculative_config`. If weight-loading problems still occur when using the latest version of vLLM, please leave a comment or raise an issue.
|
||||
and specify `speculative_model="path/to/modified/eagle/model"`. If weight-loading problems still occur when using
|
||||
the latest version of vLLM, please leave a comment or raise an issue.
|
||||
|
||||
2. The EAGLE based draft models need to be run without tensor parallelism
|
||||
(i.e. draft_tensor_parallel_size is set to 1 in `speculative_config`), although
|
||||
(i.e. speculative_draft_tensor_parallel_size is set to 1), although
|
||||
it is possible to run the main model using tensor parallelism (see example above).
|
||||
|
||||
3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
|
||||
|
||||
@ -193,7 +193,7 @@ vLLM CPU backend supports the following vLLM features:
|
||||
|
||||
## Related runtime environment variables
|
||||
|
||||
- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
|
||||
- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
|
||||
- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
|
||||
- `VLLM_CPU_MOE_PREPACK`: whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).
|
||||
|
||||
|
||||
@ -58,11 +58,6 @@ from vllm import LLM, SamplingParams
|
||||
```
|
||||
|
||||
The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](#sampling-params).
|
||||
:::{important}
|
||||
By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results by default if {class}`~vllm.SamplingParams` is not specified.
|
||||
|
||||
However, if vLLM's default sampling parameters are preferred, please set `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance.
|
||||
:::
|
||||
|
||||
```python
|
||||
prompts = [
|
||||
@ -81,7 +76,7 @@ llm = LLM(model="facebook/opt-125m")
|
||||
```
|
||||
|
||||
:::{note}
|
||||
By default, vLLM downloads models from [Hugging Face](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
|
||||
By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
|
||||
:::
|
||||
|
||||
Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens.
|
||||
@ -112,11 +107,6 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct
|
||||
By default, the server uses a predefined chat template stored in the tokenizer.
|
||||
You can learn about overriding it [here](#chat-template).
|
||||
:::
|
||||
:::{important}
|
||||
By default, the server applies `generation_config.json` from the huggingface model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
|
||||
|
||||
To disable this behavior, please pass `--generation-config vllm` when launching the server.
|
||||
:::
|
||||
|
||||
This server can be queried in the same format as OpenAI API. For example, to list the models:
|
||||
|
||||
|
||||
@ -129,9 +129,6 @@ in progress.
|
||||
- **Spec Decode**: Currently, only ngram-based spec decode is supported in V1. There
|
||||
will be follow-up work to support other types of spec decode (e.g., see [PR #13933](https://github.com/vllm-project/vllm/pull/13933)). We will prioritize the support for Eagle, MTP compared to draft model based spec decode.
|
||||
|
||||
- **Multimodal Models**: V1 is almost fully compatible with V0 except that interleaved modality input is not supported yet.
|
||||
See [here](https://github.com/orgs/vllm-project/projects/8) for the status of upcoming features and optimizations.
|
||||
|
||||
#### Features to Be Supported
|
||||
|
||||
- **FP8 KV Cache**: While vLLM V1 introduces new FP8 kernels for model weight quantization, support for an FP8 key–value cache is not yet available. Users must continue using FP16 (or other supported precisions) for the KV cache.
|
||||
@ -159,9 +156,6 @@ vLLM V1 is currently optimized for decoder-only transformers. Models requiring
|
||||
|
||||
For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html).
|
||||
|
||||
## Frequently Asked Questions
|
||||
## FAQ
|
||||
|
||||
**I'm using vLLM V1 and I'm getting CUDA OOM errors. What should I do?**
|
||||
The default `max_num_seqs` has been raised from `256` in V0 to `1024` in V1. If you encounter CUDA OOM only when using V1 engine, try setting a lower value of `max_num_seqs` or `gpu_memory_utilization`.
|
||||
|
||||
On the other hand, if you get an error about insufficient memory for the cache blocks, you should increase `gpu_memory_utilization` as this indicates that your GPU has sufficient memory but you're not allocating enough to vLLM for KV cache blocks.
|
||||
TODO
|
||||
|
||||
@ -1,5 +0,0 @@
|
||||
Loading Model weights with fastsafetensors
|
||||
===================================================================
|
||||
|
||||
Using fastsafetensor library enables loading model weights to GPU memory by leveraging GPU direct storage. See https://github.com/foundation-model-stack/fastsafetensors for more details.
|
||||
For enabling this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true``
|
||||
@ -5,5 +5,4 @@
|
||||
|
||||
runai_model_streamer
|
||||
tensorizer
|
||||
fastsafetensor
|
||||
:::
|
||||
|
||||
@ -46,11 +46,6 @@ for output in outputs:
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
```
|
||||
|
||||
:::{important}
|
||||
By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if {class}`~vllm.SamplingParams` is not specified.
|
||||
|
||||
However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance.
|
||||
:::
|
||||
A code example can be found here: <gh-file:examples/offline_inference/basic/basic.py>
|
||||
|
||||
### `LLM.beam_search`
|
||||
|
||||
@ -73,7 +73,7 @@ The Transformers fallback explicitly supports the following features:
|
||||
|
||||
- <project:#quantization-index> (except GGUF)
|
||||
- <project:#lora-adapter>
|
||||
- <project:#distributed-serving> (requires `transformers>=4.49.0`)
|
||||
- <project:#distributed-serving> (pipeline parallel coming soon <gh-pr:12832>!)
|
||||
|
||||
#### Remote code
|
||||
|
||||
|
||||
@ -2,12 +2,7 @@
|
||||
|
||||
# Engine Arguments
|
||||
|
||||
Engine arguments control the behavior of the vLLM engine.
|
||||
|
||||
- For [offline inference](#offline-inference), they are part of the arguments to `LLM` class.
|
||||
- For [online serving](#openai-compatible-server), they are part of the arguments to `vllm serve`.
|
||||
|
||||
Below, you can find an explanation of every engine argument:
|
||||
Below, you can find an explanation of every engine argument for vLLM:
|
||||
|
||||
<!--- pyml disable-num-lines 7 no-space-in-emphasis -->
|
||||
```{eval-rst}
|
||||
@ -20,7 +15,7 @@ Below, you can find an explanation of every engine argument:
|
||||
|
||||
## Async Engine Arguments
|
||||
|
||||
Additional arguments are available to the asynchronous engine which is used for online serving:
|
||||
Below are the additional arguments related to the asynchronous engine:
|
||||
|
||||
<!--- pyml disable-num-lines 7 no-space-in-emphasis -->
|
||||
```{eval-rst}
|
||||
|
||||
@ -97,13 +97,6 @@ llm = LLM(model="adept/fuyu-8b",
|
||||
max_num_seqs=2)
|
||||
```
|
||||
|
||||
#### Adjust cache size
|
||||
|
||||
If you run out of CPU RAM, try the following options:
|
||||
|
||||
- (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB).
|
||||
- (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB).
|
||||
|
||||
### Performance optimization and tuning
|
||||
|
||||
You can potentially improve the performance of vLLM by finetuning various options.
|
||||
|
||||
@ -33,11 +33,7 @@ print(completion.choices[0].message)
|
||||
vLLM supports some parameters that are not supported by OpenAI, `top_k` for example.
|
||||
You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`.
|
||||
:::
|
||||
:::{important}
|
||||
By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
|
||||
|
||||
To disable this behavior, please pass `--generation-config vllm` when launching the server.
|
||||
:::
|
||||
## Supported APIs
|
||||
|
||||
We currently support the following OpenAI APIs:
|
||||
|
||||
@ -50,9 +50,7 @@ if __name__ == "__main__":
|
||||
# Create an LLM with spec decoding
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-2-13b-chat-hf",
|
||||
speculative_config={
|
||||
"model": "ibm-ai-platform/llama-13b-accelerator",
|
||||
},
|
||||
speculative_model="ibm-ai-platform/llama-13b-accelerator",
|
||||
)
|
||||
|
||||
print("With speculation")
|
||||
|
||||
@ -361,7 +361,6 @@ def run_llava_next_video(questions: list[str],
|
||||
engine_args = EngineArgs(
|
||||
model="llava-hf/LLaVA-NeXT-Video-7B-hf",
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
|
||||
123
examples/online_serving/disaggregated_prefill_zmq.sh
Normal file
123
examples/online_serving/disaggregated_prefill_zmq.sh
Normal file
@ -0,0 +1,123 @@
|
||||
#!/bin/bash
|
||||
# This file demonstrates the example usage of disaggregated prefilling with ZMQ
|
||||
# We will launch 2 vllm instances (1 for prefill and 1 for decode),
|
||||
# and then transfer the KV cache between them.
|
||||
|
||||
set -xe
|
||||
|
||||
echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
|
||||
sleep 1
|
||||
|
||||
# Trap the SIGINT signal (triggered by Ctrl+C)
|
||||
trap 'cleanup' INT
|
||||
|
||||
# Cleanup function
|
||||
cleanup() {
|
||||
echo "Caught Ctrl+C, cleaning up..."
|
||||
# Cleanup commands
|
||||
pgrep python | xargs kill -9
|
||||
pkill -f python
|
||||
echo "Cleanup complete. Exiting."
|
||||
exit 0
|
||||
}
|
||||
|
||||
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
||||
|
||||
# a function that waits vLLM connect to start
|
||||
wait_for_server() {
|
||||
local port=$1
|
||||
timeout 1200 bash -c "
|
||||
until curl -s localhost:${port}/v1/completions > /dev/null; do
|
||||
sleep 1
|
||||
done" && return 0 || return 1
|
||||
}
|
||||
|
||||
|
||||
# a function that waits vLLM disagg to start
|
||||
wait_for_disagg_server() {
|
||||
local log_file=$1
|
||||
timeout 1200 bash -c "
|
||||
until grep -q 'PDWorker is ready' $log_file; do
|
||||
sleep 1
|
||||
done" && return 0 || return 1
|
||||
}
|
||||
|
||||
|
||||
# You can also adjust --kv-ip and --kv-port for distributed inference.
|
||||
MODEL=meta-llama/Llama-3.1-8B-Instruct
|
||||
CONTROLLER_ADDR=controller.ipc
|
||||
PREFILL_WORKER_ADDR=prefill.ipc
|
||||
DECODE_WORKER_ADDR=decode.ipc
|
||||
PORT=8001
|
||||
|
||||
# prefilling instance, which is the KV producer
|
||||
CUDA_VISIBLE_DEVICES=0 python3 ../../vllm/entrypoints/disaggregated/worker.py \
|
||||
--model $MODEL \
|
||||
--controller-addr $CONTROLLER_ADDR \
|
||||
--worker-addr $PREFILL_WORKER_ADDR \
|
||||
--max-model-len 100 \
|
||||
--gpu-memory-utilization 0.8 \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' > vllm_disagg_prefill.log 2>&1 &
|
||||
|
||||
# decoding instance, which is the KV consumer
|
||||
CUDA_VISIBLE_DEVICES=1 python3 ../../vllm/entrypoints/disaggregated/worker.py \
|
||||
--model $MODEL \
|
||||
--controller-addr $CONTROLLER_ADDR \
|
||||
--worker-addr $DECODE_WORKER_ADDR \
|
||||
--max-model-len 100 \
|
||||
--gpu-memory-utilization 0.8 \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' > vllm_disagg_decode.log 2>&1 &
|
||||
|
||||
# launch a proxy server that opens the service at port 8000
|
||||
# the workflow of this proxy:
|
||||
# - Send req to prefill instance, wait until complete.
|
||||
# - Send req to decode instance, streaming tokens.
|
||||
python3 ../../vllm/entrypoints/disaggregated/api_server.py \
|
||||
--port $PORT \
|
||||
--model $MODEL \
|
||||
--controller-addr $CONTROLLER_ADDR \
|
||||
--prefill-addr $PREFILL_WORKER_ADDR \
|
||||
--decode-addr $DECODE_WORKER_ADDR &
|
||||
|
||||
# wait until prefill, decode instances and proxy are ready
|
||||
wait_for_server $PORT
|
||||
wait_for_disagg_server vllm_disagg_prefill.log
|
||||
wait_for_disagg_server vllm_disagg_decode.log
|
||||
|
||||
# serve two example requests
|
||||
output1=$(curl -X POST -s http://localhost:8001/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"prompt": "San Francisco is a",
|
||||
"max_tokens": 10,
|
||||
"temperature": 0
|
||||
}')
|
||||
|
||||
output2=$(curl -X POST -s http://localhost:8001/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"prompt": "Santa Clara is a",
|
||||
"max_tokens": 10,
|
||||
"temperature": 0
|
||||
}')
|
||||
|
||||
|
||||
# Cleanup commands
|
||||
pgrep python | xargs kill -9
|
||||
pkill -f python
|
||||
|
||||
echo ""
|
||||
|
||||
sleep 1
|
||||
|
||||
# Print the outputs of the curl requests
|
||||
echo ""
|
||||
echo "Output of first request: $output1"
|
||||
echo "Output of second request: $output2"
|
||||
|
||||
echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉"
|
||||
echo ""
|
||||
@ -1,177 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""
|
||||
An example demonstrates how to use tool calling with reasoning models
|
||||
like QwQ-32B. The reasoning_content will not be parsed by the tool
|
||||
calling process; only the final output will be parsed.
|
||||
|
||||
To run this example, you need to start the vLLM server with both
|
||||
the reasoning parser and tool calling enabled.
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/QwQ-32B \
|
||||
--enable-reasoning --reasoning-parser deepseek_r1 \
|
||||
--enable-auto-tool-choice --tool-call-parser hermes
|
||||
|
||||
```
|
||||
|
||||
"""
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
|
||||
# Now, simulate a tool call
|
||||
def get_current_weather(city: str, state: str, unit: 'str'):
|
||||
return ("The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
|
||||
"partly cloudly, with highs in the 90's.")
|
||||
|
||||
|
||||
available_tools = {"get_current_weather": get_current_weather}
|
||||
|
||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
client = OpenAI(
|
||||
api_key=openai_api_key,
|
||||
base_url=openai_api_base,
|
||||
)
|
||||
|
||||
models = client.models.list()
|
||||
model = models.data[0].id
|
||||
|
||||
tools = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type":
|
||||
"string",
|
||||
"description":
|
||||
"The city to find the weather for, e.g. 'San Francisco'"
|
||||
},
|
||||
"state": {
|
||||
"type":
|
||||
"string",
|
||||
"description":
|
||||
"the two-letter abbreviation for the state that the city is"
|
||||
" in, e.g. 'CA' which would mean 'California'"
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"]
|
||||
}
|
||||
},
|
||||
"required": ["city", "state", "unit"]
|
||||
}
|
||||
}
|
||||
}]
|
||||
messages = [{
|
||||
"role": "user",
|
||||
"content": "Hi! How are you doing today?"
|
||||
}, {
|
||||
"role": "assistant",
|
||||
"content": "I'm doing well! How can I help you?"
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
|
||||
}]
|
||||
|
||||
|
||||
def extract_reasoning_and_calls(chunks: list):
|
||||
reasoning_content = ""
|
||||
tool_call_idx = -1
|
||||
arguments = []
|
||||
function_names = []
|
||||
for chunk in chunks:
|
||||
if chunk.choices[0].delta.tool_calls:
|
||||
tool_call = chunk.choices[0].delta.tool_calls[0]
|
||||
if tool_call.index != tool_call_idx:
|
||||
tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
|
||||
arguments.append("")
|
||||
function_names.append("")
|
||||
|
||||
if tool_call.function:
|
||||
if tool_call.function.name:
|
||||
function_names[tool_call_idx] = tool_call.function.name
|
||||
|
||||
if tool_call.function.arguments:
|
||||
arguments[tool_call_idx] += tool_call.function.arguments
|
||||
else:
|
||||
if hasattr(chunk.choices[0].delta, "reasoning_content"):
|
||||
reasoning_content += chunk.choices[0].delta.reasoning_content
|
||||
return reasoning_content, arguments, function_names
|
||||
|
||||
|
||||
print("---------Full Generate With Automatic Function Calling-------------")
|
||||
tool_calls = client.chat.completions.create(messages=messages,
|
||||
model=model,
|
||||
tools=tools)
|
||||
print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
|
||||
print(f"function name: "
|
||||
f"{tool_calls.choices[0].message.tool_calls[0].function.name}")
|
||||
print(f"function arguments: "
|
||||
f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}")
|
||||
|
||||
print("----------Stream Generate With Automatic Function Calling-----------")
|
||||
tool_calls_stream = client.chat.completions.create(messages=messages,
|
||||
model=model,
|
||||
tools=tools,
|
||||
stream=True)
|
||||
chunks = []
|
||||
for chunk in tool_calls_stream:
|
||||
chunks.append(chunk)
|
||||
|
||||
reasoning_content, arguments, function_names = extract_reasoning_and_calls(
|
||||
chunks)
|
||||
|
||||
print(f"reasoning_content: {reasoning_content}")
|
||||
print(f"function name: {function_names[0]}")
|
||||
print(f"function arguments: {arguments[0]}")
|
||||
|
||||
print("----------Full Generate With Named Function Calling-----------------")
|
||||
tool_calls = client.chat.completions.create(messages=messages,
|
||||
model=model,
|
||||
tools=tools,
|
||||
tool_choice={
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name":
|
||||
"get_current_weather"
|
||||
}
|
||||
})
|
||||
|
||||
tool_call = tool_calls.choices[0].message.tool_calls[0].function
|
||||
print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
|
||||
print(f"function name: {tool_call.name}")
|
||||
print(f"function arguments: {tool_call.arguments}")
|
||||
print("----------Stream Generate With Named Function Calling--------------")
|
||||
|
||||
tool_calls_stream = client.chat.completions.create(
|
||||
messages=messages,
|
||||
model=model,
|
||||
tools=tools,
|
||||
tool_choice={
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather"
|
||||
}
|
||||
},
|
||||
stream=True)
|
||||
|
||||
chunks = []
|
||||
for chunk in tool_calls_stream:
|
||||
chunks.append(chunk)
|
||||
|
||||
reasoning_content, arguments, function_names = extract_reasoning_and_calls(
|
||||
chunks)
|
||||
print(f"reasoning_content: {reasoning_content}")
|
||||
print(f"function name: {function_names[0]}")
|
||||
print(f"function arguments: {arguments[0]}")
|
||||
print("\n\n")
|
||||
@ -86,7 +86,6 @@ exclude = [
|
||||
"vllm/triton_utils/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/vllm_flash_attn/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/worker/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/utils.py" = ["UP006", "UP035"]
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = [
|
||||
|
||||
@ -18,7 +18,7 @@ pillow # Required for image processing
|
||||
prometheus-fastapi-instrumentator >= 7.0.0
|
||||
tiktoken >= 0.6.0 # Required for DBRX tokenizer
|
||||
lm-format-enforcer >= 0.10.11, < 0.11
|
||||
llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
|
||||
llguidance >= 0.7.2, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
|
||||
outlines == 0.1.11
|
||||
lark == 1.2.2
|
||||
xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64"
|
||||
|
||||
@ -4,14 +4,14 @@
|
||||
# Dependencies for CPUs
|
||||
torch==2.6.0+cpu; platform_machine == "x86_64"
|
||||
torch==2.6.0; platform_system == "Darwin"
|
||||
torch==2.6.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
|
||||
torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64"
|
||||
torch==2.7.0.dev20250304; platform_machine == "s390x"
|
||||
|
||||
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
|
||||
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
|
||||
torchaudio==2.6.0; platform_machine == "ppc64le"
|
||||
torchaudio==2.5.1; platform_machine == "ppc64le"
|
||||
|
||||
# required for the image processor of phi3v, this must be updated alongside torch
|
||||
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
|
||||
torchvision==0.21.0; platform_machine == "ppc64le"
|
||||
torchvision==0.20.1; platform_machine == "ppc64le"
|
||||
datasets # for benchmark scripts
|
||||
|
||||
@ -41,4 +41,3 @@ tritonclient==2.51.0
|
||||
numpy < 2.0.0
|
||||
runai-model-streamer==0.11.0
|
||||
runai-model-streamer-s3==0.11.0
|
||||
fastsafetensors>=0.1.10
|
||||
|
||||
@ -67,7 +67,6 @@ click==8.1.7
|
||||
# jiwer
|
||||
# nltk
|
||||
# ray
|
||||
# typer
|
||||
colorama==0.4.6
|
||||
# via
|
||||
# awscli
|
||||
@ -123,8 +122,6 @@ fastparquet==2024.11.0
|
||||
# via genai-perf
|
||||
fastrlock==0.8.2
|
||||
# via cupy-cuda12x
|
||||
fastsafetensors==0.1.10
|
||||
# via -r requirements/test.in
|
||||
filelock==3.16.1
|
||||
# via
|
||||
# datasets
|
||||
@ -508,9 +505,7 @@ requests==2.32.3
|
||||
responses==0.25.3
|
||||
# via genai-perf
|
||||
rich==13.9.4
|
||||
# via
|
||||
# genai-perf
|
||||
# typer
|
||||
# via genai-perf
|
||||
rouge-score==0.1.2
|
||||
# via lm-eval
|
||||
rpds-py==0.20.1
|
||||
@ -555,8 +550,6 @@ setuptools==75.8.0
|
||||
# via
|
||||
# pytablewriter
|
||||
# torch
|
||||
shellingham==1.5.4
|
||||
# via typer
|
||||
six==1.16.0
|
||||
# via
|
||||
# python-dateutil
|
||||
@ -607,7 +600,6 @@ torch==2.6.0
|
||||
# accelerate
|
||||
# bitsandbytes
|
||||
# encodec
|
||||
# fastsafetensors
|
||||
# lm-eval
|
||||
# peft
|
||||
# runai-model-streamer
|
||||
@ -662,8 +654,6 @@ typepy==1.3.2
|
||||
# dataproperty
|
||||
# pytablewriter
|
||||
# tabledata
|
||||
typer==0.15.2
|
||||
# via fastsafetensors
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# huggingface-hub
|
||||
@ -673,7 +663,6 @@ typing-extensions==4.12.2
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
# torch
|
||||
# typer
|
||||
tzdata==2024.2
|
||||
# via pandas
|
||||
urllib3==2.2.3
|
||||
|
||||
1
setup.py
1
setup.py
@ -680,7 +680,6 @@ setup(
|
||||
install_requires=get_requirements(),
|
||||
extras_require={
|
||||
"tensorizer": ["tensorizer>=2.9.0"],
|
||||
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
|
||||
"runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
|
||||
"audio": ["librosa", "soundfile"], # Required for audio processing
|
||||
"video": ["decord"] # Required for video processing
|
||||
|
||||
@ -1,38 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import Cython.Compiler.Options
|
||||
from Cython.Build import cythonize
|
||||
from setuptools import setup
|
||||
|
||||
Cython.Compiler.Options.annotate = True
|
||||
|
||||
infiles = []
|
||||
|
||||
infiles += [
|
||||
"vllm/engine/llm_engine.py",
|
||||
"vllm/transformers_utils/detokenizer.py",
|
||||
"vllm/engine/output_processor/single_step.py",
|
||||
"vllm/outputs.py",
|
||||
"vllm/engine/output_processor/stop_checker.py",
|
||||
]
|
||||
|
||||
infiles += [
|
||||
"vllm/core/scheduler.py",
|
||||
"vllm/sequence.py",
|
||||
"vllm/core/block_manager.py",
|
||||
]
|
||||
|
||||
infiles += [
|
||||
"vllm/model_executor/layers/sampler.py",
|
||||
"vllm/sampling_params.py",
|
||||
"vllm/utils.py",
|
||||
]
|
||||
|
||||
setup(ext_modules=cythonize(infiles,
|
||||
annotate=False,
|
||||
force=True,
|
||||
compiler_directives={
|
||||
'language_level': "3",
|
||||
'infer_types': True
|
||||
}))
|
||||
|
||||
# example usage: python3 build_cython.py build_ext --inplace
|
||||
@ -1,5 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import copy
|
||||
|
||||
import pickle
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -9,63 +10,32 @@ from vllm.compilation.pass_manager import PostGradPassManager
|
||||
from vllm.config import CompilationConfig
|
||||
|
||||
|
||||
# dummy custom pass that doesn't inherit
|
||||
def simple_callable(graph: torch.fx.Graph):
|
||||
pass
|
||||
|
||||
|
||||
# Should fail to add directly to the pass manager
|
||||
def test_bad_callable():
|
||||
config = CompilationConfig().pass_config
|
||||
|
||||
pass_manager = PostGradPassManager()
|
||||
pass_manager.configure(config)
|
||||
|
||||
with pytest.raises(AssertionError):
|
||||
pass_manager.add(simple_callable) # noqa, type wrong on purpose
|
||||
|
||||
|
||||
# Pass that inherits from InductorPass
|
||||
class ProperPass(InductorPass):
|
||||
|
||||
def __call__(self, graph: torch.fx.graph.Graph) -> None:
|
||||
pass
|
||||
callable_uuid = CallableInductorPass(simple_callable,
|
||||
InductorPass.hash_source(__file__))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"callable",
|
||||
"works, callable",
|
||||
[
|
||||
ProperPass(),
|
||||
# Can also wrap callables in CallableInductorPass for compliance
|
||||
CallableInductorPass(simple_callable),
|
||||
CallableInductorPass(simple_callable,
|
||||
InductorPass.hash_source(__file__))
|
||||
(False, simple_callable),
|
||||
(True, callable_uuid),
|
||||
(True, CallableInductorPass(simple_callable)),
|
||||
],
|
||||
)
|
||||
def test_pass_manager_uuid(callable):
|
||||
def test_pass_manager(works: bool, callable):
|
||||
config = CompilationConfig().pass_config
|
||||
|
||||
pass_manager = PostGradPassManager()
|
||||
pass_manager.configure(config)
|
||||
|
||||
# Check that UUID is different if the same pass is added 2x
|
||||
pass_manager.add(callable)
|
||||
uuid1 = pass_manager.uuid()
|
||||
pass_manager.add(callable)
|
||||
uuid2 = pass_manager.uuid()
|
||||
assert uuid1 != uuid2
|
||||
|
||||
# UUID should be the same as the original one,
|
||||
# as we constructed in the same way.
|
||||
pass_manager2 = PostGradPassManager()
|
||||
pass_manager2.configure(config)
|
||||
pass_manager2.add(callable)
|
||||
assert uuid1 == pass_manager2.uuid()
|
||||
|
||||
# UUID should be different due to config change
|
||||
config2 = copy.deepcopy(config)
|
||||
config2.enable_fusion = not config2.enable_fusion
|
||||
pass_manager3 = PostGradPassManager()
|
||||
pass_manager3.configure(config2)
|
||||
pass_manager3.add(callable)
|
||||
assert uuid1 != pass_manager3.uuid()
|
||||
# Try to add the callable to the pass manager
|
||||
if works:
|
||||
pass_manager.add(callable)
|
||||
pickle.dumps(pass_manager)
|
||||
else:
|
||||
with pytest.raises(AssertionError):
|
||||
pass_manager.add(callable)
|
||||
|
||||
@ -175,8 +175,6 @@ TEXT_GENERATION_MODELS = {
|
||||
"inceptionai/jais-13b-chat": PPTestSettings.fast(),
|
||||
"ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
|
||||
"meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
|
||||
# Tests TransformersModel
|
||||
"ArthurZ/Ilama-3.2-1B": PPTestSettings.fast(),
|
||||
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
|
||||
"openbmb/MiniCPM3-4B": PPTestSettings.fast(),
|
||||
# Uses Llama
|
||||
@ -245,7 +243,6 @@ TEST_MODELS = [
|
||||
# [LANGUAGE GENERATION]
|
||||
"microsoft/Phi-3.5-MoE-instruct",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
# "ArthurZ/Ilama-3.2-1B", NOTE: Uncomment after #13905
|
||||
"ibm/PowerLM-3b",
|
||||
# [LANGUAGE EMBEDDING]
|
||||
"intfloat/e5-mistral-7b-instruct",
|
||||
|
||||
@ -21,9 +21,18 @@ def test_collective_rpc(tp_size, backend):
|
||||
def echo_rank(self):
|
||||
return self.rank
|
||||
|
||||
from vllm.worker.worker import Worker
|
||||
|
||||
class MyWorker(Worker):
|
||||
|
||||
def echo_rank(self):
|
||||
return self.rank
|
||||
|
||||
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
|
||||
enforce_eager=True,
|
||||
load_format="dummy",
|
||||
tensor_parallel_size=tp_size,
|
||||
distributed_executor_backend=backend)
|
||||
assert llm.collective_rpc(echo_rank) == list(range(tp_size))
|
||||
distributed_executor_backend=backend,
|
||||
worker_cls=MyWorker)
|
||||
for method in ["echo_rank", echo_rank]:
|
||||
assert llm.collective_rpc(method) == list(range(tp_size))
|
||||
|
||||
@ -107,10 +107,8 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
|
||||
# Call the function and get the result
|
||||
result = apply_hf_chat_template(
|
||||
tokenizer,
|
||||
trust_remote_code=True,
|
||||
conversation=mock_request.messages,
|
||||
chat_template=mock_request.chat_template or template_content,
|
||||
tools=None,
|
||||
add_generation_prompt=mock_request.add_generation_prompt,
|
||||
continue_final_message=mock_request.continue_final_message,
|
||||
)
|
||||
|
||||
@ -1,145 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# a reasoning and tool calling model
|
||||
MODEL_NAME = "Qwen/QwQ-32B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server(): # noqa: F811
|
||||
args = [
|
||||
"--max-model-len", "8192", "--enforce-eager", "--enable-reasoning",
|
||||
"--reasoning-parser", "deepseek_r1", "--enable-auto-tool-choice",
|
||||
"--tool-call-parser", "hermes"
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
TOOLS = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type":
|
||||
"string",
|
||||
"description":
|
||||
"The city to find the weather for, e.g. 'San Francisco'"
|
||||
},
|
||||
"state": {
|
||||
"type":
|
||||
"string",
|
||||
"description":
|
||||
"the two-letter abbreviation for the state that the city is"
|
||||
" in, e.g. 'CA' which would mean 'California'"
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"]
|
||||
}
|
||||
},
|
||||
"required": ["city", "state", "unit"]
|
||||
}
|
||||
}
|
||||
}]
|
||||
|
||||
MESSAGES = [{
|
||||
"role": "user",
|
||||
"content": "Hi! How are you doing today?"
|
||||
}, {
|
||||
"role": "assistant",
|
||||
"content": "I'm doing well! How can I help you?"
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
|
||||
}]
|
||||
|
||||
FUNC_NAME = "get_current_weather"
|
||||
FUNC_ARGS = """{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}"""
|
||||
|
||||
|
||||
def extract_reasoning_and_calls(chunks: list):
|
||||
reasoning_content = ""
|
||||
tool_call_idx = -1
|
||||
arguments = []
|
||||
function_names = []
|
||||
for chunk in chunks:
|
||||
if chunk.choices[0].delta.tool_calls:
|
||||
tool_call = chunk.choices[0].delta.tool_calls[0]
|
||||
if tool_call.index != tool_call_idx:
|
||||
tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
|
||||
arguments.append("")
|
||||
function_names.append("")
|
||||
|
||||
if tool_call.function:
|
||||
if tool_call.function.name:
|
||||
function_names[tool_call_idx] = tool_call.function.name
|
||||
|
||||
if tool_call.function.arguments:
|
||||
arguments[tool_call_idx] += tool_call.function.arguments
|
||||
else:
|
||||
if hasattr(chunk.choices[0].delta, "reasoning_content"):
|
||||
reasoning_content += chunk.choices[0].delta.reasoning_content
|
||||
return reasoning_content, arguments, function_names
|
||||
|
||||
|
||||
# test streaming
|
||||
@pytest.mark.asyncio
|
||||
async def test_chat_streaming_of_tool_and_reasoning(
|
||||
client: openai.AsyncOpenAI):
|
||||
|
||||
stream = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=MESSAGES,
|
||||
tools=TOOLS,
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
)
|
||||
|
||||
chunks = []
|
||||
async for chunk in stream:
|
||||
chunks.append(chunk)
|
||||
|
||||
reasoning_content, arguments, function_names = extract_reasoning_and_calls(
|
||||
chunks)
|
||||
assert len(reasoning_content) > 0
|
||||
assert len(function_names) > 0 and function_names[0] == FUNC_NAME
|
||||
assert len(arguments) > 0 and arguments[0] == FUNC_ARGS
|
||||
|
||||
|
||||
# test full generate
|
||||
@pytest.mark.asyncio
|
||||
async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
|
||||
|
||||
tool_calls = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=MESSAGES,
|
||||
tools=TOOLS,
|
||||
temperature=0.0,
|
||||
stream=False,
|
||||
)
|
||||
|
||||
assert len(tool_calls.choices[0].message.reasoning_content) > 0
|
||||
assert tool_calls.choices[0].message.tool_calls[0].function.name \
|
||||
== FUNC_NAME
|
||||
assert tool_calls.choices[0].message.tool_calls[0].function.arguments \
|
||||
== FUNC_ARGS
|
||||
@ -87,7 +87,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.finish_reason == "length"
|
||||
assert chat_completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=10, prompt_tokens=6287, total_tokens=6297)
|
||||
completion_tokens=10, prompt_tokens=6299, total_tokens=6309)
|
||||
|
||||
message = choice.message
|
||||
message = chat_completion.choices[0].message
|
||||
@ -180,7 +180,7 @@ async def test_single_chat_session_video_base64encoded(
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.finish_reason == "length"
|
||||
assert chat_completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=10, prompt_tokens=6287, total_tokens=6297)
|
||||
completion_tokens=10, prompt_tokens=6299, total_tokens=6309)
|
||||
|
||||
message = choice.message
|
||||
message = chat_completion.choices[0].message
|
||||
|
||||
@ -4,13 +4,10 @@ import warnings
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from packaging.version import Version
|
||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.chat_utils import (_resolve_hf_chat_template,
|
||||
_try_extract_ast, load_chat_template,
|
||||
from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
|
||||
parse_chat_messages,
|
||||
parse_chat_messages_futures,
|
||||
resolve_chat_template_content_format)
|
||||
@ -26,10 +23,8 @@ EXAMPLES_DIR = VLLM_PATH / "examples"
|
||||
PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
|
||||
ULTRAVOX_MODEL_ID = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
|
||||
QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
|
||||
QWEN25VL_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
||||
LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B"
|
||||
HERMES_MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
@ -708,70 +703,25 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
|
||||
|
||||
vllm_result = apply_hf_chat_template(
|
||||
tokenizer,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
conversation=conversation,
|
||||
chat_template=None,
|
||||
tools=None,
|
||||
add_generation_prompt=True,
|
||||
)
|
||||
|
||||
assert hf_result == vllm_result
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
QWEN2VL_MODEL_ID, # tokenizer.chat_template is of type str
|
||||
HERMES_MODEL_ID, # tokenizer.chat_template is of type dict
|
||||
])
|
||||
@pytest.mark.parametrize("use_tools", [True, False])
|
||||
def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
|
||||
"""checks that chat_template is a dict type for HF models."""
|
||||
|
||||
# Build the tokenizer group and grab the underlying tokenizer
|
||||
tokenizer_group = TokenizerGroup(
|
||||
model,
|
||||
enable_lora=False,
|
||||
max_num_seqs=5,
|
||||
max_input_length=None,
|
||||
)
|
||||
tokenizer = tokenizer_group.tokenizer
|
||||
|
||||
tools = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema
|
||||
}
|
||||
}] if use_tools else None
|
||||
|
||||
# Test detecting the tokenizer's chat_template
|
||||
chat_template = _resolve_hf_chat_template(
|
||||
tokenizer,
|
||||
chat_template=None,
|
||||
tools=tools,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
assert isinstance(chat_template, str)
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("model", "expected_format"),
|
||||
[(PHI3V_MODEL_ID, "string"),
|
||||
(QWEN2VL_MODEL_ID, "openai"),
|
||||
(QWEN25VL_MODEL_ID, "openai"),
|
||||
(ULTRAVOX_MODEL_ID, "string"),
|
||||
(MLLAMA_MODEL_ID, "openai"),
|
||||
(LLAMA_GUARD_MODEL_ID, "openai")],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_resolve_content_format_hf_defined(model, expected_format):
|
||||
if model == QWEN25VL_MODEL_ID and Version(TRANSFORMERS_VERSION) < Version(
|
||||
"4.49.0"):
|
||||
pytest.skip("Qwen2.5-VL requires transformers>=4.49.0")
|
||||
|
||||
tokenizer_group = TokenizerGroup(
|
||||
model,
|
||||
enable_lora=False,
|
||||
@ -780,13 +730,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
||||
)
|
||||
tokenizer = tokenizer_group.tokenizer
|
||||
|
||||
# Test detecting the tokenizer's chat_template
|
||||
chat_template = _resolve_hf_chat_template(
|
||||
tokenizer,
|
||||
chat_template=None,
|
||||
tools=None,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
chat_template = tokenizer.chat_template
|
||||
assert isinstance(chat_template, str)
|
||||
|
||||
print("[TEXT]")
|
||||
@ -796,10 +740,8 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
||||
|
||||
resolved_format = resolve_chat_template_content_format(
|
||||
None, # Test detecting the tokenizer's chat_template
|
||||
None,
|
||||
"auto",
|
||||
tokenizer,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
assert resolved_format == expected_format
|
||||
@ -849,10 +791,8 @@ def test_resolve_content_format_examples(template_path, expected_format):
|
||||
|
||||
resolved_format = resolve_chat_template_content_format(
|
||||
chat_template,
|
||||
None,
|
||||
"auto",
|
||||
dummy_tokenizer,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
assert resolved_format == expected_format
|
||||
|
||||
@ -1,22 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.config import LoadFormat
|
||||
|
||||
test_model = "openai-community/gpt2"
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
|
||||
|
||||
|
||||
def test_model_loader_download_files(vllm_runner):
|
||||
with vllm_runner(test_model,
|
||||
load_format=LoadFormat.FASTSAFETENSORS) as llm:
|
||||
deserialized_outputs = llm.generate(prompts, sampling_params)
|
||||
assert deserialized_outputs
|
||||
@ -1,46 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import glob
|
||||
import tempfile
|
||||
|
||||
import huggingface_hub.constants
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.model_loader.weight_utils import (
|
||||
download_weights_from_hf, fastsafetensors_weights_iterator,
|
||||
safetensors_weights_iterator)
|
||||
|
||||
|
||||
def test_fastsafetensors_model_loader():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
huggingface_hub.constants.HF_HUB_OFFLINE = False
|
||||
download_weights_from_hf("openai-community/gpt2",
|
||||
allow_patterns=["*.safetensors"],
|
||||
cache_dir=tmpdir)
|
||||
safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True)
|
||||
assert len(safetensors) > 0
|
||||
|
||||
fastsafetensors_tensors = {}
|
||||
hf_safetensors_tensors = {}
|
||||
|
||||
for name, tensor in fastsafetensors_weights_iterator(
|
||||
safetensors, True):
|
||||
fastsafetensors_tensors[name] = tensor
|
||||
|
||||
for name, tensor in safetensors_weights_iterator(safetensors, True):
|
||||
hf_safetensors_tensors[name] = tensor
|
||||
|
||||
assert len(fastsafetensors_tensors) == len(hf_safetensors_tensors)
|
||||
|
||||
for name, fastsafetensors_tensor in fastsafetensors_tensors.items():
|
||||
fastsafetensors_tensor = fastsafetensors_tensor.to('cpu')
|
||||
assert fastsafetensors_tensor.dtype == hf_safetensors_tensors[
|
||||
name].dtype
|
||||
assert fastsafetensors_tensor.shape == hf_safetensors_tensors[
|
||||
name].shape
|
||||
assert torch.all(
|
||||
fastsafetensors_tensor.eq(hf_safetensors_tensors[name]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_fastsafetensors_model_loader()
|
||||
@ -749,72 +749,3 @@ def test_gather_cache_mla(kv_lora_rank, qk_rope_head_dim, block_size,
|
||||
|
||||
ops.gather_cache(src_cache, dst, block_table, cu_seq_lens, batch_size)
|
||||
torch.testing.assert_close(dst, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
|
||||
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS_MLA)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.cpu_model
|
||||
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
|
||||
@torch.inference_mode()
|
||||
def test_concat_and_cache_mla_cpu(
|
||||
kv_lora_rank: int,
|
||||
qk_rope_head_dim: int,
|
||||
num_tokens: int,
|
||||
block_size: int,
|
||||
num_blocks: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
) -> None:
|
||||
device = "cpu"
|
||||
kv_cache_dtype = "auto"
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
|
||||
total_slots = num_blocks * block_size
|
||||
slot_mapping_lst = random.sample(range(total_slots), num_tokens)
|
||||
slot_mapping = torch.tensor(slot_mapping_lst,
|
||||
dtype=torch.long,
|
||||
device=device)
|
||||
|
||||
kv_c = torch.randn(num_tokens, kv_lora_rank, dtype=dtype, device=device)
|
||||
k_pe = torch.randn(num_tokens,
|
||||
qk_rope_head_dim,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
entry_size = kv_lora_rank + qk_rope_head_dim
|
||||
|
||||
scale = torch.tensor(0.1, dtype=torch.float32, device=device)
|
||||
kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype,
|
||||
kv_cache_dtype, device)
|
||||
ref_temp = torch.zeros(*kv_cache.shape, dtype=dtype, device=device)
|
||||
|
||||
for i in range(num_tokens):
|
||||
slot = slot_mapping[i].item()
|
||||
block_idx = slot // block_size
|
||||
block_offset = slot % block_size
|
||||
ref_temp[block_idx, block_offset, :kv_lora_rank] = kv_c[i]
|
||||
ref_temp[block_idx, block_offset, kv_lora_rank:] = k_pe[i]
|
||||
|
||||
if kv_cache_dtype == "fp8":
|
||||
ref_kv_cache = torch.empty_like(ref_temp, dtype=kv_cache.dtype)
|
||||
ops.convert_fp8(ref_kv_cache,
|
||||
ref_temp,
|
||||
scale.item(),
|
||||
kv_dtype=kv_cache_dtype)
|
||||
else:
|
||||
ref_kv_cache = ref_temp
|
||||
|
||||
opcheck(
|
||||
torch.ops._C_cache_ops.concat_and_cache_mla,
|
||||
(kv_c, k_pe, kv_cache, slot_mapping, kv_cache_dtype, scale),
|
||||
test_utils=DEFAULT_OPCHECK_TEST_UTILS,
|
||||
)
|
||||
|
||||
ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping,
|
||||
kv_cache_dtype, scale)
|
||||
torch.testing.assert_close(kv_cache, ref_kv_cache)
|
||||
|
||||
@ -606,51 +606,6 @@ def test_marlin_qqq_gemm(
|
||||
assert max_diff < 0.04
|
||||
|
||||
|
||||
def test_marlin_gemm_subset_input():
|
||||
quant_type = scalar_types.uint4b8
|
||||
group_size = 128
|
||||
|
||||
size_m, size_k, size_n = 32, 1024, 2048
|
||||
big_m = size_m * 2
|
||||
big_k = size_k * 2
|
||||
|
||||
a_input = rand_data((big_m, big_k))[8:size_m + 8, 8:size_k + 8]
|
||||
b_weight = rand_data((size_k, size_n))
|
||||
|
||||
w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
|
||||
b_weight, quant_type, group_size, False)
|
||||
|
||||
marlin_zp = marlin_make_empty_g_idx(marlin_s.device)
|
||||
workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_MAX_PARALLEL)
|
||||
|
||||
output = ops.gptq_marlin_gemm(
|
||||
a_input,
|
||||
marlin_q_w,
|
||||
marlin_s,
|
||||
marlin_zp,
|
||||
g_idx,
|
||||
sort_indices,
|
||||
workspace.scratch,
|
||||
quant_type,
|
||||
a_input.shape[0],
|
||||
b_weight.shape[1],
|
||||
a_input.shape[1],
|
||||
is_k_full=True,
|
||||
has_zp=False,
|
||||
use_atomic_add=False,
|
||||
use_fp32_reduce=True,
|
||||
is_zp_float=False,
|
||||
)
|
||||
output_ref = torch.matmul(a_input, w_ref)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
|
||||
max_diff = compute_max_diff(output, output_ref)
|
||||
|
||||
assert max_diff < 0.04
|
||||
|
||||
|
||||
def test_marlin_gemm_opcheck():
|
||||
size_m = 2048
|
||||
size_n = 4096
|
||||
|
||||
@ -1,94 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import Tensor
|
||||
|
||||
import vllm._custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def cdiv(a, b):
|
||||
return (a + b - 1) // b
|
||||
|
||||
|
||||
def ref_mla(
|
||||
out: Tensor, # (bs, num_heads, v_head_dim)
|
||||
query: Tensor, # (bs, num_heads, head_dim)
|
||||
kv_cache: Tensor, # (num_blocks, block_size, head_dim)
|
||||
scale: float,
|
||||
block_tables: Tensor, # (bs, max_num_blocks)
|
||||
seq_lens: Tensor, # (bs,)
|
||||
):
|
||||
bs, num_heads, v_head_dim = out.shape
|
||||
head_dim = query.shape[2]
|
||||
|
||||
for i in range(bs):
|
||||
# gather and flatten KV-cache
|
||||
kv = kv_cache[
|
||||
block_tables[i]] # (max_num_blocks, block_size, head_dim)
|
||||
kv = kv.view(1, -1,
|
||||
head_dim)[:, :seq_lens[i]] # (1, seq_len, head_dim)
|
||||
v = kv[:, :, :v_head_dim]
|
||||
|
||||
q = query[i].view(num_heads, 1, head_dim)
|
||||
o = F.scaled_dot_product_attention(q,
|
||||
kv,
|
||||
v,
|
||||
scale=scale,
|
||||
enable_gqa=True)
|
||||
out[i] = o.view(num_heads, v_head_dim)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bs", [4])
|
||||
@pytest.mark.parametrize("mean_seq_len", [256])
|
||||
@pytest.mark.parametrize("h_q", [16])
|
||||
@pytest.mark.parametrize("d", [576])
|
||||
@pytest.mark.parametrize("dv", [512])
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("dtype", [torch.float, torch.half, torch.bfloat16])
|
||||
@pytest.mark.parametrize("varlen", [False, True])
|
||||
@pytest.mark.cpu_model
|
||||
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
|
||||
def test_mla_decode_cpu(
|
||||
bs: int,
|
||||
mean_seq_len: int,
|
||||
h_q: int,
|
||||
d: int,
|
||||
dv: int,
|
||||
block_size: int,
|
||||
dtype: torch.dtype,
|
||||
varlen: bool,
|
||||
):
|
||||
torch.set_default_dtype(dtype)
|
||||
torch.manual_seed(0)
|
||||
|
||||
scale = d**(-0.5)
|
||||
if varlen:
|
||||
seq_lens = torch.empty(bs).normal_(mean_seq_len, mean_seq_len / 2)
|
||||
seq_lens = seq_lens.clip(2).to(torch.int32)
|
||||
else:
|
||||
seq_lens = torch.full((bs, ), mean_seq_len, dtype=torch.int32)
|
||||
max_seq_len = seq_lens.max().item()
|
||||
seqlen_pad = cdiv(max_seq_len, 256) * 256 # is this necessary?
|
||||
|
||||
q = torch.randn(bs, h_q, d)
|
||||
block_table = torch.arange(bs * seqlen_pad // block_size,
|
||||
dtype=torch.int32)
|
||||
block_table = block_table.view(bs, seqlen_pad // block_size)
|
||||
|
||||
kv_cache = torch.randn(block_table.numel(), block_size, d)
|
||||
for i, seq_len in enumerate(seq_lens.tolist()):
|
||||
kv_cache.view(bs, seqlen_pad, d)[i, seq_len:] = float("nan")
|
||||
|
||||
out_mla = q.new_zeros(bs, h_q, dv)
|
||||
ops.mla_decode_kvcache_cpu(out_mla, q, kv_cache, scale, block_table,
|
||||
seq_lens)
|
||||
|
||||
out_ref = q.new_zeros(bs, h_q, dv)
|
||||
ref_mla(out_ref, q, kv_cache, scale, block_table, seq_lens)
|
||||
|
||||
assert not out_mla.isnan().any(), "Likely read out of bounds"
|
||||
torch.testing.assert_close(out_mla, out_ref)
|
||||
@ -3,11 +3,8 @@
|
||||
|
||||
Run `pytest tests/kernels/test_moe.py`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from torch.nn import Parameter
|
||||
from torch.nn import functional as F
|
||||
from transformers import MixtralConfig
|
||||
from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
|
||||
|
||||
@ -40,7 +37,6 @@ TOP_KS = [2, 6]
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("ep_size", EP_SIZE)
|
||||
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("padding", [True, False])
|
||||
def test_fused_moe(
|
||||
m: int,
|
||||
n: int,
|
||||
@ -49,7 +45,6 @@ def test_fused_moe(
|
||||
topk: int,
|
||||
ep_size: int,
|
||||
dtype: torch.dtype,
|
||||
padding: bool,
|
||||
):
|
||||
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
|
||||
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
|
||||
@ -70,23 +65,6 @@ def test_fused_moe(
|
||||
else:
|
||||
e_map = None
|
||||
|
||||
torch_output = torch_moe(a, w1, w2, score, topk, e_map)
|
||||
iterative_output = iterative_moe(a,
|
||||
w1,
|
||||
w2,
|
||||
score,
|
||||
topk,
|
||||
global_num_experts=e,
|
||||
expert_map=e_map,
|
||||
renormalize=False)
|
||||
|
||||
# Pad the weight if moe padding is enabled
|
||||
if padding:
|
||||
w1 = F.pad(w1, (0, 128), "constant", 0)[..., 0:-128]
|
||||
torch.cuda.empty_cache()
|
||||
w2 = F.pad(w2, (0, 128), "constant", 0)[..., 0:-128]
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
triton_output = fused_moe(a,
|
||||
w1,
|
||||
w2,
|
||||
@ -95,7 +73,16 @@ def test_fused_moe(
|
||||
global_num_experts=e,
|
||||
expert_map=e_map,
|
||||
renormalize=False)
|
||||
torch_output = torch_moe(a, w1, w2, score, topk, e_map)
|
||||
torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
|
||||
iterative_output = iterative_moe(a,
|
||||
w1,
|
||||
w2,
|
||||
score,
|
||||
topk,
|
||||
global_num_experts=e,
|
||||
expert_map=e_map,
|
||||
renormalize=False)
|
||||
torch.testing.assert_close(iterative_output,
|
||||
torch_output,
|
||||
atol=2e-2,
|
||||
@ -215,9 +202,8 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
|
||||
|
||||
@pytest.mark.parametrize("dtype",
|
||||
[torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("padding", [True, False])
|
||||
@torch.inference_mode()
|
||||
def test_mixtral_moe(dtype: torch.dtype, padding: bool):
|
||||
def test_mixtral_moe(dtype: torch.dtype):
|
||||
"""Make sure our Mixtral MoE implementation agrees with the one from
|
||||
huggingface."""
|
||||
|
||||
@ -247,17 +233,6 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool):
|
||||
# vLLM uses 1D query [num_tokens, hidden_dim]
|
||||
vllm_inputs = hf_inputs.flatten(0, 1)
|
||||
|
||||
# Pad the weight if moe padding is enabled
|
||||
if padding:
|
||||
vllm_moe.experts.w13_weight = Parameter(F.pad(
|
||||
vllm_moe.experts.w13_weight, (0, 128), "constant", 0)[..., 0:-128],
|
||||
requires_grad=False)
|
||||
torch.cuda.empty_cache()
|
||||
vllm_moe.experts.w2_weight = Parameter(F.pad(
|
||||
vllm_moe.experts.w2_weight, (0, 128), "constant", 0)[..., 0:-128],
|
||||
requires_grad=False)
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
# Run forward passes for both MoE blocks
|
||||
hf_states, _ = hf_moe.forward(hf_inputs)
|
||||
vllm_states = vllm_moe.forward(vllm_inputs)
|
||||
|
||||
@ -163,24 +163,24 @@ VLM_TEST_SETTINGS = {
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
#### Extended model tests
|
||||
"aria": VLMTestInfo(
|
||||
models=["rhymes-ai/Aria"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<vlm_image>Please describe the image shortly.",
|
||||
"cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
|
||||
}),
|
||||
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
|
||||
stop_str=["<|im_end|>"],
|
||||
image_size_factors=[(0.10, 0.15)],
|
||||
max_tokens=64,
|
||||
marks=[large_gpu_mark(min_gb=64)],
|
||||
),
|
||||
# "aria": VLMTestInfo(
|
||||
# models=["rhymes-ai/Aria"],
|
||||
# test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
# prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
|
||||
# img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
|
||||
# max_model_len=4096,
|
||||
# max_num_seqs=2,
|
||||
# auto_cls=AutoModelForImageTextToText,
|
||||
# single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
# "stop_sign": "<vlm_image>Please describe the image shortly.",
|
||||
# "cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
|
||||
# }),
|
||||
# multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
|
||||
# stop_str=["<|im_end|>"],
|
||||
# image_size_factors=[(0.10, 0.15)],
|
||||
# max_tokens=64,
|
||||
# marks=[large_gpu_mark(min_gb=64)],
|
||||
# ),
|
||||
"blip2": VLMTestInfo(
|
||||
models=["Salesforce/blip2-opt-2.7b"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
@ -352,7 +352,6 @@ VLM_TEST_SETTINGS = {
|
||||
prompt_formatter=lambda vid_prompt: f"USER: {vid_prompt} ASSISTANT:",
|
||||
num_video_frames=16,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
|
||||
),
|
||||
@ -385,7 +384,7 @@ VLM_TEST_SETTINGS = {
|
||||
),
|
||||
"minicpmo_26": VLMTestInfo(
|
||||
models=["openbmb/MiniCPM-o-2_6"],
|
||||
test_type=(VLMTestType.IMAGE),
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
|
||||
max_model_len=4096,
|
||||
@ -394,21 +393,9 @@ VLM_TEST_SETTINGS = {
|
||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
|
||||
),
|
||||
"minicpmo_26_multi_image": VLMTestInfo(
|
||||
models=["openbmb/MiniCPM-o-2_6"],
|
||||
test_type=(VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
|
||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"minicpmv_26": VLMTestInfo(
|
||||
models=["openbmb/MiniCPM-V-2_6"],
|
||||
test_type=(VLMTestType.IMAGE),
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
|
||||
max_model_len=4096,
|
||||
@ -417,18 +404,6 @@ VLM_TEST_SETTINGS = {
|
||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
|
||||
),
|
||||
"minicpmv_26_multi_image": VLMTestInfo(
|
||||
models=["openbmb/MiniCPM-V-2_6"],
|
||||
test_type=(VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
|
||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"molmo": VLMTestInfo(
|
||||
models=["allenai/Molmo-7B-D-0924"],
|
||||
test_type=(VLMTestType.IMAGE),
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import copy
|
||||
from functools import partial
|
||||
from typing import Optional, Union
|
||||
|
||||
@ -28,7 +29,7 @@ def _test_processing_correctness(
|
||||
hit_rate: float,
|
||||
num_batches: int,
|
||||
simplify_rate: float,
|
||||
ignore_mm_keys: Optional[set[str]] = None,
|
||||
ignore_mm_keys: Optional[list[str]] = None,
|
||||
):
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
@ -144,7 +145,7 @@ def _test_processing_correctness_hf(
|
||||
baseline_processor: BaseMultiModalProcessor,
|
||||
cached_processor: BaseMultiModalProcessor,
|
||||
batch_idx: int,
|
||||
ignore_mm_keys: Optional[set[str]] = None,
|
||||
ignore_mm_keys: Optional[list[str]] = None,
|
||||
):
|
||||
if model_config.hf_config.model_type in ("mllama", "whisper", "ultravox"):
|
||||
# For some multimodal models, tokenizer will always add bos_token
|
||||
@ -166,12 +167,11 @@ def _test_processing_correctness_hf(
|
||||
hf_processor_mm_kwargs={},
|
||||
)
|
||||
|
||||
_assert_inputs_equal(
|
||||
assert _inputs_equal(
|
||||
baseline_result,
|
||||
cached_result,
|
||||
ignore_mm_keys=ignore_mm_keys,
|
||||
msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
|
||||
)
|
||||
ignore_mm_keys,
|
||||
), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
|
||||
|
||||
baseline_tokenized_result = baseline_processor.apply(
|
||||
token_prompt,
|
||||
@ -179,12 +179,11 @@ def _test_processing_correctness_hf(
|
||||
hf_processor_mm_kwargs={},
|
||||
)
|
||||
|
||||
_assert_inputs_equal(
|
||||
assert _inputs_equal(
|
||||
baseline_result,
|
||||
baseline_tokenized_result,
|
||||
ignore_mm_keys=ignore_mm_keys,
|
||||
msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
|
||||
)
|
||||
ignore_mm_keys,
|
||||
), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
|
||||
|
||||
cached_tokenized_result = cached_processor.apply(
|
||||
token_prompt,
|
||||
@ -192,12 +191,11 @@ def _test_processing_correctness_hf(
|
||||
hf_processor_mm_kwargs={},
|
||||
)
|
||||
|
||||
_assert_inputs_equal(
|
||||
assert _inputs_equal(
|
||||
cached_result,
|
||||
cached_tokenized_result,
|
||||
ignore_mm_keys=ignore_mm_keys,
|
||||
msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
|
||||
)
|
||||
ignore_mm_keys,
|
||||
), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
|
||||
|
||||
|
||||
def _test_processing_correctness_mistral(
|
||||
@ -208,7 +206,7 @@ def _test_processing_correctness_mistral(
|
||||
baseline_processor: BaseMultiModalProcessor,
|
||||
cached_processor: BaseMultiModalProcessor,
|
||||
batch_idx: int,
|
||||
ignore_mm_keys: Optional[set[str]] = None,
|
||||
ignore_mm_keys: Optional[list[str]] = None,
|
||||
):
|
||||
images = mm_data.get("image", [])
|
||||
if not isinstance(images, list):
|
||||
@ -235,12 +233,11 @@ def _test_processing_correctness_mistral(
|
||||
hf_processor_mm_kwargs={},
|
||||
)
|
||||
|
||||
_assert_inputs_equal(
|
||||
assert _inputs_equal(
|
||||
baseline_tokenized_result,
|
||||
cached_tokenized_result,
|
||||
ignore_mm_keys=ignore_mm_keys,
|
||||
msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
|
||||
)
|
||||
ignore_mm_keys,
|
||||
), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@ -264,7 +261,6 @@ def _test_processing_correctness_mistral(
|
||||
"TIGER-Lab/Mantis-8B-siglip-llama3",
|
||||
"mistralai/Pixtral-12B-2409",
|
||||
"mistral-community/pixtral-12b",
|
||||
"openbmb/MiniCPM-Llama3-V-2_5",
|
||||
"openbmb/MiniCPM-o-2_6",
|
||||
"openbmb/MiniCPM-V-2_6",
|
||||
"allenai/Molmo-7B-D-0924",
|
||||
@ -294,7 +290,7 @@ def test_processing_correctness(
|
||||
# In Ultravox, the audio_features can be different depending on padding
|
||||
# The slight difference should not be a problem though, since
|
||||
# attention_mask lets us ignore the difference.
|
||||
ignore_mm_keys = {"audio_features"}
|
||||
ignore_mm_keys = ['audio_features']
|
||||
|
||||
_test_processing_correctness(
|
||||
model_id,
|
||||
@ -332,26 +328,38 @@ def test_processing_correctness_phi3v(
|
||||
)
|
||||
|
||||
|
||||
def _assert_inputs_equal(
|
||||
def _inputs_equal(
|
||||
a: MultiModalInputs,
|
||||
b: MultiModalInputs,
|
||||
*,
|
||||
ignore_mm_keys: Optional[set[str]] = None,
|
||||
msg: str = "",
|
||||
ignore_mm_keys: Optional[list[str]] = None,
|
||||
):
|
||||
if ignore_mm_keys is None:
|
||||
ignore_mm_keys = set()
|
||||
return _drop_mm_kwargs_keys(a, ignore_mm_keys) == _drop_mm_kwargs_keys(
|
||||
b, ignore_mm_keys)
|
||||
|
||||
if msg is None:
|
||||
assert "mm_kwargs" in a and "mm_kwargs" in b
|
||||
else:
|
||||
assert "mm_kwargs" in a and "mm_kwargs" in b, msg
|
||||
|
||||
for key in ignore_mm_keys:
|
||||
a["mm_kwargs"].pop(key, None)
|
||||
b["mm_kwargs"].pop(key, None)
|
||||
def _drop_mm_kwargs_keys(
|
||||
result: MultiModalInputs,
|
||||
ignore_mm_keys: Optional[list[str]] = None,
|
||||
) -> MultiModalInputs:
|
||||
"""Drop specified keys from result['mm_kwargs'].
|
||||
|
||||
if msg is None:
|
||||
assert a == b
|
||||
else:
|
||||
assert a == b, msg
|
||||
This is mainly to avoid doing exact match of audio_features in ultravox.
|
||||
|
||||
Args:
|
||||
result: Result to drop keys from
|
||||
ignore_mm_keys: List of keys to ignore, e.g. ['audio_features']
|
||||
"""
|
||||
if not ignore_mm_keys:
|
||||
return result
|
||||
|
||||
if 'mm_kwargs' in result:
|
||||
result = copy.deepcopy(result)
|
||||
mm_kwargs = result['mm_kwargs']
|
||||
for key in ignore_mm_keys:
|
||||
mm_kwargs.pop(key, None)
|
||||
for items in mm_kwargs._items_by_modality.values():
|
||||
for item in items:
|
||||
for key in ignore_mm_keys:
|
||||
item.pop(key, None)
|
||||
|
||||
return result
|
||||
|
||||
@ -56,7 +56,7 @@ def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
def maybe_assert_ngram_worker(llm):
|
||||
# Verify the proposer worker is ngram if ngram is specified.
|
||||
if (llm.llm_engine.speculative_config is not None
|
||||
and llm.llm_engine.speculative_config.method == "ngram"):
|
||||
and llm.llm_engine.speculative_config.ngram_prompt_lookup_max > 0):
|
||||
from vllm.spec_decode.ngram_worker import NGramWorker
|
||||
assert isinstance(
|
||||
llm.llm_engine.model_executor.driver_worker.proposer_worker,
|
||||
|
||||
@ -7,39 +7,28 @@ from vllm import SamplingParams
|
||||
from .conftest import get_output_from_llm_generator
|
||||
|
||||
|
||||
@pytest.mark.parametrize("common_llm_kwargs",
|
||||
[{
|
||||
"model": "meta-llama/Llama-3.2-1B-Instruct",
|
||||
}])
|
||||
@pytest.mark.parametrize("common_llm_kwargs", [{
|
||||
"model": "meta-llama/Llama-3.2-1B-Instruct",
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
}])
|
||||
@pytest.mark.parametrize(
|
||||
"per_test_common_llm_kwargs",
|
||||
[
|
||||
{
|
||||
# Speculative max model len > overridden max model len should raise.
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"max_model_len": 129,
|
||||
},
|
||||
"max_model_len": 128,
|
||||
"speculative_max_model_len": 129,
|
||||
},
|
||||
{
|
||||
# Speculative max model len > draft max model len should raise.
|
||||
# https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"max_model_len": 2048 + 1,
|
||||
},
|
||||
"speculative_max_model_len": 2048 + 1,
|
||||
},
|
||||
{
|
||||
# Speculative max model len > target max model len should raise.
|
||||
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"max_model_len": 131072 + 1,
|
||||
},
|
||||
# https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
|
||||
"speculative_max_model_len": 131072 + 1,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{}])
|
||||
|
||||
@ -57,10 +57,8 @@ PRECISION = "float32"
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -97,19 +95,18 @@ def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs": False,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
},
|
||||
}, {
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
{
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs": True,
|
||||
"disable_logprobs_during_spec_decoding": True,
|
||||
},
|
||||
}])
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
128,
|
||||
])
|
||||
@ -122,19 +119,18 @@ def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
batch_size: int, output_len: int, seed: int,
|
||||
logprobs: int):
|
||||
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -155,10 +151,8 @@ def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -199,10 +193,8 @@ def test_eagle_e2e_greedy_correctness_cuda_graph(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
@ -244,10 +236,8 @@ def test_eagle_e2e_greedy_correctness_with_preemption(
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": k,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": k,
|
||||
}
|
||||
# Try a range of num. speculative tokens
|
||||
for k in range(1, 1 + MAX_SPEC_TOKENS)
|
||||
@ -287,13 +277,12 @@ def test_eagle_different_k(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_by_batch_size": 4,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"speculative_disable_by_batch_size": 4
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
@ -335,10 +324,8 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "yuhuili/EAGLE-llama2-chat-7B",
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": "yuhuili/EAGLE-llama2-chat-7B",
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
@ -385,10 +372,8 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
@ -435,10 +420,8 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "yuhuili/EAGLE-Qwen2-7B-Instruct",
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": "yuhuili/EAGLE-Qwen2-7B-Instruct",
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@ -23,10 +23,8 @@ MAIN_MODEL = "JackFram/llama-68m"
|
||||
[
|
||||
{
|
||||
# Identical models.
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@ -59,33 +57,26 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
|
||||
# Skip cuda graph recording for fast test.
|
||||
"enforce_eager": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
|
||||
{
|
||||
"speculative_model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
# Explicitly specify draft model quantization
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
|
||||
"num_speculative_tokens": 5,
|
||||
"quantization": "gptq",
|
||||
},
|
||||
"speculative_model_quantization": "gptq",
|
||||
},
|
||||
# Explicitly specify GPTQ-based draft model to use marlin quantization
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
|
||||
"num_speculative_tokens": 5,
|
||||
"quantization": "marlin",
|
||||
},
|
||||
"speculative_model_quantization": "marlin",
|
||||
},
|
||||
# Not explicitly specify draft model quantization
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
|
||||
"num_speculative_tokens": 5,
|
||||
"quantization": None,
|
||||
},
|
||||
"speculative_model_quantization": None,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@ -116,16 +107,15 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
|
||||
|
||||
# Skip cuda graph recording for fast test.
|
||||
"enforce_eager": True,
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_mqa_scorer": True,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_disable_mqa_scorer": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
@ -137,7 +127,7 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
|
||||
def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
|
||||
output_len: int, seed: int):
|
||||
"""Verify that speculative decoding generates the same output
|
||||
"""Verify that ngram speculative decoding generates the same output
|
||||
with batch expansion scorer and mqa scorer.
|
||||
"""
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
|
||||
@ -27,19 +27,18 @@ from .conftest import run_equality_correctness_test_tp
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
[
|
||||
"--speculative_config",
|
||||
str({
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
}),
|
||||
"--speculative-model",
|
||||
"JackFram/llama-68m",
|
||||
"--num-speculative-tokens",
|
||||
"3",
|
||||
],
|
||||
[
|
||||
"--speculative_config",
|
||||
str({
|
||||
"model": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
}),
|
||||
"--speculative-model",
|
||||
"[ngram]",
|
||||
"--num-speculative-tokens",
|
||||
"5",
|
||||
"--ngram-prompt-lookup-max",
|
||||
"3",
|
||||
],
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [2])
|
||||
@ -84,24 +83,23 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
]])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
|
||||
@pytest.mark.parametrize(
|
||||
"model, test_llm_kwargs",
|
||||
[("JackFram/llama-68m", [
|
||||
"--speculative_config",
|
||||
str({
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"draft_tensor_parallel_size": 1,
|
||||
}),
|
||||
]),
|
||||
("ibm-granite/granite-3b-code-instruct", [
|
||||
"--speculative_config",
|
||||
str({
|
||||
"model": "ibm-granite/granite-3b-code-instruct",
|
||||
"num_speculative_tokens": 5,
|
||||
"draft_tensor_parallel_size": 1,
|
||||
}),
|
||||
])])
|
||||
@pytest.mark.parametrize("model, test_llm_kwargs",
|
||||
[("JackFram/llama-68m", [
|
||||
"--speculative-model",
|
||||
"JackFram/llama-68m",
|
||||
"--num_speculative-tokens",
|
||||
"5",
|
||||
"--speculative-draft-tensor-parallel-size",
|
||||
"1",
|
||||
]),
|
||||
("ibm-granite/granite-3b-code-instruct", [
|
||||
"--speculative-model",
|
||||
"ibm-granite/granite-3b-code-instruct",
|
||||
"--num_speculative-tokens",
|
||||
"5",
|
||||
"--speculative-draft-tensor-parallel-size",
|
||||
"1",
|
||||
])])
|
||||
@pytest.mark.parametrize("batch_size", [2])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
|
||||
@ -146,19 +144,18 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
|
||||
@pytest.mark.parametrize("model, test_llm_kwargs",
|
||||
[("JackFram/llama-68m", [
|
||||
"--speculative_config",
|
||||
str({
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
}),
|
||||
"--speculative-model",
|
||||
"JackFram/llama-68m",
|
||||
"--num_speculative-tokens",
|
||||
"3",
|
||||
]),
|
||||
("JackFram/llama-68m", [
|
||||
"--speculative_config",
|
||||
str({
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"draft_tensor_parallel_size": 1,
|
||||
}),
|
||||
"--speculative-model",
|
||||
"JackFram/llama-68m",
|
||||
"--num_speculative-tokens",
|
||||
"3",
|
||||
"--speculative-draft-tensor-parallel-size",
|
||||
"1",
|
||||
])])
|
||||
@pytest.mark.parametrize("logprobs", [None, 2])
|
||||
@pytest.mark.parametrize("batch_size", [2])
|
||||
|
||||
@ -24,7 +24,12 @@ SPEC_MODEL = "JackFram/llama-68m"
|
||||
"4",
|
||||
]])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
|
||||
[],
|
||||
[
|
||||
"--speculative-model",
|
||||
f"{SPEC_MODEL}",
|
||||
"--num-speculative-tokens",
|
||||
"5",
|
||||
],
|
||||
])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
|
||||
@pytest.mark.parametrize(
|
||||
@ -32,12 +37,8 @@ SPEC_MODEL = "JackFram/llama-68m"
|
||||
[
|
||||
#TODO(wooyeon): add spec_draft_dp=2 case
|
||||
[
|
||||
"--speculative_config",
|
||||
str({
|
||||
"model": f"{SPEC_MODEL}",
|
||||
"num_speculative_tokens": 5,
|
||||
"draft_tensor_parallel_size": 1,
|
||||
}),
|
||||
"--speculative-draft-tensor-parallel-size",
|
||||
"1",
|
||||
],
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [2])
|
||||
@ -77,14 +78,15 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
[
|
||||
"--speculative-model",
|
||||
f"{SPEC_MODEL}",
|
||||
"--num-speculative-tokens",
|
||||
"5",
|
||||
|
||||
# Artificially limit the draft model max model len; this forces vLLM
|
||||
# to skip speculation once the sequences grow beyond 32-k tokens.
|
||||
"--speculative_config",
|
||||
str({
|
||||
"model": f"{SPEC_MODEL}",
|
||||
"num_speculative_tokens": 5,
|
||||
"max_model_len": 32,
|
||||
}),
|
||||
"--speculative-max-model-len",
|
||||
"32",
|
||||
],
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [8])
|
||||
|
||||
@ -20,19 +20,16 @@ from .conftest import run_equality_correctness_test
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
}, {
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs": True,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
}, {
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs_during_spec_decoding": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [8])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
@ -51,20 +48,19 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
|
||||
as well as with and without chunked prefill.
|
||||
"""
|
||||
maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs)
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -77,19 +73,16 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
}, {
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 6,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
}, {
|
||||
"speculative_model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 6,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [8])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
@ -105,19 +98,18 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
|
||||
output_len: int, seed: int, logprobs: int):
|
||||
"""Veriy logprob greedy equality with different speculation lens.
|
||||
"""
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -133,15 +125,13 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize(
|
||||
"test_llm_kwargs",
|
||||
[{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs": False,
|
||||
# Artificially limit the draft model max model len; this forces
|
||||
# vLLM to skip speculation once the sequences grow beyond 32-k
|
||||
# tokens.
|
||||
"max_model_len": 32,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
|
||||
# Artificially limit the draft model max model len; this forces vLLM
|
||||
# to skip speculation once the sequences grow beyond 32-k tokens.
|
||||
"speculative_max_model_len": 32,
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [8])
|
||||
@pytest.mark.parametrize(
|
||||
@ -159,19 +149,18 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
|
||||
seed: int, logprobs: int):
|
||||
"""Verify logprobs greedy equality when some sequences skip speculation.
|
||||
"""
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -184,13 +173,12 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
@ -260,13 +248,12 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs": True,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs_during_spec_decoding": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@pytest.mark.parametrize("batch_size", [4])
|
||||
@pytest.mark.parametrize(
|
||||
@ -283,16 +270,15 @@ def test_logprobs_disabled(vllm_runner, common_llm_kwargs,
|
||||
"""Check the behavior when logprobs are disabled.
|
||||
Token choices should match with the base model.
|
||||
"""
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
@ -60,10 +60,8 @@ PRECISION = "float32"
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -109,18 +107,14 @@ def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs": True,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs_during_spec_decoding": True,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -138,20 +132,19 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
prefill_chunk_size: int):
|
||||
"""Verify greedy equality with different batch size."""
|
||||
maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
max_output_len=output_len,
|
||||
seed=seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
max_output_len=output_len,
|
||||
seed=seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -172,10 +165,8 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -223,10 +214,8 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
@ -275,10 +264,8 @@ def test_medusa_e2e_greedy_correctness_with_preemption(
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": k,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": k,
|
||||
}
|
||||
# Try a range of num. speculative tokens
|
||||
for k in range(1, 1 + MAX_SPEC_TOKENS)
|
||||
@ -325,13 +312,12 @@ def test_medusa_different_k(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_by_batch_size": 4,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"speculative_disable_by_batch_size": 4
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
@ -373,17 +359,16 @@ def test_medusa_disable_queue(vllm_runner, common_llm_kwargs,
|
||||
|
||||
# Main model
|
||||
"model_name": MAIN_MODEL,
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"speculative_disable_by_batch_size": 4
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_by_batch_size": 4,
|
||||
"disable_mqa_scorer": True,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_disable_mqa_scorer": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
|
||||
@ -62,9 +62,7 @@ PRECISION = "float32"
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -110,16 +108,12 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"disable_logprobs": True,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"disable_logprobs_during_spec_decoding": True,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [8])
|
||||
@ -139,20 +133,19 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
# up sampling different tokens at the tail (ie top tokens don't change).
|
||||
# TL;DR: sd+cp == org+cp but sd+cp != org..is this expected?
|
||||
maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs)
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
max_output_len=output_len,
|
||||
seed=seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
max_output_len=output_len,
|
||||
seed=seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -174,9 +167,7 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [2048])
|
||||
@ -218,10 +209,8 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs,
|
||||
# Main model
|
||||
"model_name": MAIN_MODEL,
|
||||
|
||||
# Speculative config
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
},
|
||||
# Speculative model
|
||||
"speculative_model": SPEC_MODEL,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
|
||||
@ -285,9 +274,7 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
@ -339,9 +326,7 @@ def test_mlp_e2e_greedy_correctness_with_preemption(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
@ -397,10 +382,8 @@ def test_mlp_e2e_greedy_correctness_with_padding(
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": k,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": k,
|
||||
}
|
||||
# Try a range of num. speculative tokens
|
||||
for k in range(1, 1 + MAX_SPEC_TOKENS)
|
||||
@ -447,12 +430,11 @@ def test_mlp_different_k(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"disable_by_batch_size": 4,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"speculative_disable_by_batch_size": 4
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
@ -493,15 +475,14 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
|
||||
|
||||
# Skip cuda graph recording for fast test.
|
||||
"enforce_eager": True,
|
||||
"speculative_model": SPEC_MODEL,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"disable_mqa_scorer": True,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_disable_mqa_scorer": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
|
||||
@ -57,9 +57,7 @@ PRECISION = "bfloat16"
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -101,16 +99,12 @@ def test_mtp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs": True,
|
||||
},
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs_during_spec_decoding": True,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -125,19 +119,18 @@ def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
batch_size: int, output_len: int, seed: int,
|
||||
logprobs: int):
|
||||
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -159,9 +152,7 @@ def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -207,9 +198,7 @@ def test_mtp_e2e_greedy_correctness_cuda_graph(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
@ -254,9 +243,7 @@ def test_mtp_e2e_greedy_correctness_with_preemption(
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_config": {
|
||||
"num_speculative_tokens": k,
|
||||
},
|
||||
"num_speculative_tokens": k,
|
||||
}
|
||||
# Try a range of num. speculative tokens
|
||||
for k in range(1, 1 + MAX_SPEC_TOKENS)
|
||||
@ -299,12 +286,11 @@ def test_mtp_different_k(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_by_batch_size": 4
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"speculative_disable_by_batch_size": 4
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
|
||||
@ -61,19 +61,15 @@ from .conftest import (get_output_from_llm_generator,
|
||||
"per_test_common_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
# Chunked prefill enabled with small value
|
||||
# to make sure we get mixed batches.
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
@ -152,23 +148,20 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
"enable_chunked_prefill": False,
|
||||
}, {
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4,
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": False,
|
||||
"disable_logprobs_during_spec_decoding": False
|
||||
}, {
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4,
|
||||
"disable_logprobs_during_spec_decoding": False
|
||||
}])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
[
|
||||
@ -191,7 +184,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
|
||||
whether all speculative tokens are accepted.
|
||||
"""
|
||||
ensure_all_accepted = per_test_common_llm_kwargs.get(
|
||||
"model_name") == test_llm_kwargs.get("speculative_config")["model"]
|
||||
"model_name") == test_llm_kwargs.get("speculative_model")
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
@ -231,17 +224,13 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
@ -294,17 +283,13 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
@ -351,17 +336,13 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
@ -410,17 +391,13 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
@ -472,17 +449,13 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
@ -541,17 +514,13 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
@ -598,25 +567,21 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
|
||||
# Artificially limit the draft model max model len; this forces vLLM
|
||||
# to skip speculation once the sequences grow beyond 32-k tokens.
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"max_model_len": 32,
|
||||
},
|
||||
"speculative_max_model_len": 32,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"max_model_len": 32,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4,
|
||||
"speculative_max_model_len": 32,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [8])
|
||||
@ -662,19 +627,15 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"disable_by_batch_size": 2,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"speculative_disable_by_batch_size": 2,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"disable_by_batch_size": 2,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"speculative_disable_by_batch_size": 2,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4,
|
||||
@ -715,19 +676,15 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": k,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": k,
|
||||
"enable_chunked_prefill": False,
|
||||
}
|
||||
# Try a range of common k, as well as large speculation.
|
||||
for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63]
|
||||
] + [{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": k,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": k,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4,
|
||||
@ -772,21 +729,17 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": k,
|
||||
"acceptance_method": "typical_acceptance_sampler",
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": k,
|
||||
"spec_decoding_acceptance_method": "typical_acceptance_sampler",
|
||||
"enable_chunked_prefill": False
|
||||
}
|
||||
# Try a range of common k.
|
||||
for k in [1, 2, 3]
|
||||
] + [{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": k,
|
||||
"acceptance_method": "typical_acceptance_sampler",
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": k,
|
||||
"spec_decoding_acceptance_method": "typical_acceptance_sampler",
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
|
||||
@ -48,20 +48,16 @@ from .conftest import run_equality_correctness_test
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
"disable_mqa_scorer": False,
|
||||
},
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
"speculative_disable_mqa_scorer": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
"disable_mqa_scorer": True,
|
||||
},
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
"speculative_disable_mqa_scorer": True,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -105,20 +101,16 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
"disable_logprobs": True,
|
||||
},
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
"disable_logprobs_during_spec_decoding": True,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -133,20 +125,19 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
batch_size: int, output_len: int, seed: int,
|
||||
logprobs: int):
|
||||
"""Verify greedy equality on a tiny model with different batch size."""
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
max_output_len=output_len,
|
||||
seed=seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
max_output_len=output_len,
|
||||
seed=seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -168,21 +159,17 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
},
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
"disable_mqa_scorer": True,
|
||||
},
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
"enable_chunked_prefill": True,
|
||||
"speculative_disable_mqa_scorer": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
},
|
||||
@ -227,21 +214,17 @@ def test_ngram_e2e_greedy_correctness_with_preemption(
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": k,
|
||||
"prompt_lookup_max": 3,
|
||||
},
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": k,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
}
|
||||
# Try a range of common k, as well as large speculation.
|
||||
for k in [1, 3, 5]
|
||||
] + [
|
||||
{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": k,
|
||||
"prompt_lookup_max": 1,
|
||||
},
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": k,
|
||||
"ngram_prompt_lookup_max": 1,
|
||||
}
|
||||
# Try a range of common k, as well as large speculation.
|
||||
for k in [1, 3, 5]
|
||||
@ -260,7 +243,7 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs,
|
||||
seed: int):
|
||||
"""Verify that ngram speculative decoding produces exact equality
|
||||
to without spec decode with many different values of k and
|
||||
different ngram prompt_lookup_max.
|
||||
different ngram_prompt_lookup_max.
|
||||
"""
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
@ -283,25 +266,22 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
"disable_by_batch_size": 4
|
||||
},
|
||||
}, {
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
"disable_by_batch_size": 4,
|
||||
"disable_mqa_scorer": True,
|
||||
},
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
"speculative_disable_by_batch_size": 4
|
||||
}, {
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
"speculative_disable_by_batch_size": 4,
|
||||
"enable_chunked_prefill": True,
|
||||
"speculative_disable_mqa_scorer": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
@ -316,7 +296,7 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
|
||||
seed: int):
|
||||
"""Verify that ngram speculative decoding produces exact equality
|
||||
to without spec decode with many different values of k and
|
||||
different ngram prompt_lookup_max.
|
||||
different ngram_prompt_lookup_max.
|
||||
"""
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
@ -336,17 +316,18 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
|
||||
|
||||
# Skip cuda graph recording for fast test.
|
||||
"enforce_eager": True,
|
||||
|
||||
# Required for spec decode.
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
"disable_mqa_scorer": True,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_disable_mqa_scorer": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
|
||||
@ -19,11 +19,11 @@ SPEC_MODEL = "JackFram/llama-160m"
|
||||
# Skip cuda graph recording for fast test.
|
||||
"enforce_eager": True,
|
||||
|
||||
# speculative config
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 3,
|
||||
},
|
||||
# speculative model
|
||||
"speculative_model": "JackFram/llama-160m",
|
||||
|
||||
# num speculative tokens
|
||||
"num_speculative_tokens": 3,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
|
||||
|
||||
@ -39,10 +39,7 @@ def ensure_system_prompt(messages: list[dict[str, Any]],
|
||||
|
||||
# universal args for all models go here. also good if you need to test locally
|
||||
# and change type or KV cache quantization or something.
|
||||
ARGS: list[str] = [
|
||||
"--enable-auto-tool-choice", "--max-model-len", "1024", "--max-num-seqs",
|
||||
"256"
|
||||
]
|
||||
ARGS: list[str] = ["--enable-auto-tool-choice", "--max-model-len", "1024"]
|
||||
|
||||
CONFIGS: dict[str, ServerConfig] = {
|
||||
"hermes": {
|
||||
|
||||
@ -5,96 +5,92 @@ import os
|
||||
import tempfile
|
||||
|
||||
import depyf
|
||||
import pytest
|
||||
|
||||
from vllm.config import CompilationLevel
|
||||
|
||||
temp_dir = tempfile.mkdtemp()
|
||||
with depyf.prepare_debug(temp_dir):
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
@pytest.mark.skip(reason="Not working; needs investigation.")
|
||||
def test_tpu_compilation():
|
||||
temp_dir = tempfile.mkdtemp()
|
||||
with depyf.prepare_debug(temp_dir):
|
||||
from vllm import LLM, SamplingParams
|
||||
prompts = [
|
||||
"A robot may not injure a human being",
|
||||
"It is only with the heart that one can see rightly;",
|
||||
"The greatest glory in living lies not in never falling,",
|
||||
]
|
||||
answers = [
|
||||
" or, through inaction, allow a human being to come to harm.",
|
||||
" what is essential is invisible to the eye.",
|
||||
" but in rising every time we fall.",
|
||||
]
|
||||
N = 1
|
||||
# Currently, top-p sampling is disabled. `top_p` should be 1.0.
|
||||
sampling_params = SamplingParams(temperature=0.7,
|
||||
top_p=1.0,
|
||||
n=N,
|
||||
max_tokens=16)
|
||||
|
||||
prompts = [
|
||||
"A robot may not injure a human being",
|
||||
"It is only with the heart that one can see rightly;",
|
||||
"The greatest glory in living lies not in never falling,",
|
||||
]
|
||||
answers = [
|
||||
" or, through inaction, allow a human being to come to harm.",
|
||||
" what is essential is invisible to the eye.",
|
||||
" but in rising every time we fall.",
|
||||
]
|
||||
N = 1
|
||||
# Currently, top-p sampling is disabled. `top_p` should be 1.0.
|
||||
sampling_params = SamplingParams(temperature=0.7,
|
||||
top_p=1.0,
|
||||
n=N,
|
||||
max_tokens=16)
|
||||
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
|
||||
# In real workloads, `enforace_eager` should be `False`.
|
||||
|
||||
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
|
||||
# In real workloads, `enforace_eager` should be `False`.
|
||||
# disable custom dispatcher, let Dynamo takes over
|
||||
# all the control
|
||||
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
max_model_len=512,
|
||||
max_num_seqs=64,
|
||||
enforce_eager=True,
|
||||
compilation_config={"level": CompilationLevel.DYNAMO_AS_IS})
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output, answer in zip(outputs, answers):
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
assert generated_text.startswith(answer)
|
||||
|
||||
# disable custom dispatcher, let Dynamo takes over
|
||||
# all the control
|
||||
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
max_model_len=512,
|
||||
max_num_seqs=64,
|
||||
enforce_eager=True,
|
||||
compilation_config={"level": CompilationLevel.DYNAMO_AS_IS})
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output, answer in zip(outputs, answers):
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
assert generated_text.startswith(answer)
|
||||
compiled_codes = sorted(
|
||||
glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
|
||||
|
||||
compiled_codes = sorted(
|
||||
glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
|
||||
for i, compiled_code in enumerate(compiled_codes):
|
||||
print("{} file: {}".format(i + 1, compiled_code))
|
||||
|
||||
for i, compiled_code in enumerate(compiled_codes):
|
||||
print("{} file: {}".format(i + 1, compiled_code))
|
||||
# We should only trigger Dynamo compilation 4 times:
|
||||
# 1. forward pass (symbolic)
|
||||
# 2. compute_logits (symbolic)
|
||||
# 3. forward pass (shape 16)
|
||||
# 4. forward pass (shape 32)
|
||||
# and later calls should not trigger Dynamo compilation again.
|
||||
# NOTE: It might still trigger XLA compilation.
|
||||
|
||||
# We should only trigger Dynamo compilation 4 times:
|
||||
# 1. forward pass (symbolic)
|
||||
# 2. compute_logits (symbolic)
|
||||
# 3. forward pass (shape 16)
|
||||
# 4. forward pass (shape 32)
|
||||
# and later calls should not trigger Dynamo compilation again.
|
||||
# NOTE: It might still trigger XLA compilation.
|
||||
# Check we have 4 compiled codes
|
||||
assert len(compiled_codes) == 4
|
||||
|
||||
# Check we have 4 compiled codes
|
||||
assert len(compiled_codes) == 4
|
||||
kv_cache_prefix = "kv_cache"
|
||||
attn_prefix = "ragged_paged_attention"
|
||||
|
||||
kv_cache_prefix = "kv_cache"
|
||||
attn_prefix = "ragged_paged_attention"
|
||||
# Check all the compilations are as expected
|
||||
compiled_fns = sorted(
|
||||
glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
|
||||
|
||||
# Check all the compilations are as expected
|
||||
compiled_fns = sorted(
|
||||
glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
|
||||
for i, compiled_fn in enumerate(compiled_fns):
|
||||
print("{} file: {}".format(i + 1, compiled_fn))
|
||||
|
||||
for i, compiled_fn in enumerate(compiled_fns):
|
||||
print("{} file: {}".format(i + 1, compiled_fn))
|
||||
# The first compilation is symbolic, so it should not have any kv_caches
|
||||
with open(compiled_fns[0]) as f:
|
||||
content = f.read()
|
||||
assert kv_cache_prefix not in content
|
||||
|
||||
# The first compilation is symbolic, so it should not have any kv_caches
|
||||
with open(compiled_fns[0]) as f:
|
||||
content = f.read()
|
||||
assert kv_cache_prefix not in content
|
||||
# The second compilation is symbolic, so it should not have any kv_caches
|
||||
with open(compiled_fns[1]) as f:
|
||||
content = f.read()
|
||||
assert kv_cache_prefix not in content
|
||||
|
||||
# The second compilation is symbolic, so it should not have any kv_caches
|
||||
with open(compiled_fns[1]) as f:
|
||||
content = f.read()
|
||||
assert kv_cache_prefix not in content
|
||||
# The third compilation is shape 16, so it should have kv_caches and the
|
||||
# ragged_paged_attention
|
||||
with open(compiled_fns[2]) as f:
|
||||
content = f.read()
|
||||
assert (kv_cache_prefix in content and attn_prefix in content)
|
||||
|
||||
# The third compilation is shape 16, so it should have kv_caches and the
|
||||
# ragged_paged_attention
|
||||
with open(compiled_fns[2]) as f:
|
||||
content = f.read()
|
||||
assert (kv_cache_prefix in content and attn_prefix in content)
|
||||
|
||||
# The forth compilation is shape 32, so it should have kv_caches and the
|
||||
# ragged_paged_attention
|
||||
with open(compiled_fns[3]) as f:
|
||||
content = f.read()
|
||||
assert (kv_cache_prefix in content and attn_prefix in content)
|
||||
# The forth compilation is shape 32, so it should have kv_caches and the
|
||||
# ragged_paged_attention
|
||||
with open(compiled_fns[3]) as f:
|
||||
content = f.read()
|
||||
assert (kv_cache_prefix in content and attn_prefix in content)
|
||||
|
||||
@ -70,16 +70,12 @@ def test_ngram_correctness(
|
||||
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
|
||||
del ref_llm
|
||||
|
||||
spec_llm = LLM(
|
||||
model=model_name,
|
||||
speculative_config={
|
||||
"method": "ngram",
|
||||
"prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 3,
|
||||
"num_speculative_tokens": 3,
|
||||
},
|
||||
max_model_len=1024,
|
||||
)
|
||||
spec_llm = LLM(model=model_name,
|
||||
speculative_model='[ngram]',
|
||||
ngram_prompt_lookup_max=5,
|
||||
ngram_prompt_lookup_min=3,
|
||||
num_speculative_tokens=3,
|
||||
max_model_len=1024)
|
||||
spec_outputs = spec_llm.chat(test_prompts, sampling_config)
|
||||
matches = 0
|
||||
misses = 0
|
||||
|
||||
@ -11,13 +11,11 @@ from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST,
|
||||
STOP_STRINGS,
|
||||
DummyOutputProcessorTestVectors,
|
||||
MockEngineCore)
|
||||
from vllm.outputs import CompletionOutput, RequestOutput
|
||||
from vllm.sampling_params import RequestOutputKind, SamplingParams
|
||||
from vllm.sequence import PromptLogprobs, SampleLogprobs
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.output_processor import (OutputProcessor,
|
||||
RequestOutputCollector)
|
||||
from vllm.v1.engine.output_processor import OutputProcessor
|
||||
from vllm.v1.metrics.stats import IterationStats
|
||||
|
||||
|
||||
@ -836,88 +834,3 @@ def test_iteration_stats(dummy_test_vectors):
|
||||
|
||||
assert iteration_stats.num_prompt_tokens == 0
|
||||
assert iteration_stats.num_generation_tokens == num_active
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_request_output_collector():
|
||||
NUM_REQS = 3
|
||||
TEXT = "a"
|
||||
|
||||
def make_outputs() -> list[RequestOutput]:
|
||||
return [
|
||||
RequestOutput(
|
||||
request_id="my-request-id",
|
||||
prompt=None,
|
||||
prompt_token_ids=[1, 2, 3],
|
||||
prompt_logprobs=None,
|
||||
outputs=[
|
||||
CompletionOutput(
|
||||
index=0,
|
||||
text=TEXT,
|
||||
token_ids=[idx],
|
||||
cumulative_logprob=(idx + 1 * 1.0),
|
||||
logprobs=[{
|
||||
"a": idx,
|
||||
"b": idx
|
||||
}],
|
||||
finish_reason="length" if
|
||||
(idx == NUM_REQS - 1) else None,
|
||||
)
|
||||
],
|
||||
finished=(idx == NUM_REQS - 1),
|
||||
) for idx in range(NUM_REQS)
|
||||
]
|
||||
|
||||
collector = RequestOutputCollector(RequestOutputKind.DELTA)
|
||||
|
||||
# CASE 1: Put then get.
|
||||
outputs = make_outputs()
|
||||
collector.put(outputs[0])
|
||||
output = await collector.get()
|
||||
assert not collector.ready.is_set()
|
||||
assert collector.output is None
|
||||
assert output.outputs[0].text == "a"
|
||||
assert output.outputs[0].token_ids == [0]
|
||||
|
||||
# CASE 2: 2 puts then get.
|
||||
num_to_put = 2
|
||||
outputs = make_outputs()
|
||||
for i in range(num_to_put):
|
||||
collector.put(outputs[i])
|
||||
output = await collector.get()
|
||||
assert not collector.ready.is_set()
|
||||
assert collector.output is None
|
||||
|
||||
assert not output.finished
|
||||
# Text, token_ids, and logprobs should get merged.
|
||||
assert output.outputs[0].text == TEXT * num_to_put
|
||||
for tok_0, tok_1 in zip(output.outputs[0].token_ids,
|
||||
list(range(num_to_put))):
|
||||
assert tok_0 == tok_1
|
||||
assert len(output.outputs[0].logprobs) == num_to_put
|
||||
|
||||
# Cumulative logprobs should be the last one.
|
||||
cumulative_logprob_expected = 1.0 * num_to_put
|
||||
assert output.outputs[0].cumulative_logprob == cumulative_logprob_expected
|
||||
|
||||
# CASE 3: Put all 3 (including a finished).
|
||||
num_to_put = 3
|
||||
outputs = make_outputs()
|
||||
for i in range(num_to_put):
|
||||
collector.put(outputs[i])
|
||||
output = await collector.get()
|
||||
assert not collector.ready.is_set()
|
||||
assert collector.output is None
|
||||
|
||||
assert output.finished
|
||||
assert output.outputs[0].finish_reason == "length"
|
||||
# Text, token_ids, and logprobs should get merged.
|
||||
assert output.outputs[0].text == TEXT * num_to_put
|
||||
for tok_0, tok_1 in zip(output.outputs[0].token_ids,
|
||||
list(range(num_to_put))):
|
||||
assert tok_0 == tok_1
|
||||
assert len(output.outputs[0].logprobs) == num_to_put
|
||||
|
||||
# Cumulative logprobs should be the last one.
|
||||
cumulative_logprob_expected = 1.0 * num_to_put
|
||||
assert output.outputs[0].cumulative_logprob == cumulative_logprob_expected
|
||||
|
||||
@ -13,7 +13,7 @@ from vllm.entrypoints.llm import LLM
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
|
||||
GUIDED_DECODING_BACKENDS_V1 = ["xgrammar", "guidance"]
|
||||
GUIDED_DECODING_BACKENDS_V1 = ["xgrammar"]
|
||||
MODELS_TO_TEST = [
|
||||
"Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410"
|
||||
]
|
||||
@ -30,13 +30,12 @@ def test_guided_json_completion(
|
||||
model_name: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
|
||||
llm = LLM(model=model_name, max_model_len=1024)
|
||||
sampling_params = SamplingParams(temperature=1.0,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
json=sample_json_schema,
|
||||
backend=guided_decoding_backend))
|
||||
outputs = llm.generate(prompts=[
|
||||
f"Give an example JSON for an employee profile "
|
||||
f"that fits this schema: {sample_json_schema}"
|
||||
@ -112,14 +111,13 @@ def test_guided_json_object(
|
||||
model_name: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=100,
|
||||
n=2,
|
||||
guided_decoding=GuidedDecodingParams(json_object=True))
|
||||
llm = LLM(model=model_name, max_model_len=1024)
|
||||
sampling_params = SamplingParams(temperature=1.0,
|
||||
max_tokens=100,
|
||||
n=2,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
json_object=True,
|
||||
backend=guided_decoding_backend))
|
||||
|
||||
outputs = llm.generate(
|
||||
prompts=("Generate a JSON object with curly braces for a person with "
|
||||
@ -139,20 +137,12 @@ def test_guided_json_object(
|
||||
|
||||
# Parse to verify it is valid JSON
|
||||
parsed_json = json.loads(generated_text)
|
||||
allowed_types: tuple[type, ...] = (dict, )
|
||||
if guided_decoding_backend == "xgrammar":
|
||||
# TODO - we are currently too permissive with xgrammar and
|
||||
# allow # any valid json (typically comes back as a list or
|
||||
# object). We can fix this by specifying a jsonschema of
|
||||
# {"type": "object"}, # but we need this fix in a release
|
||||
# first: https://github.com/mlc-ai/xgrammar/pull/264
|
||||
allowed_types = (dict, list)
|
||||
assert isinstance(parsed_json, allowed_types)
|
||||
assert isinstance(parsed_json, dict)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
GUIDED_DECODING_BACKENDS_V1 + ["auto"])
|
||||
GUIDED_DECODING_BACKENDS_V1)
|
||||
@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
|
||||
def test_guided_json_unsupported_schema(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
@ -161,43 +151,21 @@ def test_guided_json_unsupported_schema(
|
||||
model_name: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))
|
||||
if guided_decoding_backend == "xgrammar":
|
||||
with pytest.raises(ValueError,
|
||||
match="The provided JSON schema contains features "
|
||||
"not supported by xgrammar."):
|
||||
llm.generate(prompts=[
|
||||
f"Give an example JSON for an employee profile "
|
||||
f"that fits this schema: {unsupported_json_schema}"
|
||||
] * 2,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True)
|
||||
else:
|
||||
# This should work for both "guidance" and "auto".
|
||||
|
||||
outputs = llm.generate(
|
||||
prompts=("Give an example JSON object for a grade "
|
||||
"that fits this schema: "
|
||||
f"{unsupported_json_schema}"),
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True)
|
||||
assert outputs is not None
|
||||
for output in outputs:
|
||||
assert output is not None
|
||||
assert isinstance(output, RequestOutput)
|
||||
generated_text = output.outputs[0].text
|
||||
assert generated_text is not None
|
||||
print(generated_text)
|
||||
|
||||
# Parse to verify it is valid JSON
|
||||
parsed_json = json.loads(generated_text)
|
||||
assert isinstance(parsed_json, dict)
|
||||
llm = LLM(model=model_name, max_model_len=1024)
|
||||
sampling_params = SamplingParams(temperature=1.0,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
json=unsupported_json_schema,
|
||||
backend=guided_decoding_backend))
|
||||
with pytest.raises(ValueError,
|
||||
match="The provided JSON schema contains features "
|
||||
"not supported by xgrammar."):
|
||||
llm.generate(prompts=[
|
||||
f"Give an example JSON for an employee profile "
|
||||
f"that fits this schema: {unsupported_json_schema}"
|
||||
] * 2,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@ -211,14 +179,13 @@ def test_guided_grammar_ebnf(
|
||||
model_name: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(grammar=sample_sql_ebnf))
|
||||
llm = LLM(model=model_name, max_model_len=1024)
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
grammar=sample_sql_ebnf,
|
||||
backend=guided_decoding_backend))
|
||||
outputs = llm.generate(
|
||||
prompts=("Generate a sql statement that selects col_1 from "
|
||||
"table_1 where it is equal to 1"),
|
||||
@ -255,14 +222,13 @@ def test_guided_grammar_lark(
|
||||
model_name: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(grammar=sample_sql_lark))
|
||||
llm = LLM(model=model_name, max_model_len=1024)
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
grammar=sample_sql_lark,
|
||||
backend=guided_decoding_backend))
|
||||
outputs = llm.generate(
|
||||
prompts=("Generate a sql statement that selects col_1 from "
|
||||
"table_1 where it is equal to 1"),
|
||||
@ -303,15 +269,16 @@ def test_guided_grammar_ebnf_invalid(
|
||||
model_name: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(grammar="not a grammar"))
|
||||
with pytest.raises(ValueError, match="Failed to convert the grammar "):
|
||||
llm = LLM(model=model_name, max_model_len=1024)
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
grammar="not a grammar",
|
||||
backend=guided_decoding_backend))
|
||||
with pytest.raises(ValueError,
|
||||
match="Failed to convert the grammar "
|
||||
"from Lark to EBNF."):
|
||||
llm.generate(
|
||||
prompts=("Generate a sql statement that selects col_1 from "
|
||||
"table_1 where it is equal to 1"),
|
||||
@ -331,13 +298,12 @@ def test_guided_regex(
|
||||
model_name: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(regex=sample_regex))
|
||||
llm = LLM(model=model_name, max_model_len=1024)
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
regex=sample_regex,
|
||||
backend=guided_decoding_backend))
|
||||
outputs = llm.generate(
|
||||
prompts=[
|
||||
f"Give an example IPv4 address with this regex: {sample_regex}"
|
||||
@ -369,13 +335,12 @@ def test_guided_choice_completion(
|
||||
model_name: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
|
||||
llm = LLM(model=model_name, max_model_len=1024)
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
choice=sample_guided_choice,
|
||||
backend=guided_decoding_backend))
|
||||
outputs = llm.generate(
|
||||
prompts="The best language for type-safe systems programming is ",
|
||||
sampling_params=sampling_params,
|
||||
|
||||
@ -36,8 +36,6 @@ def create_logits_tensor(output_token_ids: list[list[int]],
|
||||
def create_sampling_metadata(
|
||||
all_greedy: bool,
|
||||
temperature: Optional[torch.Tensor] = None,
|
||||
top_k: Optional[torch.Tensor] = None,
|
||||
top_p: Optional[torch.Tensor] = None,
|
||||
generators: Optional[dict[int, Any]] = None,
|
||||
) -> SamplingMetadata:
|
||||
"""Create a v1 sampling metadata object with all_greedy set
|
||||
@ -54,8 +52,8 @@ def create_sampling_metadata(
|
||||
temperature=temperature,
|
||||
all_greedy=all_greedy,
|
||||
all_random=not all_greedy,
|
||||
top_p=top_p,
|
||||
top_k=top_k,
|
||||
top_p=None,
|
||||
top_k=None,
|
||||
min_p=torch.empty(1, ),
|
||||
generators=generators,
|
||||
max_num_logprobs=0,
|
||||
@ -464,147 +462,3 @@ def estimate_rejection_sampling_pdf(
|
||||
density=True)
|
||||
|
||||
return hist.hist
|
||||
|
||||
|
||||
def _test_masked_logits(
|
||||
rejection_sampler,
|
||||
batch_size: int,
|
||||
num_draft_tokens: int,
|
||||
vocab_size: int,
|
||||
target_logits: torch.Tensor,
|
||||
unmasked_indices: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata,
|
||||
):
|
||||
# Set up test parameters
|
||||
num_tokens = batch_size * num_draft_tokens
|
||||
|
||||
# Create random draft probabilities.
|
||||
draft_probs = torch.rand((num_tokens, vocab_size),
|
||||
dtype=torch.float32,
|
||||
device=DEVICE)
|
||||
draft_probs = F.softmax(draft_probs, dim=-1)
|
||||
|
||||
# Randomly sample draft token ids from draft probs
|
||||
draft_token_ids = torch.multinomial(draft_probs, num_samples=1)
|
||||
draft_token_ids = draft_token_ids.reshape(batch_size, num_draft_tokens)
|
||||
draft_token_ids = draft_token_ids.tolist()
|
||||
|
||||
# Bonus tokens not used but required
|
||||
bonus_token_ids = torch.zeros((batch_size, 1),
|
||||
dtype=torch.int64,
|
||||
device=DEVICE)
|
||||
|
||||
# Create spec decode metadata
|
||||
spec_decode_metadata = SpecDecodeMetadata.make_dummy(
|
||||
draft_token_ids,
|
||||
device=DEVICE,
|
||||
)
|
||||
|
||||
# Run rejection sampling
|
||||
output_token_ids = rejection_sampler(
|
||||
spec_decode_metadata,
|
||||
draft_probs=draft_probs,
|
||||
target_logits=target_logits,
|
||||
bonus_token_ids=bonus_token_ids,
|
||||
sampling_metadata=sampling_metadata,
|
||||
)
|
||||
|
||||
# Remove bonus tokens and reshape
|
||||
output_token_ids = output_token_ids[:, :-1].flatten().tolist()
|
||||
|
||||
# Check that all sampled tokens are within the unmasked indices.
|
||||
for i in range(num_tokens):
|
||||
token_id = output_token_ids[i]
|
||||
if token_id == PLACEHOLDER_TOKEN_ID:
|
||||
continue
|
||||
assert token_id in unmasked_indices[i]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("top_k", [1, 5, 99])
|
||||
def test_top_k(rejection_sampler, top_k):
|
||||
"""Test rejection sampling with top-k sampling"""
|
||||
vocab_size = 100
|
||||
batch_size = 100
|
||||
num_draft_tokens = 3
|
||||
num_tokens = batch_size * num_draft_tokens
|
||||
|
||||
# Randomly create top-k indices.
|
||||
top_k_indices = [
|
||||
torch.randperm(vocab_size, device=DEVICE)[:top_k]
|
||||
for _ in range(num_tokens)
|
||||
]
|
||||
top_k_indices = torch.stack(top_k_indices)
|
||||
|
||||
# Create logits with the uniform distribution.
|
||||
target_logits = torch.zeros((num_tokens, vocab_size), device=DEVICE)
|
||||
|
||||
# Increment the logits for top-k indices, a little bit more than the other
|
||||
# ones. If the masking is effective, the non-topk indices will never be
|
||||
# sampled despite the small difference in logits.
|
||||
for i in range(num_tokens):
|
||||
target_logits[i, top_k_indices[i]] += 0.1
|
||||
|
||||
# Create sampling metadata
|
||||
temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE)
|
||||
sampling_metadata = create_sampling_metadata(
|
||||
all_greedy=False,
|
||||
temperature=temperature,
|
||||
top_k=torch.tensor([top_k] * batch_size,
|
||||
device=DEVICE,
|
||||
dtype=torch.int64),
|
||||
)
|
||||
|
||||
_test_masked_logits(
|
||||
rejection_sampler,
|
||||
batch_size=batch_size,
|
||||
num_draft_tokens=num_draft_tokens,
|
||||
vocab_size=vocab_size,
|
||||
target_logits=target_logits,
|
||||
unmasked_indices=top_k_indices,
|
||||
sampling_metadata=sampling_metadata,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("top_p", [0.5, 0.9, 0.99])
|
||||
def test_top_p(rejection_sampler, top_p):
|
||||
"""Test rejection sampling with top-p sampling"""
|
||||
vocab_size = 100
|
||||
batch_size = 100
|
||||
num_draft_tokens = 3
|
||||
num_tokens = batch_size * num_draft_tokens
|
||||
|
||||
# Create logits with the uniform distribution.
|
||||
target_logits = torch.randn((num_tokens, vocab_size), device=DEVICE)
|
||||
temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE)
|
||||
rescaled_logits = target_logits / temperature
|
||||
|
||||
logits_sort, logits_idx = rescaled_logits.sort(dim=-1, descending=False)
|
||||
probs_sort = logits_sort.softmax(dim=-1)
|
||||
probs_sum = probs_sort.cumsum(dim=-1)
|
||||
top_p_mask = probs_sum <= 1 - top_p
|
||||
# at least one
|
||||
top_p_mask[:, -1] = False
|
||||
|
||||
# Get the top-p indices.
|
||||
top_p_indices = []
|
||||
for i in range(num_tokens):
|
||||
top_p_indices.append(logits_idx[i][~top_p_mask[i]].tolist())
|
||||
|
||||
# Create sampling metadata
|
||||
sampling_metadata = create_sampling_metadata(
|
||||
all_greedy=False,
|
||||
temperature=temperature,
|
||||
top_p=torch.tensor([top_p] * batch_size,
|
||||
device=DEVICE,
|
||||
dtype=torch.float32),
|
||||
)
|
||||
|
||||
_test_masked_logits(
|
||||
rejection_sampler,
|
||||
batch_size=batch_size,
|
||||
num_draft_tokens=num_draft_tokens,
|
||||
vocab_size=vocab_size,
|
||||
target_logits=target_logits,
|
||||
unmasked_indices=top_p_indices,
|
||||
sampling_metadata=sampling_metadata,
|
||||
)
|
||||
|
||||
@ -2,8 +2,7 @@
|
||||
|
||||
import numpy as np
|
||||
|
||||
from vllm.v1.spec_decode.ngram_proposer import (NgramProposer,
|
||||
_find_subarray_kmp,
|
||||
from vllm.v1.spec_decode.ngram_proposer import (_find_subarray_kmp,
|
||||
_kmp_lps_array)
|
||||
|
||||
|
||||
@ -36,53 +35,3 @@ def test_find_subarray_kmp():
|
||||
# Return on the first match
|
||||
np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3),
|
||||
np.array([6, 2, 3]))
|
||||
|
||||
|
||||
def test_ngram_proposer():
|
||||
proposer = NgramProposer()
|
||||
|
||||
# No match.
|
||||
result = proposer.propose(
|
||||
context_token_ids=np.array([1, 2, 3, 4, 5]),
|
||||
min_n=2,
|
||||
max_n=2,
|
||||
k=2,
|
||||
)
|
||||
assert result is None
|
||||
|
||||
# No match for 4-gram.
|
||||
result = proposer.propose(
|
||||
context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]),
|
||||
min_n=4,
|
||||
max_n=4,
|
||||
k=2,
|
||||
)
|
||||
assert result is None
|
||||
|
||||
# No match for 4-gram but match for 3-gram.
|
||||
result = proposer.propose(
|
||||
context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]),
|
||||
min_n=3,
|
||||
max_n=4,
|
||||
k=2,
|
||||
)
|
||||
assert np.array_equal(result, np.array([4, 1]))
|
||||
|
||||
# Match for both 4-gram and 3-gram.
|
||||
# In this case, the proposer should return the 4-gram match.
|
||||
result = proposer.propose(
|
||||
context_token_ids=np.array([2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4]),
|
||||
min_n=3,
|
||||
max_n=4,
|
||||
k=2,
|
||||
)
|
||||
assert np.array_equal(result, np.array([1, 2])) # Not [5, 1]
|
||||
|
||||
# Match for 2-gram and 3-gram, but not 4-gram.
|
||||
result = proposer.propose(
|
||||
context_token_ids=np.array([3, 4, 5, 2, 3, 4, 1, 2, 3, 4]),
|
||||
min_n=2,
|
||||
max_n=4,
|
||||
k=2,
|
||||
)
|
||||
assert np.array_equal(result, np.array([1, 2])) # Not [5, 2]
|
||||
|
||||
@ -1,307 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import unittest.mock as mock
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
|
||||
SchedulerOutput)
|
||||
from vllm.v1.sample.metadata import SamplingMetadata
|
||||
from vllm.v1.worker.tpu_model_runner import TPUModelRunner
|
||||
|
||||
# Mock torch_xla module since it may not be available in the test environments
|
||||
torch_xla_patcher = mock.patch.dict(
|
||||
"sys.modules", {
|
||||
"torch_xla": mock.MagicMock(),
|
||||
"torch_xla.core.xla_model": mock.MagicMock(),
|
||||
"torch_xla.runtime": mock.MagicMock(),
|
||||
})
|
||||
torch_xla_patcher.start()
|
||||
|
||||
# Mock the PallasAttentionBackend
|
||||
pallas_attention_backend_patcher = mock.patch(
|
||||
"vllm.v1.worker.tpu_model_runner.PallasAttentionBackend", )
|
||||
pallas_attention_backend_patcher.start()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def model_runner():
|
||||
# Patchers have already been started at module level.
|
||||
scheduler_config = SchedulerConfig(
|
||||
max_num_seqs=10,
|
||||
max_num_batched_tokens=512,
|
||||
max_model_len=512,
|
||||
)
|
||||
model_config = ModelConfig(
|
||||
model="facebook/opt-125m",
|
||||
task="generate",
|
||||
tokenizer="facebook/opt-125m",
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16", # TPUs typically use bfloat16
|
||||
seed=42,
|
||||
)
|
||||
cache_config = CacheConfig(
|
||||
block_size=16,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
)
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
cache_config=cache_config,
|
||||
scheduler_config=scheduler_config,
|
||||
)
|
||||
device = "xla:0" # Mocking TPU device
|
||||
with mock.patch("vllm.v1.worker.tpu_model_runner.torch"), \
|
||||
mock.patch("vllm.v1.worker.tpu_model_runner.xm"), \
|
||||
mock.patch("vllm.v1.worker.tpu_model_runner.xr"):
|
||||
return TPUModelRunner(vllm_config, device)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True, scope="session")
|
||||
def cleanup_patches():
|
||||
yield
|
||||
torch_xla_patcher.stop()
|
||||
pallas_attention_backend_patcher.stop()
|
||||
|
||||
|
||||
def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
|
||||
new_reqs = []
|
||||
num_scheduled_tokens = {}
|
||||
total_num_scheduled_tokens = 0
|
||||
for req_id in req_ids:
|
||||
new_reqs.append(
|
||||
NewRequestData(
|
||||
req_id=req_id,
|
||||
prompt_token_ids=[1, 2, 3],
|
||||
prompt="test",
|
||||
mm_inputs=[],
|
||||
mm_hashes=[],
|
||||
mm_positions=[],
|
||||
sampling_params=SamplingParams(),
|
||||
block_ids=[0],
|
||||
num_computed_tokens=0,
|
||||
lora_request=None,
|
||||
))
|
||||
num_scheduled_tokens[req_id] = 3
|
||||
total_num_scheduled_tokens += num_scheduled_tokens[req_id]
|
||||
|
||||
return SchedulerOutput(
|
||||
scheduled_new_reqs=new_reqs,
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens=num_scheduled_tokens,
|
||||
total_num_scheduled_tokens=total_num_scheduled_tokens,
|
||||
scheduled_spec_decode_tokens={},
|
||||
scheduled_encoder_inputs={},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_input_ids=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None,
|
||||
)
|
||||
|
||||
|
||||
def _is_req_scheduled(model_runner, req_id: str) -> bool:
|
||||
return req_id in model_runner.input_batch.req_id_to_index
|
||||
|
||||
|
||||
def _is_req_added(model_runner, req_id: str) -> bool:
|
||||
return req_id in model_runner.requests
|
||||
|
||||
|
||||
def _is_sampling_metadata_changed(model_runner,
|
||||
sampling_metadata_before: SamplingMetadata):
|
||||
return model_runner.input_batch.sampling_metadata is not (
|
||||
sampling_metadata_before)
|
||||
|
||||
|
||||
def _is_req_state_block_table_match(model_runner, req_id: str) -> bool:
|
||||
req_index = model_runner.input_batch.req_id_to_index[req_id]
|
||||
block_table = model_runner.input_batch.block_table
|
||||
req_state = model_runner.requests[req_id]
|
||||
if block_table.num_blocks_per_row[req_index] != len(req_state.block_ids):
|
||||
return False
|
||||
num_blocks = block_table.num_blocks_per_row[req_index]
|
||||
return (block_table.block_table_np[req_index, :num_blocks] ==
|
||||
req_state.block_ids).all()
|
||||
|
||||
|
||||
def test_update_states_new_request(model_runner):
|
||||
req_id = "req_0"
|
||||
|
||||
# new req
|
||||
scheduler_output = _schedule_new_request(req_id)
|
||||
|
||||
metadata_before = model_runner.input_batch.sampling_metadata
|
||||
model_runner._update_states(scheduler_output)
|
||||
|
||||
assert _is_sampling_metadata_changed(model_runner, metadata_before)
|
||||
assert _is_req_added(model_runner, req_id)
|
||||
assert _is_req_scheduled(model_runner, req_id)
|
||||
assert _is_req_state_block_table_match(model_runner, req_id)
|
||||
|
||||
|
||||
def test_update_states_request_finished(model_runner):
|
||||
req_id = "req_0"
|
||||
|
||||
# new req
|
||||
scheduler_output = _schedule_new_request(req_id)
|
||||
|
||||
model_runner._update_states(scheduler_output)
|
||||
assert _is_req_added(model_runner, req_id)
|
||||
assert _is_req_scheduled(model_runner, req_id)
|
||||
|
||||
# finish req
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={},
|
||||
total_num_scheduled_tokens=0,
|
||||
scheduled_spec_decode_tokens={},
|
||||
scheduled_encoder_inputs={},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids={req_id},
|
||||
free_encoder_input_ids=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None,
|
||||
)
|
||||
|
||||
metadata_before = model_runner.input_batch.sampling_metadata
|
||||
model_runner._update_states(scheduler_output)
|
||||
assert _is_sampling_metadata_changed(model_runner, metadata_before)
|
||||
assert not _is_req_added(model_runner, req_id)
|
||||
assert not _is_req_scheduled(model_runner, req_id)
|
||||
|
||||
|
||||
def test_update_states_request_resumed(model_runner):
|
||||
req_id = "req_0"
|
||||
|
||||
# new req
|
||||
scheduler_output = _schedule_new_request(req_id)
|
||||
|
||||
model_runner._update_states(scheduler_output)
|
||||
assert _is_req_added(model_runner, req_id)
|
||||
assert _is_req_scheduled(model_runner, req_id)
|
||||
|
||||
# unschedule req
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={},
|
||||
total_num_scheduled_tokens=0,
|
||||
scheduled_spec_decode_tokens={},
|
||||
scheduled_encoder_inputs={},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_input_ids=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None,
|
||||
)
|
||||
|
||||
model_runner._update_states(scheduler_output)
|
||||
assert _is_req_added(model_runner, req_id)
|
||||
assert not _is_req_scheduled(model_runner, req_id)
|
||||
|
||||
# resume req
|
||||
cached_req_data = CachedRequestData(
|
||||
req_id=req_id,
|
||||
resumed_from_preemption=False,
|
||||
new_token_ids=[],
|
||||
new_block_ids=[],
|
||||
num_computed_tokens=0,
|
||||
)
|
||||
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[cached_req_data],
|
||||
num_scheduled_tokens={req_id: 1},
|
||||
total_num_scheduled_tokens=1,
|
||||
scheduled_spec_decode_tokens={},
|
||||
scheduled_encoder_inputs={},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_input_ids=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None,
|
||||
)
|
||||
|
||||
metadata_before = model_runner.input_batch.sampling_metadata
|
||||
model_runner._update_states(scheduler_output)
|
||||
assert _is_sampling_metadata_changed(model_runner, metadata_before)
|
||||
assert _is_req_added(model_runner, req_id)
|
||||
assert _is_req_scheduled(model_runner, req_id)
|
||||
assert _is_req_state_block_table_match(model_runner, req_id)
|
||||
|
||||
|
||||
def test_update_states_no_changes(model_runner):
|
||||
req_id = "req_0"
|
||||
|
||||
# new req
|
||||
scheduler_output = _schedule_new_request(req_id)
|
||||
|
||||
model_runner._update_states(scheduler_output)
|
||||
assert _is_req_added(model_runner, req_id)
|
||||
assert _is_req_scheduled(model_runner, req_id)
|
||||
|
||||
# schedule req
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={req_id: 1},
|
||||
total_num_scheduled_tokens=1,
|
||||
scheduled_spec_decode_tokens={},
|
||||
scheduled_encoder_inputs={},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_input_ids=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None,
|
||||
)
|
||||
|
||||
metadata_before = model_runner.input_batch.sampling_metadata
|
||||
model_runner._update_states(scheduler_output)
|
||||
assert not _is_sampling_metadata_changed(model_runner, metadata_before)
|
||||
assert _is_req_added(model_runner, req_id)
|
||||
assert _is_req_scheduled(model_runner, req_id)
|
||||
assert _is_req_state_block_table_match(model_runner, req_id)
|
||||
|
||||
|
||||
def test_update_states_request_unscheduled(model_runner):
|
||||
req_ids = ("req_0", "req_1")
|
||||
|
||||
# new reqs
|
||||
scheduler_output = _schedule_new_request(*req_ids)
|
||||
|
||||
model_runner._update_states(scheduler_output)
|
||||
|
||||
assert _is_req_added(model_runner, req_ids[0])
|
||||
assert _is_req_scheduled(model_runner, req_ids[0])
|
||||
|
||||
assert _is_req_added(model_runner, req_ids[1])
|
||||
assert _is_req_scheduled(model_runner, req_ids[1])
|
||||
|
||||
# unschedule req_1
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={req_ids[0]: 1},
|
||||
total_num_scheduled_tokens=1,
|
||||
scheduled_spec_decode_tokens={},
|
||||
scheduled_encoder_inputs={},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_input_ids=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None,
|
||||
)
|
||||
|
||||
metadata_before = model_runner._update_states(scheduler_output)
|
||||
assert _is_sampling_metadata_changed(model_runner, metadata_before)
|
||||
|
||||
assert _is_req_added(model_runner, req_ids[0])
|
||||
assert _is_req_scheduled(model_runner, req_ids[0])
|
||||
|
||||
assert _is_req_added(model_runner, req_ids[1])
|
||||
assert not _is_req_scheduled(model_runner, req_ids[1])
|
||||
@ -124,18 +124,6 @@ def paged_attention_rocm(
|
||||
kv_cache_dtype, k_scale, v_scale)
|
||||
|
||||
|
||||
def mla_decode_kvcache_cpu(
|
||||
out: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
kv_cache: torch.Tensor,
|
||||
scale: float,
|
||||
block_tables: torch.Tensor,
|
||||
seq_lens: torch.Tensor,
|
||||
) -> None:
|
||||
torch.ops._C_cpu.mla_decode_kvcache(out, query, kv_cache, scale,
|
||||
block_tables, seq_lens)
|
||||
|
||||
|
||||
# pos encoding ops
|
||||
def rotary_embedding(
|
||||
positions: torch.Tensor,
|
||||
@ -202,33 +190,6 @@ def advance_step_flashinfer(num_seqs: int, num_queries: int, block_size: int,
|
||||
block_table_bound)
|
||||
|
||||
|
||||
def block_table_appends(
|
||||
append_row_indices: torch.Tensor,
|
||||
append_row_indices_cpu: torch.Tensor,
|
||||
append_cumsums: torch.Tensor,
|
||||
append_cumsums_cpu: torch.Tensor,
|
||||
append_block_ids: torch.Tensor,
|
||||
append_block_ids_cpu: torch.Tensor,
|
||||
block_table: torch.Tensor,
|
||||
num_appends: int,
|
||||
total_num_append_blocks: int,
|
||||
) -> None:
|
||||
torch.ops._C.block_table_appends.default(
|
||||
append_row_indices, append_row_indices_cpu, append_cumsums,
|
||||
append_cumsums_cpu, append_block_ids, append_block_ids_cpu,
|
||||
block_table, num_appends, total_num_append_blocks)
|
||||
|
||||
|
||||
def block_table_moves(
|
||||
src_dst_n: torch.Tensor,
|
||||
src_dst_n_cpu: torch.Tensor,
|
||||
block_table: torch.Tensor,
|
||||
num_moves: int,
|
||||
) -> None:
|
||||
torch.ops._C.block_table_moves.default(src_dst_n, src_dst_n_cpu,
|
||||
block_table, num_moves)
|
||||
|
||||
|
||||
# fused quant layer norm ops
|
||||
def rms_norm_dynamic_per_token_quant(
|
||||
input: torch.Tensor,
|
||||
|
||||
@ -187,28 +187,15 @@ class ipex_ops:
|
||||
gen_: torch.Generator,
|
||||
logits_soft_cap: float,
|
||||
) -> None:
|
||||
if ipex.__version__.endswith("cpu"):
|
||||
if logits_soft_cap != 0.0:
|
||||
raise ValueError("IPEX CPU does not support logits_soft_cap")
|
||||
ipex.llm.functional.varlen_attention(query.contiguous(),
|
||||
key.contiguous(),
|
||||
value.contiguous(), out,
|
||||
seqlen_q.int(),
|
||||
seqlen_k.int(), max_seqlen_q,
|
||||
max_seqlen_k, pdropout,
|
||||
softmax_scale, zero_tensors,
|
||||
is_causal, return_softmax,
|
||||
gen_)
|
||||
else: # XPU build
|
||||
ipex.llm.functional.varlen_attention(query.contiguous(),
|
||||
key.contiguous(),
|
||||
value.contiguous(), out,
|
||||
seqlen_q.int(),
|
||||
seqlen_k.int(), max_seqlen_q,
|
||||
max_seqlen_k, pdropout,
|
||||
softmax_scale, zero_tensors,
|
||||
is_causal, return_softmax,
|
||||
gen_, logits_soft_cap)
|
||||
ipex.llm.functional.varlen_attention(query.contiguous(),
|
||||
key.contiguous(),
|
||||
value.contiguous(), out,
|
||||
seqlen_q.int(), seqlen_k.int(),
|
||||
max_seqlen_q, max_seqlen_k,
|
||||
pdropout, softmax_scale,
|
||||
zero_tensors, is_causal,
|
||||
return_softmax, gen_,
|
||||
logits_soft_cap)
|
||||
|
||||
@staticmethod
|
||||
def reshape_and_cache(
|
||||
|
||||
@ -1,303 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type
|
||||
|
||||
import torch
|
||||
|
||||
import vllm._custom_ops as ops
|
||||
from vllm._ipex_ops import ipex_ops
|
||||
from vllm.attention.backends.abstract import (AttentionBackend,
|
||||
AttentionMetadataBuilder,
|
||||
AttentionType,
|
||||
is_quantized_kv_cache)
|
||||
from vllm.attention.backends.mla.common import MLACommonImpl, MLACommonState
|
||||
from vllm.attention.backends.torch_sdpa import TorchSDPAMetadata
|
||||
from vllm.utils import make_tensor_with_pad
|
||||
from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder
|
||||
|
||||
|
||||
class CPUMLABackend(AttentionBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_name() -> str:
|
||||
return "CPU_MLA"
|
||||
|
||||
@staticmethod
|
||||
def get_metadata_cls() -> Type["CPUMLAMetadata"]:
|
||||
return CPUMLAMetadata
|
||||
|
||||
@staticmethod
|
||||
def get_builder_cls() -> Type["CPUMLAMetadataBuilder"]:
|
||||
return CPUMLAMetadataBuilder
|
||||
|
||||
@staticmethod
|
||||
def get_state_cls() -> Type["MLACommonState"]:
|
||||
return MLACommonState
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls() -> Type["CPUMLAImpl"]:
|
||||
return CPUMLAImpl
|
||||
|
||||
@staticmethod
|
||||
def get_kv_cache_shape(
|
||||
num_blocks: int,
|
||||
block_size: int,
|
||||
num_kv_heads: int, # assumed to be 1 for MLA
|
||||
head_size: int,
|
||||
) -> Tuple[int, ...]:
|
||||
return (num_blocks, block_size, head_size)
|
||||
|
||||
@staticmethod
|
||||
def swap_blocks(
|
||||
src_kv_cache: torch.Tensor,
|
||||
dst_kv_cache: torch.Tensor,
|
||||
src_to_dst: torch.Tensor,
|
||||
) -> None:
|
||||
ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
|
||||
|
||||
@staticmethod
|
||||
def copy_blocks(
|
||||
kv_caches: List[torch.Tensor],
|
||||
src_to_dists: torch.Tensor,
|
||||
) -> None:
|
||||
ops.copy_blocks_mla(kv_caches, src_to_dists)
|
||||
|
||||
@staticmethod
|
||||
def get_supported_head_sizes() -> List[int]:
|
||||
return [576]
|
||||
|
||||
|
||||
@dataclass
|
||||
class CPUMLAMetadata(TorchSDPAMetadata):
|
||||
# New for MLA
|
||||
# Input positions for rotrary embeddings since for MLA the rotary
|
||||
# position embeddings are applied inside the attention backend
|
||||
input_positions: torch.Tensor = None
|
||||
|
||||
# required by MLACommonImpl
|
||||
is_profile_run: bool = False
|
||||
|
||||
|
||||
class CPUMLAMetadataBuilder(AttentionMetadataBuilder[CPUMLAMetadata]):
|
||||
|
||||
def __init__(self, input_builder: ModelInputForCPUBuilder) -> None:
|
||||
self.chunked_prefill = input_builder.chunked_prefill
|
||||
self.input_builder = input_builder
|
||||
assert not self.chunked_prefill, \
|
||||
"chunked prefill is currently not supported"
|
||||
|
||||
def prepare(self):
|
||||
self.input_data = self.input_builder.input_data
|
||||
|
||||
def build(self, seq_lens, query_lens, cuda_graph_pad_size, batch_size):
|
||||
input_data = self.input_data
|
||||
prefill_seq_lens = seq_lens[0:input_data.num_prefills]
|
||||
prefill_query_lens = query_lens[0:input_data.num_prefills]
|
||||
slot_mapping = torch.tensor(input_data.slot_mapping,
|
||||
dtype=torch.long,
|
||||
device="cpu")
|
||||
|
||||
# metadata for prefill
|
||||
if input_data.num_prefills > 0:
|
||||
query_lens_tensor = torch.tensor(prefill_query_lens,
|
||||
dtype=torch.int32,
|
||||
device="cpu")
|
||||
kv_lens_tensor = torch.tensor(prefill_seq_lens,
|
||||
dtype=torch.int32,
|
||||
device="cpu")
|
||||
query_start_loc = torch.zeros(input_data.num_prefills + 1,
|
||||
dtype=torch.int32,
|
||||
device="cpu")
|
||||
kv_start_loc = torch.zeros(input_data.num_prefills + 1,
|
||||
dtype=torch.int32,
|
||||
device="cpu")
|
||||
torch.cumsum(query_lens_tensor,
|
||||
dim=0,
|
||||
dtype=torch.int32,
|
||||
out=query_start_loc[1:])
|
||||
torch.cumsum(kv_lens_tensor,
|
||||
dim=0,
|
||||
dtype=torch.int32,
|
||||
out=kv_start_loc[1:])
|
||||
max_query_len = max(prefill_query_lens)
|
||||
max_kv_len = max(prefill_seq_lens)
|
||||
|
||||
# for chunked-prefill
|
||||
if self.chunked_prefill:
|
||||
prefill_block_tables = make_tensor_with_pad(
|
||||
self.input_data.prefill_block_tables,
|
||||
pad=0,
|
||||
dtype=torch.int32,
|
||||
device="cpu",
|
||||
)
|
||||
else:
|
||||
prefill_block_tables = None
|
||||
|
||||
else:
|
||||
query_start_loc = None
|
||||
kv_start_loc = None
|
||||
max_query_len = None
|
||||
max_kv_len = None
|
||||
prefill_block_tables = None
|
||||
|
||||
# metadata for decode
|
||||
if input_data.num_decode_tokens != 0:
|
||||
seq_lens_tensor = torch.tensor(
|
||||
input_data.seq_lens[input_data.num_prefills:],
|
||||
dtype=torch.int32,
|
||||
device="cpu",
|
||||
)
|
||||
block_tables = make_tensor_with_pad(
|
||||
self.input_data.decode_block_tables,
|
||||
pad=0,
|
||||
dtype=torch.int32,
|
||||
device="cpu",
|
||||
)
|
||||
else:
|
||||
block_tables = torch.tensor([])
|
||||
seq_lens_tensor = torch.tensor(
|
||||
input_data.seq_lens[:input_data.num_prefills],
|
||||
dtype=torch.int32,
|
||||
device="cpu",
|
||||
)
|
||||
|
||||
# For multi-modal models
|
||||
placeholder_index_maps = None
|
||||
if len(input_data.multi_modal_inputs_list) != 0:
|
||||
placeholder_index_maps = {
|
||||
modality: placeholder_map.index_map()
|
||||
for modality, placeholder_map in
|
||||
input_data.multi_modal_placeholder_maps.items()
|
||||
}
|
||||
|
||||
return CPUMLAMetadata(
|
||||
chunked_prefill=self.chunked_prefill,
|
||||
seq_lens=prefill_seq_lens,
|
||||
seq_lens_tensor=seq_lens_tensor,
|
||||
max_query_len=max_query_len,
|
||||
max_kv_len=max_kv_len,
|
||||
query_start_loc=query_start_loc,
|
||||
kv_start_loc=kv_start_loc,
|
||||
max_decode_seq_len=input_data.max_decode_seq_len,
|
||||
num_prefills=input_data.num_prefills,
|
||||
num_prefill_tokens=input_data.num_prefill_tokens,
|
||||
num_decode_tokens=input_data.num_decode_tokens,
|
||||
block_tables=block_tables,
|
||||
prefill_block_tables=prefill_block_tables,
|
||||
slot_mapping=slot_mapping,
|
||||
multi_modal_placeholder_index_maps=placeholder_index_maps,
|
||||
enable_kv_scales_calculation=False,
|
||||
input_positions=torch.tensor([self.input_data.input_positions]))
|
||||
|
||||
|
||||
class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
scale: float,
|
||||
num_kv_heads: int,
|
||||
alibi_slopes: Optional[List[float]],
|
||||
sliding_window: Optional[int],
|
||||
kv_cache_dtype: str,
|
||||
blocksparse_params: Optional[Dict[str, Any]],
|
||||
logits_soft_cap: Optional[float],
|
||||
attn_type: str,
|
||||
# MLA Specific Arguments
|
||||
**mla_args) -> None:
|
||||
super().__init__(num_heads, head_size, scale, num_kv_heads,
|
||||
alibi_slopes, sliding_window, kv_cache_dtype,
|
||||
blocksparse_params, logits_soft_cap, attn_type,
|
||||
**mla_args)
|
||||
|
||||
unsupported_features = [
|
||||
alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
|
||||
]
|
||||
if any(unsupported_features):
|
||||
raise NotImplementedError(
|
||||
"CPUMLAImpl does not support one of the following: "
|
||||
"alibi_slopes, sliding_window, blocksparse_params, "
|
||||
"logits_soft_cap")
|
||||
|
||||
if attn_type != AttentionType.DECODER:
|
||||
raise NotImplementedError("Encoder self-attention and "
|
||||
"encoder/decoder cross-attention "
|
||||
"are not implemented for "
|
||||
"CPUMLAImpl")
|
||||
|
||||
# states is implemented.
|
||||
if is_quantized_kv_cache(self.kv_cache_dtype):
|
||||
raise NotImplementedError(
|
||||
"CPUMLAImpl with FP8 KV cache not yet supported")
|
||||
|
||||
def _forward_prefill(
|
||||
self,
|
||||
q: torch.Tensor,
|
||||
kv_c_normed: torch.Tensor,
|
||||
k_pe: torch.Tensor,
|
||||
kv_c_and_k_pe_cache: torch.Tensor,
|
||||
attn_metadata: CPUMLAMetadata, # type: ignore[override]
|
||||
) -> torch.Tensor:
|
||||
|
||||
prefill_metadata = attn_metadata.prefill_metadata
|
||||
assert prefill_metadata is not None
|
||||
|
||||
kv_nope = self.kv_b_proj(kv_c_normed)[0].view(\
|
||||
-1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
|
||||
k_nope, v = kv_nope\
|
||||
.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
|
||||
|
||||
k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
|
||||
|
||||
# For MLA the v head dim is smaller than qk head dim so we pad out
|
||||
# v with 0s to match the qk head dim
|
||||
v_padded = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]],
|
||||
value=0)
|
||||
|
||||
output = torch.empty_like(q)
|
||||
ipex_ops.varlen_attention(
|
||||
query=q,
|
||||
key=k,
|
||||
value=v_padded,
|
||||
out=output,
|
||||
seqlen_q=prefill_metadata.query_start_loc,
|
||||
seqlen_k=prefill_metadata.query_start_loc,
|
||||
max_seqlen_q=prefill_metadata.max_query_len,
|
||||
max_seqlen_k=prefill_metadata.max_query_len,
|
||||
pdropout=0.0,
|
||||
softmax_scale=self.scale,
|
||||
zero_tensors=False,
|
||||
is_causal=True,
|
||||
return_softmax=False,
|
||||
gen_=None,
|
||||
logits_soft_cap=0.0,
|
||||
)
|
||||
|
||||
# remove padding
|
||||
output = output.view(-1, self.num_heads,
|
||||
q.shape[-1])[..., :v.shape[-1]]
|
||||
output = output.reshape(-1, self.num_heads * v.shape[-1])
|
||||
return self.o_proj(output)[0]
|
||||
|
||||
def _forward_decode(
|
||||
self,
|
||||
q_nope: torch.Tensor,
|
||||
q_pe: torch.Tensor,
|
||||
kv_c_and_k_pe_cache: torch.Tensor,
|
||||
attn_metadata: CPUMLAMetadata, # type: ignore[override]
|
||||
) -> torch.Tensor:
|
||||
assert kv_c_and_k_pe_cache.numel() > 0
|
||||
|
||||
decode_meta = attn_metadata.decode_metadata
|
||||
assert decode_meta is not None
|
||||
|
||||
q = torch.cat([q_nope, q_pe], dim=-1)
|
||||
o = q.new_empty(q.shape[0], self.num_heads, self.kv_lora_rank)
|
||||
|
||||
# Run MQA
|
||||
ops.mla_decode_kvcache_cpu(o, q, kv_c_and_k_pe_cache, self.scale,
|
||||
decode_meta.block_tables,
|
||||
decode_meta.seq_lens_tensor)
|
||||
return self._v_up_proj_and_o_proj(o)
|
||||
@ -22,13 +22,12 @@ from vllm.attention.backends.utils import (
|
||||
compute_slot_mapping_start_idx, get_num_prefill_decode_query_kv_tokens,
|
||||
get_seq_len_block_table_args, is_all_cross_attn_metadata_set,
|
||||
is_all_encoder_attn_metadata_set, is_block_tables_empty)
|
||||
from vllm.fa_utils import get_flash_attn_version
|
||||
from vllm.logger import init_logger
|
||||
from vllm.multimodal import MultiModalPlaceholderMap
|
||||
from vllm.utils import async_tensor_h2d, make_tensor_with_pad
|
||||
from vllm.vllm_flash_attn import (flash_attn_varlen_func,
|
||||
flash_attn_with_kvcache)
|
||||
from vllm.vllm_flash_attn.fa_utils import (flash_attn_supports_fp8,
|
||||
get_flash_attn_version)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.worker.model_runner import (ModelInputForGPUBuilder,
|
||||
@ -633,13 +632,10 @@ class FlashAttentionImpl(AttentionImpl):
|
||||
self.kv_cache_dtype = kv_cache_dtype
|
||||
self.vllm_flash_attn_version = get_flash_attn_version(
|
||||
requires_alibi=self.alibi_slopes is not None)
|
||||
if is_quantized_kv_cache(self.kv_cache_dtype) and (
|
||||
not self.kv_cache_dtype.startswith("fp8")
|
||||
or not flash_attn_supports_fp8()):
|
||||
if (is_quantized_kv_cache(self.kv_cache_dtype)
|
||||
and self.vllm_flash_attn_version != 3):
|
||||
raise NotImplementedError(
|
||||
f"FlashAttention does not support {self.kv_cache_dtype} "
|
||||
"kv-cache on this device "
|
||||
f"(FA supports fp8 = {flash_attn_supports_fp8()}).")
|
||||
"Only FlashAttention3 supports FP8 KV cache")
|
||||
if logits_soft_cap is None:
|
||||
# In flash-attn, setting logits_soft_cap as 0 means no soft cap.
|
||||
logits_soft_cap = 0
|
||||
@ -708,10 +704,6 @@ class FlashAttentionImpl(AttentionImpl):
|
||||
logits_soft_cap: Optional[float] = self.logits_soft_cap
|
||||
fp8_attention = kv_cache_dtype.startswith("fp8")
|
||||
|
||||
if fp8_attention and not flash_attn_supports_fp8():
|
||||
raise NotImplementedError(
|
||||
"FlashAttention does not support FP8 kv-cache on this device.")
|
||||
|
||||
if kv_cache.numel() > 0:
|
||||
key_cache = kv_cache[0]
|
||||
value_cache = kv_cache[1]
|
||||
|
||||
@ -204,6 +204,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
|
||||
from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
|
||||
compute_slot_mapping_start_idx,
|
||||
is_block_tables_empty)
|
||||
from vllm.attention.ops.triton_merge_attn_states import merge_attn_states
|
||||
from vllm.fa_utils import get_flash_attn_version
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
LinearBase, RowParallelLinear,
|
||||
UnquantizedLinearMethod)
|
||||
@ -211,27 +213,17 @@ from vllm.model_executor.layers.rotary_embedding import (
|
||||
DeepseekScalingRotaryEmbedding, RotaryEmbedding)
|
||||
from vllm.multimodal import MultiModalPlaceholderMap
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.triton_utils import HAS_TRITON
|
||||
from vllm.utils import async_tensor_h2d, cdiv, make_tensor_with_pad, round_down
|
||||
from vllm.vllm_flash_attn.fa_utils import get_flash_attn_version
|
||||
|
||||
if HAS_TRITON:
|
||||
from vllm.attention.ops.triton_flash_attention import triton_attention
|
||||
from vllm.attention.ops.triton_merge_attn_states import merge_attn_states
|
||||
else:
|
||||
merge_attn_states = None
|
||||
triton_attention = None
|
||||
|
||||
try:
|
||||
from vllm.vllm_flash_attn import flash_attn_varlen_func
|
||||
is_vllm_fa = True
|
||||
except ImportError:
|
||||
# For rocm use upstream flash attention
|
||||
from flash_attn import flash_attn_varlen_func
|
||||
is_vllm_fa = False
|
||||
try:
|
||||
# For rocm use upstream flash attention
|
||||
from flash_attn import flash_attn_varlen_func
|
||||
except ImportError:
|
||||
flash_attn_varlen_func = None
|
||||
|
||||
from vllm.attention.ops.triton_flash_attention import triton_attention
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.worker.model_runner import (ModelInputForGPUBuilder,
|
||||
|
||||
@ -357,11 +357,6 @@ class VllmBackend:
|
||||
# graph.
|
||||
|
||||
factors = []
|
||||
# 0. factors come from the env, for example, The values of
|
||||
# VLLM_PP_LAYER_PARTITION will affects the computation graph.
|
||||
env_hash = envs.compute_hash()
|
||||
factors.append(env_hash)
|
||||
|
||||
# 1. factors come from the vllm_config (it mainly summarizes how the
|
||||
# model is created)
|
||||
config_hash = vllm_config.compute_hash()
|
||||
|
||||
@ -1,29 +1,26 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import hashlib
|
||||
import importlib.metadata
|
||||
import inspect
|
||||
import json
|
||||
import types
|
||||
from typing import Any, Callable, Dict, Optional, Union
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Callable, Optional, Union
|
||||
|
||||
import torch
|
||||
from packaging.version import Version
|
||||
from torch import fx
|
||||
|
||||
if Version(importlib.metadata.version('torch')) >= Version("2.6"):
|
||||
from torch._inductor.custom_graph_pass import CustomGraphPass
|
||||
else:
|
||||
# CustomGraphPass is not present in 2.5 or lower, import our version
|
||||
from .torch25_custom_graph_pass import ( # noqa: yapf
|
||||
Torch25CustomGraphPass as CustomGraphPass)
|
||||
|
||||
class InductorPass(ABC):
|
||||
"""
|
||||
General custom inductor pass interface.
|
||||
"""
|
||||
|
||||
class InductorPass(CustomGraphPass):
|
||||
"""
|
||||
A custom graph pass that uses a hash of its source as the UUID.
|
||||
This is defined as a convenience and should work in most cases.
|
||||
"""
|
||||
@abstractmethod
|
||||
def __call__(self, graph: torch.fx.Graph):
|
||||
"""
|
||||
Execute the pass on the given graph.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def uuid(self) -> Any:
|
||||
"""
|
||||
@ -51,16 +48,7 @@ class InductorPass(CustomGraphPass):
|
||||
else:
|
||||
src_str = inspect.getsource(src.__class__)
|
||||
hasher.update(src_str.encode("utf-8"))
|
||||
return hasher.hexdigest()
|
||||
|
||||
@staticmethod
|
||||
def hash_dict(dict_: Dict[Any, Any]):
|
||||
"""
|
||||
Utility method to hash a dictionary, can alternatively be used for uuid.
|
||||
:return: A sha256 hash of the json rep of the dictionary.
|
||||
"""
|
||||
encoded = json.dumps(dict_, sort_keys=True).encode("utf-8")
|
||||
return hashlib.sha256(encoded).hexdigest()
|
||||
return hasher.digest()
|
||||
|
||||
|
||||
class CallableInductorPass(InductorPass):
|
||||
@ -73,10 +61,25 @@ class CallableInductorPass(InductorPass):
|
||||
callable: Callable[[fx.Graph], None],
|
||||
uuid: Optional[Any] = None):
|
||||
self.callable = callable
|
||||
self._uuid = self.hash_source(callable) if uuid is None else uuid
|
||||
if uuid is None:
|
||||
uuid = InductorPass.hash_source(callable)
|
||||
self._uuid = uuid
|
||||
|
||||
def __call__(self, graph: torch.fx.Graph):
|
||||
self.callable(graph)
|
||||
|
||||
def uuid(self) -> Any:
|
||||
return self._uuid
|
||||
|
||||
def __getstate__(self):
|
||||
"""
|
||||
Pickling occurs in the Inductor code cache if a pass is not given to
|
||||
the pass manager but is instead directly added to config as a pass.
|
||||
See PostGradPassManager for more.
|
||||
|
||||
TODO(torch==2.6), use the `uuid` method in CustomGraphPass instead.
|
||||
"""
|
||||
return self._uuid
|
||||
|
||||
def __setstate__(self, state):
|
||||
raise ValueError("Cannot unpickle CallableInductorPass")
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from typing import List
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import torch
|
||||
from torch import fx as fx
|
||||
|
||||
from vllm.config import CompilationConfig
|
||||
@ -9,18 +10,29 @@ from vllm.logger import init_logger
|
||||
|
||||
from .fix_functionalization import FixFunctionalizationPass
|
||||
from .fusion import FusionPass
|
||||
from .inductor_pass import CustomGraphPass, InductorPass
|
||||
from .inductor_pass import InductorPass
|
||||
from .noop_elimination import NoOpEliminationPass
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class PostGradPassManager(CustomGraphPass):
|
||||
class PlaceHolder:
|
||||
pass
|
||||
|
||||
|
||||
if torch.__version__ < "2.6":
|
||||
Parent = PlaceHolder # type: ignore
|
||||
else:
|
||||
Parent = torch._inductor.custom_graph_pass.CustomGraphPass # type: ignore
|
||||
|
||||
|
||||
class PostGradPassManager(Parent):
|
||||
"""
|
||||
The pass manager for post-grad passes.
|
||||
It handles configuration, adding custom passes, and running passes.
|
||||
It supports uuid for the Inductor code cache. That includes torch<2.6
|
||||
support using pickling (in .inductor_pass.CustomGraphPass).
|
||||
It also supports pickling, which is used by the Inductor code cache.
|
||||
TODO(torch==2.6), use CustomGraphPass
|
||||
(torch._inductor.custom_graph_pass.CustomGraphPass)
|
||||
|
||||
The order of the post-grad post-passes is:
|
||||
1. passes (constructor parameter)
|
||||
@ -55,13 +67,27 @@ class PostGradPassManager(CustomGraphPass):
|
||||
self.passes.append(pass_)
|
||||
|
||||
def uuid(self):
|
||||
return self.__getstate__()
|
||||
|
||||
def __getstate__(self) -> Dict[str, List[Any]]:
|
||||
"""
|
||||
The PostGradPassManager is set as a custom pass in the Inductor and
|
||||
affects compilation caching. Its uuid depends on the UUIDs of all
|
||||
dependent passes and the pass config. See InductorPass for more info.
|
||||
Custom pickling for the pass manager, as some passes cannot be pickled.
|
||||
Pickling occurs because the pass manager is set as the value of
|
||||
`config["post_grad_custom_post_pass"]` in the Inductor config.
|
||||
The config is pickled to act as a key in the Inductor code cache.
|
||||
Any other passes in the config are pickled as well.
|
||||
|
||||
TODO(torch==2.6), use the `uuid` method in CustomGraphPass instead.
|
||||
"""
|
||||
state = {"pass_config": self.pass_config.uuid(), "passes": []}
|
||||
for pass_ in self.passes:
|
||||
state["passes"].append(pass_.uuid())
|
||||
state["passes"].append(self.fix_functionalization.uuid())
|
||||
return InductorPass.hash_dict(state)
|
||||
return state
|
||||
|
||||
def __setstate__(self, state):
|
||||
"""
|
||||
Do not allow unpickling of the pass manager.
|
||||
If this is needed in the future, it should properly pickle the passes.
|
||||
"""
|
||||
raise ValueError("Cannot unpickle PostGradPassManager")
|
||||
|
||||
@ -1,41 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Optional
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
class Torch25CustomGraphPass(ABC): # noqa (redefinition)
|
||||
"""
|
||||
This class replaces CustomGraphPass from torch==2.6 when using torch<2.6.
|
||||
It conforms to the 2.6 interface but also supports pickling, as that's what
|
||||
the inductor code cache uses to determine the cache key before 2.6.
|
||||
(in 2.6 and above, uuid() is used.)
|
||||
|
||||
Subclasses can just "pretend" that uuid is used.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def __call__(self, graph: torch.fx.graph.Graph) -> None:
|
||||
"""
|
||||
Implementation of the custom pass.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def uuid(self) -> Optional[Any]:
|
||||
"""
|
||||
Return an ID to uniquely identify your custom pass implementation.
|
||||
Return None to skip inductor code caching entirely.
|
||||
"""
|
||||
|
||||
def __getstate__(self):
|
||||
"""
|
||||
Pickling is used instead of uuid() in torch<2.6. Just return uuid()
|
||||
to enable subclasses to only have to implement uuid.
|
||||
"""
|
||||
return self.uuid()
|
||||
|
||||
def __setstate__(self, state):
|
||||
raise ValueError("Cannot unpickle CustomGraphPass because pickling"
|
||||
" is used for cache key uuid. Use torch>=2.6 with"
|
||||
" native uuid support for custom passes.")
|
||||
647
vllm/config.py
647
vllm/config.py
@ -4,7 +4,6 @@ import ast
|
||||
import copy
|
||||
import enum
|
||||
import hashlib
|
||||
import importlib.metadata
|
||||
import json
|
||||
import sys
|
||||
import warnings
|
||||
@ -18,7 +17,6 @@ from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Final, Literal,
|
||||
Optional, Protocol, Union)
|
||||
|
||||
import torch
|
||||
from packaging.version import Version
|
||||
from pydantic import BaseModel, Field, PrivateAttr
|
||||
from torch.distributed import ProcessGroup, ReduceOp
|
||||
from transformers import PretrainedConfig
|
||||
@ -54,6 +52,8 @@ if TYPE_CHECKING:
|
||||
else:
|
||||
QuantizationConfig = None
|
||||
|
||||
from packaging.version import Version
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# This value is chosen to have a balance between ITL and TTFT. Note it is
|
||||
@ -221,9 +221,6 @@ class ModelConfig:
|
||||
factors.append(self.trust_remote_code)
|
||||
factors.append(self.rope_scaling)
|
||||
factors.append(self.rope_theta)
|
||||
# rope cos/sin cache depends on the max_position_embeddings
|
||||
factors.append(
|
||||
getattr(self.hf_config, "max_position_embeddings", "None"))
|
||||
return hashlib.sha256(str(factors).encode()).hexdigest()
|
||||
|
||||
def __init__(
|
||||
@ -1026,13 +1023,6 @@ class ModelConfig:
|
||||
"max_new_tokens")
|
||||
else:
|
||||
diff_sampling_param = {}
|
||||
|
||||
if diff_sampling_param:
|
||||
logger.warning_once(
|
||||
"Default sampling parameters have been overridden by the "
|
||||
"model's Hugging Face generation config recommended from the "
|
||||
"model creator. If this is not intended, please relaunch "
|
||||
"vLLM instance with `--generation-config vllm`.")
|
||||
return diff_sampling_param
|
||||
|
||||
@property
|
||||
@ -1160,6 +1150,10 @@ class CacheConfig:
|
||||
if self.cache_dtype == "auto":
|
||||
pass
|
||||
elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2"):
|
||||
if envs.VLLM_USE_V1:
|
||||
raise NotImplementedError(
|
||||
"V1 does not yet support fp8 KV cache. "
|
||||
"Set VLLM_USE_V1=0 to enable fp8 kv cache.")
|
||||
logger.info(
|
||||
"Using fp8 data type to store kv cache. It reduces the GPU "
|
||||
"memory footprint and boosts the performance. "
|
||||
@ -1280,7 +1274,6 @@ class LoadFormat(str, enum.Enum):
|
||||
BITSANDBYTES = "bitsandbytes"
|
||||
MISTRAL = "mistral"
|
||||
RUNAI_STREAMER = "runai_streamer"
|
||||
FASTSAFETENSORS = "fastsafetensors"
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -1817,139 +1810,12 @@ class DeviceConfig:
|
||||
self.device = torch.device(self.device_type)
|
||||
|
||||
|
||||
@dataclass
|
||||
class SpeculativeConfig:
|
||||
"""Configuration for speculative decoding.
|
||||
|
||||
The configuration is currently specialized to draft-model speculative
|
||||
decoding with top-1 proposals.
|
||||
"""
|
||||
Configuration for speculative decoding.
|
||||
Configurable parameters include:
|
||||
- General Speculative Decoding Control:
|
||||
- num_speculative_tokens (int): The number of speculative
|
||||
tokens, if provided. It will default to the number in the draft
|
||||
model config if present, otherwise, it is required.
|
||||
- model (Optional[str]): The name of the draft model, eagle head,
|
||||
or additional weights, if provided.
|
||||
- method (Optional[str]): The name of the speculative method to use.
|
||||
If users provide and set the `model` param, the speculative method
|
||||
type will be detected automatically if possible, if `model` param
|
||||
is not provided, the method name must be provided.
|
||||
- Possible values:
|
||||
- ngram
|
||||
Related additional configuration:
|
||||
- prompt_lookup_max (Optional[int]):
|
||||
Maximum size of ngram token window when using Ngram
|
||||
proposer, required when method is set to ngram.
|
||||
- prompt_lookup_min (Optional[int]):
|
||||
Minimum size of ngram token window when using Ngram
|
||||
proposer, if provided. Defaults to 1.
|
||||
- eagle
|
||||
- medusa
|
||||
- mlp_speculator
|
||||
- draft_model
|
||||
- acceptance_method (str): The method to use for accepting draft
|
||||
tokens. This can take two possible values: 'rejection_sampler' and
|
||||
'typical_acceptance_sampler' for RejectionSampler and
|
||||
TypicalAcceptanceSampler respectively. If not specified, it
|
||||
defaults to 'rejection_sampler'.
|
||||
- Possible values:
|
||||
- rejection_sampler
|
||||
- typical_acceptance_sampler
|
||||
Related additional configuration:
|
||||
- posterior_threshold (Optional[float]):
|
||||
A threshold value that sets a lower bound on the
|
||||
posterior probability of a token in the target model
|
||||
for it to be accepted. This threshold is used only
|
||||
when we use the TypicalAcceptanceSampler for token
|
||||
acceptance.
|
||||
- posterior_alpha (Optional[float]):
|
||||
Scaling factor for entropy-based threshold, applied
|
||||
when using TypicalAcceptanceSampler.
|
||||
- draft_tensor_parallel_size (Optional[int]): The degree of the tensor
|
||||
parallelism for the draft model. Can only be 1 or the same as the
|
||||
target model's tensor parallel size.
|
||||
- disable_logprobs (bool): If set to True, token log probabilities are
|
||||
not returned during speculative decoding. If set to False, token
|
||||
log probabilities are returned according to the log probability
|
||||
settings in SamplingParams. If not specified, it defaults to True.
|
||||
|
||||
- Draft Model Configuration:
|
||||
- quantization (Optional[str]): Quantization method that was used to
|
||||
quantize the draft model weights. If None, we assume the
|
||||
model weights are not quantized. Note that it only takes effect
|
||||
when using the draft model-based speculative method.
|
||||
- max_model_len (Optional[int]): The maximum model length of the
|
||||
draft model. Used when testing the ability to skip
|
||||
speculation for some sequences.
|
||||
- revision: The specific model version to use for the draft model. It
|
||||
can be a branch name, a tag name, or a commit id. If unspecified,
|
||||
will use the default version.
|
||||
- code_revision: The specific revision to use for the draft model code
|
||||
on Hugging Face Hub. It can be a branch name, a tag name, or a
|
||||
commit id. If unspecified, will use the default version.
|
||||
|
||||
- Advanced Control:
|
||||
- disable_mqa_scorer (bool): Disable the MQA scorer and fall back to
|
||||
batch expansion for scoring proposals. If not specified, it
|
||||
defaults to False.
|
||||
- disable_by_batch_size (Optional[int]): Disable speculative decoding
|
||||
for new incoming requests when the number of enqueued requests is
|
||||
larger than this value, if provided.
|
||||
|
||||
Although the parameters above are structured hierarchically, there is no
|
||||
need to nest them during configuration.
|
||||
|
||||
Non-configurable internal parameters include:
|
||||
- Model Configuration:
|
||||
- target_model_config (ModelConfig): The configuration of the target
|
||||
model.
|
||||
- draft_model_config (ModelConfig): The configuration of the draft
|
||||
model initialized internal.
|
||||
- Parallelism Configuration:
|
||||
- target_parallel_config (ParallelConfig): The parallel configuration
|
||||
for the target model.
|
||||
- draft_parallel_config (ParallelConfig): The parallel configuration
|
||||
for the draft model initialized internal.
|
||||
- Execution Control:
|
||||
- enable_chunked_prefill (bool): Whether vLLM is configured to use
|
||||
chunked prefill or not. Used for raising an error since it's not
|
||||
yet compatible with speculative decode.
|
||||
- disable_log_stats (bool): Whether to disable the periodic printing of
|
||||
stage times in speculative decoding.
|
||||
"""
|
||||
# speculative configs from cli args
|
||||
num_speculative_tokens: int = field(default=None,
|
||||
init=True) # type: ignore
|
||||
method: Optional[str] = None
|
||||
acceptance_method: str = "rejection_sampler"
|
||||
draft_tensor_parallel_size: Optional[int] = None
|
||||
disable_logprobs: bool = True
|
||||
|
||||
model: Optional[str] = None
|
||||
quantization: Optional[str] = None
|
||||
max_model_len: Optional[int] = None
|
||||
revision: Optional[str] = None
|
||||
code_revision: Optional[str] = None
|
||||
|
||||
disable_mqa_scorer: bool = False
|
||||
disable_by_batch_size: Optional[int] = None
|
||||
prompt_lookup_max: Optional[int] = None
|
||||
prompt_lookup_min: Optional[int] = None
|
||||
posterior_threshold: Optional[float] = None
|
||||
posterior_alpha: Optional[float] = None
|
||||
|
||||
# required configuration params passed from engine
|
||||
target_model_config: ModelConfig = field(default=None,
|
||||
init=True) # type: ignore
|
||||
target_parallel_config: ParallelConfig = field(default=None,
|
||||
init=True) # type: ignore
|
||||
enable_chunked_prefill: bool = field(default=None,
|
||||
init=True) # type: ignore
|
||||
disable_log_stats: bool = field(default=None, init=True) # type: ignore
|
||||
|
||||
# params generated in the post-init stage
|
||||
draft_model_config: ModelConfig = field(default=None,
|
||||
init=True) # type: ignore
|
||||
draft_parallel_config: ParallelConfig = field(default=None,
|
||||
init=True) # type: ignore
|
||||
|
||||
def compute_hash(self) -> str:
|
||||
"""
|
||||
@ -1969,11 +1835,6 @@ class SpeculativeConfig:
|
||||
hash_str = hashlib.md5(str(factors).encode()).hexdigest()
|
||||
return hash_str
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, dict_value: dict) -> "SpeculativeConfig":
|
||||
"""Parse the CLI value for the speculative config."""
|
||||
return cls(**dict_value)
|
||||
|
||||
@staticmethod
|
||||
def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
|
||||
if hf_config.model_type == "deepseek_v3":
|
||||
@ -1986,172 +1847,230 @@ class SpeculativeConfig:
|
||||
})
|
||||
return hf_config
|
||||
|
||||
def __post_init__(self):
|
||||
@staticmethod
|
||||
def maybe_create_spec_config(
|
||||
target_model_config: ModelConfig,
|
||||
target_parallel_config: ParallelConfig,
|
||||
target_dtype: str,
|
||||
speculative_model: Optional[str],
|
||||
speculative_model_quantization: Optional[str],
|
||||
speculative_draft_tensor_parallel_size: Optional[int],
|
||||
num_speculative_tokens: Optional[int],
|
||||
speculative_disable_mqa_scorer: Optional[bool],
|
||||
speculative_max_model_len: Optional[int],
|
||||
enable_chunked_prefill: bool,
|
||||
disable_log_stats: bool,
|
||||
speculative_disable_by_batch_size: Optional[int],
|
||||
ngram_prompt_lookup_max: Optional[int],
|
||||
ngram_prompt_lookup_min: Optional[int],
|
||||
draft_token_acceptance_method: str,
|
||||
typical_acceptance_sampler_posterior_threshold: Optional[float],
|
||||
typical_acceptance_sampler_posterior_alpha: Optional[float],
|
||||
disable_logprobs: Optional[bool],
|
||||
) -> Optional["SpeculativeConfig"]:
|
||||
"""Create a SpeculativeConfig if possible, else return None.
|
||||
|
||||
# Note: After next release, the method parameter will be used to
|
||||
# specify the speculative method, which helps to extend the
|
||||
# configuration of non-model-based proposers, and the model parameter
|
||||
# will be used when the draft model or head is needed.
|
||||
# If users do not specify the method, the speculative method will
|
||||
# be detected automatically if possible. If the speculative method can
|
||||
# not be detected, it will be considered as the draft-model-based
|
||||
# method by default.
|
||||
This function attempts to create a SpeculativeConfig object based on the
|
||||
provided parameters. If the necessary conditions are met, it returns an
|
||||
instance of SpeculativeConfig. Otherwise, it returns None.
|
||||
|
||||
if self.model is None and self.num_speculative_tokens is not None:
|
||||
# TODO(Shangming): Refactor mtp configuration logic when supporting
|
||||
# mtp acceleration for more models besides deepseek_v3
|
||||
if self.target_model_config.hf_text_config.model_type \
|
||||
Args:
|
||||
target_model_config (ModelConfig): The configuration of the target
|
||||
model.
|
||||
target_parallel_config (ParallelConfig): The parallel configuration
|
||||
for the target model.
|
||||
target_dtype (str): The data type used for the target model.
|
||||
speculative_model (Optional[str]): The name of the speculative
|
||||
model, if provided.
|
||||
speculative_model_quantization (Optional[str]): Quantization method
|
||||
that was used to quantize the speculative model weights. If
|
||||
None, we assume the model weights are not quantized.
|
||||
speculative_draft_tensor_parallel_size (Optional[int]): The degree
|
||||
of the tensor parallelism for the draft model.
|
||||
num_speculative_tokens (Optional[int]): The number of speculative
|
||||
tokens, if provided. Will default to the number in the draft
|
||||
model config if present, otherwise is required.
|
||||
speculative_disable_mqa_scorer (Optional[bool]): Disable the MQA
|
||||
scorer for the speculative model and fall back to batch
|
||||
expansion for scoring.
|
||||
speculative_max_model_len (Optional[int]): The maximum model len of
|
||||
the speculative model. Used when testing the ability to skip
|
||||
speculation for some sequences.
|
||||
enable_chunked_prefill (bool): Whether vLLM is configured to use
|
||||
chunked prefill or not. Used for raising an error since its not
|
||||
yet compatible with spec decode.
|
||||
speculative_disable_by_batch_size (Optional[int]): Disable
|
||||
speculative decoding for new incoming requests when the number
|
||||
of enqueue requests is larger than this value, if provided.
|
||||
ngram_prompt_lookup_max (Optional[int]): Max size of ngram token
|
||||
window, if provided.
|
||||
ngram_prompt_lookup_min (Optional[int]): Min size of ngram token
|
||||
window, if provided.
|
||||
draft_token_acceptance_method (str): The method to use for
|
||||
accepting draft tokens. This can take two possible
|
||||
values 'rejection_sampler' and 'typical_acceptance_sampler'
|
||||
for RejectionSampler and TypicalAcceptanceSampler
|
||||
respectively.
|
||||
typical_acceptance_sampler_posterior_threshold (Optional[float]):
|
||||
A threshold value that sets a lower bound on the posterior
|
||||
probability of a token in the target model for it to be
|
||||
accepted. This threshold is used only when we use the
|
||||
TypicalAcceptanceSampler for token acceptance.
|
||||
typical_acceptance_sampler_posterior_alpha (Optional[float]):
|
||||
A scaling factor for the entropy-based threshold in the
|
||||
TypicalAcceptanceSampler.
|
||||
disable_logprobs (Optional[bool]): If set to True, token log
|
||||
probabilities are not returned during speculative decoding.
|
||||
If set to False, token log probabilities are returned
|
||||
according to the log probability settings in SamplingParams.
|
||||
If not specified, it defaults to True.
|
||||
|
||||
Returns:
|
||||
Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if
|
||||
the necessary conditions are met, else None.
|
||||
"""
|
||||
if speculative_model is None:
|
||||
if num_speculative_tokens is not None:
|
||||
if target_model_config.hf_text_config.model_type \
|
||||
== "deepseek_v3":
|
||||
# use the draft model from the same model:
|
||||
self.model = self.target_model_config.model
|
||||
elif self.method in ("ngram", "[ngram]"):
|
||||
self.model = "ngram"
|
||||
# use the draft model from the same model:
|
||||
speculative_model = target_model_config.model
|
||||
else:
|
||||
raise ValueError(
|
||||
"num_speculative_tokens was provided without "
|
||||
"speculative_model.")
|
||||
else:
|
||||
raise ValueError("num_speculative_tokens was provided without "
|
||||
"speculative model.")
|
||||
return None
|
||||
|
||||
# Automatically configure the ngram method during configuration
|
||||
# refactoring to ensure a smooth transition.
|
||||
if self.method is None and (self.model is not None
|
||||
and self.model in ("ngram", "[ngram]")):
|
||||
self.method = "ngram"
|
||||
if (speculative_disable_by_batch_size is not None
|
||||
and speculative_disable_by_batch_size < 2):
|
||||
raise ValueError("Expect the batch size threshold of disabling "
|
||||
"speculative decoding is > 1, but got "
|
||||
f"{speculative_disable_by_batch_size=}")
|
||||
if (enable_chunked_prefill and speculative_model == "eagle"):
|
||||
raise ValueError("Chunked prefill and EAGLE are not compatible.")
|
||||
# TODO: The user should be able to specify revision/max model len
|
||||
# for the draft model. It is not currently supported.
|
||||
draft_revision = None
|
||||
draft_code_revision = None
|
||||
draft_quantization = speculative_model_quantization
|
||||
|
||||
if self.method in ("ngram", "[ngram]"):
|
||||
# Unified to "ngram" internally
|
||||
self.method = "ngram"
|
||||
# Set default values if not provided
|
||||
if (self.prompt_lookup_min is None
|
||||
and self.prompt_lookup_max is None):
|
||||
# TODO(woosuk): Tune these values. They are arbitrarily chosen.
|
||||
self.prompt_lookup_min = 5
|
||||
self.prompt_lookup_max = 5
|
||||
elif self.prompt_lookup_min is None:
|
||||
assert self.prompt_lookup_max is not None
|
||||
self.prompt_lookup_min = self.prompt_lookup_max
|
||||
elif self.prompt_lookup_max is None:
|
||||
assert self.prompt_lookup_min is not None
|
||||
self.prompt_lookup_max = self.prompt_lookup_min
|
||||
|
||||
# Validate values
|
||||
if self.prompt_lookup_min < 1:
|
||||
raise ValueError(
|
||||
f"prompt_lookup_min={self.prompt_lookup_min} must be > 0")
|
||||
if self.prompt_lookup_max < 1:
|
||||
raise ValueError(
|
||||
f"prompt_lookup_max={self.prompt_lookup_max} must be > 0")
|
||||
if self.prompt_lookup_min > self.prompt_lookup_max:
|
||||
raise ValueError(
|
||||
f"prompt_lookup_min={self.prompt_lookup_min} must "
|
||||
f"be <= prompt_lookup_max={self.prompt_lookup_max}")
|
||||
if speculative_model == "[ngram]":
|
||||
if ngram_prompt_lookup_min is None:
|
||||
ngram_prompt_lookup_min = 1
|
||||
if ngram_prompt_lookup_max is None or ngram_prompt_lookup_max < 1:
|
||||
raise ValueError(f"{ngram_prompt_lookup_max=} must be > 0")
|
||||
if ngram_prompt_lookup_min < 1:
|
||||
raise ValueError(f"{ngram_prompt_lookup_min=} must be > 0")
|
||||
if ngram_prompt_lookup_min > ngram_prompt_lookup_max:
|
||||
raise ValueError(f"{ngram_prompt_lookup_min=} cannot be "
|
||||
f"larger than {ngram_prompt_lookup_max=}")
|
||||
|
||||
# TODO: current we still need extract vocab_size from target model
|
||||
# config, in future, we may try refactor it out, and set
|
||||
# draft related config as None here.
|
||||
self.draft_model_config = self.target_model_config
|
||||
self.draft_parallel_config = self.target_parallel_config
|
||||
draft_model_config = target_model_config
|
||||
draft_parallel_config = target_parallel_config
|
||||
else:
|
||||
self.prompt_lookup_max = 0
|
||||
self.prompt_lookup_min = 0
|
||||
ngram_prompt_lookup_max = 0
|
||||
ngram_prompt_lookup_min = 0
|
||||
draft_model_config = ModelConfig(
|
||||
model=speculative_model,
|
||||
task="draft",
|
||||
tokenizer=target_model_config.tokenizer,
|
||||
tokenizer_mode=target_model_config.tokenizer_mode,
|
||||
trust_remote_code=target_model_config.trust_remote_code,
|
||||
allowed_local_media_path=target_model_config.
|
||||
allowed_local_media_path,
|
||||
dtype=target_model_config.dtype,
|
||||
seed=target_model_config.seed,
|
||||
revision=draft_revision,
|
||||
code_revision=draft_code_revision,
|
||||
tokenizer_revision=target_model_config.tokenizer_revision,
|
||||
max_model_len=None,
|
||||
spec_target_max_model_len=target_model_config.max_model_len,
|
||||
quantization=draft_quantization,
|
||||
enforce_eager=target_model_config.enforce_eager,
|
||||
max_seq_len_to_capture=target_model_config.
|
||||
max_seq_len_to_capture,
|
||||
max_logprobs=target_model_config.max_logprobs,
|
||||
hf_overrides=SpeculativeConfig.hf_config_override,
|
||||
)
|
||||
|
||||
if self.model is not None:
|
||||
self.draft_model_config = ModelConfig(
|
||||
model=self.model,
|
||||
task="draft",
|
||||
tokenizer=self.target_model_config.tokenizer,
|
||||
tokenizer_mode=self.target_model_config.tokenizer_mode,
|
||||
trust_remote_code=self.target_model_config.
|
||||
trust_remote_code,
|
||||
allowed_local_media_path=self.target_model_config.
|
||||
allowed_local_media_path,
|
||||
dtype=self.target_model_config.dtype,
|
||||
seed=self.target_model_config.seed,
|
||||
revision=self.revision,
|
||||
code_revision=self.code_revision,
|
||||
tokenizer_revision=self.target_model_config.
|
||||
tokenizer_revision,
|
||||
max_model_len=None,
|
||||
spec_target_max_model_len=self.target_model_config.
|
||||
max_model_len,
|
||||
quantization=self.quantization,
|
||||
enforce_eager=self.target_model_config.enforce_eager,
|
||||
max_seq_len_to_capture=self.target_model_config.
|
||||
max_seq_len_to_capture,
|
||||
max_logprobs=self.target_model_config.max_logprobs,
|
||||
hf_overrides=SpeculativeConfig.hf_config_override,
|
||||
)
|
||||
draft_hf_config = draft_model_config.hf_config
|
||||
|
||||
# Automatically detect the method
|
||||
if "eagle-" in self.draft_model_config.model.lower():
|
||||
self.method = "eagle"
|
||||
elif self.draft_model_config.hf_config.model_type == "medusa":
|
||||
self.method = "medusa"
|
||||
elif (self.draft_model_config.hf_config.model_type ==
|
||||
"mlp_speculator"):
|
||||
self.method = "mlp_speculator"
|
||||
# Detect EAGLE prefix to replace hf_config for EAGLE draft_model
|
||||
if "eagle-" in draft_model_config.model.lower():
|
||||
from vllm.transformers_utils.configs.eagle import EAGLEConfig
|
||||
if isinstance(draft_model_config.hf_config, EAGLEConfig):
|
||||
pass
|
||||
else:
|
||||
self.method = "draft_model"
|
||||
eagle_config = EAGLEConfig(draft_model_config.hf_config)
|
||||
draft_model_config.hf_config = eagle_config
|
||||
|
||||
# Replace hf_config for EAGLE draft_model
|
||||
if self.method == "eagle":
|
||||
if self.enable_chunked_prefill:
|
||||
raise ValueError(
|
||||
"Chunked prefill and EAGLE are not compatible.")
|
||||
if (num_speculative_tokens is not None
|
||||
and hasattr(draft_hf_config, "num_lookahead_tokens")):
|
||||
draft_hf_config.num_lookahead_tokens = num_speculative_tokens
|
||||
n_predict = getattr(draft_hf_config, "n_predict", None)
|
||||
if n_predict is not None:
|
||||
if num_speculative_tokens is None:
|
||||
# Default to max value defined in draft model config.
|
||||
num_speculative_tokens = n_predict
|
||||
elif num_speculative_tokens > n_predict and \
|
||||
num_speculative_tokens % n_predict != 0:
|
||||
# Ensure divisibility for MTP module reuse.
|
||||
raise ValueError(
|
||||
f"{num_speculative_tokens=} must be divisible by "
|
||||
f"{n_predict=}")
|
||||
|
||||
from vllm.transformers_utils.configs.eagle import (
|
||||
EAGLEConfig)
|
||||
if isinstance(self.draft_model_config.hf_config,
|
||||
EAGLEConfig):
|
||||
pass
|
||||
else:
|
||||
eagle_config = EAGLEConfig(
|
||||
self.draft_model_config.hf_config)
|
||||
self.draft_model_config.hf_config = eagle_config
|
||||
speculative_draft_tensor_parallel_size = \
|
||||
SpeculativeConfig._verify_and_get_draft_model_tensor_parallel_size(
|
||||
target_parallel_config,
|
||||
speculative_draft_tensor_parallel_size,
|
||||
draft_hf_config
|
||||
)
|
||||
|
||||
if (self.num_speculative_tokens is not None
|
||||
and hasattr(self.draft_model_config.hf_config,
|
||||
"num_lookahead_tokens")):
|
||||
self.draft_model_config.hf_config.num_lookahead_tokens = \
|
||||
self.num_speculative_tokens
|
||||
draft_model_config.max_model_len = (
|
||||
SpeculativeConfig._maybe_override_draft_max_model_len(
|
||||
speculative_max_model_len,
|
||||
draft_model_config.max_model_len,
|
||||
target_model_config.max_model_len,
|
||||
))
|
||||
|
||||
n_predict = getattr(self.draft_model_config.hf_config,
|
||||
"n_predict", None)
|
||||
if n_predict is not None:
|
||||
if self.num_speculative_tokens is None:
|
||||
# Default to max value defined in draft model config.
|
||||
self.num_speculative_tokens = n_predict
|
||||
elif self.num_speculative_tokens > n_predict and \
|
||||
self.num_speculative_tokens % n_predict != 0:
|
||||
# Ensure divisibility for MTP module reuse.
|
||||
raise ValueError(
|
||||
f"num_speculative_tokens:{self.num_speculative_tokens}"
|
||||
f" must be divisible by {n_predict=}")
|
||||
draft_parallel_config = (
|
||||
SpeculativeConfig.create_draft_parallel_config(
|
||||
target_parallel_config,
|
||||
speculative_draft_tensor_parallel_size, draft_hf_config))
|
||||
|
||||
self.draft_tensor_parallel_size = \
|
||||
SpeculativeConfig._verify_and_get_draft_tp(
|
||||
self.target_parallel_config,
|
||||
self.draft_tensor_parallel_size,
|
||||
self.draft_model_config.hf_config
|
||||
)
|
||||
if num_speculative_tokens is None:
|
||||
raise ValueError(
|
||||
"num_speculative_tokens must be provided with "
|
||||
"speculative_model unless the draft model config contains an "
|
||||
"n_predict parameter.")
|
||||
|
||||
self.draft_model_config.max_model_len = (
|
||||
SpeculativeConfig._maybe_override_draft_max_model_len(
|
||||
self.max_model_len,
|
||||
self.draft_model_config.max_model_len,
|
||||
self.target_model_config.max_model_len,
|
||||
))
|
||||
if typical_acceptance_sampler_posterior_threshold is None:
|
||||
typical_acceptance_sampler_posterior_threshold = 0.09
|
||||
if typical_acceptance_sampler_posterior_alpha is None:
|
||||
typical_acceptance_sampler_posterior_alpha = 0.3
|
||||
if disable_logprobs is None:
|
||||
disable_logprobs = True
|
||||
|
||||
self.draft_parallel_config = (
|
||||
SpeculativeConfig.create_draft_parallel_config(
|
||||
self.target_parallel_config,
|
||||
self.draft_tensor_parallel_size))
|
||||
|
||||
if self.acceptance_method == "typical_acceptance_sampler":
|
||||
if self.posterior_threshold is None:
|
||||
self.posterior_threshold = 0.09
|
||||
if self.posterior_alpha is None:
|
||||
self.posterior_alpha = 0.3
|
||||
|
||||
self._verify_args()
|
||||
return SpeculativeConfig(
|
||||
draft_model_config,
|
||||
draft_parallel_config,
|
||||
num_speculative_tokens,
|
||||
speculative_disable_mqa_scorer,
|
||||
speculative_disable_by_batch_size,
|
||||
ngram_prompt_lookup_max,
|
||||
ngram_prompt_lookup_min,
|
||||
draft_token_acceptance_method=draft_token_acceptance_method,
|
||||
typical_acceptance_sampler_posterior_threshold=\
|
||||
typical_acceptance_sampler_posterior_threshold,
|
||||
typical_acceptance_sampler_posterior_alpha=\
|
||||
typical_acceptance_sampler_posterior_alpha,
|
||||
disable_logprobs=disable_logprobs,
|
||||
disable_log_stats=disable_log_stats,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _maybe_override_draft_max_model_len(
|
||||
@ -2189,7 +2108,7 @@ class SpeculativeConfig:
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _verify_and_get_draft_tp(
|
||||
def _verify_and_get_draft_model_tensor_parallel_size(
|
||||
target_parallel_config: ParallelConfig,
|
||||
speculative_draft_tensor_parallel_size: Optional[int],
|
||||
draft_hf_config: PretrainedConfig) -> int:
|
||||
@ -2221,6 +2140,7 @@ class SpeculativeConfig:
|
||||
def create_draft_parallel_config(
|
||||
target_parallel_config: ParallelConfig,
|
||||
speculative_draft_tensor_parallel_size: int,
|
||||
draft_hf_config: PretrainedConfig,
|
||||
) -> ParallelConfig:
|
||||
"""Create a parallel config for use by the draft worker.
|
||||
|
||||
@ -2244,13 +2164,74 @@ class SpeculativeConfig:
|
||||
|
||||
return draft_parallel_config
|
||||
|
||||
def _verify_args(self) -> None:
|
||||
if self.num_speculative_tokens is None:
|
||||
raise ValueError(
|
||||
"num_speculative_tokens must be provided with "
|
||||
"speculative model unless the draft model config contains an "
|
||||
"n_predict parameter.")
|
||||
def __init__(
|
||||
self,
|
||||
draft_model_config: ModelConfig,
|
||||
draft_parallel_config: ParallelConfig,
|
||||
num_speculative_tokens: int,
|
||||
speculative_disable_mqa_scorer: Optional[bool],
|
||||
speculative_disable_by_batch_size: Optional[int],
|
||||
ngram_prompt_lookup_max: Optional[int],
|
||||
ngram_prompt_lookup_min: Optional[int],
|
||||
draft_token_acceptance_method: str,
|
||||
typical_acceptance_sampler_posterior_threshold: float,
|
||||
typical_acceptance_sampler_posterior_alpha: float,
|
||||
disable_logprobs: bool,
|
||||
disable_log_stats: bool,
|
||||
):
|
||||
"""Create a SpeculativeConfig object.
|
||||
|
||||
Args:
|
||||
draft_model_config: ModelConfig for the draft model.
|
||||
draft_parallel_config: ParallelConfig for the draft model.
|
||||
num_speculative_tokens: The number of tokens to sample from the
|
||||
draft model before scoring with the target model.
|
||||
speculative_disable_by_batch_size: Disable speculative
|
||||
decoding for new incoming requests when the number of
|
||||
enqueue requests is larger than this value.
|
||||
ngram_prompt_lookup_max: Max size of ngram token window.
|
||||
ngram_prompt_lookup_min: Min size of ngram token window.
|
||||
draft_token_acceptance_method (str): The method to use for
|
||||
accepting draft tokens. This can take two possible
|
||||
values 'rejection_sampler' and 'typical_acceptance_sampler'
|
||||
for RejectionSampler and TypicalAcceptanceSampler
|
||||
respectively.
|
||||
typical_acceptance_sampler_posterior_threshold (Optional[float]):
|
||||
A threshold value that sets a lower bound on the posterior
|
||||
probability of a token in the target model for it to be
|
||||
accepted. This threshold is used only when we use the
|
||||
TypicalAcceptanceSampler for token acceptance.
|
||||
typical_acceptance_sampler_posterior_alpha (Optional[float]):
|
||||
A scaling factor for the entropy-based threshold in the
|
||||
TypicalAcceptanceSampler.
|
||||
disable_logprobs: If set to True, token log probabilities will not
|
||||
be returned even if requested by sampling parameters. This
|
||||
reduces latency by skipping logprob calculation in proposal
|
||||
sampling, target sampling, and after accepted tokens are
|
||||
determined. If set to False, log probabilities will be
|
||||
returned.
|
||||
disable_log_stats: Whether to disable periodic printing of stage
|
||||
times in speculative decoding.
|
||||
"""
|
||||
self.draft_model_config = draft_model_config
|
||||
self.draft_parallel_config = draft_parallel_config
|
||||
self.num_speculative_tokens = num_speculative_tokens
|
||||
self.speculative_disable_mqa_scorer = speculative_disable_mqa_scorer
|
||||
self.speculative_disable_by_batch_size = \
|
||||
speculative_disable_by_batch_size
|
||||
self.ngram_prompt_lookup_max = ngram_prompt_lookup_max or 0
|
||||
self.ngram_prompt_lookup_min = ngram_prompt_lookup_min or 0
|
||||
self.draft_token_acceptance_method = draft_token_acceptance_method
|
||||
self.typical_acceptance_sampler_posterior_threshold = \
|
||||
typical_acceptance_sampler_posterior_threshold
|
||||
self.typical_acceptance_sampler_posterior_alpha = \
|
||||
typical_acceptance_sampler_posterior_alpha
|
||||
self.disable_logprobs = disable_logprobs
|
||||
self.disable_log_stats = disable_log_stats
|
||||
|
||||
self._verify_args()
|
||||
|
||||
def _verify_args(self) -> None:
|
||||
if self.num_speculative_tokens <= 0:
|
||||
raise ValueError("Expected num_speculative_tokens to be greater "
|
||||
f"than zero ({self.num_speculative_tokens}).")
|
||||
@ -2260,34 +2241,29 @@ class SpeculativeConfig:
|
||||
self.draft_parallel_config)
|
||||
# Validate and set draft token acceptance related settings.
|
||||
|
||||
if self.acceptance_method is None:
|
||||
raise ValueError("acceptance_method is not set. "
|
||||
if (self.draft_token_acceptance_method is None):
|
||||
raise ValueError("draft_token_acceptance_method is not set. "
|
||||
"Expected values are rejection_sampler or "
|
||||
"typical_acceptance_sampler.")
|
||||
|
||||
if (self.acceptance_method != 'rejection_sampler'
|
||||
and self.acceptance_method != 'typical_acceptance_sampler'):
|
||||
if (self.draft_token_acceptance_method != 'rejection_sampler'
|
||||
and self.draft_token_acceptance_method
|
||||
!= 'typical_acceptance_sampler'):
|
||||
raise ValueError(
|
||||
"Expected acceptance_method to be either "
|
||||
"Expected draft_token_acceptance_method to be either "
|
||||
"rejection_sampler or typical_acceptance_sampler. Instead it "
|
||||
f"is {self.acceptance_method}")
|
||||
f"is {self.draft_token_acceptance_method}")
|
||||
|
||||
if self.acceptance_method == "typical_acceptance_sampler" and (
|
||||
(self.posterior_threshold is not None
|
||||
and self.posterior_threshold < 0) or
|
||||
(self.posterior_alpha is not None and self.posterior_alpha < 0)):
|
||||
if (self.typical_acceptance_sampler_posterior_threshold < 0
|
||||
or self.typical_acceptance_sampler_posterior_alpha < 0):
|
||||
raise ValueError(
|
||||
"Expected the posterior_threshold and posterior_alpha of "
|
||||
"typical_acceptance_sampler to be > 0. "
|
||||
"Instead found posterior_threshold = "
|
||||
f"{self.posterior_threshold} and posterior_alpha = "
|
||||
f"{self.posterior_alpha}")
|
||||
|
||||
if (self.disable_by_batch_size is not None
|
||||
and self.disable_by_batch_size < 2):
|
||||
raise ValueError("Expect the batch size threshold of disabling "
|
||||
"speculative decoding is > 1, but got "
|
||||
f"{self.disable_by_batch_size=}")
|
||||
"Expected typical_acceptance_sampler_posterior_threshold "
|
||||
"and typical_acceptance_sampler_posterior_alpha to be > 0. "
|
||||
"Instead found "
|
||||
f"typical_acceptance_sampler_posterior_threshold = "
|
||||
f"{self.typical_acceptance_sampler_posterior_threshold} and "
|
||||
f"typical_acceptance_sampler_posterior_alpha = "
|
||||
f"{self.typical_acceptance_sampler_posterior_alpha}")
|
||||
|
||||
@property
|
||||
def num_lookahead_slots(self) -> int:
|
||||
@ -2300,8 +2276,8 @@ class SpeculativeConfig:
|
||||
return self.num_speculative_tokens
|
||||
|
||||
def __repr__(self) -> str:
|
||||
if self.prompt_lookup_max is not None and self.prompt_lookup_max > 0:
|
||||
draft_model = "ngram"
|
||||
if self.ngram_prompt_lookup_max > 0:
|
||||
draft_model = "[ngram]"
|
||||
else:
|
||||
draft_model = self.draft_model_config.model
|
||||
num_spec_tokens = self.num_speculative_tokens
|
||||
@ -2376,6 +2352,12 @@ class LoRAConfig:
|
||||
self.lora_dtype = model_config.dtype
|
||||
elif isinstance(self.lora_dtype, str):
|
||||
self.lora_dtype = getattr(torch, self.lora_dtype)
|
||||
if model_config.quantization and model_config.quantization not in [
|
||||
"awq", "gptq"
|
||||
]:
|
||||
# TODO support marlin
|
||||
logger.warning("%s quantization is not tested with LoRA yet.",
|
||||
model_config.quantization)
|
||||
|
||||
def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
|
||||
# Reminder: Please update docs/source/features/compatibility_matrix.md
|
||||
@ -2803,17 +2785,12 @@ class DecodingConfig:
|
||||
return hash_str
|
||||
|
||||
def __post_init__(self):
|
||||
v0_valid_guided_backends = [
|
||||
'outlines', 'lm-format-enforcer', 'xgrammar'
|
||||
valid_guided_backends = [
|
||||
'outlines', 'lm-format-enforcer', 'xgrammar', 'guidance'
|
||||
]
|
||||
v1_valid_guided_backends = ['xgrammar', 'guidance', 'auto']
|
||||
|
||||
backend = GuidedDecodingParams(
|
||||
backend=self.guided_decoding_backend).backend_name
|
||||
if envs.VLLM_USE_V1:
|
||||
valid_guided_backends = v1_valid_guided_backends
|
||||
else:
|
||||
valid_guided_backends = v0_valid_guided_backends
|
||||
if backend not in valid_guided_backends:
|
||||
raise ValueError(f"Invalid guided_decoding_backend '{backend}',"
|
||||
f" must be one of {valid_guided_backends}")
|
||||
@ -3091,7 +3068,8 @@ class CompilationConfig(BaseModel):
|
||||
compilation.
|
||||
"""
|
||||
dict_ = self.model_dump(include={"enable_fusion", "enable_noop"})
|
||||
return InductorPass.hash_dict(dict_)
|
||||
encoded = json.dumps(dict_, sort_keys=True).encode("utf-8")
|
||||
return hashlib.sha256(encoded).digest()
|
||||
|
||||
def model_post_init(self, __context: Any) -> None:
|
||||
if not self.enable_noop and self.enable_fusion:
|
||||
@ -3180,7 +3158,7 @@ class CompilationConfig(BaseModel):
|
||||
# and it is not yet a priority. RFC here:
|
||||
# https://github.com/vllm-project/vllm/issues/14703
|
||||
|
||||
if Version(importlib.metadata.version('torch')) >= Version("2.6"):
|
||||
if Version(torch.__version__) >= Version("2.6"):
|
||||
KEY = 'enable_auto_functionalized_v2'
|
||||
if KEY not in self.inductor_compile_config:
|
||||
self.inductor_compile_config[KEY] = False
|
||||
@ -3307,8 +3285,7 @@ class VllmConfig:
|
||||
init=True) # type: ignore
|
||||
load_config: LoadConfig = field(default=None, init=True) # type: ignore
|
||||
lora_config: Optional[LoRAConfig] = None
|
||||
speculative_config: SpeculativeConfig = field(default=None,
|
||||
init=True) # type: ignore
|
||||
speculative_config: Optional[SpeculativeConfig] = None
|
||||
decoding_config: Optional[DecodingConfig] = None
|
||||
observability_config: Optional[ObservabilityConfig] = None
|
||||
prompt_adapter_config: Optional[PromptAdapterConfig] = None
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user