Compare commits
17 Commits
v0.8.2
...
v1-sched-i
| Author | SHA1 | Date | |
|---|---|---|---|
| 8db54c7912 | |||
| 530dbecd1a | |||
| ada8a47b12 | |||
| 6b42a56d46 | |||
| a7facf98d9 | |||
| 1e7bf7970a | |||
| 24ce0a7638 | |||
| e484ecb947 | |||
| da07067215 | |||
| 6cd1b1a18c | |||
| 8730469cfa | |||
| 5b38e984b3 | |||
| 06e22ba44c | |||
| 8d46d5d11d | |||
| f198d7d07a | |||
| 0bf6e97493 | |||
| 6e7209347d |
@ -361,7 +361,7 @@ main() {
|
||||
# get the current IP address, required by benchmark_serving.py
|
||||
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
||||
# turn off the reporting of the status of each request, to clean up the terminal output
|
||||
export VLLM_LOGGING_LEVEL="WARNING"
|
||||
export VLLM_LOG_LEVEL="WARNING"
|
||||
|
||||
# prepare for benchmarking
|
||||
cd benchmarks || exit 1
|
||||
|
||||
@ -14,7 +14,6 @@ DOCKER_BUILDKIT=1 docker build . \
|
||||
-t gh200-test \
|
||||
--build-arg max_jobs=66 \
|
||||
--build-arg nvcc_threads=2 \
|
||||
--build-arg RUN_WHEEL_CHECK=false \
|
||||
--build-arg torch_cuda_arch_list="9.0+PTX" \
|
||||
--build-arg vllm_fa_cmake_gpu_arches="90-real"
|
||||
|
||||
@ -24,6 +23,6 @@ trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image and test offline inference
|
||||
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
||||
docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
||||
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
|
||||
'
|
||||
|
||||
16
.buildkite/run-openvino-test.sh
Executable file
16
.buildkite/run-openvino-test.sh
Executable file
@ -0,0 +1,16 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script builds the OpenVINO docker image and runs the offline inference inside the container.
|
||||
# It serves as a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t openvino-test -f Dockerfile.openvino .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() { docker rm -f openvino-test || true; }
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image and launch offline inference
|
||||
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
@ -19,19 +19,17 @@ docker run --privileged --net host --shm-size=16G -it \
|
||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install pytest \
|
||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
||||
&& export VLLM_USE_V1=1 \
|
||||
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
|
||||
&& echo TEST_1 \
|
||||
&& pytest /workspace/vllm/tests/tpu/test_compilation.py \
|
||||
&& VLLM_USE_V1=1 python3 /workspace/vllm/tests/tpu/test_compilation.py \
|
||||
&& echo TEST_2 \
|
||||
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
|
||||
&& VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
|
||||
&& echo TEST_3 \
|
||||
&& pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
|
||||
&& VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
|
||||
&& echo TEST_4 \
|
||||
&& pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
||||
&& VLLM_USE_V1=1 pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
||||
&& echo TEST_5 \
|
||||
&& python3 /workspace/vllm/examples/offline_inference/tpu.py" \
|
||||
|
||||
&& VLLM_USE_V1=1 python3 /workspace/vllm/examples/offline_inference/tpu.py" \
|
||||
|
||||
|
||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
|
||||
@ -118,7 +118,7 @@ steps:
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
@ -148,8 +148,8 @@ steps:
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
- pushd ../examples/offline_inference
|
||||
- VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py
|
||||
- VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
- python3 rlhf.py
|
||||
- RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
- popd
|
||||
|
||||
- label: Metrics, Tracing Test # 10min
|
||||
@ -515,7 +515,7 @@ steps:
|
||||
- vllm/worker/model_runner.py
|
||||
- entrypoints/llm/test_collective_rpc.py
|
||||
commands:
|
||||
- VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@ -2,8 +2,7 @@
|
||||
/vllm/_version.py
|
||||
|
||||
# vllm-flash-attn built from source
|
||||
vllm/vllm_flash_attn/*
|
||||
!vllm/vllm_flash_attn/fa_utils.py
|
||||
vllm/vllm_flash_attn/
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
|
||||
87
Dockerfile
87
Dockerfile
@ -14,22 +14,17 @@ ARG PYTHON_VERSION=3.12
|
||||
ARG TARGETPLATFORM
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# Install Python and other dependencies
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common git curl sudo \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
|
||||
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
|
||||
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
||||
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
|
||||
&& python3 --version && python3 -m pip --version
|
||||
# Install uv for faster pip installs
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
python3 -m pip install uv
|
||||
# Install minimal dependencies and uv
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y ccache git curl wget sudo \
|
||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
|
||||
# Add uv to PATH
|
||||
ENV PATH="/root/.local/bin:$PATH"
|
||||
# Create venv with specified Python and activate by placing at the front of path
|
||||
ENV VIRTUAL_ENV="/opt/venv"
|
||||
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
|
||||
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
@ -51,22 +46,19 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
# install build and runtime dependencies
|
||||
|
||||
# arm64 (GH200) build follows the practice of "use existing pytorch" build,
|
||||
# we need to install torch and torchvision from the nightly builds first,
|
||||
# pytorch will not appear as a vLLM dependency in all of the following steps
|
||||
# after this step
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
|
||||
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
|
||||
uv pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
|
||||
fi
|
||||
|
||||
COPY requirements/common.txt requirements/common.txt
|
||||
COPY requirements/cuda.txt requirements/cuda.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/cuda.txt
|
||||
uv pip install -r requirements/cuda.txt
|
||||
|
||||
# cuda arch list used by torch
|
||||
# can be useful for both `dev` and `test`
|
||||
@ -91,7 +83,7 @@ COPY requirements/build.txt requirements/build.txt
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/build.txt
|
||||
uv pip install -r requirements/build.txt
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
@ -163,7 +155,7 @@ COPY requirements/lint.txt requirements/lint.txt
|
||||
COPY requirements/test.txt requirements/test.txt
|
||||
COPY requirements/dev.txt requirements/dev.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/dev.txt
|
||||
uv pip install -r requirements/dev.txt
|
||||
#################### DEV IMAGE ####################
|
||||
|
||||
#################### vLLM installation IMAGE ####################
|
||||
@ -179,23 +171,18 @@ ARG TARGETPLATFORM
|
||||
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
|
||||
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
|
||||
|
||||
# Install Python and other dependencies
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
|
||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
|
||||
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
|
||||
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
||||
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
|
||||
&& python3 --version && python3 -m pip --version
|
||||
# Install uv for faster pip installs
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
python3 -m pip install uv
|
||||
# Install minimal dependencies and uv
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y ccache git curl wget sudo vim \
|
||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 libibverbs-dev \
|
||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
|
||||
# Add uv to PATH
|
||||
ENV PATH="/root/.local/bin:$PATH"
|
||||
# Create venv with specified Python and activate by placing at the front of path
|
||||
ENV VIRTUAL_ENV="/opt/venv"
|
||||
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
|
||||
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
@ -213,14 +200,13 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||
# after this step
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
|
||||
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
|
||||
uv pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
|
||||
fi
|
||||
|
||||
# Install vllm wheel first, so that torch etc will be installed.
|
||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system dist/*.whl --verbose
|
||||
uv pip install dist/*.whl --verbose
|
||||
|
||||
# If we need to build FlashInfer wheel before its release:
|
||||
# $ export FLASHINFER_ENABLE_AOT=1
|
||||
@ -235,9 +221,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
||||
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
. /etc/environment && \
|
||||
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
|
||||
uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
|
||||
uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
|
||||
fi
|
||||
COPY examples examples
|
||||
|
||||
@ -247,7 +232,7 @@ COPY examples examples
|
||||
# TODO: Remove this once FlashInfer AOT wheel is fixed
|
||||
COPY requirements/build.txt requirements/build.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/build.txt
|
||||
uv pip install -r requirements/build.txt
|
||||
|
||||
#################### vLLM installation IMAGE ####################
|
||||
|
||||
@ -264,15 +249,15 @@ ENV UV_HTTP_TIMEOUT=500
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/dev.txt
|
||||
uv pip install -r requirements/dev.txt
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -e tests/vllm_test_utils
|
||||
uv pip install -e tests/vllm_test_utils
|
||||
|
||||
# enable fast downloads from hf (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system hf_transfer
|
||||
uv pip install hf_transfer
|
||||
ENV HF_HUB_ENABLE_HF_TRANSFER 1
|
||||
|
||||
# Copy in the v1 package for testing (it isn't distributed yet)
|
||||
@ -297,9 +282,9 @@ ENV UV_HTTP_TIMEOUT=500
|
||||
# install additional dependencies for openai api server
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
||||
uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
||||
else \
|
||||
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
||||
uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
||||
fi
|
||||
|
||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||
|
||||
29
Dockerfile.openvino
Normal file
29
Dockerfile.openvino
Normal file
@ -0,0 +1,29 @@
|
||||
# The vLLM Dockerfile is used to construct a vLLM image that can be directly used
|
||||
# to run the OpenAI compatible server.
|
||||
|
||||
FROM ubuntu:22.04 AS dev
|
||||
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -y \
|
||||
git python3-pip \
|
||||
ffmpeg libsm6 libxext6 libgl1
|
||||
WORKDIR /workspace
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
RUN python3 -m pip install -U pip
|
||||
# install build requirements
|
||||
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements/build.txt
|
||||
# build vLLM with OpenVINO backend
|
||||
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
|
||||
|
||||
COPY examples/ /workspace/examples
|
||||
COPY benchmarks/ /workspace/benchmarks
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@ -40,7 +40,7 @@ ARG USE_CYTHON
|
||||
RUN cd vllm \
|
||||
&& python3 -m pip install -r requirements/rocm.txt \
|
||||
&& python3 setup.py clean --all \
|
||||
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \
|
||||
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist
|
||||
FROM scratch AS export_vllm
|
||||
ARG COMMON_WORKDIR
|
||||
|
||||
@ -12,8 +12,6 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
|
||||
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
||||
ARG FA_BRANCH="b7d29fb"
|
||||
ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
|
||||
ARG AITER_BRANCH="21d47a9"
|
||||
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
||||
|
||||
FROM ${BASE_IMAGE} AS base
|
||||
|
||||
@ -131,18 +129,8 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
||||
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
|
||||
ARG AITER_REPO
|
||||
ARG AITER_BRANCH
|
||||
RUN git clone --recursive ${AITER_REPO}
|
||||
RUN cd aiter \
|
||||
&& git checkout ${AITER_BRANCH} \
|
||||
&& git submodule update --init --recursive \
|
||||
&& pip install -r requirements.txt \
|
||||
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter
|
||||
|
||||
ARG BASE_IMAGE
|
||||
ARG HIPBLASLT_BRANCH
|
||||
ARG HIPBLAS_COMMON_BRANCH
|
||||
ARG LEGACY_HIPBLASLT_OPTION
|
||||
ARG RCCL_BRANCH
|
||||
ARG RCCL_REPO
|
||||
@ -167,6 +155,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
|
||||
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
|
||||
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
|
||||
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
|
||||
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt
|
||||
|
||||
@ -29,10 +29,6 @@ Easy, fast, and cheap LLM serving for everyone
|
||||
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
|
||||
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
|
||||
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
|
||||
|
||||
<details>
|
||||
<summary>Previous News</summary>
|
||||
|
||||
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
|
||||
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
|
||||
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
|
||||
@ -46,9 +42,8 @@ Easy, fast, and cheap LLM serving for everyone
|
||||
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
|
||||
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
|
||||
|
||||
</details>
|
||||
|
||||
---
|
||||
|
||||
## About
|
||||
|
||||
vLLM is a fast and easy-to-use library for LLM inference and serving.
|
||||
|
||||
@ -1,420 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# Adapted from sglang quantization/tuning_block_wise_kernel.py
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import tqdm
|
||||
import triton
|
||||
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
_w8a8_block_fp8_matmul)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
# Select the "spawn" start method for multiprocessing; force=True replaces
# any start method that was chosen earlier.
mp.set_start_method("spawn", force=True)

# Refuse to go any further on a non-CUDA platform.
assert current_platform.is_cuda(), (
    "Only support tune w8a8 block fp8 kernel on CUDA device.")

# Lookup table from a dtype name to the torch dtype object.
DTYPE_MAP = dict(
    float32=torch.float32,
    float16=torch.float16,
    half=torch.half,
    bfloat16=torch.bfloat16,
)
|
||||
|
||||
|
||||
def w8a8_block_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    block_size: list[int],
    config: dict[str, Any],
    output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
    """Launch the block-wise quantized matmul kernel with an explicit config.

    Multiplies `A` by `B` using the scales `As` and `Bs` and returns the
    result in `output_dtype`.

    Args:
        A: The input tensor, e.g., activation. Must be contiguous and share
            its last dimension with `B`.
        B: The input tensor, e.g., weight. Must be a contiguous 2-D tensor
            of shape `(N, K)`.
        As: The per-token-group quantization scale for `A`.
        Bs: The per-block quantization scale for `B` (2-D).
        block_size: The block size for per-block quantization, given as
            `[block_n, block_k]`, e.g., [128, 128].
        config: Keyword arguments forwarded to the kernel launch. The launch
            grid reads `BLOCK_SIZE_M` and `BLOCK_SIZE_N` from the kernel
            meta-parameters.
        output_dtype: The dtype of the returned tensor.

    Returns:
        torch.Tensor: The result of matmul, shaped `A.shape[:-1] + (N,)`.

    Raises:
        AssertionError: If the shape or contiguity checks fail.
        RuntimeError: If `A` is not of dtype `torch.float8_e4m3fn`.
    """
    assert len(block_size) == 2
    block_n, block_k = block_size

    # Shape checks for A and its scales.
    assert A.shape[-1] == B.shape[-1]
    assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
    # All leading dimensions of A are flattened into a single row count.
    M = A.numel() // A.shape[-1]

    # Shape checks for B and its scales.
    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
    N, K = B.shape
    assert triton.cdiv(N, block_n) == Bs.shape[0]
    assert triton.cdiv(K, block_k) == Bs.shape[1]

    out = A.new_empty(A.shape[:-1] + (N, ), dtype=output_dtype)

    def launch_grid(META):
        # 1-D grid: one program per (M-tile, N-tile) pair.
        tiles_m = triton.cdiv(M, META["BLOCK_SIZE_M"])
        tiles_n = triton.cdiv(N, META["BLOCK_SIZE_N"])
        return (tiles_m * tiles_n, )

    if A.dtype != torch.float8_e4m3fn:
        raise RuntimeError(
            "Currently, only support tune w8a8 block fp8 kernel.")

    _w8a8_block_fp8_matmul[launch_grid](
        A,
        B,
        out,
        As,
        Bs,
        M,
        N,
        K,
        block_n,
        block_k,
        A.stride(-2),
        A.stride(-1),
        B.stride(1),
        B.stride(0),
        out.stride(-2),
        out.stride(-1),
        As.stride(-2),
        As.stride(-1),
        Bs.stride(1),
        Bs.stride(0),
        **config,
    )

    return out
|
||||
|
||||
|
||||
def get_configs_compute_bound():
    """Enumerate every candidate kernel launch configuration.

    Returns:
        A list of dicts, one per combination of the hard-coded candidate
        values below, each with the keys `BLOCK_SIZE_M`, `BLOCK_SIZE_N`,
        `BLOCK_SIZE_K`, `GROUP_SIZE_M`, `num_warps` and `num_stages`.
        The list is ordered with `num_stages` varying slowest and
        `GROUP_SIZE_M` varying fastest.
    """
    from itertools import product

    # The argument order of product() fixes the enumeration order:
    # leftmost varies slowest, rightmost varies fastest.
    search_axes = product(
        [2, 3, 4, 5],  # num_stages
        [16, 32, 64, 128, 256],  # block_m
        [64, 128],  # block_k
        [32, 64, 128, 256],  # block_n
        [4, 8],  # num_warps
        [1, 16, 32, 64],  # group_size
    )
    return [{
        "BLOCK_SIZE_M": block_m,
        "BLOCK_SIZE_N": block_n,
        "BLOCK_SIZE_K": block_k,
        "GROUP_SIZE_M": group_size,
        "num_warps": num_warps,
        "num_stages": num_stages,
    } for num_stages, block_m, block_k, block_n, num_warps, group_size in
            search_axes]
|
||||
|
||||
|
||||
def get_weight_shapes(tp_size):
    """Return the `(N, K)` weight shapes to tune for a tensor-parallel size.

    Args:
        tp_size: Tensor-parallel size; the shardable dimension of each
            shape is floor-divided by it.

    Returns:
        A list of `(N, K)` tuples: the unsharded shapes first, then the
        shapes sharded along N, then the shapes sharded along K.

    Raises:
        ValueError: If `tp_size` is smaller than 1.
    """
    if tp_size < 1:
        raise ValueError(f"tp_size must be >= 1, got {tp_size}")

    # NOTE(HandH1998): The weight shapes only work for DeepSeek-V3.
    # Modify them if you tune for a different model.
    # cannot TP
    total = [
        (512 + 64, 7168),
        ((128 + 64) * 128, 7168),
        (128 * (128 + 128), 512),
        (7168, 16384),
        (7168, 18432),
    ]
    # N can TP
    n_tp = [
        (18432 * 2, 7168),
        ((128 + 64) * 128, 7168),
        (128 * (128 + 128), 512),
        (24576, 1536),
        (12288, 7168),
        (4096, 7168),
    ]
    # K can TP
    k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)]

    weight_shapes = list(total)
    weight_shapes.extend((n // tp_size, k) for n, k in n_tp)
    weight_shapes.extend((n, k // tp_size) for n, k in k_tp)
    return weight_shapes
|
||||
|
||||
|
||||
def benchmark_config(A,
                     B,
                     As,
                     Bs,
                     block_size,
                     config,
                     out_dtype=torch.float16,
                     num_iters=10):
    """Time one kernel configuration and return its mean latency.

    Runs five untimed warm-up launches, then times `num_iters` launches one
    by one with CUDA events.

    Args:
        A, B, As, Bs, block_size, config, out_dtype: Forwarded unchanged to
            `w8a8_block_matmul`.
        num_iters: Number of timed launches to average over.

    Returns:
        The mean latency of a single launch, in microseconds.
    """

    def run():
        w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)

    torch.cuda.synchronize()
    # JIT compilation & warmup
    for _ in range(5):
        run()
    torch.cuda.synchronize()

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    latencies: list[float] = []
    for _ in range(num_iters):
        torch.cuda.synchronize()
        start_event.record()
        run()
        end_event.record()
        end_event.synchronize()
        latencies.append(start_event.elapsed_time(end_event))

    # Each sample covers exactly one launch, so the mean is the sum divided
    # by num_iters (the previous extra factor of 10 under-reported the
    # latency tenfold). elapsed_time() is in milliseconds; x1000 gives us.
    avg = sum(latencies) / num_iters * 1000  # us
    return avg
|
||||
|
||||
|
||||
def tune(M, N, K, block_size, out_dtype, search_space, input_type):
    """Benchmark every config in `search_space` for one problem size.

    Random fp8 inputs of shape `(M, K)` and `(N, K)` plus random scales are
    created on the CUDA device, and each config is timed with
    `benchmark_config`.

    Args:
        M: Number of rows of the activation (reported as the batch size).
        N: Number of rows of the weight.
        K: Shared inner dimension.
        block_size: `[block_n, block_k]` quantization block size.
        out_dtype: dtype of the matmul output.
        search_space: Iterable of kernel config dicts to try.
        input_type: Only `"fp8"` is supported.

    Returns:
        The config with the lowest measured latency.

    Raises:
        RuntimeError: If `input_type` is not `"fp8"`.
        AssertionError: If no config could be benchmarked.
    """
    factor_for_scale = 1e-2

    if input_type == "fp8":
        fp8_info = torch.finfo(torch.float8_e4m3fn)
        fp8_max, fp8_min = fp8_info.max, fp8_info.min

        # Uniform values in [-fp8_max, fp8_max), clamped and cast to fp8.
        A_fp32 = (
            (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 *
            fp8_max)
        A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

        B_fp32 = (
            (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 *
            fp8_max)
        B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
    else:
        raise RuntimeError(
            "Currently, only support tune w8a8 block fp8 kernel.")

    block_n, block_k = block_size[0], block_size[1]
    # Ceiling division: number of quantization tiles along N and K.
    n_tiles = (N + block_n - 1) // block_n
    k_tiles = (K + block_k - 1) // block_k

    As = torch.rand(M, k_tiles, dtype=torch.float32,
                    device="cuda") * factor_for_scale
    Bs = (torch.rand(n_tiles, k_tiles, dtype=torch.float32, device="cuda") *
          factor_for_scale)

    best_config = None
    best_time = float("inf")
    # `tqdm` is imported as a module in this file, so the progress-bar class
    # must be referenced as `tqdm.tqdm`; calling the module raises TypeError.
    for config in tqdm.tqdm(search_space):
        try:
            kernel_time = benchmark_config(
                A,
                B,
                As,
                Bs,
                block_size,
                config,
                out_dtype,
                num_iters=10,
            )
        except triton.runtime.autotuner.OutOfResources:
            # Some configurations may be invalid and fail to compile.
            continue

        if kernel_time < best_time:
            best_time = kernel_time
            best_config = config
    now = datetime.now()
    print(f"{now.ctime()}] Completed tuning for batch_size={M}")
    assert best_config is not None
    return best_config
|
||||
|
||||
|
||||
def save_configs(
    N,
    K,
    block_n,
    block_k,
    configs,
    save_path,
    input_type="fp8",
) -> None:
    """Write the tuned configs as a JSON file inside `save_path`.

    The directory is created if needed. The file name is derived from `N`,
    `K`, the current device name (spaces replaced by underscores),
    `input_type` and the block shape.

    Args:
        N: Weight rows the configs were tuned for.
        K: Inner dimension the configs were tuned for.
        block_n: Quantization block size along N.
        block_k: Quantization block size along K.
        configs: JSON-serializable object to dump.
        save_path: Output directory.
        input_type: Input type label used in the file name.
    """
    os.makedirs(save_path, exist_ok=True)

    device_name = current_platform.get_device_name().replace(" ", "_")
    json_file_name = (
        f"N={N},K={K},device_name={device_name},dtype={input_type}_w8a8,"
        f"block_shape=[{block_n},{block_k}].json")
    config_file_path = os.path.join(save_path, json_file_name)

    print(f"Writing best config to {config_file_path}...")

    # Serialize first, then emit the document and its trailing newline in
    # a single write.
    serialized = json.dumps(configs, indent=4)
    with open(config_file_path, "w") as f:
        f.write(serialized + "\n")
|
||||
|
||||
|
||||
def tune_on_gpu(args_dict):
    """Run tuning on a specific GPU.

    Args:
        args_dict: Dict with the keys `gpu_id` (CUDA device index),
            `batch_sizes` (batch sizes this worker tunes), `weight_shapes`
            (list of `(N, K)` shapes) and `args` (parsed CLI arguments
            providing `block_n`, `block_k`, `out_dtype`, `save_path` and
            `input_type`).

    NOTE(review): the file name built by `save_configs` contains neither the
    GPU id nor the batch sizes, so workers tuning the same weight shape
    appear to write to the same path. Confirm whether per-worker results are
    meant to be merged.
    """
    gpu_id = args_dict["gpu_id"]
    batch_sizes = args_dict["batch_sizes"]
    weight_shapes = args_dict["weight_shapes"]
    args = args_dict["args"]

    torch.cuda.set_device(gpu_id)
    print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")

    block_n = args.block_n
    block_k = args.block_k
    out_dtype = DTYPE_MAP[args.out_dtype]
    save_path = args.save_path
    input_type = args.input_type

    # Keep only configs whose BLOCK_SIZE_K evenly divides the quantization
    # block size along K.
    search_space = [
        config for config in get_configs_compute_bound()
        if block_k % config["BLOCK_SIZE_K"] == 0
    ]

    start = time.time()
    # `tqdm` is imported as a module in this file, so the progress-bar class
    # must be referenced as `tqdm.tqdm`; calling the module raises TypeError.
    for shape in tqdm.tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
        N, K = shape[0], shape[1]
        print(f"[GPU {gpu_id}] Tune for weight shape of `N: {N}, K: {K}`")
        benchmark_results = [
            tune(
                batch_size,
                N,
                K,
                [block_n, block_k],
                out_dtype,
                search_space,
                input_type,
            ) for batch_size in tqdm.tqdm(batch_sizes,
                                          desc=f"GPU {gpu_id} - Batch sizes")
        ]
        # Map each batch size to the best config found for it.
        best_configs = dict(zip(batch_sizes, benchmark_results))
        save_configs(N, K, block_n, block_k, best_configs, save_path,
                     input_type)

    end = time.time()
    print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
|
||||
|
||||
|
||||
def distribute_batch_sizes(batch_sizes, num_gpus):
    """Split ``batch_sizes`` into ``num_gpus`` contiguous slices, in order.

    Slice ``i`` covers the half-open index range
    ``[i * n // num_gpus, (i + 1) * n // num_gpus)`` with
    ``n = len(batch_sizes)``.  If ``num_gpus`` is larger than ``n`` some of
    the returned slices are empty.
    """
    return [
        batch_sizes[gpu * len(batch_sizes) // num_gpus:
                    (gpu + 1) * len(batch_sizes) // num_gpus]
        for gpu in range(num_gpus)
    ]
|
||||
|
||||
|
||||
def main(args):
    """Run the tuning across the available GPUs, one worker process per GPU.

    The batch sizes (a built-in sweep, or the single ``args.batch_size`` if
    given) are split between the workers; every worker receives the full
    list of weight shapes from ``get_weight_shapes(args.tp_size)``.

    Raises:
        RuntimeError: if no CUDA device is visible.
    """
    print(args)
    num_gpus = torch.cuda.device_count()
    if num_gpus == 0:
        raise RuntimeError("No GPU available for tuning")
    print(f"Found {num_gpus} GPUs for parallel tuning")

    torch.cuda.init()

    if args.batch_size is None:
        batch_sizes = [
            1,
            2,
            4,
            8,
            16,
            24,
            32,
            48,
            64,
            96,
            128,
            256,
            512,
            1024,
            1536,
            2048,
            3072,
            4096,
        ]
    else:
        batch_sizes = [args.batch_size]

    # Never start more workers than there are batch sizes: a worker with an
    # empty share would tune nothing and still write an empty config dict
    # for every weight shape.  This also reduces the single-batch-size case
    # to one GPU.
    num_gpus = min(num_gpus, len(batch_sizes))

    weight_shapes = get_weight_shapes(args.tp_size)

    batches_per_gpu = distribute_batch_sizes(batch_sizes, num_gpus)

    process_args = [
        {
            "gpu_id": gpu_id,
            "batch_sizes": batches_per_gpu[gpu_id],
            # Each GPU processes all weight shapes
            "weight_shapes": weight_shapes,
            "args": args,
        }
        for gpu_id in range(num_gpus)
    ]

    # The worker pool is created from the "spawn" start-method context.
    ctx = mp.get_context("spawn")
    with ctx.Pool(num_gpus) as pool:
        pool.map(tune_on_gpu, process_args)

    print("Multi-GPU tuning completed")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: build the command-line parser, parse the options
    # and pass them to main().
    parser = FlexibleArgumentParser(
        description="""
Tune triton w8a8 block fp8 for DeepSeek-V3/DeepSeek-R1:
python3 benchmark_w8a8_block_fp8.py --tp-size 8 --input-type fp8
Then copy to model_executor/layers/quantization/utils/configs
""",
        formatter_class=argparse.RawTextHelpFormatter)

    # Passed to get_weight_shapes() in main().
    parser.add_argument("--tp-size", "-tp", type=int, default=8)
    # Only "fp8" is accepted; the value also appears in the output file name.
    parser.add_argument("--input-type",
                        type=str,
                        choices=["fp8"],
                        default="fp8")
    # Looked up in DTYPE_MAP by the tuning worker.
    parser.add_argument(
        "--out-dtype",
        type=str,
        choices=["float32", "float16", "bfloat16", "half"],
        default="float16",
    )
    # Block shape [block_n, block_k] handed to tune() and used in the
    # output file name.
    parser.add_argument("--block-n", type=int, default=128)
    parser.add_argument("--block-k", type=int, default=128)
    # Optional: tune a single batch size instead of the built-in sweep.
    parser.add_argument("--batch-size", type=int, required=False)
    # Directory the resulting JSON config files are written to.
    parser.add_argument("--save-path", type=str, default="./")
    args = parser.parse_args()

    main(args)
|
||||
@ -24,7 +24,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
|
||||
// sum of squares
|
||||
float ss = 0.0f;
|
||||
|
||||
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
float x = static_cast<float>(input[token_offset + i]);
|
||||
if constexpr (has_residual) {
|
||||
x += static_cast<float>(residual[token_offset + i]);
|
||||
@ -58,7 +58,7 @@ __device__ void compute_dynamic_per_token_scales(
|
||||
constexpr scalar_out_t qmax{std::numeric_limits<scalar_out_t>::max()};
|
||||
|
||||
float block_absmax_val_maybe = 0.0f;
|
||||
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
float x = static_cast<float>(input[token_offset + i]);
|
||||
if constexpr (has_residual) {
|
||||
x += static_cast<float>(residual[token_offset + i]);
|
||||
@ -103,7 +103,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
;
|
||||
|
||||
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
float x = static_cast<float>(input[token_offset + i]);
|
||||
if constexpr (has_residual) {
|
||||
x += static_cast<float>(residual[token_offset + i]);
|
||||
@ -142,7 +142,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
|
||||
int32_t const num_vec_elems = hidden_size >> 2;
|
||||
|
||||
#pragma unroll 4
|
||||
for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
|
||||
for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
|
||||
vec4_t<scalar_t> in = vec_input[i];
|
||||
|
||||
vec4_t<float> x;
|
||||
@ -206,7 +206,7 @@ __device__ void compute_dynamic_per_token_scales(
|
||||
float block_absmax_val_maybe = 0.0f;
|
||||
|
||||
#pragma unroll 4
|
||||
for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
|
||||
for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
|
||||
vec4_t<scalar_t> in = vec_input[i];
|
||||
vec4_t<scalar_t> const w = vec_weight[i];
|
||||
|
||||
@ -286,7 +286,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
|
||||
// TODO(luka/varun) extract into type-agnostic vectorized quant function to
|
||||
// replace scaled_fp8_conversion_vec
|
||||
#pragma unroll 4
|
||||
for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
|
||||
for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
|
||||
vec4_t<scalar_t> const in = vec_input[i];
|
||||
vec4_t<scalar_t> const w = vec_weight[i];
|
||||
|
||||
|
||||
@ -101,10 +101,10 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const auto i = blockIdx.x;
|
||||
const int i = blockIdx.x;
|
||||
const block_q2_K * x = (const block_q2_K *) vx;
|
||||
|
||||
const auto tid = threadIdx.x;
|
||||
const int tid = threadIdx.x;
|
||||
const int n = tid/32;
|
||||
const int l = tid - 32*n;
|
||||
const int is = 8*n + l/16;
|
||||
@ -123,10 +123,10 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const auto i = blockIdx.x;
|
||||
const int i = blockIdx.x;
|
||||
const block_q3_K * x = (const block_q3_K *) vx;
|
||||
|
||||
const auto r = threadIdx.x/4;
|
||||
const int r = threadIdx.x/4;
|
||||
const int tid = r/2;
|
||||
const int is0 = r%2;
|
||||
const int l0 = 16*is0 + 4*(threadIdx.x%4);
|
||||
@ -164,10 +164,10 @@ template<typename dst_t>
|
||||
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const block_q4_K * x = (const block_q4_K *) vx;
|
||||
|
||||
const auto i = blockIdx.x;
|
||||
const int i = blockIdx.x;
|
||||
|
||||
// assume 32 threads
|
||||
const auto tid = threadIdx.x;
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8;
|
||||
const int ir = tid%8;
|
||||
const int is = 2*il;
|
||||
@ -197,10 +197,10 @@ template<typename dst_t>
|
||||
static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const block_q5_K * x = (const block_q5_K *) vx;
|
||||
|
||||
const auto i = blockIdx.x;
|
||||
const int i = blockIdx.x;
|
||||
|
||||
// assume 64 threads - this is very slightly better than the one below
|
||||
const auto tid = threadIdx.x;
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/16; // il is in 0...3
|
||||
const int ir = tid%16; // ir is in 0...15
|
||||
const int is = 2*il; // is is in 0...6
|
||||
@ -231,10 +231,10 @@ template<typename dst_t>
|
||||
static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const block_q6_K * x = (const block_q6_K *) vx;
|
||||
|
||||
const auto i = blockIdx.x;
|
||||
const int i = blockIdx.x;
|
||||
|
||||
// assume 64 threads - this is very slightly better than the one below
|
||||
const auto tid = threadIdx.x;
|
||||
const int tid = threadIdx.x;
|
||||
const int ip = tid/32; // ip is 0 or 1
|
||||
const int il = tid - 32*ip; // 0...32
|
||||
const int is = 8*ip + il/16;
|
||||
@ -256,10 +256,10 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const auto i = blockIdx.x;
|
||||
const int i = blockIdx.x;
|
||||
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
||||
|
||||
const auto tid = threadIdx.x;
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@ -275,10 +275,10 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const auto i = blockIdx.x;
|
||||
const int i = blockIdx.x;
|
||||
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
||||
|
||||
const auto tid = threadIdx.x;
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@ -293,10 +293,10 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const auto i = blockIdx.x;
|
||||
const int i = blockIdx.x;
|
||||
const block_iq2_s * x = (const block_iq2_s *) vx;
|
||||
|
||||
const auto tid = threadIdx.x;
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@ -309,10 +309,10 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const auto i = blockIdx.x;
|
||||
const int i = blockIdx.x;
|
||||
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
||||
|
||||
const auto tid = threadIdx.x;
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@ -332,10 +332,10 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const auto i = blockIdx.x;
|
||||
const int i = blockIdx.x;
|
||||
const block_iq3_s * x = (const block_iq3_s *) vx;
|
||||
|
||||
const auto tid = threadIdx.x;
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@ -399,10 +399,10 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const auto i = blockIdx.x;
|
||||
const int i = blockIdx.x;
|
||||
const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
|
||||
|
||||
const auto tid = threadIdx.x;
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
||||
@ -417,10 +417,10 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const auto i = blockIdx.x;
|
||||
const int i = blockIdx.x;
|
||||
const block_iq4_xs * x = (const block_iq4_xs *)vx;
|
||||
|
||||
const auto tid = threadIdx.x;
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
||||
@ -565,4 +565,4 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(int64_t type) {
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -19,11 +19,11 @@ template <typename scalar_t>
|
||||
static __global__ void quantize_q8_1(const scalar_t* __restrict__ x,
|
||||
void* __restrict__ vy, const int kx,
|
||||
const int kx_padded) {
|
||||
const auto ix = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int ix = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (ix >= kx_padded) {
|
||||
return;
|
||||
}
|
||||
const auto iy = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
const int iy = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
const int i_padded = iy * kx_padded + ix;
|
||||
|
||||
block_q8_1* y = (block_q8_1*)vy;
|
||||
|
||||
@ -14,10 +14,10 @@ static __device__ __forceinline__ void mul_mat_q(
|
||||
|
||||
const int & ncols_dst = ncols_y;
|
||||
|
||||
const auto row_dst_0 = blockIdx.x*mmq_y;
|
||||
const int row_dst_0 = blockIdx.x*mmq_y;
|
||||
const int & row_x_0 = row_dst_0;
|
||||
|
||||
const auto col_dst_0 = blockIdx.y*mmq_x;
|
||||
const int col_dst_0 = blockIdx.y*mmq_x;
|
||||
const int & col_y_0 = col_dst_0;
|
||||
|
||||
int * tile_x_ql = nullptr;
|
||||
@ -39,7 +39,7 @@ static __device__ __forceinline__ void mul_mat_q(
|
||||
|
||||
#pragma unroll
|
||||
for (int ir = 0; ir < qr && ib0 + ir * blocks_per_warp/qr < blocks_per_row_x; ++ir) {
|
||||
const auto kqs = ir*WARP_SIZE_GGUF + threadIdx.x;
|
||||
const int kqs = ir*WARP_SIZE_GGUF + threadIdx.x;
|
||||
const int kbxd = kqs / QI8_1;
|
||||
|
||||
#pragma unroll
|
||||
@ -53,7 +53,7 @@ static __device__ __forceinline__ void mul_mat_q(
|
||||
#pragma unroll
|
||||
for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
|
||||
const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE_GGUF/QI8_1)) % mmq_x;
|
||||
const auto kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1);
|
||||
const int kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1);
|
||||
const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
|
||||
|
||||
// if the sum is not needed it's faster to transform the scale to f32 ahead of time
|
||||
@ -87,14 +87,14 @@ static __device__ __forceinline__ void mul_mat_q(
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < mmq_x; j += nwarps) {
|
||||
const auto col_dst = col_dst_0 + j + threadIdx.y;
|
||||
const int col_dst = col_dst_0 + j + threadIdx.y;
|
||||
if (col_dst >= ncols_dst) {
|
||||
return;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
|
||||
const auto row_dst = row_dst_0 + threadIdx.x + i;
|
||||
const int row_dst = row_dst_0 + threadIdx.x + i;
|
||||
if (row_dst >= nrows_dst) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
// copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu
|
||||
template <typename scalar_t, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
||||
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, const int ncols, const int nrows) {
|
||||
const auto row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
|
||||
if (row >= nrows) {
|
||||
return;
|
||||
@ -16,7 +16,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
|
||||
const block_q_t * x = (const block_q_t *) vx;
|
||||
const block_q8_1 * y = (const block_q8_1 *) vy;
|
||||
|
||||
for (auto i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
|
||||
for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
|
||||
const int ibx = row*blocks_per_row + i; // x block index
|
||||
|
||||
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
||||
|
||||
@ -19,10 +19,10 @@ static __device__ __forceinline__ void moe_q(
|
||||
|
||||
const int ncols_dst = ncols_y * top_k;
|
||||
|
||||
const auto row_dst_0 = blockIdx.x * mmq_y;
|
||||
const int row_dst_0 = blockIdx.x * mmq_y;
|
||||
const int& row_x_0 = row_dst_0;
|
||||
|
||||
const auto col_dst_0 = blockIdx.y * mmq_x;
|
||||
const int col_dst_0 = blockIdx.y * mmq_x;
|
||||
|
||||
int token_offs[mmq_x / nwarps];
|
||||
for (int i = 0; i < mmq_x; i += nwarps) {
|
||||
@ -56,7 +56,7 @@ static __device__ __forceinline__ void moe_q(
|
||||
const int n_per_r = ((qk * blocks_per_warp) / qr);
|
||||
#pragma unroll
|
||||
for (int ir = 0; ir < qr && ib0 * qk + ir * n_per_r < ncols_x; ++ir) {
|
||||
const auto kqs = ir * WARP_SIZE_GGUF + threadIdx.x;
|
||||
const int kqs = ir * WARP_SIZE_GGUF + threadIdx.x;
|
||||
const int kbxd = kqs / QI8_1;
|
||||
|
||||
#pragma unroll
|
||||
@ -73,7 +73,7 @@ static __device__ __forceinline__ void moe_q(
|
||||
}
|
||||
|
||||
if (threadIdx.x < n_per_r / QK8_1) {
|
||||
const auto kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1);
|
||||
const int kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1);
|
||||
const int col_y_eff = token_offs[threadIdx.y] / top_k;
|
||||
const int block_x =
|
||||
ib0 * (qk / QK8_1) + ir * (WARP_SIZE_GGUF / QI8_1) + kby;
|
||||
@ -119,7 +119,7 @@ static __device__ __forceinline__ void moe_q(
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
|
||||
const auto row_dst = row_dst_0 + threadIdx.x + i;
|
||||
const int row_dst = row_dst_0 + threadIdx.x + i;
|
||||
if (row_dst >= nrows_dst) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -199,12 +199,12 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
|
||||
MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
auto t = threadIdx.x;
|
||||
int t = threadIdx.x;
|
||||
|
||||
// Block
|
||||
auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
|
||||
auto offset_m = blockIdx.y * m_count;
|
||||
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
|
||||
int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
|
||||
int offset_m = blockIdx.y * m_count;
|
||||
int offset_k = blockIdx.z * BLOCK_KN_SIZE;
|
||||
|
||||
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
|
||||
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
|
||||
@ -337,12 +337,12 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
|
||||
MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
auto t = threadIdx.x;
|
||||
int t = threadIdx.x;
|
||||
|
||||
// Block
|
||||
auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
|
||||
auto offset_m = blockIdx.y * m_count;
|
||||
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
|
||||
int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
|
||||
int offset_m = blockIdx.y * m_count;
|
||||
int offset_k = blockIdx.z * BLOCK_KN_SIZE;
|
||||
|
||||
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
|
||||
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
|
||||
@ -458,12 +458,12 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
|
||||
MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
auto t = threadIdx.x;
|
||||
int t = threadIdx.x;
|
||||
|
||||
// Block
|
||||
auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
|
||||
auto offset_m = blockIdx.y * m_count;
|
||||
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
|
||||
int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
|
||||
int offset_m = blockIdx.y * m_count;
|
||||
int offset_k = blockIdx.z * BLOCK_KN_SIZE;
|
||||
|
||||
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
|
||||
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
|
||||
@ -586,12 +586,12 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
|
||||
MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
auto t = threadIdx.x;
|
||||
int t = threadIdx.x;
|
||||
|
||||
// Block
|
||||
auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
|
||||
auto offset_m = blockIdx.y * m_count;
|
||||
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
|
||||
int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
|
||||
int offset_m = blockIdx.y * m_count;
|
||||
int offset_k = blockIdx.z * BLOCK_KN_SIZE;
|
||||
|
||||
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
|
||||
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
|
||||
@ -765,14 +765,14 @@ __global__ void reconstruct_exllama_8bit_kernel(
|
||||
MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
int offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
|
||||
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
|
||||
|
||||
// Preload remapping table
|
||||
__shared__ int perm[BLOCK_KN_SIZE];
|
||||
auto t = threadIdx.x;
|
||||
int t = threadIdx.x;
|
||||
|
||||
if (b_q_perm) {
|
||||
if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
|
||||
@ -862,14 +862,14 @@ __global__ void reconstruct_exllama_4bit_kernel(
|
||||
MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
int offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
|
||||
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
|
||||
|
||||
// Preload remapping table
|
||||
__shared__ int perm[BLOCK_KN_SIZE];
|
||||
auto t = threadIdx.x;
|
||||
int t = threadIdx.x;
|
||||
|
||||
if (b_q_perm) {
|
||||
if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
|
||||
@ -967,14 +967,14 @@ __global__ void reconstruct_exllama_3bit_kernel(
|
||||
MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
int offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
|
||||
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
|
||||
|
||||
// Preload remapping table
|
||||
__shared__ int perm[BLOCK_KN_SIZE];
|
||||
auto t = threadIdx.x;
|
||||
int t = threadIdx.x;
|
||||
|
||||
if (b_q_perm) {
|
||||
if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
|
||||
@ -1065,14 +1065,14 @@ __global__ void reconstruct_exllama_2bit_kernel(
|
||||
MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
int offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
|
||||
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
|
||||
|
||||
// Preload remapping table
|
||||
__shared__ int perm[BLOCK_KN_SIZE];
|
||||
auto t = threadIdx.x;
|
||||
int t = threadIdx.x;
|
||||
|
||||
if (b_q_perm) {
|
||||
if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
|
||||
@ -1181,11 +1181,11 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
|
||||
int zero_width = width / 8;
|
||||
int vec_height = height * 4;
|
||||
const int blockwidth2 = BLOCK_KN_SIZE / 2;
|
||||
auto b = blockIdx.y * BLOCK_M_SIZE_MAX;
|
||||
int b = blockIdx.y * BLOCK_M_SIZE_MAX;
|
||||
int b_end = min(BLOCK_M_SIZE_MAX, batch - b);
|
||||
auto h = BLOCK_KN_SIZE * blockIdx.z / 8;
|
||||
int h = BLOCK_KN_SIZE * blockIdx.z / 8;
|
||||
int h_end = min(BLOCK_KN_SIZE / 8, height - h) * 4;
|
||||
auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
|
||||
__shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
|
||||
if (threadIdx.x < h_end) {
|
||||
@ -1197,8 +1197,8 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
|
||||
}
|
||||
|
||||
__shared__ half2 deq2[256][8];
|
||||
auto val = threadIdx.x / 8;
|
||||
auto off = threadIdx.x % 8;
|
||||
int val = threadIdx.x / 8;
|
||||
int off = threadIdx.x % 8;
|
||||
for (; val < 256; val += BLOCK_KN_SIZE / 8) {
|
||||
deq2[val][off] =
|
||||
__halves2half2(__int2half_rn(val & 0xF), __int2half_rn(val >> 4));
|
||||
@ -1280,11 +1280,11 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
|
||||
int zero_width = width / 4;
|
||||
int vec_height = height * 2;
|
||||
const int blockwidth2 = BLOCK_KN_SIZE / 2;
|
||||
auto b = blockIdx.y * BLOCK_M_SIZE_MAX;
|
||||
int b = blockIdx.y * BLOCK_M_SIZE_MAX;
|
||||
int b_end = min(BLOCK_M_SIZE_MAX, batch - b);
|
||||
auto h = BLOCK_KN_SIZE * blockIdx.z / 4;
|
||||
int h = BLOCK_KN_SIZE * blockIdx.z / 4;
|
||||
int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2;
|
||||
auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
|
||||
__shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
|
||||
if (threadIdx.x < h_end) {
|
||||
@ -1393,8 +1393,8 @@ __global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
|
||||
half* __restrict__ out) {
|
||||
// Start of block
|
||||
|
||||
auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
auto row = blockIdx.y * 32 / bit;
|
||||
int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
int row = blockIdx.y * 32 / bit;
|
||||
if (column >= width) return;
|
||||
|
||||
// Views
|
||||
@ -1425,8 +1425,8 @@ __global__ void reconstruct_gptq_3bit_kernel(
|
||||
const int height, const int width, const int group,
|
||||
half* __restrict__ out) {
|
||||
// Start of block
|
||||
auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
auto row = blockIdx.y * 32;
|
||||
int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
int row = blockIdx.y * 32;
|
||||
if (column >= width) return;
|
||||
|
||||
// Views
|
||||
@ -1542,7 +1542,7 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
|
||||
|
||||
__global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight,
|
||||
const int size_k, const int size_n) {
|
||||
auto n = blockIdx.x * THREADS_X + threadIdx.x;
|
||||
int n = blockIdx.x * THREADS_X + threadIdx.x;
|
||||
if (n >= size_n) return;
|
||||
int k = 0;
|
||||
uint32_t* b_ptr = b_q_weight + n;
|
||||
@ -1555,7 +1555,7 @@ __global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight,
|
||||
|
||||
__global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight,
|
||||
const int size_k, const int size_n) {
|
||||
auto n = blockIdx.x * THREADS_X + threadIdx.x;
|
||||
int n = blockIdx.x * THREADS_X + threadIdx.x;
|
||||
if (n >= size_n) return;
|
||||
int k = 0;
|
||||
uint32_t* b_ptr = b_q_weight + n;
|
||||
@ -1568,7 +1568,7 @@ __global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight,
|
||||
|
||||
__global__ void shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight,
|
||||
const int size_k, const int size_n) {
|
||||
auto n = blockIdx.x * THREADS_X + threadIdx.x;
|
||||
int n = blockIdx.x * THREADS_X + threadIdx.x;
|
||||
if (n >= size_n) return;
|
||||
int k = 0;
|
||||
uint32_t* b_ptr = b_q_weight + n;
|
||||
@ -1581,7 +1581,7 @@ __global__ void shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight,
|
||||
|
||||
__global__ void shuffle_3bit_kernel(uint32_t* __restrict__ b_q_weight,
|
||||
const int size_k, const int size_n) {
|
||||
auto n = blockIdx.x * THREADS_X + threadIdx.x;
|
||||
int n = blockIdx.x * THREADS_X + threadIdx.x;
|
||||
if (n >= size_n) return;
|
||||
int k = 0;
|
||||
uint32_t* b_ptr = b_q_weight + n;
|
||||
@ -1599,9 +1599,9 @@ __global__ void make_sequential_4bit_kernel(const uint32_t* __restrict__ w,
|
||||
const uint64_t* w2 = (uint64_t*)w;
|
||||
uint64_t* w_new2 = (uint64_t*)w_new;
|
||||
int w2_stride = w_width >> 1;
|
||||
auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
|
||||
int w2_column = THREADS_X * blockIdx.x + threadIdx.x;
|
||||
if (w2_column >= w2_stride) return;
|
||||
auto w_new2_row = blockIdx.y;
|
||||
int w_new2_row = blockIdx.y;
|
||||
int q_perm_idx = w_new2_row << 3;
|
||||
uint64_t dst = 0;
|
||||
|
||||
@ -1630,9 +1630,9 @@ __global__ void make_sequential_2bit_kernel(const uint32_t* __restrict__ w,
|
||||
const uint64_t* w2 = (uint64_t*)w;
|
||||
uint64_t* w_new2 = (uint64_t*)w_new;
|
||||
int w2_stride = w_width >> 1;
|
||||
auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
|
||||
int w2_column = THREADS_X * blockIdx.x + threadIdx.x;
|
||||
if (w2_column >= w2_stride) return;
|
||||
auto w_new2_row = blockIdx.y;
|
||||
int w_new2_row = blockIdx.y;
|
||||
int q_perm_idx = w_new2_row << 4;
|
||||
uint64_t dst = 0;
|
||||
|
||||
@ -1658,10 +1658,10 @@ __global__ void make_sequential_3bit_kernel(const uint32_t* __restrict__ w,
|
||||
uint32_t* __restrict__ w_new,
|
||||
const int* __restrict__ q_perm,
|
||||
const int w_width) {
|
||||
auto w_column = THREADS_X * blockIdx.x + threadIdx.x;
|
||||
int w_column = THREADS_X * blockIdx.x + threadIdx.x;
|
||||
if (w_column >= w_width) return;
|
||||
auto w_new_row = blockIdx.y * 3;
|
||||
auto q_perm_idx = blockIdx.y << 5;
|
||||
int w_new_row = blockIdx.y * 3;
|
||||
int q_perm_idx = blockIdx.y << 5;
|
||||
uint32_t dst[3] = {0, 0, 0};
|
||||
|
||||
#pragma unroll
|
||||
@ -1744,9 +1744,9 @@ __global__ void make_sequential_8bit_kernel(const uint32_t* __restrict__ w,
|
||||
const uint64_t* w2 = (uint64_t*)w;
|
||||
uint64_t* w_new2 = (uint64_t*)w_new;
|
||||
int w2_stride = w_width >> 1;
|
||||
auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
|
||||
int w2_column = THREADS_X * blockIdx.x + threadIdx.x;
|
||||
if (w2_column >= w2_stride) return;
|
||||
auto w_new2_row = blockIdx.y;
|
||||
int w_new2_row = blockIdx.y;
|
||||
int q_perm_idx = w_new2_row << 2;
|
||||
uint64_t dst = 0;
|
||||
|
||||
|
||||
@ -55,11 +55,11 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
|
||||
this_block_B_base_ptr = params.B_ptr + blockIdx.y * Ntile * params.K +
|
||||
blockIdx.z * params.SplitK * 4;
|
||||
|
||||
const auto lane_id = threadIdx.x % WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
|
||||
// For matrix A, a block load/store Mtile(row) x 32(col) elements in
|
||||
// multiple iters, 8x4 warp load/store 8(row) x 32(col) elements per iter
|
||||
const auto Aldg_row_base_idx = threadIdx.x / 4;
|
||||
const int Aldg_row_base_idx = threadIdx.x / 4;
|
||||
Aldg_col_idx = (threadIdx.x % 4) * LDG_ELEMENT_CNT_A;
|
||||
const int Aldg_base_offset = Aldg_row_base_idx * params.K + Aldg_col_idx;
|
||||
|
||||
@ -67,7 +67,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
|
||||
// elements of N32K16 packing in multiple iters, 4x8 warp load/store 4(row)
|
||||
// * 128(col) per iter
|
||||
Bldg_col_idx = (threadIdx.x % 8) * LDG_ELEMENT_CNT_B;
|
||||
const auto Bldg_row_base_idx = threadIdx.x / 8;
|
||||
const int Bldg_row_base_idx = threadIdx.x / 8;
|
||||
const int Bldg_base_offset =
|
||||
Bldg_row_base_idx * params.K * 4 + Bldg_col_idx;
|
||||
|
||||
@ -89,7 +89,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
|
||||
B_ldg_guard = 0;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < (Mtile + M_SIZE_ONE_LOAD - 1) / M_SIZE_ONE_LOAD; ++i) {
|
||||
auto m_idx = blockIdx.x * Mtile + Aldg_row_base_idx + i * M_SIZE_ONE_LOAD;
|
||||
int m_idx = blockIdx.x * Mtile + Aldg_row_base_idx + i * M_SIZE_ONE_LOAD;
|
||||
if (m_idx < params.M) {
|
||||
A_ldg_guard |= (1u << i);
|
||||
}
|
||||
@ -98,8 +98,8 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
|
||||
const int N_padded = (params.N + 31) / 32 * 32;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < (Ntile + N_SIZE_ONE_LOAD - 1) / N_SIZE_ONE_LOAD; ++i) {
|
||||
auto n_idx = blockIdx.y * Ntile + (Bldg_row_base_idx / 8) * 32 +
|
||||
i * N_SIZE_ONE_LOAD;
|
||||
int n_idx = blockIdx.y * Ntile + (Bldg_row_base_idx / 8) * 32 +
|
||||
i * N_SIZE_ONE_LOAD;
|
||||
if (n_idx < N_padded) {
|
||||
B_ldg_guard |= (1u << i);
|
||||
}
|
||||
@ -355,7 +355,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
|
||||
__device__ void fused_splitk_reduce() {
|
||||
// need splitk-reduce if enable splitk
|
||||
if (gridDim.z > 1) {
|
||||
auto blk_red_idx = blockIdx.x * gridDim.y + blockIdx.y;
|
||||
int blk_red_idx = blockIdx.x * gridDim.y + blockIdx.y;
|
||||
// Wait for all previous blocks in the splitk direction to accumulate the
|
||||
// results into C_tmp
|
||||
if (threadIdx.x == 0) {
|
||||
@ -371,7 +371,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
auto C_tmp_base_offset = blk_red_idx * Mtile * Ntile + threadIdx.x * 4;
|
||||
int C_tmp_base_offset = blk_red_idx * Mtile * Ntile + threadIdx.x * 4;
|
||||
if (blockIdx.z != 0) {
|
||||
// expecting that temporary register here reuses the previous A&B frag
|
||||
// register
|
||||
@ -456,7 +456,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
|
||||
|
||||
FType* C_base_ptr = this_block_C_base_ptr + store_c_base_offset;
|
||||
// C_tile lds and stg
|
||||
auto m_base_idx = store_c_row_base_idx + blockIdx.x * Mtile;
|
||||
int m_base_idx = store_c_row_base_idx + blockIdx.x * Mtile;
|
||||
bool n_guard = (store_c_col_idx + blockIdx.y * Ntile) < params.N;
|
||||
if (WARP_NTILE == 32) {
|
||||
int lds_c_base_offset = warp_id * Mtile * WARP_NTILE +
|
||||
@ -580,9 +580,9 @@ __global__ void __launch_bounds__(BLOCK)
|
||||
int sts_stage_idx = 0;
|
||||
int lds_stage_idx = 0;
|
||||
|
||||
auto tb_k_slice = blockIdx.z * params.SplitK + params.SplitK <= params.K
|
||||
? params.SplitK
|
||||
: params.K - blockIdx.z * params.SplitK;
|
||||
int tb_k_slice = blockIdx.z * params.SplitK + params.SplitK <= params.K
|
||||
? params.SplitK
|
||||
: params.K - blockIdx.z * params.SplitK;
|
||||
int k_tiles = (tb_k_slice + 31) / 32;
|
||||
int first_k_tile = tb_k_slice - (k_tiles - 1) * 32;
|
||||
|
||||
@ -777,13 +777,13 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
|
||||
const QT* qdata, const FT* scales, const FT* zeros, FT* fdata,
|
||||
const int N_32align, const int N, const int K) {
|
||||
__shared__ FT smem[64 * 32];
|
||||
auto warp_id = threadIdx.x / 32;
|
||||
auto lane_id = threadIdx.x % 32;
|
||||
const auto src_row_idx = blockIdx.x * 8 + lane_id / 4;
|
||||
int warp_id = threadIdx.x / 32;
|
||||
int lane_id = threadIdx.x % 32;
|
||||
const int src_row_idx = blockIdx.x * 8 + lane_id / 4;
|
||||
const int src_col_idx =
|
||||
blockIdx.y * 64 * 4 + warp_id * 16 * 4 + (lane_id % 4) * 16;
|
||||
const int src_offset = src_row_idx * K * 4 + src_col_idx;
|
||||
auto params_nidx = blockIdx.x * 32 + (lane_id / 4) * 4;
|
||||
int params_nidx = blockIdx.x * 32 + (lane_id / 4) * 4;
|
||||
|
||||
QT qval_reg[16];
|
||||
const QT* pdata = qdata + src_offset;
|
||||
@ -829,8 +829,8 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
|
||||
*reinterpret_cast<uint4*>(smem + lds_base_offset + i * 32 * 32);
|
||||
}
|
||||
|
||||
const auto dst_row_base_kidx = blockIdx.y * 64 + threadIdx.x / 4;
|
||||
const auto dst_col_nidx = blockIdx.x * 32 + (threadIdx.x % 4) * 8;
|
||||
const int dst_row_base_kidx = blockIdx.y * 64 + threadIdx.x / 4;
|
||||
const int dst_col_nidx = blockIdx.x * 32 + (threadIdx.x % 4) * 8;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
int dst_row_kidx = dst_row_base_kidx + i * 32;
|
||||
@ -1008,4 +1008,4 @@ torch::Tensor allspark_w8a16_gemm(
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||
m.impl("allspark_w8a16_gemm", &allspark_w8a16_gemm);
|
||||
}
|
||||
}
|
||||
@ -13,8 +13,8 @@ __global__ void __launch_bounds__(128)
|
||||
const uint8_t* B, const FType* B_scale, const FType* B_zero,
|
||||
uint8_t* B_result, FType* B_scale_result, FType* B_zero_result,
|
||||
const int K, const int N, const int N_32align) {
|
||||
const auto lane_id = threadIdx.x % 32;
|
||||
const auto warp_id = threadIdx.x / 32;
|
||||
const int lane_id = threadIdx.x % 32;
|
||||
const int warp_id = threadIdx.x / 32;
|
||||
|
||||
if (blockIdx.x != gridDim.x - 1) {
|
||||
// Load B
|
||||
@ -50,7 +50,7 @@ __global__ void __launch_bounds__(128)
|
||||
}
|
||||
|
||||
// Store B
|
||||
const auto dst_row_base_idx = blockIdx.y * (128 / 4) + (lane_id / 8) * 8;
|
||||
const int dst_row_base_idx = blockIdx.y * (128 / 4) + (lane_id / 8) * 8;
|
||||
const int dst_col_idx =
|
||||
blockIdx.x * (64 * 4) + warp_id * 64 + (lane_id % 8) * 8;
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
@ -65,7 +65,7 @@ __global__ void __launch_bounds__(128)
|
||||
} else {
|
||||
// Load B_scale and B_zero
|
||||
FType b_scale_reg, b_zero_reg;
|
||||
auto src_offset = blockIdx.y * 128 + threadIdx.x;
|
||||
int src_offset = blockIdx.y * 128 + threadIdx.x;
|
||||
ldg16_cg_0(b_scale_reg, B_scale + src_offset, src_offset < N);
|
||||
if (B_zero != nullptr)
|
||||
ldg16_cg_0(b_zero_reg, B_zero + src_offset, src_offset < N);
|
||||
|
||||
@ -62,7 +62,7 @@ template <typename FType, int BLOCK, int N_MATRIX>
|
||||
__global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C,
|
||||
uint32_t n, uint32_t n_matrix,
|
||||
uint32_t matrix_size) {
|
||||
auto idx = blockIdx.x * BLOCK + threadIdx.x;
|
||||
int idx = blockIdx.x * BLOCK + threadIdx.x;
|
||||
|
||||
if (idx >= matrix_size) {
|
||||
return;
|
||||
@ -407,4 +407,4 @@ static __device__ half2 inline num2num2(const half x) {
|
||||
return __half2half2(x);
|
||||
}
|
||||
|
||||
} // namespace allspark
|
||||
} // namespace allspark
|
||||
@ -42,7 +42,7 @@ namespace marlin {
|
||||
__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
|
||||
int const* __restrict__ perm_int_ptr,
|
||||
int4* __restrict__ out_int4_ptr, int size_m,
|
||||
int size_k, int lda, int block_rows) {}
|
||||
int size_k, int block_rows) {}
|
||||
|
||||
template <typename scalar_t, // compute dtype, half or nv_float16
|
||||
const vllm::ScalarTypeId w_type_id, // weight ScalarType id
|
||||
@ -459,7 +459,7 @@ __device__ inline void barrier_release(int* lock, bool reset = false) {
|
||||
__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
|
||||
int const* __restrict__ perm_int_ptr,
|
||||
int4* __restrict__ out_int4_ptr, int size_m,
|
||||
int size_k, int lda, int block_rows) {
|
||||
int size_k, int block_rows) {
|
||||
int start_row = block_rows * blockIdx.x;
|
||||
int finish_row = start_row + block_rows;
|
||||
if (finish_row > size_m) {
|
||||
@ -467,19 +467,16 @@ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
|
||||
}
|
||||
int cur_block_rows = finish_row - start_row;
|
||||
|
||||
int input_row_stride = lda * sizeof(half) / 16;
|
||||
int output_row_stride = size_k * sizeof(half) / 16;
|
||||
int row_stride = size_k * sizeof(half) / 16;
|
||||
|
||||
auto permute_row = [&](int row) {
|
||||
int iters = size_k / default_threads;
|
||||
int rest = size_k % default_threads;
|
||||
|
||||
int input_offset = row * input_row_stride;
|
||||
int output_offset = row * output_row_stride;
|
||||
int offset = row * row_stride;
|
||||
|
||||
half const* a_row_half =
|
||||
reinterpret_cast<half const*>(a_int4_ptr + input_offset);
|
||||
half* out_half = reinterpret_cast<half*>(out_int4_ptr + output_offset);
|
||||
half const* a_row_half = reinterpret_cast<half const*>(a_int4_ptr + offset);
|
||||
half* out_half = reinterpret_cast<half*>(out_int4_ptr + offset);
|
||||
|
||||
int base_k = 0;
|
||||
|
||||
@ -540,7 +537,6 @@ __global__ void Marlin(
|
||||
int prob_m, // batch dimension m
|
||||
int prob_n, // output dimension n
|
||||
int prob_k, // reduction dimension k
|
||||
int lda, // A.stride(0), equal to prob_k if A is contiguous
|
||||
int* locks, // extra global storage for barrier synchronization
|
||||
bool use_atomic_add, // whether to use atomic add to reduce
|
||||
bool use_fp32_reduce // whether to use fp32 global reduce
|
||||
@ -604,7 +600,7 @@ __global__ void Marlin(
|
||||
// We can easily implement parallel problem execution by just remapping
|
||||
// indices and advancing global pointers
|
||||
if (slice_col_par >= n_tiles) {
|
||||
A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * lda / 8;
|
||||
A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8;
|
||||
C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8;
|
||||
locks += (slice_col_par / n_tiles) * n_tiles;
|
||||
slice_col = slice_col_par % n_tiles;
|
||||
@ -635,7 +631,7 @@ __global__ void Marlin(
|
||||
}
|
||||
}
|
||||
if (slice_col == n_tiles) {
|
||||
A += 16 * thread_m_blocks * lda / 8;
|
||||
A += 16 * thread_m_blocks * prob_k / 8;
|
||||
C += 16 * thread_m_blocks * prob_n / 8;
|
||||
locks += n_tiles;
|
||||
slice_col = 0;
|
||||
@ -647,7 +643,7 @@ __global__ void Marlin(
|
||||
// A sizes/strides
|
||||
|
||||
// stride of the A matrix in global memory
|
||||
int a_gl_stride = lda / 8;
|
||||
int a_gl_stride = prob_k / 8;
|
||||
// stride of an A matrix tile in shared memory
|
||||
constexpr int a_sh_stride = 16 * thread_k_blocks / 8;
|
||||
// delta between subsequent A tiles in global memory
|
||||
@ -1784,8 +1780,8 @@ __global__ void Marlin(
|
||||
HAS_ZP, GROUP_BLOCKS, IS_ZP_FLOAT> \
|
||||
<<<blocks, NUM_THREADS, max_shared_mem, stream>>>( \
|
||||
A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \
|
||||
num_groups, prob_m, prob_n, prob_k, lda, locks, \
|
||||
use_atomic_add, use_fp32_reduce); \
|
||||
num_groups, prob_m, prob_n, prob_k, locks, use_atomic_add, \
|
||||
use_fp32_reduce); \
|
||||
} \
|
||||
}
|
||||
|
||||
@ -2075,7 +2071,7 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
|
||||
template <typename scalar_t>
|
||||
void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
|
||||
void* zp, void* g_idx, void* perm, void* a_tmp, int prob_m,
|
||||
int prob_n, int prob_k, int lda, void* workspace,
|
||||
int prob_n, int prob_k, void* workspace,
|
||||
vllm::ScalarType const& q_type, bool has_act_order,
|
||||
bool is_k_full, bool has_zp, int num_groups, int group_size,
|
||||
int dev, cudaStream_t stream, int thread_k, int thread_n,
|
||||
@ -2188,9 +2184,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
|
||||
// Permute A columns
|
||||
int block_rows = div_ceil(prob_m, blocks);
|
||||
permute_cols_kernel<<<blocks, default_threads, 0, stream>>>(
|
||||
A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, lda, block_rows);
|
||||
A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, block_rows);
|
||||
A_ptr = a_tmp_ptr;
|
||||
lda = prob_k;
|
||||
}
|
||||
|
||||
// If we have a full K, then we can run the non-act-order version of Marlin
|
||||
@ -2249,7 +2244,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
|
||||
", num_bits = ", num_bits);
|
||||
}
|
||||
|
||||
A_ptr += 16 * thread_m_blocks * (lda / 8) * par;
|
||||
A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
|
||||
C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par;
|
||||
}
|
||||
}
|
||||
@ -2305,10 +2300,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
||||
|
||||
// Verify device and strides
|
||||
TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
|
||||
TORCH_CHECK(a.stride(1) == 1, "A.stride(1) is not 1");
|
||||
// We use int4 (16 bytes) to load A, so A must aligned to 16 bytes
|
||||
TORCH_CHECK(a.stride(0) % 8 == 0, "A.stride(0) must divisible by 8");
|
||||
TORCH_CHECK(((uint64_t)a.data_ptr()) % 16 == 0, "A must aligned to 16 bytes");
|
||||
TORCH_CHECK(a.is_contiguous(), "A is not contiguous");
|
||||
|
||||
TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
|
||||
TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
|
||||
@ -2440,7 +2432,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
||||
a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
|
||||
c_tmp.data_ptr<float>(), b_scales.data_ptr<at::Half>(),
|
||||
b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(),
|
||||
a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k, a.stride(0),
|
||||
a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k,
|
||||
workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
|
||||
num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
|
||||
thread_k, thread_n, sms, marlin::max_par, use_atomic_add,
|
||||
@ -2451,10 +2443,10 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
||||
c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(),
|
||||
b_scales.data_ptr<at::BFloat16>(), b_zeros.data_ptr(), g_idx.data_ptr(),
|
||||
perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(), size_m, size_n, size_k,
|
||||
a.stride(0), workspace.data_ptr(), b_q_type, has_act_order, is_k_full,
|
||||
has_zp, num_groups, group_size, dev,
|
||||
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
|
||||
marlin::max_par, use_atomic_add, use_fp32_reduce, is_zp_float);
|
||||
workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
|
||||
num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
|
||||
thread_k, thread_n, sms, marlin::max_par, use_atomic_add,
|
||||
use_fp32_reduce, is_zp_float);
|
||||
} else {
|
||||
TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16");
|
||||
}
|
||||
|
||||
@ -85,7 +85,6 @@ html_static_path = ["_static"]
|
||||
html_js_files = ["custom.js"]
|
||||
html_css_files = ["custom.css"]
|
||||
|
||||
myst_heading_anchors = 2
|
||||
myst_url_schemes = {
|
||||
'http': None,
|
||||
'https': None,
|
||||
|
||||
@ -124,52 +124,3 @@ nsys stats report1.nsys-rep
|
||||
GUI example:
|
||||
|
||||
<img width="1799" alt="Screenshot 2025-03-05 at 11 48 42 AM" src="https://github.com/user-attachments/assets/c7cff1ae-6d6f-477d-a342-bd13c4fc424c" />
|
||||
|
||||
## Profiling vLLM Python Code
|
||||
|
||||
The Python standard library includes
|
||||
[cProfile](https://docs.python.org/3/library/profile.html) for profiling Python
|
||||
code. vLLM includes a couple of helpers that make it easy to apply it to a section of vLLM.
|
||||
Both the `vllm.utils.cprofile` and `vllm.utils.cprofile_context` functions can be
|
||||
used to profile a section of code.
|
||||
|
||||
### Example usage - decorator
|
||||
|
||||
The first helper is a Python decorator that can be used to profile a function.
|
||||
If a filename is specified, the profile will be saved to that file. If no filename is
|
||||
specified, profile data will be printed to stdout.
|
||||
|
||||
```python
|
||||
import vllm.utils
|
||||
|
||||
@vllm.utils.cprofile("expensive_function.prof")
|
||||
def expensive_function():
|
||||
# some expensive code
|
||||
pass
|
||||
```
|
||||
|
||||
### Example Usage - context manager
|
||||
|
||||
The second helper is a context manager that can be used to profile a block of
|
||||
code. Similar to the decorator, the filename is optional.
|
||||
|
||||
```python
|
||||
import vllm.utils
|
||||
|
||||
def another_function():
|
||||
# more expensive code
|
||||
pass
|
||||
|
||||
with vllm.utils.cprofile_context("another_function.prof"):
|
||||
another_function()
|
||||
```
|
||||
|
||||
### Analyzing Profile Results
|
||||
|
||||
There are multiple tools available that can help analyze the profile results.
|
||||
One example is [snakeviz](https://jiffyclub.github.io/snakeviz/).
|
||||
|
||||
```bash
|
||||
pip install snakeviz
|
||||
snakeviz expensive_function.prof
|
||||
```
|
||||
|
||||
@ -7,192 +7,5 @@ A major use case is for multi-host/multi-node distributed inference.
|
||||
|
||||
vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kubernetes for distributed model serving.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
* At least two Kubernetes nodes, each with 8 GPUs, are required.
|
||||
* Install LWS by following the instructions found [here](https://lws.sigs.k8s.io/docs/installation/).
|
||||
|
||||
## Deploy and Serve
|
||||
|
||||
Deploy the following YAML file, `lws.yaml`:
|
||||
|
||||
```yaml
|
||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||
kind: LeaderWorkerSet
|
||||
metadata:
|
||||
name: vllm
|
||||
spec:
|
||||
replicas: 2
|
||||
leaderWorkerTemplate:
|
||||
size: 2
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
leaderTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
role: leader
|
||||
spec:
|
||||
containers:
|
||||
- name: vllm-leader
|
||||
image: docker.io/vllm/vllm-openai:latest
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
value: <your-hf-token>
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
|
||||
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
memory: 1124Gi
|
||||
ephemeral-storage: 800Gi
|
||||
requests:
|
||||
ephemeral-storage: 800Gi
|
||||
cpu: 125
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
readinessProbe:
|
||||
tcpSocket:
|
||||
port: 8080
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 10
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
sizeLimit: 15Gi
|
||||
workerTemplate:
|
||||
spec:
|
||||
containers:
|
||||
- name: vllm-worker
|
||||
image: docker.io/vllm/vllm-openai:latest
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
memory: 1124Gi
|
||||
ephemeral-storage: 800Gi
|
||||
requests:
|
||||
ephemeral-storage: 800Gi
|
||||
cpu: 125
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
value: <your-hf-token>
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
sizeLimit: 15Gi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: vllm-leader
|
||||
spec:
|
||||
ports:
|
||||
- name: http
|
||||
port: 8080
|
||||
protocol: TCP
|
||||
targetPort: 8080
|
||||
selector:
|
||||
leaderworkerset.sigs.k8s.io/name: vllm
|
||||
role: leader
|
||||
type: ClusterIP
|
||||
```
|
||||
|
||||
```bash
|
||||
kubectl apply -f lws.yaml
|
||||
```
|
||||
|
||||
Verify the status of the pods:
|
||||
|
||||
```bash
|
||||
kubectl get pods
|
||||
```
|
||||
|
||||
You should get output similar to this:
|
||||
|
||||
```bash
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
vllm-0 1/1 Running 0 2s
|
||||
vllm-0-1 1/1 Running 0 2s
|
||||
vllm-1 1/1 Running 0 2s
|
||||
vllm-1-1 1/1 Running 0 2s
|
||||
```
|
||||
|
||||
Verify that the distributed tensor-parallel inference works:
|
||||
|
||||
```bash
|
||||
kubectl logs vllm-0 |grep -i "Loading model weights took"
|
||||
```
|
||||
|
||||
You should see something similar to this:
|
||||
|
||||
```text
|
||||
INFO 05-08 03:20:24 model_runner.py:173] Loading model weights took 0.1189 GB
|
||||
(RayWorkerWrapper pid=169, ip=10.20.0.197) INFO 05-08 03:20:28 model_runner.py:173] Loading model weights took 0.1189 GB
|
||||
```
|
||||
|
||||
## Access ClusterIP service
|
||||
|
||||
```bash
|
||||
# Listen on port 8080 locally, forwarding to the targetPort of the service's port 8080 in a pod selected by the service
|
||||
kubectl port-forward svc/vllm-leader 8080:8080
|
||||
```
|
||||
|
||||
The output should be similar to the following:
|
||||
|
||||
```text
|
||||
Forwarding from 127.0.0.1:8080 -> 8080
|
||||
Forwarding from [::1]:8080 -> 8080
|
||||
```
|
||||
|
||||
## Serve the model
|
||||
|
||||
Open another terminal and send a request:
|
||||
|
||||
```text
|
||||
curl http://localhost:8080/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
|
||||
"prompt": "San Francisco is a",
|
||||
"max_tokens": 7,
|
||||
"temperature": 0
|
||||
}'
|
||||
```
|
||||
|
||||
The output should be similar to the following:
|
||||
|
||||
```text
|
||||
{
|
||||
"id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
|
||||
"object": "text_completion",
|
||||
"created": 1715138766,
|
||||
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"text": " top destination for foodies, with",
|
||||
"logprobs": null,
|
||||
"finish_reason": "length",
|
||||
"stop_reason": null
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 5,
|
||||
"total_tokens": 12,
|
||||
"completion_tokens": 7
|
||||
}
|
||||
}
|
||||
```
|
||||
Please see [this guide](https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/vllm) for more details on
|
||||
deploying vLLM on Kubernetes using LWS.
|
||||
|
||||
@ -4,9 +4,6 @@
|
||||
|
||||
Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes.
|
||||
|
||||
* [Deployment with CPUs](#deployment-with-cpus)
|
||||
* [Deployment with GPUs](#deployment-with-gpus)
|
||||
|
||||
Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
||||
* [Helm](frameworks/helm.md)
|
||||
* [InftyAI/llmaz](integrations/llmaz.md)
|
||||
@ -17,107 +14,11 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
||||
* [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
|
||||
* [vllm-project/production-stack](integrations/production-stack.md)
|
||||
|
||||
## Deployment with CPUs
|
||||
## Pre-requisite
|
||||
|
||||
:::{note}
|
||||
The use of CPUs here is for demonstration and testing purposes only and its performance will not be on par with GPUs.
|
||||
:::
|
||||
Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).
|
||||
|
||||
First, create a Kubernetes PVC and Secret for downloading and storing the Hugging Face model:
|
||||
|
||||
```bash
|
||||
cat <<EOF |kubectl apply -f -
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: vllm-models
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
volumeMode: Filesystem
|
||||
resources:
|
||||
requests:
|
||||
storage: 50Gi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: hf-token-secret
|
||||
type: Opaque
|
||||
data:
|
||||
token: $(HF_TOKEN)
EOF
|
||||
```
|
||||
|
||||
Next, start the vLLM server as a Kubernetes Deployment and Service:
|
||||
|
||||
```bash
|
||||
cat <<EOF |kubectl apply -f -
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: vllm-server
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: vllm
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: vllm
|
||||
spec:
|
||||
containers:
|
||||
- name: vllm
|
||||
image: vllm/vllm-openai:latest
|
||||
command: ["/bin/sh", "-c"]
|
||||
args: [
|
||||
"vllm serve meta-llama/Llama-3.2-1B-Instruct"
|
||||
]
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
volumeMounts:
|
||||
- name: llama-storage
|
||||
mountPath: /root/.cache/huggingface
|
||||
volumes:
|
||||
- name: llama-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: vllm-models
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: vllm-server
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/name: vllm
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000
|
||||
targetPort: 8000
|
||||
type: ClusterIP
|
||||
EOF
|
||||
```
|
||||
|
||||
We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
|
||||
|
||||
```console
|
||||
kubectl logs -l app.kubernetes.io/name=vllm
|
||||
...
|
||||
INFO: Started server process [1]
|
||||
INFO: Waiting for application startup.
|
||||
INFO: Application startup complete.
|
||||
INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
|
||||
```
|
||||
|
||||
## Deployment with GPUs
|
||||
|
||||
**Pre-requisite**: Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).
|
||||
## Deployment using native K8s
|
||||
|
||||
1. Create a PVC, Secret and Deployment for vLLM
|
||||
|
||||
|
||||
@ -9,7 +9,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal
|
||||
Below are the steps to utilize BitsAndBytes with vLLM.
|
||||
|
||||
```console
|
||||
pip install "bitsandbytes>=0.45.3"
|
||||
pip install "bitsandbytes>=0.45.0"
|
||||
```
|
||||
|
||||
vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
|
||||
@ -25,7 +25,7 @@ import torch
|
||||
# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
|
||||
model_id = "unsloth/tinyllama-bnb-4bit"
|
||||
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
|
||||
quantization="bitsandbytes")
|
||||
quantization="bitsandbytes", load_format="bitsandbytes")
|
||||
```
|
||||
|
||||
## Inflight quantization: load as 4bit quantization
|
||||
@ -35,7 +35,7 @@ from vllm import LLM
|
||||
import torch
|
||||
model_id = "huggyllama/llama-7b"
|
||||
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
|
||||
quantization="bitsandbytes")
|
||||
quantization="bitsandbytes", load_format="bitsandbytes")
|
||||
```
|
||||
|
||||
## OpenAI Compatible Server
|
||||
@ -43,5 +43,5 @@ quantization="bitsandbytes")
|
||||
Append the following to your 4bit model arguments:
|
||||
|
||||
```console
|
||||
--quantization bitsandbytes
|
||||
--quantization bitsandbytes --load-format bitsandbytes
|
||||
```
|
||||
|
||||
@ -10,10 +10,10 @@ Reasoning models return a additional `reasoning_content` field in their outputs,
|
||||
|
||||
vLLM currently supports the following reasoning models:
|
||||
|
||||
| Model Series | Parser Name | Structured Output Support | Tool Calling |
|
||||
|--------------|-------------|------------------|-------------|
|
||||
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ |
|
||||
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ |
|
||||
| Model Series | Parser Name | Structured Output Support |
|
||||
|--------------|-------------|------------------|
|
||||
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` |
|
||||
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` |
|
||||
|
||||
## Quickstart
|
||||
|
||||
@ -170,51 +170,10 @@ print("reasoning_content: ", completion.choices[0].message.reasoning_content)
|
||||
print("content: ", completion.choices[0].message.content)
|
||||
```
|
||||
|
||||
## Tool Calling
|
||||
|
||||
The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
|
||||
|
||||
tools = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
|
||||
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
|
||||
},
|
||||
"required": ["location", "unit"]
|
||||
}
|
||||
}
|
||||
}]
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model=client.models.list().data[0].id,
|
||||
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
|
||||
tools=tools,
|
||||
tool_choice="auto"
|
||||
)
|
||||
|
||||
print(response)
|
||||
tool_call = response.choices[0].message.tool_calls[0].function
|
||||
|
||||
print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
|
||||
print(f"Function called: {tool_call.name}")
|
||||
print(f"Arguments: {tool_call.arguments}")
|
||||
```
|
||||
|
||||
For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py> .
|
||||
|
||||
## Limitations
|
||||
|
||||
- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).
|
||||
- It is not compatible with [`tool_calling`](#tool_calling).
|
||||
|
||||
## How to support a new reasoning model
|
||||
|
||||
|
||||
@ -30,10 +30,8 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
llm = LLM(
|
||||
model="facebook/opt-6.7b",
|
||||
tensor_parallel_size=1,
|
||||
speculative_config={
|
||||
"model": "facebook/opt-125m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
speculative_model="facebook/opt-125m",
|
||||
num_speculative_tokens=5,
|
||||
)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
@ -47,14 +45,10 @@ To perform the same with an online mode launch the server:
|
||||
|
||||
```bash
|
||||
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \
|
||||
--seed 42 -tp 1 --gpu_memory_utilization 0.8 \
|
||||
--speculative_config '{"model": "facebook/opt-125m", "num_speculative_tokens": 5}'
|
||||
--seed 42 -tp 1 --speculative_model facebook/opt-125m \
|
||||
--num_speculative_tokens 5 --gpu_memory_utilization 0.8
|
||||
```
|
||||
|
||||
:::{warning}
|
||||
Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately will be deprecated in the next release.
|
||||
:::
|
||||
|
||||
Then use a client:
|
||||
|
||||
```python
|
||||
@ -107,11 +101,9 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
llm = LLM(
|
||||
model="facebook/opt-6.7b",
|
||||
tensor_parallel_size=1,
|
||||
speculative_config={
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 4,
|
||||
},
|
||||
speculative_model="[ngram]",
|
||||
num_speculative_tokens=5,
|
||||
ngram_prompt_lookup_max=4,
|
||||
)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
@ -139,10 +131,8 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
llm = LLM(
|
||||
model="meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
tensor_parallel_size=4,
|
||||
speculative_config={
|
||||
"model": "ibm-ai-platform/llama3-70b-accelerator",
|
||||
"draft_tensor_parallel_size": 1,
|
||||
},
|
||||
speculative_model="ibm-ai-platform/llama3-70b-accelerator",
|
||||
speculative_draft_tensor_parallel_size=1,
|
||||
)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
@ -185,10 +175,8 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
llm = LLM(
|
||||
model="meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
tensor_parallel_size=4,
|
||||
speculative_config={
|
||||
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
|
||||
"draft_tensor_parallel_size": 1,
|
||||
},
|
||||
speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B",
|
||||
speculative_draft_tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
@ -206,10 +194,11 @@ A few important things to consider when using the EAGLE based draft models:
|
||||
be able to be loaded and used directly by vLLM after [PR 12304](https://github.com/vllm-project/vllm/pull/12304).
|
||||
If you are using vllm version before [PR 12304](https://github.com/vllm-project/vllm/pull/12304), please use the
|
||||
[script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model,
|
||||
and specify `"model": "path/to/modified/eagle/model"` in `speculative_config`. If weight-loading problems still occur when using the latest version of vLLM, please leave a comment or raise an issue.
|
||||
and specify `speculative_model="path/to/modified/eagle/model"`. If weight-loading problems still occur when using
|
||||
the latest version of vLLM, please leave a comment or raise an issue.
|
||||
|
||||
2. The EAGLE based draft models need to be run without tensor parallelism
|
||||
(i.e. draft_tensor_parallel_size is set to 1 in `speculative_config`), although
|
||||
(i.e. speculative_draft_tensor_parallel_size is set to 1), although
|
||||
it is possible to run the main model using tensor parallelism (see example above).
|
||||
|
||||
3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
|
||||
|
||||
@ -26,3 +26,4 @@ installation/ai_accelerator
|
||||
- Google TPU
|
||||
- Intel Gaudi
|
||||
- AWS Neuron
|
||||
- OpenVINO
|
||||
|
||||
@ -36,6 +36,16 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "# Installation"
|
||||
:end-before: "## Requirements"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
## Requirements
|
||||
@ -73,6 +83,16 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "## Requirements"
|
||||
:end-before: "## Set up using Python"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
## Configure a new environment
|
||||
@ -110,6 +130,14 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} python_env_setup.inc.md
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
## Set up using Python
|
||||
@ -149,6 +177,16 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "### Pre-built wheels"
|
||||
:end-before: "### Build wheel from source"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
### Build wheel from source
|
||||
@ -186,6 +224,16 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "### Build wheel from source"
|
||||
:end-before: "## Set up using Docker"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
## Set up using Docker
|
||||
@ -225,6 +273,16 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "### Pre-built images"
|
||||
:end-before: "### Build image from source"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
### Build image from source
|
||||
@ -262,6 +320,16 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "### Build image from source"
|
||||
:end-before: "## Extra information"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
## Extra information
|
||||
@ -296,4 +364,13 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "## Extra information"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
@ -0,0 +1,110 @@
|
||||
# Installation
|
||||
|
||||
vLLM powered by OpenVINO supports all LLM models from the [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with at least AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)).
|
||||
|
||||
:::{attention}
|
||||
There are no pre-built wheels or images for this device, so you must build vLLM from source.
|
||||
:::
|
||||
|
||||
## Requirements
|
||||
|
||||
- OS: Linux
|
||||
- Instruction set architecture (ISA) requirement: at least AVX2.
|
||||
|
||||
## Set up using Python
|
||||
|
||||
### Pre-built wheels
|
||||
|
||||
Currently, there are no pre-built OpenVINO wheels.
|
||||
|
||||
### Build wheel from source
|
||||
|
||||
First, install Python and ensure you have the latest pip. For example, on Ubuntu 22.04, you can run:
|
||||
|
||||
```console
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install python3
|
||||
pip install --upgrade pip
|
||||
```
|
||||
|
||||
Second, clone vLLM and install prerequisites for the vLLM OpenVINO backend installation:
|
||||
|
||||
```console
|
||||
git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
```
|
||||
|
||||
Finally, install vLLM with OpenVINO backend:
|
||||
|
||||
```console
|
||||
PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
|
||||
```
|
||||
|
||||
:::{tip}
|
||||
To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html).
|
||||
:::
|
||||
|
||||
## Set up using Docker
|
||||
|
||||
### Pre-built images
|
||||
|
||||
Currently, there are no pre-built OpenVINO images.
|
||||
|
||||
### Build image from source
|
||||
|
||||
```console
|
||||
docker build -f Dockerfile.openvino -t vllm-openvino-env .
|
||||
docker run -it --rm vllm-openvino-env
|
||||
```
|
||||
|
||||
## Extra information
|
||||
|
||||
## Supported features
|
||||
|
||||
OpenVINO vLLM backend supports the following advanced vLLM features:
|
||||
|
||||
- Prefix caching (`--enable-prefix-caching`)
|
||||
- Chunked prefill (`--enable-chunked-prefill`)
|
||||
|
||||
## Performance tips
|
||||
|
||||
### vLLM OpenVINO backend environment variables
|
||||
|
||||
- `VLLM_OPENVINO_DEVICE` to specify which device to use for inference. If there are multiple GPUs in the system, an additional index can be used to choose the proper one (e.g., `VLLM_OPENVINO_DEVICE=GPU.1`). If the value is not specified, the CPU device is used by default.
|
||||
- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weight compression during the model loading stage. By default, compression is turned off. You can also export the model with different compression techniques using `optimum-cli` and pass the exported folder as `<model_id>`.
|
||||
|
||||
### CPU performance tips
|
||||
|
||||
CPU uses the following environment variables to control behavior:
|
||||
|
||||
- `VLLM_OPENVINO_KVCACHE_SPACE` to specify the KV cache size (e.g., `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB of space for the KV cache); a larger setting allows vLLM to run more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
|
||||
- `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` to control KV cache precision. By default, FP16 / BF16 is used depending on platform.
|
||||
|
||||
To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (`--enable-chunked-prefill`). Based on experiments, the recommended batch size is `256` (`--max-num-batched-tokens`).
|
||||
|
||||
OpenVINO best known configuration for CPU is:
|
||||
|
||||
```console
|
||||
$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
|
||||
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
|
||||
```
|
||||
|
||||
### GPU performance tips
|
||||
|
||||
The GPU device implements logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account the `gpu_memory_utilization` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using the `VLLM_OPENVINO_KVCACHE_SPACE` environment variable (e.g., `VLLM_OPENVINO_KVCACHE_SPACE=8` means 8 GB of space for the KV cache).
|
||||
|
||||
Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`.
|
||||
|
||||
OpenVINO best known configuration for GPU is:
|
||||
|
||||
```console
|
||||
$ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
|
||||
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
- LoRA serving is not supported.
|
||||
- Only LLM models are currently supported. LLaVA and encoder-decoder models are not currently enabled in the vLLM OpenVINO integration.
|
||||
- Tensor and pipeline parallelism are not currently enabled in the vLLM OpenVINO integration.
|
||||
@ -193,7 +193,7 @@ vLLM CPU backend supports the following vLLM features:
|
||||
|
||||
## Related runtime environment variables
|
||||
|
||||
- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
|
||||
- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
|
||||
- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
|
||||
- `VLLM_CPU_MOE_PREPACK`: whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).
|
||||
|
||||
|
||||
@ -58,11 +58,6 @@ from vllm import LLM, SamplingParams
|
||||
```
|
||||
|
||||
The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](#sampling-params).
|
||||
:::{important}
|
||||
By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results by default if {class}`~vllm.SamplingParams` is not specified.
|
||||
|
||||
However, if vLLM's default sampling parameters are preferred, please set `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance.
|
||||
:::
|
||||
|
||||
```python
|
||||
prompts = [
|
||||
@ -81,7 +76,7 @@ llm = LLM(model="facebook/opt-125m")
|
||||
```
|
||||
|
||||
:::{note}
|
||||
By default, vLLM downloads models from [Hugging Face](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
|
||||
By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
|
||||
:::
|
||||
|
||||
Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens.
|
||||
@ -112,11 +107,6 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct
|
||||
By default, the server uses a predefined chat template stored in the tokenizer.
|
||||
You can learn about overriding it [here](#chat-template).
|
||||
:::
|
||||
:::{important}
|
||||
By default, the server applies `generation_config.json` from the huggingface model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
|
||||
|
||||
To disable this behavior, please pass `--generation-config vllm` when launching the server.
|
||||
:::
|
||||
|
||||
This server can be queried in the same format as OpenAI API. For example, to list the models:
|
||||
|
||||
|
||||
@ -156,9 +156,6 @@ vLLM V1 is currently optimized for decoder-only transformers. Models requiring
|
||||
|
||||
For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html).
|
||||
|
||||
## Frequently Asked Questions
|
||||
## FAQ
|
||||
|
||||
**I'm using vLLM V1 and I'm getting CUDA OOM errors. What should I do?**
|
||||
The default `max_num_seqs` has been raised from `256` in V0 to `1024` in V1. If you encounter CUDA OOM only when using V1 engine, try setting a lower value of `max_num_seqs` or `gpu_memory_utilization`.
|
||||
|
||||
On the other hand, if you get an error about insufficient memory for the cache blocks, you should increase `gpu_memory_utilization` as this indicates that your GPU has sufficient memory but you're not allocating enough to vLLM for KV cache blocks.
|
||||
TODO
|
||||
|
||||
@ -1,5 +0,0 @@
|
||||
Loading Model weights with fastsafetensors
|
||||
===================================================================
|
||||
|
||||
Using the fastsafetensors library enables loading model weights to GPU memory by leveraging GPU Direct Storage. See https://github.com/foundation-model-stack/fastsafetensors for more details.
|
||||
To enable this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true``.
|
||||
@ -5,5 +5,4 @@
|
||||
|
||||
runai_model_streamer
|
||||
tensorizer
|
||||
fastsafetensor
|
||||
:::
|
||||
|
||||
@ -46,11 +46,6 @@ for output in outputs:
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
```
|
||||
|
||||
:::{important}
|
||||
By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if {class}`~vllm.SamplingParams` is not specified.
|
||||
|
||||
However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance.
|
||||
:::
|
||||
A code example can be found here: <gh-file:examples/offline_inference/basic/basic.py>
|
||||
|
||||
### `LLM.beam_search`
|
||||
|
||||
@ -73,7 +73,7 @@ The Transformers fallback explicitly supports the following features:
|
||||
|
||||
- <project:#quantization-index> (except GGUF)
|
||||
- <project:#lora-adapter>
|
||||
- <project:#distributed-serving> (requires `transformers>=4.49.0`)
|
||||
- <project:#distributed-serving> (pipeline parallel coming soon <gh-pr:12832>!)
|
||||
|
||||
#### Remote code
|
||||
|
||||
@ -472,11 +472,6 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `TeleFLMForCausalLM`
|
||||
* TeleFLM
|
||||
* `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `XverseForCausalLM`
|
||||
* XVERSE
|
||||
* `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
|
||||
|
||||
@ -2,12 +2,7 @@
|
||||
|
||||
# Engine Arguments
|
||||
|
||||
Engine arguments control the behavior of the vLLM engine.
|
||||
|
||||
- For [offline inference](#offline-inference), they are part of the arguments to `LLM` class.
|
||||
- For [online serving](#openai-compatible-server), they are part of the arguments to `vllm serve`.
|
||||
|
||||
Below, you can find an explanation of every engine argument:
|
||||
Below, you can find an explanation of every engine argument for vLLM:
|
||||
|
||||
<!--- pyml disable-num-lines 7 no-space-in-emphasis -->
|
||||
```{eval-rst}
|
||||
@ -20,7 +15,7 @@ Below, you can find an explanation of every engine argument:
|
||||
|
||||
## Async Engine Arguments
|
||||
|
||||
Additional arguments are available to the asynchronous engine which is used for online serving:
|
||||
Below are the additional arguments related to the asynchronous engine:
|
||||
|
||||
<!--- pyml disable-num-lines 7 no-space-in-emphasis -->
|
||||
```{eval-rst}
|
||||
|
||||
@ -97,13 +97,6 @@ llm = LLM(model="adept/fuyu-8b",
|
||||
max_num_seqs=2)
|
||||
```
|
||||
|
||||
#### Adjust cache size
|
||||
|
||||
If you run out of CPU RAM, try the following options:
|
||||
|
||||
- (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB).
|
||||
- (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB).
|
||||
|
||||
### Performance optimization and tuning
|
||||
|
||||
You can potentially improve the performance of vLLM by finetuning various options.
|
||||
|
||||
@ -29,15 +29,6 @@ completion = client.chat.completions.create(
|
||||
print(completion.choices[0].message)
|
||||
```
|
||||
|
||||
:::{tip}
|
||||
vLLM supports some parameters that are not supported by OpenAI, `top_k` for example.
|
||||
You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`.
|
||||
:::
|
||||
:::{important}
|
||||
By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
|
||||
|
||||
To disable this behavior, please pass `--generation-config vllm` when launching the server.
|
||||
:::
|
||||
## Supported APIs
|
||||
|
||||
We currently support the following OpenAI APIs:
|
||||
|
||||
@ -83,6 +83,7 @@ def initialize_engine(model: str, quantization: str,
|
||||
engine_args = EngineArgs(model=model,
|
||||
quantization=quantization,
|
||||
qlora_adapter_name_or_path=lora_repo,
|
||||
load_format="bitsandbytes",
|
||||
enable_lora=True,
|
||||
max_lora_rank=64)
|
||||
else:
|
||||
|
||||
@ -50,9 +50,7 @@ if __name__ == "__main__":
|
||||
# Create an LLM with spec decoding
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-2-13b-chat-hf",
|
||||
speculative_config={
|
||||
"model": "ibm-ai-platform/llama-13b-accelerator",
|
||||
},
|
||||
speculative_model="ibm-ai-platform/llama-13b-accelerator",
|
||||
)
|
||||
|
||||
print("With speculation")
|
||||
|
||||
@ -1,36 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import os
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# For performance reasons vLLM does not promise reproducible results out of
# the box. Two settings are needed to obtain them:
#
# 1. Disable multiprocessing so that scheduling becomes deterministic.
#    NOTE(woosuk): V0 does not need this setting and ignores it.
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"

# 2. Pin the global seed. The default seed is None, which is not
#    reproducible.
SEED = 42

# NOTE(woosuk): Even with both settings applied, results are only
# reproducible on the same hardware with the same vLLM version. The online
# serving API (`vllm serve`) does not support reproducibility, because making
# the scheduling deterministic is almost impossible in that setting.

llm = LLM(model="facebook/opt-125m", seed=SEED)

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    # Show each prompt next to the first completion generated for it.
    print(f"Prompt: {output.prompt!r}, "
          f"Generated text: {output.outputs[0].text!r}")
|
||||
@ -1,177 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""
|
||||
An example demonstrating how to use tool calling with reasoning models
|
||||
like QwQ-32B. The reasoning_content will not be parsed by the tool
|
||||
calling process; only the final output will be parsed.
|
||||
|
||||
To run this example, you need to start the vLLM server with both
|
||||
the reasoning parser and tool calling enabled.
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/QwQ-32B \
|
||||
--enable-reasoning --reasoning-parser deepseek_r1 \
|
||||
--enable-auto-tool-choice --tool-call-parser hermes
|
||||
|
||||
```
|
||||
|
||||
"""
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
|
||||
# Now, simulate a tool call
|
||||
def get_current_weather(city: str, state: str, unit: str) -> str:
    """Simulate a weather-lookup tool by returning a canned report.

    The arguments are accepted but never read: every call returns the same
    fixed description, regardless of the city, state or unit requested.

    Args:
        city: City to look up (ignored).
        state: State the city is in (ignored).
        unit: Temperature unit requested (ignored).

    Returns:
        A fixed, human-readable weather report string.
    """
    return ("The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
            "partly cloudly, with highs in the 90's.")
|
||||
|
||||
|
||||
# Callables that can be invoked on behalf of the model, keyed by tool name.
available_tools = {"get_current_weather": get_current_weather}

# Point the OpenAI client at vLLM's API server instead of OpenAI's.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

# Use the first model the server reports.
models = client.models.list()
model = models.data[0].id

# JSON-schema description of the single tool offered to the model.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description":
                        "The city to find the weather for, e.g. 'San Francisco'",
                    },
                    "state": {
                        "type": "string",
                        "description":
                        "the two-letter abbreviation for the state that the city is"
                        " in, e.g. 'CA' which would mean 'California'",
                    },
                    "unit": {
                        "type": "string",
                        "description": "The unit to fetch the temperature in",
                        "enum": ["celsius", "fahrenheit"],
                    },
                },
                "required": ["city", "state", "unit"],
            },
        },
    },
]

# Short conversation whose last user turn should trigger the weather tool.
messages = [
    {
        "role": "user",
        "content": "Hi! How are you doing today?",
    },
    {
        "role": "assistant",
        "content": "I'm doing well! How can I help you?",
    },
    {
        "role": "user",
        "content":
        "Can you tell me what the temperate will be in Dallas, in fahrenheit?",
    },
]
|
||||
|
||||
|
||||
def extract_reasoning_and_calls(chunks: list):
    """Reassemble reasoning text and tool calls from streamed chat chunks.

    Only ``chunk.choices[0].delta`` of each chunk is read. A delta that
    carries ``tool_calls`` contributes name/argument fragments to the tool
    calls being assembled; any other delta contributes its
    ``reasoning_content`` (when present and non-empty) to the reasoning text.

    Args:
        chunks: Streamed chunks, in arrival order.

    Returns:
        A ``(reasoning_content, arguments, function_names)`` tuple.
        ``arguments[i]`` is the concatenated argument string and
        ``function_names[i]`` the name of the i-th distinct tool call, in
        order of first appearance.
    """
    reasoning_content = ""
    arguments = []
    function_names = []
    # Maps the index reported on a tool-call fragment to its position in the
    # output lists, so fragments are grouped correctly even when indices are
    # not zero-based or arrive interleaved.
    slot_by_index = {}
    for chunk in chunks:
        delta = chunk.choices[0].delta
        if delta.tool_calls:
            # One delta may carry fragments for more than one tool call.
            for tool_call in delta.tool_calls:
                slot = slot_by_index.get(tool_call.index)
                if slot is None:
                    # First fragment of a new tool call: open fresh slots.
                    slot = len(arguments)
                    slot_by_index[tool_call.index] = slot
                    arguments.append("")
                    function_names.append("")

                if tool_call.function:
                    if tool_call.function.name:
                        function_names[slot] = tool_call.function.name
                    if tool_call.function.arguments:
                        arguments[slot] += tool_call.function.arguments
        else:
            # The attribute may be missing or None on some deltas;
            # concatenating None would raise TypeError.
            piece = getattr(delta, "reasoning_content", None)
            if piece:
                reasoning_content += piece
    return reasoning_content, arguments, function_names
|
||||
|
||||
|
||||
# --- 1. Non-streaming request; the model decides whether to call a tool. ---
print("---------Full Generate With Automatic Function Calling-------------")
tool_calls = client.chat.completions.create(
    messages=messages,
    model=model,
    tools=tools,
)
print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
print(f"function name: "
      f"{tool_calls.choices[0].message.tool_calls[0].function.name}")
print(f"function arguments: "
      f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}")

# --- 2. Same request, streamed; fragments are reassembled afterwards. ---
print("----------Stream Generate With Automatic Function Calling-----------")
tool_calls_stream = client.chat.completions.create(
    messages=messages,
    model=model,
    tools=tools,
    stream=True,
)
chunks = list(tool_calls_stream)

reasoning_content, arguments, function_names = extract_reasoning_and_calls(
    chunks)

print(f"reasoning_content: {reasoning_content}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")

# --- 3. Non-streaming request with the tool explicitly named. ---
print("----------Full Generate With Named Function Calling-----------------")
tool_calls = client.chat.completions.create(
    messages=messages,
    model=model,
    tools=tools,
    tool_choice={
        "type": "function",
        "function": {
            "name": "get_current_weather",
        },
    },
)

tool_call = tool_calls.choices[0].message.tool_calls[0].function
print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
print(f"function name: {tool_call.name}")
print(f"function arguments: {tool_call.arguments}")

# --- 4. Streaming request with the tool explicitly named. ---
print("----------Stream Generate With Named Function Calling--------------")
tool_calls_stream = client.chat.completions.create(
    messages=messages,
    model=model,
    tools=tools,
    tool_choice={
        "type": "function",
        "function": {
            "name": "get_current_weather",
        },
    },
    stream=True,
)
chunks = list(tool_calls_stream)

reasoning_content, arguments, function_names = extract_reasoning_and_calls(
    chunks)
print(f"reasoning_content: {reasoning_content}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")
print("\n\n")
|
||||
@ -1,12 +0,0 @@
|
||||
{%- for message in messages %}
|
||||
{%- if message['role'] == 'user' %}
|
||||
{{- '<_user>' + message['content']|trim }}
|
||||
{%- elif message['role'] == 'system' %}
|
||||
{{- '<_system>' + message['content']|trim }}
|
||||
{%- elif message['role'] == 'assistant' %}
|
||||
{{- '<_bot>' + message['content'] }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<_bot>' }}
|
||||
{%- endif %}
|
||||
@ -86,7 +86,6 @@ exclude = [
|
||||
"vllm/triton_utils/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/vllm_flash_attn/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/worker/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/utils.py" = ["UP006", "UP035"]
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = [
|
||||
|
||||
@ -18,7 +18,7 @@ pillow # Required for image processing
|
||||
prometheus-fastapi-instrumentator >= 7.0.0
|
||||
tiktoken >= 0.6.0 # Required for DBRX tokenizer
|
||||
lm-format-enforcer >= 0.10.11, < 0.11
|
||||
llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
|
||||
llguidance >= 0.7.2, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
|
||||
outlines == 0.1.11
|
||||
lark == 1.2.2
|
||||
xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64"
|
||||
|
||||
8
requirements/openvino.txt
Normal file
8
requirements/openvino.txt
Normal file
@ -0,0 +1,8 @@
|
||||
# Common dependencies
|
||||
-r common.txt
|
||||
|
||||
torch == 2.5.1 # should be aligned with "common" vLLM torch version
|
||||
openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
|
||||
|
||||
optimum @ git+https://github.com/huggingface/optimum.git # latest optimum is used to support latest transformers version
|
||||
optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git # latest optimum-intel is used to support latest transformers version
|
||||
@ -1,10 +1,10 @@
|
||||
# Common dependencies
|
||||
-r common.txt
|
||||
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.2.4
|
||||
torch==2.6.0
|
||||
torchvision==0.21.0
|
||||
torchaudio==2.6.0
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.2
|
||||
torch==2.5.1
|
||||
torchvision==0.20.1
|
||||
torchaudio==2.5.1
|
||||
|
||||
cmake>=3.26
|
||||
packaging
|
||||
|
||||
@ -41,4 +41,3 @@ tritonclient==2.51.0
|
||||
numpy < 2.0.0
|
||||
runai-model-streamer==0.11.0
|
||||
runai-model-streamer-s3==0.11.0
|
||||
fastsafetensors>=0.1.10
|
||||
|
||||
@ -67,7 +67,6 @@ click==8.1.7
|
||||
# jiwer
|
||||
# nltk
|
||||
# ray
|
||||
# typer
|
||||
colorama==0.4.6
|
||||
# via
|
||||
# awscli
|
||||
@ -123,8 +122,6 @@ fastparquet==2024.11.0
|
||||
# via genai-perf
|
||||
fastrlock==0.8.2
|
||||
# via cupy-cuda12x
|
||||
fastsafetensors==0.1.10
|
||||
# via -r requirements/test.in
|
||||
filelock==3.16.1
|
||||
# via
|
||||
# datasets
|
||||
@ -508,9 +505,7 @@ requests==2.32.3
|
||||
responses==0.25.3
|
||||
# via genai-perf
|
||||
rich==13.9.4
|
||||
# via
|
||||
# genai-perf
|
||||
# typer
|
||||
# via genai-perf
|
||||
rouge-score==0.1.2
|
||||
# via lm-eval
|
||||
rpds-py==0.20.1
|
||||
@ -555,8 +550,6 @@ setuptools==75.8.0
|
||||
# via
|
||||
# pytablewriter
|
||||
# torch
|
||||
shellingham==1.5.4
|
||||
# via typer
|
||||
six==1.16.0
|
||||
# via
|
||||
# python-dateutil
|
||||
@ -607,7 +600,6 @@ torch==2.6.0
|
||||
# accelerate
|
||||
# bitsandbytes
|
||||
# encodec
|
||||
# fastsafetensors
|
||||
# lm-eval
|
||||
# peft
|
||||
# runai-model-streamer
|
||||
@ -662,8 +654,6 @@ typepy==1.3.2
|
||||
# dataproperty
|
||||
# pytablewriter
|
||||
# tabledata
|
||||
typer==0.15.2
|
||||
# via fastsafetensors
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# huggingface-hub
|
||||
@ -673,7 +663,6 @@ typing-extensions==4.12.2
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
# torch
|
||||
# typer
|
||||
tzdata==2024.2
|
||||
# via pandas
|
||||
urllib3==2.2.3
|
||||
|
||||
11
setup.py
11
setup.py
@ -449,6 +449,10 @@ def _is_cpu() -> bool:
|
||||
return VLLM_TARGET_DEVICE == "cpu"
|
||||
|
||||
|
||||
def _is_openvino() -> bool:
|
||||
return VLLM_TARGET_DEVICE == "openvino"
|
||||
|
||||
|
||||
def _is_xpu() -> bool:
|
||||
return VLLM_TARGET_DEVICE == "xpu"
|
||||
|
||||
@ -568,6 +572,8 @@ def get_vllm_version() -> str:
|
||||
if gaudi_sw_version != MAIN_CUDA_VERSION:
|
||||
gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
|
||||
version += f"{sep}gaudi{gaudi_sw_version}"
|
||||
elif _is_openvino():
|
||||
version += f"{sep}openvino"
|
||||
elif _is_tpu():
|
||||
version += f"{sep}tpu"
|
||||
elif _is_cpu():
|
||||
@ -617,6 +623,8 @@ def get_requirements() -> list[str]:
|
||||
requirements = _read_requirements("neuron.txt")
|
||||
elif _is_hpu():
|
||||
requirements = _read_requirements("hpu.txt")
|
||||
elif _is_openvino():
|
||||
requirements = _read_requirements("openvino.txt")
|
||||
elif _is_tpu():
|
||||
requirements = _read_requirements("tpu.txt")
|
||||
elif _is_cpu():
|
||||
@ -626,7 +634,7 @@ def get_requirements() -> list[str]:
|
||||
else:
|
||||
raise ValueError(
|
||||
"Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
|
||||
"or CPU.")
|
||||
"OpenVINO, or CPU.")
|
||||
return requirements
|
||||
|
||||
|
||||
@ -680,7 +688,6 @@ setup(
|
||||
install_requires=get_requirements(),
|
||||
extras_require={
|
||||
"tensorizer": ["tensorizer>=2.9.0"],
|
||||
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
|
||||
"runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
|
||||
"audio": ["librosa", "soundfile"], # Required for audio processing
|
||||
"video": ["decord"] # Required for video processing
|
||||
|
||||
@ -1,38 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import Cython.Compiler.Options
|
||||
from Cython.Build import cythonize
|
||||
from setuptools import setup
|
||||
|
||||
Cython.Compiler.Options.annotate = True
|
||||
|
||||
infiles = []
|
||||
|
||||
infiles += [
|
||||
"vllm/engine/llm_engine.py",
|
||||
"vllm/transformers_utils/detokenizer.py",
|
||||
"vllm/engine/output_processor/single_step.py",
|
||||
"vllm/outputs.py",
|
||||
"vllm/engine/output_processor/stop_checker.py",
|
||||
]
|
||||
|
||||
infiles += [
|
||||
"vllm/core/scheduler.py",
|
||||
"vllm/sequence.py",
|
||||
"vllm/core/block_manager.py",
|
||||
]
|
||||
|
||||
infiles += [
|
||||
"vllm/model_executor/layers/sampler.py",
|
||||
"vllm/sampling_params.py",
|
||||
"vllm/utils.py",
|
||||
]
|
||||
|
||||
setup(ext_modules=cythonize(infiles,
|
||||
annotate=False,
|
||||
force=True,
|
||||
compiler_directives={
|
||||
'language_level': "3",
|
||||
'infer_types': True
|
||||
}))
|
||||
|
||||
# example usage: python3 build_cython.py build_ext --inplace
|
||||
@ -1,5 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import copy
|
||||
|
||||
import pickle
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -9,63 +10,32 @@ from vllm.compilation.pass_manager import PostGradPassManager
|
||||
from vllm.config import CompilationConfig
|
||||
|
||||
|
||||
# dummy custom pass that doesn't inherit
|
||||
def simple_callable(graph: torch.fx.Graph):
|
||||
pass
|
||||
|
||||
|
||||
# Should fail to add directly to the pass manager
|
||||
def test_bad_callable():
|
||||
config = CompilationConfig().pass_config
|
||||
|
||||
pass_manager = PostGradPassManager()
|
||||
pass_manager.configure(config)
|
||||
|
||||
with pytest.raises(AssertionError):
|
||||
pass_manager.add(simple_callable) # noqa, type wrong on purpose
|
||||
|
||||
|
||||
# Pass that inherits from InductorPass
|
||||
class ProperPass(InductorPass):
|
||||
|
||||
def __call__(self, graph: torch.fx.graph.Graph) -> None:
|
||||
pass
|
||||
callable_uuid = CallableInductorPass(simple_callable,
|
||||
InductorPass.hash_source(__file__))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"callable",
|
||||
"works, callable",
|
||||
[
|
||||
ProperPass(),
|
||||
# Can also wrap callables in CallableInductorPass for compliance
|
||||
CallableInductorPass(simple_callable),
|
||||
CallableInductorPass(simple_callable,
|
||||
InductorPass.hash_source(__file__))
|
||||
(False, simple_callable),
|
||||
(True, callable_uuid),
|
||||
(True, CallableInductorPass(simple_callable)),
|
||||
],
|
||||
)
|
||||
def test_pass_manager_uuid(callable):
|
||||
def test_pass_manager(works: bool, callable):
|
||||
config = CompilationConfig().pass_config
|
||||
|
||||
pass_manager = PostGradPassManager()
|
||||
pass_manager.configure(config)
|
||||
|
||||
# Check that UUID is different if the same pass is added 2x
|
||||
pass_manager.add(callable)
|
||||
uuid1 = pass_manager.uuid()
|
||||
pass_manager.add(callable)
|
||||
uuid2 = pass_manager.uuid()
|
||||
assert uuid1 != uuid2
|
||||
|
||||
# UUID should be the same as the original one,
|
||||
# as we constructed in the same way.
|
||||
pass_manager2 = PostGradPassManager()
|
||||
pass_manager2.configure(config)
|
||||
pass_manager2.add(callable)
|
||||
assert uuid1 == pass_manager2.uuid()
|
||||
|
||||
# UUID should be different due to config change
|
||||
config2 = copy.deepcopy(config)
|
||||
config2.enable_fusion = not config2.enable_fusion
|
||||
pass_manager3 = PostGradPassManager()
|
||||
pass_manager3.configure(config2)
|
||||
pass_manager3.add(callable)
|
||||
assert uuid1 != pass_manager3.uuid()
|
||||
# Try to add the callable to the pass manager
|
||||
if works:
|
||||
pass_manager.add(callable)
|
||||
pickle.dumps(pass_manager)
|
||||
else:
|
||||
with pytest.raises(AssertionError):
|
||||
pass_manager.add(callable)
|
||||
|
||||
@ -273,7 +273,8 @@ class HfRunner:
|
||||
def get_default_device(self):
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
return ("cpu" if current_platform.is_cpu() else "cuda")
|
||||
return ("cpu" if current_platform.is_cpu()
|
||||
or current_platform.is_openvino() else "cuda")
|
||||
|
||||
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
|
||||
if x is None or isinstance(x, (bool, )):
|
||||
|
||||
@ -175,8 +175,6 @@ TEXT_GENERATION_MODELS = {
|
||||
"inceptionai/jais-13b-chat": PPTestSettings.fast(),
|
||||
"ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
|
||||
"meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
|
||||
# Tests TransformersModel
|
||||
"ArthurZ/Ilama-3.2-1B": PPTestSettings.fast(),
|
||||
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
|
||||
"openbmb/MiniCPM3-4B": PPTestSettings.fast(),
|
||||
# Uses Llama
|
||||
@ -245,7 +243,6 @@ TEST_MODELS = [
|
||||
# [LANGUAGE GENERATION]
|
||||
"microsoft/Phi-3.5-MoE-instruct",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
# "ArthurZ/Ilama-3.2-1B", NOTE: Uncomment after #13905
|
||||
"ibm/PowerLM-3b",
|
||||
# [LANGUAGE EMBEDDING]
|
||||
"intfloat/e5-mistral-7b-instruct",
|
||||
|
||||
@ -21,9 +21,18 @@ def test_collective_rpc(tp_size, backend):
|
||||
def echo_rank(self):
|
||||
return self.rank
|
||||
|
||||
from vllm.worker.worker import Worker
|
||||
|
||||
class MyWorker(Worker):
|
||||
|
||||
def echo_rank(self):
|
||||
return self.rank
|
||||
|
||||
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
|
||||
enforce_eager=True,
|
||||
load_format="dummy",
|
||||
tensor_parallel_size=tp_size,
|
||||
distributed_executor_backend=backend)
|
||||
assert llm.collective_rpc(echo_rank) == list(range(tp_size))
|
||||
distributed_executor_backend=backend,
|
||||
worker_cls=MyWorker)
|
||||
for method in ["echo_rank", echo_rank]:
|
||||
assert llm.collective_rpc(method) == list(range(tp_size))
|
||||
|
||||
@ -107,10 +107,8 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
|
||||
# Call the function and get the result
|
||||
result = apply_hf_chat_template(
|
||||
tokenizer,
|
||||
trust_remote_code=True,
|
||||
conversation=mock_request.messages,
|
||||
chat_template=mock_request.chat_template or template_content,
|
||||
tools=None,
|
||||
add_generation_prompt=mock_request.add_generation_prompt,
|
||||
continue_final_message=mock_request.continue_final_message,
|
||||
)
|
||||
|
||||
@ -1,145 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# a reasoning and tool calling model
|
||||
MODEL_NAME = "Qwen/QwQ-32B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server(): # noqa: F811
|
||||
args = [
|
||||
"--max-model-len", "8192", "--enforce-eager", "--enable-reasoning",
|
||||
"--reasoning-parser", "deepseek_r1", "--enable-auto-tool-choice",
|
||||
"--tool-call-parser", "hermes"
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
TOOLS = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type":
|
||||
"string",
|
||||
"description":
|
||||
"The city to find the weather for, e.g. 'San Francisco'"
|
||||
},
|
||||
"state": {
|
||||
"type":
|
||||
"string",
|
||||
"description":
|
||||
"the two-letter abbreviation for the state that the city is"
|
||||
" in, e.g. 'CA' which would mean 'California'"
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"]
|
||||
}
|
||||
},
|
||||
"required": ["city", "state", "unit"]
|
||||
}
|
||||
}
|
||||
}]
|
||||
|
||||
MESSAGES = [{
|
||||
"role": "user",
|
||||
"content": "Hi! How are you doing today?"
|
||||
}, {
|
||||
"role": "assistant",
|
||||
"content": "I'm doing well! How can I help you?"
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
|
||||
}]
|
||||
|
||||
FUNC_NAME = "get_current_weather"
|
||||
FUNC_ARGS = """{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}"""
|
||||
|
||||
|
||||
def extract_reasoning_and_calls(chunks: list):
|
||||
reasoning_content = ""
|
||||
tool_call_idx = -1
|
||||
arguments = []
|
||||
function_names = []
|
||||
for chunk in chunks:
|
||||
if chunk.choices[0].delta.tool_calls:
|
||||
tool_call = chunk.choices[0].delta.tool_calls[0]
|
||||
if tool_call.index != tool_call_idx:
|
||||
tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
|
||||
arguments.append("")
|
||||
function_names.append("")
|
||||
|
||||
if tool_call.function:
|
||||
if tool_call.function.name:
|
||||
function_names[tool_call_idx] = tool_call.function.name
|
||||
|
||||
if tool_call.function.arguments:
|
||||
arguments[tool_call_idx] += tool_call.function.arguments
|
||||
else:
|
||||
if hasattr(chunk.choices[0].delta, "reasoning_content"):
|
||||
reasoning_content += chunk.choices[0].delta.reasoning_content
|
||||
return reasoning_content, arguments, function_names
|
||||
|
||||
|
||||
# test streaming
|
||||
@pytest.mark.asyncio
|
||||
async def test_chat_streaming_of_tool_and_reasoning(
|
||||
client: openai.AsyncOpenAI):
|
||||
|
||||
stream = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=MESSAGES,
|
||||
tools=TOOLS,
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
)
|
||||
|
||||
chunks = []
|
||||
async for chunk in stream:
|
||||
chunks.append(chunk)
|
||||
|
||||
reasoning_content, arguments, function_names = extract_reasoning_and_calls(
|
||||
chunks)
|
||||
assert len(reasoning_content) > 0
|
||||
assert len(function_names) > 0 and function_names[0] == FUNC_NAME
|
||||
assert len(arguments) > 0 and arguments[0] == FUNC_ARGS
|
||||
|
||||
|
||||
# test full generate
|
||||
@pytest.mark.asyncio
|
||||
async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
|
||||
|
||||
tool_calls = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=MESSAGES,
|
||||
tools=TOOLS,
|
||||
temperature=0.0,
|
||||
stream=False,
|
||||
)
|
||||
|
||||
assert len(tool_calls.choices[0].message.reasoning_content) > 0
|
||||
assert tool_calls.choices[0].message.tool_calls[0].function.name \
|
||||
== FUNC_NAME
|
||||
assert tool_calls.choices[0].message.tool_calls[0].function.arguments \
|
||||
== FUNC_ARGS
|
||||
@ -87,7 +87,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.finish_reason == "length"
|
||||
assert chat_completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=10, prompt_tokens=6287, total_tokens=6297)
|
||||
completion_tokens=10, prompt_tokens=6299, total_tokens=6309)
|
||||
|
||||
message = choice.message
|
||||
message = chat_completion.choices[0].message
|
||||
@ -180,7 +180,7 @@ async def test_single_chat_session_video_base64encoded(
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.finish_reason == "length"
|
||||
assert chat_completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=10, prompt_tokens=6287, total_tokens=6297)
|
||||
completion_tokens=10, prompt_tokens=6299, total_tokens=6309)
|
||||
|
||||
message = choice.message
|
||||
message = chat_completion.choices[0].message
|
||||
|
||||
@ -4,13 +4,10 @@ import warnings
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from packaging.version import Version
|
||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.chat_utils import (_resolve_hf_chat_template,
|
||||
_try_extract_ast, load_chat_template,
|
||||
from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
|
||||
parse_chat_messages,
|
||||
parse_chat_messages_futures,
|
||||
resolve_chat_template_content_format)
|
||||
@ -26,10 +23,8 @@ EXAMPLES_DIR = VLLM_PATH / "examples"
|
||||
PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
|
||||
ULTRAVOX_MODEL_ID = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
|
||||
QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
|
||||
QWEN25VL_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
||||
LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B"
|
||||
HERMES_MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
@ -708,70 +703,25 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
|
||||
|
||||
vllm_result = apply_hf_chat_template(
|
||||
tokenizer,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
conversation=conversation,
|
||||
chat_template=None,
|
||||
tools=None,
|
||||
add_generation_prompt=True,
|
||||
)
|
||||
|
||||
assert hf_result == vllm_result
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
QWEN2VL_MODEL_ID, # tokenizer.chat_template is of type str
|
||||
HERMES_MODEL_ID, # tokenizer.chat_template is of type dict
|
||||
])
|
||||
@pytest.mark.parametrize("use_tools", [True, False])
|
||||
def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
|
||||
"""checks that chat_template is a dict type for HF models."""
|
||||
|
||||
# Build the tokenizer group and grab the underlying tokenizer
|
||||
tokenizer_group = TokenizerGroup(
|
||||
model,
|
||||
enable_lora=False,
|
||||
max_num_seqs=5,
|
||||
max_input_length=None,
|
||||
)
|
||||
tokenizer = tokenizer_group.tokenizer
|
||||
|
||||
tools = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema
|
||||
}
|
||||
}] if use_tools else None
|
||||
|
||||
# Test detecting the tokenizer's chat_template
|
||||
chat_template = _resolve_hf_chat_template(
|
||||
tokenizer,
|
||||
chat_template=None,
|
||||
tools=tools,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
assert isinstance(chat_template, str)
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("model", "expected_format"),
|
||||
[(PHI3V_MODEL_ID, "string"),
|
||||
(QWEN2VL_MODEL_ID, "openai"),
|
||||
(QWEN25VL_MODEL_ID, "openai"),
|
||||
(ULTRAVOX_MODEL_ID, "string"),
|
||||
(MLLAMA_MODEL_ID, "openai"),
|
||||
(LLAMA_GUARD_MODEL_ID, "openai")],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_resolve_content_format_hf_defined(model, expected_format):
|
||||
if model == QWEN25VL_MODEL_ID and Version(TRANSFORMERS_VERSION) < Version(
|
||||
"4.49.0"):
|
||||
pytest.skip("Qwen2.5-VL requires transformers>=4.49.0")
|
||||
|
||||
tokenizer_group = TokenizerGroup(
|
||||
model,
|
||||
enable_lora=False,
|
||||
@ -780,13 +730,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
||||
)
|
||||
tokenizer = tokenizer_group.tokenizer
|
||||
|
||||
# Test detecting the tokenizer's chat_template
|
||||
chat_template = _resolve_hf_chat_template(
|
||||
tokenizer,
|
||||
chat_template=None,
|
||||
tools=None,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
chat_template = tokenizer.chat_template
|
||||
assert isinstance(chat_template, str)
|
||||
|
||||
print("[TEXT]")
|
||||
@ -796,10 +740,8 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
||||
|
||||
resolved_format = resolve_chat_template_content_format(
|
||||
None, # Test detecting the tokenizer's chat_template
|
||||
None,
|
||||
"auto",
|
||||
tokenizer,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
assert resolved_format == expected_format
|
||||
@ -849,10 +791,8 @@ def test_resolve_content_format_examples(template_path, expected_format):
|
||||
|
||||
resolved_format = resolve_chat_template_content_format(
|
||||
chat_template,
|
||||
None,
|
||||
"auto",
|
||||
dummy_tokenizer,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
assert resolved_format == expected_format
|
||||
|
||||
@ -1,22 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.config import LoadFormat
|
||||
|
||||
test_model = "openai-community/gpt2"
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
|
||||
|
||||
|
||||
def test_model_loader_download_files(vllm_runner):
|
||||
with vllm_runner(test_model,
|
||||
load_format=LoadFormat.FASTSAFETENSORS) as llm:
|
||||
deserialized_outputs = llm.generate(prompts, sampling_params)
|
||||
assert deserialized_outputs
|
||||
@ -1,46 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import glob
|
||||
import tempfile
|
||||
|
||||
import huggingface_hub.constants
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.model_loader.weight_utils import (
|
||||
download_weights_from_hf, fastsafetensors_weights_iterator,
|
||||
safetensors_weights_iterator)
|
||||
|
||||
|
||||
def test_fastsafetensors_model_loader():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
huggingface_hub.constants.HF_HUB_OFFLINE = False
|
||||
download_weights_from_hf("openai-community/gpt2",
|
||||
allow_patterns=["*.safetensors"],
|
||||
cache_dir=tmpdir)
|
||||
safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True)
|
||||
assert len(safetensors) > 0
|
||||
|
||||
fastsafetensors_tensors = {}
|
||||
hf_safetensors_tensors = {}
|
||||
|
||||
for name, tensor in fastsafetensors_weights_iterator(
|
||||
safetensors, True):
|
||||
fastsafetensors_tensors[name] = tensor
|
||||
|
||||
for name, tensor in safetensors_weights_iterator(safetensors, True):
|
||||
hf_safetensors_tensors[name] = tensor
|
||||
|
||||
assert len(fastsafetensors_tensors) == len(hf_safetensors_tensors)
|
||||
|
||||
for name, fastsafetensors_tensor in fastsafetensors_tensors.items():
|
||||
fastsafetensors_tensor = fastsafetensors_tensor.to('cpu')
|
||||
assert fastsafetensors_tensor.dtype == hf_safetensors_tensors[
|
||||
name].dtype
|
||||
assert fastsafetensors_tensor.shape == hf_safetensors_tensors[
|
||||
name].shape
|
||||
assert torch.all(
|
||||
fastsafetensors_tensor.eq(hf_safetensors_tensors[name]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_fastsafetensors_model_loader()
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from unittest.mock import patch
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -8,6 +8,7 @@ import torch
|
||||
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
|
||||
from vllm.platforms.cpu import CpuPlatform
|
||||
from vllm.platforms.cuda import CudaPlatform
|
||||
from vllm.platforms.openvino import OpenVinoPlatform
|
||||
from vllm.platforms.rocm import RocmPlatform
|
||||
from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL
|
||||
|
||||
@ -20,9 +21,9 @@ def clear_cache():
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"])
|
||||
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
|
||||
@pytest.mark.parametrize("use_v1", [True, False])
|
||||
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
|
||||
@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
|
||||
def test_env(
|
||||
name: str,
|
||||
use_v1: bool,
|
||||
@ -48,8 +49,15 @@ def test_env(
|
||||
RocmPlatform()):
|
||||
backend = get_attn_backend(16, torch.float16, torch.float16,
|
||||
16, False)
|
||||
EXPECTED = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
|
||||
EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
|
||||
assert backend.get_name() == EXPECTED
|
||||
elif device == "openvino":
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
OpenVinoPlatform()), patch.dict('sys.modules',
|
||||
{'openvino': Mock()}):
|
||||
backend = get_attn_backend(16, torch.float16, torch.float16,
|
||||
16, False)
|
||||
assert backend.get_name() == "OPENVINO"
|
||||
else:
|
||||
if name in ["XFORMERS", "FLASHINFER"]:
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
|
||||
@ -606,51 +606,6 @@ def test_marlin_qqq_gemm(
|
||||
assert max_diff < 0.04
|
||||
|
||||
|
||||
def test_marlin_gemm_subset_input():
|
||||
quant_type = scalar_types.uint4b8
|
||||
group_size = 128
|
||||
|
||||
size_m, size_k, size_n = 32, 1024, 2048
|
||||
big_m = size_m * 2
|
||||
big_k = size_k * 2
|
||||
|
||||
a_input = rand_data((big_m, big_k))[8:size_m + 8, 8:size_k + 8]
|
||||
b_weight = rand_data((size_k, size_n))
|
||||
|
||||
w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
|
||||
b_weight, quant_type, group_size, False)
|
||||
|
||||
marlin_zp = marlin_make_empty_g_idx(marlin_s.device)
|
||||
workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_MAX_PARALLEL)
|
||||
|
||||
output = ops.gptq_marlin_gemm(
|
||||
a_input,
|
||||
marlin_q_w,
|
||||
marlin_s,
|
||||
marlin_zp,
|
||||
g_idx,
|
||||
sort_indices,
|
||||
workspace.scratch,
|
||||
quant_type,
|
||||
a_input.shape[0],
|
||||
b_weight.shape[1],
|
||||
a_input.shape[1],
|
||||
is_k_full=True,
|
||||
has_zp=False,
|
||||
use_atomic_add=False,
|
||||
use_fp32_reduce=True,
|
||||
is_zp_float=False,
|
||||
)
|
||||
output_ref = torch.matmul(a_input, w_ref)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
|
||||
max_diff = compute_max_diff(output, output_ref)
|
||||
|
||||
assert max_diff < 0.04
|
||||
|
||||
|
||||
def test_marlin_gemm_opcheck():
|
||||
size_m = 2048
|
||||
size_n = 4096
|
||||
|
||||
@ -3,11 +3,8 @@
|
||||
|
||||
Run `pytest tests/kernels/test_moe.py`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from torch.nn import Parameter
|
||||
from torch.nn import functional as F
|
||||
from transformers import MixtralConfig
|
||||
from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
|
||||
|
||||
@ -40,7 +37,6 @@ TOP_KS = [2, 6]
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("ep_size", EP_SIZE)
|
||||
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("padding", [True, False])
|
||||
def test_fused_moe(
|
||||
m: int,
|
||||
n: int,
|
||||
@ -49,7 +45,6 @@ def test_fused_moe(
|
||||
topk: int,
|
||||
ep_size: int,
|
||||
dtype: torch.dtype,
|
||||
padding: bool,
|
||||
):
|
||||
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
|
||||
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
|
||||
@ -70,23 +65,6 @@ def test_fused_moe(
|
||||
else:
|
||||
e_map = None
|
||||
|
||||
torch_output = torch_moe(a, w1, w2, score, topk, e_map)
|
||||
iterative_output = iterative_moe(a,
|
||||
w1,
|
||||
w2,
|
||||
score,
|
||||
topk,
|
||||
global_num_experts=e,
|
||||
expert_map=e_map,
|
||||
renormalize=False)
|
||||
|
||||
# Pad the weight if moe padding is enabled
|
||||
if padding:
|
||||
w1 = F.pad(w1, (0, 128), "constant", 0)[..., 0:-128]
|
||||
torch.cuda.empty_cache()
|
||||
w2 = F.pad(w2, (0, 128), "constant", 0)[..., 0:-128]
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
triton_output = fused_moe(a,
|
||||
w1,
|
||||
w2,
|
||||
@ -95,7 +73,16 @@ def test_fused_moe(
|
||||
global_num_experts=e,
|
||||
expert_map=e_map,
|
||||
renormalize=False)
|
||||
torch_output = torch_moe(a, w1, w2, score, topk, e_map)
|
||||
torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
|
||||
iterative_output = iterative_moe(a,
|
||||
w1,
|
||||
w2,
|
||||
score,
|
||||
topk,
|
||||
global_num_experts=e,
|
||||
expert_map=e_map,
|
||||
renormalize=False)
|
||||
torch.testing.assert_close(iterative_output,
|
||||
torch_output,
|
||||
atol=2e-2,
|
||||
@ -215,9 +202,8 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
|
||||
|
||||
@pytest.mark.parametrize("dtype",
|
||||
[torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("padding", [True, False])
|
||||
@torch.inference_mode()
|
||||
def test_mixtral_moe(dtype: torch.dtype, padding: bool):
|
||||
def test_mixtral_moe(dtype: torch.dtype):
|
||||
"""Make sure our Mixtral MoE implementation agrees with the one from
|
||||
huggingface."""
|
||||
|
||||
@ -247,17 +233,6 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool):
|
||||
# vLLM uses 1D query [num_tokens, hidden_dim]
|
||||
vllm_inputs = hf_inputs.flatten(0, 1)
|
||||
|
||||
# Pad the weight if moe padding is enabled
|
||||
if padding:
|
||||
vllm_moe.experts.w13_weight = Parameter(F.pad(
|
||||
vllm_moe.experts.w13_weight, (0, 128), "constant", 0)[..., 0:-128],
|
||||
requires_grad=False)
|
||||
torch.cuda.empty_cache()
|
||||
vllm_moe.experts.w2_weight = Parameter(F.pad(
|
||||
vllm_moe.experts.w2_weight, (0, 128), "constant", 0)[..., 0:-128],
|
||||
requires_grad=False)
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
# Run forward passes for both MoE blocks
|
||||
hf_states, _ = hf_moe.forward(hf_inputs)
|
||||
vllm_states = vllm_moe.forward(vllm_inputs)
|
||||
|
||||
@ -26,7 +26,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
|
||||
# Test standard ROCm attention
|
||||
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
|
||||
assert (backend.get_name() == "ROCM_FLASH"
|
||||
or backend.get_name() == "TRITON_ATTN_VLLM_V1")
|
||||
or backend.get_name() == "ROCM_ATTN_VLLM_V1")
|
||||
|
||||
# mla test for deepseek related
|
||||
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
|
||||
|
||||
@ -1,8 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import asyncio
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
import vllm.envs as env
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
@ -11,9 +13,35 @@ from vllm.lora.request import LoRARequest
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import merge_async_iterators
|
||||
|
||||
MODEL_PATH = "THUDM/chatglm3-6b"
|
||||
LORA_RANK = 64
|
||||
DEFAULT_MAX_LORAS = 4 * 3
|
||||
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
||||
LORA_MODULE_DOWNLOAD_PATH = None # Populated by download_and_prepare_lora_module() #noqa
|
||||
LORA_RANK = 8
|
||||
DEFAULT_MAX_LORAS = 16 * 3
|
||||
|
||||
|
||||
def download_and_prepare_lora_module():
|
||||
"""
|
||||
Request submission is expensive when the LoRA adapters have their own
|
||||
tokenizers. This is because, for each request with a new LoRA adapter ID,
|
||||
the front-end loads the tokenizer from disk.
|
||||
|
||||
In this test, as we are comparing request processing times, we want to
|
||||
minimize any extra activity. To this effect, we download the LoRA
|
||||
adapter and remove all the tokenizer files, so the engine will default
|
||||
to the base model tokenizer.
|
||||
"""
|
||||
global LORA_MODULE_DOWNLOAD_PATH
|
||||
|
||||
LORA_MODULE_HF_PATH = "yard1/llama-2-7b-sql-lora-test"
|
||||
LORA_MODULE_DOWNLOAD_PATH = snapshot_download(repo_id=LORA_MODULE_HF_PATH)
|
||||
|
||||
tokenizer_files = [
|
||||
'added_tokens.json', 'tokenizer_config.json', 'tokenizer.json',
|
||||
'tokenizer.model'
|
||||
]
|
||||
for tokenizer_file in tokenizer_files:
|
||||
del_path = Path(LORA_MODULE_DOWNLOAD_PATH) / tokenizer_file
|
||||
del_path.unlink(missing_ok=True)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
@ -24,9 +52,11 @@ def v1(run_with_both_engines_lora):
|
||||
pass
|
||||
|
||||
|
||||
def get_lora_requests(lora_path) -> list[LoRARequest]:
|
||||
def get_lora_requests() -> list[LoRARequest]:
|
||||
lora_requests: list[LoRARequest] = [
|
||||
LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path)
|
||||
LoRARequest(lora_name=f"{i}",
|
||||
lora_int_id=i,
|
||||
lora_path=LORA_MODULE_DOWNLOAD_PATH)
|
||||
for i in range(1, DEFAULT_MAX_LORAS + 1)
|
||||
]
|
||||
return lora_requests
|
||||
@ -63,7 +93,7 @@ async def requests_processing_time(llm,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_add_lora(chatglm3_lora_files):
|
||||
async def test_add_lora():
|
||||
"""
|
||||
The add_lora function is used to pre-load some LoRA adapters into the
|
||||
engine in anticipation of future requests using these adapters. To test
|
||||
@ -73,7 +103,10 @@ async def test_add_lora(chatglm3_lora_files):
|
||||
We measure the request processing time in both cases and expect the time
|
||||
to be lower in the case with add_lora() calls.
|
||||
"""
|
||||
lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files)
|
||||
|
||||
download_and_prepare_lora_module()
|
||||
|
||||
lora_requests: list[LoRARequest] = get_lora_requests()
|
||||
|
||||
max_loras = len(set([lr.lora_int_id for lr in lora_requests]))
|
||||
# Create engine in eager-mode. Due to high max_loras, the CI can
|
||||
@ -85,7 +118,6 @@ async def test_add_lora(chatglm3_lora_files):
|
||||
max_lora_rank=LORA_RANK,
|
||||
max_model_len=128,
|
||||
gpu_memory_utilization=0.8, #avoid OOM
|
||||
trust_remote_code=True,
|
||||
enforce_eager=True)
|
||||
|
||||
# The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
|
||||
|
||||
@ -84,14 +84,12 @@ def v1(run_with_both_engines_lora):
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora(sql_lora_files):
|
||||
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
enable_lora=True,
|
||||
# also test odd max_num_seqs
|
||||
max_num_seqs=13,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=1,
|
||||
enable_chunked_prefill=True)
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=1,
|
||||
enable_chunked_prefill=True)
|
||||
generate_and_test(llm, sql_lora_files)
|
||||
|
||||
|
||||
|
||||
@ -24,10 +24,12 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
|
||||
)
|
||||
lora_request = LoRARequest("1", 1, sql_lora_files)
|
||||
assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
|
||||
prompt="prompt", lora_request=lora_request)
|
||||
request_id="request_id", prompt="prompt", lora_request=lora_request)
|
||||
assert reference_tokenizer.encode(
|
||||
"prompt") == await tokenizer_group.encode_async(
|
||||
prompt="prompt", lora_request=lora_request)
|
||||
request_id="request_id",
|
||||
prompt="prompt",
|
||||
lora_request=lora_request)
|
||||
assert isinstance(tokenizer_group.get_lora_tokenizer(None),
|
||||
PreTrainedTokenizerBase)
|
||||
assert tokenizer_group.get_lora_tokenizer(
|
||||
|
||||
@ -7,10 +7,7 @@ from vllm.model_executor.custom_op import CustomOp
|
||||
from vllm.model_executor.layers.activation import (GeluAndMul,
|
||||
ReLUSquaredActivation,
|
||||
SiluAndMul)
|
||||
from vllm.model_executor.layers.layernorm import (
|
||||
RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm,
|
||||
rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
|
||||
|
||||
# Registered subclass for test
|
||||
@ -90,27 +87,3 @@ def test_enabled_ops_invalid(env: str):
|
||||
custom_ops=env.split(",")))
|
||||
with set_current_vllm_config(vllm_config):
|
||||
RMSNorm(1024).enabled()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("add_residual", [True, False])
@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
@pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"])
@pytest.mark.skipif(not current_platform.is_rocm(),
                    reason="AITER is a feature exclusive for ROCm")
def test_rms_norm_dispatch(add_residual: bool, use_rocm_aiter: str,
                           use_rocm_aiter_norm: str, monkeypatch):
    """
    Check which RMSNorm implementation dispatch_cuda_rmsnorm_func selects.

    The AITER variants are expected only when running on ROCm with both
    VLLM_ROCM_USE_AITER and VLLM_ROCM_USE_AITER_RMSNORM set to "1"; in every
    other combination the regular (fused-add or plain) kernels are expected.
    """
    monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
    monkeypatch.setenv("VLLM_ROCM_USE_AITER_RMSNORM", use_rocm_aiter_norm)
    rms_norm_func = dispatch_cuda_rmsnorm_func(add_residual)

    # Both switches arrive as "0"/"1" strings, hence the int() conversion.
    aiter_expected = (current_platform.is_rocm() and int(use_rocm_aiter)
                      and int(use_rocm_aiter_norm))

    if add_residual:
        expected_func = (rocm_aiter_fused_add_rms_norm
                         if aiter_expected else fused_add_rms_norm)
    else:
        expected_func = rocm_aiter_rms_norm if aiter_expected else rms_norm

    assert rms_norm_func == expected_func
|
||||
|
||||
@ -3,11 +3,7 @@
|
||||
|
||||
Run `pytest tests/models/test_models.py`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
@ -17,21 +13,7 @@ from ...utils import check_logprobs_close
|
||||
# https://github.com/vllm-project/vllm/issues/14524
|
||||
REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
|
||||
|
||||
# This list contains the models that use AITER kernels.
|
||||
# Models that are not in this list are skipped in the AITER tests.
|
||||
# When more AITER kernels are added, this list will not be
|
||||
# needed as all the models will be calling AITER kernels
|
||||
# in parts of the operators
|
||||
AITER_MODEL_LIST = [
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"openbmb/MiniCPM3-4B",
|
||||
"Qwen/Qwen-7B",
|
||||
"Qwen/Qwen2.5-0.5B-Instruct",
|
||||
"ehristoforu/Falcon3-MoE-2x7B-Insruct",
|
||||
]
|
||||
|
||||
|
||||
# @maybe_test_rocm_aiter
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
@ -87,24 +69,19 @@ AITER_MODEL_LIST = [
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize(
|
||||
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
|
||||
def test_models(hf_runner, vllm_runner, example_prompts, model: str,
|
||||
dtype: str, max_tokens: int, num_logprobs: int,
|
||||
use_rocm_aiter: bool, monkeypatch) -> None:
|
||||
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
if model in REQUIRES_V0:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
|
||||
if use_rocm_aiter and (model in AITER_MODEL_LIST):
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
|
||||
elif use_rocm_aiter and model not in AITER_MODEL_LIST:
|
||||
# Models that are not in AITER_MODEL_LIST are skipped in the AITER tests.
|
||||
# When more AITER kernels are added, this list will not be
|
||||
# needed as all the models will be calling AITER kernels
|
||||
# in parts of the operators
|
||||
pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
if model.startswith("THUDM/chatglm3"):
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
@ -123,10 +100,3 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
if use_rocm_aiter:
|
||||
# this is to ensure that vllm engine
|
||||
# has deallocated the memory before running the next
|
||||
# unit tests. On ROCm, when using AITER
|
||||
# the memory might not be deallocated completely
|
||||
# before running the next test case
|
||||
torch.cuda.synchronize()
|
||||
|
||||
@ -508,19 +508,6 @@ VLM_TEST_SETTINGS = {
|
||||
limit_mm_per_prompt={"image": 4},
|
||||
)],
|
||||
),
|
||||
# regression test for https://github.com/vllm-project/vllm/issues/15122
|
||||
"qwen2_5_vl-windows-attention": VLMTestInfo(
|
||||
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
|
||||
custom_test_opts=[CustomTestOptions(
|
||||
inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)],
|
||||
),
|
||||
}
|
||||
# yapf: enable
|
||||
|
||||
|
||||
@ -1,11 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Custom input builders for edge-cases in different models."""
|
||||
from io import BytesIO
|
||||
from typing import Callable
|
||||
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import (rescale_video_size, resize_video,
|
||||
sample_frames_from_video)
|
||||
@ -106,17 +102,3 @@ def different_patch_input_cases_internvl():
|
||||
build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
|
||||
build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
|
||||
]
|
||||
|
||||
|
||||
def windows_attention_image_qwen2_5_vl():
    """Build the single-image inputs for the Qwen2.5-VL regression case.

    Downloads the image referenced in
    https://github.com/vllm-project/vllm/issues/15122 and pairs it with a
    one-turn "Describe the image." chat prompt, using a size factor of 0.5.

    Returns:
        The result of ``build_single_image_inputs`` for the one image and
        one prompt.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server responds with an HTTP error status.
    """
    # image from regression issue: https://github.com/vllm-project/vllm/issues/15122
    image_url = "https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg"
    # Bound the request so an unreachable host cannot hang the test run, and
    # fail on HTTP errors instead of handing an error page to PIL.
    response = requests.get(image_url, timeout=30)
    response.raise_for_status()
    image = Image.open(BytesIO(response.content))

    question = "Describe the image."
    img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
    prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
              "<|im_start|>assistant\n")

    wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
    return build_single_image_inputs([image], [prompt], wrapped_sf)
|
||||
|
||||
@ -192,8 +192,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
|
||||
"TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
|
||||
trust_remote_code=True),
|
||||
"TeleFLMForCausalLM": _HfExamplesInfo("CofeAI/FLM-2-52B-Instruct-2407",
|
||||
trust_remote_code=True),
|
||||
"XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat",
|
||||
is_available_online=False,
|
||||
trust_remote_code=True),
|
||||
|
||||
@ -56,7 +56,7 @@ def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
def maybe_assert_ngram_worker(llm):
|
||||
# Verify the proposer worker is ngram if ngram is specified.
|
||||
if (llm.llm_engine.speculative_config is not None
|
||||
and llm.llm_engine.speculative_config.method == "ngram"):
|
||||
and llm.llm_engine.speculative_config.ngram_prompt_lookup_max > 0):
|
||||
from vllm.spec_decode.ngram_worker import NGramWorker
|
||||
assert isinstance(
|
||||
llm.llm_engine.model_executor.driver_worker.proposer_worker,
|
||||
|
||||
@ -7,39 +7,28 @@ from vllm import SamplingParams
|
||||
from .conftest import get_output_from_llm_generator
|
||||
|
||||
|
||||
@pytest.mark.parametrize("common_llm_kwargs",
|
||||
[{
|
||||
"model": "meta-llama/Llama-3.2-1B-Instruct",
|
||||
}])
|
||||
@pytest.mark.parametrize("common_llm_kwargs", [{
|
||||
"model": "meta-llama/Llama-3.2-1B-Instruct",
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
}])
|
||||
@pytest.mark.parametrize(
|
||||
"per_test_common_llm_kwargs",
|
||||
[
|
||||
{
|
||||
# Speculative max model len > overridden max model len should raise.
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"max_model_len": 129,
|
||||
},
|
||||
"max_model_len": 128,
|
||||
"speculative_max_model_len": 129,
|
||||
},
|
||||
{
|
||||
# Speculative max model len > draft max model len should raise.
|
||||
# https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"max_model_len": 2048 + 1,
|
||||
},
|
||||
"speculative_max_model_len": 2048 + 1,
|
||||
},
|
||||
{
|
||||
# Speculative max model len > target max model len should raise.
|
||||
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"max_model_len": 131072 + 1,
|
||||
},
|
||||
# https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
|
||||
"speculative_max_model_len": 131072 + 1,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{}])
|
||||
|
||||
@ -57,10 +57,8 @@ PRECISION = "float32"
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -97,19 +95,18 @@ def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs": False,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
},
|
||||
}, {
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
{
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs": True,
|
||||
"disable_logprobs_during_spec_decoding": True,
|
||||
},
|
||||
}])
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
128,
|
||||
])
|
||||
@ -122,19 +119,18 @@ def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
batch_size: int, output_len: int, seed: int,
|
||||
logprobs: int):
|
||||
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -155,10 +151,8 @@ def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -199,10 +193,8 @@ def test_eagle_e2e_greedy_correctness_cuda_graph(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
@ -244,10 +236,8 @@ def test_eagle_e2e_greedy_correctness_with_preemption(
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": k,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": k,
|
||||
}
|
||||
# Try a range of num. speculative tokens
|
||||
for k in range(1, 1 + MAX_SPEC_TOKENS)
|
||||
@ -287,13 +277,12 @@ def test_eagle_different_k(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_by_batch_size": 4,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"speculative_disable_by_batch_size": 4
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
@ -335,10 +324,8 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "yuhuili/EAGLE-llama2-chat-7B",
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": "yuhuili/EAGLE-llama2-chat-7B",
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
@ -385,10 +372,8 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
@ -435,10 +420,8 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "yuhuili/EAGLE-Qwen2-7B-Instruct",
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": "yuhuili/EAGLE-Qwen2-7B-Instruct",
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@ -23,10 +23,8 @@ MAIN_MODEL = "JackFram/llama-68m"
|
||||
[
|
||||
{
|
||||
# Identical models.
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@ -59,33 +57,26 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
|
||||
# Skip cuda graph recording for fast test.
|
||||
"enforce_eager": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
|
||||
{
|
||||
"speculative_model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
# Explicitly specify draft model quantization
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
|
||||
"num_speculative_tokens": 5,
|
||||
"quantization": "gptq",
|
||||
},
|
||||
"speculative_model_quantization": "gptq",
|
||||
},
|
||||
# Explicitly specify GPTQ-based draft model to use marlin quantization
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
|
||||
"num_speculative_tokens": 5,
|
||||
"quantization": "marlin",
|
||||
},
|
||||
"speculative_model_quantization": "marlin",
|
||||
},
|
||||
# Not explicitly specify draft model quantization
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
|
||||
"num_speculative_tokens": 5,
|
||||
"quantization": None,
|
||||
},
|
||||
"speculative_model_quantization": None,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@ -116,16 +107,15 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
|
||||
|
||||
# Skip cuda graph recording for fast test.
|
||||
"enforce_eager": True,
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_mqa_scorer": True,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_disable_mqa_scorer": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
@ -137,7 +127,7 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
|
||||
def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
|
||||
output_len: int, seed: int):
|
||||
"""Verify that speculative decoding generates the same output
|
||||
"""Verify that ngram speculative decoding generates the same output
|
||||
with batch expansion scorer and mqa scorer.
|
||||
"""
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
|
||||
@ -27,19 +27,18 @@ from .conftest import run_equality_correctness_test_tp
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
[
|
||||
"--speculative_config",
|
||||
str({
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
}),
|
||||
"--speculative-model",
|
||||
"JackFram/llama-68m",
|
||||
"--num-speculative-tokens",
|
||||
"3",
|
||||
],
|
||||
[
|
||||
"--speculative_config",
|
||||
str({
|
||||
"model": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
}),
|
||||
"--speculative-model",
|
||||
"[ngram]",
|
||||
"--num-speculative-tokens",
|
||||
"5",
|
||||
"--ngram-prompt-lookup-max",
|
||||
"3",
|
||||
],
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [2])
|
||||
@ -84,24 +83,23 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
]])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
|
||||
@pytest.mark.parametrize(
|
||||
"model, test_llm_kwargs",
|
||||
[("JackFram/llama-68m", [
|
||||
"--speculative_config",
|
||||
str({
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"draft_tensor_parallel_size": 1,
|
||||
}),
|
||||
]),
|
||||
("ibm-granite/granite-3b-code-instruct", [
|
||||
"--speculative_config",
|
||||
str({
|
||||
"model": "ibm-granite/granite-3b-code-instruct",
|
||||
"num_speculative_tokens": 5,
|
||||
"draft_tensor_parallel_size": 1,
|
||||
}),
|
||||
])])
|
||||
@pytest.mark.parametrize("model, test_llm_kwargs",
|
||||
[("JackFram/llama-68m", [
|
||||
"--speculative-model",
|
||||
"JackFram/llama-68m",
|
||||
"--num_speculative-tokens",
|
||||
"5",
|
||||
"--speculative-draft-tensor-parallel-size",
|
||||
"1",
|
||||
]),
|
||||
("ibm-granite/granite-3b-code-instruct", [
|
||||
"--speculative-model",
|
||||
"ibm-granite/granite-3b-code-instruct",
|
||||
"--num_speculative-tokens",
|
||||
"5",
|
||||
"--speculative-draft-tensor-parallel-size",
|
||||
"1",
|
||||
])])
|
||||
@pytest.mark.parametrize("batch_size", [2])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
|
||||
@ -146,19 +144,18 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
|
||||
@pytest.mark.parametrize("model, test_llm_kwargs",
|
||||
[("JackFram/llama-68m", [
|
||||
"--speculative_config",
|
||||
str({
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
}),
|
||||
"--speculative-model",
|
||||
"JackFram/llama-68m",
|
||||
"--num_speculative-tokens",
|
||||
"3",
|
||||
]),
|
||||
("JackFram/llama-68m", [
|
||||
"--speculative_config",
|
||||
str({
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"draft_tensor_parallel_size": 1,
|
||||
}),
|
||||
"--speculative-model",
|
||||
"JackFram/llama-68m",
|
||||
"--num_speculative-tokens",
|
||||
"3",
|
||||
"--speculative-draft-tensor-parallel-size",
|
||||
"1",
|
||||
])])
|
||||
@pytest.mark.parametrize("logprobs", [None, 2])
|
||||
@pytest.mark.parametrize("batch_size", [2])
|
||||
|
||||
@ -24,7 +24,12 @@ SPEC_MODEL = "JackFram/llama-68m"
|
||||
"4",
|
||||
]])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
|
||||
[],
|
||||
[
|
||||
"--speculative-model",
|
||||
f"{SPEC_MODEL}",
|
||||
"--num-speculative-tokens",
|
||||
"5",
|
||||
],
|
||||
])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
|
||||
@pytest.mark.parametrize(
|
||||
@ -32,12 +37,8 @@ SPEC_MODEL = "JackFram/llama-68m"
|
||||
[
|
||||
#TODO(wooyeon): add spec_draft_dp=2 case
|
||||
[
|
||||
"--speculative_config",
|
||||
str({
|
||||
"model": f"{SPEC_MODEL}",
|
||||
"num_speculative_tokens": 5,
|
||||
"draft_tensor_parallel_size": 1,
|
||||
}),
|
||||
"--speculative-draft-tensor-parallel-size",
|
||||
"1",
|
||||
],
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [2])
|
||||
@ -77,14 +78,15 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
[
|
||||
"--speculative-model",
|
||||
f"{SPEC_MODEL}",
|
||||
"--num-speculative-tokens",
|
||||
"5",
|
||||
|
||||
# Artificially limit the draft model max model len; this forces vLLM
|
||||
# to skip speculation once the sequences grow beyond 32-k tokens.
|
||||
"--speculative_config",
|
||||
str({
|
||||
"model": f"{SPEC_MODEL}",
|
||||
"num_speculative_tokens": 5,
|
||||
"max_model_len": 32,
|
||||
}),
|
||||
"--speculative-max-model-len",
|
||||
"32",
|
||||
],
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [8])
|
||||
|
||||
@ -20,19 +20,16 @@ from .conftest import run_equality_correctness_test
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
}, {
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs": True,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
}, {
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs_during_spec_decoding": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [8])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
@ -51,20 +48,19 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
|
||||
as well as with and without chunked prefill.
|
||||
"""
|
||||
maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs)
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -77,19 +73,16 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
}, {
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 6,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
}, {
|
||||
"speculative_model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 6,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [8])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
@ -105,19 +98,18 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
|
||||
output_len: int, seed: int, logprobs: int):
|
||||
"""Verify logprob greedy equality with different speculation lens.
|
||||
"""
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -133,15 +125,13 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize(
|
||||
"test_llm_kwargs",
|
||||
[{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs": False,
|
||||
# Artificially limit the draft model max model len; this forces
|
||||
# vLLM to skip speculation once the sequences grow beyond 32-k
|
||||
# tokens.
|
||||
"max_model_len": 32,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
|
||||
# Artificially limit the draft model max model len; this forces vLLM
|
||||
# to skip speculation once the sequences grow beyond 32-k tokens.
|
||||
"speculative_max_model_len": 32,
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [8])
|
||||
@pytest.mark.parametrize(
|
||||
@ -159,19 +149,18 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
|
||||
seed: int, logprobs: int):
|
||||
"""Verify logprobs greedy equality when some sequences skip speculation.
|
||||
"""
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -184,13 +173,12 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
@ -260,13 +248,12 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs": True,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs_during_spec_decoding": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@pytest.mark.parametrize("batch_size", [4])
|
||||
@pytest.mark.parametrize(
|
||||
@ -283,16 +270,15 @@ def test_logprobs_disabled(vllm_runner, common_llm_kwargs,
|
||||
"""Check the behavior when logprobs are disabled.
|
||||
Token choices should match with the base model.
|
||||
"""
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
@ -60,10 +60,8 @@ PRECISION = "float32"
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -109,18 +107,14 @@ def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs": True,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs_during_spec_decoding": True,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -138,20 +132,19 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
prefill_chunk_size: int):
|
||||
"""Verify greedy equality with different batch size."""
|
||||
maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
max_output_len=output_len,
|
||||
seed=seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
max_output_len=output_len,
|
||||
seed=seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -172,10 +165,8 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -223,10 +214,8 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
@ -275,10 +264,8 @@ def test_medusa_e2e_greedy_correctness_with_preemption(
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": k,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": k,
|
||||
}
|
||||
# Try a range of num. speculative tokens
|
||||
for k in range(1, 1 + MAX_SPEC_TOKENS)
|
||||
@ -325,13 +312,12 @@ def test_medusa_different_k(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_by_batch_size": 4,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"speculative_disable_by_batch_size": 4
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
@ -373,17 +359,16 @@ def test_medusa_disable_queue(vllm_runner, common_llm_kwargs,
|
||||
|
||||
# Main model
|
||||
"model_name": MAIN_MODEL,
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"speculative_disable_by_batch_size": 4
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_by_batch_size": 4,
|
||||
"disable_mqa_scorer": True,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_disable_mqa_scorer": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
|
||||
@ -62,9 +62,7 @@ PRECISION = "float32"
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -110,16 +108,12 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"disable_logprobs": True,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"disable_logprobs_during_spec_decoding": True,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [8])
|
||||
@ -139,20 +133,19 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
# up sampling different tokens at the tail (ie top tokens don't change).
|
||||
# TL;DR: sd+cp == org+cp but sd+cp != org..is this expected?
|
||||
maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs)
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
max_output_len=output_len,
|
||||
seed=seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
max_output_len=output_len,
|
||||
seed=seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -174,9 +167,7 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [2048])
|
||||
@ -218,10 +209,8 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs,
|
||||
# Main model
|
||||
"model_name": MAIN_MODEL,
|
||||
|
||||
# Speculative config
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
},
|
||||
# Speculative model
|
||||
"speculative_model": SPEC_MODEL,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
|
||||
@ -285,9 +274,7 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
@ -339,9 +326,7 @@ def test_mlp_e2e_greedy_correctness_with_preemption(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
@ -397,10 +382,8 @@ def test_mlp_e2e_greedy_correctness_with_padding(
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"num_speculative_tokens": k,
|
||||
},
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"num_speculative_tokens": k,
|
||||
}
|
||||
# Try a range of num. speculative tokens
|
||||
for k in range(1, 1 + MAX_SPEC_TOKENS)
|
||||
@ -447,12 +430,11 @@ def test_mlp_different_k(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"disable_by_batch_size": 4,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": SPEC_MODEL,
|
||||
"speculative_disable_by_batch_size": 4
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
@ -493,15 +475,14 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
|
||||
|
||||
# Skip cuda graph recording for fast test.
|
||||
"enforce_eager": True,
|
||||
"speculative_model": SPEC_MODEL,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": SPEC_MODEL,
|
||||
"disable_mqa_scorer": True,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_disable_mqa_scorer": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
|
||||
@ -57,9 +57,7 @@ PRECISION = "bfloat16"
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -101,16 +99,12 @@ def test_mtp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs": True,
|
||||
},
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_logprobs_during_spec_decoding": True,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -125,19 +119,18 @@ def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
batch_size: int, output_len: int, seed: int,
|
||||
logprobs: int):
|
||||
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
output_len,
|
||||
seed,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -159,9 +152,7 @@ def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -207,9 +198,7 @@ def test_mtp_e2e_greedy_correctness_cuda_graph(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
@ -254,9 +243,7 @@ def test_mtp_e2e_greedy_correctness_with_preemption(
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_config": {
|
||||
"num_speculative_tokens": k,
|
||||
},
|
||||
"num_speculative_tokens": k,
|
||||
}
|
||||
# Try a range of num. speculative tokens
|
||||
for k in range(1, 1 + MAX_SPEC_TOKENS)
|
||||
@ -299,12 +286,11 @@ def test_mtp_different_k(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"disable_by_batch_size": 4
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||
"speculative_disable_by_batch_size": 4
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
|
||||
@ -61,19 +61,15 @@ from .conftest import (get_output_from_llm_generator,
|
||||
"per_test_common_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
# Chunked prefill enabled with small value
|
||||
# to make sure we get mixed batches.
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
@ -152,23 +148,20 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
"enable_chunked_prefill": False,
|
||||
}, {
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4,
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": False,
|
||||
"disable_logprobs_during_spec_decoding": False
|
||||
}, {
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 3,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4,
|
||||
"disable_logprobs_during_spec_decoding": False
|
||||
}])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
[
|
||||
@ -191,7 +184,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
|
||||
whether all speculative tokens are accepted.
|
||||
"""
|
||||
ensure_all_accepted = per_test_common_llm_kwargs.get(
|
||||
"model_name") == test_llm_kwargs.get("speculative_config")["model"]
|
||||
"model_name") == test_llm_kwargs.get("speculative_model")
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
@ -231,17 +224,13 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
@ -294,17 +283,13 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
@ -351,17 +336,13 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
@ -410,17 +391,13 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
@ -472,17 +449,13 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
@ -541,17 +514,13 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
@ -598,25 +567,21 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
|
||||
# Artificially limit the draft model max model len; this forces vLLM
|
||||
# to skip speculation once the sequences grow beyond 32-k tokens.
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"max_model_len": 32,
|
||||
},
|
||||
"speculative_max_model_len": 32,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"max_model_len": 32,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4,
|
||||
"speculative_max_model_len": 32,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [8])
|
||||
@ -662,19 +627,15 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"disable_by_batch_size": 2,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"speculative_disable_by_batch_size": 2,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"disable_by_batch_size": 2,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": 5,
|
||||
"speculative_disable_by_batch_size": 2,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4,
|
||||
@ -715,19 +676,15 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": k,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": k,
|
||||
"enable_chunked_prefill": False,
|
||||
}
|
||||
# Try a range of common k, as well as large speculation.
|
||||
for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63]
|
||||
] + [{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": k,
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": k,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4,
|
||||
@ -772,21 +729,17 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": k,
|
||||
"acceptance_method": "typical_acceptance_sampler",
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": k,
|
||||
"spec_decoding_acceptance_method": "typical_acceptance_sampler",
|
||||
"enable_chunked_prefill": False
|
||||
}
|
||||
# Try a range of common k.
|
||||
for k in [1, 2, 3]
|
||||
] + [{
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": k,
|
||||
"acceptance_method": "typical_acceptance_sampler",
|
||||
},
|
||||
"speculative_model": "JackFram/llama-68m",
|
||||
"num_speculative_tokens": k,
|
||||
"spec_decoding_acceptance_method": "typical_acceptance_sampler",
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
|
||||
@ -48,20 +48,16 @@ from .conftest import run_equality_correctness_test
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
"disable_mqa_scorer": False,
|
||||
},
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
"speculative_disable_mqa_scorer": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
"disable_mqa_scorer": True,
|
||||
},
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
"speculative_disable_mqa_scorer": True,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -105,20 +101,16 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
"disable_logprobs": False,
|
||||
},
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
"disable_logprobs_during_spec_decoding": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
"disable_logprobs": True,
|
||||
},
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
"disable_logprobs_during_spec_decoding": True,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("output_len", [
|
||||
@ -133,20 +125,19 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
batch_size: int, output_len: int, seed: int,
|
||||
logprobs: int):
|
||||
"""Verify greedy equality on a tiny model with different batch size."""
|
||||
run_equality_correctness_test(
|
||||
vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
max_output_len=output_len,
|
||||
seed=seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs["speculative_config"]
|
||||
["disable_logprobs"])
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs,
|
||||
test_llm_kwargs,
|
||||
batch_size,
|
||||
max_output_len=output_len,
|
||||
seed=seed,
|
||||
temperature=0.0,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs=logprobs,
|
||||
disable_logprobs=test_llm_kwargs[
|
||||
'disable_logprobs_during_spec_decoding'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -168,21 +159,17 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
},
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
"enable_chunked_prefill": False,
|
||||
},
|
||||
{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
"disable_mqa_scorer": True,
|
||||
},
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
"enable_chunked_prefill": True,
|
||||
"speculative_disable_mqa_scorer": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
},
|
||||
@ -227,21 +214,17 @@ def test_ngram_e2e_greedy_correctness_with_preemption(
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": k,
|
||||
"prompt_lookup_max": 3,
|
||||
},
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": k,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
}
|
||||
# Try a range of common k, as well as large speculation.
|
||||
for k in [1, 3, 5]
|
||||
] + [
|
||||
{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": k,
|
||||
"prompt_lookup_max": 1,
|
||||
},
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": k,
|
||||
"ngram_prompt_lookup_max": 1,
|
||||
}
|
||||
# Try a range of common k, as well as large speculation.
|
||||
for k in [1, 3, 5]
|
||||
@ -260,7 +243,7 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs,
|
||||
seed: int):
|
||||
"""Verify that ngram speculative decoding produces exact equality
|
||||
to without spec decode with many different values of k and
|
||||
different ngram prompt_lookup_max.
|
||||
different ngram_prompt_lookup_max.
|
||||
"""
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
@ -283,25 +266,22 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
"disable_by_batch_size": 4
|
||||
},
|
||||
}, {
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
"disable_by_batch_size": 4,
|
||||
"disable_mqa_scorer": True,
|
||||
},
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
"speculative_disable_by_batch_size": 4
|
||||
}, {
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
"speculative_disable_by_batch_size": 4,
|
||||
"enable_chunked_prefill": True,
|
||||
"speculative_disable_mqa_scorer": True,
|
||||
"max_num_batched_tokens": 4,
|
||||
"max_num_seqs": 4
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
@ -316,7 +296,7 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
|
||||
seed: int):
|
||||
"""Verify that ngram speculative decoding produces exact equality
|
||||
to without spec decode with many different values of k and
|
||||
different ngram prompt_lookup_max.
|
||||
different ngram_prompt_lookup_max.
|
||||
"""
|
||||
run_equality_correctness_test(vllm_runner,
|
||||
common_llm_kwargs,
|
||||
@ -336,17 +316,18 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
|
||||
|
||||
# Skip cuda graph recording for fast test.
|
||||
"enforce_eager": True,
|
||||
|
||||
# Required for spec decode.
|
||||
"speculative_model": "[ngram]",
|
||||
"num_speculative_tokens": 5,
|
||||
"ngram_prompt_lookup_max": 3,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"speculative_config": {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": 5,
|
||||
"prompt_lookup_max": 3,
|
||||
"disable_mqa_scorer": True,
|
||||
},
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs",
|
||||
[{
|
||||
"speculative_disable_mqa_scorer": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 5])
|
||||
@pytest.mark.parametrize(
|
||||
"output_len",
|
||||
|
||||
@ -19,11 +19,11 @@ SPEC_MODEL = "JackFram/llama-160m"
|
||||
# Skip cuda graph recording for fast test.
|
||||
"enforce_eager": True,
|
||||
|
||||
# speculative config
|
||||
"speculative_config": {
|
||||
"model": "JackFram/llama-160m",
|
||||
"num_speculative_tokens": 3,
|
||||
},
|
||||
# speculative model
|
||||
"speculative_model": "JackFram/llama-160m",
|
||||
|
||||
# num speculative tokens
|
||||
"num_speculative_tokens": 3,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
|
||||
|
||||
@ -41,10 +41,10 @@ async def test_tokenizer_group(tokenizer_group_type):
|
||||
max_input_length=None,
|
||||
)
|
||||
assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
|
||||
prompt="prompt", lora_request=None)
|
||||
request_id="request_id", prompt="prompt", lora_request=None)
|
||||
assert reference_tokenizer.encode(
|
||||
"prompt") == await tokenizer_group.encode_async(prompt="prompt",
|
||||
lora_request=None)
|
||||
"prompt") == await tokenizer_group.encode_async(
|
||||
request_id="request_id", prompt="prompt", lora_request=None)
|
||||
assert isinstance(tokenizer_group.get_lora_tokenizer(None),
|
||||
PreTrainedTokenizerBase)
|
||||
assert tokenizer_group.get_lora_tokenizer(
|
||||
@ -69,7 +69,8 @@ async def test_tokenizer_group_pool(tokenizer_group_type):
|
||||
# and check that all requests are processed correctly.
|
||||
num_requests = tokenizer_group_pool.pool_size * 5
|
||||
requests = [
|
||||
tokenizer_group_pool.encode_async(prompt=f"prompt {i}",
|
||||
tokenizer_group_pool.encode_async(request_id=str(i),
|
||||
prompt=f"prompt {i}",
|
||||
lora_request=None)
|
||||
for i in range(num_requests)
|
||||
]
|
||||
@ -160,8 +161,12 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
|
||||
fail_at[0] = 1000
|
||||
|
||||
# We should recover successfully.
|
||||
await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None)
|
||||
await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None)
|
||||
await tokenizer_group_pool.encode_async(request_id="1",
|
||||
prompt="prompt",
|
||||
lora_request=None)
|
||||
await tokenizer_group_pool.encode_async(request_id="1",
|
||||
prompt="prompt",
|
||||
lora_request=None)
|
||||
|
||||
# Check that we have a new actor
|
||||
assert len(tokenizer_group_pool.tokenizer_actors) == len(tokenizer_actors)
|
||||
@ -179,7 +184,8 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
|
||||
|
||||
# We should fail after re-initialization.
|
||||
with pytest.raises(RuntimeError):
|
||||
await tokenizer_group_pool.encode_async(prompt="prompt",
|
||||
await tokenizer_group_pool.encode_async(request_id="1",
|
||||
prompt="prompt",
|
||||
lora_request=None)
|
||||
|
||||
# check_health should raise the same thing
|
||||
@ -200,8 +206,11 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
|
||||
|
||||
# Prompt too long error
|
||||
with pytest.raises(ValueError):
|
||||
await tokenizer_group_pool.encode_async(prompt="prompt" * 100,
|
||||
await tokenizer_group_pool.encode_async(request_id="1",
|
||||
prompt="prompt" * 100,
|
||||
lora_request=None)
|
||||
await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None)
|
||||
await tokenizer_group_pool.encode_async(request_id="1",
|
||||
prompt="prompt",
|
||||
lora_request=None)
|
||||
# Actors should stay the same.
|
||||
assert tokenizer_group_pool.tokenizer_actors == tokenizer_actors
|
||||
|
||||
@ -39,10 +39,7 @@ def ensure_system_prompt(messages: list[dict[str, Any]],
|
||||
|
||||
# universal args for all models go here. also good if you need to test locally
|
||||
# and change type or KV cache quantization or something.
|
||||
ARGS: list[str] = [
|
||||
"--enable-auto-tool-choice", "--max-model-len", "1024", "--max-num-seqs",
|
||||
"256"
|
||||
]
|
||||
ARGS: list[str] = ["--enable-auto-tool-choice", "--max-model-len", "1024"]
|
||||
|
||||
CONFIGS: dict[str, ServerConfig] = {
|
||||
"hermes": {
|
||||
|
||||
@ -5,96 +5,92 @@ import os
|
||||
import tempfile
|
||||
|
||||
import depyf
|
||||
import pytest
|
||||
|
||||
from vllm.config import CompilationLevel
|
||||
|
||||
temp_dir = tempfile.mkdtemp()
|
||||
with depyf.prepare_debug(temp_dir):
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
@pytest.mark.skip(reason="Not working; needs investigation.")
|
||||
def test_tpu_compilation():
|
||||
temp_dir = tempfile.mkdtemp()
|
||||
with depyf.prepare_debug(temp_dir):
|
||||
from vllm import LLM, SamplingParams
|
||||
prompts = [
|
||||
"A robot may not injure a human being",
|
||||
"It is only with the heart that one can see rightly;",
|
||||
"The greatest glory in living lies not in never falling,",
|
||||
]
|
||||
answers = [
|
||||
" or, through inaction, allow a human being to come to harm.",
|
||||
" what is essential is invisible to the eye.",
|
||||
" but in rising every time we fall.",
|
||||
]
|
||||
N = 1
|
||||
# Currently, top-p sampling is disabled. `top_p` should be 1.0.
|
||||
sampling_params = SamplingParams(temperature=0.7,
|
||||
top_p=1.0,
|
||||
n=N,
|
||||
max_tokens=16)
|
||||
|
||||
prompts = [
|
||||
"A robot may not injure a human being",
|
||||
"It is only with the heart that one can see rightly;",
|
||||
"The greatest glory in living lies not in never falling,",
|
||||
]
|
||||
answers = [
|
||||
" or, through inaction, allow a human being to come to harm.",
|
||||
" what is essential is invisible to the eye.",
|
||||
" but in rising every time we fall.",
|
||||
]
|
||||
N = 1
|
||||
# Currently, top-p sampling is disabled. `top_p` should be 1.0.
|
||||
sampling_params = SamplingParams(temperature=0.7,
|
||||
top_p=1.0,
|
||||
n=N,
|
||||
max_tokens=16)
|
||||
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
|
||||
# In real workloads, `enforce_eager` should be `False`.
|
||||
|
||||
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
|
||||
# In real workloads, `enforce_eager` should be `False`.
|
||||
# disable custom dispatcher, let Dynamo take over
|
||||
# all the control
|
||||
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
max_model_len=512,
|
||||
max_num_seqs=64,
|
||||
enforce_eager=True,
|
||||
compilation_config={"level": CompilationLevel.DYNAMO_AS_IS})
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output, answer in zip(outputs, answers):
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
assert generated_text.startswith(answer)
|
||||
|
||||
# disable custom dispatcher, let Dynamo take over
|
||||
# all the control
|
||||
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
max_model_len=512,
|
||||
max_num_seqs=64,
|
||||
enforce_eager=True,
|
||||
compilation_config={"level": CompilationLevel.DYNAMO_AS_IS})
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output, answer in zip(outputs, answers):
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
assert generated_text.startswith(answer)
|
||||
compiled_codes = sorted(
|
||||
glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
|
||||
|
||||
compiled_codes = sorted(
|
||||
glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
|
||||
for i, compiled_code in enumerate(compiled_codes):
|
||||
print("{} file: {}".format(i + 1, compiled_code))
|
||||
|
||||
for i, compiled_code in enumerate(compiled_codes):
|
||||
print("{} file: {}".format(i + 1, compiled_code))
|
||||
# We should only trigger Dynamo compilation 4 times:
|
||||
# 1. forward pass (symbolic)
|
||||
# 2. compute_logits (symbolic)
|
||||
# 3. forward pass (shape 16)
|
||||
# 4. forward pass (shape 32)
|
||||
# and later calls should not trigger Dynamo compilation again.
|
||||
# NOTE: It might still trigger XLA compilation.
|
||||
|
||||
# We should only trigger Dynamo compilation 4 times:
|
||||
# 1. forward pass (symbolic)
|
||||
# 2. compute_logits (symbolic)
|
||||
# 3. forward pass (shape 16)
|
||||
# 4. forward pass (shape 32)
|
||||
# and later calls should not trigger Dynamo compilation again.
|
||||
# NOTE: It might still trigger XLA compilation.
|
||||
# Check we have 4 compiled codes
|
||||
assert len(compiled_codes) == 4
|
||||
|
||||
# Check we have 4 compiled codes
|
||||
assert len(compiled_codes) == 4
|
||||
kv_cache_prefix = "kv_cache"
|
||||
attn_prefix = "ragged_paged_attention"
|
||||
|
||||
kv_cache_prefix = "kv_cache"
|
||||
attn_prefix = "ragged_paged_attention"
|
||||
# Check all the compilations are as expected
|
||||
compiled_fns = sorted(
|
||||
glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
|
||||
|
||||
# Check all the compilations are as expected
|
||||
compiled_fns = sorted(
|
||||
glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
|
||||
for i, compiled_fn in enumerate(compiled_fns):
|
||||
print("{} file: {}".format(i + 1, compiled_fn))
|
||||
|
||||
for i, compiled_fn in enumerate(compiled_fns):
|
||||
print("{} file: {}".format(i + 1, compiled_fn))
|
||||
# The first compilation is symbolic, so it should not have any kv_caches
|
||||
with open(compiled_fns[0]) as f:
|
||||
content = f.read()
|
||||
assert kv_cache_prefix not in content
|
||||
|
||||
# The first compilation is symbolic, so it should not have any kv_caches
|
||||
with open(compiled_fns[0]) as f:
|
||||
content = f.read()
|
||||
assert kv_cache_prefix not in content
|
||||
# The second compilation is symbolic, so it should not have any kv_caches
|
||||
with open(compiled_fns[1]) as f:
|
||||
content = f.read()
|
||||
assert kv_cache_prefix not in content
|
||||
|
||||
# The second compilation is symbolic, so it should not have any kv_caches
|
||||
with open(compiled_fns[1]) as f:
|
||||
content = f.read()
|
||||
assert kv_cache_prefix not in content
|
||||
# The third compilation is shape 16, so it should have kv_caches and the
|
||||
# ragged_paged_attention
|
||||
with open(compiled_fns[2]) as f:
|
||||
content = f.read()
|
||||
assert (kv_cache_prefix in content and attn_prefix in content)
|
||||
|
||||
# The third compilation is shape 16, so it should have kv_caches and the
|
||||
# ragged_paged_attention
|
||||
with open(compiled_fns[2]) as f:
|
||||
content = f.read()
|
||||
assert (kv_cache_prefix in content and attn_prefix in content)
|
||||
|
||||
# The fourth compilation is shape 32, so it should have kv_caches and the
|
||||
# ragged_paged_attention
|
||||
with open(compiled_fns[3]) as f:
|
||||
content = f.read()
|
||||
assert (kv_cache_prefix in content and attn_prefix in content)
|
||||
# The fourth compilation is shape 32, so it should have kv_caches and the
|
||||
# ragged_paged_attention
|
||||
with open(compiled_fns[3]) as f:
|
||||
content = f.read()
|
||||
assert (kv_cache_prefix in content and attn_prefix in content)
|
||||
|
||||
@ -786,7 +786,7 @@ def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
|
||||
without enough resources, or called when filtering tests to run directly.
|
||||
"""
|
||||
try:
|
||||
if current_platform.is_cpu():
|
||||
if current_platform.is_cpu() or current_platform.is_openvino():
|
||||
memory_gb = 0
|
||||
else:
|
||||
memory_gb = current_platform.get_device_total_memory() / GB_bytes
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.multimodal.inputs import MultiModalKwargs
|
||||
from vllm.sampling_params import SamplingParams
|
||||
@ -9,10 +8,7 @@ from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
|
||||
KVCacheBlock, PrefixCachingMetrics,
|
||||
generate_block_hash_extra_keys,
|
||||
hash_block_tokens,
|
||||
hash_request_tokens,
|
||||
unify_kv_cache_configs)
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheGroupSpec, KVCacheTensor)
|
||||
hash_request_tokens)
|
||||
from vllm.v1.metrics.stats import PrefixCacheStats
|
||||
from vllm.v1.request import Request
|
||||
|
||||
@ -318,107 +314,3 @@ def test_metrics():
|
||||
assert metrics.aggregated_query_total == 0
|
||||
assert metrics.aggregated_query_hit == 0
|
||||
assert not metrics.query_queue
|
||||
|
||||
|
||||
def test_unify_kv_cache_configs():
|
||||
|
||||
def new_kv_cache_spec(block_size=16,
|
||||
num_kv_heads=2,
|
||||
head_size=64,
|
||||
dtype=torch.float32,
|
||||
use_mla=False):
|
||||
return FullAttentionSpec(block_size=block_size,
|
||||
num_kv_heads=num_kv_heads,
|
||||
head_size=head_size,
|
||||
dtype=dtype,
|
||||
use_mla=use_mla)
|
||||
|
||||
same_kv_cache_config = [
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
tensors={
|
||||
"layer1": KVCacheTensor(100),
|
||||
"layer2": KVCacheTensor(100),
|
||||
},
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=4)),
|
||||
],
|
||||
),
|
||||
KVCacheConfig(
|
||||
num_blocks=20,
|
||||
tensors={
|
||||
"layer1": KVCacheTensor(100),
|
||||
"layer2": KVCacheTensor(100),
|
||||
},
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=4)),
|
||||
],
|
||||
),
|
||||
]
|
||||
unify_kv_cache_configs(same_kv_cache_config)
|
||||
assert same_kv_cache_config[0].num_blocks == 10
|
||||
assert same_kv_cache_config[1].num_blocks == 10
|
||||
|
||||
need_sort_kv_cache_config = [
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
tensors={
|
||||
"layer1": KVCacheTensor(100),
|
||||
"layer2": KVCacheTensor(100),
|
||||
},
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=4)),
|
||||
],
|
||||
),
|
||||
KVCacheConfig(
|
||||
num_blocks=20,
|
||||
tensors={
|
||||
"layer1": KVCacheTensor(100),
|
||||
"layer2": KVCacheTensor(100),
|
||||
},
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=4)),
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
unify_kv_cache_configs(need_sort_kv_cache_config)
|
||||
assert need_sort_kv_cache_config[0].num_blocks == 10
|
||||
assert need_sort_kv_cache_config[1].num_blocks == 10
|
||||
|
||||
diff_kv_cache_config = [
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
tensors={
|
||||
"layer1": KVCacheTensor(100),
|
||||
"layer2": KVCacheTensor(100),
|
||||
},
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=4)),
|
||||
],
|
||||
),
|
||||
KVCacheConfig(
|
||||
num_blocks=20,
|
||||
tensors={
|
||||
"layer1": KVCacheTensor(100),
|
||||
"layer2": KVCacheTensor(100),
|
||||
},
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=8)),
|
||||
],
|
||||
),
|
||||
]
|
||||
with pytest.raises(AssertionError):
|
||||
unify_kv_cache_configs(diff_kv_cache_config)
|
||||
|
||||
@ -70,16 +70,12 @@ def test_ngram_correctness(
|
||||
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
|
||||
del ref_llm
|
||||
|
||||
spec_llm = LLM(
|
||||
model=model_name,
|
||||
speculative_config={
|
||||
"method": "ngram",
|
||||
"prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 3,
|
||||
"num_speculative_tokens": 3,
|
||||
},
|
||||
max_model_len=1024,
|
||||
)
|
||||
spec_llm = LLM(model=model_name,
|
||||
speculative_model='[ngram]',
|
||||
ngram_prompt_lookup_max=5,
|
||||
ngram_prompt_lookup_min=3,
|
||||
num_speculative_tokens=3,
|
||||
max_model_len=1024)
|
||||
spec_outputs = spec_llm.chat(test_prompts, sampling_config)
|
||||
matches = 0
|
||||
misses = 0
|
||||
|
||||
@ -11,13 +11,11 @@ from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST,
|
||||
STOP_STRINGS,
|
||||
DummyOutputProcessorTestVectors,
|
||||
MockEngineCore)
|
||||
from vllm.outputs import CompletionOutput, RequestOutput
|
||||
from vllm.sampling_params import RequestOutputKind, SamplingParams
|
||||
from vllm.sequence import PromptLogprobs, SampleLogprobs
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.output_processor import (OutputProcessor,
|
||||
RequestOutputCollector)
|
||||
from vllm.v1.engine.output_processor import OutputProcessor
|
||||
from vllm.v1.metrics.stats import IterationStats
|
||||
|
||||
|
||||
@ -836,88 +834,3 @@ def test_iteration_stats(dummy_test_vectors):
|
||||
|
||||
assert iteration_stats.num_prompt_tokens == 0
|
||||
assert iteration_stats.num_generation_tokens == num_active
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_request_output_collector():
|
||||
NUM_REQS = 3
|
||||
TEXT = "a"
|
||||
|
||||
def make_outputs() -> list[RequestOutput]:
|
||||
return [
|
||||
RequestOutput(
|
||||
request_id="my-request-id",
|
||||
prompt=None,
|
||||
prompt_token_ids=[1, 2, 3],
|
||||
prompt_logprobs=None,
|
||||
outputs=[
|
||||
CompletionOutput(
|
||||
index=0,
|
||||
text=TEXT,
|
||||
token_ids=[idx],
|
||||
cumulative_logprob=(idx + 1 * 1.0),
|
||||
logprobs=[{
|
||||
"a": idx,
|
||||
"b": idx
|
||||
}],
|
||||
finish_reason="length" if
|
||||
(idx == NUM_REQS - 1) else None,
|
||||
)
|
||||
],
|
||||
finished=(idx == NUM_REQS - 1),
|
||||
) for idx in range(NUM_REQS)
|
||||
]
|
||||
|
||||
collector = RequestOutputCollector(RequestOutputKind.DELTA)
|
||||
|
||||
# CASE 1: Put then get.
|
||||
outputs = make_outputs()
|
||||
collector.put(outputs[0])
|
||||
output = await collector.get()
|
||||
assert not collector.ready.is_set()
|
||||
assert collector.output is None
|
||||
assert output.outputs[0].text == "a"
|
||||
assert output.outputs[0].token_ids == [0]
|
||||
|
||||
# CASE 2: 2 puts then get.
|
||||
num_to_put = 2
|
||||
outputs = make_outputs()
|
||||
for i in range(num_to_put):
|
||||
collector.put(outputs[i])
|
||||
output = await collector.get()
|
||||
assert not collector.ready.is_set()
|
||||
assert collector.output is None
|
||||
|
||||
assert not output.finished
|
||||
# Text, token_ids, and logprobs should get merged.
|
||||
assert output.outputs[0].text == TEXT * num_to_put
|
||||
for tok_0, tok_1 in zip(output.outputs[0].token_ids,
|
||||
list(range(num_to_put))):
|
||||
assert tok_0 == tok_1
|
||||
assert len(output.outputs[0].logprobs) == num_to_put
|
||||
|
||||
# Cumulative logprobs should be the last one.
|
||||
cumulative_logprob_expected = 1.0 * num_to_put
|
||||
assert output.outputs[0].cumulative_logprob == cumulative_logprob_expected
|
||||
|
||||
# CASE 3: Put all 3 (including a finished).
|
||||
num_to_put = 3
|
||||
outputs = make_outputs()
|
||||
for i in range(num_to_put):
|
||||
collector.put(outputs[i])
|
||||
output = await collector.get()
|
||||
assert not collector.ready.is_set()
|
||||
assert collector.output is None
|
||||
|
||||
assert output.finished
|
||||
assert output.outputs[0].finish_reason == "length"
|
||||
# Text, token_ids, and logprobs should get merged.
|
||||
assert output.outputs[0].text == TEXT * num_to_put
|
||||
for tok_0, tok_1 in zip(output.outputs[0].token_ids,
|
||||
list(range(num_to_put))):
|
||||
assert tok_0 == tok_1
|
||||
assert len(output.outputs[0].logprobs) == num_to_put
|
||||
|
||||
# Cumulative logprobs should be the last one.
|
||||
cumulative_logprob_expected = 1.0 * num_to_put
|
||||
assert output.outputs[0].cumulative_logprob == cumulative_logprob_expected
|
||||
|
||||
@ -13,7 +13,7 @@ from vllm.entrypoints.llm import LLM
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
|
||||
GUIDED_DECODING_BACKENDS_V1 = ["xgrammar", "guidance"]
|
||||
GUIDED_DECODING_BACKENDS_V1 = ["xgrammar"]
|
||||
MODELS_TO_TEST = [
|
||||
"Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410"
|
||||
]
|
||||
@ -30,13 +30,12 @@ def test_guided_json_completion(
|
||||
model_name: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
|
||||
llm = LLM(model=model_name, max_model_len=1024)
|
||||
sampling_params = SamplingParams(temperature=1.0,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
json=sample_json_schema,
|
||||
backend=guided_decoding_backend))
|
||||
outputs = llm.generate(prompts=[
|
||||
f"Give an example JSON for an employee profile "
|
||||
f"that fits this schema: {sample_json_schema}"
|
||||
@ -58,50 +57,6 @@ def test_guided_json_completion(
|
||||
jsonschema.validate(instance=output_json, schema=sample_json_schema)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
GUIDED_DECODING_BACKENDS_V1)
|
||||
@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
|
||||
def test_guided_json_completion_disable_any_whitespace(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
sample_json_schema: dict[str, Any],
|
||||
guided_decoding_backend: str,
|
||||
model_name: str,
|
||||
):
|
||||
if guided_decoding_backend != "xgrammar":
|
||||
pytest.skip("disable-any-whitespace is only supported for xgrammar.")
|
||||
guided_decoding_backend = 'xgrammar:disable-any-whitespace'
|
||||
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
|
||||
outputs = llm.generate(prompts=[
|
||||
f"Give an example JSON for an employee profile "
|
||||
f"that fits this schema: {sample_json_schema}"
|
||||
] * 2,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True)
|
||||
|
||||
assert outputs is not None
|
||||
|
||||
for output in outputs:
|
||||
assert output is not None
|
||||
assert isinstance(output, RequestOutput)
|
||||
prompt = output.prompt
|
||||
|
||||
generated_text = output.outputs[0].text
|
||||
assert generated_text is not None
|
||||
assert "\n" not in generated_text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
output_json = json.loads(generated_text)
|
||||
jsonschema.validate(instance=output_json, schema=sample_json_schema)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
GUIDED_DECODING_BACKENDS_V1)
|
||||
@ -112,14 +67,13 @@ def test_guided_json_object(
|
||||
model_name: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=100,
|
||||
n=2,
|
||||
guided_decoding=GuidedDecodingParams(json_object=True))
|
||||
llm = LLM(model=model_name, max_model_len=1024)
|
||||
sampling_params = SamplingParams(temperature=1.0,
|
||||
max_tokens=100,
|
||||
n=2,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
json_object=True,
|
||||
backend=guided_decoding_backend))
|
||||
|
||||
outputs = llm.generate(
|
||||
prompts=("Generate a JSON object with curly braces for a person with "
|
||||
@ -139,20 +93,12 @@ def test_guided_json_object(
|
||||
|
||||
# Parse to verify it is valid JSON
|
||||
parsed_json = json.loads(generated_text)
|
||||
allowed_types: tuple[type, ...] = (dict, )
|
||||
if guided_decoding_backend == "xgrammar":
|
||||
# TODO - we are currently too permissive with xgrammar and
|
||||
# allow any valid json (typically comes back as a list or
|
||||
# object). We can fix this by specifying a jsonschema of
|
||||
# {"type": "object"}, but we need this fix in a release
|
||||
# first: https://github.com/mlc-ai/xgrammar/pull/264
|
||||
allowed_types = (dict, list)
|
||||
assert isinstance(parsed_json, allowed_types)
|
||||
assert isinstance(parsed_json, dict)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
GUIDED_DECODING_BACKENDS_V1 + ["auto"])
|
||||
GUIDED_DECODING_BACKENDS_V1)
|
||||
@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
|
||||
def test_guided_json_unsupported_schema(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
@ -161,43 +107,21 @@ def test_guided_json_unsupported_schema(
|
||||
model_name: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))
|
||||
if guided_decoding_backend == "xgrammar":
|
||||
with pytest.raises(ValueError,
|
||||
match="The provided JSON schema contains features "
|
||||
"not supported by xgrammar."):
|
||||
llm.generate(prompts=[
|
||||
f"Give an example JSON for an employee profile "
|
||||
f"that fits this schema: {unsupported_json_schema}"
|
||||
] * 2,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True)
|
||||
else:
|
||||
# This should work for both "guidance" and "auto".
|
||||
|
||||
outputs = llm.generate(
|
||||
prompts=("Give an example JSON object for a grade "
|
||||
"that fits this schema: "
|
||||
f"{unsupported_json_schema}"),
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True)
|
||||
assert outputs is not None
|
||||
for output in outputs:
|
||||
assert output is not None
|
||||
assert isinstance(output, RequestOutput)
|
||||
generated_text = output.outputs[0].text
|
||||
assert generated_text is not None
|
||||
print(generated_text)
|
||||
|
||||
# Parse to verify it is valid JSON
|
||||
parsed_json = json.loads(generated_text)
|
||||
assert isinstance(parsed_json, dict)
|
||||
llm = LLM(model=model_name, max_model_len=1024)
|
||||
sampling_params = SamplingParams(temperature=1.0,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
json=unsupported_json_schema,
|
||||
backend=guided_decoding_backend))
|
||||
with pytest.raises(ValueError,
|
||||
match="The provided JSON schema contains features "
|
||||
"not supported by xgrammar."):
|
||||
llm.generate(prompts=[
|
||||
f"Give an example JSON for an employee profile "
|
||||
f"that fits this schema: {unsupported_json_schema}"
|
||||
] * 2,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@ -211,14 +135,13 @@ def test_guided_grammar_ebnf(
|
||||
model_name: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(grammar=sample_sql_ebnf))
|
||||
llm = LLM(model=model_name, max_model_len=1024)
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
grammar=sample_sql_ebnf,
|
||||
backend=guided_decoding_backend))
|
||||
outputs = llm.generate(
|
||||
prompts=("Generate a sql statement that selects col_1 from "
|
||||
"table_1 where it is equal to 1"),
|
||||
@ -255,14 +178,13 @@ def test_guided_grammar_lark(
|
||||
model_name: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(grammar=sample_sql_lark))
|
||||
llm = LLM(model=model_name, max_model_len=1024)
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
grammar=sample_sql_lark,
|
||||
backend=guided_decoding_backend))
|
||||
outputs = llm.generate(
|
||||
prompts=("Generate a sql statement that selects col_1 from "
|
||||
"table_1 where it is equal to 1"),
|
||||
@ -303,15 +225,16 @@ def test_guided_grammar_ebnf_invalid(
|
||||
model_name: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(grammar="not a grammar"))
|
||||
with pytest.raises(ValueError, match="Failed to convert the grammar "):
|
||||
llm = LLM(model=model_name, max_model_len=1024)
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
grammar="not a grammar",
|
||||
backend=guided_decoding_backend))
|
||||
with pytest.raises(ValueError,
|
||||
match="Failed to convert the grammar "
|
||||
"from Lark to EBNF."):
|
||||
llm.generate(
|
||||
prompts=("Generate a sql statement that selects col_1 from "
|
||||
"table_1 where it is equal to 1"),
|
||||
@ -331,13 +254,12 @@ def test_guided_regex(
|
||||
model_name: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(regex=sample_regex))
|
||||
llm = LLM(model=model_name, max_model_len=1024)
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
regex=sample_regex,
|
||||
backend=guided_decoding_backend))
|
||||
outputs = llm.generate(
|
||||
prompts=[
|
||||
f"Give an example IPv4 address with this regex: {sample_regex}"
|
||||
@ -369,17 +291,17 @@ def test_guided_choice_completion(
|
||||
model_name: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
|
||||
llm = LLM(model=model_name, max_model_len=1024)
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
choice=sample_guided_choice,
|
||||
backend=guided_decoding_backend))
|
||||
outputs = llm.generate(
|
||||
prompts="The best language for type-safe systems programming is ",
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True)
|
||||
|
||||
assert outputs is not None
|
||||
for output in outputs:
|
||||
assert output is not None
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user