Compare commits

..

4 Commits

Author SHA1 Message Date
4c42267293 updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-03-28 02:26:20 +00:00
24f68342b4 updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-03-28 02:17:42 +00:00
c5d963835b updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-03-28 01:54:01 +00:00
b313220727 updates
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-27 23:51:36 +00:00
317 changed files with 3645 additions and 13496 deletions

View File

@ -63,12 +63,10 @@
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"disable_log_requests": "",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": {
"model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"draft_tensor_parallel_size": 1
}
"swap_space": 16,
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"speculative_draft_tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",

View File

@ -3,7 +3,7 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/upload-wheels.sh"
@ -14,7 +14,7 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/upload-wheels.sh"
@ -31,7 +31,7 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/upload-wheels.sh"
@ -48,7 +48,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build and publish TPU release image"
@ -57,7 +57,7 @@ steps:
agents:
queue: tpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly"
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
plugins:
@ -82,7 +82,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"

View File

@ -10,5 +10,5 @@ trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
docker build -t cpu-test -f docker/Dockerfile.ppc64le .
docker build -t cpu-test -f Dockerfile.ppc64le .

View File

@ -8,19 +8,15 @@ set -ex
CORE_RANGE=${CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
# Setup cleanup
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
}
remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
@ -40,6 +36,8 @@ function cpu_tests() {
# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pip install -r vllm/requirements/test.txt
pip install -r vllm/requirements/cpu.txt
pytest -v -s tests/kernels/test_cache.py -m cpu_model
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
pytest -v -s tests/models/decoder_only/language -m cpu_model

View File

@ -9,7 +9,6 @@ python3 use_existing_torch.py
# Try building the docker image
DOCKER_BUILDKIT=1 docker build . \
--file docker/Dockerfile \
--target vllm-openai \
--platform "linux/arm64" \
-t gh200-test \

View File

@ -5,7 +5,7 @@
set -ex
# Try building the docker image
docker build -t hpu-test-env -f docker/Dockerfile.hpu .
docker build -t hpu-test-env -f Dockerfile.hpu .
# Setup cleanup
# certain versions of HPU software stack have a bug that can

View File

@ -35,7 +35,7 @@ else
date "+%s" > /tmp/neuron-docker-build-timestamp
fi
docker build -t "${image_name}" -f docker/Dockerfile.neuron .
docker build -t "${image_name}" -f Dockerfile.neuron .
# Setup cleanup
remove_docker_container() {

View File

@ -3,7 +3,7 @@
set -e
# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
@ -21,8 +21,6 @@ docker run --privileged --net host --shm-size=16G -it \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& export VLLM_USE_V1=1 \
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo TEST_0 \
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
&& echo TEST_1 \
&& pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
&& echo TEST_2 \
@ -34,10 +32,11 @@ docker run --privileged --net host --shm-size=16G -it \
&& echo TEST_5 \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py \
&& echo TEST_6 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
&& pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py \
&& echo TEST_7 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \
# TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Try building the docker image
docker build -t ${image_name} -f docker/Dockerfile.xpu .
docker build -t ${image_name} -f Dockerfile.xpu .
# Setup cleanup
remove_docker_container() {

View File

@ -150,8 +150,8 @@ steps:
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- pushd ../examples/offline_inference
- python3 rlhf.py
- RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py
- VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
- label: Metrics, Tracing Test # 10min
@ -431,7 +431,6 @@ steps:
- pytest -v -s models/encoder_decoder/audio_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model
- pytest -v -s models/decoder_only/vision_language/test_interleaved.py
- label: Multi-Modal Models Test (Extended) 1 # 48m
optional: true
@ -521,7 +520,7 @@ steps:
- vllm/v1/engine/
commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'

2
.github/mergify.yml vendored
View File

@ -19,7 +19,7 @@ pull_request_rules:
- files~=\.buildkite/
- files~=^cmake/
- files=CMakeLists.txt
- files~=^docker/Dockerfile
- files~=^Dockerfile
- files~=^requirements.*\.txt
- files=setup.py
actions:

View File

@ -50,7 +50,7 @@ jobs:
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
- name: Build the Docker image vllm cpu
run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
- name: Configuration of docker images, network and namespace for the kind cluster
run: |

View File

@ -1,6 +1,3 @@
default_install_hook_types:
- pre-commit
- commit-msg
default_stages:
- pre-commit # Run locally
- manual # Run in CI

View File

@ -34,7 +34,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
#
# Supported/expected torch versions for CUDA/ROCm.
@ -44,7 +44,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
#
# Note: the CUDA torch version is derived from pyproject.toml and various
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
@ -234,7 +234,6 @@ set(VLLM_EXT_SRC
"csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu"
"csrc/layernorm_quant_kernels.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
@ -242,7 +241,6 @@ set(VLLM_EXT_SRC
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/custom_all_reduce.cu"
"csrc/torch_bindings.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA")
@ -284,6 +282,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"

69
Dockerfile.cpu Normal file
View File

@ -0,0 +1,69 @@
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
FROM ubuntu:22.04 AS cpu-test-1
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-openmp==2025.0.1
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install intel_extension_for_pytorch==2.6.0
WORKDIR /workspace
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
pip install --upgrade pip && \
pip install -r requirements/build.txt
FROM cpu-test-1 AS build
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
pip install -v -r requirements/cpu.txt
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl && \
rm -rf dist
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -e tests/vllm_test_utils
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -1,18 +1,18 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
ARG HIPBLASLT_BRANCH="db8e93b4"
ARG HIPBLASLT_BRANCH="4d40e36"
ARG HIPBLAS_COMMON_BRANCH="7c1566b"
ARG LEGACY_HIPBLASLT_OPTION=
ARG RCCL_BRANCH="648a58d"
ARG RCCL_REPO="https://github.com/ROCm/rccl"
ARG TRITON_BRANCH="e5be006"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG PYTORCH_BRANCH="295f2ed4"
ARG PYTORCH_VISION_BRANCH="v0.21.0"
ARG PYTORCH_BRANCH="3a585126"
ARG PYTORCH_VISION_BRANCH="v0.19.1"
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="1a7f4dfa"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="8970b25b"
ARG FA_BRANCH="b7d29fb"
ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
ARG AITER_BRANCH="21d47a9"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base
@ -20,7 +20,7 @@ FROM ${BASE_IMAGE} AS base
ENV PATH=/opt/rocm/llvm/bin:$PATH
ENV ROCM_PATH=/opt/rocm
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ARG PYTHON_VERSION=3.12
@ -31,7 +31,7 @@ ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
RUN apt-get update -y \
&& apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \
&& apt-get install -y software-properties-common git curl sudo vim less \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
@ -42,7 +42,7 @@ RUN apt-get update -y \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython
RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
FROM base AS build_hipblaslt
ARG HIPBLASLT_BRANCH
@ -60,8 +60,7 @@ RUN cd hipBLAS-common \
RUN git clone https://github.com/ROCm/hipBLASLt
RUN cd hipBLASLt \
&& git checkout ${HIPBLASLT_BRANCH} \
&& apt-get install -y llvm-dev \
&& ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
&& ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
&& cd build/release \
&& make package
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
@ -111,24 +110,11 @@ RUN git clone ${FA_REPO}
RUN cd flash-attention \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
&& MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install \
&& cp /app/flash-attention/dist/*.whl /app/install
FROM base AS build_aiter
ARG AITER_BRANCH
ARG AITER_REPO
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
RUN git clone --recursive ${AITER_REPO}
RUN cd aiter \
&& git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
FROM base AS final
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
dpkg -i /install/*deb \
@ -144,12 +130,19 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
pip install /install/*.whl
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
pip install /install/*.whl
ARG AITER_REPO
ARG AITER_BRANCH
RUN git clone --recursive ${AITER_REPO}
RUN cd aiter \
&& git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt \
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter
ARG BASE_IMAGE
ARG HIPBLAS_COMMON_BRANCH
ARG HIPBLASLT_BRANCH
ARG HIPBLAS_COMMON_BRANCH
ARG LEGACY_HIPBLASLT_OPTION
ARG RCCL_BRANCH
ARG RCCL_REPO
@ -161,8 +154,6 @@ ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG FA_BRANCH
ARG FA_REPO
ARG AITER_BRANCH
ARG AITER_REPO
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
&& echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
@ -176,5 +167,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt

View File

@ -15,12 +15,14 @@ Easy, fast, and cheap LLM serving for everyone
---
[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
---
*Latest News* 🔥
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.

View File

@ -41,33 +41,29 @@ become available.
<td><code>synthetic</code></td>
</tr>
<tr>
<td><strong>HuggingFace-VisionArena</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>lmarena-ai/VisionArena-Chat</code></td>
<td><strong>HuggingFace</strong></td>
<td style="text-align: center;">🟡</td>
<td style="text-align: center;">🟡</td>
<td>Specify your dataset path on HuggingFace</td>
</tr>
<tr>
<td><strong>HuggingFace-InstructCoder</strong></td>
<td><strong>VisionArena</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>likaixin/InstructCoder</code></td>
</tr>
<tr>
<td><strong>HuggingFace-Other</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
<td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
</tr>
</tbody>
</table>
✅: supported
🟡: Partial support
🚧: to be supported
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
If you need support for other dataset formats, please consider contributing.
**Note**: VisionArenas `dataset-name` should be set to `hf`
---
## Example - Online Benchmark
@ -75,7 +71,8 @@ become available.
First start serving your model
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
vllm serve ${MODEL_NAME} --disable-log-requests
```
Then run the benchmarking script
@ -83,13 +80,12 @@ Then run the benchmarking script
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_serving.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--num-prompts 10
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10
BACKEND="vllm"
DATASET_NAME="sharegpt"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
```
If successful, you will see the following output
@ -126,76 +122,88 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
```
```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--hf-split train \
--num-prompts 1000
--backend "${BACKEND}" \
--model "${MODEL_NAME}" \
--endpoint "/v1/chat/completions" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--hf-split "${DATASET_SPLIT}" \
--num-prompts "${NUM_PROMPTS}"
```
### InstructCoder Benchmark with Speculative Decoding
### HuggingFaceDataset Examples
``` bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
--speculative-model "[ngram]" \
--ngram_prompt_lookup_min 2 \
--ngram-prompt-lookup-max 5 \
--num_speculative_tokens 5
```
``` bash
python3 benchmarks/benchmark_serving.py \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name hf \
--dataset-path likaixin/InstructCoder \
--num-prompts 2048
```
### Other HuggingFaceDataset Examples
Currently, HuggingFaceDataset only supports dataset formats
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
formats, please consider contributing.
```bash
# need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
```
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
DATASET_SPLIT='train'
DATASET_SUBSET='chart2text(cauldron)'
python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
--backend "${BACKEND}" \
--model "${MODEL_NAME}" \
--endpoint "/v1/chat/completions" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--hf-split "${DATASET_SPLIT}" \
--num-prompts "${NUM_PROMPTS}" \
--hf-subset "${DATASET_SUBSET}"
```
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
--backend "${BACKEND}" \
--model "${MODEL_NAME}" \
--endpoint "/v1/chat/completions" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--hf-split "${DATASET_SPLIT}" \
--num-prompts "${NUM_PROMPTS}" \
```
---
## Example - Offline Throughput Benchmark
```bash
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10
DATASET_NAME="sonnet"
DATASET_PATH="vllm/benchmarks/sonnet.txt"
python3 vllm/benchmarks/benchmark_throughput.py \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset-name sonnet \
--dataset-path vllm/benchmarks/sonnet.txt \
--num-prompts 10
--model "${MODEL_NAME}" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--num-prompts "${NUM_PROMPTS}"
```
If successful, you will see the following output
@ -209,13 +217,19 @@ Total num output tokens: 1500
### VisionArena Benchmark for Vision Language Models
``` bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT="train"
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--num-prompts 1000 \
--hf-split train
--model "${MODEL_NAME}" \
--backend "vllm-chat" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--num-prompts "${NUM_PROMPTS}" \
--hf-split "${DATASET_SPLIT}"
```
The `num prompt tokens` now includes image token counts
@ -226,71 +240,29 @@ Total num prompt tokens: 14527
Total num output tokens: 1280
```
### InstructCoder Benchmark with Speculative Decoding
``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \
python3 vllm/benchmarks/benchmark_throughput.py \
--dataset-name=hf \
--dataset-path=likaixin/InstructCoder \
--model=meta-llama/Meta-Llama-3-8B-Instruct \
--input-len=1000 \
--output-len=100 \
--num-prompts=2048 \
--async-engine \
--speculative-model="[ngram]" \
--ngram_prompt_lookup_min=2 \
--ngram-prompt-lookup-max=5 \
--num_speculative_tokens=5
```
```
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
Total num prompt tokens: 261136
Total num output tokens: 204800
```
### Other HuggingFaceDataset Examples
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
```
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
```
### Benchmark with LoRA Adapters
``` bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="meta-llama/Llama-2-7b-hf"
BACKEND="vllm"
DATASET_NAME="sharegpt"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
NUM_PROMPTS=10
MAX_LORAS=2
MAX_LORA_RANK=8
ENABLE_LORA="--enable-lora"
LORA_PATH="yard1/llama-2-7b-sql-lora-test"
python3 vllm/benchmarks/benchmark_throughput.py \
--model meta-llama/Llama-2-7b-hf \
--backend vllm \
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--dataset_name sharegpt \
--num-prompts 10 \
--max-loras 2 \
--max-lora-rank 8 \
--enable-lora \
--lora-path yard1/llama-2-7b-sql-lora-test
--model "${MODEL_NAME}" \
--backend "${BACKEND}" \
--dataset_path "${DATASET_PATH}" \
--dataset_name "${DATASET_NAME}" \
--num-prompts "${NUM_PROMPTS}" \
--max-loras "${MAX_LORAS}" \
--max-lora-rank "${MAX_LORA_RANK}" \
${ENABLE_LORA} \
--lora-path "${LORA_PATH}"
```

View File

@ -23,8 +23,7 @@ from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from functools import cache
from io import BytesIO
from typing import Any, Callable, Optional, Union
from typing import Any, Optional, Union
import numpy as np
import pandas as pd
@ -240,24 +239,21 @@ def process_image(image: Any) -> Mapping[str, Any]:
"""
Process a single image input and return a multimedia content dictionary.
Supports three input types:
For a PIL.Image.Image input:
- Converts the image to RGB.
- Saves the image as a JPEG in-memory.
- Encodes the JPEG data as a base64 string.
- Returns a dictionary with the image as a base64 data URL.
1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
containing raw image data. - Loads the bytes as a PIL.Image.Image.
2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
a dictionary with the image as a base64 data URL.
3. String input: - Treats the string as a URL or local file path. -
Prepends "file://" if the string doesn't start with "http://" or
"file://". - Returns a dictionary with the image URL.
For a string input:
- Treats the string as a URL or file path.
- Prepends "file://" if the string doesn't start with "http://" or
"file://".
- Returns a dictionary with the image URL.
Raises:
ValueError: If the input is not a supported type.
ValueError: If the input is neither a PIL.Image.Image nor a string.
"""
if isinstance(image, dict) and 'bytes' in image:
image = Image.open(BytesIO(image['bytes']))
if isinstance(image, Image.Image):
image = image.convert("RGB")
with io.BytesIO() as image_data:
@ -276,8 +272,8 @@ def process_image(image: Any) -> Mapping[str, Any]:
("http://", "file://")) else f"file://{image}")
return {"type": "image_url", "image_url": {"url": image_url}}
raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
" or str or dictionary with raw image bytes.")
raise ValueError(
f"Invalid image input {image}. Must be a PIL.Image.Image or str.")
# -----------------------------------------------------------------------------
@ -566,56 +562,48 @@ class BurstGPTDataset(BenchmarkDataset):
# -----------------------------------------------------------------------------
# HuggingFace Dataset Base Implementation
# HuggingFace Dataset Implementation
# -----------------------------------------------------------------------------
class HuggingFaceDataset(BenchmarkDataset):
"""Base class for datasets hosted on HuggingFace."""
SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
class HuggingFaceDataset(BenchmarkDataset):
"""
Dataset class for processing a HuggingFace dataset with conversation data
and optional images.
"""
def __init__(
self,
dataset_path: str,
dataset_split: str,
dataset_subset: Optional[str] = None,
**kwargs,
) -> None:
super().__init__(dataset_path=dataset_path, **kwargs)
# Validate dataset path
if self.SUPPORTED_DATASET_PATHS and \
self.dataset_path not in self.SUPPORTED_DATASET_PATHS:
raise ValueError(
f"{self.__class__.__name__} "
f"only supports: {', '.join(self.SUPPORTED_DATASET_PATHS)}. "
"Please consider contributing if you would "
"like to add support for additional dataset formats.")
super().__init__(**kwargs)
self.dataset_split = dataset_split
self.dataset_subset = dataset_subset
self.load_data()
def load_data(self) -> None:
"""Load data from HuggingFace datasets."""
if not self.dataset_path:
raise ValueError("dataset_path must be provided for loading data.")
self.data = load_dataset(
self.dataset_path,
name=self.dataset_subset,
split=self.dataset_split,
streaming=True,
)
self.data = self.data.shuffle(seed=self.random_seed)
# -----------------------------------------------------------------------------
# Conversation Dataset Implementation
# -----------------------------------------------------------------------------
class ConversationDataset(HuggingFaceDataset):
"""Dataset for conversation data with multimodal support."""
SUPPORTED_DATASET_PATHS = {
'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
}
if self.data.features is None or "conversations" \
not in self.data.features:
raise ValueError(
"HuggingFaceDataset currently only supports datasets with "
"a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
"Please consider contributing if you would like to add "
"support for additional dataset formats.")
# Shuffle and filter examples with at least 2 conversations.
self.data = self.data.shuffle(seed=self.random_seed).filter(
lambda x: len(x["conversations"]) >= 2)
def sample(self,
tokenizer: PreTrainedTokenizerBase,
@ -623,13 +611,10 @@ class ConversationDataset(HuggingFaceDataset):
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
# Filter examples with at least 2 conversations
filtered_data = self.data.filter(
lambda x: len(x["conversations"]) >= 2)
sampled_requests = []
dynamic_output = output_len is None
for item in filtered_data:
for item in self.data:
if len(sampled_requests) >= num_requests:
break
conv = item["conversations"]
@ -674,12 +659,29 @@ class VisionArenaDataset(HuggingFaceDataset):
"""
DEFAULT_OUTPUT_LEN = 128
SUPPORTED_DATASET_PATHS = {
"lmarena-ai/VisionArena-Chat":
lambda x: x["conversation"][0][0]["content"],
"lmarena-ai/vision-arena-bench-v0.1":
lambda x: x["turns"][0][0]["content"]
}
VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1"
def __init__(
self,
**kwargs,
) -> None:
super().__init__(**kwargs)
if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
raise ValueError(f"Only support Vision Arena dataset.\
This data path {self.dataset_path} is not valid.")
if self.dataset_subset is None and self.dataset_split != "train":
raise ValueError("Dataset split must be 'train'.")
self.load_data()
def load_data(self) -> None:
dataset = load_dataset(
self.dataset_path,
name=self.dataset_subset,
split=self.dataset_split,
streaming=True,
)
self.data = dataset.shuffle(seed=self.random_seed)
def sample(
self,
@ -695,11 +697,7 @@ class VisionArenaDataset(HuggingFaceDataset):
for item in self.data:
if len(sampled_requests) >= num_requests:
break
parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
if parser_fn is None:
raise ValueError(
f"Unsupported dataset path: {self.dataset_path}")
prompt = parser_fn(item)
prompt = item["turns"][0][0]["content"]
mm_content = process_image(item["images"][0])
prompt_len = len(tokenizer(prompt).input_ids)
if enable_multimodal_chat:
@ -717,47 +715,3 @@ class VisionArenaDataset(HuggingFaceDataset):
))
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
# -----------------------------------------------------------------------------
# Instruct Coder Dataset Implementation
# -----------------------------------------------------------------------------
class InstructCoderDataset(HuggingFaceDataset):
"""
InstructCoder Dataset.
https://huggingface.co/datasets/likaixin/InstructCoder
InstructCoder is the dataset designed for general code editing. It consists
of 114,239 instruction-input-output triplets, and covers multiple distinct
code editing scenario.
"""
DEFAULT_OUTPUT_LEN = 200 # this is the average default output length
SUPPORTED_DATASET_PATHS = {
"likaixin/InstructCoder",
}
def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
output_len = (output_len
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
sampled_requests = []
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt = f"{item['instruction']}:\n{item['input']}"
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
))
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests

View File

@ -7,6 +7,9 @@ On the server side, run one of the following commands:
--swap-space 16 \
--disable-log-requests
(TGI backend)
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
On the client side, run:
python benchmarks/benchmark_serving.py \
--backend <backend> \
@ -49,10 +52,9 @@ try:
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
from benchmark_dataset import (BurstGPTDataset, ConversationDataset,
InstructCoderDataset, RandomDataset,
SampleRequest, ShareGPTDataset, SonnetDataset,
VisionArenaDataset)
from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
RandomDataset, SampleRequest, ShareGPTDataset,
SonnetDataset, VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@ -584,17 +586,11 @@ def main(args: argparse.Namespace):
return_prompt_formatted=True)
elif args.dataset_name == "hf":
# all following datasets are implemented from the
# HuggingFaceDataset base class
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
dataset_class = VisionArenaDataset
args.hf_split = "train"
args.hf_subset = None
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
dataset_class = InstructCoderDataset
args.hf_split = "train"
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
dataset_class = ConversationDataset
# Choose between VisionArenaDataset
# and HuggingFaceDataset based on provided parameters.
dataset_class = (VisionArenaDataset if args.dataset_path
== VisionArenaDataset.VISION_ARENA_DATASET_PATH
and args.hf_subset is None else HuggingFaceDataset)
input_requests = dataset_class(
dataset_path=args.dataset_path,
dataset_subset=args.hf_subset,

View File

@ -5,6 +5,9 @@ On the server side, run one of the following commands:
(vLLM OpenAI API server)
vllm serve <your_model> --disable-log-requests
(TGI backend)
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
On the client side, run:
python benchmarks/benchmark_serving_structured_output.py \
--backend <backend> \

View File

@ -11,10 +11,9 @@ from typing import Any, Optional, Union
import torch
import uvloop
from benchmark_dataset import (BurstGPTDataset, ConversationDataset,
InstructCoderDataset, RandomDataset,
SampleRequest, ShareGPTDataset, SonnetDataset,
VisionArenaDataset)
from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
RandomDataset, SampleRequest, ShareGPTDataset,
SonnetDataset, VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
@ -301,7 +300,6 @@ def get_requests(args, tokenizer):
"input_len": args.input_len,
"output_len": args.output_len,
}
if args.dataset_path is None or args.dataset_name == "random":
sample_kwargs["range_ratio"] = args.random_range_ratio
sample_kwargs["prefix_len"] = args.prefix_len
@ -319,19 +317,17 @@ def get_requests(args, tokenizer):
elif args.dataset_name == "burstgpt":
dataset_cls = BurstGPTDataset
elif args.dataset_name == "hf":
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = VisionArenaDataset
common_kwargs['dataset_subset'] = None
common_kwargs['dataset_split'] = "train"
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = InstructCoderDataset
common_kwargs['dataset_split'] = "train"
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = ConversationDataset
common_kwargs['dataset_subset'] = args.hf_subset
common_kwargs['dataset_split'] = args.hf_split
sample_kwargs["enable_multimodal_chat"] = True
if args.backend != "vllm-chat":
raise ValueError(
"hf datasets only are supported by vllm-chat backend")
# Choose between VisionArenaDataset and HuggingFaceDataset based on
# provided parameters.
dataset_cls = (VisionArenaDataset if args.dataset_path
== VisionArenaDataset.VISION_ARENA_DATASET_PATH
and args.hf_subset is None else HuggingFaceDataset)
common_kwargs['dataset_subset'] = args.hf_subset
common_kwargs['dataset_split'] = args.hf_split
sample_kwargs["enable_multimodal_chat"] = True
else:
raise ValueError(f"Unknown dataset name: {args.dataset_name}")
@ -466,16 +462,9 @@ def validate_args(args):
warnings.warn("--hf-subset and --hf-split will be ignored \
since --dataset-name is not 'hf'.",
stacklevel=2)
elif args.dataset_name == "hf":
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
assert args.backend == "vllm-chat", "VisionArenaDataset needs to use vllm-chat as the backend." #noqa: E501
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
assert args.backend == "vllm", "InstructCoder dataset needs to use vllm as the backend." #noqa: E501
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
assert args.backend == "vllm-chat", "ConversationDataset needs to use vllm-chat as the backend." #noqa: E501
else:
raise ValueError(
f"{args.dataset_path} is not supported by hf dataset.")
elif args.dataset_name == "hf" and args.backend != "vllm-chat":
raise ValueError(
"When --dataset-name is 'hf', backend must be 'vllm-chat'")
# --random-range-ratio: only used when dataset_name is 'random'
if args.dataset_name != 'random' and args.random_range_ratio is not None:

View File

@ -30,18 +30,19 @@ class BenchmarkConfig(TypedDict):
num_stages: int
def benchmark_config(config: BenchmarkConfig,
num_tokens: int,
num_experts: int,
shard_intermediate_size: int,
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
num_iters: int = 100,
block_quant_shape: List[int] = None,
use_deep_gemm: bool = False) -> float:
def benchmark_config(
config: BenchmarkConfig,
num_tokens: int,
num_experts: int,
shard_intermediate_size: int,
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
num_iters: int = 100,
block_quant_shape: List[int] = None,
) -> float:
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
if use_int8_w8a16:
@ -114,41 +115,22 @@ def benchmark_config(config: BenchmarkConfig,
def run():
from vllm.model_executor.layers.fused_moe import override_config
with override_config(config):
if use_deep_gemm:
topk_weights, topk_ids = fused_topk(x, input_gating, topk,
False)
return fused_experts(
x,
w1,
w2,
topk_weights,
topk_ids,
inplace=True,
use_fp8_w8a8=use_fp8_w8a8,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
allow_deep_gemm=True,
)
else:
fused_moe(
x,
w1,
w2,
input_gating,
topk,
renormalize=True,
inplace=True,
use_fp8_w8a8=use_fp8_w8a8,
use_int8_w8a16=use_int8_w8a16,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
)
fused_moe(
x,
w1,
w2,
input_gating,
topk,
renormalize=True,
inplace=True,
use_fp8_w8a8=use_fp8_w8a8,
use_int8_w8a16=use_int8_w8a16,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
)
# JIT compilation & warmup
run()
@ -384,7 +366,6 @@ class BenchmarkWorker:
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
block_quant_shape: List[int] = None,
use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed)
dtype_str = get_config_dtype_str(dtype,
@ -415,8 +396,7 @@ class BenchmarkWorker:
use_fp8_w8a8,
use_int8_w8a16,
num_iters=100,
block_quant_shape=block_quant_shape,
use_deep_gemm=use_deep_gemm)
block_quant_shape=block_quant_shape)
return config, kernel_time
def tune(
@ -431,7 +411,6 @@ class BenchmarkWorker:
use_int8_w8a16: bool,
search_space: list[dict[str, int]],
block_quant_shape: list[int],
use_deep_gemm: bool,
) -> dict[str, int]:
best_config = None
best_time = float("inf")
@ -457,8 +436,7 @@ class BenchmarkWorker:
use_fp8_w8a8,
use_int8_w8a16,
num_iters=20,
block_quant_shape=block_quant_shape,
use_deep_gemm=use_deep_gemm)
block_quant_shape=block_quant_shape)
except triton.runtime.autotuner.OutOfResources:
# Some configurations may be invalid and fail to compile.
continue
@ -572,8 +550,6 @@ def main(args: argparse.Namespace):
else:
batch_sizes = [args.batch_size]
use_deep_gemm = bool(args.use_deep_gemm)
ray.init()
num_gpus = int(ray.available_resources()["GPU"])
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
@ -596,10 +572,10 @@ def main(args: argparse.Namespace):
start = time.time()
configs = _distribute(
"tune", [(batch_size, E, shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space,
block_quant_shape, use_deep_gemm)
for batch_size in batch_sizes])
"tune",
[(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape)
for batch_size in batch_sizes])
best_configs = {
M: sort_config(config)
for M, config in zip(batch_sizes, configs)
@ -613,7 +589,7 @@ def main(args: argparse.Namespace):
outputs = _distribute(
"benchmark",
[(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
use_fp8_w8a8, use_int8_w8a16, block_quant_shape, use_deep_gemm)
use_fp8_w8a8, use_int8_w8a16, block_quant_shape)
for batch_size in batch_sizes])
for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
@ -635,7 +611,6 @@ if __name__ == "__main__":
type=str,
choices=["auto", "fp8_w8a8", "int8_w8a16"],
default="auto")
parser.add_argument("--use-deep-gemm", action="store_true")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--tune", action="store_true")

16
benchmarks/launch_tgi_server.sh Executable file
View File

@ -0,0 +1,16 @@
#!/bin/bash
PORT=8000
MODEL=$1
TOKENS=$2
docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \
-v "$PWD/data:/data" \
ghcr.io/huggingface/text-generation-inference:2.2.0 \
--model-id "$MODEL" \
--sharded false \
--max-input-length 1024 \
--max-total-tokens 2048 \
--max-best-of 5 \
--max-concurrent-requests 5000 \
--max-batch-total-tokens "$TOKENS"

View File

@ -482,28 +482,16 @@ def get_pip_packages(run_lambda, patterns=None):
if patterns is None:
patterns = DEFAULT_PIP_PATTERNS
def run_with_pip():
try:
import importlib.util
pip_spec = importlib.util.find_spec('pip')
pip_available = pip_spec is not None
except ImportError:
pip_available = False
if pip_available:
cmd = [sys.executable, '-mpip', 'list', '--format=freeze']
elif os.environ.get("UV") is not None:
print("uv is set")
cmd = ["uv", "pip", "list", "--format=freeze"]
else:
raise RuntimeError("Could not collect pip list output (pip or uv module not available)")
out = run_and_read_all(run_lambda, cmd)
# People generally have `pip` as `pip` or `pip3`
# But here it is invoked as `python -mpip`
def run_with_pip(pip):
out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
return "\n".join(line for line in out.splitlines()
if any(name in line for name in patterns))
pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
out = run_with_pip()
out = run_with_pip([sys.executable, '-mpip'])
return pip_version, out

View File

@ -1,39 +0,0 @@
#include <torch/all.h>
#include <torch/cuda.h>
#include <cuda_runtime.h>
// This function assumes that `cpu_tensor` is a CPU tensor allocated with pinned
// memory, and that UVA (Unified Virtual Addressing) is enabled.
torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) {
TORCH_CHECK(cpu_tensor.device().is_cpu(), "Input tensor must be on CPU");
// Get raw host pointer from CPU tensor
void* host_ptr = cpu_tensor.data_ptr();
// Get a device pointer corresponding to the pinned host memory
void* device_ptr = nullptr;
cudaError_t err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0);
TORCH_CHECK(err == cudaSuccess,
"cudaHostGetDevicePointer failed: ", cudaGetErrorString(err));
// We'll use the same sizes, strides, and dtype as the CPU tensor.
// TODO: check if layout is respected.
auto sizes = cpu_tensor.sizes();
auto strides = cpu_tensor.strides();
auto options = cpu_tensor.options().device(torch::kCUDA);
// from_blob signature: from_blob(void *data, IntArrayRef sizes, ..., Deleter,
// const TensorOptions &) Provide a no-op deleter. The CPU tensor holds the
// memory, so we don't free it here.
auto deleter = [](void*) {
// no-op, since the memory is owned by the original CPU tensor
};
torch::Tensor cuda_tensor =
torch::from_blob(device_ptr, sizes, strides, deleter, options);
TORCH_CHECK(cuda_tensor.device().is_cuda(),
"Resulting tensor is not on CUDA device");
return cuda_tensor;
}

View File

@ -12,7 +12,7 @@ static_assert(sizeof(void*) == sizeof(fptr_t));
fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
torch::Tensor& rank_data, int64_t rank,
bool fully_connected) {
bool full_nvlink) {
int world_size = fake_ipc_ptrs.size();
if (world_size > 8)
throw std::invalid_argument("world size > 8 is not supported");
@ -27,7 +27,7 @@ fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
}
return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(),
rank_data.numel(), rank, world_size,
fully_connected);
full_nvlink);
}
/**
@ -142,48 +142,3 @@ void register_graph_buffers(fptr_t _fa,
bytes.reserve(handles.size());
fa->register_graph_buffers(bytes, offsets);
}
std::tuple<fptr_t, torch::Tensor> allocate_shared_buffer_and_handle(
int64_t size) {
auto device_index = c10::cuda::current_device();
at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index));
void* buffer;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
auto stream = c10::cuda::getCurrentCUDAStream().stream();
AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));
// Allocate buffer
#if defined(USE_ROCM)
// data buffers need to be "uncached" for signal on MI200
AT_CUDA_CHECK(
hipExtMallocWithFlags((void**)&buffer, size, hipDeviceMallocUncached));
#else
AT_CUDA_CHECK(cudaMalloc((void**)&buffer, size));
#endif
AT_CUDA_CHECK(cudaMemsetAsync(buffer, 0, size, stream));
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));
// Create IPC memhandle for the allocated buffer.
// Will use it in open_mem_handle.
auto options =
torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
auto handle =
torch::empty({static_cast<int64_t>(sizeof(cudaIpcMemHandle_t))}, options);
AT_CUDA_CHECK(
cudaIpcGetMemHandle((cudaIpcMemHandle_t*)handle.data_ptr(), buffer));
return std::make_tuple(reinterpret_cast<fptr_t>(buffer), handle);
}
fptr_t open_mem_handle(torch::Tensor& mem_handle) {
void* ipc_ptr;
AT_CUDA_CHECK(cudaIpcOpenMemHandle(
(void**)&ipc_ptr, *((const cudaIpcMemHandle_t*)mem_handle.data_ptr()),
cudaIpcMemLazyEnablePeerAccess));
return reinterpret_cast<fptr_t>(ipc_ptr);
}
void free_shared_buffer(fptr_t buffer) {
AT_CUDA_CHECK(cudaFree(reinterpret_cast<void*>(buffer)));
}

View File

@ -5,10 +5,6 @@
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#if defined(USE_ROCM)
typedef __hip_bfloat16 nv_bfloat16;
#endif
#include <iostream>
#include <array>
#include <limits>
@ -16,7 +12,6 @@ typedef __hip_bfloat16 nv_bfloat16;
#include <unordered_map>
#include <vector>
namespace vllm {
#define CUDACHECK(cmd) \
do { \
cudaError_t e = cmd; \
@ -27,37 +22,24 @@ namespace vllm {
} \
} while (0)
// Maximal number of blocks in allreduce kernel.
namespace vllm {
constexpr int kMaxBlocks = 36;
// Default number of blocks in allreduce kernel.
#ifndef USE_ROCM
const int defaultBlockLimit = 36;
CUpointer_attribute rangeStartAddrAttr = CU_POINTER_ATTRIBUTE_RANGE_START_ADDR;
#else
const int defaultBlockLimit = 16;
hipPointer_attribute rangeStartAddrAttr =
HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR;
#endif
// Counter may overflow, but it's fine since unsigned int overflow is
// well-defined behavior.
using FlagType = uint32_t;
// Two sets of peer counters are needed for two syncs: starting and ending an
// operation. The reason is that it's possible for peer GPU block to arrive at
// the second sync point while the current GPU block haven't passed the first
// sync point. Thus, peer GPU may write counter+1 while current GPU is busy
// waiting for counter. We use alternating counter array to avoid this
// possibility.
struct Signal {
alignas(128) FlagType start[kMaxBlocks][8];
alignas(128) FlagType end[kMaxBlocks][8];
alignas(128) FlagType _flag[kMaxBlocks]; // incremental flags for each rank
alignas(128) FlagType self_counter[kMaxBlocks][8];
// Two sets of peer counters are needed for two syncs. The reason is that
// it's possible for peer GPU block to arrive at the second sync point while
// the current GPU block haven't passed the first sync point. Thus, peer GPU
// may write counter+1 while current GPU is busy waiting for counter. We use
// alternating counter array to avoid this possibility.
alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
};
struct __align__(16) RankData {
const void* ptrs[8];
const void* __restrict__ ptrs[8];
};
struct __align__(16) RankSignals {
@ -152,29 +134,27 @@ DINLINE O downcast(array_t<float, O::size> val) {
}
}
#if !defined(USE_ROCM)
static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag),
"l"(flag_addr));
#else
#else
asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag),
"l"(flag_addr));
#endif
#endif
}
static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) {
FlagType flag;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
asm volatile("ld.acquire.sys.global.u32 %0, [%1];"
: "=r"(flag)
: "l"(flag_addr));
#else
#else
asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;"
: "=r"(flag)
: "l"(flag_addr));
#endif
#endif
return flag;
}
@ -190,99 +170,37 @@ static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) {
return flag;
}
// This function is meant to be used as the first synchronization in the all
// reduce kernel. Thus, it doesn't need to make any visibility guarantees for
// prior memory accesses. Note: volatile writes will not be reordered against
// other volatile writes.
template <int ngpus>
DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg,
int rank) {
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
// is_start: whether this is the very first synchronization barrier.
// need_fence: whether a memory fence is needed. If true, a release-acquire
// semantic is used to enforce memory access order before and after this
// barrier.
template <int ngpus, bool is_start, bool need_fence = false>
DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg,
int rank) {
if constexpr (!is_start) __syncthreads();
static_assert(
!(is_start && need_fence)); // Start barrier shouldn't need fence.
if (threadIdx.x < ngpus) {
auto peer_counter_ptr = &sg.signals[threadIdx.x]->start[blockIdx.x][rank];
auto self_counter_ptr = &self_sg->start[blockIdx.x][threadIdx.x];
// Write the expected counter value to peer and wait for correct value
// from peer.
st_flag_volatile(peer_counter_ptr, flag);
while (ld_flag_volatile(self_counter_ptr) != flag);
}
__syncthreads();
// use one thread to update flag
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
}
// This function is meant to be used as the second or the final
// synchronization barrier in the all reduce kernel. If it's the final
// synchronization barrier, we don't need to make any visibility guarantees
// for prior memory accesses.
template <int ngpus, bool final_sync = false>
DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) {
__syncthreads();
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
if (threadIdx.x < ngpus) {
auto peer_counter_ptr = &sg.signals[threadIdx.x]->end[blockIdx.x][rank];
auto self_counter_ptr = &self_sg->end[blockIdx.x][threadIdx.x];
// Increment the counter. Technically we only need one counter, but we use
// multiple per block to eliminate the need to share the counter via smem.
auto val = self_sg->self_counter[blockIdx.x][threadIdx.x] += 1;
// Write the expected counter value to peer and wait for correct value from
// peer.
if constexpr (!final_sync) {
st_flag_release(peer_counter_ptr, flag);
while (ld_flag_acquire(self_counter_ptr) != flag);
auto peer_counter_ptr =
&sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank];
auto self_counter_ptr =
&self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x];
if constexpr (need_fence) {
st_flag_release(peer_counter_ptr, val);
while (ld_flag_acquire(self_counter_ptr) != val);
} else {
st_flag_volatile(peer_counter_ptr, flag);
while (ld_flag_volatile(self_counter_ptr) != flag);
st_flag_volatile(peer_counter_ptr, val);
while (ld_flag_volatile(self_counter_ptr) != val);
}
}
if constexpr (!final_sync) __syncthreads();
// use one thread to update flag
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
if constexpr (is_start || need_fence) __syncthreads();
}
#else
template <int ngpus>
DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg,
int rank) {
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
if (threadIdx.x < ngpus) {
// simultaneously write to the corresponding flag of all ranks.
// Latency = 1 p2p write
__scoped_atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank],
flag, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
// wait until we got true from all ranks
while (__scoped_atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x],
__ATOMIC_RELAXED,
__MEMORY_SCOPE_DEVICE) < flag);
}
__syncthreads();
// use one thread to update flag
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
}
template <int ngpus, bool final_sync = false>
DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) {
__syncthreads();
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
if (threadIdx.x < ngpus) {
// simultaneously write to the corresponding flag of all ranks.
// Latency = 1 p2p write
__scoped_atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank],
flag,
final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE,
__MEMORY_SCOPE_SYSTEM);
// wait until we got true from all ranks
while (
__scoped_atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x],
final_sync ? __ATOMIC_RELAXED : __ATOMIC_ACQUIRE,
__MEMORY_SCOPE_DEVICE) < flag);
}
if constexpr (!final_sync) __syncthreads();
// use one thread to update flag
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
}
#endif
template <typename P, int ngpus, typename A>
DINLINE P packed_reduce(const P* ptrs[], int idx) {
A tmp = upcast(ptrs[0][idx]);
@ -302,13 +220,13 @@ __global__ void __launch_bounds__(512, 1)
// note: we don't reorder the address so the accumulation order is the same
// for all ranks, ensuring bitwise identical results
auto dp = *_dp;
barrier_at_start<ngpus>(sg, self_sg, rank);
multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
// do the actual reduction
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
idx += gridDim.x * blockDim.x) {
((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
}
barrier_at_end<ngpus, true>(sg, self_sg, rank);
multi_gpu_barrier<ngpus, false>(sg, self_sg, rank);
}
template <typename P>
@ -337,20 +255,18 @@ __global__ void __launch_bounds__(512, 1)
tmps[i] = get_tmp_buf<P>(sg.signals[target]);
}
auto tmp_out = tmps[0];
barrier_at_start<ngpus>(sg, self_sg, rank);
multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
// stage 1: reduce scatter
for (int idx = start + tid; idx < end; idx += stride) {
tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
}
barrier_at_end<ngpus>(sg, self_sg, rank);
multi_gpu_barrier<ngpus, false, true>(sg, self_sg, rank);
// stage 2: allgather. Note: it's important to match the tid between
// the two stages, because visibility across devices is only guaranteed
// between threads that have the same tid. If thread i computes the sum of
// start + i in the first stage, then thread i also gathers start + i from
// all ranks.
// start + i in the first stage, then thread i also gathers start + i from all
// ranks.
for (int idx = tid; idx < largest_part; idx += stride) {
#pragma unroll
for (int i = 0; i < ngpus; i++) {
@ -371,22 +287,21 @@ class CustomAllreduce {
public:
int rank_;
int world_size_;
// Full NVLink or xGMI connection between GPUs.
bool fully_connected_;
bool full_nvlink_;
RankSignals sg_;
// Stores an map from a pointer to its peer pointers from all ranks.
// Stores an map from a pointer to its peer pointters from all ranks.
std::unordered_map<void*, RankData*> buffers_;
Signal* self_sg_;
// Stores rank data from all ranks. This is mainly for cuda graph purposes.
// For cuda graph to work, all kernel arguments must be fixed during graph
// capture time. However, the peer pointers are not known during graph
// capture time. Therefore, during capture, we increment the rank data
// pointer and use that as the argument to the kernel. The kernel arguments
// are stored in graph_unreg_buffers_. The actual peer pointers will be
// filled in at the memory pointed to by the pointers in
// graph_unreg_buffers_ when the IPC handles are exchanged between ranks.
// capture time. However, the peer pointers are not known during graph capture
// time. Therefore, during capture, we increment the rank data pointer and use
// that as the argument to the kernel. The kernel arguments are stored in
// graph_unreg_buffers_. The actual peer pointers will be filled in at the
// memory pointed to by the pointers in graph_unreg_buffers_ when
// the IPC handles are exchanged between ranks.
//
// The overall process looks like this:
// 1. Graph capture.
@ -404,18 +319,17 @@ class CustomAllreduce {
* Signals are an array of ipc-enabled buffers from all ranks.
* For each of the buffer, the layout is as follows:
* | -- sizeof(Signal) -- | ------ a few MB ----- |
* The first section is for allreduce synchronization, and the second
* section is for storing the intermediate results required by some
* allreduce algos.
* The first section is for allreduce synchronization, and the second section
* is for storing the intermediate results required by some allreduce algos.
*
* Note: this class does not own any device memory. Any required buffers
* are passed in from the constructor.
*/
CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz,
int rank, int world_size, bool fully_connected = true)
int rank, int world_size, bool full_nvlink = true)
: rank_(rank),
world_size_(world_size),
fully_connected_(fully_connected),
full_nvlink_(full_nvlink),
self_sg_(signals[rank]),
d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)),
d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
@ -447,7 +361,8 @@ class CustomAllreduce {
void* base_ptr;
// note: must share the base address of each allocation, or we get wrong
// address
if (cuPointerGetAttribute(&base_ptr, rangeStartAddrAttr,
if (cuPointerGetAttribute(&base_ptr,
CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
(CUdeviceptr)ptr) != CUDA_SUCCESS)
throw std::runtime_error("failed to get pointer attr");
CUDACHECK(cudaIpcGetMemHandle(
@ -481,11 +396,11 @@ class CustomAllreduce {
// Note: when registering graph buffers, we intentionally choose to not
// deduplicate the addresses. That means if the allocator reuses some
// addresses, they will be registered again. This is to account for the
// remote possibility of different allocation patterns between ranks. For
// example, rank 1 may get the same input address for the second allreduce,
// but rank 2 got a different address. IPC handles have internal reference
// counting mechanism so overhead should be small.
// addresses, they will be registered again. This is to account for the remote
// possibility of different allocation patterns between ranks. For example,
// rank 1 may get the same input address for the second allreduce, but rank 2
// got a different address. IPC handles have internal reference counting
// mechanism so overhead should be small.
void register_graph_buffers(
const std::vector<std::string>& handles,
const std::vector<std::vector<int64_t>>& offsets) {
@ -516,15 +431,15 @@ class CustomAllreduce {
/**
* Performs allreduce, assuming input has already been registered.
*
* Block and grid default configs are results after careful grid search.
* Using 36 blocks give the best or close to the best runtime on the devices
* I tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also
* only take a small amount of SMs. Not quite sure the underlying reason,
* but my guess is that too many SMs will cause contention on NVLink bus.
* Block and grid default configs are results after careful grid search. Using
* 36 blocks give the best or close to the best runtime on the devices I
* tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only
* take a small amount of SMs. Not quite sure the underlying reason, but my
* guess is that too many SMs will cause contention on NVLink bus.
*/
template <typename T>
void allreduce(cudaStream_t stream, T* input, T* output, int size,
int threads = 512, int block_limit = defaultBlockLimit) {
int threads = 512, int block_limit = 36) {
auto d = packed_t<T>::P::size;
if (size % d != 0)
throw std::runtime_error(
@ -558,11 +473,13 @@ class CustomAllreduce {
#define KL(ngpus, name) \
name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
rank_, size);
// TODO(hanzhi713): Threshold is different for A100 and H100.
// Add per device threshold.
#define REDUCE_CASE(ngpus) \
case ngpus: { \
if (world_size_ == 2) { \
KL(ngpus, cross_device_reduce_1stage); \
} else if (fully_connected_) { \
} else if (full_nvlink_) { \
if ((world_size_ <= 4 && bytes < 512 * 1024) || \
(world_size_ <= 8 && bytes < 256 * 1024)) { \
KL(ngpus, cross_device_reduce_1stage); \
@ -580,8 +497,7 @@ class CustomAllreduce {
REDUCE_CASE(8)
default:
throw std::runtime_error(
"custom allreduce only supports num gpus in (2,4,6,8). Actual "
"num "
"custom allreduce only supports num gpus in (2,4,6,8). Actual num "
"gpus = " +
std::to_string(world_size_));
}
@ -595,11 +511,10 @@ class CustomAllreduce {
}
}
};
/**
* To inspect PTX/SASS, copy paste this header file to compiler explorer and
add a template instantiation:
* To inspect PTX/SASS, copy paste this header file to compiler explorer and add
a template instantiation:
* template void vllm::CustomAllreduce::allreduce<half>(cudaStream_t, half *,
half *, int, int, int);
*/
} // namespace vllm
} // namespace vllm

View File

@ -1,9 +1,9 @@
/**
* This is a standalone test for custom allreduce.
* To compile, make sure you have MPI and NCCL installed in your system.
* export MPI_HOME=XXX
* export MPI_HOME=xxx
* nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o
* custom_all_reduce_test -lnccl -I${MPI_HOME}/include -lmpi
* custom_all_reduce_test -lnccl -I${MPI_HOME} -lmpi
*
* Warning: this C++ test is not designed to be very readable and was used
* during the rapid prototyping process.
@ -22,15 +22,7 @@
#include "cuda_profiler_api.h"
#include "custom_all_reduce.cuh"
#include "mpi.h"
#ifdef USE_ROCM
#include <hip/hip_bf16.h>
typedef __hip_bfloat16 nv_bfloat16;
#include "rccl/rccl.h"
#include "custom_all_reduce_hip.cuh"
#else
#include "nccl.h"
#include "custom_all_reduce.cuh"
#endif
#include "nccl.h"
#define MPICHECK(cmd) \
do { \
@ -51,29 +43,16 @@ typedef __hip_bfloat16 nv_bfloat16;
} \
} while (0)
#ifdef USE_ROCM
__global__ void dummy_kernel() {
for (int i = 0; i < 100; i++) {
uint64_t start = wall_clock64();
uint64_t cycles_elapsed;
do {
cycles_elapsed = wall_clock64() - start;
} while (cycles_elapsed < 100);
}
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms
}
#else
__global__ void dummy_kernel() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms
#else
for (int i = 0; i < 100; i++) {
long long int start = clock64();
while (clock64() - start < 150000000); // approximately 98.4ms on P40
}
#endif
}
#endif
}
template <typename T>
__global__ void set_data(T* data, int size, int myRank) {
@ -142,14 +121,8 @@ void run(int myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit,
* registration, they are allocated and registered together in the test for
* convenience.
*/
#ifdef USE_ROCM
CUDACHECK(hipExtMallocWithFlags(
(void**)&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal),
hipDeviceMallocUncached));
#else
CUDACHECK(
cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
#endif
CUDACHECK(
cudaMemset(buffer, 0, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
CUDACHECK(cudaMalloc(&self_data_copy, data_size * sizeof(T)));
@ -338,18 +311,13 @@ int main(int argc, char** argv) {
bool performance_test = true;
cudaProfilerStart();
// Uncomment to scan through different block size configs.
// for (int threads : {256, 512, 1024}) {
// for (int block_limit = 16; block_limit < 112; block_limit += 4) {
// run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024,
// performance_test);
// }
// }
#ifdef USE_ROCM
const int block_limit = 16;
#else
const int block_limit = 36;
#endif
// Uncomment to scan through different block size configs.
// for (int threads : {256, 512, 1024}) {
// for (int block_limit = 16; block_limit < 112; block_limit += 4) {
// run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024,
// performance_test);
// }
// }
// Scan through different sizes to test performance.
for (int sz = 512; sz <= (8 << 20); sz *= 2) {
run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test);
@ -358,4 +326,4 @@ int main(int argc, char** argv) {
cudaProfilerStop();
MPICHECK(MPI_Finalize());
return EXIT_SUCCESS;
}
}

View File

@ -119,8 +119,6 @@ void advance_step_flashinfer(
torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);
torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor);
#ifndef USE_ROCM
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
const torch::Tensor& codebooks,
@ -267,10 +265,10 @@ void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
const std::optional<at::Tensor>& has_initial_state,
bool silu_activation, int64_t pad_slot_id);
#ifndef USE_ROCM
using fptr_t = int64_t;
fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs,
torch::Tensor& rank_data, int64_t rank,
bool fully_connected);
torch::Tensor& rank_data, int64_t rank, bool full_nvlink);
void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
fptr_t reg_buffer, int64_t reg_buffer_sz_bytes);
void dispose(fptr_t _fa);
@ -281,7 +279,4 @@ get_graph_buffer_ipc_meta(fptr_t _fa);
void register_graph_buffers(fptr_t _fa,
const std::vector<std::vector<int64_t>>& handles,
const std::vector<std::vector<int64_t>>& offsets);
std::tuple<int64_t, torch::Tensor> allocate_shared_buffer_and_handle(
int64_t size);
int64_t open_mem_handle(torch::Tensor& mem_handle);
void free_shared_buffer(int64_t buffer);
#endif

View File

@ -30,6 +30,9 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
fp8_type* __restrict__ out, float* __restrict__ scale,
scalar_t const* __restrict__ input, float const* __restrict__ scale_ub,
const int hidden_size) {
float const min_scaling_factor =
1.0f / (fp8_e4m3_adjusted_max_v<fp8_type> * 512.f);
int const tid = threadIdx.x;
int const token_idx = blockIdx.x;
@ -64,8 +67,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
token_scale = block_absmax_val_maybe;
}
// token scale computation
token_scale = max(token_scale / quant_type_max_v<fp8_type>,
min_scaling_factor<fp8_type>::val());
token_scale = max(token_scale / fp8_e4m3_adjusted_max_v<fp8_type>,
min_scaling_factor);
scale[token_idx] = token_scale;
}
__syncthreads();

View File

@ -1,12 +1,20 @@
#pragma once
#include "quantization/vectorization.cuh"
#include "quantization/utils.cuh"
#include <cmath>
#include <c10/core/ScalarType.h>
#ifdef USE_ROCM
#ifndef USE_ROCM
#include <c10/util/Float8_e4m3fn.h>
#define MAYBE_HOST_DEVICE C10_HOST_DEVICE
#else
#include <ATen/hip/HIPContext.h>
#include <c10/util/Float8_e4m3fn.h>
#include <c10/util/Float8_e4m3fnuz.h>
#include "amd/quant_utils.cuh"
// ROCm doesn't seem to need C10_HOST_DEVICE for static constexpr
#define MAYBE_HOST_DEVICE
#endif
// Determines the preferred FP8 type for the current platform.
@ -23,6 +31,29 @@ static bool is_fp8_ocp() {
#endif
}
template <typename T>
struct fp8_e4m3_adjusted_max;
template <>
struct fp8_e4m3_adjusted_max<c10::Float8_e4m3fn> {
static constexpr c10::Float8_e4m3fn val() {
return std::numeric_limits<c10::Float8_e4m3fn>::max();
}
};
// Using the default max value from pytorch (240.0 0x7F) will cause accuracy
// issues when running dynamic quantization. Here use 224.0 0x7E for rocm.
template <>
struct fp8_e4m3_adjusted_max<c10::Float8_e4m3fnuz> {
static constexpr c10::Float8_e4m3fnuz val() {
return c10::Float8_e4m3fnuz(0x7E, c10::Float8_e4m3fnuz::from_bits());
}
};
template <typename T>
MAYBE_HOST_DEVICE static constexpr T fp8_e4m3_adjusted_max_v =
fp8_e4m3_adjusted_max<T>::val();
namespace vllm {
__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
@ -45,8 +76,8 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val,
x = val / scale;
}
float r =
fmax(-quant_type_max_v<fp8_type>, fmin(x, quant_type_max_v<fp8_type>));
float r = fmax(-fp8_e4m3_adjusted_max_v<fp8_type>,
fmin(x, fp8_e4m3_adjusted_max_v<fp8_type>));
#ifndef USE_ROCM
return static_cast<fp8_type>(r);
#else
@ -92,7 +123,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale,
// Finally, since cache[0] contains the maximum for this thread block,
// atomically write the max to the target location
if (threadIdx.x == 0) {
atomicMaxFloat(scale, cache[0] / quant_type_max_v<fp8_type>);
atomicMaxFloat(scale, cache[0] / fp8_e4m3_adjusted_max_v<fp8_type>);
}
}

View File

@ -14,7 +14,8 @@ __device__ void rms_norm_dynamic_per_token_quant_vec(
float* __restrict__ scales, // [num_tokens]
scalar_t const* __restrict__ input, // [..., hidden_size]
scalar_t const* __restrict__ weight, // [hidden_size]
float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
float const* scale_ub, float const var_epsilon,
float const min_scaling_factor, int32_t const hidden_size,
scalar_t* __restrict__ residual = nullptr) {
float rms = 0.0f;
float token_scale = 0.0f;
@ -26,8 +27,8 @@ __device__ void rms_norm_dynamic_per_token_quant_vec(
// Compute scale
vllm::vectorized::compute_dynamic_per_token_scales<scalar_t, scalar_out_t,
has_residual>(
&token_scale, scales, input, weight, rms, scale_ub, hidden_size,
residual);
&token_scale, scales, input, weight, rms, scale_ub, min_scaling_factor,
hidden_size, residual);
// RMS Norm + Quant
if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
@ -49,7 +50,8 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel(
float* __restrict__ scales, // [num_tokens]
scalar_t const* __restrict__ input, // [..., hidden_size]
scalar_t const* __restrict__ weight, // [hidden_size]
float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
float const* scale_ub, float const var_epsilon,
float const min_scaling_factor, int32_t const hidden_size,
scalar_t* __restrict__ residual = nullptr) {
// For vectorization, token_input and token_output pointers need to be
// aligned at 8-byte and 4-byte addresses respectively.
@ -58,8 +60,8 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel(
if (can_vectorize) {
return rms_norm_dynamic_per_token_quant_vec<scalar_t, scalar_out_t,
has_residual>(
out, scales, input, weight, scale_ub, var_epsilon, hidden_size,
residual);
out, scales, input, weight, scale_ub, var_epsilon, min_scaling_factor,
hidden_size, residual);
}
float rms = 0.0f;
@ -70,8 +72,8 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel(
var_epsilon, residual);
// Compute Scale
vllm::compute_dynamic_per_token_scales<scalar_t, scalar_out_t, has_residual>(
&token_scale, scales, input, weight, rms, scale_ub, hidden_size,
residual);
&token_scale, scales, input, weight, rms, scale_ub, min_scaling_factor,
hidden_size, residual);
// RMS Norm + Quant
if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
@ -103,6 +105,11 @@ void rms_norm_dynamic_per_token_quant_dispatch(
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const float min_scaling_factor =
out.dtype() == torch::kInt8
? std::numeric_limits<float>::epsilon()
: 1.0f / (std::numeric_limits<c10::Float8_e4m3fn>::max() * 512.f);
if (residual.has_value()) {
VLLM_DISPATCH_QUANT_TYPES(
out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] {
@ -112,7 +119,8 @@ void rms_norm_dynamic_per_token_quant_dispatch(
out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
var_epsilon, hidden_size, residual->data_ptr<scalar_in_t>());
var_epsilon, min_scaling_factor, hidden_size,
residual->data_ptr<scalar_in_t>());
});
} else {
@ -124,7 +132,7 @@ void rms_norm_dynamic_per_token_quant_dispatch(
out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
var_epsilon, hidden_size, nullptr);
var_epsilon, min_scaling_factor, hidden_size, nullptr);
});
}
}

View File

@ -5,7 +5,6 @@
*/
#include "quantization/vectorization.cuh"
#include "quantization/utils.cuh"
#include "quant_conversions.cuh"
#ifndef USE_ROCM
@ -52,11 +51,11 @@ __device__ void compute_dynamic_per_token_scales(
float* __restrict__ token_scale, float* __restrict__ all_token_scales,
scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
float const rms, float const* __restrict__ scale_ub,
int32_t const hidden_size,
float const min_scaling_factor, int32_t const hidden_size,
scalar_t const* __restrict__ residual = nullptr) {
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
;
constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
constexpr scalar_out_t qmax{std::numeric_limits<scalar_out_t>::max()};
float block_absmax_val_maybe = 0.0f;
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
@ -84,7 +83,7 @@ __device__ void compute_dynamic_per_token_scales(
scale = block_absmax_val_maybe;
}
// token scale computation
scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
scale = max(scale / qmax, min_scaling_factor);
s_token_scale = scale; // Shared memory store
all_token_scales[blockIdx.x] = scale; // Global output store
}
@ -185,7 +184,7 @@ __device__ void compute_dynamic_per_token_scales(
float* __restrict__ token_scale, float* __restrict__ all_token_scales,
scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
float const rms, float const* __restrict__ scale_ub,
int32_t const hidden_size,
float const min_scaling_factor, int32_t const hidden_size,
scalar_t const* __restrict__ residual = nullptr) {
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
;
@ -201,7 +200,7 @@ __device__ void compute_dynamic_per_token_scales(
reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
}
constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
constexpr scalar_out_t qmax{std::numeric_limits<scalar_out_t>::max()};
int32_t const num_vec_elems = hidden_size >> 2;
float block_absmax_val_maybe = 0.0f;
@ -249,7 +248,7 @@ __device__ void compute_dynamic_per_token_scales(
scale = block_absmax_val_maybe;
}
// token scale computation
scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
scale = max(scale / qmax, min_scaling_factor);
s_token_scale = scale; // shared memory store
all_token_scales[blockIdx.x] = scale; // global output store
}

View File

@ -33,8 +33,8 @@ static __device__ __forceinline__ int8_t float_to_int8_rn(float const x) {
template <typename fp8_type>
static __device__ __forceinline__ fp8_type float_to_fp8(float const x) {
float const r =
fmax(-quant_type_max_v<fp8_type>, fmin(x, quant_type_max_v<fp8_type>));
float const r = fmax(-fp8_e4m3_adjusted_max_v<fp8_type>,
fmin(x, fp8_e4m3_adjusted_max_v<fp8_type>));
return static_cast<fp8_type>(r);
}

View File

@ -1,59 +0,0 @@
#pragma once
/**
* Quantization utilities including:
* Adjusted maximum values for qtypes.
* Minimum scaling factors for qtypes.
*/
#include <cmath>
#include <torch/types.h>
#ifndef USE_ROCM
#include <c10/util/Float8_e4m3fn.h>
#define MAYBE_HOST_DEVICE C10_HOST_DEVICE
#else
#include <ATen/hip/HIPContext.h>
#include <c10/util/Float8_e4m3fn.h>
#include <c10/util/Float8_e4m3fnuz.h>
// ROCm doesn't seem to need C10_HOST_DEVICE for static constexpr
#define MAYBE_HOST_DEVICE
#endif
template <typename T,
typename = std::enable_if_t<std::is_same_v<T, c10::Float8_e4m3fn> ||
std::is_same_v<T, c10::Float8_e4m3fnuz> ||
std::is_same_v<T, int8_t>>>
struct quant_type_max {
static constexpr T val() { return std::numeric_limits<T>::max(); }
};
// Using the default max value from pytorch (240.0 0x7F) will cause accuracy
// issues when running dynamic quantization. Here use 224.0 0x7E for rocm.
template <>
struct quant_type_max<c10::Float8_e4m3fnuz> {
static constexpr c10::Float8_e4m3fnuz val() {
return c10::Float8_e4m3fnuz(0x7E, c10::Float8_e4m3fnuz::from_bits());
}
};
template <typename T>
MAYBE_HOST_DEVICE static constexpr T quant_type_max_v =
quant_type_max<T>::val();
template <typename T,
typename = std::enable_if_t<std::is_same_v<T, c10::Float8_e4m3fn> ||
std::is_same_v<T, c10::Float8_e4m3fnuz> ||
std::is_same_v<T, int8_t>>>
struct min_scaling_factor {
C10_DEVICE C10_ALWAYS_INLINE static float val() {
return 1.0f / (quant_type_max_v<T> * 512.0f);
}
};
template <>
struct min_scaling_factor<int8_t> {
C10_DEVICE C10_ALWAYS_INLINE static float val() {
return std::numeric_limits<float>::epsilon();
}
};

View File

@ -31,10 +31,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("weak_ref_tensor(Tensor input) -> Tensor");
ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
ops.def("get_cuda_view_from_cpu_tensor(Tensor cpu_tensor) -> Tensor");
ops.impl("get_cuda_view_from_cpu_tensor", torch::kCPU,
&get_cuda_view_from_cpu_tensor);
// Attention ops
// Compute the attention between an input query and the cached
// keys/values using PagedAttention.
@ -614,11 +610,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
&get_max_shared_memory_per_block_device_attribute);
}
#ifndef USE_ROCM
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
// Custom all-reduce kernels
custom_ar.def(
"init_custom_ar(int[] ipc_tensors, Tensor rank_data, "
"int rank, bool fully_connected) -> int");
"int rank, bool full_nvlink) -> int");
custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
custom_ar.def(
"all_reduce(int fa, Tensor inp, Tensor! out, int reg_buffer, "
@ -631,13 +628,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
custom_ar.def("register_buffer", &register_buffer);
custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
custom_ar.def("register_graph_buffers", &register_graph_buffers);
custom_ar.def("allocate_shared_buffer_and_handle",
&allocate_shared_buffer_and_handle);
custom_ar.def("open_mem_handle(Tensor mem_handle) -> int", &open_mem_handle);
custom_ar.impl("open_mem_handle", torch::kCPU, &open_mem_handle);
custom_ar.def("free_shared_buffer", &free_shared_buffer);
}
#endif
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

View File

@ -1,138 +0,0 @@
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
#
# Build targets:
# vllm-openai (default): used for serving deployment
# vllm-test: used for CI tests
# vllm-dev: used for development
#
# Build arguments:
# PYTHON_VERSION=3.12 (default)|3.11|3.10|3.9
# VLLM_CPU_DISABLE_AVX512=false (default)|true
#
######################### BASE IMAGE #########################
FROM ubuntu:22.04 AS base
WORKDIR /workspace/
ARG PYTHON_VERSION=3.12
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
# Install minimal dependencies and uv
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get update -y \
&& apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
&& curl -LsSf https://astral.sh/uv/install.sh | sh
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
ENV PATH="/root/.local/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv"
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV UV_HTTP_TIMEOUT=500
# Install Python dependencies
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
ENV UV_INDEX_STRATEGY="unsafe-best-match"
ENV UV_LINK_MODE="copy"
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
uv pip install --upgrade pip && \
uv pip install -r requirements/cpu.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install intel-openmp==2024.2.1 intel_extension_for_pytorch==2.6.0
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
RUN echo 'ulimit -c 0' >> ~/.bashrc
######################### BUILD IMAGE #########################
FROM base AS vllm-build
ARG GIT_REPO_CHECK=0
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
uv pip install -r requirements/build.txt
COPY . .
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
######################### DEV IMAGE #########################
FROM vllm-build AS vllm-dev
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get install -y --no-install-recommends vim numactl
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py develop
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/dev.txt && \
pre-commit install --hook-type pre-commit --hook-type commit-msg
ENTRYPOINT ["bash"]
######################### TEST IMAGE #########################
FROM base AS vllm-test
WORKDIR /workspace/
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/test.txt,target=requirements/test.txt \
uv pip install -r requirements/test.txt
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
uv pip install dist/*.whl
ADD ./tests/ ./tests/
ADD ./examples/ ./examples/
ADD ./benchmarks/ ./benchmarks/
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils
ENTRYPOINT ["bash"]
######################### RELEASE IMAGE #########################
FROM base AS vllm-openai
WORKDIR /workspace/
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
uv pip install dist/*.whl
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -2,42 +2,19 @@
## Build the docs
- Make sure in `docs` directory
```bash
cd docs
```
- Install the dependencies:
```bash
# Install dependencies.
pip install -r ../requirements/docs.txt
```
- Clean the previous build (optional but recommended):
```bash
# Build the docs.
make clean
```
- Generate the HTML documentation:
```bash
make html
```
## Open the docs with your browser
- Serve the documentation locally:
```bash
python -m http.server -d build/html/
```
This will start a local server at http://localhost:8000. You can now open your browser and view the documentation.
If port 8000 is already in use, you can specify a different port, for example:
```bash
python -m http.server 3000 -d build/html/
```
Launch your browser and open localhost:8000.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 47 KiB

After

Width:  |  Height:  |  Size: 34 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 50 KiB

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 59 KiB

After

Width:  |  Height:  |  Size: 41 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 54 KiB

After

Width:  |  Height:  |  Size: 39 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 54 KiB

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 55 KiB

After

Width:  |  Height:  |  Size: 32 KiB

View File

@ -4,8 +4,6 @@
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
- [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama), March 27th 2025. [[Slides]](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [The first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg), March 16th 2025. [[Slides]](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0)
- [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing)
- [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing)

View File

@ -104,7 +104,7 @@ myst_url_schemes = {
"classes": ["github"],
},
"gh-project": {
"url": "https://github.com/orgs/vllm-project/projects/{{path}}",
"url": "https://github.com/vllm-project/projects/{{path}}",
"title": "Project #{{path}}",
"classes": ["github"],
},

View File

@ -1,6 +1,6 @@
# Dockerfile
We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
We provide a <gh-file:Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
More information about deploying with Docker can be found [here](#deployment-docker).
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
@ -28,7 +28,7 @@ The edges of the build graph represent:
> Commands to regenerate the build graph (make sure to run it **from the \`root\` directory of the vLLM repository** where the dockerfile is present):
>
> ```bash
> dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename docker/Dockerfile
> dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile
> ```
>
> or in case you want to run it directly with the docker image:
@ -43,7 +43,7 @@ The edges of the build graph represent:
> --output png \
> --dpi 200 \
> --max-label-length 50 \
> --filename docker/Dockerfile \
> --filename Dockerfile \
> --legend
> ```
>

View File

@ -44,12 +44,6 @@ pre-commit run --all-files
pytest tests/
```
:::{tip}
Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
:::
:::{note}
Currently, the repository is not fully checked by `mypy`.
:::

View File

@ -61,11 +61,11 @@ RUN uv pip install --system git+https://github.com/huggingface/transformers.git
## Building vLLM's Docker Image from Source
You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:
You can build and run vLLM from source via the provided <gh-file:Dockerfile>. To build vLLM:
```console
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --file docker/Dockerfile
DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
```
:::{note}
@ -92,7 +92,6 @@ Keep an eye on memory usage with parallel jobs as it can be substantial (see exa
# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
$ python3 use_existing_torch.py
$ DOCKER_BUILDKIT=1 docker build . \
--file docker/Dockerfile \
--target vllm-openai \
--platform "linux/arm64" \
-t vllm/vllm-gh200-openai:latest \

View File

@ -69,14 +69,14 @@ server {
```console
cd $vllm_root
docker build -f docker/Dockerfile . --tag vllm
docker build -f Dockerfile . --tag vllm
```
If you are behind proxy, you can pass the proxy settings to the docker build command as shown below:
```console
cd $vllm_root
docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy
docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy
```
(nginxloadbalancer-nginx-docker-network)=

View File

@ -24,7 +24,7 @@ This document describes how vLLM deals with these challenges.
[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
- `spawn` - spawn a new Python process. This will be the default as of Python
3.14. In macOS, this is already the default.
3.14.
- `fork` - Use `os.fork()` to fork the Python interpreter. This is the default
in Python versions prior to 3.14.
@ -34,7 +34,7 @@ This document describes how vLLM deals with these challenges.
### Tradeoffs
`fork` is the fastest method, but is incompatible with dependencies that use
threads. If you are under macOS, using `fork` may cause the process to crash.
threads.
`spawn` is more compatible with dependencies, but can be problematic when vLLM
is used as a library. If the consuming code does not use a `__main__` guard (`if

View File

@ -16,6 +16,5 @@ gptqmodel
int4
int8
fp8
quark
quantized_kvcache
:::

View File

@ -1,217 +0,0 @@
(quark)=
# AMD QUARK
Quantization can effectively reduce memory and bandwidth usage, accelerate computation and improve
throughput while with minimal accuracy loss. vLLM can leverage [Quark](https://quark.docs.amd.com/latest/),
the flexible and powerful quantization toolkit, to produce performant quantized models to run on AMD GPUs. Quark has specialized support for quantizing large language models with weight,
activation and kv-cache quantization and cutting-edge quantization algorithms like
AWQ, GPTQ, Rotation and SmoothQuant.
## Quark Installation
Before quantizing models, you need to install Quark. The latest release of Quark can be installed with pip:
```console
pip install amd-quark
```
You can refer to [Quark installation guide](https://quark.docs.amd.com/latest/install.html)
for more installation details.
## Quantization Process
After installing Quark, we will use an example to illustrate how to use Quark.
The Quark quantization process can be listed for 5 steps as below:
1. Load the model
2. Prepare the calibration dataloader
3. Set the quantization configuration
4. Quantize the model and export
5. Evaluation in vLLM
### 1. Load the Model
Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index)
to fetch model and tokenizer.
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
MAX_SEQ_LEN = 512
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto",
)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
tokenizer.pad_token = tokenizer.eos_token
```
### 2. Prepare the Calibration Dataloader
Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html)
to load calibration data. For more details about how to use calibration datasets efficiently, please refer
to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html).
```python
from datasets import load_dataset
from torch.utils.data import DataLoader
BATCH_SIZE = 1
NUM_CALIBRATION_DATA = 512
# Load the dataset and get calibration data.
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
text_data = dataset["text"][:NUM_CALIBRATION_DATA]
tokenized_outputs = tokenizer(text_data, return_tensors="pt",
padding=True, truncation=True, max_length=MAX_SEQ_LEN)
calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
batch_size=BATCH_SIZE, drop_last=True)
```
### 3. Set the Quantization Configuration
We need to set the quantization configuration, you can check
[quark config guide](https://quark.docs.amd.com/latest/pytorch/user_guide_config_description.html)
for further details. Here we use FP8 per-tensor quantization on weight, activation,
kv-cache and the quantization algorithm is AutoSmoothQuant.
:::{note}
Note the quantization algorithm needs a JSON config file and the config file is located in
[Quark Pytorch examples](https://quark.docs.amd.com/latest/pytorch/pytorch_examples.html),
under the directory `examples/torch/language_modeling/llm_ptq/models`. For example,
AutoSmoothQuant config file for Llama is
`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.
:::
```python
from quark.torch.quantization import (Config, QuantizationConfig,
FP8E4M3PerTensorSpec,
load_quant_algo_config_from_file)
# Define fp8/per-tensor/static spec.
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
is_dynamic=False).to_quantization_spec()
# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
weight=FP8_PER_TENSOR_SPEC)
# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
kv_cache_quant_config = {name :
QuantizationConfig(input_tensors=global_quant_config.input_tensors,
weight=global_quant_config.weight,
output_tensors=KV_CACHE_SPEC)
for name in kv_cache_layer_names_for_llama}
layer_quant_config = kv_cache_quant_config.copy()
# Define algorithm config by config file.
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
EXCLUDE_LAYERS = ["lm_head"]
quant_config = Config(
global_quant_config=global_quant_config,
layer_quant_config=layer_quant_config,
kv_cache_quant_config=kv_cache_quant_config,
exclude=EXCLUDE_LAYERS,
algo_config=algo_config)
```
### 4. Quantize the Model and Export
Then we can apply the quantization. After quantizing, we need to freeze the
quantized model first before exporting. Note that we need to export model with format of
HuggingFace `safetensors`, you can refer to
[HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html)
for more exporting format details.
```python
import torch
from quark.torch import ModelQuantizer, ModelExporter
from quark.torch.export import ExporterConfig, JsonExporterConfig
# Apply quantization.
quantizer = ModelQuantizer(quant_config)
quant_model = quantizer.quantize_model(model, calib_dataloader)
# Freeze quantized model to export.
freezed_model = quantizer.freeze(model)
# Define export config.
LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
export_config = ExporterConfig(json_export_config=JsonExporterConfig())
export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
with torch.no_grad():
exporter.export_safetensors_model(freezed_model,
quant_config=quant_config, tokenizer=tokenizer)
```
### 5. Evaluation in vLLM
Now, you can load and run the Quark quantized model directly through the LLM entrypoint:
```python
from vllm import LLM, SamplingParams
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
kv_cache_dtype='fp8',quantization='quark')
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}")
print(f"Output: {generated_text!r}")
print("-" * 60)
```
Or, you can use `lm_eval` to evaluate accuracy:
```console
$ lm_eval --model vllm \
--model_args pretrained=Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant,kv_cache_dtype='fp8',quantization='quark' \
--tasks gsm8k
```
## Quark Quantization Script
In addition to the example of Python API above, Quark also offers a
[quantization script](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html)
to quantize large language models more conveniently. It supports quantizing models with variety
of different quantization schemes and optimization algorithms. It can export the quantized model
and run evaluation tasks on the fly. With the script, the example above can be:
```console
python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \
--output_dir /path/to/output \
--quant_scheme w_fp8_a_fp8 \
--kv_cache_dtype fp8 \
--quant_algo autosmoothquant \
--num_calib_data 512 \
--model_export hf_format \
--tasks gsm8k
```

View File

@ -136,14 +136,7 @@ Remember to check whether the `reasoning_content` exists in the response before
## Structured output
The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. It is only supported in v0 engine now.
```bash
VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--enable-reasoning --reasoning-parser deepseek_r1
```
Please note that the `VLLM_USE_V1` environment variable must be set to `0` to use the v0 engine.
The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output.
```python
from openai import OpenAI

View File

@ -52,7 +52,7 @@ python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model
```
:::{warning}
Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately has been deprecated now.
Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately will be deprecated in the next release.
:::
Then use a client:

View File

@ -86,7 +86,7 @@ Currently, there are no pre-built Intel Gaudi images.
### Build image from source
```console
docker build -f docker/Dockerfile.hpu -t vllm-hpu-env .
docker build -f Dockerfile.hpu -t vllm-hpu-env .
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
```

View File

@ -132,7 +132,7 @@ Currently, there are no pre-built Neuron images.
See <project:#deployment-docker-build-image-from-source> for instructions on building the Docker image.
Make sure to use <gh-file:docker/Dockerfile.neuron> in place of the default Dockerfile.
Make sure to use <gh-file:Dockerfile.neuron> in place of the default Dockerfile.
## Extra information

View File

@ -169,10 +169,10 @@ See <project:#deployment-docker-pre-built-image> for instructions on using the o
### Build image from source
You can use <gh-file:docker/Dockerfile.tpu> to build a Docker image with TPU support.
You can use <gh-file:Dockerfile.tpu> to build a Docker image with TPU support.
```console
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
docker build -f Dockerfile.tpu -t vllm-tpu .
```
Run the Docker image with the following command:

View File

@ -159,45 +159,26 @@ Currently, there are no pre-built CPU wheels.
### Pre-built images
:::::{tab-set}
:sync-group: device
::::{tab-item} Intel/AMD x86
:sync: x86
:::{include} cpu/x86.inc.md
:start-after: "### Pre-built images"
:end-before: "### Build image from source"
:::
::::
:::::
Currently, there are no pre-build CPU images.
### Build image from source
```console
$ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
# Launching OpenAI server
$ docker run --rm \
--privileged=true \
--shm-size=4g \
-p 8000:8000 \
-e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
-e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
vllm-cpu-env \
--model=meta-llama/Llama-3.2-1B-Instruct \
--dtype=bfloat16 \
other vLLM OpenAI server arguments
$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
$ docker run -it \
--rm \
--network=host \
--cpuset-cpus=<cpu-id-list, optional> \
--cpuset-mems=<memory-node, optional> \
vllm-cpu-env
```
::::{tip}
For ARM or Apple silicon, use `docker/Dockerfile.arm`
For ARM or Apple silicon, use `Dockerfile.arm`
::::
::::{tip}
For IBM Z (s390x), use `docker/Dockerfile.s390x` and in `docker run` use flag `--dtype float`
For IBM Z (s390x), use `Dockerfile.s390x` and in `docker run` use flag `--dtype float`
::::
## Supported features

View File

@ -34,8 +34,6 @@ There are no pre-built wheels or images for this device, so you must build vLLM
### Pre-built images
See [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo)
### Build image from source
## Extra information

View File

@ -8,7 +8,7 @@ There are no pre-built wheels for this device, so you must either use the pre-bu
## Requirements
- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100/1101), Radeon RX 9000 series (gfx1200/1201)
- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
- ROCm 6.3
## Set up using Python
@ -123,7 +123,7 @@ Building the Docker image from source is the recommended way to use vLLM with RO
#### (Optional) Build an image with ROCm software stack
Build a docker image from <gh-file:docker/Dockerfile.rocm_base> which setup ROCm software stack needed by the vLLM.
Build a docker image from <gh-file:Dockerfile.rocm_base> which setup ROCm software stack needed by the vLLM.
**This step is optional as this rocm_base image is usually prebuilt and store at [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under tag `rocm/vllm-dev:base` to speed up user experience.**
If you choose to build this rocm_base image yourself, the steps are as follows.
@ -140,12 +140,12 @@ It is important that the user kicks off the docker build using buildkit. Either
To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:
```console
DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm_base -t rocm/vllm-dev:base .
DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm_base -t rocm/vllm-dev:base .
```
#### Build an image with vLLM
First, build a docker image from <gh-file:docker/Dockerfile.rocm> and launch a docker container from the image.
First, build a docker image from <gh-file:Dockerfile.rocm> and launch a docker container from the image.
It is important that the user kicks off the docker build using buildkit. Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
```console
@ -156,10 +156,10 @@ It is important that the user kicks off the docker build using buildkit. Either
}
```
<gh-file:docker/Dockerfile.rocm> uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
<gh-file:Dockerfile.rocm> uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
It provides flexibility to customize the build of docker image using the following arguments:
- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using <gh-file:docker/Dockerfile.rocm_base>
- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using <gh-file:Dockerfile.rocm_base>
- `USE_CYTHON`: An option to run cython compilation on a subset of python files upon docker build
- `BUILD_RPD`: Include RocmProfileData profiling tool in the image
- `ARG_PYTORCH_ROCM_ARCH`: Allows to override the gfx architecture values from the base docker image
@ -169,13 +169,13 @@ Their values can be passed in when running `docker build` with `--build-arg` opt
To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:
```console
DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm .
DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
```
To build vllm on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image:
```console
DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f docker/Dockerfile.rocm -t vllm-rocm .
DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f Dockerfile.rocm -t vllm-rocm .
```
To run the above docker image `vllm-rocm`, use the below command:

View File

@ -54,7 +54,7 @@ Currently, there are no pre-built XPU images.
### Build image from source
```console
$ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
$ docker run -it \
--rm \
--network=host \

View File

@ -1,4 +1,4 @@
You can create a new Python environment using [conda](https://docs.conda.io/projects/conda/en/stable/user-guide/getting-started.html):
You can create a new Python environment using `conda`:
```console
# (Recommended) Create a new conda environment.

View File

@ -208,5 +208,5 @@ Currently, vLLM supports multiple backends for efficient Attention computation a
If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.
```{attention}
There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see <gh-file:docker/Dockerfile> for instructions on how to install it.
There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see [Dockerfile](https://github.com/vllm-project/vllm/blob/main/Dockerfile) for instructions on how to install it.
```

View File

@ -26,14 +26,6 @@ To isolate the model downloading and loading issue, you can use the `--load-form
If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider [using tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
## Generation quality changed
In v0.8.0, the source of default sampling parameters was changed in <gh-pr:12622>. Prior to v0.8.0, the default sampling parameters came from vLLM's set of neutral defaults. From v0.8.0 onwards, the default sampling parameters come from the `generation_config.json` provided by the model creator.
In most cases, this should lead to higher quality responses, because the model creator is likely to know which sampling parameters are best for their model. However, in some cases the defaults provided by the model creator can lead to degraded performance.
You can check if this is happening by trying the old defaults with `--generation-config vllm` for online and `generation_config="vllm"` for offline. If, after trying this, your generation quality improves we would recommend continuing to use the vLLM defaults and petition the model creator on <https://huggingface.co> to update their default `generation_config.json` so that it produces better quality generations.
## Enable more logging
If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue:

View File

@ -77,9 +77,9 @@ getting_started/v1_user_guide
:caption: Models
:maxdepth: 1
models/supported_models
models/generative_models
models/pooling_models
models/supported_models
models/extensions/index
:::

View File

@ -23,8 +23,6 @@ It is similar to [its counterpart in HF Transformers](https://huggingface.co/doc
except that tokenization and detokenization are also performed automatically.
```python
from vllm import LLM
llm = LLM(model="facebook/opt-125m")
outputs = llm.generate("Hello, my name is")
@ -38,8 +36,6 @@ You can optionally control the language generation by passing {class}`~vllm.Samp
For example, you can use greedy sampling by setting `temperature=0`:
```python
from vllm import LLM, SamplingParams
llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=0)
outputs = llm.generate("Hello, my name is", params)
@ -87,8 +83,6 @@ Base models may perform poorly as they are not trained to respond to the chat co
:::
```python
from vllm import LLM
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
conversation = [
{

View File

@ -68,8 +68,6 @@ The {class}`~vllm.LLM.encode` method is available to all pooling models in vLLM.
It returns the extracted hidden states directly, which is useful for reward models.
```python
from vllm import LLM
llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward")
(output,) = llm.encode("Hello, my name is")
@ -83,8 +81,6 @@ The {class}`~vllm.LLM.embed` method outputs an embedding vector for each prompt.
It is primarily designed for embedding models.
```python
from vllm import LLM
llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")
(output,) = llm.embed("Hello, my name is")
@ -100,8 +96,6 @@ The {class}`~vllm.LLM.classify` method outputs a probability vector for each pro
It is primarily designed for classification models.
```python
from vllm import LLM
llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify")
(output,) = llm.classify("Hello, my name is")
@ -122,8 +116,6 @@ To handle RAG at a higher level, you should use integration frameworks such as [
:::
```python
from vllm import LLM
llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")
(output,) = llm.score("What is the capital of France?",
"The capital of Brazil is Brasilia.")

View File

@ -1,28 +1,55 @@
(supported-models)=
# Supported Models
# List of Supported Models
vLLM supports [generative](generative-models) and [pooling](pooling-models) models across various tasks.
vLLM supports generative and pooling models across various tasks.
If a model supports more than one task, you can set the task via the `--task` argument.
For each task, we list the model architectures that have been implemented in vLLM.
Alongside each architecture, we include some popular models that use it.
## Model Implementation
## Loading a Model
### vLLM
### HuggingFace Hub
If vLLM natively supports a model, its implementation can be found in <gh-file:vllm/model_executor/models>.
By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co/models).
These models are what we list in <project:#supported-text-models> and <project:#supported-mm-models>.
To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository.
If the `"architectures"` field contains a model architecture listed below, then it should be natively supported.
(transformers-backend)=
Models do not _need_ to be natively supported to be used in vLLM.
The <project:#transformers-fallback> enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
### Transformers
:::{tip}
The easiest way to check if your model is really supported at runtime is to run the program below:
vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models are supported, and vision language model support is planned!
```python
from vllm import LLM
To check if the modeling backend is Transformers, you can simply do this:
# For generative models (task=generate) only
llm = LLM(model=..., task="generate") # Name or path of your model
output = llm.generate("Hello, my name is")
print(output)
# For pooling models (task={embed,classify,reward,score}) only
llm = LLM(model=..., task="embed") # Name or path of your model
output = llm.encode("Hello, my name is")
print(output)
```
If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported.
:::
Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM.
Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
(transformers-fallback)=
### Transformers fallback
vLLM can fallback to model implementations that are available in Transformers. This does not work for all models for now, but most decoder language models are supported, and vision language model support is planned!
To check if the backend is Transformers, you can simply do this:
```python
from vllm import LLM
@ -42,15 +69,16 @@ vLLM may not fully optimise the Transformers implementation so you may see degra
#### Supported features
The Transformers modeling backend explicitly supports the following features:
The Transformers fallback explicitly supports the following features:
- <project:#quantization-index> (except GGUF)
- <project:#lora-adapter>
- <project:#distributed-serving>
- <project:#distributed-serving> (requires `transformers>=4.49.0`)
#### Remote Code
#### Remote code
If your model is neither supported natively by vLLM or Transformers, you can still run it in vLLM!
Earlier we mentioned that the Transformers fallback enables you to run remote code models directly in vLLM.
If you are interested in this feature, this section is for you!
Simply set `trust_remote_code=True` and vLLM will run any model on the Model Hub that is compatible with Transformers.
Provided that the model writer implements their model in a compatible way, this means that you can run new models before they are officially supported in Transformers or vLLM!
@ -61,7 +89,7 @@ llm = LLM(model=..., task="generate", trust_remote_code=True) # Name or path of
llm.apply_model(lambda model: print(model.__class__))
```
To make your model compatible with the Transformers backend, it needs:
To make your model compatible with the Transformers fallback, it needs:
```{code-block} python
:caption: modeling_my_model.py
@ -93,9 +121,7 @@ Here is what happens in the background:
2. `MyModel` Python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`.
3. The `TransformersForCausalLM` backend is used. See <gh-file:vllm/model_executor/models/transformers.py>, which leverage `self.config._attn_implementation = "vllm"`, thus the need to use `ALL_ATTENTION_FUNCTION`.
That's it!
For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class:
To make your model compatible with tensor parallel, it needs:
```{code-block} python
:caption: configuration_my_model.py
@ -104,65 +130,20 @@ from transformers import PretrainedConfig
class MyConfig(PretrainedConfig):
base_model_tp_plan = {
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
base_model_pp_plan = {
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
"norm": (["hidden_states"], ["hidden_states"]),
"layers.*.self_attn.q_proj": "colwise",
...
}
```
- `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported).
- `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s:
* You only need to do this for layers which are not present on all pipeline stages
* vLLM assumes that there will be only one `nn.ModuleList`, which is distributed across the pipeline stages
* The `list` in the first element of the `tuple` contains the names of the input arguments
* The `list` in the last element of the `tuple` contains the names of the variables the layer outputs to in your modeling code
## Loading a Model
### Hugging Face Hub
By default, vLLM loads models from [Hugging Face (HF) Hub](https://huggingface.co/models).
To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository.
If the `"architectures"` field contains a model architecture listed below, then it should be natively supported.
Models do not _need_ to be natively supported to be used in vLLM.
The [Transformers backend](#transformers-backend) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
:::{tip}
The easiest way to check if your model is really supported at runtime is to run the program below:
```python
from vllm import LLM
# For generative models (task=generate) only
llm = LLM(model=..., task="generate") # Name or path of your model
output = llm.generate("Hello, my name is")
print(output)
# For pooling models (task={embed,classify,reward,score}) only
llm = LLM(model=..., task="embed") # Name or path of your model
output = llm.encode("Hello, my name is")
print(output)
```
If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported.
`base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported).
:::
Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM.
Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
That's it!
### ModelScope
To use models from [ModelScope](https://www.modelscope.cn) instead of Hugging Face Hub, set an environment variable:
To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable:
```shell
export VLLM_USE_MODELSCOPE=True
@ -184,8 +165,6 @@ output = llm.encode("Hello, my name is")
print(output)
```
(supported-text-models)=
## List of Text-only Language Models
### Generative Models
@ -245,7 +224,7 @@ See [this page](#generative-models) for more information on how to use generativ
* ✅︎
- * `DeciLMForCausalLM`
* DeciLM
* `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc.
* `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.
*
* ✅︎
- * `DeepseekForCausalLM`
@ -503,11 +482,6 @@ See [this page](#generative-models) for more information on how to use generativ
* `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
* ✅︎
* ✅︎
- * `MiniMaxText01ForCausalLM`
* MiniMax-Text
* `MiniMaxAI/MiniMax-Text-01`, etc.
*
* ✅︎
- * `Zamba2ForCausalLM`
* Zamba2
* `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc.
@ -571,7 +545,7 @@ you should explicitly specify the task type to ensure that the model is used in
*
- * `XLMRobertaModel`
* XLM-RoBERTa-based
* `intfloat/multilingual-e5-large`, `jinaai/jina-reranker-v2-base-multilingual`, etc.
* `intfloat/multilingual-e5-large`, etc.
*
*
:::
@ -758,13 +732,6 @@ See [this page](#generative-models) for more information on how to use generativ
*
* ✅︎
* ✅︎
- * `AyaVisionForConditionalGeneration`
* Aya Vision
* T + I<sup>+</sup>
* `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc.
*
* ✅︎
* ✅︎
- * `Blip2ForConditionalGeneration`
* BLIP-2
* T + I<sup>E</sup>
@ -877,13 +844,6 @@ See [this page](#generative-models) for more information on how to use generativ
* ✅︎
* ✅︎
* ✅︎
- * `Mistral3ForConditionalGeneration`
* Mistral3
* T + I<sup>+</sup>
* `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc.
*
* ✅︎
*
- * `MllamaForConditionalGeneration`
* Llama 3.2
* T + I<sup>+</sup>
@ -961,13 +921,6 @@ See [this page](#generative-models) for more information on how to use generativ
* ✅︎
* ✅︎
* ✅︎
- * `SkyworkR1VChatModel`
* Skywork-R1V-38B
* T + I
* `Skywork/Skywork-R1V-38B`
*
* ✅︎
* ✅︎
- * `UltravoxModel`
* Ultravox
* T + A<sup>E+</sup>
@ -1106,7 +1059,7 @@ At vLLM, we are committed to facilitating the integration and support of third-p
2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results.
:::{tip}
When comparing the output of `model.generate` from Hugging Face Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
:::
3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback.

View File

@ -31,8 +31,6 @@ vLLM supports an experimental feature chunked prefill. Chunked prefill allows to
You can enable the feature by specifying `--enable-chunked-prefill` in the command line or setting `enable_chunked_prefill=True` in the LLM constructor.
```python
from vllm import LLM
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True)
# Set max_num_batched_tokens to tune performance.
# NOTE: 2048 is the default max_num_batched_tokens for chunked prefill.

View File

@ -21,8 +21,6 @@ To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`
You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
```python
from vllm import LLM
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
# Refer to the HuggingFace repo for the correct format to use
@ -67,8 +65,6 @@ Full example: <gh-file:examples/offline_inference/vision_language.py>
To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
```python
from vllm import LLM
llm = LLM(
model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True, # Required to load Phi-3.5-vision
@ -100,8 +96,6 @@ Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
```python
from vllm import LLM
# Specify the maximum number of frames per video to be 4. This can be changed.
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
@ -145,8 +139,6 @@ To input pre-computed embeddings belonging to a data type (i.e. image, video, or
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
```python
from vllm import LLM
# Inference with image embeddings as input
llm = LLM(model="llava-hf/llava-1.5-7b-hf")

View File

@ -11,8 +11,6 @@ For example, the following code downloads the [`facebook/opt-125m`](https://hugg
and runs it in vLLM using the default configuration.
```python
from vllm import LLM
llm = LLM(model="facebook/opt-125m")
```
@ -49,8 +47,6 @@ To fix this, explicitly specify the model architecture by passing `config.json`
For example:
```python
from vllm import LLM
model = LLM(
model="cerebras/Cerebras-GPT-1.3B",
hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2
@ -96,8 +92,6 @@ You can further reduce memory usage by limiting the context length of the model
and the maximum batch size (`max_num_seqs` option).
```python
from vllm import LLM
llm = LLM(model="adept/fuyu-8b",
max_model_len=2048,
max_num_seqs=2)

View File

@ -188,7 +188,6 @@ For example:
```yaml
# config.yaml
model: meta-llama/Llama-3.1-8B-Instruct
host: "127.0.0.1"
port: 6379
uvicorn-log-level: "info"
@ -197,13 +196,12 @@ uvicorn-log-level: "info"
To use the above config file:
```bash
vllm serve --config config.yaml
vllm serve SOME_MODEL --config config.yaml
```
:::{note}
In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence.
The order of priorities is `command line > config file values > defaults`.
e.g. `vllm serve SOME_MODEL --config config.yaml`, SOME_MODEL takes precedence over `model` in config file.
:::
## API Reference

View File

@ -1,8 +1,6 @@
# Usage Stats Collection
vLLM collects anonymous usage data by default to help the engineering team better understand which hardware and model configurations are widely used. This data allows them to prioritize their efforts on the most common workloads. The collected data is transparent, does not contain any sensitive information.
A subset of the data, after cleaning and aggregation, will be publicly released for the community's benefit. For example, you can see the 2024 usage report [here](https://2024.vllm.ai).
vLLM collects anonymous usage data by default to help the engineering team better understand which hardware and model configurations are widely used. This data allows them to prioritize their efforts on the most common workloads. The collected data is transparent, does not contain any sensitive information, and will be publicly released for the community's benefit.
## What data is collected?

View File

@ -69,12 +69,10 @@ llm = LLM(
max_model_len=max_model_len,
max_num_seqs=args.max_num_seqs,
gpu_memory_utilization=0.8,
speculative_config={
"model": eagle_dir,
"num_speculative_tokens": args.num_spec_tokens,
"draft_tensor_parallel_size": args.draft_tp,
"max_model_len": max_model_len,
},
speculative_model=eagle_dir,
num_speculative_tokens=args.num_spec_tokens,
speculative_draft_tensor_parallel_size=args.draft_tp,
speculative_max_model_len=max_model_len,
disable_log_stats=False,
)

View File

@ -60,28 +60,6 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
)
# Aya Vision
def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "CohereForAI/aya-vision-8b"
engine_args = EngineArgs(
model=model_name,
max_model_len=2048,
max_num_seqs=2,
mm_processor_kwargs={"crop_to_patches": True},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
prompts = [
f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# BLIP-2
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -90,7 +68,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
prompts = [f"Question: {question} Answer:" for question in questions]
engine_args = EngineArgs(
model="Salesforce/blip2-opt-6.7b",
model="Salesforce/blip2-opt-2.7b",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
@ -150,8 +128,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs(
model="microsoft/Florence-2-large",
tokenizer="facebook/bart-large",
max_model_len=4096,
max_num_seqs=2,
max_num_seqs=8,
trust_remote_code=True,
dtype="bfloat16",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
@ -520,29 +497,6 @@ def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6")
# Mistral-3 HF-format
def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
# NOTE: Need L40 (or equivalent) to avoid OOM
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=2,
tensor_parallel_size=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# LLama 3.2
def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -557,7 +511,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=2,
max_num_seqs=16,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
@ -746,7 +700,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
# NOTE: Need L40 (or equivalent) to avoid OOM
engine_args = EngineArgs(
model=model_name,
max_model_len=6144,
max_model_len=8192,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
@ -850,44 +804,8 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
)
# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "Skywork/Skywork-R1V-38B"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
messages = [[{
'role': 'user',
'content': f"<image>\n{question}"
}] for question in questions]
prompts = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
# Stop tokens for SkyworkR1V
# https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
stop_tokens = ["<end▁of▁sentence>", "<|endoftext|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
model_example_map = {
"aria": run_aria,
"aya_vision": run_aya_vision,
"blip-2": run_blip2,
"chameleon": run_chameleon,
"deepseek_vl_v2": run_deepseek_vl2,
@ -905,7 +823,6 @@ model_example_map = {
"mantis": run_mantis,
"minicpmo": run_minicpmo,
"minicpmv": run_minicpmv,
"mistral3": run_mistral3,
"mllama": run_mllama,
"molmo": run_molmo,
"NVLM_D": run_nvlm_d,
@ -917,7 +834,6 @@ model_example_map = {
"qwen_vl": run_qwen_vl,
"qwen2_vl": run_qwen2_vl,
"qwen2_5_vl": run_qwen2_5_vl,
"skywork_chat": run_skyworkr1v,
}

View File

@ -61,41 +61,6 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "CohereForAI/aya-vision-8b"
engine_args = EngineArgs(
model=model_name,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [{
"role":
"user",
"content": [
*placeholders,
{
"type": "text",
"text": question
},
],
}]
processor = AutoProcessor.from_pretrained(model_name)
prompt = processor.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
)
def load_deepseek_vl2(question: str,
image_urls: list[str]) -> ModelRequestData:
model_name = "deepseek-ai/deepseek-vl2-tiny"
@ -253,28 +218,6 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
# Adjust this as necessary to fit in GPU
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=2,
tensor_parallel_size=2,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = "[IMG]" * len(image_urls)
prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
)
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
@ -286,8 +229,8 @@ def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
limit_mm_per_prompt={"image": len(image_urls)},
)
img_prompt = "Given the first image <|image|> and the second image<|image|>"
prompt = f"<|begin_of_text|>{img_prompt}, {question}?"
placeholders = "<|image|>" * len(image_urls)
prompt = f"{placeholders}<|begin_of_text|>{question}"
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
@ -561,13 +504,11 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
model_example_map = {
"aria": load_aria,
"aya_vision": load_aya_vision,
"deepseek_vl_v2": load_deepseek_vl2,
"gemma3": load_gemma3,
"h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3,
"internvl_chat": load_internvl,
"mistral3": load_mistral3,
"mllama": load_mllama,
"NVLM_D": load_nvlm_d,
"phi3_v": load_phi3v,

View File

@ -1,450 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
"""
This file provides a disaggregated prefilling proxy demo to demonstrate an
example usage of XpYd disaggregated prefilling.
We can launch multiple vllm instances (2 for prefill and 2 for decode), and
launch this proxy demo through:
python3 examples/online_serving/disagg_examples/disagg_proxy_demo.py \
--model $model_name \
--prefill localhost:8100 localhost:8101 \
--decode localhost:8200 localhost:8201 \
--port 8000
Note: This demo will be removed once the PDController implemented in PR 15343
(https://github.com/vllm-project/vllm/pull/15343) supports XpYd.
"""
import argparse
import ipaddress
import itertools
import json
import logging
import os
import sys
from abc import ABC, abstractmethod
from typing import Callable, Optional
import aiohttp
import requests
import uvicorn
from fastapi import (APIRouter, Depends, FastAPI, Header, HTTPException,
Request, status)
from fastapi.responses import JSONResponse, StreamingResponse
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)
class SchedulingPolicy(ABC):
@abstractmethod
def schedule(self, cycler: itertools.cycle):
raise NotImplementedError("Scheduling Proxy is not set.")
class Proxy:
def __init__(
self,
prefill_instances: list[str],
decode_instances: list[str],
model: str,
scheduling_policy: SchedulingPolicy,
custom_create_completion: Optional[Callable[[Request],
StreamingResponse]] = None,
custom_create_chat_completion: Optional[Callable[
[Request], StreamingResponse]] = None,
):
self.prefill_instances = prefill_instances
self.decode_instances = decode_instances
self.prefill_cycler = itertools.cycle(prefill_instances)
self.decode_cycler = itertools.cycle(decode_instances)
self.model = model
self.scheduling_policy = scheduling_policy
self.custom_create_completion = custom_create_completion
self.custom_create_chat_completion = custom_create_chat_completion
self.router = APIRouter()
self.setup_routes()
def setup_routes(self):
self.router.post(
"/v1/completions",
dependencies=[
Depends(self.validate_json_request)
])(self.custom_create_completion if self.
custom_create_completion else self.create_completion)
self.router.post(
"/v1/chat/completions",
dependencies=[
Depends(self.validate_json_request)
])(self.custom_create_chat_completion if self.
custom_create_chat_completion else self.create_chat_completion)
self.router.get("/status",
response_class=JSONResponse)(self.get_status)
self.router.post("/instances/add",
dependencies=[Depends(self.api_key_authenticate)
])(self.add_instance_endpoint)
async def validate_json_request(self, raw_request: Request):
content_type = raw_request.headers.get("content-type", "").lower()
if content_type != "application/json":
raise HTTPException(
status_code=415,
detail=
"Unsupported Media Type: Only 'application/json' is allowed",
)
def api_key_authenticate(self, x_api_key: str = Header(...)):
expected_api_key = os.environ.get("ADMIN_API_KEY")
if not expected_api_key:
logger.error("ADMIN_API_KEY is not set in the environment.")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Server configuration error.",
)
if x_api_key != expected_api_key:
logger.warning("Unauthorized access attempt with API Key: %s",
x_api_key)
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Forbidden: Invalid API Key.",
)
async def validate_instance(self, instance: str) -> bool:
url = f"http://{instance}/v1/models"
try:
async with aiohttp.ClientSession(
timeout=AIOHTTP_TIMEOUT) as client:
logger.info("Verifying %s ...", instance)
async with client.get(url) as response:
if response.status == 200:
data = await response.json()
if "data" in data and len(data["data"]) > 0:
model_cur = data["data"][0].get("id", "")
if model_cur == self.model:
logger.info("Instance: %s could be added.",
instance)
return True
else:
logger.warning("Mismatch model %s : %s != %s",
instance, model_cur, self.model)
return False
else:
return False
else:
return False
except aiohttp.ClientError as e:
logger.error(str(e))
return False
except Exception as e:
logger.error(str(e))
return False
async def add_instance_endpoint(self, request: Request):
try:
data = await request.json()
logger.warning(str(data))
instance_type = data.get("type")
instance = data.get("instance")
if instance_type not in ["prefill", "decode"]:
raise HTTPException(status_code=400,
detail="Invalid instance type.")
if not instance or ":" not in instance:
raise HTTPException(status_code=400,
detail="Invalid instance format.")
host, port_str = instance.split(":")
try:
if host != "localhost":
ipaddress.ip_address(host)
port = int(port_str)
if not (0 < port < 65536):
raise HTTPException(status_code=400,
detail="Invalid port number.")
except Exception as e:
raise HTTPException(status_code=400,
detail="Invalid instance address.") from e
is_valid = await self.validate_instance(instance)
if not is_valid:
raise HTTPException(status_code=400,
detail="Instance validation failed.")
if instance_type == "prefill":
if instance not in self.prefill_instances:
self.prefill_instances.append(instance)
self.prefill_cycler = itertools.cycle(
self.prefill_instances)
else:
raise HTTPException(status_code=400,
detail="Instance already exists.")
else:
if instance not in self.decode_instances:
self.decode_instances.append(instance)
self.decode_cycler = itertools.cycle(self.decode_instances)
else:
raise HTTPException(status_code=400,
detail="Instance already exists.")
return JSONResponse(content={
"message":
f"Added {instance} to {instance_type}_instances."
})
except HTTPException as http_exc:
raise http_exc
except Exception as e:
logger.error("Error in add_instance_endpoint: %s", str(e))
raise HTTPException(status_code=500, detail=str(e)) from e
async def forward_request(self, url, data, use_chunked=True):
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
try:
async with session.post(url=url, json=data,
headers=headers) as response:
if 200 <= response.status < 300 or 400 <= response.status < 500: # noqa: E501
if use_chunked:
async for chunk_bytes in response.content.iter_chunked( # noqa: E501
1024):
yield chunk_bytes
else:
content = await response.read()
yield content
else:
error_content = await response.text()
try:
error_content = json.loads(error_content)
except json.JSONDecodeError:
error_content = error_content
logger.error("Request failed with status %s: %s",
response.status, error_content)
raise HTTPException(
status_code=response.status,
detail=
f"Request failed with status {response.status}: "
f"{error_content}",
)
except aiohttp.ClientError as e:
logger.error("ClientError occurred: %s", str(e))
raise HTTPException(
status_code=502,
detail=
"Bad Gateway: Error communicating with upstream server.",
) from e
except Exception as e:
logger.error("Unexpected error: %s", str(e))
raise HTTPException(status_code=500, detail=str(e)) from e
def schedule(self, cycler: itertools.cycle) -> str:
return self.scheduling_policy.schedule(cycler)
async def get_status(self):
status = {
"prefill_node_count": len(self.prefill_instances),
"decode_node_count": len(self.decode_instances),
"prefill_nodes": self.prefill_instances,
"decode_nodes": self.decode_instances,
}
return status
async def create_completion(self, raw_request: Request):
try:
request = await raw_request.json()
kv_prepare_request = request.copy()
kv_prepare_request["max_tokens"] = 1
prefill_instance = self.schedule(self.prefill_cycler)
try:
async for _ in self.forward_request(
f"http://{prefill_instance}/v1/completions",
kv_prepare_request):
continue
except HTTPException as http_exc:
self.remove_instance_endpoint("prefill", prefill_instance)
raise http_exc
# Perform kv recv and decoding stage
decode_instance = self.schedule(self.decode_cycler)
try:
generator = self.forward_request(
f"http://{decode_instance}/v1/completions", request)
except HTTPException as http_exc:
self.remove_instance_endpoint("decode", decode_instance)
raise http_exc
response = StreamingResponse(generator)
return response
except Exception:
import sys
exc_info = sys.exc_info()
print("Error occurred in disagg proxy server")
print(exc_info)
async def create_chat_completion(self, raw_request: Request):
try:
request = await raw_request.json()
# add params to request
kv_prepare_request = request.copy()
kv_prepare_request["max_tokens"] = 1
# prefill stage
prefill_instance = self.schedule(self.prefill_cycler)
try:
async for _ in self.forward_request(
f"http://{prefill_instance}/v1/chat/completions",
kv_prepare_request):
continue
except HTTPException as http_exc:
self.remove_instance_endpoint("prefill", prefill_instance)
raise http_exc
# Perform kv recv and decoding stage
decode_instance = self.schedule(self.decode_cycler)
try:
generator = self.forward_request(
"http://" + decode_instance + "/v1/chat/completions",
request)
except HTTPException as http_exc:
self.remove_instance_endpoint("decode", decode_instance)
raise http_exc
response = StreamingResponse(content=generator)
return response
except Exception:
exc_info = sys.exc_info()
error_messages = [str(e) for e in exc_info if e]
print("Error occurred in disagg proxy server")
print(error_messages)
return StreamingResponse(content=iter(error_messages),
media_type="text/event-stream")
def remove_instance_endpoint(self, instance_type, instance):
if (instance_type == "decode" and instance in self.decode_instances):
self.decode_instances.remove(instance)
self.decode_cycler = itertools.cycle(self.decode_instances)
if (instance_type == "prefill" and instance in self.decode_instances):
self.prefill_instances.remove(instance)
self.prefill_cycler = itertools.cycle(self.decode_instances)
class RoundRobinSchedulingPolicy(SchedulingPolicy):
def __init__(self):
super().__init__()
def schedule(self, cycler: itertools.cycle) -> str:
return next(cycler)
class ProxyServer:
def __init__(
self,
args: argparse.Namespace,
scheduling_policy: Optional[SchedulingPolicy] = None,
create_completion: Optional[Callable[[Request],
StreamingResponse]] = None,
create_chat_completion: Optional[Callable[[Request],
StreamingResponse]] = None,
):
self.validate_parsed_serve_args(args)
self.port = args.port
self.proxy_instance = Proxy(
prefill_instances=[] if args.prefill is None else args.prefill,
decode_instances=[] if args.decode is None else args.decode,
model=args.model,
scheduling_policy=(scheduling_policy if scheduling_policy
is not None else RoundRobinSchedulingPolicy()),
custom_create_completion=create_completion,
custom_create_chat_completion=create_chat_completion,
)
def validate_parsed_serve_args(self, args: argparse.Namespace):
if not args.prefill:
raise ValueError("Please specify at least one prefill node.")
if not args.decode:
raise ValueError("Please specify at least one decode node.")
self.validate_instances(args.prefill)
self.validate_instances(args.decode)
self.verify_model_config(args.prefill, args.model)
self.verify_model_config(args.decode, args.model)
def validate_instances(self, instances: list):
for instance in instances:
if len(instance.split(":")) != 2:
raise ValueError(f"Invalid instance format: {instance}")
host, port = instance.split(":")
try:
if host != "localhost":
ipaddress.ip_address(host)
port = int(port)
if not (0 < port < 65536):
raise ValueError(
f"Invalid port number in instance: {instance}")
except Exception as e:
raise ValueError(
f"Invalid instance {instance}: {str(e)}") from e
def verify_model_config(self, instances: list, model: str) -> None:
model_suffix = model.split("/")[-1]
for instance in instances:
try:
response = requests.get(f"http://{instance}/v1/models")
if response.status_code == 200:
model_cur = response.json()["data"][0]["id"]
model_cur_suffix = model_cur.split("/")[-1]
if model_cur_suffix != model_suffix:
raise ValueError(
f"{instance} serves a different model: "
f"{model_cur} != {model}")
else:
raise ValueError(f"Cannot get model id from {instance}!")
except requests.RequestException as e:
raise ValueError(
f"Error communicating with {instance}: {str(e)}") from e
def run_server(self):
app = FastAPI()
app.include_router(self.proxy_instance.router)
config = uvicorn.Config(app, port=self.port, loop="uvloop")
server = uvicorn.Server(config)
server.run()
if __name__ == "__main__":
# Todo: allow more config
parser = argparse.ArgumentParser("vLLM disaggregated proxy server.")
parser.add_argument("--model",
"-m",
type=str,
required=True,
help="Model name")
parser.add_argument(
"--prefill",
"-p",
type=str,
nargs="+",
help="List of prefill node URLs (host:port)",
)
parser.add_argument(
"--decode",
"-d",
type=str,
nargs="+",
help="List of decode node URLs (host:port)",
)
parser.add_argument(
"--port",
type=int,
default=8000,
help="Server port number",
)
args = parser.parse_args()
proxy_server = ProxyServer(args=args)
proxy_server.run_server()

View File

@ -1,60 +0,0 @@
{%- if messages %}
{%- if system_message or tools %}
<|system|>
{%- if system_message %}
{{ system_message }}
{%- endif %}
In addition to plain text responses, you can chose to call one or more of the provided functions.
Use the following rule to decide when to call a function:
* if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
* if you need external information that can be obtained by calling one or more of the provided functions, generate a function calls
If you decide to call functions:
* prefix function calls with functools marker (no closing marker required)
* all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
* follow the provided JSON schema. Do not hallucinate arguments or values. Do to blindly copy values from the provided samples
* respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0
* make sure you pick the right functions that match the user intent
{%- if tools %}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %}<|end|>
{%- endif %}
{%- for message in messages %}
{%- if message.role != "system" %}
<|{{ message.role }}|>
{%- if message.content and message.role == "tools" %}
{"result": {{ message.content }}}
{%- elif message.content %}
{{ message.content }}
{%- elif message.tool_calls %}
{%- for call in message.tool_calls %}
{"name": "{{ call.function.name }}", "arguments": {{ call.function.arguments }}}
{%- if not loop.last %},{% endif %}
{%- endfor %}
{%- endif %}<|end|>
{%- endif %}
{%- endfor %}<|assistant|>
{%- else %}
{%- if system_message %}
<|system|>
{{ system_message }}<|end|>
{%- endif %}
{%- if prompt %}
<|user|>
{{ prompt }}<|end|>
{%- endif %}<|assistant|>
{%- endif %}
{{ response }}
{%- if response %}<|user|>{% endif %}

6
format.sh Normal file → Executable file
View File

@ -1,6 +1,6 @@
#!/bin/bash
echo "vLLM linting system has been moved from format.sh to pre-commit hooks."
echo "vLLM linting system has been moved from format.sh to pre-commit hook."
echo "Please run 'pip install -r requirements/lint.txt', followed by"
echo "'pre-commit install' to install the pre-commit hooks."
echo "Then linters will run automatically before each commit."
echo "'pre-commit install --hook-type pre-commit --hook-type commit-msg' to install the pre-commit hook."
echo "Then linters will run automatically before each commit."

16
python_only_dev.py Normal file
View File

@ -0,0 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
msg = """Old style python only build (without compilation) is deprecated, please check https://docs.vllm.ai/en/latest/getting_started/installation.html#python-only-build-without-compilation for the new way to do python only build (without compilation).
TL;DR:
VLLM_USE_PRECOMPILED=1 pip install -e .
or
export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
pip install -e .
""" # noqa
print(msg)

View File

@ -1,12 +1,12 @@
cachetools
psutil
sentencepiece # Required for LLaMA tokenizer.
numpy
numpy < 2.0.0
requests >= 2.26.0
tqdm
blake3
py-cpuinfo
transformers >= 4.50.3
transformers >= 4.48.2 # Required for Bamba model and Transformers backend.
tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
@ -30,7 +30,6 @@ msgspec
gguf == 0.10.0
importlib_metadata
mistral_common[opencv] >= 1.5.4
opencv-python-headless >= 4.11.0 # required for video IO
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12

View File

@ -1,8 +1,7 @@
# Common dependencies
-r common.txt
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61; python_version > '3.9'
numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
# Dependencies for NVIDIA GPUs
ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1.

View File

@ -6,7 +6,7 @@ torch==2.6.0
torchvision==0.21.0
torchaudio==2.6.0
cmake>=3.26,<4
cmake>=3.26
packaging
setuptools>=61
setuptools-scm>=8

View File

@ -1,8 +1,7 @@
# Common dependencies
-r common.txt
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61; python_version > '3.9'
numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
# Dependencies for AMD GPUs
awscli

View File

@ -9,6 +9,7 @@ pytest-shard
# testing utils
awscli
backoff # required for phi4mm test
decord # required for video tests
einops # required for MPT, qwen-vl and Mamba
httpx
librosa # required for audio tests
@ -27,10 +28,9 @@ torchvision==0.21.0
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.5.4 # required for pixtral test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.4 # required for model evaluation test
transformers==4.50.3
transformers==4.48.2
# quantization
bitsandbytes>=0.45.3
buildkite-test-collector==0.1.9
@ -38,9 +38,7 @@ buildkite-test-collector==0.1.9
genai_perf==0.0.8
tritonclient==2.51.0
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61; python_version > '3.9'
numpy
numpy < 2.0.0
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
fastsafetensors>=0.1.10

View File

@ -93,6 +93,8 @@ datasets==3.0.2
# lm-eval
decorator==5.1.1
# via librosa
decord==0.6.0
# via -r requirements/test.in
dill==0.3.8
# via
# datasets
@ -217,7 +219,7 @@ libnacl==2.1.0
# via tensorizer
librosa==0.10.2.post1
# via -r requirements/test.in
llvmlite==0.44.0
llvmlite==0.43.0
# via numba
lm-eval==0.4.4
# via -r requirements/test.in
@ -260,10 +262,8 @@ networkx==3.2.1
# via torch
nltk==3.9.1
# via rouge-score
numba==0.61.0
# via
# -r requirements/test.in
# librosa
numba==0.60.0
# via librosa
numexpr==2.10.1
# via lm-eval
numpy==1.26.4
@ -274,6 +274,7 @@ numpy==1.26.4
# contourpy
# cupy-cuda12x
# datasets
# decord
# einx
# encodec
# evaluate
@ -334,10 +335,8 @@ nvidia-nvjitlink-cu12==12.4.127
# torch
nvidia-nvtx-cu12==12.4.127
# via torch
opencv-python-headless==4.11.0.86
# via
# -r requirements/test.in
# mistral-common
opencv-python-headless==4.10.0.84
# via mistral-common
packaging==24.1
# via
# accelerate
@ -642,7 +641,7 @@ tqdm==4.66.6
# transformers
tqdm-multiprocess==0.0.11
# via lm-eval
transformers==4.50.3
transformers==4.48.2
# via
# -r requirements/test.in
# genai-perf

View File

@ -17,9 +17,9 @@ ray[data]
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250328-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250328-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250328-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250328-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250328-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250328-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"

View File

@ -201,9 +201,6 @@ class cmake_build_ext(build_ext):
else:
# Default build tool to whatever cmake picks.
build_tool = []
# Make sure we use the nvcc from CUDA_HOME
if _is_cuda():
cmake_args += [f'-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc']
subprocess.check_call(
['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args],
cwd=self.build_temp)
@ -595,8 +592,9 @@ def get_requirements() -> list[str]:
for line in requirements:
if line.startswith("-r "):
resolved_requirements += _read_requirements(line.split()[1])
elif not line.startswith("--") and not line.startswith(
"#") and line.strip() != "":
elif line.startswith("--"):
continue
else:
resolved_requirements.append(line)
return resolved_requirements
@ -642,10 +640,11 @@ if _is_hip():
if _is_cuda():
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
# FA3 requires CUDA 12.3 or later
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.0"):
# FA3 requires CUDA 12.0 or later
ext_modules.append(
CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
# Optional since this doesn't get built (produce an .so file) when
# not targeting a hopper system
ext_modules.append(
@ -684,7 +683,7 @@ setup(
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
"runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
"audio": ["librosa", "soundfile"], # Required for audio processing
"video": [] # Kept for backwards compatibility
"video": ["decord"] # Required for video processing
},
cmdclass=cmdclass,
package_data=package_data,

View File

@ -1,8 +1,15 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
from ..utils import compare_two_settings
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
monkeypatch.setenv('VLLM_USE_V1', '0')
def test_cpu_offload():
compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
["--cpu-offload-gb", "1"])

Some files were not shown because too many files have changed in this diff Show More