Compare commits
184 Commits
woosuk/tes...amd_dev
| SHA1 | Author | Date | |
|---|---|---|---|
| c7021f1270 | |||
| 2072fdc044 | |||
| 6eefda507a | |||
| a0003b56b0 | |||
| 5beacce2ea | |||
| 8669c69afa | |||
| 1651003c35 | |||
| 1cb8c6c5fe | |||
| e05a6754a8 | |||
| 084a9dae80 | |||
| c9461e05a4 | |||
| 4dfdb821c8 | |||
| 58fab50d82 | |||
| db6f28d898 | |||
| 14e2f1231e | |||
| 7c4767f1eb | |||
| 9771e0b432 | |||
| 980de31ca0 | |||
| 1c160841ea | |||
| 4ca13a8667 | |||
| 675aa2ec64 | |||
| 3ae082c373 | |||
| 49c00fe304 | |||
| 141d3b9fc5 | |||
| abf3db40ef | |||
| 8e4ca4d14e | |||
| 1a0f4defb7 | |||
| 843af7f7fc | |||
| 1f633b8632 | |||
| a4c29e6e82 | |||
| 8f18feb191 | |||
| ed540d6d4c | |||
| f6027b2855 | |||
| ab3e80042e | |||
| ceacedc1f9 | |||
| bfa59be8f1 | |||
| 265ecb05fb | |||
| 09a7e6f617 | |||
| 6c2eef5a5d | |||
| 19748806f0 | |||
| 4a8a567e16 | |||
| 344a0017c0 | |||
| becb7de40b | |||
| 250fb1b8ea | |||
| 647214f3d5 | |||
| ddeec11ba9 | |||
| 86ed77022d | |||
| aa1356ec53 | |||
| ecc3c0940a | |||
| ba09652de2 | |||
| bd66b8529b | |||
| 6c728f7771 | |||
| 80e9452984 | |||
| c3a2c6ac5f | |||
| 72f431e709 | |||
| be4445072c | |||
| f381cf2302 | |||
| 5ff5d94e77 | |||
| f95da13c3d | |||
| aef368aa08 | |||
| 5f6cbf60d6 | |||
| 3ada34f9cb | |||
| 0eb8f2b880 | |||
| 163965d183 | |||
| a03cf9bc70 | |||
| 352c0c8a28 | |||
| bfe0b4bd2a | |||
| 58fbbcb2f5 | |||
| 87778d5f00 | |||
| f9e7ad5400 | |||
| 4d0f266113 | |||
| e93ff6c8b9 | |||
| 1c691f4a71 | |||
| 9fce7bee74 | |||
| b63f2143f8 | |||
| f32bf7582e | |||
| 8a81d776ce | |||
| f6fdacd82c | |||
| d31f7844f8 | |||
| 7a6c8c3fa1 | |||
| 221bf72577 | |||
| b3aba04e5a | |||
| 8a297115e2 | |||
| 191eed0bb9 | |||
| fb860670da | |||
| 83e760c57d | |||
| c2bba69065 | |||
| e133d6d218 | |||
| a1946c9f61 | |||
| 9f020f4f31 | |||
| 3b45075206 | |||
| 168e578efc | |||
| 6ac5e06f7c | |||
| 5c2acb270a | |||
| b26b70bec4 | |||
| ab4be40fc5 | |||
| 245e4f2c01 | |||
| 1d165d6d85 | |||
| 83004020fd | |||
| 12e21701e7 | |||
| 30a33b92ee | |||
| 7c572544e4 | |||
| c312320764 | |||
| c981f0ea78 | |||
| 6367bde739 | |||
| f50cc221ea | |||
| acedc74b1a | |||
| d29483b58a | |||
| 950cf9e58e | |||
| 3125d79950 | |||
| e33ee23ee3 | |||
| b10c64c834 | |||
| 0925b28a8e | |||
| 99722d5f0e | |||
| 4c91a28e30 | |||
| b038d9c40c | |||
| 2ba60ec7fe | |||
| bd7157a071 | |||
| be429d0cfd | |||
| c253745eb8 | |||
| daec4d2624 | |||
| 6c9fdbf725 | |||
| 483ea64611 | |||
| e20eba753b | |||
| bbc1b29665 | |||
| acb1bfa601 | |||
| 75c7ad9918 | |||
| 5550ff9c25 | |||
| 3aeb19a39e | |||
| 8c017b3490 | |||
| 9c2c2287a0 | |||
| fec2b341ad | |||
| 87bc0c492f | |||
| fe3b9372ad | |||
| bde9e2272a | |||
| 08405609cc | |||
| ab81379ea6 | |||
| 4ffd6e8942 | |||
| 965c5f4914 | |||
| 4d055ef465 | |||
| 17c540a993 | |||
| 4d4d6bad19 | |||
| 11ae016bd7 | |||
| 41d3071918 | |||
| fb5e10d3fb | |||
| b2f78cbad4 | |||
| 23583ee28c | |||
| 01c977e96d | |||
| b3dda72c23 | |||
| fb0571b077 | |||
| 2ed8b6b3d0 | |||
| 013abde6ef | |||
| a5464dcf92 | |||
| ac3ed5a815 | |||
| e6ba2000ae | |||
| aa255ff55a | |||
| 7bb736d00e | |||
| 9f4e30904b | |||
| 5afd3276df | |||
| 43721bc67f | |||
| 02d709a6f1 | |||
| 4a510ab487 | |||
| 314fa8abbf | |||
| 334535b6fb | |||
| dcbb3f1871 | |||
| 00417f4e44 | |||
| ed344f4116 | |||
| e51928793e | |||
| d2740fafbf | |||
| 17838e50ef | |||
| 44c8555621 | |||
| f7d318de2b | |||
| 76f0d05bc6 | |||
| 7d8975de84 | |||
| 785d8b6410 | |||
| f6cdc9a02f | |||
| 509cdc0370 | |||
| 9b6504c307 | |||
| e19b16dde6 | |||
| 582f2c6be7 | |||
| f8a0acbdbe | |||
| 1317034379 | |||
| 0ecc553ee6 | |||
| f96bc3649c |
@@ -1,11 +1,12 @@
 # For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
 model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
 backend: "vllm-vlm"
 tasks:
 - name: "chartqa"
   metrics:
   - name: "relaxed_accuracy,none"
-    value: 0.90
+    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
+    value: 0.80
 limit: 100
 num_fewshot: 0
@@ -1,7 +1,6 @@
 # For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5
+# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
 model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-backend: "vllm-vlm"
 tasks:
 - name: "mmlu_pro"
   metrics:
@@ -1,5 +1,5 @@
 steps:
-  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+  # aarch64 + CUDA builds
   - label: "Build arm64 wheel - CUDA 12.9"
     depends_on: ~
     id: build-wheel-arm64-cuda-12-9

@@ -15,6 +15,21 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
+  # aarch64 build
+  - label: "Build arm64 CPU wheel"
+    depends_on: ~
+    id: build-wheel-arm64-cpu
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile.cpu ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # x86 + CUDA builds
   - label: "Build wheel - CUDA 12.8"
     depends_on: ~
     id: build-wheel-cuda-12-8

@@ -28,20 +43,6 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
-  - label: "Build wheel - CUDA 12.6"
-    depends_on: ~
-    id: build-wheel-cuda-12-6
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  # x86 + CUDA builds
   - label: "Build wheel - CUDA 12.9"
     depends_on: ~
     id: build-wheel-cuda-12-9

@@ -55,6 +56,20 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
+  - label: "Build wheel - CUDA 13.0"
+    depends_on: ~
+    id: build-wheel-cuda-13-0
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # Build release images (12.9)
   - label: "Build release image (x86)"
     depends_on: ~
     id: build-release-image-x86

@@ -62,13 +77,12 @@ steps:
       queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-     - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+     - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
      # re-tag to default image tag and push, just in case arm64 build fails
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
-  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
   - label: "Build release image (arm64)"
     depends_on: ~
     id: build-release-image-arm64

@@ -142,6 +156,22 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
+  - block: "Build arm64 CPU release image"
+    key: block-arm64-cpu-release-image-build
+    depends_on: ~
+
+  - label: "Build and publish arm64 CPU release image"
+    depends_on: block-arm64-cpu-release-image-build
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+    env:
+      DOCKER_BUILDKIT: "1"
+
   - label: "Build and publish nightly multi-arch image to DockerHub"
     depends_on:
       - create-multi-arch-manifest
@@ -70,7 +70,7 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -x -s -v \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
 
   # Note: disable it until supports V1
   # Run AWQ test
@@ -58,33 +58,25 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 
-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
+if [[ $normal_wheel == *"cu129"* ]]; then
     # only upload index.html for cu129 wheels (default wheels) as it
     # is available on both x86 and arm64
     aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
     aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+else
+    echo "Skipping index files for non-cu129 wheels"
 fi
 
 # generate index for nightly
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 
-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
+if [[ $normal_wheel == *"cu129"* ]]; then
     # only upload index.html for cu129 wheels (default wheels) as it
     # is available on both x86 and arm64
     aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+else
+    echo "Skipping index files for non-cu129 wheels"
 fi
 
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
@@ -454,8 +454,8 @@ steps:
   - pytest -v -s compile/test_fusion_attn.py
   - pytest -v -s compile/test_functionalization.py
   - pytest -v -s compile/test_silu_mul_quant_fusion.py
-  - pytest -v -s compile/test_sequence_parallelism.py
-  - pytest -v -s compile/test_async_tp.py
+  # - pytest -v -s compile/test_sequence_parallelism.py
+  # - pytest -v -s compile/test_async_tp.py
   - pytest -v -s compile/test_fusion_all_reduce.py
   - pytest -v -s compile/test_decorator.py
   - pytest -v -s compile/test_noop_elimination.py

@@ -474,8 +474,8 @@ steps:
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s compile/piecewise/
 
-- label: PyTorch Fullgraph Test # 20min
-  timeout_in_minutes: 30
+- label: PyTorch Fullgraph Test # 22min
+  timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking

@@ -485,6 +485,7 @@ steps:
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py
+  - pytest -v -s compile/test_fusions_e2e.py
 
 - label: Kernels Core Operation Test # 48min
   timeout_in_minutes: 75

@@ -494,6 +495,7 @@ steps:
   source_file_dependencies:
   - csrc/
   - tests/kernels/core
+  - tests/kernels/test_top_k_per_row.py
   commands:
   - pytest -v -s kernels/core kernels/test_top_k_per_row.py
@@ -606,7 +608,7 @@ steps:
   # we can only upgrade after this is resolved
   # TODO(jerryzh168): resolve the above comment
   - uv pip install --system torchao==0.13.0
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
 - label: LM Eval Small Models # 53min
   timeout_in_minutes: 75

@@ -848,6 +850,18 @@ steps:
   - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
   - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
 
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
 - label: Multi-Modal Models Test (Extended) 1
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1

@@ -923,8 +937,8 @@ steps:
   # Whisper needs spawn method to avoid deadlock
   - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
 
-- label: Blackwell Test # 38 min
-  timeout_in_minutes: 60
+- label: Blackwell Test # 21 min
+  timeout_in_minutes: 30
   working_dir: "/vllm-workspace/"
   gpu: b200
   # optional: true

@@ -937,8 +951,6 @@ steps:
   - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/fusion.py
-  - vllm/compilation/fusion_attn.py
   commands:
   - nvidia-smi
   - python3 examples/offline_inference/basic/chat.py
@@ -955,13 +967,32 @@ steps:
   - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+  - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+  - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
   - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
   - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
-  # Fusion
-  - pytest -v -s tests/compile/test_fusion_all_reduce.py
-  - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
   - pytest -v -s tests/kernels/moe/test_flashinfer.py
 
+- label: Blackwell Fusion Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  commands:
+  - nvidia-smi
+  - pytest -v -s tests/compile/test_fusion_attn.py
   - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+  # this runner has 2 GPUs available even though num_gpus=2 is not set
+  - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/test_fusions_e2e.py
 
 - label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60

@@ -1081,6 +1112,7 @@ steps:
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - pytest -v -s distributed/test_sequence_parallel.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
   - pytest -v -s v1/worker/test_worker_memory_snapshot.py
@@ -1128,6 +1160,11 @@ steps:
   - pytest -v -s plugins_tests/test_io_processor_plugins.py
   - pip uninstall prithvi_io_processor_plugin -y
   # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
   # other tests continue here:
   - pytest -v -s plugins_tests/test_scheduler_plugins.py
   - pip install -e ./plugins/vllm_add_dummy_model

@@ -1172,7 +1209,6 @@ steps:
   - pytest -v -s -x lora/test_llama_tp.py
   - pytest -v -s -x lora/test_llm_with_multi_loras.py
 
-
 - label: Weight Loading Multiple GPU Test # 33min
   timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]

@@ -1201,6 +1237,18 @@ steps:
   commands:
   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
 
+- label: NixlConnector PD accuracy tests (Distributed) # 30min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - tests/v1/kv_connector/nixl_integration/
+  commands:
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+  - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+
 ##### multi gpus test #####
 ##### A100 test #####
@@ -1232,12 +1280,16 @@ steps:
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
 ##### H200 test #####
-- label: Distrubted Tests (H200) # optional
+- label: Distributed Tests (H200) # optional
   gpu: h200
   optional: true
   working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
+  - pytest -v -s tests/compile/test_async_tp.py
+  - pytest -v -s tests/compile/test_sequence_parallelism.py
+  - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
 
@@ -172,6 +172,8 @@ steps:
   - tests/v1/engine/test_engine_core_client.py
   - tests/distributed/test_symm_mem_allreduce.py
   commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
   # test with torchrun tp=2 and external_dp=2
   - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   # test with torchrun tp=2 and pp=2

@@ -349,7 +351,8 @@ steps:
   - python3 offline_inference/basic/embed.py
   - python3 offline_inference/basic/score.py
   - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+  # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 
 - label: Platform Tests (CUDA) # 4min
   timeout_in_minutes: 15

@@ -384,7 +387,12 @@ steps:
     --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
     --ignore=lora/test_chatglm3_tp.py \
     --ignore=lora/test_llama_tp.py \
-    --ignore=lora/test_llm_with_multi_loras.py
+    --ignore=lora/test_llm_with_multi_loras.py \
+    --ignore=lora/test_olmoe_tp.py \
+    --ignore=lora/test_deepseekv2_tp.py \
+    --ignore=lora/test_gptoss.py \
+    --ignore=lora/test_qwen3moe_tp.py
 
   parallelism: 4
 
 - label: PyTorch Compilation Unit Tests # 15min
@@ -416,8 +424,8 @@ steps:
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s compile/piecewise/
 
-- label: PyTorch Fullgraph Test # 20min
-  timeout_in_minutes: 30
+- label: PyTorch Fullgraph Test # 22min
+  timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:

@@ -425,6 +433,7 @@ steps:
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py
+  - pytest -v -s compile/test_fusions_e2e.py
 
 - label: Kernels Core Operation Test # 48min
   timeout_in_minutes: 75

@@ -528,8 +537,8 @@ steps:
   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
   # we can only upgrade after this is resolved
   # TODO(jerryzh168): resolve the above comment
-  - uv pip install --system torchao==0.13.0
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
+  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
 - label: LM Eval Small Models # 53min
   timeout_in_minutes: 75

@@ -807,8 +816,8 @@ steps:
   # Whisper needs spawn method to avoid deadlock
   - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
 
-- label: Blackwell Test # 38 min
-  timeout_in_minutes: 60
+- label: Blackwell Test # 21 min
+  timeout_in_minutes: 30
   working_dir: "/vllm-workspace/"
   gpu: b200
   # optional: true

@@ -821,8 +830,6 @@ steps:
   - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/fusion.py
-  - vllm/compilation/fusion_attn.py
   commands:
   - nvidia-smi
   - python3 examples/offline_inference/basic/chat.py
@@ -839,15 +846,32 @@ steps:
   - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
-  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
-  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
-  # Fusion
-  - pytest -v -s tests/compile/test_fusion_all_reduce.py
-  - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
-  - pytest -v -s tests/kernels/moe/test_flashinfer.py
-  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
   - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
   - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+  - pytest -v -s tests/kernels/moe/test_flashinfer.py
+
+- label: Blackwell Fusion Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  commands:
+  - nvidia-smi
+  - pytest -v -s tests/compile/test_fusion_attn.py
+  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+  # this runner has 2 GPUs available even though num_gpus=2 is not set
+  - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/test_fusions_e2e.py
 
 - label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
@@ -954,6 +978,8 @@ steps:
   - tests/v1/shutdown
   - tests/v1/worker/test_worker_memory_snapshot.py
   commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py

@@ -961,6 +987,7 @@ steps:
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - pytest -v -s distributed/test_sequence_parallel.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
   - pytest -v -s v1/worker/test_worker_memory_snapshot.py

@@ -1004,6 +1031,11 @@ steps:
   - pytest -v -s plugins_tests/test_io_processor_plugins.py
   - pip uninstall prithvi_io_processor_plugin -y
   # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
   # other tests continue here:
   - pytest -v -s plugins_tests/test_scheduler_plugins.py
   - pip install -e ./plugins/vllm_add_dummy_model

@@ -1043,6 +1075,7 @@ steps:
   - pytest -v -s -x lora/test_chatglm3_tp.py
   - pytest -v -s -x lora/test_llama_tp.py
   - pytest -v -s -x lora/test_llm_with_multi_loras.py
+  - pytest -v -s -x lora/test_olmoe_tp.py
 
 
 - label: Weight Loading Multiple GPU Test # 33min
@@ -1068,6 +1101,17 @@ steps:
   - tests/weight_loading
   commands:
   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
 
+- label: NixlConnector PD accuracy tests (Distributed) # 30min
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - tests/v1/kv_connector/nixl_integration/
+  commands:
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+  - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+
 ##### multi gpus test #####

@@ -1100,7 +1144,7 @@ steps:
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
 ##### H200 test #####
-- label: Distrubted Tests (H200) # optional
+- label: Distributed Tests (H200) # optional
   gpu: h200
   optional: true
   working_dir: "/vllm-workspace/"

@@ -1108,6 +1152,8 @@ steps:
   commands:
   - pytest -v -s tests/compile/test_async_tp.py
   - pytest -v -s tests/compile/test_sequence_parallelism.py
+  - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
 
.github/CODEOWNERS (vendored), 11 changes

@@ -5,8 +5,8 @@
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/model_executor/layers/fused_moe @mgoin
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
+/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche

@@ -25,7 +25,8 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 
 # vLLM V1
 /vllm/v1/attention @LucasWilkinson
-/vllm/v1/attention/backends/flashinfer.py @mgoin
+/vllm/v1/attention/backends/mla @pavanimajety
+/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
 /vllm/v1/sample @22quinn @houseroad @njhill

@@ -44,7 +45,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
-/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
+/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm

@@ -57,7 +58,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/v1/offloading @ApostaC
 
 # Transformers backend
-/vllm/model_executor/models/transformers.py @hmellor
+/vllm/model_executor/models/transformers @hmellor
 /tests/models/test_transformers.py @hmellor
 
 # Docs
.gitignore (vendored), 3 changes

@@ -94,6 +94,9 @@ ipython_config.py
 # generated files
 **/generated/**
 
+# uv
+uv.lock
+
 # pyenv
 # For a library or package, you might want to ignore these files since the code is
 # intended to run in multiple environments; otherwise, check them in:
@@ -4,7 +4,6 @@ MD013: false
 MD024:
   siblings_only: true
 MD033: false
-MD042: false
 MD045: false
 MD046: false
 MD051: false
@@ -38,7 +38,7 @@ repos:
     rev: 0.9.1
     hooks:
       - id: pip-compile
-        args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
+        args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
         files: ^requirements/test\.(in|txt)$
   - repo: local
     hooks:
@@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent. The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
 
 #
 # Try to find python package with an executable that exactly matches

@@ -883,6 +883,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
   "csrc/moe/moe_align_sum_kernels.cu"
+  "csrc/moe/moe_lora_align_sum_kernels.cu"
   "csrc/moe/topk_softmax_kernels.cu")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -31,6 +31,7 @@ import time
 import uuid
 import warnings
 from collections.abc import AsyncGenerator
+from contextlib import nullcontext
 from dataclasses import dataclass
 
 import datasets

@@ -501,15 +502,9 @@ async def benchmark(
 
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
-    # This can be used once the minimum Python version is 3.10 or higher,
-    # and it will simplify the code in limited_request_func.
-    # semaphore = (asyncio.Semaphore(max_concurrency)
-    #              if max_concurrency else contextlib.nullcontext())
-    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
 
     async def limited_request_func(request_func_input, pbar):
-        if semaphore is None:
-            return await request_func(request_func_input=request_func_input, pbar=pbar)
         async with semaphore:
             return await request_func(request_func_input=request_func_input, pbar=pbar)
 
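The refactor above drops the `None` special case by always binding `semaphore` to an async context manager. Below is a minimal sketch of the same pattern, not vLLM's code; the `run_all`/`send_one` names are illustrative. It relies on `contextlib.nullcontext` supporting `async with` on Python 3.10+, which is exactly what the removed comment was waiting for.

```python
import asyncio
from contextlib import nullcontext


async def run_all(requests, send_one, max_concurrency=None):
    # Semaphore caps concurrency when requested; nullcontext() is a no-op
    # async context manager, so one "async with" path covers both cases.
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

    async def limited(req):
        async with semaphore:
            return await send_one(req)

    return await asyncio.gather(*(limited(r) for r in requests))
```

With `max_concurrency=None` every request starts immediately; with a value set, at most that many awaits are in flight at once.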
@@ -10,7 +10,8 @@ import torch
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
 
 def with_triton_mode(fn):
@@ -10,7 +10,8 @@ import vllm.model_executor.layers.activation # noqa F401
 from vllm.model_executor.custom_op import CustomOp
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
 batch_size_range = [1, 16, 32, 64, 128]
 seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
@@ -7,7 +7,8 @@ import torch
 
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
 
 @torch.inference_mode()
@@ -9,9 +9,9 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random,
 )
 
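The import changes repeated across these benchmark files split the utilities between two modules: `FlexibleArgumentParser` stays in `vllm.utils`, while torch-specific helpers such as `STR_DTYPE_TO_TORCH_DTYPE` and `create_kv_caches_with_random` now come from `vllm.utils.torch_utils`. A hedged sketch of how a caller adapts to the new layout; the `--kv-cache-dtype` flag and the dictionary lookup are illustrative assumptions, not taken from this diff:

```python
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

parser = FlexibleArgumentParser(description="toy benchmark arguments")
parser.add_argument("--kv-cache-dtype", type=str, default="half")
args = parser.parse_args()

# STR_DTYPE_TO_TORCH_DTYPE maps dtype strings (e.g. "half", "bfloat16")
# to torch dtypes; unknown keys fall back to None in this sketch.
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE.get(args.kv_cache_dtype)
print(torch_dtype)
```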
@@ -1,155 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import itertools
-
-import torch
-
-from vllm import _custom_ops as vllm_ops
-from vllm.triton_utils import triton
-
-
-def polynorm_naive(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float = 1e-6,
-):
-    orig_shape = x.shape
-    x = x.view(-1, x.shape[-1])
-
-    def norm(x, eps: float):
-        return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
-
-    x = x.float()
-    return (
-        (
-            weight[0] * norm(x**3, eps)
-            + weight[1] * norm(x**2, eps)
-            + weight[2] * norm(x, eps)
-            + bias
-        )
-        .to(weight.dtype)
-        .view(orig_shape)
-    )
-
-
-def polynorm_vllm(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float = 1e-6,
-):
-    orig_shape = x.shape
-    x = x.view(-1, x.shape[-1])
-
-    out = torch.empty_like(x)
-    vllm_ops.poly_norm(out, x, weight, bias, eps)
-    output = out
-
-    output = output.view(orig_shape)
-    return output
-
-
-def calculate_diff(batch_size, seq_len, hidden_dim):
-    dtype = torch.bfloat16
-    x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
-    weight = torch.ones(3, dtype=dtype, device="cuda")
-    bias = torch.ones(1, dtype=dtype, device="cuda")
-
-    output_naive = polynorm_naive(x, weight, bias)
-    output_vllm = polynorm_vllm(x, weight, bias)
-
-    if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
-        print("✅ All implementations match")
-    else:
-        print("❌ Implementations differ")
-
-
-batch_size_range = [2**i for i in range(0, 7, 2)]
-seq_length_range = [2**i for i in range(6, 11, 1)]
-dim_range = [2048, 4096]
-configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
-
-
-def get_benchmark():
-    @triton.testing.perf_report(
-        triton.testing.Benchmark(
-            x_names=["dim", "batch_size", "seq_len"],
-            x_vals=[list(_) for _ in configs],
-            line_arg="provider",
-            line_vals=["naive", "vllm"],
-            line_names=["Naive", "vLLM"],
-            styles=[("blue", "-"), ("red", "-")],
-            ylabel="us",
-            plot_name="polynorm-perf",
-            args={},
-        )
-    )
-    def benchmark(dim, batch_size, seq_len, provider):
-        dtype = torch.bfloat16
-        hidden_dim = dim * 4
-
-        x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
-        weight = torch.ones(3, dtype=dtype, device="cuda")
-        bias = torch.ones(1, dtype=dtype, device="cuda")
-
-        quantiles = [0.5, 0.2, 0.8]
-
-        if provider == "naive":
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_naive(x, weight, bias),
-                quantiles=quantiles,
-            )
-        else:
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_vllm(x, weight, bias),
-                quantiles=quantiles,
-            )
-
-        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
-
-    return benchmark
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        default=4,
-        help="Batch size",
-    )
-    parser.add_argument(
-        "--seq-len",
-        type=int,
-        default=128,
-        help="Sequence length",
-    )
-    parser.add_argument(
-        "--hidden-dim",
-        type=int,
-        default=8192,
-        help="Intermediate size of MLP",
-    )
-    parser.add_argument(
-        "--save-path",
-        type=str,
-        default="./configs/polnorm/",
-        help="Path to save polnorm benchmark results",
-    )
-
-    args = parser.parse_args()
-
-    # Run correctness test
-    calculate_diff(
-        batch_size=args.batch_size,
-        seq_len=args.seq_len,
-        hidden_dim=args.hidden_dim,
-    )
-
-    benchmark = get_benchmark()
-    # Run performance benchmark
-    benchmark.run(print_data=True, save_path=args.save_path)
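For reference, the `polynorm_naive` function in the removed benchmark above evaluates polynomial normalization as

$$
\mathrm{PolyNorm}(x) = w_0\,\frac{x^3}{\sqrt{\overline{x^6}+\varepsilon}} + w_1\,\frac{x^2}{\sqrt{\overline{x^4}+\varepsilon}} + w_2\,\frac{x}{\sqrt{\overline{x^2}+\varepsilon}} + b,
$$

where the bar denotes the mean over the hidden dimension. This annotation only restates the removed code and is not part of the diff itself.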
@@ -7,7 +7,8 @@ import torch

 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


 @torch.inference_mode()
@@ -9,9 +9,9 @@ from tabulate import tabulate
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random,
 )

@@ -12,9 +12,9 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
 )
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random_flash,
 )

@@ -1251,7 +1251,7 @@ async def main() -> None:
         default=None,
         help="The model name used in the API. "
         "If not specified, the model name will be the "
-        "same as the ``--model`` argument. ",
+        "same as the `--model` argument. ",
     )

     parser.add_argument(
@@ -188,16 +188,47 @@ else()
   message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
 endif()

-#
-# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms)
-# Flag to enable ACL kernels for AARCH64 platforms
-if (VLLM_BUILD_ACL STREQUAL "ON")
-  set(USE_ACL ON)
-else()
-  set(USE_ACL OFF)
-endif()
-
+# Build oneDNN for GEMM kernels (only for x86-AVX512 /ARM platforms)
 if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+  # Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
+  # TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
+  if(ASIMD_FOUND)
+    if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
+      message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
+    else()
+      message(STATUS "Downloading Arm Compute Library (ACL) from GitHub")
+      FetchContent_Populate(arm_compute
+        SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
+        SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
+        GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
+        GIT_TAG v52.2.0
+        GIT_SHALLOW TRUE
+        GIT_PROGRESS TRUE
+      )
+      set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
+    endif()
+
+    # Build ACL with scons
+    include(ProcessorCount)
+    ProcessorCount(_NPROC)
+    execute_process(
+      COMMAND scons -j${_NPROC}
+              Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
+              arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
+              multi_isa=1 openmp=1 cppthreads=0
+      WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
+      RESULT_VARIABLE _acl_rc
+    )
+    if(NOT _acl_rc EQUAL 0)
+      message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).")
+    endif()
+
+    set(ONEDNN_AARCH64_USE_ACL "ON")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
+    add_compile_definitions(VLLM_USE_ACL)
+  endif()
+
   set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")

   if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
@@ -217,16 +248,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
   )
 endif()

-if(USE_ACL)
-  find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
-  if(NOT ARM_COMPUTE_LIBRARY)
-    message(FATAL_ERROR "Could not find ARM Compute Library: please set ACL_ROOT_DIR")
-  endif()
-  set(ONEDNN_AARCH64_USE_ACL "ON")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
-  add_compile_definitions(VLLM_USE_ACL)
-endif()
-
 set(ONEDNN_LIBRARY_TYPE "STATIC")
 set(ONEDNN_BUILD_DOC "OFF")
 set(ONEDNN_BUILD_EXAMPLES "OFF")
@@ -19,7 +19,7 @@ else()
   FetchContent_Declare(
     flashmla
     GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
-    GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
+    GIT_TAG 28417e516fcbf6257a422ba117ef5b6f44da5682
     GIT_PROGRESS TRUE
     CONFIGURE_COMMAND ""
     BUILD_COMMAND ""
@@ -66,6 +66,7 @@ if(FLASH_MLA_ARCHS)
     ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
     ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
     ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
+    ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_metadata.cu
   )

   set(FlashMLA_INCLUDES
@@ -38,7 +38,7 @@ else()
   FetchContent_Declare(
     vllm-flash-attn
     GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-    GIT_TAG 8f468e7da54a8e2f98abfa7c38636aac91c0cba1
+    GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
     GIT_PROGRESS TRUE
     # Don't share the vllm-flash-attn build between build types
     BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@@ -125,32 +125,37 @@ public:
   }

   static void set_split_kv (KernelArguments& args) {
-    // printf("set_split_kv start");
     if (args.split_kv >= 1) return;
     auto [H, K, D, B] = args.problem_shape;
-    // std::cout << H << " " << K << " " << D << " " << B << "\n";
     int sm_count = args.hw_info.sm_count;
-    // printf(" sm_count = %d\n", sm_count);
-    int max_splits = ceil_div(K, 128);
-    max_splits = min(16, max_splits);
-
-    // TODO: This avoids a hang when the batch size larger than 1 and
-    // there is more than 1 kv_splits.
-    // Discuss with NVIDIA how this can be fixed.
-    if (B > 1) {
-      max_splits = min(1, max_splits);
+    float seq_length_k = static_cast<float>(K) / 1024.0f;
+    int max_splits = 1;
+
+    if (B <= 4 && seq_length_k >= 16) {
+      max_splits = 16;
     }
-    // printf(" max_splits = %d\n", max_splits);
+    else if (B <= 8 && seq_length_k >= 4) {
+      max_splits = 8;
+    }
+    else if ((B <= 16 && seq_length_k >= 8) ||
+             (B == 48 && seq_length_k >= 32)) {
+      max_splits = 4;
+    }
+    else if ((B <= 32 && seq_length_k >= 16) ||
+             (B == 96 && seq_length_k >= 16)) {
+      max_splits = 2;
+    }
+    else {
+      max_splits = 1;
+    }

+    // Wave-aware scheduling: ensure integer number of waves in K dimension
     int sms_per_batch = max(1, sm_count / B);
-    // printf(" sms_per_batch = %d\n", sms_per_batch);
     int split_heur = min(max_splits, sms_per_batch);
     int waves = ceil_div(B * split_heur, sm_count);
     int k_waves = ceil_div(max_splits, split_heur);
     int split_wave_aware = ceil_div(max_splits, k_waves);
     args.split_kv = split_wave_aware;
-    // printf(" args.split_kv = %d\n", args.split_kv);
-
   }

   /// Determines whether the GEMM can execute the given problem.
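The revised `set_split_kv` heuristic above picks a candidate split count from the batch size and KV length, then clamps it so the splits fill an integer number of waves. A minimal Python restatement of that logic, purely for illustration (the function name and signature are ours, not part of the kernel):

```python
import math

def choose_split_kv(B: int, K: int, sm_count: int) -> int:
    """Sketch of the new split-KV heuristic (illustrative only)."""
    seq_length_k = K / 1024.0
    if B <= 4 and seq_length_k >= 16:
        max_splits = 16
    elif B <= 8 and seq_length_k >= 4:
        max_splits = 8
    elif (B <= 16 and seq_length_k >= 8) or (B == 48 and seq_length_k >= 32):
        max_splits = 4
    elif (B <= 32 and seq_length_k >= 16) or (B == 96 and seq_length_k >= 16):
        max_splits = 2
    else:
        max_splits = 1

    # Wave-aware scheduling: keep an integer number of waves in the K dimension.
    sms_per_batch = max(1, sm_count // B)
    split_heur = min(max_splits, sms_per_batch)
    k_waves = math.ceil(max_splits / split_heur)
    return math.ceil(max_splits / k_waves)
```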
@@ -5,11 +5,11 @@

 namespace vllm {

-// vllm_kernel_override_batch_invariant(); returns true
-// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
-inline bool vllm_kernel_override_batch_invariant() {
+// vllm_is_batch_invariant(); returns true
+// if env VLLM_BATCH_INVARIANT=1
+inline bool vllm_is_batch_invariant() {
   static bool cached = []() {
-    std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
+    std::string env_key = "VLLM_BATCH_INVARIANT";
     const char* val = std::getenv(env_key.c_str());
     return (val && std::atoi(val) != 0) ? 1 : 0;
   }();
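Since the helper above caches the environment variable on first use, the switch has to be set before the CUDA ops are first called. A tiny illustrative sketch (how and where the variable is set is up to the deployment, this is only an example):

```python
import os

# Request batch-invariant kernel launches before any vLLM CUDA op runs.
os.environ["VLLM_BATCH_INVARIANT"] = "1"
```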
@@ -148,211 +148,6 @@ fused_add_rms_norm_kernel(
   }
 }

-/* Function specialization in the case of FP16/BF16 tensors.
-   Additional optimizations we can make in this case are
-   packed and vectorized operations, which help with the
-   memory latency bottleneck.
-
-   _f16VecPN struct extends _f16Vec to add operations specifically required for
-   polynomial normalization (poly norm).
-   The original _f16Vec does not include the sum-of-powers computation or
-   in-place polynomial normalization logic. */
-template <typename scalar_t, int width>
-struct alignas(16) _f16VecPN : _f16Vec<scalar_t, width> {
-  using Base = _f16Vec<scalar_t, width>;
-  using Converter = typename Base::Converter;
-  using T1 = typename Base::T1;
-  using T2 = typename Base::T2;
-  using Base::data;
-
-  __device__ auto sum_pows() const {
-    float s2 = 0.0f, s4 = 0.0f, s6 = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < width; i += 2) {
-      float2 z = Converter::convert(T2{data[i], data[i + 1]});
-      float x2 = z.x * z.x;
-      float x4 = x2 * x2;
-      float x6 = x4 * x2;
-
-      float y2 = z.y * z.y;
-      float y4 = y2 * y2;
-      float y6 = y4 * y2;
-
-      s2 += x2 + y2;
-      s4 += x4 + y4;
-      s6 += x6 + y6;
-    }
-    return std::make_tuple(s2, s4, s6);
-  }
-
-  __device__ void poly_norm_inplace(const float w2_inv_std,
-                                    const float w1_inv_std2,
-                                    const float w0_inv_std3, const float bias) {
-#pragma unroll
-    for (int i = 0; i < width; i += 2) {
-      float2 z = Converter::convert(T2{data[i], data[i + 1]});
-
-      float x2 = z.x * z.x;
-      float x3 = x2 * z.x;
-      z.x = w2_inv_std * z.x + w1_inv_std2 * x2 + w0_inv_std3 * x3 + bias;
-
-      float y2 = z.y * z.y;
-      float y3 = y2 * z.y;
-      z.y = w2_inv_std * z.y + w1_inv_std2 * y2 + w0_inv_std3 * y3 + bias;
-
-      auto out = Converter::convert(z);
-      data[i] = out.x;
-      data[i + 1] = out.y;
-    }
-  }
-};
-
-template <typename scalar_t, int width>
-__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
-poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
-                 const scalar_t* __restrict__ input,   // [..., hidden_size]
-                 const scalar_t* __restrict__ weight,  // [3]
-                 const scalar_t* __restrict__ bias,    // [1]
-                 const float epsilon, const int hidden_size) {
-  // Sanity checks on our vector struct and type-punned pointer arithmetic
-  static_assert(std::is_pod_v<_f16VecPN<scalar_t, width>>);
-  static_assert(sizeof(_f16VecPN<scalar_t, width>) == sizeof(scalar_t) * width);
-
-  /* These and the argument pointers are all declared `restrict` as they are
-     not aliased in practice. Argument pointers should not be dereferenced
-     in this kernel as that would be undefined behavior */
-  auto* __restrict__ input_v =
-      reinterpret_cast<const _f16VecPN<scalar_t, width>*>(input);
-  const int vec_hidden_size = hidden_size / width;
-  float variance = 0.0f;
-  float variance2 = 0.0f;
-  float variance3 = 0.0f;
-
-  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
-    int id = blockIdx.x * vec_hidden_size + idx;
-    _f16VecPN<scalar_t, width> temp = input_v[id];
-    auto [x2, x4, x6] = temp.sum_pows();
-
-    variance += x2;
-    variance2 += x4;
-    variance3 += x6;
-  }
-
-  float3 thread_variances = make_float3(variance, variance2, variance3);
-
-  struct SumOp {
-    __device__ float3 operator()(const float3& a, const float3& b) const {
-      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-    }
-  };
-
-  using BlockReduce = cub::BlockReduce<float3, 1024>;
-  __shared__ typename BlockReduce::TempStorage reduceStore;
-  float3 block_variances =
-      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
-
-  variance = block_variances.x;
-  variance2 = block_variances.y;
-  variance3 = block_variances.z;
-
-  __shared__ float s_w2_inv_std;
-  __shared__ float s_w1_inv_std2;
-  __shared__ float s_w0_inv_std3;
-  __shared__ float s_bias;
-
-  if (threadIdx.x == 0) {
-    float w0 = (float)weight[0];
-    float w1 = (float)weight[1];
-    float w2 = (float)weight[2];
-    s_bias = (float)bias[0];
-
-    s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
-    s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
-    s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
-  }
-  __syncthreads();
-
-  auto* __restrict__ out_v = reinterpret_cast<_f16VecPN<scalar_t, width>*>(out);
-
-  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
-    int id = blockIdx.x * vec_hidden_size + idx;
-    _f16VecPN<scalar_t, width> temp = input_v[id];
-    temp.poly_norm_inplace(s_w2_inv_std, s_w1_inv_std2, s_w0_inv_std3, s_bias);
-    out_v[id] = temp;
-  }
-}
-
-/* Generic poly_norm_kernel
-   The width field is not used here but necessary for other specializations.
-*/
-template <typename scalar_t, int width>
-__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
-poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
-                 const scalar_t* __restrict__ input,   // [..., hidden_size]
-                 const scalar_t* __restrict__ weight,  // [3]
-                 const scalar_t* __restrict__ bias,    // [1]
-                 const float epsilon, const int hidden_size) {
-  float variance = 0.0f;
-  float variance2 = 0.0f;
-  float variance3 = 0.0f;
-
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    float x = (float)input[blockIdx.x * hidden_size + idx];
-    float x2 = x * x;
-    float x4 = x2 * x2;
-    float x6 = x4 * x2;
-
-    variance += x2;
-    variance2 += x4;
-    variance3 += x6;
-  }
-
-  float3 thread_variances = make_float3(variance, variance2, variance3);
-
-  struct SumOp {
-    __device__ float3 operator()(const float3& a, const float3& b) const {
-      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-    }
-  };
-
-  using BlockReduce = cub::BlockReduce<float3, 1024>;
-  __shared__ typename BlockReduce::TempStorage reduceStore;
-  float3 block_variances =
-      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
-
-  variance = block_variances.x;
-  variance2 = block_variances.y;
-  variance3 = block_variances.z;
-
-  __shared__ float s_w2_inv_std;
-  __shared__ float s_w1_inv_std2;
-  __shared__ float s_w0_inv_std3;
-  __shared__ float s_bias;
-
-  if (threadIdx.x == 0) {
-    float w0 = (float)weight[0];
-    float w1 = (float)weight[1];
-    float w2 = (float)weight[2];
-    s_bias = (float)bias[0];
-
-    s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
-    s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
-    s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
-  }
-  __syncthreads();
-
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    float x = (float)input[blockIdx.x * hidden_size + idx];
-    float x2 = x * x;
-    float x3 = x2 * x;
-
-    out[blockIdx.x * hidden_size + idx] =
-        (scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 +
-                   s_bias);
-  }
-}
-
 } // namespace vllm

 void rms_norm(torch::Tensor& out,    // [..., hidden_size]
@@ -364,18 +159,26 @@ void rms_norm(torch::Tensor& out,    // [..., hidden_size]
   TORCH_CHECK(weight.is_contiguous());

   int hidden_size = input.size(-1);
-  int num_tokens = input.numel() / hidden_size;
-  int64_t input_stride = input.stride(-2);
+  // We cannot just use `input.stride(-2)` if the tensor is not row-major.
+  // Instead, we use a 2d view to get the second-innermost stride.
+  // That way the dimensions (except the last one) can be arbitrarily permuted.
+  torch::Tensor input_view = input.view({-1, hidden_size});
+
+  int num_tokens = input_view.numel() / hidden_size;
+  int64_t input_stride = input_view.stride(-2);

   dim3 grid(num_tokens);
   dim3 block(std::min(hidden_size, 1024));
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
-    vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
-        out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), input_stride,
-        weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
-  });
+  VLLM_DISPATCH_FLOATING_TYPES(
+      input_view.scalar_type(), "rms_norm_kernel", [&] {
+        vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
+            out.data_ptr<scalar_t>(), input_view.data_ptr<scalar_t>(),
+            input_stride, weight.data_ptr<scalar_t>(), epsilon, num_tokens,
+            hidden_size);
+      });
 }

 #define LAUNCH_FUSED_ADD_RMS_NORM(width) \
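The comment in the hunk above is the key point: the kernel needs the number of token rows and the stride between consecutive rows, and a flattened 2-D view exposes both regardless of how many leading dimensions the input has. A small PyTorch illustration of the two quantities the host code now derives (shapes chosen arbitrarily):

```python
import torch

hidden = 16
x = torch.randn(2, 3, hidden)        # [batch, seq, hidden]
x2d = x.view(-1, hidden)             # [num_tokens, hidden], no copy

num_tokens = x2d.numel() // hidden   # 6, what the grid is sized on
token_stride = x2d.stride(-2)        # elements between consecutive token rows
print(num_tokens, token_stride)
```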
@@ -392,6 +195,8 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
                         torch::Tensor& residual,  // [..., hidden_size]
                         torch::Tensor& weight,    // [hidden_size]
                         double epsilon) {
+  TORCH_CHECK(weight.scalar_type() == input.scalar_type());
+  TORCH_CHECK(input.scalar_type() == residual.scalar_type());
   TORCH_CHECK(residual.is_contiguous());
   TORCH_CHECK(weight.is_contiguous());
   int hidden_size = input.size(-1);
@@ -426,7 +231,7 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
       wt_ptr % req_alignment_bytes == 0;
   bool offsets_are_multiple_of_vector_width =
       hidden_size % vector_width == 0 && input_stride % vector_width == 0;
-  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
+  bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
   if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
       !batch_invariant_launch) {
     LAUNCH_FUSED_ADD_RMS_NORM(8);
@@ -434,50 +239,3 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
     LAUNCH_FUSED_ADD_RMS_NORM(0);
   }
 }
-
-#define LAUNCH_FUSED_POLY_NORM(width)                                         \
-  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \
-    vllm::poly_norm_kernel<scalar_t, width><<<grid, block, 0, stream>>>(      \
-        out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),                 \
-        weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), epsilon,      \
-        hidden_size);                                                         \
-  });
-
-void poly_norm(torch::Tensor& out,     // [..., hidden_size]
-               torch::Tensor& input,   // [..., hidden_size]
-               torch::Tensor& weight,  // [3]
-               torch::Tensor& bias,    // [1]
-               double epsilon) {
-  TORCH_CHECK(out.is_contiguous());
-  TORCH_CHECK(input.is_contiguous());
-  TORCH_CHECK(out.data_ptr() != input.data_ptr());
-
-  int hidden_size = input.size(-1);
-  int num_tokens = input.numel() / hidden_size;
-
-  dim3 grid(num_tokens);
-  /* This kernel is memory-latency bound in many scenarios.
-     When num_tokens is large, a smaller block size allows
-     for increased block occupancy on CUs and better latency
-     hiding on global mem ops. */
-  const int max_block_size = (num_tokens < 256) ? 1024 : 256;
-  dim3 block(std::min(hidden_size, max_block_size));
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  /*If the tensor types are FP16/BF16, try to use the optimized kernel
-    with packed + vectorized ops.
-    Max optimization is achieved with a width-8 vector of FP16/BF16s
-    since we can load at most 128 bits at once in a global memory op.
-    However, this requires each tensor's data to be aligned to 16
-    bytes.
-  */
-  auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
-  auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
-  bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
-  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
-  if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
-    LAUNCH_FUSED_POLY_NORM(8);
-  } else {
-    LAUNCH_FUSED_POLY_NORM(0);
-  }
-}
@@ -229,6 +229,8 @@ void fused_add_rms_norm_static_fp8_quant(
     double epsilon) {
   TORCH_CHECK(out.is_contiguous());
   TORCH_CHECK(residual.is_contiguous());
+  TORCH_CHECK(residual.scalar_type() == input.scalar_type());
+  TORCH_CHECK(weight.scalar_type() == input.scalar_type());
   int hidden_size = input.size(-1);
   int input_stride = input.stride(-2);
   int num_tokens = input.numel() / hidden_size;
@@ -254,7 +256,7 @@ void fused_add_rms_norm_static_fp8_quant(
   auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
   bool ptrs_are_aligned =
       inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
-  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
+  bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
   if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
       !batch_invariant_launch) {
     LAUNCH_FUSED_ADD_RMS_NORM(8);
@@ -8,12 +8,77 @@

 #include "../cuda_compat.h"
 #include "../dispatch_utils.h"
+#include "core/math.hpp"

 #define CEILDIV(x, y) (((x) + (y) - 1) / (y))

 namespace vllm {
 namespace moe {

+namespace batched_moe_align_block_size {
+
+// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
+static constexpr int32_t num_threads = 1024;
+static constexpr int32_t num_blocks = 1;
+__global__ void batched_moe_align_block_size_kernel(
+    int32_t const num_batches, int32_t const max_tokens_per_batch,
+    int32_t const block_size, int32_t const* __restrict__ batch_num_tokens,
+    int32_t* __restrict__ sorted_ids, int32_t* __restrict__ block_ids,
+    int32_t* __restrict__ num_tokens_post_pad) {
+  // TODO(varun): This is a naive implementation. Could be optimized.
+
+  size_t const batch_id = threadIdx.x;
+  size_t const stride = blockDim.x * gridDim.x;
+  int32_t const num_blocks_per_batch =
+      CEILDIV(max_tokens_per_batch, block_size);
+  int32_t const sorted_ids_size =
+      num_blocks_per_batch * num_batches * block_size;
+  int32_t const block_ids_size = sorted_ids_size / block_size;
+  int32_t const SENTINEL =
+      num_batches * max_tokens_per_batch;  // To denote invalid entries.
+  // Intialize sorted_ids
+  for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
+    sorted_ids[i] = SENTINEL;
+  }
+  // Intialize expert_ids with -1
+  for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
+    block_ids[i] = -1;
+  }
+
+  int32_t b_num_tokens = 0;
+  if (batch_id < num_batches) {
+    b_num_tokens = batch_num_tokens[batch_id];
+  }
+  int32_t const ceil_b_num_tokens =
+      CEILDIV(b_num_tokens, block_size) * block_size;
+
+  // Compute prefix sum over token counts per expert
+  using BlockScan = cub::BlockScan<int32_t, 1024>;
+  __shared__ typename BlockScan::TempStorage temp_storage;
+  int cumsum_val;
+  BlockScan(temp_storage).ExclusiveSum(ceil_b_num_tokens, cumsum_val);
+  __syncthreads();
+
+  bool const is_last_batch = batch_id == (num_batches - 1);
+  if (is_last_batch) {
+    *num_tokens_post_pad = cumsum_val + ceil_b_num_tokens;
+  }
+
+  if (batch_id < num_batches) {
+    int32_t const batch_offset = batch_id * max_tokens_per_batch;
+    for (size_t i = 0; i < b_num_tokens; ++i) {
+      sorted_ids[cumsum_val + i] = batch_offset + i;
+    }
+
+    int32_t const block_start = cumsum_val / block_size;
+    int32_t const num_blocks = ceil_b_num_tokens / block_size;
+    for (size_t i = 0; i < num_blocks; ++i) {
+      block_ids[block_start + i] = batch_id;
+    }
+  }
+}
+}  // namespace batched_moe_align_block_size
+
 template <typename scalar_t>
 __global__ void moe_align_block_size_kernel(
     const scalar_t* __restrict__ topk_ids,
@@ -280,6 +345,33 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
       });
 }

+void batched_moe_align_block_size(int64_t max_tokens_per_batch,
+                                  int64_t block_size,
+                                  torch::Tensor const& batch_num_tokens,
+                                  torch::Tensor sorted_ids,
+                                  torch::Tensor batch_ids,
+                                  torch::Tensor num_tokens_post_pad) {
+  namespace batched_kernel = vllm::moe::batched_moe_align_block_size;
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  int32_t const B = batch_num_tokens.size(0);
+  int32_t const num_blocks_per_batch =
+      round_to_next_multiple_of(max_tokens_per_batch, block_size) / block_size;
+  int32_t const num_blocks = num_blocks_per_batch * B;
+  int64_t const sorted_ids_size = num_blocks * block_size;
+
+  TORCH_CHECK(sorted_ids.size(0) == sorted_ids_size);
+  TORCH_CHECK(batch_ids.size(0) == sorted_ids_size / block_size);
+  TORCH_CHECK(num_tokens_post_pad.size(0) == 1);
+  TORCH_CHECK(B <= batched_kernel::num_threads);
+
+  batched_kernel::batched_moe_align_block_size_kernel<<<
+      batched_kernel::num_blocks, batched_kernel::num_threads, 0, stream>>>(
+      B, max_tokens_per_batch, block_size, batch_num_tokens.data_ptr<int32_t>(),
+      sorted_ids.data_ptr<int32_t>(), batch_ids.data_ptr<int32_t>(),
+      num_tokens_post_pad.data_ptr<int32_t>());
+}
+
 void moe_sum(torch::Tensor& input,   // [num_tokens, topk, hidden_size]
              torch::Tensor& output)  // [num_tokens, hidden_size]
 {
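As a sanity check on the padding arithmetic above, the same layout can be computed on the CPU. The sketch below is an illustrative Python reference, not part of the change; the function name and return convention are ours:

```python
def batched_moe_align_reference(batch_num_tokens, max_tokens_per_batch, block_size):
    """CPU reference of the kernel's sorted_ids / block_ids layout (illustrative)."""
    ceil_div = lambda a, b: (a + b - 1) // b
    B = len(batch_num_tokens)
    blocks_per_batch = ceil_div(max_tokens_per_batch, block_size)
    sorted_ids_size = blocks_per_batch * B * block_size

    sentinel = B * max_tokens_per_batch          # marks invalid entries
    sorted_ids = [sentinel] * sorted_ids_size
    block_ids = [-1] * (sorted_ids_size // block_size)

    cumsum = 0                                   # exclusive prefix of padded counts
    for b, n in enumerate(batch_num_tokens):
        padded = ceil_div(n, block_size) * block_size
        for i in range(n):
            sorted_ids[cumsum + i] = b * max_tokens_per_batch + i
        for i in range(padded // block_size):
            block_ids[cumsum // block_size + i] = b
        cumsum += padded
    return sorted_ids, block_ids, cumsum         # cumsum == num_tokens_post_pad
```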
csrc/moe/moe_lora_align_sum_kernels.cu (new file, 169 lines)
@@ -0,0 +1,169 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <torch/all.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/Atomic.cuh>
+
+#include "../cuda_compat.h"
+#include "../dispatch_utils.h"
+#include "core/math.hpp"
+
+namespace {
+
+__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
+                                         int32_t col) {
+  return row * total_col + col;
+}
+
+}  // namespace
+
+// TODO: Refactor common parts with moe_align_sum_kernels
+template <typename scalar_t, typename token_cnts_t>
+__global__ void moe_lora_align_sum_kernel(
+    scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
+    int64_t block_size, int num_experts, int max_loras, size_t numel,
+    int max_num_tokens_padded, int max_num_m_blocks,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
+    int topk_num, int32_t* total_tokens_post_pad) {
+  const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
+  const size_t start_idx = threadIdx.x * tokens_per_thread;
+
+  int lora_id = blockIdx.x;
+  extern __shared__ int32_t shared_mem[];
+  int32_t* cumsum = shared_mem;
+  token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1);
+
+  // Initialize sorted_token_ids with numel
+  for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
+    sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel;
+  }
+
+  // Initialize expert_ids with -1
+  for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) {
+    expert_ids[lora_id * max_num_m_blocks + it] = -1;
+  }
+
+  // Initialize total_tokens_post_pad with 0
+  if (threadIdx.x == 0) {
+    total_tokens_post_pad[lora_id] = 0;
+  }
+
+  for (int i = 0; i < num_experts; ++i) {
+    tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
+  }
+
+  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
+    int mask = token_lora_mapping[i / topk_num] == lora_id;
+    int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]);
+    tokens_cnts[idx] += mask;
+  }
+
+  __syncthreads();
+
+  // For each expert we accumulate the token counts from the different threads.
+  if (threadIdx.x < num_experts) {
+    tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
+    for (int i = 1; i <= blockDim.x; ++i) {
+      tokens_cnts[index(num_experts, i, threadIdx.x)] +=
+          tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
+    }
+  }
+
+  __syncthreads();
+
+  // We accumulate the token counts of all experts in thread 0.
+  if (threadIdx.x == 0) {
+    cumsum[0] = 0;
+    for (int i = 1; i <= num_experts; ++i) {
+      cumsum[i] = cumsum[i - 1] +
+                  div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
+                           block_size) *
+                      block_size;
+    }
+    total_tokens_post_pad[lora_id] = static_cast<int32_t>(cumsum[num_experts]);
+  }
+
+  __syncthreads();
+
+  /**
+   * For each expert, each thread processes the tokens of the corresponding
+   * blocks and stores the corresponding expert_id for each block.
+   */
+  if (threadIdx.x < num_experts) {
+    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
+         i += block_size) {
+      expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] =
+          threadIdx.x;
+    }
+  }
+
+  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
+    int32_t expert_id = topk_ids[i];
+    /** The cumsum[expert_id] stores the starting index of the tokens that the
+     * expert with expert_id needs to process, and
+     * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
+     * processed by the expert with expert_id within the current thread's token
+     * shard.
+     */
+    int32_t rank_post_pad =
+        tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
+        cumsum[expert_id];
+
+    int mask = (int)token_lora_mapping[i / topk_num] == lora_id;
+    atomicAdd(
+        &sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)],
+        (i - numel) * mask);
+    tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask;
+  }
+}
+
+void moe_lora_align_block_size(torch::Tensor topk_ids,
+                               torch::Tensor token_lora_mapping,
+                               int64_t num_experts, int64_t block_size,
+                               int64_t max_loras, int64_t max_num_tokens_padded,
+                               int64_t max_num_m_blocks,
+                               torch::Tensor sorted_token_ids,
+                               torch::Tensor expert_ids,
+                               torch::Tensor num_tokens_post_pad) {
+  const int topk_num = topk_ids.size(1);
+
+  TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
+
+  int device_max_shared_mem;
+  auto dev = topk_ids.get_device();
+  cudaDeviceGetAttribute(&device_max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  const int32_t num_thread = max((int32_t)num_experts, 128);  // WARP_SIZE,
+  TORCH_CHECK(num_thread <= 1024,
+              "num_thread must be less than 1024, "
+              "and fallback is not implemented yet.");
+  const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) +
+                             (num_experts + 1) * sizeof(int32_t);
+
+  if (shared_mem > device_max_shared_mem) {
+    TORCH_CHECK(false,
+                "Shared memory usage exceeds device limit, and global memory "
+                "fallback is not implemented yet.");
+  }
+
+  VLLM_DISPATCH_INTEGRAL_TYPES(
+      topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
+        dim3 blockDim(num_thread);
+        auto kernel = moe_lora_align_sum_kernel<scalar_t, int32_t>;
+        AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
+            (void*)kernel, shared_mem));
+        kernel<<<max_loras, blockDim, shared_mem, stream>>>(
+            topk_ids.data_ptr<scalar_t>(),
+            token_lora_mapping.data_ptr<int32_t>(), block_size, num_experts,
+            max_loras, topk_ids.numel(), max_num_tokens_padded,
+            max_num_m_blocks, sorted_token_ids.data_ptr<int32_t>(),
+            expert_ids.data_ptr<int32_t>(), topk_num,
+            num_tokens_post_pad.data_ptr<int32_t>());
+      });
+}
@@ -4,7 +4,7 @@

 void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
                   torch::Tensor& token_expert_indices,
-                  torch::Tensor& gating_output);
+                  torch::Tensor& gating_output, bool renormalize);

 void moe_sum(torch::Tensor& input, torch::Tensor& output);

@@ -12,6 +12,22 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                           int64_t block_size, torch::Tensor sorted_token_ids,
                           torch::Tensor experts_ids,
                           torch::Tensor num_tokens_post_pad);
+
+void batched_moe_align_block_size(int64_t max_tokens_per_batch,
+                                  int64_t block_size,
+                                  torch::Tensor const& expert_num_tokens,
+                                  torch::Tensor sorted_ids,
+                                  torch::Tensor expert_ids,
+                                  torch::Tensor num_tokens_post_pad);
+
+void moe_lora_align_block_size(torch::Tensor topk_ids,
+                               torch::Tensor token_lora_mapping,
+                               int64_t num_experts, int64_t block_size,
+                               int64_t max_loras, int64_t max_num_tokens_padded,
+                               int64_t max_num_m_blocks,
+                               torch::Tensor sorted_token_ids,
+                               torch::Tensor expert_ids,
+                               torch::Tensor num_tokens_post_pad);
 #ifndef USE_ROCM
 torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                              torch::Tensor b_qweight, torch::Tensor b_scales,
@@ -16,12 +16,23 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <type_traits>
 #include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include "../cuda_compat.h"
 #include "../cub_helpers.h"

+#ifndef USE_ROCM
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#else
+#include <hip/hip_bf16.h>
+#include <hip/hip_fp16.h>
+typedef __hip_bfloat16 __nv_bfloat16;
+typedef __hip_bfloat162 __nv_bfloat162;
+#endif
+
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))

@@ -36,16 +47,27 @@ template <
     /// Alignment requirement in bytes
     int Alignment = sizeof(T) * N
 >
-class alignas(Alignment) AlignedArray {
-    float data[N];
+struct alignas(Alignment) AlignedArray {
+    T data[N];
 };

+template <typename T>
+__device__ __forceinline__ float toFloat(T value) {
+    if constexpr (std::is_same_v<T, float>) {
+        return value;
+    } else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
+        return __bfloat162float(value);
+    } else if constexpr (std::is_same_v<T, __half>) {
+        return __half2float(value);
+    }
+}
+
 // ====================== Softmax things ===============================
 // We have our own implementation of softmax here so we can support transposing the output
 // in the softmax kernel when we extend this module to support expert-choice routing.
-template <int TPB>
+template <int TPB, typename InputType>
 __launch_bounds__(TPB) __global__
-    void moeSoftmax(const float* input, const bool* finished, float* output, const int num_cols)
+    void moeSoftmax(const InputType* input, const bool* finished, float* output, const int num_cols)
 {
     using BlockReduce = cub::BlockReduce<float, TPB>;
     __shared__ typename BlockReduce::TempStorage tmpStorage;
@@ -66,7 +88,8 @@ __launch_bounds__(TPB) __global__
     for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
     {
         const int idx = thread_row_offset + ii;
-        threadData = max(static_cast<float>(input[idx]), threadData);
+        const float val = toFloat(input[idx]);
+        threadData = max(val, threadData);
     }

     const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp());
@@ -81,7 +104,8 @@ __launch_bounds__(TPB) __global__
    for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
    {
        const int idx = thread_row_offset + ii;
-        threadData += exp((static_cast<float>(input[idx]) - float_max));
+        const float val = toFloat(input[idx]);
+        threadData += expf(val - float_max);
    }

    const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp());
@@ -95,8 +119,9 @@ __launch_bounds__(TPB) __global__
    for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
    {
        const int idx = thread_row_offset + ii;
-        const float val = exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
-        output[idx] = val;
+        const float val = toFloat(input[idx]);
+        const float softmax_val = expf(val - float_max) * normalizing_factor;
+        output[idx] = softmax_val;
    }
 }

@@ -110,7 +135,8 @@ __launch_bounds__(TPB) __global__ void moeTopK(
     const int num_experts,
     const int k,
     const int start_expert,
-    const int end_expert)
+    const int end_expert,
+    const bool renormalize)
 {

     using cub_kvp = cub::KeyValuePair<int, float>;
@@ -125,6 +151,7 @@ __launch_bounds__(TPB) __global__ void moeTopK(

     const bool row_is_active = finished ? !finished[block_row] : true;
     const int thread_read_offset = blockIdx.x * num_experts;
+    float selected_sum = 0.f;
     for (int k_idx = 0; k_idx < k; ++k_idx)
     {
         thread_kvp.key = 0;
@@ -163,9 +190,23 @@ __launch_bounds__(TPB) __global__ void moeTopK(
             indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
             assert(indices[idx] >= 0);
             source_rows[idx] = k_idx * num_rows + block_row;
+            if (renormalize) {
+                selected_sum += result_kvp.value;
+            }
         }
         __syncthreads();
     }

+    // Renormalize the k weights for this row to sum to 1, if requested.
+    if (renormalize) {
+        if (threadIdx.x == 0) {
+            const float denom = selected_sum > 0.f ? selected_sum : 1.f;
+            for (int k_idx = 0; k_idx < k; ++k_idx) {
+                const int idx = k * block_row + k_idx;
+                output[idx] = output[idx] / denom;
+            }
+        }
+    }
 }

 // ====================== TopK softmax things ===============================
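The renormalization added above follows the usual MoE routing recipe: softmax over all experts, take the top-k weights, then rescale the selected weights to sum to 1. A short PyTorch sketch of the intended semantics, for illustration only (this is not the kernel itself and the helper name is ours):

```python
import torch

def topk_softmax_reference(gating_logits: torch.Tensor, k: int, renormalize: bool):
    # gating_logits: [num_tokens, num_experts]
    probs = torch.softmax(gating_logits.float(), dim=-1)
    topk_weights, topk_ids = probs.topk(k, dim=-1)
    if renormalize:
        # Rescale the selected k weights so they sum to 1 per token.
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids
```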
@@ -184,21 +225,30 @@ __launch_bounds__(TPB) __global__ void moeTopK(
   2) This implementation assumes k is small, but will work for any k.
 */

-template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType>
+template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, typename InputType = float>
 __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
-    void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
-        int* source_rows, const int k, const int start_expert, const int end_expert)
+    void topkGatingSoftmax(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
+        int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize)
 {
+    static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> ||
+                      std::is_same_v<InputType, __half>,
+                  "InputType must be float, __nv_bfloat16, or __half");
+
     // We begin by enforcing compile time assertions and setting up compile time constants.
     static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
     static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");

     // Number of bytes each thread pulls in per load
-    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
+    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
     static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
     static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
     static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;

+    if constexpr (std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) {
+        static_assert(ELTS_PER_LDG == 1 || ELTS_PER_LDG % 2 == 0,
+                      "ELTS_PER_LDG must be 1 or even for 16-bit conversion");
+    }
+
     // Restrictions based on previous section.
     static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
     static_assert(WARP_SIZE_PARAM % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
@@ -236,27 +286,71 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__

     // We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the
     // row it will read.
-    const float* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
+    const InputType* thread_row_ptr = input + thread_row * ELTS_PER_ROW;

     // Now, we compute the group each thread belong to in order to determine the first column to start loads.
     const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
     const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
-    const float* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
+    const InputType* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;

-    // Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory,
-    // this can support all powers of 2 up to 16.
-    // NOTE(woosuk): The original implementation uses CUTLASS aligned array here.
-    // We defined our own aligned array and use it here to avoid the dependency on CUTLASS.
-    using AccessType = AlignedArray<float, ELTS_PER_LDG>;
-
     // Finally, we pull in the data from global mem
     float row_chunk[VPT];
-    AccessType* row_chunk_vec_ptr = reinterpret_cast<AccessType*>(&row_chunk);
-    const AccessType* vec_thread_read_ptr = reinterpret_cast<const AccessType*>(thread_read_ptr);
+    // NOTE(zhuhaoran): dispatch different input types loading, BF16/FP16 convert to float
+    if constexpr (std::is_same_v<InputType, float>) {
+        using VecType = AlignedArray<float, ELTS_PER_LDG>;
+        VecType* row_chunk_vec_ptr = reinterpret_cast<VecType*>(&row_chunk);
+        const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
 #pragma unroll
-    for (int ii = 0; ii < LDG_PER_THREAD; ++ii)
-    {
-        row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
+        for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
+            row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
+        }
+    } else if constexpr (std::is_same_v<InputType, __nv_bfloat16>) {
+        if constexpr (ELTS_PER_LDG >= 2) {
+            using VecType = AlignedArray<__nv_bfloat16, ELTS_PER_LDG>;
+            float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
+            const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
+#pragma unroll
+            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
+                VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
+                int base_idx_f2 = ii * ELTS_PER_LDG / 2;
+#pragma unroll
+                for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
+                    row_chunk_f2[base_idx_f2 + jj] = __bfloat1622float2(
+                        *reinterpret_cast<const __nv_bfloat162*>(vec.data + jj * 2)
+                    );
+                }
+            }
+        } else { // ELTS_PER_LDG == 1
+#pragma unroll
+            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
+                const __nv_bfloat16* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
+                row_chunk[ii] = __bfloat162float(*scalar_ptr);
+            }
+        }
+    } else if constexpr (std::is_same_v<InputType, __half>) {
+        if constexpr (ELTS_PER_LDG >= 2) {
+            using VecType = AlignedArray<__half, ELTS_PER_LDG>;
+            float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
+            const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
+#pragma unroll
+            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
+                VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
+                int base_idx_f2 = ii * ELTS_PER_LDG / 2;
+#pragma unroll
+                for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
|
||||||
|
row_chunk_f2[base_idx_f2 + jj] = __half22float2(
|
||||||
|
*reinterpret_cast<const __half2*>(vec.data + jj * 2)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else { // ELTS_PER_LDG == 1
|
||||||
|
#pragma unroll
|
||||||
|
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
|
||||||
|
const __half* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
|
||||||
|
row_chunk[ii] = __half2float(*scalar_ptr);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
|
// First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
|
||||||
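
A minimal sketch of the 16-bit widening used in the load loop above (not part of the diff); it assumes `cuda_bf16.h` is available, an even `ELTS_PER_LDG`, and two adjacent source values, mirroring the `__bfloat1622float2()` conversion:

```cpp
#include <cuda_bf16.h>

// Widen two adjacent bf16 values to float in one step, as the vectorized
// branch of the load loop does for each packed pair.
__device__ inline void widen_bf16_pair(const __nv_bfloat16* src, float* dst) {
  const __nv_bfloat162 packed = *reinterpret_cast<const __nv_bfloat162*>(src);
  const float2 f = __bfloat1622float2(packed);
  dst[0] = f.x;
  dst[1] = f.y;
}
```

With `ELTS_PER_LDG == 1` the kernel instead falls back to the scalar `__bfloat162float()` path shown in the hunk.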
@@ -310,6 +404,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
     int start_col = first_elt_read_by_thread;
     static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;

+    float selected_sum = 0.f;
     for (int k_idx = 0; k_idx < k; ++k_idx)
     {
         // First, each thread does the local argmax
@@ -363,6 +458,9 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
             output[idx] = max_val;
             indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS;
             source_rows[idx] = k_idx * num_rows + thread_row;
+            if (renormalize) {
+                selected_sum += max_val;
+            }
         }

         // Finally, we clear the value in the thread with the current max if there is another iteration to run.
@@ -380,15 +478,28 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
             }
         }
     }
+
+    // Renormalize the k weights for this row to sum to 1, if requested.
+    if (renormalize) {
+        if (thread_group_idx == 0)
+        {
+            const float denom = selected_sum > 0.f ? selected_sum : 1.f;
+            for (int k_idx = 0; k_idx < k; ++k_idx)
+            {
+                const int idx = k * thread_row + k_idx;
+                output[idx] = output[idx] / denom;
+            }
+        }
+    }
 }

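A host-side sketch of what the new `renormalize` path computes for one token's k selected weights (not part of the diff); the zero-sum guard matches the kernel's `denom` logic:

```cpp
#include <vector>

// Rescale one token's top-k gating weights in place so they sum to 1,
// leaving them untouched when the selected sum is zero.
void renormalize_topk_weights(std::vector<float>& weights) {
  float selected_sum = 0.f;
  for (float w : weights) selected_sum += w;
  const float denom = selected_sum > 0.f ? selected_sum : 1.f;
  for (float& w : weights) w /= denom;
}
```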
 namespace detail
 {
 // Constructs some constants needed to partition the work across threads at compile time.
-template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM>
+template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename InputType>
 struct TopkConstants
 {
-    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
+    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
     static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0, "");
     static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM));
     static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
@@ -397,20 +508,21 @@ struct TopkConstants
 };
 } // namespace detail

-template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType>
-void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
-    int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
+template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType>
+void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
+    int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
+    cudaStream_t stream)
 {
-    static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
-    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
+    static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS);
+    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>;
     static constexpr int VPT = Constants::VPT;
     static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
     const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
     const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;

     dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
-    topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM><<<num_blocks, block_dim, 0, stream>>>(
-        input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
+    topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType><<<num_blocks, block_dim, 0, stream>>>(
+        input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize);
 }

 #ifndef USE_ROCM
@@ -418,26 +530,26 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
     static_assert(WARP_SIZE == 32, \
                   "Unsupported warp size. Only 32 is supported for CUDA"); \
     topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
-        gating_output, nullptr, topk_weights, topk_indices, \
-        token_expert_indices, num_tokens, topk, 0, num_experts, stream);
+        gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
+        num_tokens, topk, 0, num_experts, renormalize, stream);
 #else
 #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
     if (WARP_SIZE == 64) { \
         topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
-            gating_output, nullptr, topk_weights, topk_indices, \
-            token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
+            gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
+            num_tokens, topk, 0, num_experts, renormalize, stream); \
     } else if (WARP_SIZE == 32) { \
         topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
-            gating_output, nullptr, topk_weights, topk_indices, \
-            token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
+            gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
+            num_tokens, topk, 0, num_experts, renormalize, stream); \
     } else { \
         assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
     }
 #endif

-template <typename IndType>
+template <typename IndType, typename InputType>
 void topkGatingSoftmaxKernelLauncher(
-    const float* gating_output,
+    const InputType* gating_output,
     float* topk_weights,
     IndType* topk_indices,
     int* token_expert_indices,
@@ -445,11 +557,15 @@ void topkGatingSoftmaxKernelLauncher(
     const int num_tokens,
     const int num_experts,
     const int topk,
+    const bool renormalize,
     cudaStream_t stream) {
   static constexpr int WARPS_PER_TB = 4;
   static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
 #ifndef USE_ROCM
-  static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
+  // for bfloat16 dtype, we need 4 bytes loading to make sure num_experts
+  // elements can be loaded by a warp
+  static constexpr int BYTES_PER_LDG_MULTIPLE_64 =
+      (std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) ? 4 : 8;
 #endif
   switch (num_experts) {
     case 1:
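A worked example of the load-width constants above (not part of the diff); the expert count of 64 is assumed for illustration only:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
  constexpr int kExperts = 64;        // assumed for the example
  constexpr int kSizeofBf16 = 2;      // sizeof(__nv_bfloat16)
  constexpr int kMaxBytes = 4;        // BYTES_PER_LDG_MULTIPLE_64 for bf16/fp16
  constexpr int kBytesPerLdg = std::min(kMaxBytes, kSizeofBf16 * kExperts);
  constexpr int kEltsPerLdg = kBytesPerLdg / kSizeofBf16;
  // Prints BYTES_PER_LDG=4, ELTS_PER_LDG=2: each 4-byte load still covers two experts.
  std::printf("BYTES_PER_LDG=%d, ELTS_PER_LDG=%d\n", kBytesPerLdg, kEltsPerLdg);
  return 0;
}
```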
@@ -506,11 +622,11 @@ void topkGatingSoftmaxKernelLauncher(
       TORCH_CHECK(softmax_workspace != nullptr,
           "softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
       static constexpr int TPB = 256;
-      moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
+      moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
           gating_output, nullptr, softmax_workspace, num_experts);
       moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
           softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
-          num_experts, topk, 0, num_experts);
+          num_experts, topk, 0, num_experts, renormalize);
     }
   }
 }
@@ -518,11 +634,50 @@ void topkGatingSoftmaxKernelLauncher(
 } // namespace moe
 } // namespace vllm

+template<typename ComputeType>
+void dispatch_topk_softmax_launch(
+    torch::Tensor& gating_output,
+    torch::Tensor& topk_weights,
+    torch::Tensor& topk_indices,
+    torch::Tensor& token_expert_indices,
+    torch::Tensor& softmax_workspace,
+    int num_tokens, int num_experts, int topk, bool renormalize, cudaStream_t stream)
+{
+  if (topk_indices.scalar_type() == at::ScalarType::Int) {
+    vllm::moe::topkGatingSoftmaxKernelLauncher<int, ComputeType>(
+        reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
+        topk_weights.data_ptr<float>(),
+        topk_indices.data_ptr<int>(),
+        token_expert_indices.data_ptr<int>(),
+        softmax_workspace.data_ptr<float>(),
+        num_tokens, num_experts, topk, renormalize, stream);
+  } else if (topk_indices.scalar_type() == at::ScalarType::UInt32) {
+    vllm::moe::topkGatingSoftmaxKernelLauncher<uint32_t, ComputeType>(
+        reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
+        topk_weights.data_ptr<float>(),
+        topk_indices.data_ptr<uint32_t>(),
+        token_expert_indices.data_ptr<int>(),
+        softmax_workspace.data_ptr<float>(),
+        num_tokens, num_experts, topk, renormalize, stream);
+  } else {
+    TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
+    vllm::moe::topkGatingSoftmaxKernelLauncher<int64_t, ComputeType>(
+        reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
+        topk_weights.data_ptr<float>(),
+        topk_indices.data_ptr<int64_t>(),
+        token_expert_indices.data_ptr<int>(),
+        softmax_workspace.data_ptr<float>(),
+        num_tokens, num_experts, topk, renormalize, stream);
+  }
+}
+
 void topk_softmax(
     torch::Tensor& topk_weights,          // [num_tokens, topk]
     torch::Tensor& topk_indices,          // [num_tokens, topk]
     torch::Tensor& token_expert_indices,  // [num_tokens, topk]
-    torch::Tensor& gating_output)         // [num_tokens, num_experts]
+    torch::Tensor& gating_output,         // [num_tokens, num_experts]
+    bool renormalize)
 {
     const int num_experts = gating_output.size(-1);
     const auto num_tokens = gating_output.numel() / num_experts;
@@ -534,45 +689,19 @@ void topk_softmax(

     const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
     const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-    torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
+    const auto workspace_options = gating_output.options().dtype(at::ScalarType::Float);
+    torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options);

-    if(topk_indices.scalar_type() == at::ScalarType::Int)
-    {
-        vllm::moe::topkGatingSoftmaxKernelLauncher(
-            gating_output.data_ptr<float>(),
-            topk_weights.data_ptr<float>(),
-            topk_indices.data_ptr<int>(),
-            token_expert_indices.data_ptr<int>(),
-            softmax_workspace.data_ptr<float>(),
-            num_tokens,
-            num_experts,
-            topk,
-            stream);
-    }
-    else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
-    {
-        vllm::moe::topkGatingSoftmaxKernelLauncher(
-            gating_output.data_ptr<float>(),
-            topk_weights.data_ptr<float>(),
-            topk_indices.data_ptr<uint32_t>(),
-            token_expert_indices.data_ptr<int>(),
-            softmax_workspace.data_ptr<float>(),
-            num_tokens,
-            num_experts,
-            topk,
-            stream);
-    }
-    else {
-        TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
-        vllm::moe::topkGatingSoftmaxKernelLauncher(
-            gating_output.data_ptr<float>(),
-            topk_weights.data_ptr<float>(),
-            topk_indices.data_ptr<int64_t>(),
-            token_expert_indices.data_ptr<int>(),
-            softmax_workspace.data_ptr<float>(),
-            num_tokens,
-            num_experts,
-            topk,
-            stream);
-    }
+    if (gating_output.scalar_type() == at::ScalarType::Float) {
+        dispatch_topk_softmax_launch<float>(gating_output, topk_weights, topk_indices,
+            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
+    } else if (gating_output.scalar_type() == at::ScalarType::Half) {
+        dispatch_topk_softmax_launch<__half>(gating_output, topk_weights, topk_indices,
+            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
+    } else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
+        dispatch_topk_softmax_launch<__nv_bfloat16>(gating_output, topk_weights, topk_indices,
+            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
+    } else {
+        TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
+    }
 }
@@ -5,7 +5,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
   // Apply topk softmax to the gating outputs.
   m.def(
       "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
-      "token_expert_indices, Tensor gating_output) -> ()");
+      "token_expert_indices, Tensor gating_output, bool renormalize) -> ()");
   m.impl("topk_softmax", torch::kCUDA, &topk_softmax);

   // Calculate the result of moe by summing up the partial results
@@ -22,6 +22,31 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       " Tensor! num_tokens_post_pad) -> ()");
   m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);

+  // Aligning the number of tokens to be processed by each expert such
+  // that it is divisible by the block size, but for the batched case.
+  m.def(
+      "batched_moe_align_block_size(int max_tokens_per_batch,"
+      " int block_size, Tensor expert_num_tokens,"
+      " Tensor! sorted_token_ids,"
+      " Tensor! experts_ids,"
+      " Tensor! num_tokens_post_pad) -> ()");
+  m.impl("batched_moe_align_block_size", torch::kCUDA,
+         &batched_moe_align_block_size);
+
+  // Aligning the number of tokens to be processed by each expert such
+  // that it is divisible by the block size.
+  m.def(
+      "moe_lora_align_block_size(Tensor topk_ids,"
+      " Tensor token_lora_mapping,"
+      " int num_experts,"
+      " int block_size, int max_loras, "
+      " int max_num_tokens_padded, "
+      " int max_num_m_blocks, "
+      " Tensor !sorted_token_ids,"
+      " Tensor !experts_ids,"
+      " Tensor !num_tokens_post_pad) -> () ");
+  m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);
+
 #ifndef USE_ROCM
   m.def(
       "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "
csrc/ops.h
@@ -92,9 +92,6 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                         torch::Tensor& weight, double epsilon);

-void poly_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
-               torch::Tensor& bias, double epsilon);
-
 void apply_repetition_penalties_(torch::Tensor& logits,
                                  const torch::Tensor& prompt_mask,
                                  const torch::Tensor& output_mask,
@@ -102,8 +99,11 @@ void apply_repetition_penalties_(torch::Tensor& logits,

 void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
                    const torch::Tensor& rowEnds, torch::Tensor& indices,
-                   torch::Tensor& values, int64_t numRows, int64_t stride0,
-                   int64_t stride1);
+                   int64_t numRows, int64_t stride0, int64_t stride1);
+
+void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
+                          const torch::Tensor& seq_lens, torch::Tensor& indices,
+                          int64_t numRows, int64_t stride0, int64_t stride1);

 void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                                torch::Tensor& weight, torch::Tensor& scale,
@@ -145,7 +145,11 @@ void rms_norm_dynamic_per_token_quant(
   if (scale_ub.has_value()) {
     TORCH_CHECK(out.dtype() == kFp8Type);
   }
+  TORCH_CHECK(weight.dtype() == input.dtype());
   TORCH_CHECK(scales.dtype() == torch::kFloat32);
+  if (residual) {
+    TORCH_CHECK(residual->scalar_type() == input.scalar_type());
+  }

   VLLM_DISPATCH_FLOATING_TYPES(
       input.scalar_type(), "rms_norm_dynamic_per_token_quant_dispatch", [&] {
csrc/sampler.cu
@@ -54,15 +54,10 @@ static inline __device__ uint16_t extractBinIdx(float x) {
   return 511 - (tmp.u16 >> 7);
 }

-template <int kNumThreadsPerBlock = 512>
-static __global__ void topKPerRow(const float* logits, const int* rowStarts,
-                                  const int* rowEnds, int* outIndices,
-                                  float* outLogits, int stride0, int stride1) {
-  // The number of bins in the histogram.
-  static constexpr int kNumBins = 512;
-
-  // The top-k width.
-  static constexpr int kTopK = 2048;
+template <int kNumThreadsPerBlock = 512, int kNumBins = 512, int kTopK = 2048>
+__device__ void topKPerRowJob(const float* logits, const int rowStart,
+                              const int rowEnd, const int rowIdx,
+                              int* outIndices, int stride0, int stride1) {
   // The number of elements per thread for the final top-k sort.
   static constexpr int kNumTopKItemsPerThread = kTopK / kNumThreadsPerBlock;
   // The class to sort the elements during the final top-k sort.
@@ -103,17 +98,11 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
   __shared__ int smemHistogram[kNumBins];
   // Shared memory to store the selected indices.
   __shared__ int smemIndices[kTopK];
-  // Shared memory to store the selected logits.
-  __shared__ float smemLogits[kTopK];
   // Shared memory to store the threshold bin.
   __shared__ int smemThresholdBinIdx[1];
   // Shared memory counter to register the candidates for the final phase.
   __shared__ int smemFinalDstIdx[1];

-  // The row computed by this block.
-  int rowIdx = blockIdx.x;
-  // The range of logits within the row.
-  int rowStart = rowStarts[rowIdx], rowEnd = rowEnds[rowIdx];
   // The length of the row.
   int rowLen = rowEnd - rowStart;

@@ -124,13 +113,10 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
          rowIt += kNumThreadsPerBlock) {
       int idx = rowStart + rowIt;
       outIndices[rowIdx * kTopK + rowIt] = idx - rowStart;
-      outLogits[rowIdx * kTopK + rowIt] =
-          logits[rowIdx * stride0 + idx * stride1];
     }
     for (int rowIt = rowLen + threadIdx.x; rowIt < kTopK;
          rowIt += kNumThreadsPerBlock) {
       outIndices[rowIdx * kTopK + rowIt] = -1;
-      outLogits[rowIdx * kTopK + rowIt] = -FLT_MAX;
     }
     return;
   }
@@ -201,7 +187,6 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
     uint16_t idx = extractBinIdx(logit);
     if (idx < thresholdBinIdx) {
       int dstIdx = atomicAdd(&smemHistogram[idx], 1);
-      smemLogits[dstIdx] = logit;
       smemIndices[dstIdx] = rowIt;
     } else if (idx == thresholdBinIdx) {
       int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
@@ -250,7 +235,6 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
       int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
       int dstIdx = baseIdx + srcIdx;
       if (dstIdx < kTopK) {
-        smemLogits[dstIdx] = finalLogits[ii];
         smemIndices[dstIdx] = finalIndices[ii];
       }
     }
@@ -258,31 +242,58 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
   // Make sure the data is in shared memory.
   __syncthreads();

-  // The topK logits.
-  float topKLogits[kNumTopKItemsPerThread];
-  // The topK indices.
-  int topKIndices[kNumTopKItemsPerThread];
-
-  // Load from shared memory.
-#pragma unroll
-  for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
-    topKLogits[ii] = smemLogits[ii * kNumThreadsPerBlock + threadIdx.x];
-    topKIndices[ii] = smemIndices[ii * kNumThreadsPerBlock + threadIdx.x];
-  }
-
-  // Sort the elements.
-  TopKSort(smemFinal.topKSort)
-      .SortDescendingBlockedToStriped(topKLogits, topKIndices);
-
   // Store to global memory.
 #pragma unroll
   for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
     int offset = rowIdx * kTopK + ii * kNumThreadsPerBlock + threadIdx.x;
-    outIndices[offset] = topKIndices[ii] - rowStart;
-    outLogits[offset] = topKLogits[ii];
+    outIndices[offset] =
+        smemIndices[ii * kNumThreadsPerBlock + threadIdx.x] - rowStart;
   }
 }

+template <int kNumThreadsPerBlock = 512>
+static __global__ void topKPerRow(const float* logits, const int* rowStarts,
+                                  const int* rowEnds, int* outIndices,
+                                  int stride0, int stride1) {
+  // The number of bins in the histogram.
+  static constexpr int kNumBins = 512;
+
+  // The top-k width.
+  static constexpr int kTopK = 2048;
+
+  // The row computed by this block.
+  int rowIdx = blockIdx.x;
+
+  // The range of logits within the row.
+  int rowStart = rowStarts[rowIdx];
+  int rowEnd = rowEnds[rowIdx];
+
+  topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
+      logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
+}
+
+template <int kNumThreadsPerBlock = 512>
+static __global__ void topKPerRowDecode(const float* logits, const int* seqLens,
+                                        int* outIndices, int stride0,
+                                        int stride1, int next_n) {
+  // The number of bins in the histogram.
+  static constexpr int kNumBins = 512;
+
+  // The top-k width.
+  static constexpr int kTopK = 2048;
+
+  // The row computed by this block.
+  int rowIdx = blockIdx.x;
+
+  // The range of logits within the row.
+  int rowStart = 0;
+  int seq_len = seqLens[rowIdx / next_n];
+  int rowEnd = seq_len - next_n + (rowIdx % next_n) + 1;
+
+  topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
+      logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
+}
+
 } // namespace vllm

 void apply_repetition_penalties_(
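
A worked example of the row-range arithmetic used by `topKPerRowDecode` above (not part of the diff); reading `next_n` as the number of speculative positions per request is an interpretation, and the values below are assumed:

```cpp
#include <cstdio>

int main() {
  const int next_n = 2;            // assumed positions per request
  const int seq_lens[] = {5, 9};   // assumed per-request sequence lengths
  for (int rowIdx = 0; rowIdx < 4; ++rowIdx) {
    const int seq_len = seq_lens[rowIdx / next_n];
    const int rowEnd = seq_len - next_n + (rowIdx % next_n) + 1;
    std::printf("row %d -> [0, %d)\n", rowIdx, rowEnd);  // prints 4, 5, 8, 9
  }
  return 0;
}
```

Consecutive rows of the same request see ranges that grow by one logit, matching the per-position decode layout.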
@@ -326,10 +337,23 @@ void apply_repetition_penalties_(
       });
 }

+void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
+                          const torch::Tensor& seqLens, torch::Tensor& indices,
+                          int64_t numRows, int64_t stride0, int64_t stride1) {
+  // Compute the results on the device.
+  constexpr int kNumThreadsPerBlock = 512;
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  vllm::topKPerRowDecode<kNumThreadsPerBlock>
+      <<<numRows, kNumThreadsPerBlock, 0, stream>>>(
+          logits.data_ptr<float>(), seqLens.data_ptr<int>(),
+          indices.data_ptr<int>(), static_cast<int>(stride0),
+          static_cast<int>(stride1), static_cast<int>(next_n));
+}
+
 void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
                    const torch::Tensor& rowEnds, torch::Tensor& indices,
-                   torch::Tensor& values, int64_t numRows, int64_t stride0,
-                   int64_t stride1) {
+                   int64_t numRows, int64_t stride0, int64_t stride1) {
   // Compute the results on the device.
   constexpr int kNumThreadsPerBlock = 512;
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -338,6 +362,5 @@ void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
       <<<numRows, kNumThreadsPerBlock, 0, stream>>>(
           logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
           rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
-          values.data_ptr<float>(), static_cast<int>(stride0),
-          static_cast<int>(stride1));
+          static_cast<int>(stride0), static_cast<int>(stride1));
 }
@@ -175,12 +175,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "float epsilon) -> ()");
   ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);

-  // Polynomial Normalization.
-  ops.def(
-      "poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, float "
-      "epsilon) -> ()");
-  ops.impl("poly_norm", torch::kCUDA, &poly_norm);
-
   // Apply repetition penalties to logits in-place
   ops.def(
       "apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, "
@@ -191,10 +185,16 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // Optimized top-k per row operation
   ops.def(
       "top_k_per_row(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
-      "Tensor! indices, Tensor! values, int numRows, int stride0, "
+      "Tensor! indices, int numRows, int stride0, "
      "int stride1) -> ()");
   ops.impl("top_k_per_row", torch::kCUDA, &top_k_per_row);

+  ops.def(
+      "top_k_per_row_decode(Tensor logits, int next_n, "
+      "Tensor seq_lens, Tensor! indices, int numRows, "
+      "int stride0, int stride1) -> ()");
+  ops.impl("top_k_per_row_decode", torch::kCUDA, &top_k_per_row_decode);
+
   // Layernorm-quant
   // Apply Root Mean Square (RMS) Normalization to the input tensor.
   ops.def(
@@ -5,7 +5,7 @@
 # docs/contributing/dockerfile/dockerfile.md and
 # docs/assets/contributing/dockerfile-stages-dependency.png

-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.9.1
 ARG PYTHON_VERSION=3.12

 # By parameterizing the base images, we allow third-party to use their own
@@ -132,7 +132,9 @@ WORKDIR /workspace
 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
+    # TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
+    uv pip install --python /opt/venv/bin/python3 --pre apache-tvm-ffi==0.1.0b15 \
+    && uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 # cuda arch list used by torch
@@ -273,6 +275,7 @@ WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETPLATFORM

+# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
 ARG GDRCOPY_CUDA_VERSION=12.8
 # Keep in line with FINAL_BASE_IMAGE
 ARG GDRCOPY_OS_VERSION=Ubuntu22_04
@@ -353,14 +356,23 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system dist/*.whl --verbose \
+    # TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
+    uv pip install --system --pre apache-tvm-ffi==0.1.0b15 \
+    && uv pip install --system dist/*.whl --verbose \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

+# TODO (huydhn): Remove this once xformers is released for 2.9.0
+RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
+    . /etc/environment
+    export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
+    uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32.post2"
+BASH
+
 # Install FlashInfer pre-compiled kernel cache and binaries
 # https://docs.flashinfer.ai/installation.html
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system flashinfer-cubin==0.4.0 \
-    && uv pip install --system flashinfer-jit-cache==0.4.0 \
+    uv pip install --system flashinfer-cubin==0.4.1 \
+    && uv pip install --system flashinfer-jit-cache==0.4.1 \
        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
     && flashinfer show-config

@@ -422,6 +434,7 @@ ARG PYTHON_VERSION

 ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+ARG PYTORCH_CUDA_INDEX_BASE_URL

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
@@ -434,7 +447,8 @@ ENV UV_LINK_MODE=copy
 RUN --mount=type=cache,target=/root/.cache/uv \
     CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
     if [ "$CUDA_MAJOR" -ge 12 ]; then \
-        uv pip install --system -r requirements/dev.txt; \
+        uv pip install --system -r requirements/dev.txt \
+            --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
     fi

 # install development dependencies (for testing)
@@ -31,7 +31,7 @@ ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt,sharing=locked \
     apt-get update -y \
-    && apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \
+    && apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
        gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \
     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
     && curl -LsSf https://astral.sh/uv/install.sh | sh
@@ -106,14 +106,106 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=.git,target=.git \
     VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel

+#################### WHEEL BUILD IMAGE ####################
+FROM base AS build
+ARG TARGETPLATFORM
+
+ARG PIP_INDEX_URL UV_INDEX_URL
+ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+
+# install build dependencies
+COPY requirements/build.txt requirements/build.txt
+
+# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
+# Reference: https://github.com/astral-sh/uv/pull/1694
+ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+# Use copy mode to avoid hardlink failures with Docker cache mounts
+ENV UV_LINK_MODE=copy
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt
+
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
+
+# max jobs used by Ninja to build extensions
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+
+ARG USE_SCCACHE
+ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
+ARG SCCACHE_ENDPOINT
+ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
+ARG SCCACHE_REGION_NAME=us-west-2
+ARG SCCACHE_S3_NO_CREDENTIALS=0
+
+# Flag to control whether to use pre-built vLLM wheels
+ARG VLLM_USE_PRECOMPILED=""
+
+# if USE_SCCACHE is set, use sccache to speed up compilation
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=.git,target=.git \
+    if [ "$USE_SCCACHE" = "1" ]; then \
+        echo "Installing sccache..." \
+        && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
+        && tar -xzf sccache.tar.gz \
+        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
+        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
+        && if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
+        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
+        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
+        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
+        && export SCCACHE_IDLE_TIMEOUT=0 \
+        && export CMAKE_BUILD_TYPE=Release \
+        && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
+        && export VLLM_DOCKER_BUILD_CONTEXT=1 \
+        && sccache --show-stats \
+        && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
+        && sccache --show-stats; \
+    fi
+
+ARG vllm_target_device="cpu"
+ENV VLLM_TARGET_DEVICE=${vllm_target_device}
+ENV CCACHE_DIR=/root/.cache/ccache
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=.git,target=.git \
+    if [ "$USE_SCCACHE" != "1" ]; then \
+        # Clean any existing CMake artifacts
+        rm -rf .deps && \
+        mkdir -p .deps && \
+        export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
+        export VLLM_DOCKER_BUILD_CONTEXT=1 && \
+        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
+    fi
+
+# Check the size of the wheel if RUN_WHEEL_CHECK is true
+COPY .buildkite/check-wheel-size.py check-wheel-size.py
+# sync the default value with .buildkite/check-wheel-size.py
+ARG VLLM_MAX_SIZE_MB=450
+ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
+ARG RUN_WHEEL_CHECK=true
+RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
+        python3 check-wheel-size.py dist; \
+    else \
+        echo "Skipping wheel size check."; \
+    fi
+
 ######################### TEST DEPS #########################
 FROM base AS vllm-test-deps

 WORKDIR /workspace/vllm

+# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
 RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
     cp requirements/test.in requirements/cpu-test.in && \
     sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
+    sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \
+    sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
+    sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
     uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu

 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.


 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.4.0
+# release version: v0.4.1
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
@@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
     echo "git clone flashinfer..." \
     && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
     && cd flashinfer \
-    && git checkout v0.4.0 \
+    && git checkout v0.4.1 \
     && git submodule update --init --recursive \
     && echo "finish git clone flashinfer..." \
     && rm -rf build \
@@ -1,7 +1,7 @@
 # default base image
 ARG REMOTE_VLLM="0"
 ARG COMMON_WORKDIR=/app
-ARG BASE_IMAGE=rocm/vllm-dev:base
+ARG BASE_IMAGE=rocm/vllm-dev:base_custom_1020_rc1_20251008_tuned_20251008

 FROM ${BASE_IMAGE} AS base

@@ -12,7 +12,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
 RUN apt-get update -q -y && apt-get install -q -y \
     sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
     apt-transport-https ca-certificates wget curl
 # Remove sccache
 RUN python3 -m pip install --upgrade pip
 RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
 ARG COMMON_WORKDIR
@ -1,13 +1,13 @@
|
|||||||
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
|
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
|
||||||
ARG TRITON_BRANCH="f9e5bf54"
|
ARG TRITON_BRANCH="57c693b6"
|
||||||
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
|
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
|
||||||
ARG PYTORCH_BRANCH="b2fb6885"
|
ARG PYTORCH_BRANCH="1c57644d"
|
||||||
ARG PYTORCH_VISION_BRANCH="v0.23.0"
|
ARG PYTORCH_VISION_BRANCH="v0.23.0"
|
||||||
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
|
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
|
||||||
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
||||||
ARG FA_BRANCH="0e60e394"
|
ARG FA_BRANCH="0e60e394"
|
||||||
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
|
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
|
||||||
ARG AITER_BRANCH="2ab9f4cd"
|
ARG AITER_BRANCH="eef23c7f"
|
||||||
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
||||||
|
|
||||||
FROM ${BASE_IMAGE} AS base
|
FROM ${BASE_IMAGE} AS base
|
||||||
|
|||||||
@ -20,8 +20,6 @@ API documentation for vLLM's configuration classes.
|
|||||||
- [vllm.config.CompilationConfig][]
|
- [vllm.config.CompilationConfig][]
|
||||||
- [vllm.config.VllmConfig][]
|
- [vllm.config.VllmConfig][]
|
||||||
|
|
||||||
[](){ #offline-inference-api }
|
|
||||||
|
|
||||||
## Offline Inference
|
## Offline Inference
|
||||||
|
|
||||||
LLM Class.
|
LLM Class.
|
||||||
@ -45,18 +43,14 @@ Engine classes for offline and online inference.
|
|||||||
|
|
||||||
Inference parameters for vLLM APIs.
|
Inference parameters for vLLM APIs.
|
||||||
|
|
||||||
[](){ #sampling-params }
|
|
||||||
|
|
||||||
- [vllm.SamplingParams][]
|
- [vllm.SamplingParams][]
|
||||||
- [vllm.PoolingParams][]
|
- [vllm.PoolingParams][]
|
||||||
|
|
||||||
[](){ #multi-modality }
|
|
||||||
|
|
||||||
## Multi-Modality
|
## Multi-Modality
|
||||||
|
|
||||||
vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package.
|
vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package.
|
||||||
|
|
||||||
Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models]
|
Multi-modal inputs can be passed alongside text and token prompts to [supported models](../models/supported_models.md#list-of-multimodal-language-models)
|
||||||
via the `multi_modal_data` field in [vllm.inputs.PromptType][].
|
via the `multi_modal_data` field in [vllm.inputs.PromptType][].
|
||||||
|
|
||||||
Looking to add your own multi-modal model? Please follow the instructions listed [here](../contributing/model/multimodal.md).
|
Looking to add your own multi-modal model? Please follow the instructions listed [here](../contributing/model/multimodal.md).
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 119 KiB After Width: | Height: | Size: 119 KiB |
@ -4,6 +4,6 @@ This section lists the most common options for running vLLM.
|
|||||||
|
|
||||||
There are three main levels of configuration, from highest priority to lowest priority:
|
There are three main levels of configuration, from highest priority to lowest priority:
|
||||||
|
|
||||||
- [Request parameters][completions-api] and [input arguments][sampling-params]
|
- [Request parameters](../serving/openai_compatible_server.md#completions-api) and [input arguments](../api/README.md#inference-parameters)
|
||||||
- [Engine arguments](./engine_args.md)
|
- [Engine arguments](./engine_args.md)
|
||||||
- [Environment variables](./env_vars.md)
|
- [Environment variables](./env_vars.md)
|
||||||
|
|||||||
@ -23,7 +23,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
|
|||||||
!!! note
|
!!! note
|
||||||
With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
|
With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
|
||||||
|
|
||||||
You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
|
You can convert the model checkpoint to a sharded checkpoint using [examples/offline_inference/save_sharded_state.py](../../examples/offline_inference/save_sharded_state.py). The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
|
||||||
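For illustration, loading the resulting sharded checkpoint might look like the sketch below. The `load_format="sharded_state"` value and the requirement that the tensor-parallel size match the one used at save time are assumptions based on the conversion script above, so check that script's documentation for the exact options in your version.

```python
# Hedged sketch: load a checkpoint previously saved with save_sharded_state.py.
# The load_format name and directory layout are assumptions; verify against
# the script's docstring for your vLLM version.
from vllm import LLM

llm = LLM(
    model="/path/to/sharded-checkpoint",   # directory produced by the conversion script
    load_format="sharded_state",           # assumed load format for sharded checkpoints
    tensor_parallel_size=2,                # should match the TP size used when saving
)
```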
|
|
||||||
## Quantization
|
## Quantization
|
||||||
|
|
||||||
|
|||||||
@ -27,8 +27,6 @@ You can monitor the number of preemption requests through Prometheus metrics exp
|
|||||||
|
|
||||||
In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as recomputation has lower overhead in the V1 architecture.
|
In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as recomputation has lower overhead in the V1 architecture.
|
||||||
|
|
||||||
[](){ #chunked-prefill }
|
|
||||||
|
|
||||||
## Chunked Prefill
|
## Chunked Prefill
|
||||||
|
|
||||||
Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations.
|
Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations.
|
||||||
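As a hedged illustration of turning this on for offline inference, the sketch below uses the `enable_chunked_prefill` and `max_num_batched_tokens` engine arguments; defaults vary by vLLM version (recent versions enable chunked prefill automatically), so treat the values here as placeholders rather than recommendations.

```python
# Minimal sketch: tuning chunked prefill for an offline LLM instance.
# Chunked prefill may already be enabled by default in your vLLM version;
# max_num_batched_tokens bounds how many tokens are scheduled per step.
from vllm import LLM

llm = LLM(
    model="ibm-granite/granite-3.1-8b-instruct",
    enable_chunked_prefill=True,
    max_num_batched_tokens=2048,
)
```

Smaller `max_num_batched_tokens` values generally favor decode latency, while larger values favor prefill throughput.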
@ -174,14 +172,14 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
|
|||||||
|
|
||||||
Known supported models (with corresponding benchmarks):
|
Known supported models (with corresponding benchmarks):
|
||||||
|
|
||||||
- dots_ocr (<gh-pr:25466>)
|
- dots_ocr (<https://github.com/vllm-project/vllm/pull/25466>)
|
||||||
- GLM-4.1V or above (<gh-pr:23168>)
|
- GLM-4.1V or above (<https://github.com/vllm-project/vllm/pull/23168>)
|
||||||
- InternVL (<gh-pr:23909>)
|
- InternVL (<https://github.com/vllm-project/vllm/pull/23909>)
|
||||||
- Kimi-VL (<gh-pr:23817>)
|
- Kimi-VL (<https://github.com/vllm-project/vllm/pull/23817>)
|
||||||
- Llama4 (<gh-pr:18368>)
|
- Llama4 (<https://github.com/vllm-project/vllm/pull/18368>)
|
||||||
- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
|
- MiniCPM-V-2.5 or above (<https://github.com/vllm-project/vllm/pull/23327>, <https://github.com/vllm-project/vllm/pull/23948>)
|
||||||
- Qwen2-VL or above (<gh-pr:22742>, <gh-pr:24955>, <gh-pr:25445>)
|
- Qwen2-VL or above (<https://github.com/vllm-project/vllm/pull/22742>, <https://github.com/vllm-project/vllm/pull/24955>, <https://github.com/vllm-project/vllm/pull/25445>)
|
||||||
- Step3 (<gh-pr:22697>)
|
- Step3 (<https://github.com/vllm-project/vllm/pull/22697>)
|
||||||
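As a hedged sketch of the engine argument mentioned above, enabling data-parallel processing for the multi-modal encoder of one of the models listed might look like this (the model name and parallel sizes are placeholders, not recommendations):

```python
# Illustrative sketch only: the mm_encoder_tp_mode="data" engine argument
# enables DP for the vision encoder of a supported multi-modal model.
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2-VL-7B-Instruct",  # placeholder; any model from the list above
    tensor_parallel_size=2,
    mm_encoder_tp_mode="data",
)
```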
|
|
||||||
## Input Processing
|
## Input Processing
|
||||||
|
|
||||||
|
|||||||
@ -96,7 +96,7 @@ Although it’s common to do this with GPUs, don't try to fragment 2 or 8 differ
|
|||||||
|
|
||||||
### Tune your workloads
|
### Tune your workloads
|
||||||
|
|
||||||
Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](gh-file:benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
|
Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
|
||||||
|
|
||||||
### Future Topics We'll Cover
|
### Future Topics We'll Cover
|
||||||
|
|
||||||
|
|||||||
@ -22,7 +22,7 @@ Unsure on where to start? Check out the following links for tasks to work on:
|
|||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
See <gh-file:LICENSE>.
|
See [LICENSE](../../LICENSE).
|
||||||
|
|
||||||
## Developing
|
## Developing
|
||||||
|
|
||||||
@ -54,7 +54,7 @@ For more details about installing from source and installing for other hardware,
|
|||||||
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
|
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
|
||||||
|
|
||||||
!!! tip
|
!!! tip
|
||||||
vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
|
vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](../../docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
|
||||||
|
|
||||||
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
|
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
|
||||||
|
|
||||||
@ -88,7 +88,7 @@ vLLM's `pre-commit` hooks will now run automatically every time you commit.
|
|||||||
|
|
||||||
### Documentation
|
### Documentation
|
||||||
|
|
||||||
MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, <gh-file:mkdocs.yaml>.
|
MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, [mkdocs.yaml](../../mkdocs.yaml).
|
||||||
|
|
||||||
Get started with:
|
Get started with:
|
||||||
|
|
||||||
@ -152,7 +152,7 @@ pytest -s -v tests/test_logger.py
|
|||||||
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
|
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
|
||||||
|
|
||||||
!!! important
|
!!! important
|
||||||
If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
|
If you discover a security vulnerability, please follow the instructions [here](../../SECURITY.md).
|
||||||
|
|
||||||
## Pull Requests & Code Reviews
|
## Pull Requests & Code Reviews
|
||||||
|
|
||||||
@ -162,7 +162,7 @@ code quality and improve the efficiency of the review process.
|
|||||||
|
|
||||||
### DCO and Signed-off-by
|
### DCO and Signed-off-by
|
||||||
|
|
||||||
When contributing changes to this project, you must agree to the <gh-file:DCO>.
|
When contributing changes to this project, you must agree to the [DCO](../../DCO).
|
||||||
Commits must include a `Signed-off-by:` header which certifies agreement with
|
Commits must include a `Signed-off-by:` header which certifies agreement with
|
||||||
the terms of the DCO.
|
the terms of the DCO.
|
||||||
|
|
||||||
|
|||||||
@ -6,9 +6,10 @@ toc_depth: 4
|
|||||||
|
|
||||||
vLLM provides comprehensive benchmarking tools for performance testing and evaluation:
|
vLLM provides comprehensive benchmarking tools for performance testing and evaluation:
|
||||||
|
|
||||||
- **[Benchmark CLI]**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
|
- **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
|
||||||
- **[Performance benchmarks][performance-benchmarks]**: Automated CI benchmarks for development
|
- **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations
|
||||||
- **[Nightly benchmarks][nightly-benchmarks]**: Comparative benchmarks against alternatives
|
- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development
|
||||||
|
- **[Nightly benchmarks](#nightly-benchmarks)**: Comparative benchmarks against alternatives
|
||||||
|
|
||||||
[Benchmark CLI]: #benchmark-cli
|
[Benchmark CLI]: #benchmark-cli
|
||||||
|
|
||||||
@ -29,7 +30,7 @@ th {
|
|||||||
| Dataset | Online | Offline | Data Path |
|
| Dataset | Online | Offline | Data Path |
|
||||||
|---------|--------|---------|-----------|
|
|---------|--------|---------|-----------|
|
||||||
| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
|
| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
|
||||||
| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
|
| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
|
||||||
| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
|
| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
|
||||||
| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
|
| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
|
||||||
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
|
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
|
||||||
@ -714,7 +715,7 @@ Generate synthetic image inputs alongside random text prompts to stress-test vis
|
|||||||
|
|
||||||
Notes:
|
Notes:
|
||||||
|
|
||||||
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
|
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
|
||||||
- Video sampling is not yet implemented.
|
- Video sampling is not yet implemented.
|
||||||
|
|
||||||
Start the server (example):
|
Start the server (example):
|
||||||
@ -822,7 +823,7 @@ you should set `--endpoint /v1/embeddings` to use the Embeddings API. The backen
|
|||||||
- CLIP: `--backend openai-embeddings-clip`
|
- CLIP: `--backend openai-embeddings-clip`
|
||||||
- VLM2Vec: `--backend openai-embeddings-vlm2vec`
|
- VLM2Vec: `--backend openai-embeddings-vlm2vec`
|
||||||
|
|
||||||
For other models, please add your own implementation inside <gh-file:vllm/benchmarks/lib/endpoint_request_func.py> to match the expected instruction format.
|
For other models, please add your own implementation inside [vllm/benchmarks/lib/endpoint_request_func.py](../../vllm/benchmarks/lib/endpoint_request_func.py) to match the expected instruction format.
|
||||||
|
|
||||||
You can use any text or multi-modal dataset to benchmark the model, as long as the model supports it.
|
You can use any text or multi-modal dataset to benchmark the model, as long as the model supports it.
|
||||||
For example, you can use ShareGPT and VisionArena to benchmark vision-language embeddings.
|
For example, you can use ShareGPT and VisionArena to benchmark vision-language embeddings.
|
||||||
@ -924,7 +925,162 @@ throughput numbers correctly is also adjusted.
|
|||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
[](){ #performance-benchmarks }
|
## Parameter Sweeps
|
||||||
|
|
||||||
|
### Online Benchmark
|
||||||
|
|
||||||
|
[`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) automatically starts `vllm serve` and runs `vllm bench serve` to evaluate vLLM over multiple configurations.
|
||||||
|
|
||||||
|
Follow these steps to run the script:
|
||||||
|
|
||||||
|
1. Construct the base command to `vllm serve`, and pass it to the `--serve-cmd` option.
|
||||||
|
2. Construct the base command to `vllm bench serve`, and pass it to the `--bench-cmd` option.
|
||||||
|
3. (Optional) If you would like to vary the settings of `vllm serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--serve-params`.
|
||||||
|
|
||||||
|
- Example: Tuning `--max-num-seqs` and `--max-num-batched-tokens`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"max_num_seqs": 32,
|
||||||
|
"max_num_batched_tokens": 1024
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"max_num_seqs": 64,
|
||||||
|
"max_num_batched_tokens": 1024
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"max_num_seqs": 64,
|
||||||
|
"max_num_batched_tokens": 2048
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"max_num_seqs": 128,
|
||||||
|
"max_num_batched_tokens": 2048
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"max_num_seqs": 128,
|
||||||
|
"max_num_batched_tokens": 4096
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"max_num_batched_tokens": 4096
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
4. (Optional) If you would like to vary the settings of `vllm bench serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--bench-params`.
|
||||||
|
|
||||||
|
- Example: Using different input/output lengths for random dataset:
|
||||||
|
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"random_input_len": 128,
|
||||||
|
"random_output_len": 32
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"random_input_len": 256,
|
||||||
|
"random_output_len": 64
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"random_input_len": 512,
|
||||||
|
"random_output_len": 128
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Determine where you want to save the results, and pass that to `--output-dir`.
|
||||||
|
|
||||||
|
Example command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m vllm.benchmarks.sweep.serve \
|
||||||
|
--serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
|
||||||
|
--bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
|
||||||
|
--serve-params benchmarks/serve_hparams.json \
|
||||||
|
--bench-params benchmarks/bench_hparams.json \
|
||||||
|
-o benchmarks/results
|
||||||
|
```
|
||||||
|
|
||||||
|
!!! important
|
||||||
|
If both `--serve-params` and `--bench-params` are passed, the script will iterate over the Cartesian product between them.
|
||||||
|
You can use `--dry-run` to preview the commands to be run.
|
||||||
|
|
||||||
|
We only start the server once for each `--serve-params`, and keep it running for multiple `--bench-params`.
|
||||||
|
Between each benchmark run, we call the `/reset_prefix_cache` and `/reset_mm_cache` endpoints to get a clean slate for the next run.
|
||||||
|
In case you are using a custom `--serve-cmd`, you can override the commands used for resetting the state by setting `--after-bench-cmd`.
|
||||||
|
|
||||||
|
!!! note
|
||||||
|
By default, each parameter combination is run 3 times to make the results more reliable. You can adjust the number of runs by setting `--num-runs`.
|
||||||
|
|
||||||
|
!!! tip
|
||||||
|
You can use the `--resume` option to continue the parameter sweep if one of the runs failed.
|
||||||
|
|
||||||
|
### SLA Auto-Tuner
|
||||||
|
|
||||||
|
[`vllm/benchmarks/sweep/serve_sla.py`](../../vllm/benchmarks/sweep/serve_sla.py) is a wrapper over [`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) that tunes either the request rate or concurrency (choose using `--sla-variable`) in order to satisfy the SLA constraints given by `--sla-params`.
|
||||||
|
|
||||||
|
For example, to ensure E2E latency within different target values for 99% of requests:
|
||||||
|
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"p99_e2el_ms": "<=200"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"p99_e2el_ms": "<=500"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"p99_e2el_ms": "<=1000"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"p99_e2el_ms": "<=2000"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
Example command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m vllm.benchmarks.sweep.serve_sla \
|
||||||
|
--serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
|
||||||
|
--bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
|
||||||
|
--serve-params benchmarks/serve_hparams.json \
|
||||||
|
--bench-params benchmarks/bench_hparams.json \
|
||||||
|
--sla-params benchmarks/sla_hparams.json \
|
||||||
|
--sla-variable max_concurrency \
|
||||||
|
-o benchmarks/results
|
||||||
|
```
|
||||||
|
|
||||||
|
The algorithm for adjusting the SLA variable is as follows (see the sketch after this list):
|
||||||
|
|
||||||
|
1. Run the benchmark with infinite QPS, and use the corresponding metrics to determine the initial value of the variable.
|
||||||
|
- For example, the initial request rate is set to the concurrency under infinite QPS.
|
||||||
|
2. If the SLA is still satisfied, keep doubling the value until the SLA is no longer satisfied. This gives a relatively narrow window that contains the point where the SLA is barely satisfied.
|
||||||
|
3. Apply binary search over the window to find the maximum value that still satisfies the SLA.
|
||||||
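The doubling-and-bisection procedure above can be sketched in plain Python as follows. This is a simplified illustration rather than the actual `serve_sla.py` implementation; `run_benchmark_and_check_sla` is a hypothetical callable standing in for a full `vllm bench serve` run followed by evaluation of the SLA constraints.

```python
def find_max_sla_value(run_benchmark_and_check_sla, initial_value: int) -> int:
    """Return the largest SLA-variable value (e.g. max concurrency) that still meets the SLA.

    run_benchmark_and_check_sla(value) is a hypothetical helper that runs one
    benchmark at the given value and returns True if every SLA constraint is met.
    """
    if not run_benchmark_and_check_sla(initial_value):
        return 0  # even the initial value violates the SLA

    # Step 2: keep doubling until the SLA is violated, which yields a window
    # (lo, hi) containing the point where the SLA is barely satisfied.
    lo, hi = initial_value, initial_value * 2
    while run_benchmark_and_check_sla(hi):
        lo, hi = hi, hi * 2

    # Step 3: binary search inside the window for the largest passing value.
    while hi - lo > 1:
        mid = (lo + hi) // 2
        if run_benchmark_and_check_sla(mid):
            lo = mid
        else:
            hi = mid
    return lo
```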
|
|
||||||
|
!!! important
|
||||||
|
SLA tuning is applied over each combination of `--serve-params`, `--bench-params`, and `--sla-params`.
|
||||||
|
|
||||||
|
For a given combination of `--serve-params` and `--bench-params`, we share the benchmark results across `--sla-params` to avoid rerunning benchmarks with the same SLA variable value.
|
||||||
|
|
||||||
|
### Visualizer
|
||||||
|
|
||||||
|
[`vllm/benchmarks/sweep/plot.py`](../../vllm/benchmarks/sweep/plot.py) can be used to plot performance curves from parameter sweep results.
|
||||||
|
|
||||||
|
Example command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m vllm.benchmarks.sweep.plot benchmarks/results/<timestamp> \
|
||||||
|
--var-x max_concurrency \
|
||||||
|
--row-by random_input_len \
|
||||||
|
--col-by random_output_len \
|
||||||
|
--curve-by api_server_count,max_num_batched_tokens \
|
||||||
|
--filter-by 'max_concurrency<=1024'
|
||||||
|
```
|
||||||
|
|
||||||
|
!!! tip
|
||||||
|
You can use `--dry-run` to preview the figures to be plotted.
|
||||||
|
|
||||||
## Performance Benchmarks
|
## Performance Benchmarks
|
||||||
|
|
||||||
@ -962,7 +1118,7 @@ For more results visualization, check the [visualizing the results](https://gith
|
|||||||
|
|
||||||
The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
||||||
|
|
||||||
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
|
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
|
||||||
|
|
||||||
### Continuous Benchmarking
|
### Continuous Benchmarking
|
||||||
|
|
||||||
@ -988,12 +1144,10 @@ The benchmarking currently runs on a predefined set of models configured in the
|
|||||||
|
|
||||||
All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
||||||
|
|
||||||
[](){ #nightly-benchmarks }
|
|
||||||
|
|
||||||
## Nightly Benchmarks
|
## Nightly Benchmarks
|
||||||
|
|
||||||
These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels.
|
These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels.
|
||||||
|
|
||||||
The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html).
|
The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html).
|
||||||
|
|
||||||
More information on the nightly benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/nightly-descriptions.md).
|
More information on the nightly benchmarks and their parameters can be found [here](../../.buildkite/nightly-benchmarks/nightly-descriptions.md).
|
||||||
|
|||||||
@ -64,7 +64,7 @@ Download the full log file from Buildkite locally.
|
|||||||
|
|
||||||
Strip timestamps and colorization:
|
Strip timestamps and colorization:
|
||||||
|
|
||||||
<gh-file:.buildkite/scripts/ci-clean-log.sh>
|
[.buildkite/scripts/ci-clean-log.sh](../../../.buildkite/scripts/ci-clean-log.sh)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./ci-clean-log.sh ci.log
|
./ci-clean-log.sh ci.log
|
||||||
@ -87,7 +87,7 @@ tail -525 ci_build.log | wl-copy
|
|||||||
|
|
||||||
CI test failures may be flaky. Use a bash loop to run repeatedly:
|
CI test failures may be flaky. Use a bash loop to run repeatedly:
|
||||||
|
|
||||||
<gh-file:.buildkite/scripts/rerun-test.sh>
|
[.buildkite/scripts/rerun-test.sh](../../../.buildkite/scripts/rerun-test.sh)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]
|
./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]
|
||||||
|
|||||||
@ -5,7 +5,7 @@ release in CI/CD. It is standard practice to submit a PR to update the
|
|||||||
PyTorch version as early as possible when a new [PyTorch stable
|
PyTorch version as early as possible when a new [PyTorch stable
|
||||||
release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available.
|
release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available.
|
||||||
This process is non-trivial due to the gap between PyTorch
|
This process is non-trivial due to the gap between PyTorch
|
||||||
releases. Using <gh-pr:16859> as an example, this document outlines common steps to achieve this
|
releases. Using <https://github.com/vllm-project/vllm/pull/16859> as an example, this document outlines common steps to achieve this
|
||||||
update along with a list of potential issues and how to address them.
|
update along with a list of potential issues and how to address them.
|
||||||
|
|
||||||
## Test PyTorch release candidates (RCs)
|
## Test PyTorch release candidates (RCs)
|
||||||
@ -85,9 +85,9 @@ and timeout. Additionally, since vLLM's fastcheck pipeline runs in read-only mod
|
|||||||
it doesn't populate the cache, so re-running it to warm up the cache
|
it doesn't populate the cache, so re-running it to warm up the cache
|
||||||
is ineffective.
|
is ineffective.
|
||||||
|
|
||||||
While ongoing efforts like [#17419](gh-issue:17419)
|
While ongoing efforts like <https://github.com/vllm-project/vllm/issues/17419>
|
||||||
address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
|
address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
|
||||||
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
|
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/long_build`)
|
||||||
when manually triggering a build on Buildkite. This branch accomplishes two things:
|
when manually triggering a build on Buildkite. This branch accomplishes two things:
|
||||||
|
|
||||||
1. Increase the timeout limit to 10 hours so that the build doesn't time out.
|
1. Increase the timeout limit to 10 hours so that the build doesn't time out.
|
||||||
@ -100,35 +100,17 @@ to warm it up so that future builds are faster.
|
|||||||
|
|
||||||
## Update dependencies
|
## Update dependencies
|
||||||
|
|
||||||
Several vLLM dependencies, such as FlashInfer, also depend on PyTorch and need
|
Several vLLM dependencies like xFormers depend on PyTorch and need
|
||||||
to be updated accordingly. Rather than waiting for all of them to publish new
|
to be updated accordingly. Rather than waiting for all of them to publish new
|
||||||
releases (which would take too much time), they can be built from
|
releases (which would take too much time), they can be built from
|
||||||
source to unblock the update process.
|
source to unblock the update process.
|
||||||
|
|
||||||
### FlashInfer
|
|
||||||
|
|
||||||
Here is how to build and install it from source with `torch2.7.0+cu128` in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271):
|
|
||||||
|
|
||||||
```bash
|
|
||||||
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
|
|
||||||
export FLASHINFER_ENABLE_SM90=1
|
|
||||||
uv pip install --system \
|
|
||||||
--no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"
|
|
||||||
```
|
|
||||||
|
|
||||||
One caveat is that building FlashInfer from source adds approximately 30
|
|
||||||
minutes to the vLLM build time. Therefore, it's preferable to cache the wheel in a
|
|
||||||
public location for immediate installation, such as [this FlashInfer wheel link](https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl). For future releases, contact the PyTorch release
|
|
||||||
team if you want to get the package published there.
|
|
||||||
|
|
||||||
### xFormers
|
### xFormers
|
||||||
|
|
||||||
Similar to FlashInfer, here is how to build and install xFormers from source:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
|
export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
|
||||||
MAX_JOBS=16 uv pip install --system \
|
MAX_JOBS=16 uv pip install --system \
|
||||||
--no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
|
--no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32.post2"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Update all the different vLLM platforms
|
## Update all the different vLLM platforms
|
||||||
@ -138,5 +120,5 @@ to handle some platforms separately. The separation of requirements and Dockerfi
|
|||||||
for different platforms in vLLM CI/CD allows us to selectively choose
|
for different platforms in vLLM CI/CD allows us to selectively choose
|
||||||
which platforms to update. For instance, updating XPU requires the corresponding
|
which platforms to update. For instance, updating XPU requires the corresponding
|
||||||
release from [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch) by Intel.
|
release from [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch) by Intel.
|
||||||
While <gh-pr:16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
|
While <https://github.com/vllm-project/vllm/pull/16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
|
||||||
<gh-pr:17444> completed the update for XPU.
|
<https://github.com/vllm-project/vllm/pull/17444> completed the update for XPU.
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
# Dockerfile
|
# Dockerfile
|
||||||
|
|
||||||
We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
|
We provide a [docker/Dockerfile](../../../docker/Dockerfile) to construct the image for running an OpenAI compatible server with vLLM.
|
||||||
More information about deploying with Docker can be found [here](../../deployment/docker.md).
|
More information about deploying with Docker can be found [here](../../deployment/docker.md).
|
||||||
|
|
||||||
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
|
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
# Summary
|
# Summary
|
||||||
|
|
||||||
!!! important
|
!!! important
|
||||||
Many decoder language models can now be automatically loaded using the [Transformers backend][transformers-backend] without having to implement them in vLLM. See if `vllm serve <model>` works first!
|
Many decoder language models can now be automatically loaded using the [Transformers backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first!
|
||||||
|
|
||||||
vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance.
|
vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance.
|
||||||
|
|
||||||
|
|||||||
@ -5,7 +5,7 @@ This guide walks you through the steps to implement a basic vLLM model.
|
|||||||
## 1. Bring your model code
|
## 1. Bring your model code
|
||||||
|
|
||||||
First, clone the PyTorch model code from the source repository.
|
First, clone the PyTorch model code from the source repository.
|
||||||
For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
|
For instance, vLLM's [OPT model](../../../vllm/model_executor/models/opt.py) was adapted from
|
||||||
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
|
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
|
||||||
|
|
||||||
!!! warning
|
!!! warning
|
||||||
@ -83,7 +83,7 @@ def forward(
|
|||||||
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
|
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
|
||||||
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
|
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
|
||||||
|
|
||||||
For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
|
For reference, check out our [Llama implementation](../../../vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out [vllm/model_executor/models](../../../vllm/model_executor/models) for more examples.
|
||||||
|
|
||||||
## 3. (Optional) Implement tensor parallelism and quantization support
|
## 3. (Optional) Implement tensor parallelism and quantization support
|
||||||
|
|
||||||
@ -130,22 +130,22 @@ We consider 3 different scenarios:
|
|||||||
2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers.
|
2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers.
|
||||||
3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers.
|
3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers.
|
||||||
|
|
||||||
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
|
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](../../../vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](../../../vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
|
||||||
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
|
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
|
||||||
For the mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
|
For the mamba layers themselves, please use the [`MambaMixer`](../../../vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](../../../vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
|
||||||
Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
|
Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
|
||||||
V0-only classes and code will be removed in the very near future.
|
V0-only classes and code will be removed in the very near future.
|
||||||
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in <gh-file:vllm/model_executor/models/config.py> to ensure that the runtime defaults are optimized.
|
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in [vllm/model_executor/models/config.py](../../../vllm/model_executor/models/config.py) to ensure that the runtime defaults are optimized.
|
||||||
|
|
||||||
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
|
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](../../../vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](../../../vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
|
||||||
These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).
|
These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).
|
||||||
|
|
||||||
For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](gh-file:vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](gh-file:vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
|
For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](../../../vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](../../../vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
|
||||||
Please follow the same guidelines as case (2) for implementing these models.
|
Please follow the same guidelines as case (2) for implementing these models.
|
||||||
We use "mamba-like" to refer to layers that posses a state that is updated in-place, rather than being appended-to (like KV cache for attention).
|
We use "mamba-like" to refer to layers that posses a state that is updated in-place, rather than being appended-to (like KV cache for attention).
|
||||||
For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
|
For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
|
||||||
It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
|
It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
|
||||||
Please see [`LinearAttentionMetadata`](gh-file:vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](gh-file:v1/attention/backends/short_conv_attn.py) for examples of this.
|
Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
|
||||||
Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it.
|
Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it.
|
||||||
Please see the calls to `direct_register_custom_op` in <gh-file:vllm/model_executor/models/minimax_text_01.py> or <gh-file:vllm/model_executor/layers/mamba/short_conv.py> for examples of this.
|
Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this.
|
||||||
The new custom op should then be added to the list `_attention_ops` in <gh-file:vllm/config/compilation.py> to ensure that piecewise CUDA graphs works as intended.
|
The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs works as intended.
|
||||||
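To make the required hooks concrete, the following is a plain-Python schematic of a custom mamba-like layer. It intentionally avoids real vLLM imports: in actual code the layer would inherit from vLLM's `MambaBase`, the metadata class would live in an attention backend, and the exact signatures may differ from this sketch.

```python
# Schematic only: shapes of the hooks described above, with no vLLM imports.
from dataclasses import dataclass

import torch


@dataclass
class MyMambaLikeMetadata:
    """Stand-in for the per-batch "attention meta-data" shared across layers."""
    num_prefill_tokens: int
    num_decode_tokens: int
    state_indices: torch.Tensor  # maps requests to slots in the in-place state


class MyMambaLikeLayer(torch.nn.Module):
    """Schematic custom layer; real implementations inherit from vLLM's MambaBase."""

    def __init__(self, hidden_size: int, state_size: int):
        super().__init__()
        self.hidden_size = hidden_size
        self.state_size = state_size
        self.proj = torch.nn.Linear(hidden_size, hidden_size)

    # Counterparts of get_state_dtype / get_state_shape described above:
    def get_state_dtype(self) -> torch.dtype:
        return torch.float32

    def get_state_shape(self) -> tuple[int, ...]:
        return (self.hidden_size, self.state_size)

    # Counterparts of mamba_type / get_attn_backend:
    @property
    def mamba_type(self) -> str:
        return "my_mamba_like"

    def get_attn_backend(self):
        # In vLLM this would return the attention backend that provides
        # the metadata class; here we just return the stand-in type.
        return MyMambaLikeMetadata

    def forward(self, x: torch.Tensor, state: torch.Tensor,
                metadata: MyMambaLikeMetadata) -> torch.Tensor:
        # The defining property of a "mamba-like" layer: the state tensor is
        # updated in-place instead of being appended to like a KV cache.
        state.mul_(0.95).add_(0.05 * x.mean())
        return self.proj(x)
```

In real vLLM code, the call into such a layer would additionally be wrapped in a custom op registered via `direct_register_custom_op`, with the op name added to `_attention_ops`, as described above.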
|
|||||||
@ -507,7 +507,7 @@ return a schema of the tensors outputted by the HF processor that are related to
|
|||||||
```
|
```
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
|
Our [actual code](../../../vllm/model_executor/models/llava.py) additionally supports
|
||||||
pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
|
pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
|
||||||
|
|
||||||
=== "With postprocessing: Fuyu"
|
=== "With postprocessing: Fuyu"
|
||||||
@ -569,7 +569,7 @@ return a schema of the tensors outputted by the HF processor that are related to
|
|||||||
```
|
```
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
|
Our [actual code](../../../vllm/model_executor/models/fuyu.py) has special handling
|
||||||
for text-only inputs to prevent unnecessary warnings from HF processor.
|
for text-only inputs to prevent unnecessary warnings from HF processor.
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
@ -828,8 +828,8 @@ Some HF processors directly insert feature tokens without replacing anything in
|
|||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
|
- BLIP-2 (insert at start of prompt): [vllm/model_executor/models/blip2.py](../../../vllm/model_executor/models/blip2.py)
|
||||||
- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
|
- Molmo (insert after `<|endoftext|>` token): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)
|
||||||
|
|
||||||
### Handling prompt updates unrelated to multi-modal data
|
### Handling prompt updates unrelated to multi-modal data
|
||||||
|
|
||||||
@ -837,9 +837,9 @@ Examples:
|
|||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py>
|
- Chameleon (appends `sep_token`): [vllm/model_executor/models/chameleon.py](../../../vllm/model_executor/models/chameleon.py)
|
||||||
- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py>
|
- Fuyu (appends `boa_token`): [vllm/model_executor/models/fuyu.py](../../../vllm/model_executor/models/fuyu.py)
|
||||||
- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py>
|
- Molmo (applies chat template which is not defined elsewhere): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)
|
||||||
|
|
||||||
### Custom HF processor
|
### Custom HF processor
|
||||||
|
|
||||||
@ -847,6 +847,6 @@ Some models don't define an HF processor class on HF Hub. In that case, you can
|
|||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py>
|
- DeepSeek-VL2: [vllm/model_executor/models/deepseek_vl2.py](../../../vllm/model_executor/models/deepseek_vl2.py)
|
||||||
- InternVL: <gh-file:vllm/model_executor/models/internvl.py>
|
- InternVL: [vllm/model_executor/models/internvl.py](../../../vllm/model_executor/models/internvl.py)
|
||||||
- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py>
|
- Qwen-VL: [vllm/model_executor/models/qwen_vl.py](../../../vllm/model_executor/models/qwen_vl.py)
|
||||||
|
|||||||
@ -8,11 +8,11 @@ This page provides detailed instructions on how to do so.
|
|||||||
|
|
||||||
## Built-in models
|
## Built-in models
|
||||||
|
|
||||||
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source].
|
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](../../getting_started/installation/gpu.md#build-wheel-from-source).
|
||||||
This gives you the ability to modify the codebase and test your model.
|
This gives you the ability to modify the codebase and test your model.
|
||||||
|
|
||||||
After you have implemented your model (see [tutorial](basic.md)), put it into the <gh-dir:vllm/model_executor/models> directory.
|
After you have implemented your model (see [tutorial](basic.md)), put it into the [vllm/model_executor/models](../../../vllm/model_executor/models) directory.
|
||||||
Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
|
Then, add your model class to `_VLLM_MODELS` in [vllm/model_executor/models/registry.py](../../../vllm/model_executor/models/registry.py) so that it is automatically registered upon importing vLLM.
|
||||||
Finally, update our [list of supported models](../../models/supported_models.md) to promote your model!
|
Finally, update our [list of supported models](../../models/supported_models.md) to promote your model!
|
||||||
|
|
||||||
!!! important
|
!!! important
|
||||||
|
|||||||
@ -9,7 +9,7 @@ Without them, the CI for your PR will fail.
|
|||||||
|
|
||||||
### Model loading
|
### Model loading
|
||||||
|
|
||||||
Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>.
|
Include an example HuggingFace repository for your model in [tests/models/registry.py](../../../tests/models/registry.py).
|
||||||
This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
|
This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
|
||||||
|
|
||||||
!!! important
|
!!! important
|
||||||
@ -26,26 +26,24 @@ Passing these tests provides more confidence that your implementation is correct
|
|||||||
|
|
||||||
### Model correctness
|
### Model correctness
|
||||||
|
|
||||||
These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of <gh-dir:tests/models>.
|
These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of [tests/models](../../../tests/models).
|
||||||
|
|
||||||
#### Generative models
|
#### Generative models
|
||||||
|
|
||||||
For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>:
|
For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in [tests/models/utils.py](../../../tests/models/utils.py):
|
||||||
|
|
||||||
- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
|
- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
|
||||||
- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa.
|
- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa.
|
||||||
|
|
||||||
#### Pooling models
|
#### Pooling models
|
||||||
|
|
||||||
For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in <gh-file:tests/models/utils.py>.
|
For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in [tests/models/utils.py](../../../tests/models/utils.py).
|
||||||
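As a rough sketch (not the exact helper defined in `tests/models/utils.py`, and with an assumed similarity threshold), the cosine-similarity check amounts to something like:

```python
import torch
import torch.nn.functional as F


def assert_embeddings_close(vllm_emb: torch.Tensor, hf_emb: torch.Tensor,
                            min_cosine: float = 0.99) -> None:
    # Compare pooled embeddings produced by vLLM and HF for the same inputs.
    sim = F.cosine_similarity(vllm_emb, hf_emb, dim=-1)
    assert sim.min().item() >= min_cosine, f"cosine similarity too low: {sim.min().item():.4f}"
```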
|
|
||||||
[](){ #mm-processing-tests }
|
|
||||||
|
|
||||||
### Multi-modal processing
|
### Multi-modal processing
|
||||||
|
|
||||||
#### Common tests
|
#### Common tests
|
||||||
|
|
||||||
Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py> verifies that the following input combinations result in the same outputs:
|
Adding your model to [tests/models/multimodal/processing/test_common.py](../../../tests/models/multimodal/processing/test_common.py) verifies that the following input combinations result in the same outputs:
|
||||||
|
|
||||||
- Text + multi-modal data
|
- Text + multi-modal data
|
||||||
- Tokens + multi-modal data
|
- Tokens + multi-modal data
|
||||||
@ -54,6 +52,6 @@ Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py>
|
|||||||
|
|
||||||
#### Model-specific tests
|
#### Model-specific tests
|
||||||
|
|
||||||
You can add a new file under <gh-dir:tests/models/multimodal/processing> to run tests that only apply to your model.
|
You can add a new file under [tests/models/multimodal/processing](../../../tests/models/multimodal/processing) to run tests that only apply to your model.
|
||||||
|
|
||||||
For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in <gh-file:tests/models/multimodal/processing/test_phi3v.py>.
|
For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in [tests/models/multimodal/processing/test_phi3v.py](../../../tests/models/multimodal/processing/test_phi3v.py).
|
||||||
|
|||||||
@ -248,9 +248,9 @@ No extra registration is required beyond having your model class available via t
|
|||||||
|
|
||||||
## Examples in-tree
|
## Examples in-tree
|
||||||
|
|
||||||
- Whisper encoder–decoder (audio-only): <gh-file:vllm/model_executor/models/whisper.py>
|
- Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
|
||||||
- Voxtral decoder-only (audio embeddings + LLM): <gh-file:vllm/model_executor/models/voxtral.py>
|
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py)
|
||||||
- Gemma3n decoder-only with fixed instruction prompt: <gh-file:vllm/model_executor/models/gemma3n_mm.py>
|
- Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
|
||||||
|
|
||||||
## Test with the API
|
## Test with the API
|
||||||
|
|
||||||
@ -278,7 +278,7 @@ Once your model implements `SupportsTranscription`, you can test the endpoints (
|
|||||||
http://localhost:8000/v1/audio/translations
|
http://localhost:8000/v1/audio/translations
|
||||||
```
|
```
|
||||||
|
|
||||||
Or check out more examples in <gh-file:examples/online_serving>.
|
Or check out more examples in [examples/online_serving](../../../examples/online_serving).
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
- If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.
|
- If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.
|
||||||
|
|||||||
@ -33,7 +33,7 @@ Traces can be visualized using <https://ui.perfetto.dev/>.
|
|||||||
|
|
||||||
#### Offline Inference
|
#### Offline Inference
|
||||||
|
|
||||||
Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.
|
Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline_inference/simple_profiling.py) for an example.
|
||||||
|
|
||||||
#### OpenAI Server
|
#### OpenAI Server
|
||||||
|
|
||||||
@ -180,9 +180,13 @@ The profiling traces generated by the continuous profiling workflow are publicly
|
|||||||
The Python standard library includes
|
The Python standard library includes
|
||||||
[cProfile](https://docs.python.org/3/library/profile.html) for profiling Python
|
[cProfile](https://docs.python.org/3/library/profile.html) for profiling Python
|
||||||
code. vLLM includes a couple of helpers that make it easy to apply it to a section of the vLLM codebase.
|
code. vLLM includes a couple of helpers that make it easy to apply it to a section of the vLLM codebase.
|
||||||
Both the `vllm.utils.cprofile` and `vllm.utils.cprofile_context` functions can be
|
Both the `vllm.utils.profiling.cprofile` and `vllm.utils.profiling.cprofile_context` functions can be
|
||||||
used to profile a section of code.
|
used to profile a section of code.
|
||||||
|
|
||||||
|
!!! note
|
||||||
|
The legacy import paths `vllm.utils.cprofile` and `vllm.utils.cprofile_context` are deprecated.
|
||||||
|
Please use `vllm.utils.profiling.cprofile` and `vllm.utils.profiling.cprofile_context` instead.
|
||||||
|
|
||||||
### Example usage - decorator
|
### Example usage - decorator
|
||||||
|
|
||||||
The first helper is a Python decorator that can be used to profile a function.
|
The first helper is a Python decorator that can be used to profile a function.
|
||||||
@ -190,9 +194,9 @@ If a filename is specified, the profile will be saved to that file. If no filena
|
|||||||
specified, profile data will be printed to stdout.
|
specified, profile data will be printed to stdout.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import vllm.utils
|
from vllm.utils.profiling import cprofile
|
||||||
|
|
||||||
@vllm.utils.cprofile("expensive_function.prof")
|
@cprofile("expensive_function.prof")
|
||||||
def expensive_function():
|
def expensive_function():
|
||||||
# some expensive code
|
# some expensive code
|
||||||
pass
|
pass
|
||||||
@ -204,13 +208,13 @@ The second helper is a context manager that can be used to profile a block of
|
|||||||
code. Similar to the decorator, the filename is optional.
|
code. Similar to the decorator, the filename is optional.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import vllm.utils
|
from vllm.utils.profiling import cprofile_context
|
||||||
|
|
||||||
def another_function():
|
def another_function():
|
||||||
# more expensive code
|
# more expensive code
|
||||||
pass
|
pass
|
||||||
|
|
||||||
with vllm.utils.cprofile_context("another_function.prof"):
|
with cprofile_context("another_function.prof"):
|
||||||
another_function()
|
another_function()
|
||||||
```
|
```
|
||||||
|
|
||||||

@@ -1,7 +1,5 @@
# Using Docker

## Use vLLM's Official Docker Image

vLLM offers an official Docker image for deployment.

@@ -10,7 +8,7 @@ The image can be used to run OpenAI compatible server and is available on Docker

```bash
docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=$HF_TOKEN" \
    -p 8000:8000 \
    --ipc=host \
    vllm/vllm-openai:latest \
```

@@ -22,7 +20,7 @@ This image can also be used with other container engines such as [Podman](https:

```bash
podman run --device nvidia.com/gpu=all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=$HF_TOKEN" \
    -p 8000:8000 \
    --ipc=host \
    docker.io/vllm/vllm-openai:latest \
```

@@ -37,7 +35,7 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
memory to share data between processes under the hood, particularly for tensor parallel inference.

!!! note
    Optional dependencies are not included in order to avoid licensing issues (e.g. <https://github.com/vllm-project/vllm/issues/8030>).

    If you need to use those dependencies (having accepted the license terms),
    create a custom Dockerfile on top of the base image with an extra layer that installs them:

@@ -62,11 +60,9 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af

```
RUN uv pip install --system git+https://github.com/huggingface/transformers.git
```

## Building vLLM's Docker Image from Source

You can build and run vLLM from source via the provided [docker/Dockerfile](../../docker/Dockerfile). To build vLLM:

```bash
# optionally specify: --build-arg max_jobs=8 --build-arg nvcc_threads=2
```

@@ -128,7 +124,7 @@ To run vLLM with the custom-built Docker image:

```bash
docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    -p 8000:8000 \
    --env "HF_TOKEN=<secret>" \
    vllm/vllm-openai <args...>
```

@@ -1,11 +1,9 @@
# Anyscale

[Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.

Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like [examples/online_serving/run_cluster.sh](../../../examples/online_serving/run_cluster.sh).

When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm).

@@ -35,7 +35,7 @@ Deploy the following yaml file `lws.yaml`

```yaml
        - name: vllm-leader
          image: docker.io/vllm/vllm-openai:latest
          env:
            - name: HF_TOKEN
              value: <your-hf-token>
          command:
            - sh
```

@@ -83,7 +83,7 @@ Deploy the following yaml file `lws.yaml`

```yaml
            ephemeral-storage: 800Gi
            cpu: 125
          env:
            - name: HF_TOKEN
              value: <your-hf-token>
          volumeMounts:
            - mountPath: /dev/shm
```

@@ -36,7 +36,7 @@ pip install -U vllm \

```bash
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
```

1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_langchain.py](../../../examples/online_serving/retrieval_augmented_generation_with_langchain.py)

1. Run the script

@@ -74,7 +74,7 @@ pip install vllm \

```bash
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
```

1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_llamaindex.py](../../../examples/online_serving/retrieval_augmented_generation_with_llamaindex.py)

1. Run the script:

@@ -20,7 +20,7 @@ pip install vllm streamlit openai

```bash
vllm serve Qwen/Qwen1.5-0.5B-Chat
```

1. Use the script: [examples/online_serving/streamlit_openai_chatbot_webserver.py](../../../examples/online_serving/streamlit_openai_chatbot_webserver.py)

1. Start the streamlit web UI and start to chat:

@@ -82,7 +82,7 @@ Next, start the vLLM server as a Kubernetes Deployment and Service:

```yaml
            "vllm serve meta-llama/Llama-3.2-1B-Instruct"
          ]
          env:
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
```

@@ -209,7 +209,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)

```yaml
            "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
          ]
          env:
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
```

@@ -298,7 +298,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)

```yaml
            "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
          ]
          env:
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
```

@@ -2,8 +2,6 @@
This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.

## Build Nginx Container

This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory.

@@ -27,8 +25,6 @@ Build the container:

```bash
docker build . -f Dockerfile.nginx --tag nginx-lb
```

## Create Simple Nginx Config file

Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`.

@@ -53,8 +49,6 @@ Create a file named `nginx_conf/nginx.conf`. Note that you can add as many serve

```
}
```

## Build vLLM Container

@@ -73,16 +67,12 @@ docker build \

```bash
    --build-arg https_proxy=$https_proxy
```

## Create Docker Network

```bash
docker network create vllm_nginx
```

## Launch vLLM Containers

Notes:

@@ -122,8 +112,6 @@ Notes:

!!! note
    If you are behind a proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.

## Launch Nginx

@@ -135,8 +123,6 @@ docker run \

```bash
    --name nginx-lb nginx-lb:latest
```

## Verify That vLLM Servers Are Ready

@@ -47,9 +47,9 @@ Here is a sample of `LLM` class usage:

```python
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

More API details can be found in the [Offline Inference](../api/README.md#offline-inference) section of the API docs.

The code for the `LLM` class can be found in [vllm/entrypoints/llm.py](../../vllm/entrypoints/llm.py).
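
Only the tail of that sample survives in this excerpt; as a rough self-contained sketch of the same pattern (the model name is illustrative):

```python
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen1.5-0.5B-Chat")  # any supported model works here
sampling_params = SamplingParams(temperature=0.8, max_tokens=32)

outputs = llm.generate(["Hello, my name is"], sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```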

### OpenAI-Compatible API Server

@@ -60,7 +60,7 @@ This server can be started using the `vllm serve` command.

```bash
vllm serve <model>
```

The code for the `vllm` CLI can be found in [vllm/entrypoints/cli/main.py](../../vllm/entrypoints/cli/main.py).
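
Once the server is running, any OpenAI-compatible client can query it. A minimal sketch, assuming the server listens on the default port; the base URL, API key, and model name here are illustrative:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
response = client.completions.create(
    model="Qwen/Qwen1.5-0.5B-Chat",  # must match the model passed to `vllm serve`
    prompt="Hello, my name is",
    max_tokens=32,
)
print(response.choices[0].text)
```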

Sometimes you may see the API server entrypoint used directly instead of via the
`vllm` CLI command. For example:

@@ -74,7 +74,7 @@ python -m vllm.entrypoints.openai.api_server --model <model>

`python -m vllm.entrypoints.openai.api_server` is deprecated
and may become unsupported in a future release.

That code can be found in [vllm/entrypoints/openai/api_server.py](../../vllm/entrypoints/openai/api_server.py).

More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.

@@ -101,7 +101,7 @@ processing.

- **Output Processing**: Processes the outputs generated by the model, decoding the
  token IDs from a language model into human-readable text.

The code for `LLMEngine` can be found in [vllm/engine/llm_engine.py](../../vllm/engine/llm_engine.py).

### AsyncLLMEngine

@@ -111,9 +111,9 @@ incoming requests. The `AsyncLLMEngine` is designed for online serving, where it
can handle multiple concurrent requests and stream outputs to clients.

The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo
API server that serves as a simpler example in [vllm/entrypoints/api_server.py](../../vllm/entrypoints/api_server.py).

The code for `AsyncLLMEngine` can be found in [vllm/engine/async_llm_engine.py](../../vllm/engine/async_llm_engine.py).

## Worker

@@ -17,7 +17,7 @@ In this document we will discuss the:
In this document, we refer to pure decode (`max_query_len=1`) or speculative decode (`max_query_len=1+num_spec_tokens`) as **uniform decode** batches, and the opposite would be **non-uniform** batches (i.e., prefill or mixed prefill-decode batches).
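
To make the definition concrete, a tiny illustrative helper (not vLLM code) that classifies a batch under this rule might look like:

```python
# Illustrative only: a batch is a "uniform decode" batch when every request in it
# contributes the same decode-sized query length (1, or 1 + num_spec_tokens).
def is_uniform_decode(query_lens: list[int], num_spec_tokens: int = 0) -> bool:
    uniform_query_len = 1 + num_spec_tokens
    return len(query_lens) > 0 and all(q == uniform_query_len for q in query_lens)
```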

!!! note
    The following contents are mostly based on the last commit of <https://github.com/vllm-project/vllm/pull/20059>.

## Motivation

@@ -92,7 +92,7 @@ where `num_tokens` can be the padded token length, and `uniform_decode` is deter
The goal of this structure is to uniquely identify a (padded) batch with the minimal possible items corresponding to a CUDA Graphs item. We are safe to exclude items like `uniform_query_len` because it is currently a constant at runtime for a given setup. For example, it should be either `1` for common pure decode or `1+num_spec_tokens` for the validation phase of speculative decode.
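
A minimal sketch of such a structure, assuming only the two fields named in the surrounding text (the exact vLLM definition may differ):

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class BatchDescriptor:
    # Padded token count for the batch; used as the CUDA Graphs lookup key.
    num_tokens: int
    # True for uniform decode batches, False for prefill / mixed batches.
    uniform_decode: bool = False
```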

!!! note
    The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., including more items such as `uniform_query_len` to support multiple uniform decode length settings (<https://github.com/vllm-project/vllm/pull/23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token-length aware (for example, some multi-modal inputs).

### `CudagraphDispatcher`

@@ -2,7 +2,7 @@

## Introduction

FusedMoEModularKernel is implemented [here](../../vllm/model_executor/layers/fused_moe/modular_kernel.py).

Based on the format of the input activations, FusedMoE implementations are broadly classified into 2 types.

@@ -44,7 +44,7 @@ FusedMoEModularKernel splits the FusedMoE operation into 3 parts,

The TopK Weight Application and Reduction components happen right after the Unpermute operation and before the All2All Combine. Note that the `FusedMoEPermuteExpertsUnpermute` is responsible for the Unpermute and `FusedMoEPrepareAndFinalize` is responsible for the All2All Combine. There is value in doing the TopK Weight Application and Reduction in the `FusedMoEPermuteExpertsUnpermute`, but some implementations choose to do it in `FusedMoEPrepareAndFinalize`. In order to enable this flexibility, we have a `TopKWeightAndReduce` abstract class.

Please find the implementations of TopKWeightAndReduce [here](../../vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).

The `FusedMoEPrepareAndFinalize::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method.
The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExpertsUnpermute` and `FusedMoEPrepareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
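
A rough sketch of the abstraction this enables, with illustrative signatures rather than the actual vLLM interfaces:

```python
# Illustrative sketch only; the real vLLM classes carry more arguments and state.
import torch

class TopKWeightAndReduce:
    def apply(self, fused_expert_output: torch.Tensor,
              topk_weights: torch.Tensor) -> torch.Tensor:
        """Weight the per-expert outputs by the router weights and sum over top-k."""
        raise NotImplementedError

class NoOpWeightAndReduce(TopKWeightAndReduce):
    # Used when the experts implementation already applied the weights itself.
    def apply(self, fused_expert_output, topk_weights):
        return fused_expert_output

class SumWeightAndReduce(TopKWeightAndReduce):
    def apply(self, fused_expert_output, topk_weights):
        # fused_expert_output: [num_tokens, top_k, hidden]; topk_weights: [num_tokens, top_k]
        return (fused_expert_output * topk_weights.unsqueeze(-1)).sum(dim=1)
```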

@@ -138,7 +138,7 @@ Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & C

#### Step 1: Add an All2All manager

The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](../../vllm/distributed/device_communicators/all2all.py).

#### Step 2: Add a FusedMoEPrepareAndFinalize Type

@@ -213,29 +213,29 @@ Please take a look at [init_prepare_finalize](https://github.com/vllm-project/vl

### How To Unit Test

We have `FusedMoEModularKernel` unit tests at [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py).

The unit test iterates through all combinations of `FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types and, if they are
compatible, runs some correctness tests.
If you are adding some `FusedMoEPrepareAndFinalize` / `FusedMoEPermuteExpertsUnpermute` implementations,

1. Add the implementation type to `MK_ALL_PREPARE_FINALIZE_TYPES` and `MK_FUSED_EXPERT_TYPES` in [mk_objects.py](../../tests/kernels/moe/modular_kernel_tools/mk_objects.py) respectively.
2. Update the `Config::is_batched_prepare_finalize()`, `Config::is_batched_fused_experts()`, `Config::is_standard_fused_experts()`,
   `Config::is_fe_16bit_supported()`, `Config::is_fe_fp8_supported()`, `Config::is_fe_block_fp8_supported()`,
   `Config::is_fe_supports_chunking()` methods in [/tests/kernels/moe/modular_kernel_tools/common.py](../../tests/kernels/moe/modular_kernel_tools/common.py).

Doing this will add the new implementation to the test suite.

### How To Check `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` Compatibility

The unit test file [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py) can also be executed as a standalone script.
Example: `python3 -m tests.kernels.moe.test_modular_kernel_combinations --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
As a side effect, this script can be used to test `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` compatibility. When invoked
with incompatible types, the script will error.

### How To Profile

Please take a look at [profile_modular_kernel.py](../../tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py).
The script can be used to generate Torch traces for a single `FusedMoEModularKernel::forward()` call for any compatible
`FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types.
Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`

@@ -6,7 +6,7 @@ When performing an inference with IO Processor plugins, the prompt type is defin

## Writing an IO Processor Plugin

IO Processor plugins implement the [`IOProcessor`][vllm.plugins.io_processors.interface.IOProcessor] interface:

```python
IOProcessorInput = TypeVar("IOProcessorInput")
```

@@ -67,9 +67,9 @@ The `parse_request` method is used for validating the user prompt and converting
The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.

The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available in [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/openai/serving_pooling.py).
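
Putting those methods together, a plugin skeleton might look roughly like this; the method bodies are illustrative and the exact signatures live in the interface definition:

```python
# Illustrative skeleton only; consult the IOProcessor interface for exact signatures.
from vllm.plugins.io_processors.interface import IOProcessor

class MyIOProcessorPlugin(IOProcessor):
    def parse_request(self, request):
        # Validate the user prompt and convert it into the plugin's input type.
        ...

    def pre_process(self, prompt):
        # Turn the validated plugin input into vLLM model prompts.
        ...

    def post_process(self, model_output):
        # Turn PoolingRequestOutput objects into the plugin's custom output type.
        ...

    def output_to_response(self, plugin_output):
        # Online serving only: wrap the plugin output in an IOProcessorResponse.
        ...
```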

An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please also refer to our online ([examples/online_serving/prithvi_geospatial_mae.py](../../examples/online_serving/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py)) inference examples.

## Using an IO Processor plugin

@@ -1,12 +1,12 @@
# Metrics

vLLM exposes a rich set of metrics to support observability and capacity planning for the V1 engine.

## Objectives

- Provide comprehensive coverage of engine- and request-level metrics to aid production monitoring.
- Prioritize Prometheus integrations, as this is what we expect to be used in production environments.
- Offer logging support (i.e. printing metrics to the info log) for ad-hoc testing, debugging, development, and exploratory use cases.

## Background

@@ -17,45 +17,36 @@ Metrics in vLLM can be categorized as follows:

The mental model is that server-level metrics help explain the values of request-level metrics.

### Metrics Overview

### v1 Metrics

In v1, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix:

- `vllm:num_requests_running` (Gauge) - Number of requests currently running.
- `vllm:num_requests_waiting` (Gauge) - Number of requests currently waiting.
- `vllm:kv_cache_usage_perc` (Gauge) - Fraction of used KV cache blocks (0–1).
- `vllm:prefix_cache_queries` (Counter) - Number of prefix cache queries.
- `vllm:prefix_cache_hits` (Counter) - Number of prefix cache hits.
- `vllm:mm_cache_queries` (Counter) - (For multimodal models) Number of multimodal cache queries.
- `vllm:mm_cache_hits` (Counter) - (For multimodal models) Number of multimodal cache hits.
- `vllm:num_preemptions_total` (Counter) - Number of preemptions.
- `vllm:prompt_tokens_total` (Counter) - Total number of prompt tokens processed.
- `vllm:generation_tokens_total` (Counter) - Total number of generated tokens.
- `vllm:iteration_tokens_total` (Histogram) - Histogram of tokens processed in each engine step.
- `vllm:cache_config_info` (Gauge) - Information about the cache configuration.
- `vllm:request_success_total` (Counter) - Number of finished requests (by finish reason).
- `vllm:request_prompt_tokens` (Histogram) - Histogram of input prompt token counts.
- `vllm:request_generation_tokens` (Histogram) - Histogram of generation token counts.
- `vllm:request_params_n` (Histogram) - Histogram of the request parameter `n`.
- `vllm:request_params_max_tokens` (Histogram) - Histogram of the `max_tokens` parameter in requests.
- `vllm:time_to_first_token_seconds` (Histogram) - Time to first token (TTFT).
- `vllm:inter_token_latency_seconds` (Histogram) - Inter-token latency.
- `vllm:e2e_request_latency_seconds` (Histogram) - End-to-end request latency.
- `vllm:request_queue_time_seconds` (Histogram) - Time spent in the queue.
- `vllm:request_inference_time_seconds` (Histogram) - Request inference time.
- `vllm:request_prefill_time_seconds` (Histogram) - Request prefill time.
- `vllm:request_decode_time_seconds` (Histogram) - Request decode time.

These are documented under [Inferencing and Serving -> Production Metrics](../usage/metrics.md).
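
For a quick look at these metrics on a running server, the endpoint can be scraped directly. A small sketch, assuming a server listening on localhost:8000:

```python
import urllib.request

# Fetch the Prometheus exposition text and print only the vLLM metric samples.
with urllib.request.urlopen("http://localhost:8000/metrics") as resp:
    for line in resp.read().decode().splitlines():
        if line.startswith("vllm:"):
            print(line)
```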

@@ -80,13 +71,13 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
- `vllm:request_decode_time_seconds` - Request decode time.
- `vllm:request_max_num_generation_tokens` - Max generation tokens in a sequence group.

See [the PR which added this Dashboard](https://github.com/vllm-project/vllm/pull/2316) for interesting and useful background on the choices made here.

### Prometheus Client Library

Prometheus support was initially added [using the aioprometheus library](https://github.com/vllm-project/vllm/pull/1890), but a switch was made quickly to [prometheus_client](https://github.com/vllm-project/vllm/pull/2730). The rationale is discussed in both linked PRs.

During those migrations we briefly lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](https://github.com/vllm-project/vllm/pull/15657):

```bash
$ curl http://0.0.0.0:8000/metrics 2>/dev/null | grep -P '^http_(?!.*(_bucket|_created|_sum)).*'
```

@@ -99,7 +90,9 @@ http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201

### Multi-process Mode

Historically, metrics were collected in the engine core process and multiprocess mode was used to make them available in the API server process. See <https://github.com/vllm-project/vllm/pull/7279>.

More recently, metrics are collected in the API server process and multiprocess mode is only used when `--api-server-count > 1`. See <https://github.com/vllm-project/vllm/pull/17546> and details on [API server scale-out](../serving/data_parallel_deployment.md#internal-load-balancing).

### Built in Python/Process Metrics

@@ -116,41 +109,37 @@ The following metrics are supported by default by `prometheus_client`, but they
- `process_open_fds`
- `process_max_fds`

Therefore, these metrics are unavailable when `--api-server-count > 1`. It's questionable how relevant these are since they do not aggregate these stats for all processes that make up a vLLM instance.

## Metrics Design

The ["Even Better Observability"](https://github.com/vllm-project/vllm/issues/3616) feature was where much of the metrics design was planned. For example, see where [a detailed roadmap was laid out](https://github.com/vllm-project/vllm/issues/3616#issuecomment-2030858781).

### Legacy PRs

To help understand the background to the metrics design, here are some of the relevant PRs which added the original, now legacy, metrics:

- <https://github.com/vllm-project/vllm/pull/1890>
- <https://github.com/vllm-project/vllm/pull/2316>
- <https://github.com/vllm-project/vllm/pull/2730>
- <https://github.com/vllm-project/vllm/pull/4464>
- <https://github.com/vllm-project/vllm/pull/7279>

### Metrics Implementation PRs

For background, here are the relevant PRs relating to the metrics implementation tracked in <https://github.com/vllm-project/vllm/issues/10582>:

- <https://github.com/vllm-project/vllm/pull/11962>
- <https://github.com/vllm-project/vllm/pull/11973>
- <https://github.com/vllm-project/vllm/pull/10907>
- <https://github.com/vllm-project/vllm/pull/12416>
- <https://github.com/vllm-project/vllm/pull/12478>
- <https://github.com/vllm-project/vllm/pull/12516>
- <https://github.com/vllm-project/vllm/pull/12530>
- <https://github.com/vllm-project/vllm/pull/12561>
- <https://github.com/vllm-project/vllm/pull/12579>
- <https://github.com/vllm-project/vllm/pull/12592>
- <https://github.com/vllm-project/vllm/pull/12644>

### Metrics Collection

@@ -394,15 +383,14 @@ distinguish between per-adapter counts. This should be revisited.
Note that `multiprocess_mode="livemostrecent"` is used - the most
recent metric is used, but only from currently running processes.

This was added in <https://github.com/vllm-project/vllm/pull/9477> and there is
[at least one known user](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/54).
If we revisit this design and deprecate the old metric, we should
coordinate with downstream users so they can migrate before the removal.

### Prefix Cache metrics

The discussion in <https://github.com/vllm-project/vllm/issues/10582> about adding prefix cache metrics yielded
some interesting points which may be relevant to how we approach
future metrics.

@@ -439,8 +427,8 @@ suddenly (from their perspective) when it is removed, even if there is
an equivalent metric for them to use.

As an example, see how `vllm:avg_prompt_throughput_toks_per_s` was
[deprecated](https://github.com/vllm-project/vllm/pull/2764) (with a comment in the code),
[removed](https://github.com/vllm-project/vllm/pull/12383), and then [noticed by a user](https://github.com/vllm-project/vllm/issues/13218).

In general:

@@ -460,40 +448,38 @@ the project-wide deprecation policy.

### Unimplemented - `vllm:tokens_total`

Added by <https://github.com/vllm-project/vllm/pull/4464>, but apparently never implemented. This can just be
removed.

### Duplicated - Queue Time

The `vllm:time_in_queue_requests` Histogram metric was added by
<https://github.com/vllm-project/vllm/pull/9659> and its calculation is:

```python
self.metrics.first_scheduled_time = now
self.metrics.time_in_queue = now - self.metrics.arrival_time
```

Two weeks later, <https://github.com/vllm-project/vllm/pull/4464> added `vllm:request_queue_time_seconds` leaving
us with:

```python
if seq_group.is_finished():
    if (seq_group.metrics.first_scheduled_time is not None and
            seq_group.metrics.first_token_time is not None):
        time_queue_requests.append(
            seq_group.metrics.first_scheduled_time -
            seq_group.metrics.arrival_time)
    ...
    if seq_group.metrics.time_in_queue is not None:
        time_in_queue_requests.append(
            seq_group.metrics.time_in_queue)
```

This seems duplicative, and one of them should be removed. The latter
is used by the Grafana dashboard, so we should deprecate or remove the
former.

### Prefix Cache Hit Rate

@@ -502,7 +488,7 @@ See above - we now expose 'queries' and 'hits' counters rather than a
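
A consumer that wants a hit rate can derive it from the two counters. A small illustrative sketch over the raw exposition text (a real dashboard would use PromQL `rate()` over the same counters):

```python
import urllib.request

# Counter names follow the list above; depending on the Prometheus client version
# they may appear with a "_total" suffix in the exposition text.
text = urllib.request.urlopen("http://localhost:8000/metrics").read().decode()

def total(prefix: str) -> float:
    return sum(
        float(line.rsplit(" ", 1)[-1])
        for line in text.splitlines()
        if line.startswith(prefix)
    )

queries = total("vllm:prefix_cache_queries")
hits = total("vllm:prefix_cache_hits")
print(f"prefix cache hit rate: {hits / queries:.2%}" if queries else "no queries yet")
```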

### KV Cache Offloading

Two legacy metrics relate to a "swapped" preemption mode that is no
longer relevant in v1:

- `vllm:num_requests_swapped`

@@ -513,7 +499,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU
memory. This is also known as "KV cache offloading" and is configured
with `--swap-space` and `--preemption-mode`.

Historically, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
SequenceGroup encapsulated the idea of N Sequences which
all shared the same prompt kv blocks. This enabled KV cache block
sharing between requests, and copy-on-write to do branching. CPU

@@ -526,7 +512,7 @@ and the part of the prompt that was evicted can be recomputed.
SequenceGroup was removed in V1, although a replacement will be
required for "parallel sampling" (`n>1`).
[Beam search was moved out of the core](https://github.com/vllm-project/vllm/issues/8306). There was a
lot of complex code for a very uncommon feature.

In V1, with prefix caching being better (zero overhead) and therefore

@@ -537,11 +523,11 @@ better.

### Parallel Sampling

Some legacy metrics are only relevant in the context of "parallel
sampling". This is where the `n` parameter in a request is used to
request multiple completions from the same prompt.

As part of adding parallel sampling support in <https://github.com/vllm-project/vllm/pull/10980>, we should
also add these metrics.

- `vllm:request_params_n` (Histogram)

@@ -556,7 +542,7 @@ also add these metrics.

### Speculative Decoding

Some legacy metrics are specific to "speculative decoding". This is where
we generate candidate tokens using a faster, approximate method or
model and then validate those tokens with the larger model.

@@ -566,9 +552,9 @@ model and then validate those tokens with the larger model.
- `vllm:spec_decode_num_draft_tokens_total` (Counter)
- `vllm:spec_decode_num_emitted_tokens_total` (Counter)

There is a PR under review (<https://github.com/vllm-project/vllm/pull/12193>) to add "prompt lookup (ngram)"
speculative decoding to v1. Other techniques will follow. We should
revisit these metrics in this context.

!!! note
    We should probably expose acceptance rate as separate accepted

@@ -587,7 +573,7 @@ see:
- [Standardizing Large Model Server Metrics in Kubernetes](https://docs.google.com/document/d/1SpSp1E6moa4HSrJnS4x3NpLuj88sMXr2tbofKlzTZpk)
- [Benchmarking LLM Workloads for Performance Evaluation and Autoscaling in Kubernetes](https://docs.google.com/document/d/1k4Q4X14hW4vftElIuYGDu5KDe2LtV1XammoG-Xi3bbQ)
- [Inference Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf)
- <https://github.com/vllm-project/vllm/issues/5041> and <https://github.com/vllm-project/vllm/pull/12726>.

This is a non-trivial topic. Consider this comment from Rob:

@@ -641,7 +627,7 @@ metrics are often relatively straightforward to add:
   metrics are usually of very limited use unless they can be enabled
   by default and in production.
3. They have an impact on development and maintenance of the
   project. Every metric added over time has made this effort more
   time-consuming, and perhaps not all metrics justify this ongoing
   investment in their maintenance.

@@ -652,24 +638,24 @@ performance and health. Tracing, on the other hand, tracks individual
requests as they move through different services and components. Both
fall under the more general heading of "Observability".

vLLM has support for OpenTelemetry tracing:

- Added by <https://github.com/vllm-project/vllm/pull/4687> and reinstated by <https://github.com/vllm-project/vllm/pull/20372>
- Configured with `--otlp-traces-endpoint` and `--collect-detailed-traces`
- [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/)
- [User-facing docs](../examples/online_serving/opentelemetry.md)
- [Blog post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f)
- [IBM product docs](https://www.ibm.com/docs/en/instana-observability/current?topic=mgaa-monitoring-large-language-models-llms-vllm-public-preview)

OpenTelemetry has a
[Gen AI Working Group](https://github.com/open-telemetry/community/blob/main/projects/gen-ai.md).

Since metrics is a big enough topic on its own, we consider the topic
of tracing to be quite separate from metrics.

### OpenTelemetry Model Forward vs Execute Time

The current implementation exposes the following two metrics:

- `vllm:model_forward_time_milliseconds` (Histogram) - The time spent
  in the model forward pass when this request was in the batch.
@ -685,7 +671,7 @@ documentation for this option states:
|
|||||||
> use of possibly costly and or blocking operations and hence might
|
> use of possibly costly and or blocking operations and hence might
|
||||||
> have a performance impact.
|
> have a performance impact.
|
||||||
|
|
||||||
The metrics were added by <gh-pr:7089> and show up in an OpenTelemetry trace
|
The metrics were added by <https://github.com/vllm-project/vllm/pull/7089> and show up in an OpenTelemetry trace
|
||||||
as:
|
as:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
# Multi-Modal Data Processing
|
# Multi-Modal Data Processing
|
||||||
|
|
||||||
To enable various optimizations in vLLM such as [chunked prefill][chunked-prefill] and [prefix caching](../features/automatic_prefix_caching.md), we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
|
To enable various optimizations in vLLM such as [chunked prefill](../configuration/optimization.md#chunked-prefill) and [prefix caching](../features/automatic_prefix_caching.md), we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
|
||||||
|
|
||||||
Here are the main features of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor]:
|
Here are the main features of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor]:
|
||||||
|
|
||||||
@ -41,14 +41,10 @@ While HF processors support text + multi-modal inputs natively, this is not so f
|
|||||||
|
|
||||||
Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other.
|
Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other.
|
||||||
|
|
||||||
[](){ #mm-dummy-text }
|
|
||||||
|
|
||||||
### Dummy text
|
### Dummy text
|
||||||
|
|
||||||
We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text]. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
|
We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text]. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
|
||||||
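A minimal sketch of such an override is shown below. The exact method signature (in particular the `mm_counts` argument) and the `<image>` placeholder are assumptions for illustration; refer to [BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] for the authoritative interface.

```python
from collections.abc import Mapping

from vllm.multimodal.profiling import BaseDummyInputsBuilder


class MyDummyInputsBuilder(BaseDummyInputsBuilder):

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        # One placeholder per multi-modal item, e.g. "<image><image>" for two images.
        return "<image>" * mm_counts.get("image", 0)
```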
|
|
||||||
[](){ #mm-automatic-prompt-updating }
|
|
||||||
|
|
||||||
### Automatic prompt updating
|
### Automatic prompt updating
|
||||||
|
|
||||||
We address the second issue by implementing model-agnostic code in
|
We address the second issue by implementing model-agnostic code in
|
||||||
@ -60,8 +56,8 @@ With the help of dummy text and automatic prompt updating, our multi-modal proce
|
|||||||
|
|
||||||
## Processor Output Caching
|
## Processor Output Caching
|
||||||
|
|
||||||
Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again.
|
Some HF processors, such as the one for Qwen2-VL, are [very slow](https://github.com/vllm-project/vllm/issues/9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again.
|
||||||
|
|
||||||
When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
|
When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
|
||||||
|
|
||||||
Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text][mm-dummy-text] to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating][mm-automatic-prompt-updating] afterwards to keep the output tokens and multi-modal data consistent with each other.
|
Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#dummy-text) to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating](#automatic-prompt-updating) afterwards to keep the output tokens and multi-modal data consistent with each other.
|
||||||
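The cache-then-batch flow described above can be sketched as follows. This is a simplified illustration rather than the actual vLLM implementation, and the `hf_process_batch` helper is a placeholder.

```python
from collections.abc import Callable, Hashable
from typing import Any


def process_with_cache(
    items: list[Hashable],
    cache: dict[Hashable, Any],
    hf_process_batch: Callable[[list[Hashable]], list[Any]],
) -> list[Any]:
    """Simplified sketch of the caching flow; not the actual vLLM code."""
    # 1. Determine which items are cache misses (preserving order, no duplicates).
    missing = [item for item in dict.fromkeys(items) if item not in cache]

    # 2. Run the expensive HF processor once, over the missing items only.
    if missing:
        for item, output in zip(missing, hf_process_batch(missing)):
            cache[item] = output

    # 3. Merge cached and freshly processed outputs back into the original order.
    return [cache[item] for item in items]
```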
|
|||||||
@ -92,8 +92,8 @@ To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels
|
|||||||
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
|
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
|
||||||
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
|
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
|
||||||
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
|
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
|
||||||
| marlin | standard | <sup>3</sup> | <sup>3</sup> | silu,</br>swigluoai | Y | N | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe] |
|
| marlin | standard | <sup>3</sup> | <sup>3</sup> | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
|
||||||
| marlin experts | standard | N/A | N/A | silu,</br>swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts] |
|
| marlin experts | standard,</br>batched | N/A | N/A | silu,</br>swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
|
||||||
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
|
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
|
||||||
| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
|
| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
|
||||||
| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
|
| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
|
||||||
@ -115,6 +115,6 @@ The following table shows "families" of modular kernels that are intended to wor
|
|||||||
|
|
||||||
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|
||||||
|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------|
|
|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------|
|
||||||
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
|
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
|
||||||
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`|
|
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts`|
|
||||||
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
|
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
|
||||||
|
|||||||
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
## Debugging
|
## Debugging
|
||||||
|
|
||||||
Please see the [Troubleshooting][troubleshooting-python-multiprocessing]
|
Please see the [Troubleshooting](../usage/troubleshooting.md#python-multiprocessing)
|
||||||
page for information on known issues and how to solve them.
|
page for information on known issues and how to solve them.
|
||||||
|
|
||||||
## Introduction
|
## Introduction
|
||||||
@ -82,7 +82,7 @@ There are other miscellaneous places hard-coding the use of `spawn`:
|
|||||||
|
|
||||||
Related PRs:
|
Related PRs:
|
||||||
|
|
||||||
- <gh-pr:8823>
|
- <https://github.com/vllm-project/vllm/pull/8823>
|
||||||
|
|
||||||
## Prior State in v1
|
## Prior State in v1
|
||||||
|
|
||||||
|
|||||||
@ -41,7 +41,7 @@ Every plugin has three parts:
|
|||||||
|
|
||||||
1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group `vllm.general_plugins` to register general plugins. This is the key of `entry_points` in the `setup.py` file. Always use `vllm.general_plugins` for vLLM's general plugins.
|
1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group `vllm.general_plugins` to register general plugins. This is the key of `entry_points` in the `setup.py` file. Always use `vllm.general_plugins` for vLLM's general plugins.
|
||||||
2. **Plugin name**: The name of the plugin. This is the name on the left-hand side of an entry in the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name.
|
2. **Plugin name**: The name of the plugin. This is the name on the left-hand side of an entry in the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name.
|
||||||
3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module.
|
3. **Plugin value**: The fully qualified name of the function or module to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module (see the `setup.py` sketch after this list).
|
||||||
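As a reference, here is a minimal sketch of how the three parts map onto a plugin package's `setup.py`. The package metadata below is made up; the group, name, and value reuse the example above.

```python
from setuptools import setup

setup(
    name="vllm_add_dummy_model",  # placeholder package metadata
    version="0.1",
    packages=["vllm_add_dummy_model"],
    entry_points={
        # 1. Plugin group: always `vllm.general_plugins` for general plugins.
        "vllm.general_plugins": [
            # 2. Plugin name (left of '=')   3. Plugin value (right of '=')
            "register_dummy_model = vllm_add_dummy_model:register",
        ],
    },
)
```

With this package installed, setting `VLLM_PLUGINS=register_dummy_model` loads only this plugin.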
|
|
||||||
## Types of supported plugins
|
## Types of supported plugins
|
||||||
|
|
||||||
@ -51,6 +51,8 @@ Every plugin has three parts:
|
|||||||
|
|
||||||
- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the fully qualified class name of the IOProcessor.
|
- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the fully qualified class name of the IOProcessor.
|
||||||
|
|
||||||
|
- **Stat logger plugins** (with group name `vllm.stat_logger_plugins`): The primary use case for these plugins is to register custom, out-of-tree loggers into vLLM. The entry point should be a class that subclasses `StatLoggerBase`.
|
||||||
|
|
||||||
## Guidelines for Writing Plugins
|
## Guidelines for Writing Plugins
|
||||||
|
|
||||||
- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes.
|
- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes.
|
||||||
|
|||||||
@ -213,22 +213,22 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),
|
|||||||
|
|
||||||

|

|
||||||
|
|
||||||
**Time 3: Request 0 fills block 3 and asks for a new block to keep decoding.** We cache block 3 and allocate block 4.
|
**Time 2: Request 0 fills block 3 and asks for a new block to keep decoding.** We cache block 3 and allocate block 4.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
**Time 4: Request 1 comes in with 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.
|
**Time 3: Request 1 comes in with 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
**Time 5: Request 0 is finished and free.** Blocks 2, 3 and 4 are added to the free queue in reverse order (but blocks 2 and 3 are still cached). Blocks 0 and 1 are not added to the free queue because they are being used by Request 1.
|
**Time 4: Request 0 is finished and free.** Blocks 2, 3 and 4 are added to the free queue in reverse order (but blocks 2 and 3 are still cached). Blocks 0 and 1 are not added to the free queue because they are being used by Request 1.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
**Time 6: Request 1 is finished and free.**
|
**Time 5: Request 1 is finished and free.**
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
**Time 7: Request 2 comes in with 29 prompt tokens, where the first 12 tokens are the same as request 0.** Note that even though the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted).
|
**Time 6: Request 2 comes in with 29 prompt tokens, where the first 12 tokens are the same as request 0.** Note that even though the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted).
|
||||||
|
|
||||||

|

|
||||||
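The full-block matching rule this walkthrough relies on (block size 4, partial matches do not count) can be reproduced with a toy snippet like the one below. It is not vLLM's implementation, just the counting logic.

```python
def count_prefix_cache_hits(prompt: list[int], cached: list[int], block_size: int = 4) -> int:
    """Toy version of full-block prefix matching; not the actual vLLM code."""
    hits = 0
    for start in range(0, len(prompt) - block_size + 1, block_size):
        if prompt[start:start + block_size] == cached[start:start + block_size]:
            hits += 1
        else:
            break  # a partial match (e.g. 2 of 4 tokens) is not a cache hit
    return hits


request_0 = list(range(16))              # previously cached as four full blocks
request_1 = list(range(10)) + [99] * 4   # 14 tokens, first 10 shared with request 0
print(count_prefix_cache_hits(request_1, request_0))  # -> 2 blocks (8 tokens) hit
```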
|
|||||||
@ -19,8 +19,8 @@ vLLM will take all the available factors into consideration, and decide a direct
|
|||||||
|
|
||||||
The factors considered include:
|
The factors considered include:
|
||||||
|
|
||||||
- All the related configs (see the `compute_hash` functions in their respective configs in the [config folder](gh-file:vllm/config))
|
- All the related configs (see the `compute_hash` functions in their respective configs in the [config folder](../../vllm/config))
|
||||||
- PyTorch configs (see the `compute_hash` functions in the [compiler_interface.py](gh-file:vllm/compilation/compiler_interface.py))
|
- PyTorch configs (see the `compute_hash` functions in the [compiler_interface.py](../../vllm/compilation/compiler_interface.py))
|
||||||
- The model's forward function and the relevant functions called by the forward function (see below)
|
- The model's forward function and the relevant functions called by the forward function (see below)
|
||||||
|
|
||||||
With all these factors taken into consideration, usually we can guarantee that the cache is safe to use, and will not cause any unexpected behavior. Therefore, the cache is enabled by default. If you want to debug the compilation process, or if you suspect the cache is causing some issues, you can disable it by setting the environment variable `VLLM_DISABLE_COMPILE_CACHE=1`.
|
With all these factors taken into consideration, usually we can guarantee that the cache is safe to use, and will not cause any unexpected behavior. Therefore, the cache is enabled by default. If you want to debug the compilation process, or if you suspect the cache is causing some issues, you can disable it by setting the environment variable `VLLM_DISABLE_COMPILE_CACHE=1`.
|
||||||
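Conceptually, the per-factor hashes are folded into a single key that names the cache directory. The sketch below is illustrative only: the factor strings and the directory layout are made up and are not vLLM's actual scheme.

```python
import hashlib


def combine_factors(factor_hashes: list[str]) -> str:
    """Fold per-factor hashes into one cache key (illustrative only)."""
    digest = hashlib.sha256()
    for factor in factor_hashes:
        digest.update(factor.encode("utf-8"))
    return digest.hexdigest()[:10]


cache_key = combine_factors([
    "model_config:abc123",   # e.g. a config's compute_hash() output
    "torch:2.6.0+cu124",     # e.g. PyTorch / compiler versions
    "forward_fn:def456",     # e.g. a hash of the model's forward code
])
print(f"torch_compile_cache/{cache_key}")  # hypothetical directory name
```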
|
|||||||
@ -36,45 +36,43 @@ th:not(:first-child) {
|
|||||||
}
|
}
|
||||||
</style>
|
</style>
|
||||||
|
|
||||||
| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|
| Feature | [CP](../configuration/optimization.md#chunked-prefill) | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|
||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
||||||
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
|
| [CP](../configuration/optimization.md#chunked-prefill) | ✅ | | | | | | | | | | | | | | |
|
||||||
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
|
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
|
||||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
|
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
|
||||||
| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | |
|
| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | |
|
||||||
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
|
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
|
||||||
| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | |
|
| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | |
|
||||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | |
|
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](https://github.com/vllm-project/vllm/issues/7366) | ❌ | [❌](https://github.com/vllm-project/vllm/issues/7366) | ✅ | ✅ | ✅ | | | | | | | | |
|
||||||
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | |
|
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | |
|
||||||
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | |
|
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | |
|
||||||
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | |
|
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | |
|
||||||
| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | |
|
| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | |
|
||||||
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
|
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](https://github.com/vllm-project/vllm/pull/4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
|
||||||
| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | |
|
| best-of | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/7968) | ✅ | ✅ | | |
|
||||||
| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | |
|
| beam-search | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/7968) | ❔ | ✅ | ✅ | |
|
||||||
| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](gh-issue:25096) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
|
| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](https://github.com/vllm-project/vllm/issues/25096) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
|
||||||
|
|
||||||
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
|
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
|
||||||
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
|
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
|
||||||
|
|
||||||
[](){ #feature-x-hardware }
|
|
||||||
|
|
||||||
### Feature x Hardware
|
### Feature x Hardware
|
||||||
|
|
||||||
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU |
|
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU | Intel GPU |
|
||||||
|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----|
|
|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----|------------|
|
||||||
| [CP][chunked-prefill] | [❌](gh-issue:2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| [APC](automatic_prefix_caching.md) | [❌](gh-issue:3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26963) |
|
||||||
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ |
|
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [❌](https://github.com/vllm-project/vllm/issues/26970) |
|
||||||
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ |
|
||||||
| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26965) |
|
||||||
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||||
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||||
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||||
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8477) | ✅ | ❌ |
|
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/8477) | ✅ | ❌ | ✅ |
|
||||||
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||||
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||||
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [❌](gh-issue:25097) |
|
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [❌](https://github.com/vllm-project/vllm/issues/25097) | ✅ |
|
||||||
|
|||||||
@ -11,7 +11,7 @@ Automatic Prefix Caching (APC in short) caches the KV cache of existing queries,
|
|||||||
|
|
||||||
Set `enable_prefix_caching=True` in the vLLM engine to enable APC. Here is an example:
|
Set `enable_prefix_caching=True` in the vLLM engine to enable APC. Here is an example:
|
||||||
|
|
||||||
<gh-file:examples/offline_inference/automatic_prefix_caching.py>
|
[examples/offline_inference/automatic_prefix_caching.py](../../examples/offline_inference/automatic_prefix_caching.py)
|
||||||
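As a quick inline illustration (the model name below is just a placeholder), enabling APC only requires the one flag:

```python
from vllm import LLM, SamplingParams

# enable_prefix_caching=True turns on Automatic Prefix Caching.
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enable_prefix_caching=True)

shared_prefix = "A long document that both questions refer to..."
outputs = llm.generate(
    [shared_prefix + "\n\nQuestion 1: ...", shared_prefix + "\n\nQuestion 2: ..."],
    SamplingParams(temperature=0.0, max_tokens=64),
)
```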
|
|
||||||
## Example workloads
|
## Example workloads
|
||||||
|
|
||||||
|
|||||||
@ -17,14 +17,14 @@ Two main reasons:
|
|||||||
|
|
||||||
## Usage example
|
## Usage example
|
||||||
|
|
||||||
Please refer to <gh-file:examples/online_serving/disaggregated_prefill.sh> for the example usage of disaggregated prefilling.
|
Please refer to [examples/online_serving/disaggregated_prefill.sh](../../examples/online_serving/disaggregated_prefill.sh) for the example usage of disaggregated prefilling.
|
||||||
|
|
||||||
vLLM now supports 5 types of connectors:
|
vLLM now supports 5 types of connectors:
|
||||||
|
|
||||||
- **SharedStorageConnector**: refer to <gh-file:examples/offline_inference/disaggregated-prefill-v1/run.sh> for the example usage of SharedStorageConnector disaggregated prefilling.
|
- **SharedStorageConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of SharedStorageConnector disaggregated prefilling.
|
||||||
- **LMCacheConnectorV1**: refer to <gh-file:examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh> for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
|
- **LMCacheConnectorV1**: refer to [examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh](../../examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh) for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
|
||||||
- **NixlConnector**: refer to <gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh> for the example usage of NixlConnector disaggregated prefilling, which supports fully async send/recv. For a detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
|
- **NixlConnector**: refer to [tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh) for the example usage of NixlConnector disaggregated prefilling, which supports fully async send/recv. For a detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
|
||||||
- **P2pNcclConnector**: refer to <gh-file:examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh> for the example usage of P2pNcclConnector disaggregated prefilling.
|
- **P2pNcclConnector**: refer to [examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh](../../examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh) for the example usage of P2pNcclConnector disaggregated prefilling.
|
||||||
- **MultiConnector**: takes advantage of the `kv_connector_extra_config: dict[str, Any]` field already present in `KVTransferConfig` to stash all the connectors we want in an ordered list of kwargs, such as:
|
- **MultiConnector**: takes advantage of the `kv_connector_extra_config: dict[str, Any]` field already present in `KVTransferConfig` to stash all the connectors we want in an ordered list of kwargs, such as:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@ -45,7 +45,7 @@ For NixlConnector, you may also specify one or multiple NIXL_Backend. Such as:
|
|||||||
|
|
||||||
## Benchmarks
|
## Benchmarks
|
||||||
|
|
||||||
Please refer to <gh-file:benchmarks/disagg_benchmarks> for disaggregated prefilling benchmarks.
|
Please refer to [benchmarks/disagg_benchmarks](../../benchmarks/disagg_benchmarks) for disaggregated prefilling benchmarks.
|
||||||
|
|
||||||
## Development
|
## Development
|
||||||
|
|
||||||
|
|||||||
@ -47,7 +47,7 @@ the third parameter is the path to the LoRA adapter.
|
|||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
|
Check out [examples/offline_inference/multilora_inference.py](../../examples/offline_inference/multilora_inference.py) for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
|
||||||
|
|
||||||
## Serving LoRA Adapters
|
## Serving LoRA Adapters
|
||||||
|
|
||||||
|
|||||||
@ -1,9 +1,9 @@
|
|||||||
# Multimodal Inputs
|
# Multimodal Inputs
|
||||||
|
|
||||||
This page teaches you how to pass multi-modal inputs to [multi-modal models][supported-mm-models] in vLLM.
|
This page teaches you how to pass multi-modal inputs to [multi-modal models](../models/supported_models.md#list-of-multimodal-language-models) in vLLM.
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes,
|
We are actively iterating on multi-modal support. See [this RFC](https://github.com/vllm-project/vllm/issues/4194) for upcoming changes,
|
||||||
and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.
|
and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.
|
||||||
|
|
||||||
!!! tip
|
!!! tip
|
||||||
@ -129,7 +129,7 @@ You can pass a single image to the `'image'` field of the multi-modal dictionary
|
|||||||
print(generated_text)
|
print(generated_text)
|
||||||
```
|
```
|
||||||
|
|
||||||
Full example: <gh-file:examples/offline_inference/vision_language.py>
|
Full example: [examples/offline_inference/vision_language.py](../../examples/offline_inference/vision_language.py)
|
||||||
|
|
||||||
To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
|
To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
|
||||||
|
|
||||||
@ -162,7 +162,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
|
|||||||
print(generated_text)
|
print(generated_text)
|
||||||
```
|
```
|
||||||
|
|
||||||
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
|
Full example: [examples/offline_inference/vision_language_multi_image.py](../../examples/offline_inference/vision_language_multi_image.py)
|
||||||
|
|
||||||
If using the [LLM.chat](../models/generative_models.md#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
|
If using the [LLM.chat](../models/generative_models.md#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
|
||||||
|
|
||||||
@ -346,26 +346,32 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown
|
|||||||
!!! note
|
!!! note
|
||||||
'process_vision_info' is only applicable to Qwen2.5-VL and similar models.
|
'process_vision_info' is only applicable to Qwen2.5-VL and similar models.
|
||||||
|
|
||||||
Full example: <gh-file:examples/offline_inference/vision_language.py>
|
Full example: [examples/offline_inference/vision_language.py](../../examples/offline_inference/vision_language.py)
|
||||||
|
|
||||||
### Audio Inputs
|
### Audio Inputs
|
||||||
|
|
||||||
You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
|
You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
|
||||||
|
|
||||||
Full example: <gh-file:examples/offline_inference/audio_language.py>
|
Full example: [examples/offline_inference/audio_language.py](../../examples/offline_inference/audio_language.py)
|
||||||
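A short sketch of building such an input is shown below. The audio placeholder token and the way the clip is loaded (here via `librosa`) are model- and workflow-specific, so treat them as assumptions.

```python
import librosa

# Load a clip; sr=None keeps the file's native sampling rate.
audio_array, sampling_rate = librosa.load("question.wav", sr=None)

inputs = {
    # The placeholder token is model-specific; check your model's chat template.
    "prompt": "<|audio|>\nWhat is being said in this clip?",
    "multi_modal_data": {
        "audio": (audio_array, sampling_rate),  # the (array, sampling_rate) tuple
    },
}
# outputs = llm.generate(inputs)  # with an audio-capable model loaded via vllm.LLM
```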
|
|
||||||
### Embedding Inputs
|
### Embedding Inputs
|
||||||
|
|
||||||
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
|
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
|
||||||
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
|
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
|
||||||
|
|
||||||
|
You must enable this feature via `enable_mm_embeds=True`.
|
||||||
|
|
||||||
|
!!! warning
|
||||||
|
    The vLLM engine may crash if embeddings with an incorrect shape are passed.
|
||||||
|
Only enable this flag for trusted users!
|
||||||
|
|
||||||
??? code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
|
|
||||||
# Inference with image embeddings as input
|
# Inference with image embeddings as input
|
||||||
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
|
llm = LLM(model="llava-hf/llava-1.5-7b-hf", enable_mm_embeds=True)
|
||||||
|
|
||||||
# Refer to the HuggingFace repo for the correct format to use
|
# Refer to the HuggingFace repo for the correct format to use
|
||||||
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
|
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
|
||||||
@ -397,7 +403,11 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
|
|||||||
image_embeds = torch.load(...)
|
image_embeds = torch.load(...)
|
||||||
|
|
||||||
# Qwen2-VL
|
# Qwen2-VL
|
||||||
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
|
llm = LLM(
|
||||||
|
"Qwen/Qwen2-VL-2B-Instruct",
|
||||||
|
limit_mm_per_prompt={"image": 4},
|
||||||
|
enable_mm_embeds=True,
|
||||||
|
)
|
||||||
mm_data = {
|
mm_data = {
|
||||||
"image": {
|
"image": {
|
||||||
"image_embeds": image_embeds,
|
"image_embeds": image_embeds,
|
||||||
@ -407,7 +417,12 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
|
|||||||
}
|
}
|
||||||
|
|
||||||
# MiniCPM-V
|
# MiniCPM-V
|
||||||
llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
|
llm = LLM(
|
||||||
|
"openbmb/MiniCPM-V-2_6",
|
||||||
|
trust_remote_code=True,
|
||||||
|
limit_mm_per_prompt={"image": 4},
|
||||||
|
enable_mm_embeds=True,
|
||||||
|
)
|
||||||
mm_data = {
|
mm_data = {
|
||||||
"image": {
|
"image": {
|
||||||
"image_embeds": image_embeds,
|
"image_embeds": image_embeds,
|
||||||
@ -434,11 +449,11 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
|
|||||||
A chat template is **required** to use Chat Completions API.
|
A chat template is **required** to use Chat Completions API.
|
||||||
For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`.
|
For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`.
|
||||||
|
|
||||||
If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
|
If no default chat template is available, we will first look for a built-in fallback in [vllm/transformers_utils/chat_templates/registry.py](../../vllm/transformers_utils/chat_templates/registry.py).
|
||||||
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
|
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
|
||||||
|
|
||||||
For certain models, we provide alternative chat templates inside <gh-dir:examples>.
|
For certain models, we provide alternative chat templates inside [examples](../../examples).
|
||||||
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec_phi3v.jinja> which is different from the default one for Phi-3-Vision.
|
For example, VLM2Vec uses [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.
|
||||||
|
|
||||||
### Image Inputs
|
### Image Inputs
|
||||||
|
|
||||||
@ -524,7 +539,7 @@ Then, you can use the OpenAI client as follows:
|
|||||||
print("Chat completion output:", chat_response.choices[0].message.content)
|
print("Chat completion output:", chat_response.choices[0].message.content)
|
||||||
```
|
```
|
||||||
|
|
||||||
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
|
Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
|
||||||
|
|
||||||
!!! tip
|
!!! tip
|
||||||
Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
|
Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
|
||||||
@ -595,7 +610,7 @@ Then, you can use the OpenAI client as follows:
|
|||||||
print("Chat completion output from image url:", result)
|
print("Chat completion output from image url:", result)
|
||||||
```
|
```
|
||||||
|
|
||||||
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
|
Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
By default, the timeout for fetching videos through HTTP URL is `30` seconds.
|
By default, the timeout for fetching videos through HTTP URL is `30` seconds.
|
||||||
@ -719,7 +734,7 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
|
|||||||
print("Chat completion output from audio url:", result)
|
print("Chat completion output from audio url:", result)
|
||||||
```
|
```
|
||||||
|
|
||||||
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
|
Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
By default, the timeout for fetching audios through HTTP URL is `10` seconds.
|
By default, the timeout for fetching audios through HTTP URL is `10` seconds.
|
||||||
@ -732,7 +747,13 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
|
|||||||
### Embedding Inputs
|
### Embedding Inputs
|
||||||
|
|
||||||
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
|
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
|
||||||
pass a tensor of shape to the corresponding field of the multi-modal dictionary.
|
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
|
||||||
|
|
||||||
|
You must enable this feature via the `--enable-mm-embeds` flag in `vllm serve`.
|
||||||
|
|
||||||
|
!!! warning
|
||||||
|
    The vLLM engine may crash if embeddings with an incorrect shape are passed.
|
||||||
|
Only enable this flag for trusted users!
|
||||||
|
|
||||||
#### Image Embedding Inputs
|
#### Image Embedding Inputs
|
||||||
|
|
||||||
|
|||||||
@ -9,7 +9,7 @@ NixlConnector is a high-performance KV cache transfer connector for vLLM's disag
|
|||||||
Install the NIXL library: `uv pip install nixl`, as a quick start.
|
Install the NIXL library: `uv pip install nixl`, as a quick start.
|
||||||
|
|
||||||
- Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
|
- Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
|
||||||
- The required NIXL version is specified in [requirements/kv_connectors.txt](gh-file:requirements/kv_connectors.txt) and other relevant config files
|
- The required NIXL version is specified in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files
|
||||||
|
|
||||||
For non-CUDA platforms, please install NIXL with UCX built from source, as instructed below.
|
For non-CUDA platforms, please install NIXL with UCX built from source, as instructed below.
|
||||||
|
|
||||||
@ -170,6 +170,6 @@ Support use case: Prefill with 'HND' and decode with 'NHD' with experimental con
|
|||||||
|
|
||||||
Refer to these example scripts in the vLLM repository:
|
Refer to these example scripts in the vLLM repository:
|
||||||
|
|
||||||
- [run_accuracy_test.sh](gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
|
- [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
|
||||||
- [toy_proxy_server.py](gh-file:tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
|
- [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
|
||||||
- [test_accuracy.py](gh-file:tests/v1/kv_connector/nixl_integration/test_accuracy.py)
|
- [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py)
|
||||||
|
|||||||
@ -16,16 +16,20 @@ To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]:
|
|||||||
|
|
||||||
You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:
|
You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:
|
||||||
|
|
||||||
<gh-file:examples/offline_inference/prompt_embed_inference.py>
|
[examples/offline_inference/prompt_embed_inference.py](../../examples/offline_inference/prompt_embed_inference.py)
|
||||||
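As a rough sketch of the flow (the model name is a placeholder, and the offline `enable_prompt_embeds=True` engine argument is assumed to mirror the serve flag described below):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM

model_id = "meta-llama/Llama-3.2-1B-Instruct"  # placeholder model

# Compute prompt embeddings with the HF model's input embedding layer.
tokenizer = AutoTokenizer.from_pretrained(model_id)
hf_model = AutoModelForCausalLM.from_pretrained(model_id)
token_ids = tokenizer("Hello, my name is", return_tensors="pt").input_ids
prompt_embeds = hf_model.get_input_embeddings()(token_ids).squeeze(0)

# Pass the (seq_len, hidden_size) tensor via the 'prompt_embeds' field.
llm = LLM(model=model_id, enable_prompt_embeds=True)
outputs = llm.generate({"prompt_embeds": prompt_embeds})
```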
|
|
||||||
## Online Serving
|
## Online Serving
|
||||||
|
|
||||||
Our OpenAI-compatible server accepts prompt embeddings inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embeddings inputs are added via a new `'prompt_embeds'` key in the JSON package.
|
Our OpenAI-compatible server accepts prompt embeddings inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embeddings inputs are added via a new `'prompt_embeds'` key in the JSON package and are enabled by the `--enable-prompt-embeds` flag in `vllm serve`.
|
||||||
|
|
||||||
When a mixture of `'prompt_embeds'` and `'prompt'` inputs are provided in a single request, the prompt embeds are always returned first.
|
When a mixture of `'prompt_embeds'` and `'prompt'` inputs are provided in a single request, the prompt embeds are always returned first.
|
||||||
|
|
||||||
Prompt embeddings are passed in as base64 encoded torch tensors.
|
Prompt embeddings are passed in as base64 encoded torch tensors.
|
||||||
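A sketch of how a tensor might be serialized for the JSON payload is shown below. The `torch.save` + base64 encoding and the model name are assumptions modeled on the client example linked in this section, not an authoritative spec.

```python
import base64
import io

import torch

prompt_embeds = torch.randn(8, 2048)  # placeholder; normally taken from an embedding layer

# Serialize with torch.save, then base64-encode the bytes for the JSON body.
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
encoded_embeds = base64.b64encode(buffer.getvalue()).decode("utf-8")

payload = {
    "model": "meta-llama/Llama-3.2-1B-Instruct",  # placeholder model name
    "prompt_embeds": encoded_embeds,
    "max_tokens": 16,
}
# client.completions.create(**payload)  # OpenAI client pointed at the vLLM server
```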
|
|
||||||
|
!!! warning
|
||||||
|
    The vLLM engine may crash if embeddings with an incorrect shape are passed.
|
||||||
|
Only enable this flag for trusted users!
|
||||||
|
|
||||||
### Transformers Inputs via OpenAI Client
|
### Transformers Inputs via OpenAI Client
|
||||||
|
|
||||||
First, launch the OpenAI-compatible server:
|
First, launch the OpenAI-compatible server:
|
||||||
@ -37,4 +41,4 @@ vllm serve meta-llama/Llama-3.2-1B-Instruct --runner generate \
|
|||||||
|
|
||||||
Then, you can use the OpenAI client as follows:
|
Then, you can use the OpenAI client as follows:
|
||||||
|
|
||||||
<gh-file:examples/online_serving/prompt_embed_inference_with_openai_client.py>
|
[examples/online_serving/prompt_embed_inference_with_openai_client.py](../../examples/online_serving/prompt_embed_inference_with_openai_client.py)
|
||||||
|
|||||||
@ -64,4 +64,4 @@ th:not(:first-child) {
|
|||||||
!!! note
|
!!! note
|
||||||
This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
|
This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
|
||||||
|
|
||||||
For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
|
For the most up-to-date information on hardware support and quantization methods, please refer to [vllm/model_executor/layers/quantization](../../../vllm/model_executor/layers/quantization) or consult with the vLLM development team.
|
||||||
|
|||||||
@ -196,7 +196,7 @@ The reasoning content is also available when both tool calling and the reasoning
|
|||||||
print(f"Arguments: {tool_call.arguments}")
|
print(f"Arguments: {tool_call.arguments}")
|
||||||
```
|
```
|
||||||
|
|
||||||
For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>.
|
For more examples, please refer to [examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py](../../examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py).
|
||||||
|
|
||||||
## Limitations
|
## Limitations
|
||||||
|
|
||||||
@ -204,7 +204,7 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_
|
|||||||
|
|
||||||
## How to support a new reasoning model
|
## How to support a new reasoning model
|
||||||
|
|
||||||
You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
|
You can add a new `ReasoningParser` similar to [vllm/reasoning/deepseek_r1_reasoning_parser.py](../../vllm/reasoning/deepseek_r1_reasoning_parser.py).
|
||||||
|
|
||||||
??? code
|
??? code
|
||||||
|
|
||||||
@ -264,7 +264,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
|
|||||||
"""
|
"""
|
||||||
```
|
```
|
||||||
|
|
||||||
Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
|
Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in [vllm/reasoning/deepseek_r1_reasoning_parser.py](../../vllm/reasoning/deepseek_r1_reasoning_parser.py).
|
||||||
|
|
||||||
??? code
|
??? code
|
||||||
|
|
||||||
|
|||||||
@ -3,7 +3,7 @@
|
|||||||
!!! warning
|
!!! warning
|
||||||
Please note that speculative decoding in vLLM is not yet optimized and does
|
Please note that speculative decoding in vLLM is not yet optimized and does
|
||||||
not usually yield inter-token latency reductions for all prompt datasets or sampling parameters.
|
not usually yield inter-token latency reductions for all prompt datasets or sampling parameters.
|
||||||
The work to optimize it is ongoing and can be followed here: <gh-issue:4630>
|
The work to optimize it is ongoing and can be followed here: <https://github.com/vllm-project/vllm/issues/4630>
|
||||||
|
|
||||||
!!! warning
|
!!! warning
|
||||||
Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.
|
Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.
|
||||||
@ -183,7 +183,7 @@ A variety of speculative models of this type are available on HF hub:
|
|||||||
## Speculating using EAGLE based draft models
|
## Speculating using EAGLE based draft models
|
||||||
|
|
||||||
The following code configures vLLM to use speculative decoding where proposals are generated by
|
The following code configures vLLM to use speculative decoding where proposals are generated by
|
||||||
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py).
|
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](../../examples/offline_inference/spec_decode.py).
|
||||||
|
|
||||||
??? code
|
??? code
|
||||||
|
|
||||||
@ -218,8 +218,8 @@ an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https
|
|||||||
A few important things to consider when using the EAGLE based draft models:
|
A few important things to consider when using the EAGLE based draft models:
|
||||||
|
|
||||||
1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) should
|
1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) should
|
||||||
be able to be loaded and used directly by vLLM after <gh-pr:12304>.
|
be able to be loaded and used directly by vLLM after <https://github.com/vllm-project/vllm/pull/12304>.
|
||||||
If you are using a vLLM version before <gh-pr:12304>, please use the
|
If you are using a vLLM version before <https://github.com/vllm-project/vllm/pull/12304>, please use the
|
||||||
[script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model,
|
[script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model,
|
||||||
and specify `"model": "path/to/modified/eagle/model"` in `speculative_config`. If weight-loading problems still occur when using the latest version of vLLM, please leave a comment or raise an issue.
|
and specify `"model": "path/to/modified/eagle/model"` in `speculative_config`. If weight-loading problems still occur when using the latest version of vLLM, please leave a comment or raise an issue.
|
||||||
|
|
||||||
@ -229,7 +229,7 @@ A few important things to consider when using the EAGLE based draft models:
|
|||||||
|
|
||||||
3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
|
3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
|
||||||
reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
|
reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
|
||||||
investigation and tracked here: <gh-issue:9565>.
|
investigation and tracked here: <https://github.com/vllm-project/vllm/issues/9565>.
|
||||||
|
|
||||||
4. When using EAGLE-3 based draft model, option "method" must be set to "eagle3".
|
4. When using EAGLE-3 based draft model, option "method" must be set to "eagle3".
|
||||||
That is, to specify `"method": "eagle3"` in `speculative_config`.
|
That is, to specify `"method": "eagle3"` in `speculative_config`.
|
||||||
@ -267,7 +267,7 @@ speculative decoding, breaking down the guarantees into three key areas:
|
|||||||
> distribution. [View Test Code](https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252)
|
> distribution. [View Test Code](https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252)
|
||||||
> - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling
|
> - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling
|
||||||
> without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler,
|
> without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler,
|
||||||
> provides a lossless guarantee. Almost all of the tests in <gh-dir:tests/spec_decode/e2e>
|
> provides a lossless guarantee. Almost all of the tests in [tests/spec_decode/e2e](../../tests/spec_decode/e2e)
|
||||||
> verify this property using [this assertion implementation](https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291)
|
> verify this property using [this assertion implementation](https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291)
|
||||||
|
|
||||||
3. **vLLM Logprob Stability**
|
3. **vLLM Logprob Stability**
|
||||||
@@ -289,4 +289,4 @@ For mitigation strategies, please refer to the FAQ entry *Can the output of a pr
 - [A Hacker's Guide to Speculative Decoding in vLLM](https://www.youtube.com/watch?v=9wNAgpX6z_4)
 - [What is Lookahead Scheduling in vLLM?](https://docs.google.com/document/d/1Z9TvqzzBPnh5WHcRwjvK2UEeFeq5zMZb5mFE8jR0HCs/edit#heading=h.1fjfb0donq5a)
 - [Information on batch expansion](https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8)
-- [Dynamic speculative decoding](gh-issue:4565)
+- [Dynamic speculative decoding](https://github.com/vllm-project/vllm/issues/4565)
@@ -298,7 +298,7 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa
 Answer: x = -29/8
 ```

-An example of using `structural_tag` can be found here: <gh-file:examples/online_serving/structured_outputs>
+An example of using `structural_tag` can be found here: [examples/online_serving/structured_outputs](../../examples/online_serving/structured_outputs)

 ## Offline Inference

@@ -151,9 +151,9 @@ Known issues:
 much shorter than what vLLM generates. Since an exception is thrown when this condition
 is not met, the following additional chat templates are provided:

-* <gh-file:examples/tool_chat_template_mistral.jinja> - this is the "official" Mistral chat template, but tweaked so that
+* [examples/tool_chat_template_mistral.jinja](../../examples/tool_chat_template_mistral.jinja) - this is the "official" Mistral chat template, but tweaked so that
 it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated to the last 9 digits)
-* <gh-file:examples/tool_chat_template_mistral_parallel.jinja> - this is a "better" version that adds a tool-use system prompt
+* [examples/tool_chat_template_mistral_parallel.jinja](../../examples/tool_chat_template_mistral_parallel.jinja) - this is a "better" version that adds a tool-use system prompt
 when tools are provided, that results in much better reliability when working with parallel tool calling.

 Recommended flags:
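To make the recommended flags for the Mistral templates above concrete, a launch command could look like the sketch below; the model name is an assumed example, and the parallel template is chosen because of the reliability note above.

```bash
# Illustrative sketch: any Mistral tool-calling checkpoint can be substituted.
vllm serve mistralai/Mistral-7B-Instruct-v0.3 \
    --enable-auto-tool-choice \
    --tool-call-parser mistral \
    --chat-template examples/tool_chat_template_mistral_parallel.jinja
```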
@@ -187,16 +187,16 @@ Known issues:

 VLLM provides two JSON-based chat templates for Llama 3.1 and 3.2:

-* <gh-file:examples/tool_chat_template_llama3.1_json.jinja> - this is the "official" chat template for the Llama 3.1
+* [examples/tool_chat_template_llama3.1_json.jinja](../../examples/tool_chat_template_llama3.1_json.jinja) - this is the "official" chat template for the Llama 3.1
 models, but tweaked so that it works better with vLLM.
-* <gh-file:examples/tool_chat_template_llama3.2_json.jinja> - this extends upon the Llama 3.1 chat template by adding support for
+* [examples/tool_chat_template_llama3.2_json.jinja](../../examples/tool_chat_template_llama3.2_json.jinja) - this extends upon the Llama 3.1 chat template by adding support for
 images.

 Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}`

 VLLM also provides a pythonic and JSON-based chat template for Llama 4, but pythonic tool calling is recommended:

-* <gh-file:examples/tool_chat_template_llama4_pythonic.jinja> - this is based on the [official chat template](https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/) for the Llama 4 models.
+* [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja) - this is based on the [official chat template](https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/) for the Llama 4 models.

 For Llama 4 model, use `--tool-call-parser llama4_pythonic --chat-template examples/tool_chat_template_llama4_pythonic.jinja`.

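Putting the Llama 4 guidance above together, a serving command might look like this sketch; the checkpoint is an assumed example taken from the supported-model list later in this guide.

```bash
# Illustrative sketch: combines the recommended Llama 4 pythonic parser and chat template.
vllm serve meta-llama/Llama-4-Scout-17B-16E-Instruct \
    --enable-auto-tool-choice \
    --tool-call-parser llama4_pythonic \
    --chat-template examples/tool_chat_template_llama4_pythonic.jinja
```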
@@ -212,7 +212,7 @@ Supported models:

 Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja`

-<gh-file:examples/tool_chat_template_granite.jinja>: this is a modified chat template from the original on Hugging Face. Parallel function calls are supported.
+[examples/tool_chat_template_granite.jinja](../../examples/tool_chat_template_granite.jinja): this is a modified chat template from the original on Hugging Face. Parallel function calls are supported.

 * `ibm-granite/granite-3.1-8b-instruct`

@@ -224,7 +224,7 @@ Supported models:

 Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja`

-<gh-file:examples/tool_chat_template_granite_20b_fc.jinja>: this is a modified chat template from the original on Hugging Face, which is not vLLM-compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported.
+[examples/tool_chat_template_granite_20b_fc.jinja](../../examples/tool_chat_template_granite_20b_fc.jinja): this is a modified chat template from the original on Hugging Face, which is not vLLM-compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported.

 ### InternLM Models (`internlm`)

@@ -282,8 +282,8 @@ Flags: `--tool-call-parser hermes`

 Supported models:

-* `MiniMaxAi/MiniMax-M1-40k` (use with <gh-file:examples/tool_chat_template_minimax_m1.jinja>)
-* `MiniMaxAi/MiniMax-M1-80k` (use with <gh-file:examples/tool_chat_template_minimax_m1.jinja>)
+* `MiniMaxAi/MiniMax-M1-40k` (use with [examples/tool_chat_template_minimax_m1.jinja](../../examples/tool_chat_template_minimax_m1.jinja))
+* `MiniMaxAi/MiniMax-M1-80k` (use with [examples/tool_chat_template_minimax_m1.jinja](../../examples/tool_chat_template_minimax_m1.jinja))

 Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_minimax_m1.jinja`

@@ -291,8 +291,8 @@ Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_m

 Supported models:

-* `deepseek-ai/DeepSeek-V3-0324` (use with <gh-file:examples/tool_chat_template_deepseekv3.jinja>)
-* `deepseek-ai/DeepSeek-R1-0528` (use with <gh-file:examples/tool_chat_template_deepseekr1.jinja>)
+* `deepseek-ai/DeepSeek-V3-0324` (use with [examples/tool_chat_template_deepseekv3.jinja](../../examples/tool_chat_template_deepseekv3.jinja))
+* `deepseek-ai/DeepSeek-R1-0528` (use with [examples/tool_chat_template_deepseekr1.jinja](../../examples/tool_chat_template_deepseekr1.jinja))

 Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}`

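Substituting the V3 checkpoint and template listed above into those flags gives roughly the command below; parallelism and other deployment details are omitted and depend on your hardware.

```bash
# Illustrative sketch: DeepSeek-V3 class models need a multi-GPU deployment in practice.
vllm serve deepseek-ai/DeepSeek-V3-0324 \
    --enable-auto-tool-choice \
    --tool-call-parser deepseek_v3 \
    --chat-template examples/tool_chat_template_deepseekv3.jinja
```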
@@ -300,7 +300,7 @@ Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}`

 Supported models:

-* `deepseek-ai/DeepSeek-V3.1` (use with <gh-file:examples/tool_chat_template_deepseekv31.jinja>)
+* `deepseek-ai/DeepSeek-V3.1` (use with [examples/tool_chat_template_deepseekv31.jinja](../../examples/tool_chat_template_deepseekv31.jinja))

 Flags: `--tool-call-parser deepseek_v31 --chat-template {see_above}`

@@ -379,12 +379,12 @@ Limitations:

 Example supported models:

-* `meta-llama/Llama-3.2-1B-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama3.2_pythonic.jinja>)
-* `meta-llama/Llama-3.2-3B-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama3.2_pythonic.jinja>)
-* `Team-ACE/ToolACE-8B` (use with <gh-file:examples/tool_chat_template_toolace.jinja>)
-* `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with <gh-file:examples/tool_chat_template_toolace.jinja>)
-* `meta-llama/Llama-4-Scout-17B-16E-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama4_pythonic.jinja>)
-* `meta-llama/Llama-4-Maverick-17B-128E-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama4_pythonic.jinja>)
+* `meta-llama/Llama-3.2-1B-Instruct` ⚠️ (use with [examples/tool_chat_template_llama3.2_pythonic.jinja](../../examples/tool_chat_template_llama3.2_pythonic.jinja))
+* `meta-llama/Llama-3.2-3B-Instruct` ⚠️ (use with [examples/tool_chat_template_llama3.2_pythonic.jinja](../../examples/tool_chat_template_llama3.2_pythonic.jinja))
+* `Team-ACE/ToolACE-8B` (use with [examples/tool_chat_template_toolace.jinja](../../examples/tool_chat_template_toolace.jinja))
+* `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with [examples/tool_chat_template_toolace.jinja](../../examples/tool_chat_template_toolace.jinja))
+* `meta-llama/Llama-4-Scout-17B-16E-Instruct` ⚠️ (use with [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja))
+* `meta-llama/Llama-4-Maverick-17B-128E-Instruct` ⚠️ (use with [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja))

 Flags: `--tool-call-parser pythonic --chat-template {see_above}`

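Whichever parser is configured, clients pass tool definitions through the standard OpenAI Chat Completions API. A minimal request sketch is shown below; the endpoint, model name, and `get_weather` function are illustrative assumptions.

```bash
# Illustrative sketch: assumes a local vLLM server started with one of the
# tool-calling configurations described in this guide.
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Llama-3.2-3B-Instruct",
    "messages": [{"role": "user", "content": "What is the weather in Berlin?"}],
    "tools": [{
      "type": "function",
      "function": {
        "name": "get_weather",
        "description": "Look up the current weather for a city",
        "parameters": {
          "type": "object",
          "properties": {"city": {"type": "string"}},
          "required": ["city"]
        }
      }
    }],
    "tool_choice": "auto"
  }'
```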
@@ -393,7 +393,7 @@ Flags: `--tool-call-parser pythonic --chat-template {see_above}`

 ## How to Write a Tool Parser Plugin

-A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in <gh-file:vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py>.
+A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in [vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py](../../vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py).

 Here is a summary of a plugin file:

@@ -4,19 +4,19 @@ vLLM is a Python library that supports the following CPU variants. Select your C

 === "Intel/AMD x86"

-    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:installation"
+    --8<-- "docs/getting_started/installation/cpu.x86.inc.md:installation"

 === "ARM AArch64"

-    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:installation"
+    --8<-- "docs/getting_started/installation/cpu.arm.inc.md:installation"

 === "Apple silicon"

-    --8<-- "docs/getting_started/installation/cpu/apple.inc.md:installation"
+    --8<-- "docs/getting_started/installation/cpu.apple.inc.md:installation"

 === "IBM Z (S390X)"

-    --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:installation"
+    --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:installation"

 ## Requirements

@@ -24,19 +24,19 @@ vLLM is a Python library that supports the following CPU variants. Select your C

 === "Intel/AMD x86"

-    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/cpu.x86.inc.md:requirements"

 === "ARM AArch64"

-    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/cpu.arm.inc.md:requirements"

 === "Apple silicon"

-    --8<-- "docs/getting_started/installation/cpu/apple.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/cpu.apple.inc.md:requirements"

 === "IBM Z (S390X)"

-    --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:requirements"

 ## Set up using Python

@@ -52,19 +52,19 @@ Currently, there are no pre-built CPU wheels.

 === "Intel/AMD x86"

-    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-wheel-from-source"

 === "ARM AArch64"

-    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/cpu.arm.inc.md:build-wheel-from-source"

 === "Apple silicon"

-    --8<-- "docs/getting_started/installation/cpu/apple.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/cpu.apple.inc.md:build-wheel-from-source"

 === "IBM Z (s390x)"

-    --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:build-wheel-from-source"

 ## Set up using Docker

@@ -72,24 +72,24 @@ Currently, there are no pre-built CPU wheels.

 === "Intel/AMD x86"

-    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:pre-built-images"
+    --8<-- "docs/getting_started/installation/cpu.x86.inc.md:pre-built-images"

 ### Build image from source

 === "Intel/AMD x86"

-    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-image-from-source"

 === "ARM AArch64"

-    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/cpu.arm.inc.md:build-image-from-source"

 === "Apple silicon"

-    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/cpu.arm.inc.md:build-image-from-source"

 === "IBM Z (S390X)"
-    --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:build-image-from-source"

 ## Related runtime environment variables

@@ -153,11 +153,11 @@ VLLM_TARGET_DEVICE="tpu" python -m pip install -e .

 ### Pre-built images

-See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`.
+See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`.

 ### Build image from source

-You can use <gh-file:docker/Dockerfile.tpu> to build a Docker image with TPU support.
+You can use [docker/Dockerfile.tpu](../../../docker/Dockerfile.tpu) to build a Docker image with TPU support.

 ```bash
 docker build -f docker/Dockerfile.tpu -t vllm-tpu .
@@ -11,11 +11,11 @@ vLLM contains pre-compiled C++ and CUDA (12.8) binaries.
 # --8<-- [start:set-up-using-python]

 !!! note
-    PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
+    PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <https://github.com/vllm-project/vllm/issues/8420> for more details.

 In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.

-Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below][build-from-source] for more details.
+Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-wheel-from-source) for more details.

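For example, a fresh environment as recommended above can be created and populated along these lines; the Python version and the use of `uv` are assumptions, and any virtual-environment tool works.

```bash
# Illustrative sketch: create an isolated environment, then install vLLM into it.
uv venv --python 3.12
source .venv/bin/activate
uv pip install vllm
```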
 # --8<-- [end:set-up-using-python]
 # --8<-- [start:pre-built-wheels]
@@ -44,8 +44,6 @@ export CUDA_VERSION=118 # or 126
 uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
 ```

-[](){ #install-the-latest-code }
-
 #### Install the latest code

 LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`.
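One way to pick up the most recent of these wheels is via the nightly index, as in the sketch below; the per-commit variant of this index appears later in this file, and the exact index URL is an assumption to verify against the current documentation.

```bash
# Illustrative sketch: installs the latest nightly vLLM wheel.
pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
```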
@@ -128,11 +126,11 @@ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vll
 uv pip install --editable .
 ```

-You can find more information about vLLM's wheels in [install-the-latest-code][install-the-latest-code].
+You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code).

 !!! note
     There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
-    It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [install-the-latest-code][install-the-latest-code] for instructions on how to install a specified wheel.
+    It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel.

 #### Full build (with compilation)

@@ -250,7 +248,7 @@ uv pip install -e .
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]

-See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image.
+See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image.

 Another way to access the latest code is to use the docker images:

@@ -266,11 +264,11 @@ The latest code can contain bugs and may not be stable. Please use it with cauti
 # --8<-- [end:pre-built-images]
 # --8<-- [start:build-image-from-source]

-See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image.
+See [Building vLLM's Docker Image from Source](../../deployment/docker.md#building-vllms-docker-image-from-source) for instructions on building the Docker image.

 # --8<-- [end:build-image-from-source]
 # --8<-- [start:supported-features]

-See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information.
+See [Feature x Hardware](../../features/README.md#feature-x-hardware) compatibility matrix for feature support information.

 # --8<-- [end:supported-features]
@@ -4,15 +4,15 @@ vLLM is a Python library that supports the following GPU variants. Select your G

 === "NVIDIA CUDA"

-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:installation"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:installation"

 === "AMD ROCm"

-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:installation"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:installation"

 === "Intel XPU"

-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:installation"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:installation"

 ## Requirements

@@ -24,15 +24,15 @@ vLLM is a Python library that supports the following GPU variants. Select your G

 === "NVIDIA CUDA"

-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:requirements"

 === "AMD ROCm"

-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:requirements"

 === "Intel XPU"

-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:requirements"

 ## Set up using Python

@@ -42,45 +42,43 @@ vLLM is a Python library that supports the following GPU variants. Select your G

 === "NVIDIA CUDA"

-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:set-up-using-python"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:set-up-using-python"

 === "AMD ROCm"

-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:set-up-using-python"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:set-up-using-python"

 === "Intel XPU"

-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:set-up-using-python"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:set-up-using-python"

 ### Pre-built wheels

 === "NVIDIA CUDA"

-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:pre-built-wheels"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:pre-built-wheels"

 === "AMD ROCm"

-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:pre-built-wheels"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:pre-built-wheels"

 === "Intel XPU"

-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-wheels"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:pre-built-wheels"

-[](){ #build-from-source }
-
 ### Build wheel from source

 === "NVIDIA CUDA"

-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:build-wheel-from-source"

 === "AMD ROCm"

-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:build-wheel-from-source"

 === "Intel XPU"

-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:build-wheel-from-source"

 ## Set up using Docker

@@ -88,40 +86,40 @@ vLLM is a Python library that supports the following GPU variants. Select your G

 === "NVIDIA CUDA"

-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:pre-built-images"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:pre-built-images"

 === "AMD ROCm"

-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:pre-built-images"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:pre-built-images"

 === "Intel XPU"

-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-images"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:pre-built-images"

 ### Build image from source

 === "NVIDIA CUDA"

-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:build-image-from-source"

 === "AMD ROCm"

-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:build-image-from-source"

 === "Intel XPU"

-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:build-image-from-source"

 ## Supported features

 === "NVIDIA CUDA"

-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:supported-features"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:supported-features"

 === "AMD ROCm"

-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:supported-features"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:supported-features"

 === "Intel XPU"

-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:supported-features"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:supported-features"
@@ -146,7 +146,7 @@ Building the Docker image from source is the recommended way to use vLLM with RO

 #### (Optional) Build an image with ROCm software stack

-Build a docker image from <gh-file:docker/Dockerfile.rocm_base> which setup ROCm software stack needed by the vLLM.
+Build a docker image from [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base) which setup ROCm software stack needed by the vLLM.
 **This step is optional as this rocm_base image is usually prebuilt and store at [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under tag `rocm/vllm-dev:base` to speed up user experience.**
 If you choose to build this rocm_base image yourself, the steps are as follows.

@@ -170,7 +170,7 @@ DOCKER_BUILDKIT=1 docker build \

 #### Build an image with vLLM

-First, build a docker image from <gh-file:docker/Dockerfile.rocm> and launch a docker container from the image.
+First, build a docker image from [docker/Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm) and launch a docker container from the image.
 It is important that the user kicks off the docker build using buildkit. Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:

 ```bash
@@ -181,10 +181,10 @@ It is important that the user kicks off the docker build using buildkit. Either
 }
 ```

-<gh-file:docker/Dockerfile.rocm> uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
+[docker/Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm) uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
 It provides flexibility to customize the build of docker image using the following arguments:

-- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using <gh-file:docker/Dockerfile.rocm_base>
+- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base)
 - `ARG_PYTORCH_ROCM_ARCH`: Allows to override the gfx architecture values from the base docker image

 Their values can be passed in when running `docker build` with `--build-arg` options.
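For example, restricting the image build to a single gfx architecture could look like the sketch below; the architecture value and image tag are assumptions to adapt to your GPUs.

```bash
# Illustrative sketch: build the ROCm image for gfx942 (MI300-class) GPUs only.
DOCKER_BUILDKIT=1 docker build \
    --build-arg BASE_IMAGE=rocm/vllm-dev:base \
    --build-arg ARG_PYTORCH_ROCM_ARCH=gfx942 \
    -f docker/Dockerfile.rocm \
    -t vllm-rocm .
```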
@@ -217,6 +217,6 @@ Where the `<path/to/model>` is the location where the model is stored, for examp
 # --8<-- [end:build-image-from-source]
 # --8<-- [start:supported-features]

-See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information.
+See [Feature x Hardware](../../features/README.md#feature-x-hardware) compatibility matrix for feature support information.

 # --8<-- [end:supported-features]
Some files were not shown because too many files have changed in this diff.