Compare commits

1 commit: mergify/ho...benchmark_

| Author | SHA1 | Date |
|---|---|---|
|  | 221118dc85 |  |

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import sys

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import os

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path
 
 import pytest

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml

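To make the docstring above concrete: the harness run it describes is config-driven, and the comparison against the offline HF baseline boils down to a few steps. The sketch below is illustrative only; the YAML field names (`model_name`, `tasks`, `metrics`, `value`) and the 0.05 tolerance are assumptions, not the actual schema of `configs/$MODEL.yaml`.

```python
# Hedged sketch of a config-driven lm-eval comparison. Field names and the
# tolerance are assumptions; lm_eval.simple_evaluate, yaml, and numpy are
# the only real APIs relied on here.
import lm_eval
import numpy
import yaml

RTOL = 0.05  # assumed tolerance against the offline HF baseline


def check_model(config_path: str) -> None:
    with open(config_path) as f:
        cfg = yaml.safe_load(f)

    # Run the harness against a vLLM-backed model.
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=f"pretrained={cfg['model_name']}",
        tasks=[t["name"] for t in cfg["tasks"]],
        num_fewshot=cfg.get("num_fewshot", 5),
    )

    # Compare each measured metric with the baseline stored in the config.
    for task in cfg["tasks"]:
        for metric in task["metrics"]:
            measured = results["results"][task["name"]][metric["name"]]
            assert numpy.isclose(metric["value"], measured, rtol=RTOL)
```
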
@@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do
 
 ### Visualizing the results
 
-The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait till the benchmark finish running.
 The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.

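The "formatting descriptions.md with real benchmarking results" step described in the hunk above is, at its core, template substitution: collect the per-test JSON files, render them as a markdown table, and splice that table into the description document. A minimal sketch follows; the file paths and the `{serving_tests_markdown_table}` placeholder are illustrative assumptions, not the actual contents of `convert-results-json-to-markdown.py`.

```python
# Hedged sketch: JSON benchmark results -> markdown table -> filled template.
# All names below are assumptions for illustration.
import json
from pathlib import Path

import pandas as pd

# Gather every result file the benchmark run produced.
results = [json.loads(p.read_text()) for p in Path("results").glob("*.json")]

# One row per benchmark; pandas renders the markdown table (needs `tabulate`).
table = pd.DataFrame(results).to_markdown(index=False)

# Substitute the table into the description template and write the report.
template = Path("performance-benchmarks-descriptions.md").read_text()
Path("benchmark-results.md").write_text(
    template.format(serving_tests_markdown_table=table)
)
```
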
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 import os

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import json

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from lmdeploy.serve.openai.api_client import APIClient
 

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import datetime
 import json

@@ -6,6 +6,11 @@
 
 [tool.ruff]
 line-length = 88
+exclude = [
+    # External file, leaving license intact
+    "examples/other/fp8/quantizer/quantize.py",
+    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
+]
 
 [tool.ruff.lint.per-file-ignores]
 "vllm/third_party/**" = ["ALL"]

@@ -1,6 +1,5 @@
 steps:
   - label: "Build wheel - CUDA 12.8"
-    id: build-wheel-cuda-12-8
     agents:
       queue: cpu_queue_postmerge
     commands:

@@ -12,11 +11,10 @@ steps:
       DOCKER_BUILDKIT: "1"
 
   - label: "Build wheel - CUDA 12.6"
-    id: build-wheel-cuda-12-6
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"

@@ -30,11 +28,10 @@ steps:
 
   - label: "Build wheel - CUDA 11.8"
     # depends_on: block-build-cu118-wheel
-    id: build-wheel-cuda-11-8
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"

@@ -47,7 +44,6 @@ steps:
 
   - label: "Build release image"
     depends_on: block-release-image-build
-    id: build-release-image
     agents:
       queue: cpu_queue_postmerge
     commands:

@@ -55,18 +51,6 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
-  - label: "Annotate release workflow"
-    depends_on:
-      - build-release-image
-      - build-wheel-cuda-12-8
-      - build-wheel-cuda-12-6
-      - build-wheel-cuda-11-8
-    id: annotate-release-workflow
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "bash .buildkite/scripts/annotate-release.sh"
-
   - label: "Build and publish TPU release image"
     depends_on: ~
     if: build.env("NIGHTLY") == "1"

@@ -80,16 +64,15 @@ steps:
       - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
     plugins:
       - docker-login#v3.0.0:
-          username: vllmbot
+          username: vllm
           password-env: DOCKERHUB_TOKEN
     env:
       DOCKER_BUILDKIT: "1"
 
   - input: "Provide Release version here"
-    id: input-release-version
     fields:
       - text: "What is the release version?"
-        key: release-version
+        key: "release-version"
 
   - block: "Build CPU release image"
     key: block-cpu-release-image-build

@@ -1,31 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-# Get release version and strip leading 'v' if present
-RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
-
-if [ -z "$RELEASE_VERSION" ]; then
-  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
-  exit 1
-fi
-
-buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
-To download the wheel:
-\`\`\`
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
-\`\`\`
-
-To download and upload the image:
-
-\`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
-docker tag vllm/vllm-openai vllm/vllm-openai:latest
-docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
-docker push vllm/vllm-openai:latest
-docker push vllm/vllm-openai:v${RELEASE_VERSION}
-\`\`\`
-EOF

@@ -1,17 +0,0 @@
-#!/bin/bash
-# Usage: ./ci_clean_log.sh ci.log
-# This script strips timestamps and color codes from CI log files.
-
-# Check if argument is given
-if [ $# -lt 1 ]; then
-  echo "Usage: $0 ci.log"
-  exit 1
-fi
-
-INPUT_FILE="$1"
-
-# Strip timestamps
-sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
-
-# Strip colorization
-sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"

@@ -94,10 +94,6 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
 
-if [[ $commands == *"pytest -v -s lora"* ]]; then
-  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
-fi
-
 #ignore certain kernels tests
 if [[ $commands == *" kernels/core"* ]]; then
   commands="${commands} \

@@ -7,7 +7,6 @@ set -ex
 # Setup cleanup
 remove_docker_container() {
   if [[ -n "$container_id" ]]; then
-    podman stop --all -t0
     podman rm -f "$container_id" || true
   fi
   podman system prune -f

@@ -38,7 +37,7 @@ function cpu_tests() {
   pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
   pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
   pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-  pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
+  pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.

@@ -6,67 +6,72 @@ set -ex
 
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
-OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
-export CMAKE_BUILD_PARALLEL_LEVEL=32
-
 # Setup cleanup
 remove_docker_container() {
   set -e;
-  docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
+  docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
+  docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
 }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
+  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
+  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
 
 function cpu_tests() {
   set -e
   export NUMA_NODE=$2
+  export BUILDKITE_BUILD_NUMBER=$3
 
   # offline inference
-  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
     set -e
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
   # Run basic model test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-    pytest -v -s tests/models/language/generation -m cpu_model
-    pytest -v -s tests/models/language/pooling -m cpu_model
-    pytest -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_mllama.py -m cpu_model"
+    pytest -v -s tests/kernels/test_cache.py -m cpu_model
+    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
+    pytest -v -s tests/models/decoder_only/language -m cpu_model
+    pytest -v -s tests/models/embedding/language -m cpu_model
+    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
+    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 
   # Run compressed-tensor test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 
   # Run AWQ test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
-    VLLM_USE_V1=0 pytest -s -v \
+    pytest -s -v \
      tests/quantization/test_ipex_quant.py"
 
   # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v -k cpu_model \
      tests/basic_correctness/test_chunked_prefill.py"
 
   # online serving
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
+    export VLLM_CPU_KVCACHE_SPACE=10
+    export VLLM_CPU_OMP_THREADS_BIND=$1
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
    python3 benchmarks/benchmark_serving.py \

@@ -78,7 +83,7 @@ function cpu_tests() {
      --tokenizer facebook/opt-125m"
 
   # Run multi-lora tests
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
      tests/lora/test_qwen2vl.py"

@@ -86,4 +91,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"

@@ -10,17 +10,15 @@ docker build -t hpu-test-env -f docker/Dockerfile.hpu .
 # Setup cleanup
 # certain versions of HPU software stack have a bug that can
 # override the exit code of the script, so we need to use
-# separate remove_docker_containers and remove_docker_containers_and_exit
+# separate remove_docker_container and remove_docker_container_and_exit
 # functions, while other platforms only need one remove_docker_container
 # function.
 EXITCODE=1
-remove_docker_containers() { docker rm -f hpu-test || true; docker rm -f hpu-test-tp2 || true; }
-remove_docker_containers_and_exit() { remove_docker_containers; exit $EXITCODE; }
-trap remove_docker_containers_and_exit EXIT
-remove_docker_containers
+remove_docker_container() { docker rm -f hpu-test || true; }
+remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
+trap remove_docker_container_and_exit EXIT
+remove_docker_container
 
 # Run the image and launch offline inference
 docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
-docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2
-
 EXITCODE=$?

@@ -11,14 +11,13 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
-HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
 
 NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
 mkdir -p "${NEURON_COMPILE_CACHE_URL}"
 NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 
 # Try building the docker image
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
+aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
 
 # prune old image and containers to save disk space, and only once a day
 # by using a timestamp file in tmp.

@@ -48,16 +47,8 @@ trap remove_docker_container EXIT
 docker run --rm -it --device=/dev/neuron0 --network bridge \
   -v "${HF_CACHE}:${HF_MOUNT}" \
   -e "HF_HOME=${HF_MOUNT}" \
-  -e "HF_TOKEN=${HF_TOKEN}" \
   -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
   -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
   --name "${container_name}" \
   ${image_name} \
-  /bin/bash -c "
-    python3 /workspace/vllm/examples/offline_inference/neuron.py;
-    python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
-    for f in /workspace/vllm/tests/neuron/2_core/*.py; do
-      echo 'Running test file: '$f;
-      python3 -m pytest \$f -v --capture=tee-sys;
-    done
-    "
+  /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"

@@ -2,184 +2,102 @@
 
 set -xu
 
-
-remove_docker_container() {
-  docker rm -f tpu-test || true;
-  docker rm -f vllm-tpu || true;
-}
-
-trap remove_docker_container EXIT
-
-# Remove the container that might not be cleaned up in the previous run.
-remove_docker_container
-
 # Build the docker image.
 docker build -f docker/Dockerfile.tpu -t vllm-tpu .
 
 # Set up cleanup.
-cleanup_docker() {
-  # Get Docker's root directory
-  docker_root=$(docker info -f '{{.DockerRootDir}}')
-  if [ -z "$docker_root" ]; then
-    echo "Failed to determine Docker root directory."
-    exit 1
-  fi
-  echo "Docker root directory: $docker_root"
-  # Check disk usage of the filesystem where Docker's root directory is located
-  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
-  # Define the threshold
-  threshold=70
-  if [ "$disk_usage" -gt "$threshold" ]; then
-    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
-    # Remove dangling images (those that are not tagged and not used by any container)
-    docker image prune -f
-    # Remove unused volumes / force the system prune for old images as well.
-    docker volume prune -f && docker system prune --force --filter "until=72h" --all
-    echo "Docker images and volumes cleanup completed."
-  else
-    echo "Disk usage is below $threshold%. No cleanup needed."
-  fi
-}
-cleanup_docker
+remove_docker_container() { docker rm -f tpu-test || true; }
+trap remove_docker_container EXIT
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
 
 # For HF_TOKEN.
 source /etc/environment
+# Run a simple end-to-end example.
 docker run --privileged --net host --shm-size=16G -it \
   -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
-  vllm-tpu /bin/bash -c '
-set -e  # Exit immediately if a command exits with a non-zero status.
-set -u  # Treat unset variables as an error.
+  vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
+  && python3 -m pip install pytest pytest-asyncio tpu-info \
+  && python3 -m pip install lm_eval[api]==0.4.4 \
+  && export VLLM_XLA_CACHE_PATH= \
+  && export VLLM_USE_V1=1 \
+  && export VLLM_XLA_CHECK_RECOMPILATION=1 \
+  && echo HARDWARE \
+  && tpu-info \
+  && { \
+    echo TEST_0: Running test_perf.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
+    echo TEST_0_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_1: Running test_compilation.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
+    echo TEST_1_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_2: Running test_basic.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
+    echo TEST_2_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
+    python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
+    echo TEST_3_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_4: Running test_quantization_accuracy.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
+    echo TEST_4_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_5: Running examples/offline_inference/tpu.py; \
+    python3 /workspace/vllm/examples/offline_inference/tpu.py; \
+    echo TEST_5_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_6: Running test_tpu_model_runner.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
+    echo TEST_6_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_7: Running test_sampler.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
+    echo TEST_7_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_8: Running test_topk_topp_sampler.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
+    echo TEST_8_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_9: Running test_multimodal.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
+    echo TEST_9_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_10: Running test_pallas.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
+    echo TEST_10_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_11: Running test_struct_output_generate.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
+    echo TEST_11_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_12: Running test_moe_pallas.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
+    echo TEST_12_EXIT_CODE: \$?; \
+  } & \
+  # Disable the TPU LoRA tests until the feature is activated
+  # & { \
+  #   echo TEST_13: Running test_moe_pallas.py; \
+  #   python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
+  #   echo TEST_13_EXIT_CODE: \$?; \
+  # } & \
+  wait \
+  && echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
+  "
 
-echo "--- Starting script inside Docker container ---"
-
-# Create results directory
-RESULTS_DIR=$(mktemp -d)
-# If mktemp fails, set -e will cause the script to exit.
-echo "Results will be stored in: $RESULTS_DIR"
-
-# Install dependencies
-echo "--- Installing Python dependencies ---"
-python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
-  && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-  && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
-echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
-export VLLM_XLA_CHECK_RECOMPILATION=1
-export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"
-
-echo "--- Hardware Information ---"
-tpu-info
-echo "--- Starting Tests ---"
-set +e
-overall_script_exit_code=0
-
-# --- Test Definitions ---
-# If a test fails, this function will print logs and will not cause the main script to exit.
-run_test() {
-  local test_num=$1
-  local test_name=$2
-  local test_command=$3
-  local log_file="$RESULTS_DIR/test_${test_num}.log"
-  local actual_exit_code
-
-  echo "--- TEST_$test_num: Running $test_name ---"
-
-  # Execute the test command.
-  eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
-  actual_exit_code=$?
-
-  echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
-  echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
-
-  if [ "$actual_exit_code" -ne 0 ]; then
-    echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
-    echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
-    if [ -f "$log_file" ]; then
-      cat "$log_file" >&2
-    else
-      echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
-    fi
-    echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
-    return "$actual_exit_code" # Return the failure code
-  else
-    echo "TEST_$test_num ($test_name) PASSED."
-    return 0 # Return success
-  fi
-}
-
-# Helper function to call run_test and update the overall script exit code
-run_and_track_test() {
-  local test_num_arg="$1"
-  local test_name_arg="$2"
-  local test_command_arg="$3"
-
-  # Run the test
-  run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
-  local test_specific_exit_code=$?
-
-  # If the test failed, set the overall script exit code to 1
-  if [ "$test_specific_exit_code" -ne 0 ]; then
-    # No need for extra echo here, run_test already logged the failure.
-    overall_script_exit_code=1
-  fi
-}
-
-# --- Actual Test Execution ---
-run_and_track_test 0 "test_perf.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
-run_and_track_test 1 "test_compilation.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
-run_and_track_test 2 "test_basic.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
-run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
-run_and_track_test 4 "test_quantization_accuracy.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
-run_and_track_test 5 "examples/offline_inference/tpu.py" \
-  "python3 /workspace/vllm/examples/offline_inference/tpu.py"
-run_and_track_test 6 "test_tpu_model_runner.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
-run_and_track_test 7 "test_sampler.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
-run_and_track_test 8 "test_topk_topp_sampler.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
-run_and_track_test 9 "test_multimodal.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
-run_and_track_test 10 "test_pallas.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
-run_and_track_test 11 "test_struct_output_generate.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
-run_and_track_test 12 "test_moe_pallas.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
-run_and_track_test 13 "test_lora.py" \
-  "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
-run_and_track_test 14 "test_tpu_qkv_linear.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
-run_and_track_test 15 "test_spmd_model_weight_loading.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
-
-# After all tests have been attempted, exit with the overall status.
-if [ "$overall_script_exit_code" -ne 0 ]; then
-  echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
-else
-  echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
-fi
-exit "$overall_script_exit_code"
-' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
-
-# Capture the exit code of the docker run command
-DOCKER_RUN_EXIT_CODE=$?
-
-# The trap will run for cleanup.
-# Exit the main script with the Docker run command's exit code.
-if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
-  echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
-  exit "$DOCKER_RUN_EXIT_CODE"
-else
-  echo "Docker run command completed successfully."
-  exit 0
-fi
-
 # TODO: This test fails because it uses RANDOM_SEED sampling
-# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

@@ -1,18 +0,0 @@
-#!/bin/bash
-
-# Usage: ./rerun_test.sh path/to/test.py::test_name
-
-# Check if argument is given
-if [ $# -lt 1 ]; then
-  echo "Usage: $0 path/to/test.py::test_name"
-  echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
-  exit 1
-fi
-
-TEST=$1
-COUNT=1
-
-while pytest -sv "$TEST"; do
-  COUNT=$((COUNT + 1))
-  echo "RUN NUMBER ${COUNT}"
-done

@@ -1,24 +0,0 @@
-#!/bin/bash
-
-set -euo pipefail
-
-docker_root=$(docker info -f '{{.DockerRootDir}}')
-if [ -z "$docker_root" ]; then
-  echo "Failed to determine Docker root directory."
-  exit 1
-fi
-echo "Docker root directory: $docker_root"
-# Check disk usage of the filesystem where Docker's root directory is located
-disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
-# Define the threshold
-threshold=70
-if [ "$disk_usage" -gt "$threshold" ]; then
-  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
-  # Remove dangling images (those that are not tagged and not used by any container)
-  docker image prune -f
-  # Remove unused volumes / force the system prune for old images as well.
-  docker volume prune -f && docker system prune --force --filter "until=72h" --all
-  echo "Docker images and volumes cleanup completed."
-else
-  echo "Disk usage is below $threshold%. No cleanup needed."
-fi

@@ -1,14 +0,0 @@
-# Environment config
-TEST_NAME=llama8b
-CONTAINER_NAME=vllm-tpu
-
-# vllm config
-MODEL=meta-llama/Llama-3.1-8B-Instruct
-MAX_NUM_SEQS=512
-MAX_NUM_BATCHED_TOKENS=512
-TENSOR_PARALLEL_SIZE=1
-MAX_MODEL_LEN=2048
-DOWNLOAD_DIR=/mnt/disks/persist
-EXPECTED_THROUGHPUT=8.0
-INPUT_LEN=1800
-OUTPUT_LEN=128

@@ -1,102 +0,0 @@
-#!/bin/bash
-
-if [ ! -f "$1" ]; then
-  echo "Error: The env file '$1' does not exist."
-  exit 1 # Exit the script with a non-zero status to indicate an error
-fi
-
-ENV_FILE=$1
-
-# For testing on local vm, use `set -a` to export all variables
-source /etc/environment
-source $ENV_FILE
-
-remove_docker_container() {
-  docker rm -f tpu-test || true;
-  docker rm -f vllm-tpu || true;
-  docker rm -f $CONTAINER_NAME || true;
-}
-
-trap remove_docker_container EXIT
-
-# Remove the container that might not be cleaned up in the previous run.
-remove_docker_container
-
-# Build docker image.
-# TODO: build the image outside the script and share the image with other
-# tpu test if building time is too long.
-DOCKER_BUILDKIT=1 docker build \
-  --build-arg max_jobs=16 \
-  --build-arg USE_SCCACHE=1 \
-  --build-arg GIT_REPO_CHECK=0 \
-  --tag vllm/vllm-tpu-bm \
-  --progress plain -f docker/Dockerfile.tpu .
-
-LOG_ROOT=$(mktemp -d)
-# If mktemp fails, set -e will cause the script to exit.
-echo "Results will be stored in: $LOG_ROOT"
-
-if [ -z "$HF_TOKEN" ]; then
-  echo "Error: HF_TOKEN is not set or is empty."
-  exit 1
-fi
-
-# Make sure mounted disk or dir exists
-if [ ! -d "$DOWNLOAD_DIR" ]; then
-  echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
-  exit 1
-fi
-
-echo "Run model $MODEL"
-echo
-
-echo "starting docker...$CONTAINER_NAME"
-echo
-docker run \
-  -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
-  --env-file $ENV_FILE \
-  -e HF_TOKEN="$HF_TOKEN" \
-  -e TARGET_COMMIT=$BUILDKITE_COMMIT \
-  -e MODEL=$MODEL \
-  -e WORKSPACE=/workspace \
-  --name $CONTAINER_NAME \
-  -d \
-  --privileged \
-  --network host \
-  -v /dev/shm:/dev/shm \
-  vllm/vllm-tpu-bm tail -f /dev/null
-
-echo "run script..."
-echo
-docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
-
-echo "copy result back..."
-VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
-BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
-docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
-docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
-
-throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
-echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
-
-if [ "$BUILDKITE" = "true" ]; then
-  echo "Running inside Buildkite"
-  buildkite-agent artifact upload "$VLLM_LOG"
-  buildkite-agent artifact upload "$BM_LOG"
-else
-  echo "Not running inside Buildkite"
-fi
-
-#
-# compare the throughput with EXPECTED_THROUGHPUT
-# and assert meeting the expectation
-#
-if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
-  echo "Failed to get the throughput"
-  exit 1
-fi
-
-if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
-  echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
-  exit 1
-fi

@@ -1,94 +0,0 @@
-#!/bin/bash
-
-set -euo pipefail
-
-VLLM_LOG="$WORKSPACE/vllm_log.txt"
-BM_LOG="$WORKSPACE/bm_log.txt"
-
-if [ -n "$TARGET_COMMIT" ]; then
-  head_hash=$(git rev-parse HEAD)
-  if [ "$TARGET_COMMIT" != "$head_hash" ]; then
-    echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
-    exit 1
-  fi
-fi
-
-echo "model: $MODEL"
-echo
-
-#
-# create a log folder
-#
-mkdir "$WORKSPACE/log"
-
-# TODO: Move to image building.
-pip install pandas
-pip install datasets
-
-#
-# create sonnet_4x
-#
-echo "Create sonnet_4x.txt"
-echo "" > benchmarks/sonnet_4x.txt
-for _ in {1..4}
-do
-  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
-done
-
-#
-# start vllm service in backend
-#
-echo "lanching vllm..."
-echo "logging to $VLLM_LOG"
-echo
-
-VLLM_USE_V1=1 vllm serve $MODEL \
-  --seed 42 \
-  --disable-log-requests \
-  --max-num-seqs $MAX_NUM_SEQS \
-  --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
-  --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
-  --no-enable-prefix-caching \
-  --download_dir $DOWNLOAD_DIR \
-  --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
-
-
-echo "wait for 20 minutes.."
-echo
-# sleep 1200
-# wait for 10 minutes...
-for i in {1..120}; do
-  # TODO: detect other type of errors.
-  if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
-    echo "Detected RuntimeError, exiting."
-    exit 1
-  elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
-    echo "Application started"
-    break
-  else
-    echo "wait for 10 seconds..."
-    sleep 10
-  fi
-done
-
-#
-# run test
-#
-echo "run benchmark test..."
-echo "logging to $BM_LOG"
-echo
-python benchmarks/benchmark_serving.py \
-  --backend vllm \
-  --model $MODEL \
-  --dataset-name sonnet \
-  --dataset-path benchmarks/sonnet_4x.txt \
-  --sonnet-input-len $INPUT_LEN \
-  --sonnet-output-len $OUTPUT_LEN \
-  --ignore-eos > "$BM_LOG"
-
-echo "completed..."
-echo
-
-throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
-echo "throughput: $throughput"
-echo

@@ -33,13 +33,14 @@ steps:
 
 - label: Documentation Build # 2min
   mirror_hardwares: [amdexperimental]
-  working_dir: "/vllm-workspace/test_docs"
+  working_dir: "/vllm-workspace/test_docs/docs"
   fast_check: true
   no_gpu: True
   commands:
-  - pip install -r ../requirements/docs.txt
-  # TODO: add `--strict` once warnings in docstrings are fixed
-  - mkdocs build
+  - pip install -r ../../requirements/docs.txt
+  - SPHINXOPTS=\"-W\" make html
+  # Check API reference (if it fails, you may have missing mock imports)
+  - grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html
 
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
   mirror_hardwares: [amdexperimental]

@@ -58,7 +59,6 @@ steps:
   - pytest -v -s async_engine # AsyncLLMEngine
   - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
   - pytest -v -s test_inputs.py
-  - pytest -v -s test_outputs.py
   - pytest -v -s multimodal
   - pytest -v -s test_utils.py # Utils
   - pytest -v -s worker # Worker

@@ -125,7 +125,7 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
   - pytest -v -s entrypoints/test_chat_utils.py
   - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 

@@ -138,14 +138,12 @@ steps:
   - vllm/core/
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
-  - tests/distributed/test_events
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile/test_basic_correctness
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
   - tests/v1/test_async_llm_dp.py
-  - tests/v1/engine/test_engine_core_client.py
   commands:
   # test with tp=2 and external_dp=2
   - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py

@@ -155,11 +153,9 @@ steps:
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
-  - pytest -v -s distributed/test_events.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests

@@ -201,9 +197,8 @@ steps:
   - tests/test_sequence
   - tests/test_config
   - tests/test_logger
-  - tests/test_vllm_port
   commands:
-  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization
 

@@ -225,7 +220,6 @@ steps:
   - pytest -v -s v1/test_serial_utils.py
   - pytest -v -s v1/test_utils.py
   - pytest -v -s v1/test_oracle.py
-  - pytest -v -s v1/test_metrics_reader.py
   # TODO: accuracy does not match, whether setting
   # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
   - pytest -v -s v1/e2e

@ -250,7 +244,7 @@ steps:
|
|||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language_embedding.py --seed 0
|
- python3 offline_inference/vision_language_embedding.py --seed 0
|
||||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||||
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
- VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||||
- python3 offline_inference/encoder_decoder.py
|
- python3 offline_inference/encoder_decoder.py
|
||||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 offline_inference/basic/classify.py
|
||||||
@ -277,6 +271,17 @@ steps:
|
|||||||
- pytest -v -s samplers
|
- pytest -v -s samplers
|
||||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||||
|
|
||||||
|
- label: LogitsProcessor Test # 5min
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/model_executor/layers
|
||||||
|
- vllm/model_executor/guided_decoding
|
||||||
|
- tests/test_logits_processor
|
||||||
|
- tests/model_executor/test_guided_processors
|
||||||
|
commands:
|
||||||
|
- pytest -v -s test_logits_processor.py
|
||||||
|
- pytest -v -s model_executor/test_guided_processors.py
|
||||||
|
|
||||||
- label: Speculative decoding tests # 40min
|
- label: Speculative decoding tests # 40min
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -289,7 +294,7 @@ steps:
|
|||||||
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
|
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
|
||||||
|
|
||||||
- label: LoRA Test %N # 15min each
|
- label: LoRA Test %N # 15min each
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/lora
|
- vllm/lora
|
||||||
- tests/lora
|
- tests/lora
|
||||||
@ -307,7 +312,6 @@ steps:
|
|||||||
- pytest -v -s compile/test_fusion.py
|
- pytest -v -s compile/test_fusion.py
|
||||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
||||||
- pytest -v -s compile/test_sequence_parallelism.py
|
- pytest -v -s compile/test_sequence_parallelism.py
|
||||||
- pytest -v -s compile/test_async_tp.py
|
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Smoke Test # 9min
|
- label: PyTorch Fullgraph Smoke Test # 9min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
@ -320,7 +324,6 @@ steps:
|
|||||||
# these tests need to be separated, cannot combine
|
# these tests need to be separated, cannot combine
|
||||||
- pytest -v -s compile/piecewise/test_simple.py
|
- pytest -v -s compile/piecewise/test_simple.py
|
||||||
- pytest -v -s compile/piecewise/test_toy_llama.py
|
- pytest -v -s compile/piecewise/test_toy_llama.py
|
||||||
- pytest -v -s compile/piecewise/test_full_cudagraph.py
|
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Test # 18min
|
- label: PyTorch Fullgraph Test # 18min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
@ -383,23 +386,10 @@ steps:
|
|||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/model_loader
|
- vllm/model_executor/model_loader
|
||||||
- tests/tensorizer_loader
|
- tests/tensorizer_loader
|
||||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
|
||||||
commands:
|
commands:
|
||||||
- apt-get update && apt-get install -y curl libsodium23
|
- apt-get update && apt-get install -y curl libsodium23
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s tensorizer_loader
|
- pytest -v -s tensorizer_loader
|
||||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
|
||||||
|
|
||||||
- label: Model Executor Test
|
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
soft_fail: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/model_executor
|
|
||||||
- tests/model_executor
|
|
||||||
commands:
|
|
||||||
- apt-get update && apt-get install -y curl libsodium23
|
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
- pytest -v -s model_executor
|
|
||||||
|
|
||||||
- label: Benchmarks # 9min
|
- label: Benchmarks # 9min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
@ -424,9 +414,6 @@ steps:
|
|||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
- tests/quantization
|
- tests/quantization
|
||||||
commands:
|
commands:
|
||||||
# temporary install here since we need nightly, will move to requirements/test.in
|
|
||||||
# after torchao 0.12 release
|
|
||||||
- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
|
|
||||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||||
|
|
||||||
- label: LM Eval Small Models # 53min
|
- label: LM Eval Small Models # 53min
|
||||||
@ -480,7 +467,10 @@ steps:
|
|||||||
- pytest -v -s models/test_registry.py
|
- pytest -v -s models/test_registry.py
|
||||||
- pytest -v -s models/test_utils.py
|
- pytest -v -s models/test_utils.py
|
||||||
- pytest -v -s models/test_vision.py
|
- pytest -v -s models/test_vision.py
|
||||||
- pytest -v -s models/test_initialization.py
|
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
|
||||||
|
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
|
||||||
|
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
|
||||||
|
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
|
||||||
|
|
||||||
- label: Language Models Test (Standard)
|
- label: Language Models Test (Standard)
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@ -494,25 +484,16 @@ steps:
|
|||||||
- pip freeze | grep -E 'torch'
|
- pip freeze | grep -E 'torch'
|
||||||
- pytest -v -s models/language -m core_model
|
- pytest -v -s models/language -m core_model
|
||||||
|
|
||||||
- label: Language Models Test (Extended Generation) # 1hr20min
|
- label: Language Models Test (Extended)
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/language/generation
|
- tests/models/language
|
||||||
commands:
|
commands:
|
||||||
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
||||||
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||||
- pytest -v -s models/language/generation -m 'not core_model'
|
- pytest -v -s models/language -m 'not core_model'
|
||||||
|
|
||||||
- label: Language Models Test (Extended Pooling) # 36min
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
optional: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/models/language/pooling
|
|
||||||
commands:
|
|
||||||
- pytest -v -s models/language/pooling -m 'not core_model'
|
|
||||||
|
|
||||||
- label: Multi-Modal Models Test (Standard)
|
- label: Multi-Modal Models Test (Standard)
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@ -624,11 +605,9 @@ steps:
|
|||||||
- vllm/worker/model_runner.py
|
- vllm/worker/model_runner.py
|
||||||
- entrypoints/llm/test_collective_rpc.py
|
- entrypoints/llm/test_collective_rpc.py
|
||||||
- tests/v1/test_async_llm_dp.py
|
- tests/v1/test_async_llm_dp.py
|
||||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
|
||||||
- vllm/v1/engine/
|
- vllm/v1/engine/
|
||||||
commands:
|
commands:
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
|
||||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||||
- pytest -v -s ./compile/test_basic_correctness.py
|
- pytest -v -s ./compile/test_basic_correctness.py
|
||||||
- pytest -v -s ./compile/test_wrapper.py
|
- pytest -v -s ./compile/test_wrapper.py
|
||||||
|
|||||||
.github/CODEOWNERS (vendored, 20 changes)

@@ -10,17 +10,14 @@
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
-/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
+/vllm/model_executor/guided_decoding @mgoin @russellb
 /vllm/multimodal @DarkLight1337 @ywang96
 /vllm/vllm_flash_attn @LucasWilkinson
-/vllm/lora @jeejeelee
-/vllm/reasoning @aarnphm
-/vllm/entrypoints @aarnphm
 CMakeLists.txt @tlrmchlsmth
 
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm
+/vllm/v1/structured_output @mgoin @russellb
 
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
@@ -29,8 +26,8 @@ CMakeLists.txt @tlrmchlsmth
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
-/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
+/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
 /tests/kernels @tlrmchlsmth @WoosukKwon
 /tests/model_executor/test_guided_processors.py @mgoin @russellb
 /tests/models @DarkLight1337 @ywang96
@@ -40,11 +37,6 @@ CMakeLists.txt @tlrmchlsmth
 /tests/quantization @mgoin @robertgshaw2-redhat
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
-/tests/v1/structured_output @mgoin @russellb @aarnphm
+/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
+/tests/v1/structured_output @mgoin @russellb
 /tests/weight_loading @mgoin @youkaichao
-/tests/lora @jeejeelee
-
-# Docs
-/docs @hmellor
-mkdocs.yaml @hmellor
.github/ISSUE_TEMPLATE/400-bug-report.yml (vendored, 6 changes)

@@ -81,14 +81,14 @@ body:
       required: true
   - type: markdown
     attributes:
-      value: |
-        ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
+      value: >
+        ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
 
         - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
 
         - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
 
-        Thanks for reporting 🙏!
+        Thanks for contributing 🎉!
   - type: checkboxes
     id: askllm
     attributes:
.github/ISSUE_TEMPLATE/450-ci-failure.yml (vendored, 69 changes)

@@ -1,69 +0,0 @@
-name: 🧪 CI failure report
-description: Report a failing test.
-title: "[CI Failure]: "
-labels: ["ci-failure"]
-
-body:
-  - type: markdown
-    attributes:
-      value: >
-        #### Include the name of the failing Buildkite step and test file in the title.
-  - type: input
-    attributes:
-      label: Name of failing test
-      description: |
-        Paste in the fully-qualified name of the failing test from the logs.
-      placeholder: |
-        `path/to/test_file.py::test_name[params]`
-    validations:
-      required: true
-  - type: checkboxes
-    attributes:
-      label: Basic information
-      description: Select all items that apply to the failing test.
-      options:
-        - label: Flaky test
-        - label: Can reproduce locally
-        - label: Caused by external libraries (e.g. bug in `transformers`)
-  - type: textarea
-    attributes:
-      label: 🧪 Describe the failing test
-      description: |
-        Please provide a clear and concise description of the failing test.
-      placeholder: |
-        A clear and concise description of the failing test.
-
-        ```
-        The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
-        ```
-    validations:
-      required: true
-  - type: textarea
-    attributes:
-      label: 📝 History of failing test
-      description: |
-        Since when did the test start to fail?
-        You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
-
-        If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
-
-        - Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
-
-        - Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
-
-        - Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
-      placeholder: |
-        Approximate timeline and/or problematic PRs
-
-        A link to the Buildkite analytics of the failing test (if available)
-    validations:
-      required: true
-  - type: textarea
-    attributes:
-      label: CC List.
-      description: >
-        The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
-  - type: markdown
-    attributes:
-      value: >
-        Thanks for reporting 🙏!
.github/PULL_REQUEST_TEMPLATE.md (vendored, 18 changes)

@@ -1,18 +1,6 @@
-## Essential Elements of an Effective PR Description Checklist
-- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
-- [ ] The test plan, such as providing test command.
-- [ ] The test results, such as pasting the results comparison before and after, or e2e results
-- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
+FILL IN THE PR DESCRIPTION HERE
 
-PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
+FIX #xxxx (*link existing issues this PR will resolve*)
 
-## Purpose
-
-## Test Plan
-
-## Test Result
-
-## (Optional) Documentation Update
-
 <!--- pyml disable-next-line no-emphasis-as-heading -->
-**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
+**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions)
.github/mergify.yml (vendored, 345 changes)

@@ -1,181 +1,184 @@
 pull_request_rules:
-- name: label-llama
-  description: Automatically apply llama label
-  conditions:
-    - or:
-      - files~=^examples/.*llama.*\.py
-      - files~=^tests/.*llama.*\.py
-      - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
-      - files~=^vllm/model_executor/models/.*llama.*\.py
-      - files~=^vllm/transformers_utils/configs/.*llama.*\.py
-  actions:
-    label:
-      add:
-        - llama
 - name: label-documentation
   description: Automatically apply documentation label
   conditions:
     - or:
       - files~=^[^/]+\.md$
       - files~=^docs/
       - files~=^examples/
   actions:
     label:
       add:
         - documentation
+
 - name: label-ci-build
   description: Automatically apply ci/build label
   conditions:
     - or:
       - files~=^\.github/
       - files~=\.buildkite/
       - files~=^cmake/
       - files=CMakeLists.txt
       - files~=^docker/Dockerfile
       - files~=^requirements.*\.txt
       - files=setup.py
   actions:
     label:
       add:
         - ci/build
+
 - name: label-frontend
   description: Automatically apply frontend label
   conditions:
     - files~=^vllm/entrypoints/
   actions:
     label:
       add:
         - frontend
+
 - name: label-multi-modality
   description: Automatically apply multi-modality label
   conditions:
     - or:
       - files~=^vllm/multimodal/
       - files~=^tests/multimodal/
      - files~=^tests/models/multimodal/
       - files~=^tests/models/*/audio_language/
       - files~=^tests/models/*/vision_language/
       - files=tests/models/test_vision.py
   actions:
     label:
       add:
         - multi-modality
+
 - name: label-structured-output
   description: Automatically apply structured-output label
   conditions:
     - or:
       - files~=^benchmarks/structured_schemas/
       - files=benchmarks/benchmark_serving_structured_output.py
       - files=benchmarks/run_structured_output_benchmark.sh
-      - files=docs/features/structured_outputs.md
+      - files=docs/source/features/structured_outputs.md
       - files=examples/offline_inference/structured_outputs.py
       - files=examples/online_serving/openai_chat_completion_structured_outputs.py
       - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
       - files~=^vllm/model_executor/guided_decoding/
       - files=tests/model_executor/test_guided_processors.py
       - files=tests/entrypoints/llm/test_guided_generate.py
       - files~=^tests/v1/structured_output/
       - files=tests/v1/entrypoints/llm/test_guided_generate.py
       - files~=^vllm/v1/structured_output/
   actions:
     label:
       add:
         - structured-output
+
 - name: label-speculative-decoding
   description: Automatically apply speculative-decoding label
   conditions:
     - or:
       - files~=^vllm/spec_decode/
       - files=vllm/model_executor/layers/spec_decode_base_sampler.py
       - files~=^tests/spec_decode/
   actions:
     label:
       add:
         - speculative-decoding
+
 - name: label-v1
   description: Automatically apply v1 label
   conditions:
     - or:
       - files~=^vllm/v1/
       - files~=^tests/v1/
   actions:
     label:
       add:
         - v1
+
 - name: label-tpu
   description: Automatically apply tpu label
+  # Keep this list in sync with `label-tpu-remove` conditions
   conditions:
     - or:
       - files~=tpu.py
       - files~=_tpu
       - files~=tpu_
       - files~=/tpu/
       - files~=pallas
   actions:
     label:
       add:
         - tpu
+
 - name: label-tpu-remove
   description: Automatically remove tpu label
+  # Keep this list in sync with `label-tpu` conditions
   conditions:
     - and:
       - -files~=tpu.py
       - -files~=_tpu
       - -files~=tpu_
       - -files~=/tpu/
       - -files~=pallas
   actions:
     label:
       remove:
         - tpu
+
 - name: label-tool-calling
   description: Automatically add tool-calling label
   conditions:
     - or:
       - files~=^tests/tool_use/
       - files~=^tests/mistral_tool_use/
       - files~=^tests/entrypoints/openai/tool_parsers/
       - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
       - files~=^vllm/entrypoints/openai/tool_parsers/
-      - files=docs/features/tool_calling.md
+      - files=docs/source/features/tool_calling.md
+      - files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
+      - files=docs/source/getting_started/examples/chat_with_tools.md
       - files~=^examples/tool_chat_*
       - files=examples/offline_inference/chat_with_tools.py
       - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
       - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
       - files=examples/online_serving/openai_chat_completion_client_with_tools.py
   actions:
     label:
       add:
         - tool-calling
+
 - name: ping author on conflicts and add 'needs-rebase' label
   conditions:
     - conflict
     - -closed
   actions:
     label:
       add:
         - needs-rebase
     comment:
       message: |
        This pull request has merge conflicts that must be resolved before it can be
        merged. Please rebase the PR, @{{author}}.
 
        https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
+
 - name: assign reviewer for tensorizer changes
   conditions:
     - files~=^vllm/model_executor/model_loader/tensorizer.py
     - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
     - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
     - files~=^tests/tensorizer_loader/
   actions:
     assign:
       users:
-        - sangstar
+        - "sangstar"
+
 - name: remove 'needs-rebase' label when conflict is resolved
   conditions:
     - -conflict
     - -closed
   actions:
     label:
       remove:
         - needs-rebase
.github/scripts/cleanup_pr_body.sh (vendored, 2 changes)

@@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
 
 # Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
 python3 - <<EOF
-import regex as re
+import re
 
 with open("${NEW}", "r") as file:
     content = file.read()
.github/workflows/cleanup_pr_body.yml (vendored, 7 changes)

@@ -20,12 +20,7 @@ jobs:
         with:
           python-version: '3.12'
 
-      - name: Install Python dependencies
-        run: |
-          python3 -m pip install --upgrade pip
-          python3 -m pip install regex
-
       - name: Update PR description
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
+        run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
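As a usage sketch, the cleanup script above can also be exercised locally the same way the workflow invokes it. This is illustrative only: `1234` stands in for a real PR number, and the token value is a placeholder for credentials with read access to the PR.

```bash
# Hypothetical local run of the cleanup script; mirrors the workflow's
# invocation (env token plus PR number as the single argument).
export GITHUB_TOKEN="<your-token>"
bash .github/scripts/cleanup_pr_body.sh 1234
```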
.gitignore (vendored, 6 changes)

@@ -77,6 +77,11 @@ instance/
 # Scrapy stuff:
 .scrapy
 
+# Sphinx documentation
+docs/_build/
+docs/source/getting_started/examples/
+docs/source/api/vllm
+
 # PyBuilder
 .pybuilder/
 target/
@@ -146,7 +151,6 @@ venv.bak/
 
 # mkdocs documentation
 /site
-docs/examples
 
 # mypy
 .mypy_cache/
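Whether a given path is matched by either column's ignore entries can be verified with git itself; the path below is illustrative only.

```bash
# Ask git which ignore rule (if any) matches the path; -v prints the matching
# .gitignore file, line number, and pattern.
git check-ignore -v docs/_build/index.html
```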
.pre-commit-config.yaml

@@ -11,15 +11,13 @@ repos:
   hooks:
   - id: yapf
     args: [--in-place, --verbose]
-    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
-    exclude: '(.buildkite|benchmarks|build|examples)/.*'
 - repo: https://github.com/astral-sh/ruff-pre-commit
   rev: v0.11.7
   hooks:
   - id: ruff
     args: [--output-format, github, --fix]
   - id: ruff-format
-    files: ^(.buildkite|benchmarks|examples)/.*
+    files: ^(.buildkite|benchmarks)/.*
 - repo: https://github.com/codespell-project/codespell
   rev: v2.4.1
   hooks:
@@ -41,7 +39,6 @@ repos:
   rev: v0.9.29
   hooks:
   - id: pymarkdown
-    exclude: '.*\.inc\.md'
     args: [fix]
 - repo: https://github.com/rhysd/actionlint
   rev: v1.7.7
@@ -60,7 +57,7 @@ repos:
     entry: tools/mypy.sh 0 "local"
     language: python
     types: [python]
-    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
     stages: [pre-commit] # Don't run in CI
   - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.9
@@ -130,21 +127,6 @@ repos:
     name: Update Dockerfile dependency graph
     entry: tools/update-dockerfile-graph.sh
     language: script
-  - id: enforce-import-regex-instead-of-re
-    name: Enforce import regex as re
-    entry: python tools/enforce_regex_import.py
-    language: python
-    types: [python]
-    pass_filenames: false
-    additional_dependencies: [regex]
-  # forbid directly import triton
-  - id: forbid-direct-triton-import
-    name: "Forbid direct 'import triton'"
-    entry: python tools/check_triton_import.py
-    language: python
-    types: [python]
-    pass_filenames: false
-    additional_dependencies: [regex]
   # Keep `suggestion` last
   - id: suggestion
     name: Suggestion
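Either revision of this config is exercised the same way; as a usage sketch, assuming `pre-commit` is installable via pip in the local environment:

```bash
# Install the hook runner, then execute every configured hook once over the repo.
pip install pre-commit
pre-commit run --all-files
```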
.readthedocs.yaml

@@ -8,8 +8,12 @@ build:
   tools:
     python: "3.12"
 
-mkdocs:
-  configuration: mkdocs.yaml
+sphinx:
+  configuration: docs/source/conf.py
+  fail_on_warning: true
+
+# If using Sphinx, optionally build your docs in additional formats such as PDF
+formats: []
 
 # Optionally declare the Python requirements required to build your docs
 python:
CMakeLists.txt

@@ -23,15 +23,15 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 # Suppress potential warnings about unused manually-specified variables
 set(ignoreMe "${VLLM_PYTHON_PATH}")
 
-# Prevent installation of dependencies (cutlass) by default.
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
-
 #
 # Supported python versions. These versions will be searched in order, the
 # first match will be selected. These should be kept in sync with setup.py.
 #
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 
+# Supported NVIDIA architectures.
+set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
+
 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
 
@@ -79,15 +79,6 @@ endif()
 #
 find_package(Torch REQUIRED)
 
-# Supported NVIDIA architectures.
-# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
-if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
-   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
-  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
-else()
-  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
-endif()
-
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
@@ -182,6 +173,9 @@ include(FetchContent)
 file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
 message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 
+#
+# Set rocm version dev int.
+#
 if(VLLM_GPU_LANG STREQUAL "HIP")
   #
   # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
@@ -189,6 +183,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
   set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
 
+
   #
   # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
   # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
@@ -231,8 +226,6 @@ endif()
 #
 
 set(VLLM_EXT_SRC
-  "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
-  "csrc/mamba/causal_conv1d/causal_conv1d.cu"
   "csrc/cache_kernels.cu"
   "csrc/attention/paged_attention_v1.cu"
   "csrc/attention/paged_attention_v2.cu"
@@ -242,7 +235,6 @@ set(VLLM_EXT_SRC
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
   "csrc/layernorm_quant_kernels.cu"
-  "csrc/sampler.cu"
   "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
   "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
@@ -289,6 +281,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   FetchContent_MakeAvailable(cutlass)
 
   list(APPEND VLLM_EXT_SRC
+    "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
+    "csrc/mamba/causal_conv1d/causal_conv1d.cu"
     "csrc/quantization/aqlm/gemm_kernels.cu"
     "csrc/quantization/awq/gemm_kernels.cu"
     "csrc/permute_cols.cu"
@@ -543,8 +537,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # CUTLASS MoE kernels
 
   # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
-  # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
-  # if it's possible to compile MoE kernels that use its output.
+  # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
+  # to compile MoE kernels that use its output.
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
@@ -785,7 +779,5 @@ endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
   include(cmake/external_projects/flashmla.cmake)
-
-  # vllm-flash-attn should be last as it overwrites some CMake functions
   include(cmake/external_projects/vllm_flash_attn.cmake)
 endif ()
CONTRIBUTING.md

@@ -1,3 +1,3 @@
 # Contributing to vLLM
 
-You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).
+You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
README.md (18 changes)

@@ -1,7 +1,7 @@
 <p align="center">
   <picture>
-    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png">
-    <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-light.png" width=55%>
+    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
+    <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
   </picture>
 </p>
 
@@ -58,8 +58,8 @@ vLLM is fast with:
 - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
 - Continuous batching of incoming requests
 - Fast model execution with CUDA/HIP graph
-- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8
-- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer
+- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
+- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
 - Speculative decoding
 - Chunked prefill
 
@@ -72,14 +72,14 @@ vLLM is flexible and easy to use with:
 - Tensor parallelism and pipeline parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
 - Prefix caching support
 - Multi-LoRA support
 
 vLLM seamlessly supports most popular open-source models on HuggingFace, including:
 - Transformer-like LLMs (e.g., Llama)
 - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
-- Embedding Models (e.g., E5-Mistral)
+- Embedding Models (e.g. E5-Mistral)
 - Multi-modal LLMs (e.g., LLaVA)
 
 Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
@@ -100,14 +100,14 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
 ## Contributing
 
 We welcome and value any contributions and collaborations.
-Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved.
+Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved.
 
 ## Sponsors
 
 vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
 
 <!-- Note: Please sort them in alphabetical order. -->
-<!-- Note: Please keep these consistent with docs/community/sponsors.md -->
+<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
 Cash Donations:
 - a16z
 - Dropbox
@@ -162,4 +162,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 
 ## Media Kit
 
-- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit)
+- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
SECURITY.md

@@ -8,6 +8,4 @@ Please report security issues privately using [the vulnerability submission form
 
 ---
 
-Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
-
 Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
benchmarks/README.md

@@ -64,12 +64,6 @@ become available.
         <td style="text-align: center;">✅</td>
         <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
       </tr>
-      <tr>
-        <td><strong>Custom</strong></td>
-        <td style="text-align: center;">✅</td>
-        <td style="text-align: center;">✅</td>
-        <td>Local file: <code>data.jsonl</code></td>
-      </tr>
   </tbody>
 </table>
 
@@ -130,38 +124,6 @@ P99 ITL (ms): 8.39
 ==================================================
 ```
 
-### Custom Dataset
-
-If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
-
-```
-{"prompt": "What is the capital of India?"}
-{"prompt": "What is the capital of Iran?"}
-{"prompt": "What is the capital of China?"}
-```
-
-```bash
-# start server
-VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
-```
-
-```bash
-# run benchmarking script
-python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
-    --backend vllm \
-    --model meta-llama/Llama-3.1-8B-Instruct \
-    --endpoint /v1/completions \
-    --dataset-name custom \
-    --dataset-path <path-to-your-data-jsonl> \
-    --custom-skip-chat-template \
-    --num-prompts 80 \
-    --max-concurrency 1 \
-    --temperature=0.3 \
-    --top-p=0.75 \
-    --result-dir "./log/"
-```
-
-You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
-
 ### VisionArena Benchmark for Vision Language Models
 
 ```bash
@@ -184,9 +146,10 @@ python3 vllm/benchmarks/benchmark_serving.py \
 
 ``` bash
 VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
-    --speculative-config $'{"method": "ngram",
-    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
-    "prompt_lookup_min": 2}'
+    --speculative-model "[ngram]" \
+    --ngram_prompt_lookup_min 2 \
+    --ngram-prompt-lookup-max 5 \
+    --num_speculative_tokens 5
 ```
 
 ``` bash
@@ -241,16 +204,6 @@ python3 vllm/benchmarks/benchmark_serving.py \
     --seed 42
 ```
 
-**`philschmid/mt-bench`**
-
-``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
-    --model Qwen/QwQ-32B \
-    --dataset-name hf \
-    --dataset-path philschmid/mt-bench \
-    --num-prompts 80
-```
-
 ### Running With Sampling Parameters
 
 When using OpenAI-compatible backends such as `vllm`, optional sampling
@@ -321,9 +274,10 @@ python3 vllm/benchmarks/benchmark_throughput.py \
     --output-len=100 \
     --num-prompts=2048 \
     --async-engine \
-    --speculative-config $'{"method": "ngram",
-    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
-    "prompt_lookup_min": 2}'
+    --speculative-model="[ngram]" \
+    --ngram_prompt_lookup_min=2 \
+    --ngram-prompt-lookup-max=5 \
+    --num_speculative_tokens=5
 ```
 
 ```
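For comparison with the two speculative-decoding hunks above, the left column's JSON-style configuration can also be written as one self-contained command. This is a sketch assuming a vLLM build new enough to accept `--speculative-config`; the model name and parameter values are copied from the diff, not newly chosen.

```bash
# Ngram speculative decoding via the consolidated JSON flag (left column's form).
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
    --speculative-config '{"method": "ngram", "num_speculative_tokens": 5, "prompt_lookup_max": 5, "prompt_lookup_min": 2}'
```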
@ -10,15 +10,11 @@
|
|||||||
# 3. Set variables (ALL REQUIRED)
|
# 3. Set variables (ALL REQUIRED)
|
||||||
# BASE: your directory for vllm repo
|
# BASE: your directory for vllm repo
|
||||||
# MODEL: the model served by vllm
|
# MODEL: the model served by vllm
|
||||||
# TP: ways of tensor parallelism
|
|
||||||
# DOWNLOAD_DIR: directory to download and load model weights.
|
# DOWNLOAD_DIR: directory to download and load model weights.
|
||||||
# INPUT_LEN: request input len
|
# INPUT_LEN: request input len
|
||||||
# OUTPUT_LEN: request output len
|
# OUTPUT_LEN: request output len
|
||||||
# MIN_CACHE_HIT_PCT: prefix cache rate
|
# MIN_CACHE_HIT_PCT: prefix cache rate
|
||||||
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
|
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
|
||||||
# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
|
|
||||||
# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
|
|
||||||
# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
|
|
||||||
# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
|
# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
|
||||||
# 5. The final result will be saved in RESULT file.
|
# 5. The final result will be saved in RESULT file.
|
||||||
|
|
||||||
@ -34,27 +30,31 @@
|
|||||||
TAG=$(date +"%Y_%m_%d_%H_%M")
|
TAG=$(date +"%Y_%m_%d_%H_%M")
|
||||||
BASE=""
|
BASE=""
|
||||||
MODEL="meta-llama/Llama-3.1-8B-Instruct"
|
MODEL="meta-llama/Llama-3.1-8B-Instruct"
|
||||||
TP=1
|
|
||||||
DOWNLOAD_DIR=""
|
DOWNLOAD_DIR=""
|
||||||
INPUT_LEN=4000
|
INPUT_LEN=4000
|
||||||
OUTPUT_LEN=16
|
OUTPUT_LEN=16
|
||||||
MIN_CACHE_HIT_PCT=0
|
MIN_CACHE_HIT_PCT_PCT=0
|
||||||
MAX_LATENCY_ALLOWED_MS=100000000000
|
MAX_LATENCY_ALLOWED_MS=100000000000
|
||||||
NUM_SEQS_LIST="128 256"
|
|
||||||
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
|
|
||||||
|
|
||||||
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
|
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
|
||||||
RESULT="$LOG_FOLDER/result.txt"
|
RESULT="$LOG_FOLDER/result.txt"
|
||||||
|
|
||||||
echo "result file: $RESULT"
|
echo "result file$ $RESULT"
|
||||||
echo "model: $MODEL"
|
echo "model: $MODEL"
|
||||||
|
echo
|
||||||
|
|
||||||
rm -rf $LOG_FOLDER
|
rm -rf $LOG_FOLDER
|
||||||
mkdir -p $LOG_FOLDER
|
mkdir -p $LOG_FOLDER
|
||||||
|
|
||||||
cd "$BASE/vllm"
|
cd "$BASE/vllm"
|
||||||
|
# create sonnet-4x.txt so that we can sample 2048 tokens for input
|
||||||
|
echo "" > benchmarks/sonnet_4x.txt
|
||||||
|
for _ in {1..4}
|
||||||
|
do
|
||||||
|
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
|
||||||
|
done
|
||||||
|
|
||||||
pip install -q datasets
|
pip install datasets
|
||||||
|
|
||||||
current_hash=$(git rev-parse HEAD)
|
current_hash=$(git rev-parse HEAD)
|
||||||
echo "hash:$current_hash" >> "$RESULT"
|
echo "hash:$current_hash" >> "$RESULT"
|
||||||
@@ -64,69 +64,53 @@ best_throughput=0
 best_max_num_seqs=0
 best_num_batched_tokens=0
 best_goodput=0
 
-start_server() {
-    local gpu_memory_utilization=$1
-    local max_num_seqs=$2
-    local max_num_batched_tokens=$3
-    local vllm_log=$4
-
-    pkill -f vllm
-
-    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
-        --disable-log-requests \
-        --port 8004 \
-        --gpu-memory-utilization $gpu_memory_utilization \
-        --max-num-seqs $max_num_seqs \
-        --max-num-batched-tokens $max_num_batched_tokens \
-        --tensor-parallel-size $TP \
-        --enable-prefix-caching \
-        --load-format dummy \
-        --download-dir "$DOWNLOAD_DIR" \
-        --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
-
-    # wait for 10 minutes...
-    server_started=0
-    for i in {1..60}; do
-        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
-        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
-        if [[ "$STATUS_CODE" -eq 200 ]]; then
-            server_started=1
-            break
-        else
-            sleep 10
-        fi
-    done
-    if (( ! server_started )); then
-        echo "server did not start within 10 minutes. Please check server log at $vllm_log".
-        return 1
-    else
-        return 0
-    fi
-}
-
 run_benchmark() {
     local max_num_seqs=$1
     local max_num_batched_tokens=$2
-    local gpu_memory_utilization=$3
     echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
     local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
     echo "vllm_log: $vllm_log"
     echo
     rm -f $vllm_log
-    pkill -f vllm
-
-    echo "starting server..."
-    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
-    result=$?
-    if [[ "$result" -eq 1 ]]; then
-        echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
-    else
-        echo "server started."
-    fi
+    # start the server
+    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
+        --disable-log-requests \
+        --port 8004 \
+        --gpu-memory-utilization 0.98 \
+        --max-num-seqs $max_num_seqs \
+        --max-num-batched-tokens $max_num_batched_tokens \
+        --tensor-parallel-size 1 \
+        --enable-prefix-caching \
+        --load-format dummy \
+        --download-dir $DOWNLOAD_DIR \
+        --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
+    echo "wait for 10 minutes.."
     echo
+    # wait for 10 minutes...
+    server_started=0
+    for i in {1..60}; do
+        if grep -Fq "Application startup complete" "$vllm_log"; then
+            echo "Application started"
+            server_started=1
+            break
+        else
+            # echo "wait for 10 seconds..."
+            sleep 10
+        fi
+    done
+
+    if (( ! server_started )); then
+        echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
+        echo "pkill -f vllm"
+        echo
+        pkill vllm
+        sleep 10
+        return 1
+    fi
 
     echo "run benchmark test..."
+    echo
     meet_latency_requirement=0
     # get a basic qps by using request-rate inf
     bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
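
Both sides of this hunk implement the same readiness wait: poll for up to 10 minutes, checking every 10 seconds (the left side hits `/health` over HTTP, the right side greps the log). A minimal Python sketch of the HTTP variant — the port and endpoint follow the script above, everything else is illustrative:

```
import time
import urllib.request

def wait_for_server(url: str = "http://0.0.0.0:8004/health",
                    retries: int = 60, interval_s: float = 10.0) -> bool:
    """Return True once /health answers 200, False after ~10 minutes."""
    for _ in range(retries):
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return True
        except OSError:
            pass  # server not listening yet
        time.sleep(interval_s)
    return False
```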
@@ -134,29 +118,29 @@ run_benchmark() {
    python benchmarks/benchmark_serving.py \
         --backend vllm \
         --model $MODEL \
-        --dataset-name random \
-        --random-input-len $INPUT_LEN \
-        --random-output-len $OUTPUT_LEN \
+        --dataset-name sonnet \
+        --dataset-path benchmarks/sonnet_4x.txt \
+        --sonnet-input-len $INPUT_LEN \
+        --sonnet-output-len $OUTPUT_LEN \
         --ignore-eos \
         --disable-tqdm \
         --request-rate inf \
         --percentile-metrics ttft,tpot,itl,e2el \
         --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
-        --num-prompts 1000 \
-        --random-prefix-len $prefix_len \
-        --port 8004 &> "$bm_log"
-    throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
+        --num-prompts 100 \
+        --sonnet-prefix-len $prefix_len \
+        --port 8004 > "$bm_log"
+    through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
     e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
     goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
 
     if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
         meet_latency_requirement=1
-        request_rate=inf
     fi
 
     if (( ! meet_latency_requirement )); then
-        # start from request-rate as int(throughput) + 1
-        request_rate=$((${throughput%.*} + 1))
+        # start from request-rate as int(through_put) + 1
+        request_rate=$((${through_put%.*} + 1))
         while ((request_rate > 0)); do
             # clear prefix cache
             curl -X POST http://0.0.0.0:8004/reset_prefix_cache
@@ -165,18 +149,19 @@ run_benchmark() {
            python benchmarks/benchmark_serving.py \
                 --backend vllm \
                 --model $MODEL \
-                --dataset-name random \
-                --random-input-len $INPUT_LEN \
-                --random-output-len $OUTPUT_LEN \
-                --ignore-eos \
+                --dataset-name sonnet \
+                --dataset-path benchmarks/sonnet_4x.txt \
+                --sonnet-input-len $INPUT_LEN \
+                --sonnet-output-len $OUTPUT_LEN \
+                --ignore_eos \
                 --disable-tqdm \
                 --request-rate $request_rate \
                 --percentile-metrics ttft,tpot,itl,e2el \
                 --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
                 --num-prompts 100 \
-                --random-prefix-len $prefix_len \
-                --port 8004 &> "$bm_log"
-            throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
+                --sonnet-prefix-len $prefix_len \
+                --port 8004 > "$bm_log"
+            through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
             e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
             goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
             if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
@@ -188,10 +173,10 @@ run_benchmark() {
     fi
     # write the results and update the best result.
     if ((meet_latency_requirement)); then
-        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
-        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
-        if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
-            best_throughput=$throughput
+        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
+        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
+        if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
+            best_throughput=$through_put
             best_max_num_seqs=$max_num_seqs
             best_num_batched_tokens=$max_num_batched_tokens
             best_goodput=$goodput
@@ -203,39 +188,22 @@ run_benchmark() {
 
     echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
 
+    echo "pkill -f vllm"
+    echo
     pkill vllm
     sleep 10
+    rm -f $vllm_log
     printf '=%.0s' $(seq 1 20)
     return 0
 }
 
-read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
-read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
 
-# first find out the max gpu-memory-utilization without HBM OOM.
-gpu_memory_utilization=0.98
-find_gpu_memory_utilization=0
-while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
-    start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
-    result=$?
-    if [[ "$result" -eq 0 ]]; then
-        find_gpu_memory_utilization=1
-        break
-    else
-        gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
-    fi
-done
-
-if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
-    echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
-else
-    echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
-    exit 1
-fi
-
-for num_seqs in "${num_seqs_list[@]}"; do
-    for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
-        run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
+num_seqs_list="128 256"
+num_batched_tokens_list="512 1024 2048 4096"
+for num_seqs in $num_seqs_list; do
+    for num_batched_tokens in $num_batched_tokens_list; do
+        run_benchmark $num_seqs $num_batched_tokens
+        exit 0
     done
 done
 echo "finish permutations"
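
The left side replaces the hard-coded `exit 0` sweep with a preliminary search for the largest workable `--gpu-memory-utilization`. A sketch of that search in Python, assuming a `try_start(gmu)` callable that launches the server at the given utilization and reports whether it came up:

```
from typing import Callable, Optional

def find_gpu_memory_utilization(try_start: Callable[[float], bool]) -> Optional[float]:
    """Step down from 0.98 by 0.01, mirroring the bash loop; give up below 0.9."""
    gmu = 0.98
    while gmu >= 0.90:
        if try_start(gmu):  # e.g. launch `vllm serve` and poll /health
            return gmu
        gmu = round(gmu - 0.01, 2)
    return None
```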
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import io
 import json

@@ -195,11 +194,6 @@ async def async_request_deepspeed_mii(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith(("completions", "profile")), (
-        "OpenAI Completions API URL must end with 'completions' or 'profile'."
-    )
-
     async with aiohttp.ClientSession(
         trust_env=True, timeout=AIOHTTP_TIMEOUT
     ) as session:

@@ -210,8 +204,6 @@ async def async_request_deepspeed_mii(
         "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
         "top_p": 1.0,
     }
-    headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
-
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
 

@@ -223,7 +215,7 @@ async def async_request_deepspeed_mii(
         st = time.perf_counter()
         try:
             async with session.post(
-                url=api_url, json=payload, headers=headers
+                url=request_func_input.api_url, json=payload
             ) as response:
                 if response.status == 200:
                     parsed_resp = await response.json()

@@ -325,7 +317,7 @@ async def async_request_openai_completions(
 
                                 most_recent_timestamp = timestamp
                                 generated_text += text or ""
-                            if usage := data.get("usage"):
+                            elif usage := data.get("usage"):
                                 output.output_tokens = usage.get("completion_tokens")
                     if first_chunk_received:
                         output.success = True

@@ -612,7 +604,6 @@ ASYNC_REQUEST_FUNCS = {
     "tensorrt-llm": async_request_trt_llm,
     "scalellm": async_request_openai_completions,
     "sglang": async_request_openai_completions,
-    "llama.cpp": async_request_openai_completions,
 }
 
 OPENAI_COMPATIBLE_BACKENDS = [
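
The `if` → `elif` change in the streaming loop is subtle: with plain `if`, a chunk's `usage` block is inspected even when the chunk also carried generated text; with `elif`, `usage` is only read from stats-only chunks. A self-contained sketch with illustrative chunk shapes (not the exact payloads the benchmark receives):

```
chunks = [
    {"choices": [{"text": "Hello"}]},
    {"usage": {"completion_tokens": 1}},  # stats-only final chunk
]
output_tokens = None
for data in chunks:
    if choices := data.get("choices"):
        text = choices[0].get("text")  # generated text handled here
    elif usage := data.get("usage"):
        output_tokens = usage.get("completion_tokens")
print(output_tokens)  # -> 1
```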
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This module defines a framework for sampling benchmark requests from various
 datasets. Each dataset subclass of BenchmarkDataset must implement sample

@@ -10,6 +9,9 @@ generation. Supported dataset types include:
 - BurstGPT
 - HuggingFace
 - VisionArena
+
+TODO: Implement CustomDataset to parse a JSON file and convert its contents into
+SampleRequest instances, similar to the approach used in ShareGPT.
 """
 
 import base64

@@ -33,7 +35,6 @@ from transformers import PreTrainedTokenizerBase
 from vllm.lora.request import LoRARequest
 from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
-from vllm.multimodal.image import convert_image_mode
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
 
 logger = logging.getLogger(__name__)

@@ -256,7 +257,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
     if isinstance(image, dict) and "bytes" in image:
         image = Image.open(BytesIO(image["bytes"]))
     if isinstance(image, Image.Image):
-        image = convert_image_mode(image, "RGB")
+        image = image.convert("RGB")
         with io.BytesIO() as image_data:
             image.save(image_data, format="JPEG")
             image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")

@@ -440,97 +441,6 @@ class ShareGPTDataset(BenchmarkDataset):
         return samples
 
 
-# -----------------------------------------------------------------------------
-# Custom Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class CustomDataset(BenchmarkDataset):
-    """
-    Implements the Custom dataset. Loads data from a JSONL file and generates
-    sample requests based on conversation turns. E.g.,
-    ```
-    {"prompt": "What is the capital of India?"}
-    {"prompt": "What is the capital of Iran?"}
-    {"prompt": "What is the capital of China?"}
-    ```
-    """
-
-    def __init__(self, **kwargs) -> None:
-        super().__init__(**kwargs)
-        self.load_data()
-
-    def load_data(self) -> None:
-        if self.dataset_path is None:
-            raise ValueError("dataset_path must be provided for loading data.")
-
-        # self.data will be a list of dictionaries
-        # e.g., [{"prompt": "What is the capital of India?"}, ...]
-        # This will be the standardized format which load_data()
-        # has to convert into depending on the filetype of dataset_path.
-        # sample() will assume this standardized format of self.data
-        self.data = []
-
-        # Load the JSONL file
-        if self.dataset_path.endswith(".jsonl"):
-            jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
-
-            # check if the JSONL file has a 'prompt' column
-            if "prompt" not in jsonl_data.columns:
-                raise ValueError("JSONL file must contain a 'prompt' column.")
-
-            # Convert each row to a dictionary and append to self.data
-            # This will convert the DataFrame to a list of dictionaries
-            # where each dictionary corresponds to a row in the DataFrame.
-            # This is the standardized format we want for self.data
-            for _, row in jsonl_data.iterrows():
-                self.data.append(row.to_dict())
-        else:
-            raise NotImplementedError(
-                "Only JSONL format is supported for CustomDataset."
-            )
-
-        random.seed(self.random_seed)
-        random.shuffle(self.data)
-
-    def sample(
-        self,
-        tokenizer: PreTrainedTokenizerBase,
-        num_requests: int,
-        lora_path: Optional[str] = None,
-        max_loras: Optional[int] = None,
-        output_len: Optional[int] = None,
-        enable_multimodal_chat: bool = False,
-        skip_chat_template: bool = False,
-        **kwargs,
-    ) -> list:
-        sampled_requests = []
-        for item in self.data:
-            if len(sampled_requests) >= num_requests:
-                break
-            prompt = item["prompt"]
-
-            # apply template
-            if not skip_chat_template:
-                prompt = tokenizer.apply_chat_template(
-                    [{"role": "user", "content": prompt}],
-                    add_generation_prompt=True,
-                    tokenize=False,
-                )
-
-            prompt_len = len(tokenizer(prompt).input_ids)
-            sampled_requests.append(
-                SampleRequest(
-                    prompt=prompt,
-                    prompt_len=prompt_len,
-                    expected_output_len=output_len,
-                )
-            )
-        self.maybe_oversample_requests(sampled_requests, num_requests)
-
-        return sampled_requests
-
-
 # -----------------------------------------------------------------------------
 # Sonnet Dataset Implementation
 # -----------------------------------------------------------------------------

@@ -865,15 +775,7 @@ class InstructCoderDataset(HuggingFaceDataset):
         for item in self.data:
             if len(sampled_requests) >= num_requests:
                 break
-            prompt = f"{item['input']}\n\n{item['instruction']} Just output \
-the code, do not include any explanation."
-
-            # apply template
-            prompt = tokenizer.apply_chat_template(
-                [{"role": "user", "content": prompt}],
-                add_generation_prompt=True,
-                tokenize=False,
-            )
+            prompt = f"{item['instruction']}:\n{item['input']}"
             prompt_len = len(tokenizer(prompt).input_ids)
             sampled_requests.append(
                 SampleRequest(
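
For reference, the JSONL input the removed `CustomDataset` expects (one JSON object per line with a `prompt` key, as its docstring shows) can be produced like this; the file name and prompts are illustrative:

```
import json

rows = [
    {"prompt": "What is the capital of India?"},
    {"prompt": "What is the capital of Iran?"},
]
with open("custom.jsonl", "w") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")  # one object per line, 'prompt' key required
```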
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark the latency of processing a single batch of requests."""
 
 import argparse

@@ -7,12 +6,13 @@ import dataclasses
 import json
 import os
 import time
+from pathlib import Path
 from typing import Any, Optional
 
 import numpy as np
+import torch
 from tqdm import tqdm
 
-import vllm.envs as envs
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs

@@ -80,9 +80,17 @@ def main(args: argparse.Namespace):
 
     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
-            llm.start_profile()
-            llm_generate()
-            llm.stop_profile()
+            with torch.profiler.profile(
+                activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                    torch.profiler.ProfilerActivity.CUDA,
+                ],
+                on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                    str(profile_dir)
+                ),
+            ) as p:
+                llm_generate()
+            print(p.key_averages().table(sort_by="self_cuda_time_total"))
         else:
             start_time = time.perf_counter()
             llm_generate()

@@ -95,7 +103,11 @@ def main(args: argparse.Namespace):
     run_to_completion(profile_dir=None)
 
     if args.profile:
-        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
+        profile_dir = args.profile_result_dir
+        if not profile_dir:
+            profile_dir = (
+                Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"
+            )
         print(f"Profiling (results will be saved to '{profile_dir}')...")
         run_to_completion(profile_dir=profile_dir)
         return

@@ -152,6 +164,15 @@ if __name__ == "__main__":
         action="store_true",
         help="profile the generation process of a single batch",
     )
+    parser.add_argument(
+        "--profile-result-dir",
+        type=str,
+        default=None,
+        help=(
+            "path to save the pytorch profiler output. Can be visualized "
+            "with ui.perfetto.dev or Tensorboard."
+        ),
+    )
     parser.add_argument(
         "--output-json",
         type=str,

@@ -168,13 +189,5 @@ if __name__ == "__main__":
     )
 
     parser = EngineArgs.add_cli_args(parser)
-    # V1 enables prefix caching by default which skews the latency
-    # numbers. We need to disable prefix caching by default.
-    parser.set_defaults(enable_prefix_caching=False)
     args = parser.parse_args()
-    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
-        raise OSError(
-            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
-            "Please set it to a valid path to use torch profiler."
-        )
     main(args)
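
The right side inlines `torch.profiler` instead of vLLM's `llm.start_profile()`/`llm.stop_profile()` hooks. A minimal standalone sketch of that pattern — CPU-only here so it runs anywhere, whereas the benchmark also enables the CUDA activity; the output directory is illustrative:

```
import torch

with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU],
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./prof"),
) as p:
    torch.ones(512, 512) @ torch.ones(512, 512)  # stand-in for llm_generate()
print(p.key_averages().table(sort_by="self_cpu_time_total"))
```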
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Offline benchmark to test the long document QA throughput.
 

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Benchmark the efficiency of prefix caching.
 

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark offline prioritization."""
 
 import argparse
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 r"""Benchmark online serving throughput.
 
 On the server side, run one of the following commands:

@@ -61,7 +60,6 @@ from benchmark_dataset import (
     ASRDataset,
     BurstGPTDataset,
     ConversationDataset,
-    CustomDataset,
     HuggingFaceDataset,
     InstructCoderDataset,
     MTBenchDataset,

@@ -277,7 +275,7 @@ async def benchmark(
     model_id: str,
     model_name: str,
     tokenizer: PreTrainedTokenizerBase,
-    input_requests: list[SampleRequest],
+    requests: list[SampleRequest],
     logprobs: Optional[int],
     request_rate: float,
     burstiness: float,

@@ -297,12 +295,14 @@ async def benchmark(
         raise ValueError(f"Unknown backend: {backend}")
 
     print("Starting initial single prompt test run...")
+    last_idx = len(requests) - 1
     test_prompt, test_prompt_len, test_output_len, test_mm_content = (
-        input_requests[0].prompt,
-        input_requests[0].prompt_len,
-        input_requests[0].expected_output_len,
-        input_requests[0].multi_modal_data,
+        requests[last_idx].prompt,
+        requests[last_idx].prompt_len,
+        requests[last_idx].expected_output_len,
+        requests[last_idx].multi_modal_data,
     )
+    input_requests = requests[:last_idx]
 
     assert test_mm_content is None or isinstance(test_mm_content, dict)
     test_input = RequestFuncInput(

@@ -617,6 +617,9 @@ def main(args: argparse.Namespace):
     api_url = f"http://{args.host}:{args.port}{args.endpoint}"
     base_url = f"http://{args.host}:{args.port}"
 
+    # Create one more request (for a test request)
+    total_prompts = args.num_prompts + 1
+
     tokenizer = get_tokenizer(
         tokenizer_id,
         tokenizer_mode=tokenizer_mode,

@@ -629,21 +632,12 @@ def main(args: argparse.Namespace):
             "'--dataset-path' if required."
         )
 
-    if args.dataset_name == "custom":
-        dataset = CustomDataset(dataset_path=args.dataset_path)
-        input_requests = dataset.sample(
-            num_requests=args.num_prompts,
-            tokenizer=tokenizer,
-            output_len=args.custom_output_len,
-            skip_chat_template=args.custom_skip_chat_template,
-        )
-
-    elif args.dataset_name == "sonnet":
+    if args.dataset_name == "sonnet":
         dataset = SonnetDataset(dataset_path=args.dataset_path)
         # For the "sonnet" dataset, formatting depends on the backend.
         if args.backend == "openai-chat":
             input_requests = dataset.sample(
-                num_requests=args.num_prompts,
+                num_requests=total_prompts,
                 input_len=args.sonnet_input_len,
                 output_len=args.sonnet_output_len,
                 prefix_len=args.sonnet_prefix_len,

@@ -655,7 +649,7 @@ def main(args: argparse.Namespace):
                 "Tokenizer/model must have chat template for sonnet dataset."
             )
             input_requests = dataset.sample(
-                num_requests=args.num_prompts,
+                num_requests=total_prompts,
                 input_len=args.sonnet_input_len,
                 output_len=args.sonnet_output_len,
                 prefix_len=args.sonnet_prefix_len,

@@ -718,7 +712,7 @@ def main(args: argparse.Namespace):
             dataset_split=args.hf_split,
             random_seed=args.seed,
         ).sample(
-            num_requests=args.num_prompts,
+            num_requests=total_prompts,
             tokenizer=tokenizer,
             output_len=args.hf_output_len,
         )

@@ -730,15 +724,15 @@ def main(args: argparse.Namespace):
             random_seed=args.seed, dataset_path=args.dataset_path
         ).sample(
             tokenizer=tokenizer,
-            num_requests=args.num_prompts,
+            num_requests=total_prompts,
             output_len=args.sharegpt_output_len,
         ),
         "burstgpt": lambda: BurstGPTDataset(
             random_seed=args.seed, dataset_path=args.dataset_path
-        ).sample(tokenizer=tokenizer, num_requests=args.num_prompts),
+        ).sample(tokenizer=tokenizer, num_requests=total_prompts),
         "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample(
             tokenizer=tokenizer,
-            num_requests=args.num_prompts,
+            num_requests=total_prompts,
             prefix_len=args.random_prefix_len,
             input_len=args.random_input_len,
             output_len=args.random_output_len,

@@ -773,10 +767,6 @@ def main(args: argparse.Namespace):
     if "temperature" not in sampling_params:
         sampling_params["temperature"] = 0.0  # Default to greedy decoding.
 
-    if args.backend == "llama.cpp":
-        # Disable prompt caching in llama.cpp backend
-        sampling_params["cache_prompt"] = False
-
     # Avoid GC processing "static" data - reduce pause times.
     gc.collect()
     gc.freeze()

@@ -789,7 +779,7 @@ def main(args: argparse.Namespace):
             model_id=model_id,
             model_name=model_name,
             tokenizer=tokenizer,
-            input_requests=input_requests,
+            requests=input_requests,
             logprobs=args.logprobs,
             request_rate=args.request_rate,
             burstiness=args.burstiness,

@@ -849,8 +839,6 @@ def main(args: argparse.Namespace):
         ]:
             if field in result_json:
                 del result_json[field]
-            if field in benchmark_result:
-                del benchmark_result[field]
 
         # Save to file
         base_model_id = model_id.split("/")[-1]

@@ -863,7 +851,6 @@ def main(args: argparse.Namespace):
     if args.result_filename:
         file_name = args.result_filename
     if args.result_dir:
-        os.makedirs(args.result_dir, exist_ok=True)
         file_name = os.path.join(args.result_dir, file_name)
     with open(
         file_name, mode="a+" if args.append_result else "w", encoding="utf-8"

@@ -904,7 +891,7 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
+        choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(

@@ -1074,19 +1061,6 @@ if __name__ == "__main__":
     )
 
     # group for dataset specific arguments
-    custom_group = parser.add_argument_group("custom dataset options")
-    custom_group.add_argument(
-        "--custom-output-len",
-        type=int,
-        default=256,
-        help="Number of output tokens per request, used only for custom dataset.",
-    )
-    custom_group.add_argument(
-        "--custom-skip-chat-template",
-        action="store_true",
-        help="Skip applying chat template to prompt, used only for custom dataset.",
-    )
-
     sonnet_group = parser.add_argument_group("sonnet dataset options")
     sonnet_group.add_argument(
         "--sonnet-input-len",
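
The `total_prompts = args.num_prompts + 1` change and the `requests[last_idx]` indexing work together: one extra request is sampled, and the last one is peeled off for the initial single-prompt test run so the measured benchmark still sees exactly `num_prompts` requests. A sketch with stand-in values:

```
requests = ["req0", "req1", "req2", "req3"]  # num_prompts + 1 sampled requests
last_idx = len(requests) - 1
test_request = requests[last_idx]     # warmup / sanity-check request
input_requests = requests[:last_idx]  # what the benchmark actually measures
assert len(input_requests) == len(requests) - 1
```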
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 r"""Benchmark online serving throughput with structured outputs.
 
 On the server side, run one of the following commands:

@@ -12,6 +11,7 @@ On the client side, run:
         --model <your_model> \
         --dataset json \
         --structured-output-ratio 1.0 \
+        --structured-output-backend auto \
         --request-rate 10 \
         --num-prompts 1000
 

@@ -672,7 +672,7 @@ async def benchmark(
 def evaluate(ret, args):
     def _eval_correctness_json(expected, actual):
         # extract json string from string using regex
-        import regex as re
+        import re
 
         actual = actual.replace("\n", "").replace(" ", "").strip()
         try:

@@ -687,7 +687,7 @@ def evaluate(ret, args):
         return actual in args.choice
 
     def _eval_correctness_regex(expected, actual):
-        import regex as re
+        import re
 
         return re.match(args.regex, actual) is not None
 
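
Both `_eval_correctness_*` helpers reduce to a tolerant parse-or-match check. A minimal stdlib-`re` sketch in that spirit — the pattern and payload below are illustrative, not the benchmark's actual schema:

```
import json
import re

def eval_regex(pattern: str, actual: str) -> bool:
    return re.match(pattern, actual) is not None

def eval_json(actual: str) -> bool:
    try:
        json.loads(actual.strip())
        return True
    except json.JSONDecodeError:
        return False

print(eval_regex(r"\d+", "42"), eval_json('{"a": 1}'))  # -> True True
```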
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark offline inference throughput."""
 
 import argparse
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import json

@@ -66,9 +65,4 @@ class InfEncoder(json.JSONEncoder):
 
 def write_to_json(filename: str, records: list) -> None:
     with open(filename, "w") as f:
-        json.dump(
-            records,
-            f,
-            cls=InfEncoder,
-            default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
-        )
+        json.dump(records, f, cls=InfEncoder)
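
The `default=` fallback the left side adds keeps a single non-serializable record from aborting the whole dump; the offending object is stringified instead. A quick demonstration (the class name is illustrative):

```
import json

class Handle:  # stand-in for a non-serializable object
    pass

print(json.dumps(
    {"h": Handle()},
    default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
))  # -> {"h": "<Handle object is not JSON serializable>"}
```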
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import copy

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Cutlass bench utils
 from collections.abc import Iterable

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import copy

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Weight Shapes are in the format
 # ([K, N], TP_SPLIT_DIM)

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import itertools

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pickle as pkl
 import time
@@ -1,223 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import argparse
-import copy
-import itertools
-
-import torch
-from weight_shapes import WEIGHT_SHAPES
-
-from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
-from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
-from vllm.triton_utils import triton
-
-
-@triton.testing.perf_report(
-    triton.testing.Benchmark(
-        x_names=["batch_size"],
-        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
-        x_log=False,
-        line_arg="provider",
-        line_vals=[
-            "torch-bf16",
-            # "fp8-tensor-w-token-a",
-            "fp8-tensor-w-tensor-a",
-            "fp8-channel-w-token-a",
-            # "fp8-channel-w-tensor-a",
-            # "fp8-tensor-w-token-a-noquant",
-            "fp8-tensor-w-tensor-a-noquant",
-            "fp8-channel-w-token-a-noquant",
-            # "fp8-channel-w-tensor-a-noquant",
-        ],
-        line_names=[
-            "torch-bf16",
-            # "fp8-tensor-w-token-a",
-            "fp8-tensor-w-tensor-a",
-            "fp8-channel-w-token-a",
-            # "fp8-channel-w-tensor-a",
-            # "fp8-tensor-w-token-a-noquant",
-            "fp8-tensor-w-tensor-a-noquant",
-            "fp8-channel-w-token-a-noquant",
-            # "fp8-channel-w-tensor-a-noquant",
-        ],
-        ylabel="TFLOP/s (larger is better)",
-        plot_name="BF16 vs FP8 GEMMs",
-        args={},
-    )
-)
-def benchmark(batch_size, provider, N, K):
-    M = batch_size
-    device = "cuda"
-    dtype = torch.bfloat16
-
-    # Create input tensors
-    a = torch.randn((M, K), device=device, dtype=dtype)
-    b = torch.randn((N, K), device=device, dtype=dtype)
-
-    quantiles = [0.5, 0.2, 0.8]
-
-    if "torch-bf16" in provider:
-        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
-        )
-
-    elif "fp8" in provider:
-        # Weights are always quantized ahead of time
-        if "noquant" in provider:
-            # For no quantization, we just measure the GEMM
-            if "tensor-w-token-a" in provider:
-                # Dynamic per-token quant for A, per-tensor quant for B
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
-                assert scale_b_fp8.numel() == 1
-                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
-                    a, use_per_token_if_dynamic=True
-                )
-
-                def run_quant():
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "tensor-w-tensor-a" in provider:
-                # Static per-tensor quantization with fixed scales
-                # for both A and B
-                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
-                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                assert scale_b_fp8.numel() == 1
-                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
-
-                def run_quant():
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "channel-w-token-a" in provider:
-                # Static per-channel quantization for weights, per-token
-                # quant for A
-                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
-                assert scale_b_fp8.numel() == N
-                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
-                    a, use_per_token_if_dynamic=True
-                )
-
-                def run_quant():
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "channel-w-tensor-a" in provider:
-                # Static per-channel quantization for weights, per-tensor
-                # quant for A
-                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
-                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
-                assert scale_b_fp8.numel() == N
-                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
-
-                def run_quant():
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-        else:
-            # In these cases, we quantize the activations during the GEMM call
-            if "tensor-w-token-a" in provider:
-                # Dynamic per-token quant for A, per-tensor quant for B
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
-                assert scale_b_fp8.numel() == 1
-
-                def run_quant():
-                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
-                        a, use_per_token_if_dynamic=True
-                    )
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "tensor-w-tensor-a" in provider:
-                # Static per-tensor quantization with fixed scales
-                # for both A and B
-                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
-                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                assert scale_b_fp8.numel() == 1
-
-                def run_quant():
-                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "channel-w-token-a" in provider:
-                # Static per-channel quantization for weights, per-token
-                # quant for A
-                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
-                assert scale_b_fp8.numel() == N
-
-                def run_quant():
-                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
-                        a, use_per_token_if_dynamic=True
-                    )
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "channel-w-tensor-a" in provider:
-                # Static per-channel quantization for weights, per-tensor
-                # quant for A
-                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
-                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
-                assert scale_b_fp8.numel() == N
-
-                def run_quant():
-                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-        b_fp8 = b_fp8.t()
-
-        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-            lambda: run_quant(), quantiles=quantiles
-        )
-
-    # Calculate TFLOP/s, two flops per multiply-add
-    tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3)
-    return tflops(ms), tflops(max_ms), tflops(min_ms)
-
-
-def prepare_shapes(args):
-    KN_model_names = []
-    models_tps = list(itertools.product(args.models, args.tp_sizes))
-    for model, tp_size in models_tps:
-        assert model in WEIGHT_SHAPES
-        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
-            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
-            KN.append(model)
-            KN_model_names.append(KN)
-    return KN_model_names
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--models",
-        nargs="+",
-        type=str,
-        default=["meta-llama/Llama-3.1-8B-Instruct"],
-        choices=[*WEIGHT_SHAPES.keys()],
-        help="List of models to benchmark",
-    )
-    parser.add_argument(
-        "--tp-sizes",
-        nargs="+",
-        type=int,
-        default=[1],
-        help="List of tensor parallel sizes",
-    )
-    args = parser.parse_args()
-
-    KN_model_names = prepare_shapes(args)
-    for K, N, model_name in KN_model_names:
-        print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
-        benchmark.run(
-            print_data=True,
-            show_plots=True,
-            save_path=f"bench_fp8_res_n{N}_k{K}",
-            N=N,
-            K=K,
-        )
-
-    print("Benchmark finished!")
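
A quick sanity check of the TFLOP/s formula in the deleted file (two flops per multiply-add): a 4096x4096x4096 GEMM finishing in 1 ms sustains about 137 TFLOP/s.

```
M = N = K = 4096
ms = 1.0
tflops = (2 * M * N * K) * 1e-12 / (ms * 1e-3)
print(f"{tflops:.1f} TFLOP/s")  # -> 137.4
```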
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import sys

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
 kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit

@@ -91,7 +90,7 @@ def bench_run(
 
     score = torch.randn((m, num_experts), device=device, dtype=dtype)
 
-    topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
+    topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
 
     quant_blocksize = 16
     w1_blockscale = torch.empty(
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 import torch.utils.benchmark as benchmark

@@ -7,8 +6,8 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
 
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 from vllm.model_executor.layers.fused_moe.fused_moe import (
+    cutlass_moe_fp8,
     fused_experts,
     fused_topk,
 )

@@ -70,9 +69,18 @@ def bench_run(
     w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
     w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
 
+    ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+    c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
+    ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
+    c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+
     for expert in range(num_experts):
         w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
         w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
+    w1_q_notransp = w1_q.clone()
+    w2_q_notransp = w2_q.clone()
+    w1_q = w1_q.transpose(1, 2)
+    w2_q = w2_q.transpose(1, 2)
 
     score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
 
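
The four stride tensors added here carry, per expert, the leading dimensions the grouped GEMMs need; the values follow the hunk above (`k`, `2 * n`, `n`, `k`). A standalone sketch of their construction — sizes are illustrative, and the diff itself uses `device="cuda"`:

```
import torch

num_experts, k, n = 8, 4096, 14336  # illustrative sizes
device = "cuda" if torch.cuda.is_available() else "cpu"
ab_strides1 = torch.full((num_experts,), k, device=device, dtype=torch.int64)
c_strides1 = torch.full((num_experts,), 2 * n, device=device, dtype=torch.int64)
ab_strides2 = torch.full((num_experts,), n, device=device, dtype=torch.int64)
c_strides2 = torch.full((num_experts,), k, device=device, dtype=torch.int64)
print(ab_strides1.shape, c_strides1[0].item())  # torch.Size([8]) 28672
```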
@@ -113,6 +121,10 @@ def bench_run(
         w2_scale: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
+        ab_strides1: torch.Tensor,
+        c_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor,
+        c_strides2: torch.Tensor,
         num_repeats: int,
     ):
         for _ in range(num_repeats):

@@ -120,10 +132,14 @@ def bench_run(
                 a,
                 w1,
                 w2,
-                topk_weights,
-                topk_ids,
                 w1_scale,
                 w2_scale,
+                topk_weights,
+                topk_ids,
+                ab_strides1,
+                c_strides1,
+                ab_strides2,
+                c_strides2,
                 a1_scale=a_scale,
             )

@@ -136,6 +152,10 @@ def bench_run(
         w2_scale: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
+        ab_strides1: torch.Tensor,
+        c_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor,
+        c_strides2: torch.Tensor,
     ):
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))

@@ -144,10 +164,14 @@ def bench_run(
                 a,
                 w1_q,
                 w2_q,
-                topk_weights,
-                topk_ids,
                 w1_scale,
                 w2_scale,
+                topk_weights,
+                topk_ids,
+                ab_strides1,
+                c_strides1,
+                ab_strides2,
+                c_strides2,
                 a1_scale=a_scale,
             )

@@ -193,6 +217,10 @@ def bench_run(
             w2_scale,
             topk_weights,
             topk_ids,
+            ab_strides1,
+            c_strides1,
+            ab_strides2,
+            c_strides2,
         )
     torch.cuda.synchronize()
 

@@ -201,8 +229,8 @@ def bench_run(
     with torch.cuda.graph(triton_graph, stream=triton_stream):
         run_triton_from_graph(
             a,
-            w1_q,
-            w2_q,
+            w1_q_notransp,
+            w2_q_notransp,
             topk_weights,
             topk_ids,
             w1_scale,

@@ -221,12 +249,18 @@ def bench_run(
         "w2": w2,
         "score": score,
         "topk": topk,
+        "w1_q_notransp": w1_q_notransp,
+        "w2_q_notransp": w2_q_notransp,
         # Cutlass params
        "a_scale": a_scale,
         "w1_q": w1_q,
         "w2_q": w2_q,
         "w1_scale": w1_scale,
         "w2_scale": w2_scale,
+        "ab_strides1": ab_strides1,
+        "c_strides1": c_strides1,
+        "ab_strides2": ab_strides2,
+        "c_strides2": c_strides2,
         # cuda graph params
         "cutlass_graph": cutlass_graph,
         "triton_graph": triton_graph,

@@ -244,8 +278,8 @@ def bench_run(
     # Warmup
     run_triton_moe(
         a,
-        w1_q,
-        w2_q,
+        w1_q_notransp,
+        w2_q_notransp,
         topk_weights,
         topk_ids,
         w1_scale,

@@ -256,7 +290,7 @@ def bench_run(
 
     results.append(
|
||||||
benchmark.Timer(
|
benchmark.Timer(
|
||||||
stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
|
stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
|
||||||
globals=globals,
|
globals=globals,
|
||||||
label=label,
|
label=label,
|
||||||
sub_label=sub_label,
|
sub_label=sub_label,
|
||||||
@ -287,12 +321,16 @@ def bench_run(
|
|||||||
w2_scale,
|
w2_scale,
|
||||||
topk_weights,
|
topk_weights,
|
||||||
topk_ids,
|
topk_ids,
|
||||||
|
ab_strides1,
|
||||||
|
c_strides1,
|
||||||
|
ab_strides2,
|
||||||
|
c_strides2,
|
||||||
num_warmup,
|
num_warmup,
|
||||||
)
|
)
|
||||||
|
|
||||||
results.append(
|
results.append(
|
||||||
benchmark.Timer(
|
benchmark.Timer(
|
||||||
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)", # noqa: E501
|
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501
|
||||||
globals=globals,
|
globals=globals,
|
||||||
label=label,
|
label=label,
|
||||||
sub_label=sub_label,
|
sub_label=sub_label,
|
||||||
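
The recurring change in this file is the `cutlass_moe_fp8` call: the right-hand side threads per-expert stride tensors (`ab_strides1`/`c_strides1` for the first grouped GEMM, `ab_strides2`/`c_strides2` for the second) after the top-k tensors. Reading the values off the diff: the first GEMM consumes k-wide activations and writes the fused 2*n gate/up output, the second consumes n and writes k. A sketch of constructing them, assuming `num_experts`, `k` (hidden size) and `n` (intermediate size) as defined earlier in `bench_run`:

```python
import torch

# Sketch of the stride tensors added on the right-hand side of this diff.
# Each grouped GEMM receives one int64 leading stride per expert:
#   GEMM 1 reads k-wide rows and writes 2 * n (fused gate/up projections),
#   GEMM 2 reads n-wide rows and writes k.
def make_expert_strides(num_experts: int, k: int, n: int, device: str = "cuda"):
    ab_strides1 = torch.full((num_experts,), k, device=device, dtype=torch.int64)
    c_strides1 = torch.full((num_experts,), 2 * n, device=device, dtype=torch.int64)
    ab_strides2 = torch.full((num_experts,), n, device=device, dtype=torch.int64)
    c_strides2 = torch.full((num_experts,), k, device=device, dtype=torch.int64)
    return ab_strides1, c_strides1, ab_strides2, c_strides2
```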
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import copy

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import copy

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 import torch.utils.benchmark as benchmark

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import json

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 from typing import Any, TypedDict
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 import time
@@ -85,10 +84,7 @@ def main(
     if version == "v2":
         if current_platform.is_rocm():
             global PARTITION_SIZE
-            if not args.custom_paged_attn and not current_platform.is_navi():
-                PARTITION_SIZE = 1024
-            else:
-                PARTITION_SIZE = PARTITION_SIZE_ROCM
+            PARTITION_SIZE = 1024 if not args.custom_paged_attn else PARTITION_SIZE_ROCM
         num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
         tmp_output = torch.empty(
             size=(num_seqs, num_query_heads, num_partitions, head_size),
@@ -163,7 +159,6 @@ def main(
             scale,
             block_tables,
             seq_lens,
-            None,
             block_size,
             max_seq_len,
             alibi_slopes,
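
The v2 paged-attention hunk collapses the ROCm partition-size branch into a single conditional expression and then ceil-divides the maximum sequence length. A standalone sketch of that arithmetic; the value of `PARTITION_SIZE_ROCM` is an assumption for illustration, not read off the diff:

```python
# Sketch of the partition arithmetic from the hunk above. The 256 default for
# PARTITION_SIZE_ROCM is assumed here, not taken from the diff.
PARTITION_SIZE_ROCM = 256

def num_partitions(max_seq_len: int, custom_paged_attn: bool) -> int:
    partition_size = 1024 if not custom_paged_attn else PARTITION_SIZE_ROCM
    # Ceil division: every partition covers partition_size tokens.
    return (max_seq_len + partition_size - 1) // partition_size
```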
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 from typing import Optional, Union

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from itertools import accumulate
 from typing import Optional
@@ -23,7 +22,7 @@ def benchmark_rope_kernels_multi_lora(
     seed: int,
     device: str,
     max_position: int = 8192,
-    base: float = 10000,
+    base: int = 10000,
 ) -> None:
     current_platform.seed_everything(seed)
     torch.set_default_device(device)
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 WEIGHT_SHAPES = {
     "ideal": [[4 * 256 * 32, 256 * 32]],

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from sglang quantization/tuning_block_wise_kernel.py
 
 import argparse

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # fmt: off
 # ruff: noqa: E501
 import time

@@ -1,13 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 import pickle
+import re
 from collections import defaultdict
 
 import matplotlib.pyplot as plt
 import pandas as pd
-import regex as re
 import seaborn as sns
 from torch.utils.benchmark import Measurement as TMeasurement
 
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from collections.abc import Iterable

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Weight Shapes are in the format
 # ([K, N], TP_SPLIT_DIM)
@@ -49,50 +48,4 @@ WEIGHT_SHAPES = {
         ([16384, 106496], 1),
         ([53248, 16384], 0),
     ],
-    "meta-llama/Llama-3.1-8B-Instruct": [
-        ([4096, 6144], 1),
-        ([4096, 4096], 0),
-        ([4096, 28672], 1),
-        ([14336, 4096], 0),
-    ],
-    "meta-llama/Llama-3.3-70B-Instruct": [
-        ([8192, 10240], 1),
-        ([8192, 8192], 0),
-        ([8192, 57344], 1),
-        ([28672, 8192], 0),
-    ],
-    "mistralai/Mistral-Large-Instruct-2407": [
-        ([12288, 14336], 1),
-        ([12288, 12288], 0),
-        ([12288, 57344], 1),
-        ([28672, 12288], 0),
-    ],
-    "Qwen/Qwen2.5-7B-Instruct": [
-        ([3584, 4608], 1),
-        ([3584, 3584], 0),
-        ([3584, 37888], 1),
-        ([18944, 3584], 0),
-    ],
-    "Qwen/Qwen2.5-32B-Instruct": [
-        ([5120, 7168], 1),
-        ([5120, 5120], 0),
-        ([5120, 55296], 1),
-        ([27648, 5120], 0),
-    ],
-    "Qwen/Qwen2.5-72B-Instruct": [
-        ([8192, 10240], 1),
-        ([8192, 8192], 0),
-        ([8192, 59136], 1),
-        ([29568, 8192], 0),
-    ],
-    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
-        ([2048, 3072], 1),
-        ([2048, 4096], 1),
-        ([2048, 2048], 0),
-        ([2048, 576], 0),
-        ([2048, 21888], 1),
-        ([10944, 2048], 0),
-        ([2048, 2816], 1),
-        ([1408, 2048], 0),
-    ],
 }
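
These `WEIGHT_SHAPES` entries use the `([K, N], TP_SPLIT_DIM)` format noted at the top of the file: under tensor parallelism the named dimension is divided across ranks. A small sketch of that bookkeeping (hypothetical helper, for illustration only):

```python
# Hypothetical helper showing how a ([K, N], TP_SPLIT_DIM) entry shards:
# the dimension named by TP_SPLIT_DIM is divided evenly across tp_size ranks.
def shard_weight_shape(entry, tp_size: int):
    (k, n), split_dim = entry
    shape = [k, n]
    assert shape[split_dim] % tp_size == 0, "shape must divide evenly"
    shape[split_dim] //= tp_size
    return shape

# e.g. shard_weight_shape(([16384, 106496], 1), tp_size=8) -> [16384, 13312]
```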
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import cProfile
 import pstats

@@ -6,6 +6,11 @@
 
 [tool.ruff]
 line-length = 88
+exclude = [
+    # External file, leaving license intact
+    "examples/other/fp8/quantizer/quantize.py",
+    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
+]
 
 [tool.ruff.lint.per-file-ignores]
 "vllm/third_party/**" = ["ALL"]
@@ -75,7 +75,6 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
 else()
     find_isa(${CPUINFO} "avx2" AVX2_FOUND)
     find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
-    find_isa(${CPUINFO} "Power11" POWER11_FOUND)
     find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
     find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
     find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
@@ -107,19 +106,13 @@ elseif (AVX2_FOUND)
     list(APPEND CXX_COMPILE_FLAGS "-mavx2")
     message(WARNING "vLLM CPU backend using AVX2 ISA")
 
-elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+elseif (POWER9_FOUND OR POWER10_FOUND)
     message(STATUS "PowerPC detected")
-    if (POWER9_FOUND)
-        list(APPEND CXX_COMPILE_FLAGS
-            "-mvsx"
-            "-mcpu=power9"
-            "-mtune=power9")
-    elseif (POWER10_FOUND OR POWER11_FOUND)
-        list(APPEND CXX_COMPILE_FLAGS
-            "-mvsx"
-            "-mcpu=power10"
-            "-mtune=power10")
-    endif()
+    # Check for PowerPC VSX support
+    list(APPEND CXX_COMPILE_FLAGS
+        "-mvsx"
+        "-mcpu=native"
+        "-mtune=native")
 
 elseif (ASIMD_FOUND)
     message(STATUS "ARMv8 or later architecture detected")
@@ -46,38 +46,22 @@ else()
 endif()
 
 
-# Ensure the vllm/vllm_flash_attn directory exists before installation
-install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS)
-
-# Make sure vllm-flash-attn install rules are nested under vllm/
-# This is here to support installing all components under the same prefix with cmake --install.
-# setup.py installs every component separately but uses the same prefix for all.
-# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3,
-# and these statements don't hurt when installing neither component.
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS)
-install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
-install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS)
-
 # Fetch the vllm-flash-attn library
 FetchContent_MakeAvailable(vllm-flash-attn)
 message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
 
-# Restore the install prefix
-install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
-
 # Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
 # case only one is built, in the case both are built redundant work is done)
 install(
   DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
-  DESTINATION vllm/vllm_flash_attn
+  DESTINATION vllm_flash_attn
   COMPONENT _vllm_fa2_C
   FILES_MATCHING PATTERN "*.py"
 )
 
 install(
   DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
-  DESTINATION vllm/vllm_flash_attn
+  DESTINATION vllm_flash_attn
   COMPONENT _vllm_fa3_C
   FILES_MATCHING PATTERN "*.py"
 )
@@ -1,6 +1,5 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 #
 # A command line tool for running pytorch's hipify preprocessor on CUDA

@@ -76,7 +76,7 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
   set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
   add_custom_target(
     hipify${NAME}
-    COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
+    COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
     DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
     BYPRODUCTS ${HIP_SRCS}
     COMMENT "Running hipify on ${NAME} extension source files.")
@@ -143,14 +143,6 @@ void merge_attn_states_launcher(torch::Tensor& output,
   const uint pack_size = 16 / sizeof(scalar_t);
   TORCH_CHECK(head_size % pack_size == 0,
               "headsize must be multiple of pack_size:", pack_size);
-  TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
-              "output heads must be contiguous in memory");
-  TORCH_CHECK(
-      prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
-      "prefix_output heads must be contiguous in memory");
-  TORCH_CHECK(
-      suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
-      "suffix_output heads must be contiguous in memory");
   float* output_lse_ptr = nullptr;
   if (output_lse.has_value()) {
     output_lse_ptr = output_lse.value().data_ptr<float>();

@@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options(
       {static_cast<ElementOut*>(out.data_ptr()), stride_O,
        static_cast<ElementAcc*>(nullptr), stride_LSE},
       hw_info,
-      1,        // split_kv
+      -1,       // split_kv
       nullptr,  // is_var_split_kv
   };
   // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute

@@ -19,7 +19,6 @@ namespace vec_op {
 #define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...)          \
   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)      \
   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)   \
-  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)       \
   AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)
 
 #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
@@ -15,6 +15,15 @@
                 cutlassGetStatusString(error));           \
   }
 
+/**
+ * Panic wrapper for unwinding CUDA runtime errors
+ */
+#define CUDA_CHECK(status)                                        \
+  {                                                               \
+    cudaError_t error = status;                                   \
+    TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \
+  }
+
 inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
   int max_shared_mem_per_block_opt_in = 0;
   cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 from typing import Union
@@ -13,10 +13,6 @@
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_store.cuh>
 
-#ifdef USE_ROCM
-namespace cub = hipcub;
-#endif
-
 #include "static_switch.h"
 
 
@@ -505,9 +501,15 @@ void causal_conv1d_fwd_launch(ConvParamsBase &params, cudaStream_t stream) {
     auto kernel = &causal_conv1d_fwd_kernel<Ktraits>;
 
     if (kSmemSize >= 48 * 1024) {
+        #ifndef USE_ROCM
+        C10_CUDA_CHECK(cudaFuncSetAttribute(
+            kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
+        #else
+        // There is a slight signature discrepancy in HIP and CUDA "FuncSetAttribute" function.
         C10_CUDA_CHECK(cudaFuncSetAttribute(
             (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
         std::cerr << "Warning (causal_conv1d fwd launch): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. \n" << std::endl;
+        #endif
     }
     kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
 
@@ -321,7 +321,7 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
     auto kernel = &selective_scan_fwd_kernel<Ktraits>;
     if (kSmemSize >= 48 * 1024) {
         C10_CUDA_CHECK(cudaFuncSetAttribute(
-            (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
+            kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
     }
     kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
     C10_CUDA_KERNEL_LAUNCH_CHECK();
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import glob
 import itertools
 import os
@@ -28,10 +28,4 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                              torch::Tensor num_tokens_post_pad, int64_t top_k,
                              int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
                              int64_t BLOCK_SIZE_K, int64_t bit);
 #endif
-
-bool moe_permute_unpermute_supported();
-
-void shuffle_rows(const torch::Tensor& input_tensor,
-                  const torch::Tensor& dst2src_map,
-                  torch::Tensor& output_tensor);
@@ -5,9 +5,6 @@
 #include "permute_unpermute_kernels/dispatch.h"
 #include "core/registration.h"
 
-// moe_permute kernels require at least CUDA 12.0
-#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)
-
 void moe_permute(
     const torch::Tensor& input,         // [n_token, hidden]
     const torch::Tensor& topk_weights,  //[n_token, topk]
@@ -130,101 +127,7 @@ void moe_unpermute(
   });
 }
 
-template <typename T>
-__global__ void shuffleInputRowsKernel(const T* input,
-                                       const int32_t* dst2src_map, T* output,
-                                       int64_t num_src_rows,
-                                       int64_t num_dst_rows, int64_t num_cols) {
-  int64_t dest_row_idx = blockIdx.x;
-  int64_t const source_row_idx = dst2src_map[dest_row_idx];
-
-  if (blockIdx.x < num_dst_rows) {
-    // Load 128-bits per thread
-    constexpr int64_t ELEM_PER_THREAD = 128 / sizeof(T) / 8;
-    using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
-
-    // Duplicate and permute rows
-    auto const* source_row_ptr =
-        reinterpret_cast<DataElem const*>(input + source_row_idx * num_cols);
-    auto* dest_row_ptr =
-        reinterpret_cast<DataElem*>(output + dest_row_idx * num_cols);
-
-    int64_t const start_offset = threadIdx.x;
-    int64_t const stride = blockDim.x;
-    int64_t const num_elems_in_col = num_cols / ELEM_PER_THREAD;
-
-    for (int elem_index = start_offset; elem_index < num_elems_in_col;
-         elem_index += stride) {
-      dest_row_ptr[elem_index] = source_row_ptr[elem_index];
-    }
-  }
-}
-
-void shuffle_rows(const torch::Tensor& input_tensor,
-                  const torch::Tensor& dst2src_map,
-                  torch::Tensor& output_tensor) {
-  TORCH_CHECK(input_tensor.scalar_type() == output_tensor.scalar_type(),
-              "Input and output tensors must have the same data type");
-
-  auto stream = at::cuda::getCurrentCUDAStream().stream();
-  int64_t const blocks = output_tensor.size(0);
-  int64_t const threads = 256;
-  int64_t const num_dest_rows = output_tensor.size(0);
-  int64_t const num_src_rows = input_tensor.size(0);
-  int64_t const num_cols = input_tensor.size(1);
-
-  TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)),
-              "num_cols must be divisible by 128 / "
-              "sizeof(input_tensor.scalar_type()) / 8");
-
-  MOE_DISPATCH(input_tensor.scalar_type(), [&] {
-    shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
-        reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
-        dst2src_map.data_ptr<int32_t>(),
-        reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
-        num_dest_rows, num_cols);
-  });
-}
-
-#else
-
-void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
-                 torch::Tensor& topk_ids,
-                 const torch::Tensor& token_expert_indicies,
-                 const std::optional<torch::Tensor>& expert_map,
-                 int64_t n_expert, int64_t n_local_expert, int64_t topk,
-                 const std::optional<int64_t>& align_block_size,
-                 torch::Tensor& permuted_input,
-                 torch::Tensor& expert_first_token_offset,
-                 torch::Tensor& src_row_id2dst_row_id_map,
-                 torch::Tensor& m_indices) {
-  TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0");
-}
-
-void moe_unpermute(const torch::Tensor& input,
-                   const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
-                   const torch::Tensor& token_expert_indicies,
-                   const std::optional<torch::Tensor>& expert_map,
-                   int64_t n_expert, int64_t n_local_expert, int64_t topk,
-                   const std::optional<int64_t>& align_block_size,
-                   torch::Tensor& permuted_input,
-                   torch::Tensor& expert_first_token_offset,
-                   torch::Tensor& src_row_id2dst_row_id_map,
-                   torch::Tensor& m_indices) {
-  TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0");
-}
-
-#endif
-
-bool moe_permute_unpermute_supported() {
-#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)
-  return true;
-#else
-  return false;
-#endif
-}
-
 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
   m.impl("moe_permute", &moe_permute);
   m.impl("moe_unpermute", &moe_unpermute);
 }
@@ -14,13 +14,12 @@
     __VA_ARGS__();                                          \
     break;                                                  \
   }
 #define MOE_DISPATCH_FLOAT_CASE(...)                        \
   MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)     \
   MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)      \
   MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)  \
   MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
-  MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
-  MOE_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
+  MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)
 
 #define MOE_DISPATCH(TYPE, ...) \
   MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__))

@@ -40,11 +39,6 @@ template <>
 struct ScalarType2CudaType<at::ScalarType::BFloat16> {
   using type = __nv_bfloat16;
 };
-// uint8 for packed fp4
-template <>
-struct ScalarType2CudaType<at::ScalarType::Byte> {
-  using type = uint8_t;
-};
 
 // #if __CUDA_ARCH__ >= 890
 // fp8
@@ -1,9 +1,6 @@
 
 #include "moe_permute_unpermute_kernel.h"
 
-// moe_permute kernels require at least CUDA 12.0
-#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)
-
 // CubKeyValueSorter definition begin
 CubKeyValueSorter::CubKeyValueSorter()
     : num_experts_(0), num_bits_(sizeof(int) * 8) {}
@@ -134,6 +131,9 @@ __global__ void preprocessTopkIdKernel(int* topk_id_ptr, int size,
                                        int num_experts) {
   auto tidx = threadIdx.x;
   auto bidx = blockIdx.x;
+  auto lidx = tidx & 31;
+  auto widx = tidx >> 5;
+  auto warp_count = (blockDim.x + 31) >> 5;
   auto offset = bidx * blockDim.x;
   auto bound = min(offset + blockDim.x, size);
   extern __shared__ int smem_expert_map[];
@@ -226,6 +226,4 @@ void getMIndices(int64_t* expert_first_token_offset,
       expert_first_token_offset, align_expert_first_token_offset, m_indices,
       num_local_expert, align_block_size);
 }
 }
-
-#endif
@@ -516,8 +516,9 @@ void topk_softmax(
             topk,
             stream);
     }
-    else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
+    else
     {
+        assert(topk_indices.scalar_type() == at::ScalarType::UInt32);
         vllm::moe::topkGatingSoftmaxKernelLauncher(
             gating_output.data_ptr<float>(),
             topk_weights.data_ptr<float>(),
@@ -529,17 +530,4 @@ void topk_softmax(
             topk,
             stream);
     }
-    else {
-        assert(topk_indices.scalar_type() == at::ScalarType::Int64);
-        vllm::moe::topkGatingSoftmaxKernelLauncher(
-            gating_output.data_ptr<float>(),
-            topk_weights.data_ptr<float>(),
-            topk_indices.data_ptr<int64_t>(),
-            token_expert_indices.data_ptr<int>(),
-            softmax_workspace.data_ptr<float>(),
-            num_tokens,
-            num_experts,
-            topk,
-            stream);
-    }
 }
Some files were not shown because too many files have changed in this diff.