Compare commits

121 commits (use-uv-pyt… → codex/remo…)
Commits (abbreviated SHA1):
944913c0fa, b8f603cebe, fc679696f8, ab5e7d93f4, 0340f45553, 19a00eb210, 391612e78b, 77c95f72f7,
59f30d0448, 43c146ca42, 7c2ec0fe87, 039b6bade3, 6c04638214, 91ac7f764d, 4be7d7c1c9, 59b477645c,
778f554157, d3c84297c3, f509a20846, 60bc25e74c, b893d661b1, 6b6e98775f, 9c3c21c519, 512b8affa4,
1c0c68202c, 5f317530ec, 557b2e961d, 4e256cadc2, d6953beb91, 17edd8a807, 3303cfb4ac, b7e8e4e6be,
432e1cbc23, 201c971e96, e0986ea07b, a964e5e6c3, 78c1d5bfd2, 59a85c366e, 119f00630b, a42d2df75f,
5c057e068f, ed3aeb25a4, 86ee949128, 4570535ec4, 2a6dc67eb5, f05fea1f5e, d0df145c2a, 1838cd4860,
7d6b03381e, 7c2e91c4e0, 736fbf4c89, 44ea85137a, d3d649efec, ea507c3a93, 9705fba7b7, 2f7dbc9b42,
ea25a76c05, 67bc0c003e, 5a05f26603, 7ef40bb983, 767cbb011d, 7cfa4b24bf, b71fcd4905, 75003f34e8,
78b8015a4d, 831b124151, c1ffcb55da, 0879736aab, a26917332f, cd9e5b8340, 300a59c4c3, d76541a6c5,
dd96465fd7, 4f8f47e87e, d78fda7cda, 73a99cc2a5, adae0c1f43, cbf9221992, 5f42fc53b6, 8ee846c27c,
812b7f54a8, 5f2cacdb1e, aa5053e3fe, 79aa244678, 2ed3f20dba, 48f309029a, 0e93ac0b3a, 5446ad1d24,
f9a8084e48, 3e70e3d4d5, eb0fa43868, 0ad9951c41, 8c9117181d, c4b48d3c0f, 10d765482d, 39b643dc1a,
711f485643, 9c5ee91b2a, 27edd2aeb4, e5017cd6d6, 6a7796e871, 47b9339546, 5d5146eee3, 2aaa423842,
ad2d788016, 36ce76c632, f1fc2107a3, 13cdc02173, 502640c3f9, 3d5f1c8640, 1cab2f9cad, 1e50f1be70,
ad87ba927a, decf7f794b, d00d652998, 3b279a84be, 5e4a8223c6, e51de388a2, cc253b73d3, 7d6fb905d9,
418d111f8c
@@ -368,7 +368,7 @@ if __name__ == "__main__":
     # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
     # we want to turn it into "8xGPUTYPE"
     df["GPU"] = df["GPU"].apply(
-        lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+        lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
    )

     # get markdown tables
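For context, the hunk above swaps `str.split('\n')` for `str.splitlines()` when collapsing the multi-line GPU column into an "8xGPUTYPE" style label; a backslash escape inside an f-string is rejected by Python versions before 3.12. A minimal, self-contained sketch of the same transformation (the column values below are made up for illustration):

```python
import pandas as pd

# Hypothetical GPU column as reported by the node: one GPU type per line,
# repeated once per card.
df = pd.DataFrame({"GPU": ["A100-SXM4-80GB\nA100-SXM4-80GB", "H100\nH100\nH100\nH100"]})

# splitlines() avoids embedding a literal "\n" escape inside the f-string.
df["GPU"] = df["GPU"].apply(
    lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
)

print(df["GPU"].tolist())  # ['2xA100-SXM4-80GB', '4xH100']
```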
@@ -181,18 +181,14 @@ launch_vllm_server() {
   if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
     echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
     model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-    server_command="python3 \
-      -m vllm.entrypoints.openai.api_server \
+    server_command="vllm serve $model \
       -tp $tp \
-      --model $model \
       --port $port \
       $server_args"
   else
     echo "Key 'fp8' does not exist in common params."
-    server_command="python3 \
-      -m vllm.entrypoints.openai.api_server \
+    server_command="vllm serve $model \
       -tp $tp \
-      --model $model \
       --port $port \
       $server_args"
   fi
@@ -365,8 +365,7 @@ run_serving_tests() {
       continue
     fi

-    server_command="$server_envs python3 \
-      -m vllm.entrypoints.openai.api_server \
+    server_command="$server_envs vllm serve \
       $server_args"

     # run the server
@@ -1,46 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
-    # pycodestyle
-    "E",
-    # Pyflakes
-    "F",
-    # pyupgrade
-    "UP",
-    # flake8-bugbear
-    "B",
-    # flake8-simplify
-    "SIM",
-    # isort
-    "I",
-    # flake8-logging-format
-    "G",
-]
-ignore = [
-    # star imports
-    "F405", "F403",
-    # lambda expression assignment
-    "E731",
-    # Loop control variable not used within loop body
-    "B007",
-    # f-string format
-    "UP032",
-    # Can remove once 3.10+ is the minimum Python version
-    "UP007",
-]
-
-[tool.ruff.format]
-docstring-code-format = true
@@ -150,11 +150,16 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
-      - "docker push vllm/vllm-openai:nightly"
-      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
+      - "docker push vllm/vllm-openai:nightly-x86_64"
+      - "docker push vllm/vllm-openai:nightly-aarch64"
+      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+      - "docker manifest push vllm/vllm-openai:nightly"
+      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
       # Clean up old nightly builds (keep only last 14)
       - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
     plugins:
@@ -163,3 +168,4 @@ steps:
           password-env: DOCKERHUB_TOKEN
     env:
       DOCKER_BUILDKIT: "1"
+      DOCKERHUB_USERNAME: "vllmbot"
@@ -8,20 +8,41 @@ set -ex
 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"

-# Get DockerHub token from environment
+# Get DockerHub credentials from environment
 if [ -z "$DOCKERHUB_TOKEN" ]; then
     echo "Error: DOCKERHUB_TOKEN environment variable is not set"
     exit 1
 fi

+if [ -z "$DOCKERHUB_USERNAME" ]; then
+    echo "Error: DOCKERHUB_USERNAME environment variable is not set"
+    exit 1
+fi
+
+# Get DockerHub bearer token
+echo "Getting DockerHub bearer token..."
+set +x
+BEARER_TOKEN=$(curl -s -X POST \
+    -H "Content-Type: application/json" \
+    -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
+    "https://hub.docker.com/v2/users/login" | jq -r '.token')
+set -x
+
+if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
+    echo "Error: Failed to get DockerHub bearer token"
+    exit 1
+fi
+
 # Function to get all tags from DockerHub
 get_all_tags() {
     local page=1
     local all_tags=""

     while true; do
-        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
+        set +x
+        local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
             "$REPO_API_URL?page=$page&page_size=100")
+        set -x

         # Get both last_updated timestamp and tag name, separated by |
         local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
@@ -43,7 +64,9 @@ delete_tag() {
     echo "Deleting tag: $tag_name"

     local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
-    local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
+    set +x
+    local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
+    set -x

     if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
         echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
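The cleanup script now exchanges the DockerHub username and access token for a short-lived bearer token before calling the tags API. A rough Python sketch of the same flow, using only the endpoints that appear in the script above (error handling trimmed; this is an illustration, not part of the pipeline):

```python
import os
import requests

username = os.environ["DOCKERHUB_USERNAME"]
password = os.environ["DOCKERHUB_TOKEN"]

# Exchange credentials for a bearer token (same endpoint the script curls).
login = requests.post(
    "https://hub.docker.com/v2/users/login",
    json={"username": username, "password": password},
    timeout=30,
)
bearer = login.json()["token"]

# List nightly-* tags page by page, mirroring get_all_tags() above.
tags, page = [], 1
while True:
    resp = requests.get(
        "https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags",
        params={"page": page, "page_size": 100},
        headers={"Authorization": f"Bearer {bearer}"},
        timeout=30,
    ).json()
    tags += [t["name"] for t in resp["results"] if t["name"].startswith("nightly-")]
    if not resp.get("next"):
        break
    page += 1

print(f"found {len(tags)} nightly tags")
```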
.buildkite/scripts/hardware_ci/run-npu-test.sh (new file, 191 lines)

#!/bin/bash

# This script build the Ascend NPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# Base ubuntu image with basic ascend development libraries and python installed
VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
TEST_RUN_CONFIG_FILE="vllm_test.cfg"
VLLM_ASCEND_TMP_DIR=
# Get the test run configuration file from the vllm-ascend repository
fetch_vllm_test_cfg() {
    VLLM_ASCEND_TMP_DIR=$(mktemp -d)
    # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
    cleanup() {
        rm -rf "${VLLM_ASCEND_TMP_DIR}"
    }
    trap cleanup EXIT

    GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
    if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
        echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
        exit 1
    fi

    # If the file already exists locally, just overwrite it
    cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
    echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"

    # Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources
    # when the trap is abnormal has been completed, so the temporary resources are manually deleted here.
    rm -rf "${VLLM_ASCEND_TMP_DIR}"
    trap - EXIT
}

# Downloads test run configuration file from a remote URL.
# Loads the configuration into the current script environment.
get_config() {
    if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
        exit 1
    fi
    source "${TEST_RUN_CONFIG_FILE}"
    echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
    return 0
}

# get test running configuration.
fetch_vllm_test_cfg
get_config
# Check if the function call was successful. If not, exit the script.
if [ $? -ne 0 ]; then
    exit 1
fi

image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p ${builder_cache_dir}

# Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
    --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
    --cache-to type=local,dest=${builder_cache_dir},mode=max \
    --progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME}

# Define environments
ENV DEBIAN_FRONTEND=noninteractive

RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
    pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
    apt-get update -y && \
    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
    rm -rf /var/cache/apt/* && \
    rm -rf /var/lib/apt/lists/*

# Install for pytest to make the docker build cache layer always valid
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install pytest>=6.0 modelscope

WORKDIR /workspace/vllm

# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements/common.txt

COPY . .

# Install vLLM
RUN --mount=type=cache,target=/root/.cache/pip \
    VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton

# Install vllm-ascend
WORKDIR /workspace
ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
ARG VLLM_ASCEND_TAG=main
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
    git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend

# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r /workspace/vllm-ascend/requirements.txt

RUN --mount=type=cache,target=/root/.cache/pip \
    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
    source /usr/local/Ascend/nnal/atb/set_env.sh && \
    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/

ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV VLLM_USE_MODELSCOPE=True

WORKDIR /workspace/vllm-ascend

CMD ["/bin/bash"]

EOF

# Setup cleanup
remove_docker_container() {
    docker rm -f "${container_name}" || true;
    docker image rm -f "${image_name}" || true;
    docker system prune -f || true;
}
trap remove_docker_container EXIT

# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
# returns --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices() {
    local input="$1"
    local index cards_num
    if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
        index="${BASH_REMATCH[1]}"
        cards_num="${BASH_REMATCH[2]}"
    else
        echo "parse error" >&2
        return 1
    fi

    local devices=""
    local i=0
    while (( i < cards_num )); do
        local dev_idx=$(((index - 1)*cards_num + i ))
        devices="$devices --device /dev/davinci${dev_idx}"
        ((i++))
    done

    # trim leading space
    devices="${devices#"${devices%%[![:space:]]*}"}"
    # Output devices: assigned to the caller variable
    printf '%s' "$devices"
}

devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1

# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p ${model_cache_dir}
docker run \
    ${devices} \
    --device /dev/davinci_manager \
    --device /dev/devmm_svm \
    --device /dev/hisi_hdc \
    -v /usr/local/dcmi:/usr/local/dcmi \
    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
    -v /etc/ascend_install.info:/etc/ascend_install.info \
    -v ${model_cache_dir}:/root/.cache/modelscope \
    --entrypoint="" \
    --name "${container_name}" \
    "${image_name}" \
    bash -c '
    set -e
    pytest -v -s tests/e2e/vllm_interface/
    '
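The `parse_and_gen_devices` helper in the new script maps the Buildkite agent name to a set of `/dev/davinci*` flags. A small Python re-expression of the same index arithmetic, useful for sanity-checking an agent name (the names below are only examples):

```python
import re


def npu_device_flags(agent_name: str) -> str:
    """Mirror parse_and_gen_devices: {hostname}-{agent_idx}-{npu_card_num}cards."""
    m = re.search(r"(\d+)-(\d+)cards$", agent_name)
    if not m:
        raise ValueError(f"cannot parse agent name: {agent_name!r}")
    index, cards = int(m.group(1)), int(m.group(2))
    # agent_idx starts from 1, so agent N owns cards [(N-1)*cards, N*cards).
    devs = [f"--device /dev/davinci{(index - 1) * cards + i}" for i in range(cards)]
    return " ".join(devs)


print(npu_device_flags("atlas-a2-001-1-2cards"))  # --device /dev/davinci0 --device /dev/davinci1
print(npu_device_flags("atlas-a2-001-2-2cards"))  # --device /dev/davinci2 --device /dev/davinci3
```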
@@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
bench_throughput_exit_code=$?

# run server-based benchmarks and upload the result to buildkite
-python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
+vllm serve meta-llama/Llama-2-7b-chat-hf &
server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
@@ -397,6 +397,7 @@ steps:
     - pytest -v -s compile/test_pass_manager.py
     - pytest -v -s compile/test_fusion.py
     - pytest -v -s compile/test_fusion_attn.py
+    - pytest -v -s compile/test_functionalization.py
     - pytest -v -s compile/test_silu_mul_quant_fusion.py
     - pytest -v -s compile/test_sequence_parallelism.py
     - pytest -v -s compile/test_async_tp.py
@@ -476,6 +477,7 @@ steps:
   source_file_dependencies:
   - csrc/mamba/
   - tests/kernels/mamba
+  - vllm/model_executor/layers/mamba/ops
   commands:
     - pytest -v -s kernels/mamba

@@ -833,11 +835,11 @@ steps:
     - pytest -v -s tests/kernels/moe/test_flashinfer.py
     - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py

-- label: GPT-OSS Eval (Blackwell)
+- label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
   gpu: b200
-  optional: true # disable while debugging
+  optional: true # run on nightlies
   source_file_dependencies:
   - tests/evals/gpt_oss
   - vllm/model_executor/models/gpt_oss.py
@@ -864,6 +866,16 @@ steps:
   commands:
     - pytest -s -v tests/quantization/test_blackwell_moe.py

+- label: Blackwell LM Eval Small Models
+  timeout_in_minutes: 75
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+
 ##### 1 GPU test #####
 ##### multi gpus test #####

.github/CODEOWNERS (vendored)

@@ -23,6 +23,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
 /vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
+/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
.github/mergify.yml (vendored)

@@ -2,6 +2,7 @@ pull_request_rules:
 - name: label-documentation
   description: Automatically apply documentation label
   conditions:
+    - label != stale
     - or:
       - files~=^[^/]+\.md$
       - files~=^docs/

@@ -14,6 +15,7 @@ pull_request_rules:
 - name: label-ci-build
   description: Automatically apply ci/build label
   conditions:
+    - label != stale
     - or:
       - files~=^\.github/
       - files~=\.buildkite/

@@ -30,6 +32,7 @@ pull_request_rules:
 - name: label-deepseek
   description: Automatically apply deepseek label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/.*deepseek.*\.py
       - files~=^tests/.*deepseek.*\.py

@@ -46,6 +49,7 @@ pull_request_rules:
 - name: label-frontend
   description: Automatically apply frontend label
   conditions:
+    - label != stale
     - files~=^vllm/entrypoints/
   actions:
     label:

@@ -55,6 +59,7 @@ pull_request_rules:
 - name: label-llama
   description: Automatically apply llama label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/.*llama.*\.py
       - files~=^tests/.*llama.*\.py

@@ -70,6 +75,7 @@ pull_request_rules:
 - name: label-multi-modality
   description: Automatically apply multi-modality label
   conditions:
+    - label != stale
     - or:
       - files~=^vllm/multimodal/
       - files~=^tests/multimodal/

@@ -83,6 +89,7 @@ pull_request_rules:
 - name: label-new-model
   description: Automatically apply new-model label
   conditions:
+    - label != stale
     - and:
       - files~=^vllm/model_executor/models/
       - files=vllm/model_executor/models/registry.py

@@ -94,6 +101,7 @@ pull_request_rules:
 - name: label-performance
   description: Automatically apply performance label
   conditions:
+    - label != stale
     - or:
       - files~=^benchmarks/
       - files~=^vllm/benchmarks/

@@ -107,6 +115,7 @@ pull_request_rules:
 - name: label-qwen
   description: Automatically apply qwen label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/.*qwen.*\.py
       - files~=^tests/.*qwen.*\.py

@@ -121,6 +130,7 @@ pull_request_rules:
 - name: label-gpt-oss
   description: Automatically apply gpt-oss label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/.*gpt[-_]?oss.*\.py
       - files~=^tests/.*gpt[-_]?oss.*\.py

@@ -142,6 +152,7 @@ pull_request_rules:
 - name: label-rocm
   description: Automatically apply rocm label
   conditions:
+    - label != stale
     - or:
       - files~=^csrc/rocm/
       - files~=^docker/Dockerfile.rocm

@@ -162,6 +173,7 @@ pull_request_rules:
 - name: label-structured-output
   description: Automatically apply structured-output label
   conditions:
+    - label != stale
     - or:
       - files~=^benchmarks/structured_schemas/
       - files=benchmarks/benchmark_serving_structured_output.py

@@ -181,6 +193,7 @@ pull_request_rules:
 - name: label-speculative-decoding
   description: Automatically apply speculative-decoding label
   conditions:
+    - label != stale
     - or:
       - files~=^vllm/v1/spec_decode/
       - files~=^tests/v1/spec_decode/

@@ -196,6 +209,7 @@ pull_request_rules:
 - name: label-v1
   description: Automatically apply v1 label
   conditions:
+    - label != stale
     - or:
       - files~=^vllm/v1/
       - files~=^tests/v1/

@@ -208,6 +222,7 @@ pull_request_rules:
   description: Automatically apply tpu label
   # Keep this list in sync with `label-tpu-remove` conditions
   conditions:
+    - label != stale
     - or:
       - files~=tpu.py
       - files~=_tpu

@@ -223,6 +238,7 @@ pull_request_rules:
   description: Automatically remove tpu label
   # Keep this list in sync with `label-tpu` conditions
   conditions:
+    - label != stale
     - and:
       - -files~=tpu.py
       - -files~=_tpu

@@ -237,6 +253,7 @@ pull_request_rules:
 - name: label-tool-calling
   description: Automatically add tool-calling label
   conditions:
+    - label != stale
     - or:
       - files~=^tests/tool_use/
       - files~=^tests/entrypoints/openai/tool_parsers/

@@ -255,8 +272,9 @@ pull_request_rules:

 - name: ping author on conflicts and add 'needs-rebase' label
   conditions:
+    - label != stale
     - conflict
     - -closed
   actions:
     label:
       add:

@@ -270,6 +288,8 @@ pull_request_rules:

 - name: assign reviewer for tensorizer changes
   conditions:
+    - label != stale
+    - or:
     - files~=^vllm/model_executor/model_loader/tensorizer.py
     - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
     - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py

@@ -281,6 +301,7 @@ pull_request_rules:

 - name: assign reviewer for modelopt changes
   conditions:
+    - label != stale
     - or:
       - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
       - files~=^vllm/model_executor/layers/quantization/__init__\.py$

@@ -295,8 +316,8 @@ pull_request_rules:

 - name: remove 'needs-rebase' label when conflict is resolved
   conditions:
     - -conflict
     - -closed
   actions:
     label:
       remove:

@@ -305,6 +326,7 @@ pull_request_rules:
 - name: label-kv-connector
   description: Automatically apply kv-connector label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/online_serving/disaggregated[^/]*/.*
       - files~=^examples/offline_inference/disaggregated[^/]*/.*
.github/workflows/stale.yml (vendored)

@@ -13,7 +13,7 @@ jobs:
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
+      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
@@ -6,28 +6,16 @@ default_stages:
   - manual # Run in CI
 exclude: 'vllm/third_party/.*'
 repos:
-- repo: https://github.com/google/yapf
-  rev: v0.43.0
-  hooks:
-  - id: yapf
-    args: [--in-place, --verbose]
-    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
-    exclude: '(.buildkite|benchmarks|build|examples)/.*'
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.11.7
+  rev: v0.13.3
   hooks:
-  - id: ruff
+  - id: ruff-check
     args: [--output-format, github, --fix]
   - id: ruff-format
-    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
   rev: v1.35.5
   hooks:
   - id: typos
-- repo: https://github.com/PyCQA/isort
-  rev: 6.0.1
-  hooks:
-  - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
   rev: v20.1.3
   hooks:
@@ -667,7 +667,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()

   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
   else()
     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
   endif()
@@ -2,9 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc

+from benchmark_utils import TimeCollector
 from tabulate import tabulate

-from benchmark_utils import TimeCollector
 from vllm.utils import FlexibleArgumentParser
 from vllm.v1.core.block_pool import BlockPool

@@ -5,9 +5,9 @@ import time
 from unittest import mock

 import numpy as np
+from benchmark_utils import TimeCollector
 from tabulate import tabulate

-from benchmark_utils import TimeCollector
 from vllm.config import (
     CacheConfig,
     DeviceConfig,

@@ -164,7 +164,7 @@ def invoke_main() -> None:
     )
     parser.add_argument(
         "--batched", action="store_true", help="consider time to prepare batch"
-    )  # noqa: E501
+    )
     parser.add_argument(
         "--num-iteration",
         type=int,
@@ -37,14 +37,13 @@ from typing import Optional
 import datasets
 import numpy as np
 import pandas as pd
-from tqdm.asyncio import tqdm
-from transformers import PreTrainedTokenizerBase
-
 from backend_request_func import (
     ASYNC_REQUEST_FUNCS,
     RequestFuncInput,
     RequestFuncOutput,
 )
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase

 try:
     from vllm.transformers_utils.tokenizer import get_tokenizer

@@ -910,13 +909,13 @@ def create_argument_parser():
     parser.add_argument(
         "--tokenizer",
         type=str,
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",
     )
     parser.add_argument(
         "--tokenizer-mode",
         type=str,
         default="auto",
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",
     )
     parser.add_argument(
         "--num-prompts",
@@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    w8a8_block_fp8_matmul,
+    w8a8_triton_block_scaled_mm,
 )
 from vllm.utils import FlexibleArgumentParser, cdiv

@@ -158,7 +158,7 @@ def bench_fp8(
     "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
         a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
     ),
-    "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
+    "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
         a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
     ),
     "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
@@ -55,9 +55,7 @@ benchmark() {
     output_len=$2


-    CUDA_VISIBLE_DEVICES=0 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=0 vllm serve $model \
         --port 8100 \
         --max-model-len 10000 \
         --gpu-memory-utilization 0.6 \

@@ -65,9 +63,7 @@ benchmark() {
     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &


-    CUDA_VISIBLE_DEVICES=1 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=1 vllm serve $model \
         --port 8200 \
         --max-model-len 10000 \
         --gpu-memory-utilization 0.6 \
@@ -38,16 +38,12 @@ wait_for_server() {
 launch_chunked_prefill() {
     model="meta-llama/Meta-Llama-3.1-8B-Instruct"
     # disagg prefill
-    CUDA_VISIBLE_DEVICES=0 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=0 vllm serve $model \
         --port 8100 \
         --max-model-len 10000 \
         --enable-chunked-prefill \
         --gpu-memory-utilization 0.6 &
-    CUDA_VISIBLE_DEVICES=1 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=1 vllm serve $model \
         --port 8200 \
         --max-model-len 10000 \
         --enable-chunked-prefill \

@@ -62,18 +58,14 @@ launch_chunked_prefill() {
 launch_disagg_prefill() {
     model="meta-llama/Meta-Llama-3.1-8B-Instruct"
     # disagg prefill
-    CUDA_VISIBLE_DEVICES=0 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=0 vllm serve $model \
         --port 8100 \
         --max-model-len 10000 \
         --gpu-memory-utilization 0.6 \
         --kv-transfer-config \
         '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

-    CUDA_VISIBLE_DEVICES=1 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=1 vllm serve $model \
         --port 8200 \
         --max-model-len 10000 \
         --gpu-memory-utilization 0.6 \
benchmarks/kernels/benchmark_reshape_and_cache.py (new file, 174 lines)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations

import random
import time

import torch
from tabulate import tabulate

from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
    FlexibleArgumentParser,
    create_kv_caches_with_random,
)

logger = init_logger(__name__)


@torch.inference_mode()
def run_benchmark(
    num_tokens: int,
    num_heads: int,
    head_size: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    kv_cache_dtype: str,
    num_iters: int,
    benchmark_mode: str,
    device: str = "cuda",
) -> float:
    """Return latency (seconds) for given num_tokens."""

    if kv_cache_dtype == "fp8" and head_size % 16:
        raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")

    current_platform.seed_everything(42)
    torch.set_default_device(device)

    # create random key / value tensors [T, H, D].
    key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
    value = torch.randn_like(key)

    # prepare the slot mapping.
    # each token is assigned a unique slot in the KV-cache.
    num_slots = block_size * num_blocks
    if num_tokens > num_slots:
        raise ValueError("num_tokens cannot exceed the total number of cache slots")
    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)

    key_caches, value_caches = create_kv_caches_with_random(
        num_blocks,
        block_size,
        1,  # num_layers
        num_heads,
        head_size,
        kv_cache_dtype,
        dtype,
        device=device,
    )
    key_cache, value_cache = key_caches[0], value_caches[0]
    # to free unused memory
    del key_caches, value_caches

    # compute per-kernel scaling factors for fp8 conversion (if used).
    k_scale = (key.amax() / 64.0).to(torch.float32)
    v_scale = (value.amax() / 64.0).to(torch.float32)

    function_under_test = lambda: ops.reshape_and_cache(
        key,  # noqa: F821
        value,  # noqa: F821
        key_cache,  # noqa: F821
        value_cache,  # noqa: F821
        slot_mapping,  # noqa: F821
        kv_cache_dtype,
        k_scale,
        v_scale,
    )

    if benchmark_mode == "cudagraph":
        g = torch.cuda.CUDAGraph()
        with torch.cuda.graph(g):
            function_under_test()
        torch.cuda.synchronize()
        function_under_test = lambda: g.replay()

    def run_cuda_benchmark(n_iters: int) -> float:
        nonlocal key, value, key_cache, value_cache, slot_mapping
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(n_iters):
            function_under_test()
        torch.cuda.synchronize()
        end = time.perf_counter()
        return (end - start) / n_iters

    # warm-up
    run_cuda_benchmark(3)

    lat = run_cuda_benchmark(num_iters)

    # free tensors to mitigate OOM when sweeping
    del key, value, key_cache, value_cache, slot_mapping
    torch.cuda.empty_cache()

    return lat


def main(args):
    rows = []
    for exp in range(1, 17):
        n_tok = 2**exp
        lat = run_benchmark(
            num_tokens=n_tok,
            num_heads=args.num_heads,
            head_size=args.head_size,
            block_size=args.block_size,
            num_blocks=args.num_blocks,
            dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
            kv_cache_dtype=args.kv_cache_dtype,
            num_iters=args.iters,
            benchmark_mode=args.mode,
            device="cuda",
        )
        rows.append([n_tok, lat * 1e6])  # convert to microseconds

    print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
    print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))


if __name__ == "__main__":
    parser = FlexibleArgumentParser()

    parser.add_argument("--num-heads", type=int, default=128)
    parser.add_argument(
        "--head-size",
        type=int,
        choices=[64, 80, 96, 112, 120, 128, 192, 256],
        default=128,
    )
    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
    parser.add_argument("--num-blocks", type=int, default=128 * 128)

    parser.add_argument(
        "--dtype",
        type=str,
        choices=["half", "bfloat16", "float"],
        default="bfloat16",
    )

    parser.add_argument(
        "--kv-cache-dtype",
        type=str,
        choices=["auto", "fp8"],
        default="auto",
    )

    parser.add_argument("--iters", type=int, default=200)

    parser.add_argument(
        "--mode",
        type=str,
        choices=["cudagraph", "no_graph"],
        default="cudagraph",
    )

    args = parser.parse_args()

    main(args)
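The new benchmark sweeps num_tokens from 2^1 to 2^16 and prints a latency table. A minimal way to probe a single point without the argparse wrapper is sketched below; it assumes a CUDA machine with vLLM installed and that the file above is importable from the current directory, and simply reuses its `run_benchmark`:

```python
import torch
from benchmark_reshape_and_cache import run_benchmark  # the new file above

# One point from the sweep: 1024 tokens with the script's default shapes.
lat = run_benchmark(
    num_tokens=1024,
    num_heads=128,
    head_size=128,
    block_size=16,
    num_blocks=128 * 128,
    dtype=torch.bfloat16,
    kv_cache_dtype="auto",
    num_iters=200,
    benchmark_mode="cudagraph",
    device="cuda",
)
print(f"reshape_and_cache latency: {lat * 1e6:.3f} µs for 1024 tokens")
```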
@ -1,6 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
# fmt: off
|
|
||||||
# ruff: noqa: E501
|
# ruff: noqa: E501
|
||||||
import time
|
import time
|
||||||
|
|
||||||
@ -9,7 +8,7 @@ import torch
|
|||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||||
per_token_group_quant_fp8,
|
per_token_group_quant_fp8,
|
||||||
w8a8_block_fp8_matmul,
|
w8a8_triton_block_scaled_mm,
|
||||||
)
|
)
|
||||||
from vllm.triton_utils import triton
|
from vllm.triton_utils import triton
|
||||||
from vllm.utils.deep_gemm import (
|
from vllm.utils.deep_gemm import (
|
||||||
@ -20,19 +19,21 @@ from vllm.utils.deep_gemm import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def benchmark_shape(m: int,
|
def benchmark_shape(
|
||||||
n: int,
|
m: int,
|
||||||
k: int,
|
n: int,
|
||||||
warmup: int = 100,
|
k: int,
|
||||||
repeat: int = 10000,
|
warmup: int = 100,
|
||||||
verbose: bool = False) -> dict:
|
repeat: int = 10000,
|
||||||
|
verbose: bool = False,
|
||||||
|
) -> dict:
|
||||||
"""Benchmark all implementations for a specific (m, n, k) shape."""
|
"""Benchmark all implementations for a specific (m, n, k) shape."""
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
|
print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
|
||||||
|
|
||||||
# Create test tensors
|
# Create test tensors
|
||||||
A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
|
A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
|
||||||
B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
|
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
|
||||||
|
|
||||||
# Reference result in BF16
|
# Reference result in BF16
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
@ -49,34 +50,39 @@ def benchmark_shape(m: int,
|
|||||||
# Pre-quantize A for all implementations
|
# Pre-quantize A for all implementations
|
||||||
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
|
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
|
||||||
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
|
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
|
||||||
C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
|
C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
|
||||||
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
|
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
|
||||||
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
|
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
|
||||||
A, block_size[1], column_major_scales=True)
|
A, block_size[1], column_major_scales=True
|
||||||
|
)
|
||||||
|
|
||||||
# === DeepGEMM Implementation ===
|
# === DeepGEMM Implementation ===
|
||||||
def deepgemm_gemm():
|
def deepgemm_gemm():
|
||||||
fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
|
fp8_gemm_nt(
|
||||||
(B_deepgemm, B_scale_deepgemm),
|
(A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm
|
||||||
C_deepgemm)
|
)
|
||||||
return C_deepgemm
|
return C_deepgemm
|
||||||
|
|
||||||
# === vLLM Triton Implementation ===
|
# === vLLM Triton Implementation ===
|
||||||
def vllm_triton_gemm():
|
def vllm_triton_gemm():
|
||||||
return w8a8_block_fp8_matmul(A_vllm,
|
return w8a8_triton_block_scaled_mm(
|
||||||
B_vllm,
|
A_vllm,
|
||||||
A_scale_vllm,
|
B_vllm,
|
||||||
B_scale_vllm,
|
A_scale_vllm,
|
||||||
block_size,
|
B_scale_vllm,
|
||||||
output_dtype=torch.bfloat16)
|
block_size,
|
||||||
|
output_dtype=torch.bfloat16,
|
||||||
|
)
|
||||||
|
|
||||||
# === vLLM CUTLASS Implementation ===
|
# === vLLM CUTLASS Implementation ===
|
||||||
def vllm_cutlass_gemm():
|
def vllm_cutlass_gemm():
|
||||||
return ops.cutlass_scaled_mm(A_vllm_cutlass,
|
return ops.cutlass_scaled_mm(
|
||||||
B_vllm.T,
|
A_vllm_cutlass,
|
||||||
scale_a=A_scale_vllm_cutlass,
|
B_vllm.T,
|
||||||
scale_b=B_scale_vllm.T,
|
scale_a=A_scale_vllm_cutlass,
|
||||||
out_dtype=torch.bfloat16)
|
scale_b=B_scale_vllm.T,
|
||||||
|
out_dtype=torch.bfloat16,
|
||||||
|
)
|
||||||
|
|
||||||
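The warmup/repeat timing loop that consumes these three wrappers is only partially visible in this diff. As a minimal sketch, a GPU callable like deepgemm_gemm() is typically timed with CUDA events, which avoids counting host-side overhead; the helper name below is made up for illustration, but the torch.cuda APIs are standard.

import torch

def time_gpu_callable(fn, warmup: int = 100, repeat: int = 10000) -> float:
    """Return the average runtime of fn() in milliseconds using CUDA events."""
    for _ in range(warmup):
        fn()
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(repeat):
        fn()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / repeat  # elapsed_time() already reports milliseconds
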
# Run correctness check first
|
# Run correctness check first
|
||||||
if verbose:
|
if verbose:
|
||||||
@ -93,26 +99,23 @@ def benchmark_shape(m: int,
|
|||||||
print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
|
print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
|
||||||
print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
|
print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
|
||||||
print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
|
print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
|
||||||
print("vLLM Triton vs DeepGEMM difference: "
|
print(
|
||||||
f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
|
"vLLM Triton vs DeepGEMM difference: "
|
||||||
print("vLLM CUTLASS vs DeepGEMM difference: "
|
f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}"
|
||||||
f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")
|
)
|
||||||
|
print(
|
||||||
|
"vLLM CUTLASS vs DeepGEMM difference: "
|
||||||
|
f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
|
||||||
|
)
|
||||||
|
|
||||||
# Benchmark implementations
|
# Benchmark implementations
|
||||||
implementations = {
|
implementations = {
|
||||||
"DeepGEMM": deepgemm_gemm,
|
"DeepGEMM": deepgemm_gemm,
|
||||||
"vLLM Triton": vllm_triton_gemm,
|
"vLLM Triton": vllm_triton_gemm,
|
||||||
"vLLM CUTLASS": vllm_cutlass_gemm
|
"vLLM CUTLASS": vllm_cutlass_gemm,
|
||||||
}
|
}
|
||||||
|
|
||||||
benchmark_results = {
|
benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}}
|
||||||
"shape": {
|
|
||||||
"m": m,
|
|
||||||
"n": n,
|
|
||||||
"k": k
|
|
||||||
},
|
|
||||||
"implementations": {}
|
|
||||||
}
|
|
||||||
|
|
||||||
for name, func in implementations.items():
|
for name, func in implementations.items():
|
||||||
# Warmup
|
# Warmup
|
||||||
@ -140,38 +143,36 @@ def benchmark_shape(m: int,
|
|||||||
"tflops": tflops,
|
"tflops": tflops,
|
||||||
"gb_s": gb_s,
|
"gb_s": gb_s,
|
||||||
"diff": {
|
"diff": {
|
||||||
"DeepGEMM":
|
"DeepGEMM": 0.0
|
||||||
0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
|
if name == "DeepGEMM"
|
||||||
"Reference":
|
else calc_diff(func(), C_deepgemm),
|
||||||
deepgemm_diff if name == "DeepGEMM" else
|
"Reference": deepgemm_diff
|
||||||
(vllm_triton_diff
|
if name == "DeepGEMM"
|
||||||
if name == "vLLM Triton" else vllm_cutlass_diff)
|
else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff),
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
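The tflops and gb_s values recorded above are derived from the measured time per shape. A plausible way to compute them, assuming FP8 inputs (1 byte per element of A and B) and a BF16 output (2 bytes per element of C), is sketched below; the benchmark's exact byte accounting (for example whether scales are included) may differ.

def derive_rates(m: int, n: int, k: int, time_ms: float) -> tuple[float, float]:
    """Convert a measured time into TFLOPS and GB/s for a single GEMM shape."""
    flops = 2.0 * m * n * k                    # one multiply and one add per MAC
    bytes_moved = m * k + n * k + 2.0 * m * n  # fp8 A and B (1 B/elem), bf16 C (2 B/elem)
    seconds = time_ms / 1e3
    return flops / seconds / 1e12, bytes_moved / seconds / 1e9
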
if verbose:
|
if verbose:
|
||||||
print(
|
print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s")
|
||||||
f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Calculate speedups
|
# Calculate speedups
|
||||||
baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
|
baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
|
||||||
for name, data in benchmark_results["implementations"].items():
|
for name, data in benchmark_results["implementations"].items():
|
||||||
if name != "DeepGEMM":
|
if name != "DeepGEMM":
|
||||||
speedup = baseline / data["time_ms"]
|
speedup = baseline / data["time_ms"]
|
||||||
benchmark_results["implementations"][name][
|
benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup
|
||||||
"speedup_vs_deepgemm"] = speedup
|
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"DeepGEMM is {1/speedup:.2f}x "
|
print(
|
||||||
f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
|
f"DeepGEMM is {1 / speedup:.2f}x "
|
||||||
|
f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
|
||||||
|
)
|
||||||
|
|
||||||
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
|
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"]
|
||||||
"time_ms"]
|
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"]
|
||||||
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
|
|
||||||
"time_ms"]
|
|
||||||
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
|
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
|
||||||
benchmark_results["implementations"]["vLLM CUTLASS"][
|
benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = (
|
||||||
"speedup_vs_triton"] = cutlass_vs_triton
|
cutlass_vs_triton
|
||||||
|
)
|
||||||
if verbose:
|
if verbose:
|
||||||
print(
|
print(
|
||||||
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
|
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
|
||||||
@ -183,8 +184,7 @@ def benchmark_shape(m: int,
|
|||||||
|
|
||||||
def format_table_row(values, widths):
|
def format_table_row(values, widths):
|
||||||
"""Format a row with specified column widths."""
|
"""Format a row with specified column widths."""
|
||||||
return "| " + " | ".join(f"{val:{w}}"
|
return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"
|
||||||
for val, w in zip(values, widths)) + " |"
|
|
||||||
|
|
||||||
|
|
||||||
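As a quick usage note for the helper above, a header row and a data row can be produced like this (widths and values are arbitrary examples):

headers = ["m", "n", "k", "TFLOPS"]
widths = [max(6, len(h)) for h in headers]
print(format_table_row(headers, widths))                      # padded header row
print(format_table_row([64, 7168, 576, "1001.2"], widths))    # one padded data row
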
def print_table(headers, rows, title=None):
|
def print_table(headers, rows, title=None):
|
||||||
@ -292,38 +292,50 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
for result in all_results:
|
for result in all_results:
|
||||||
shape = result["shape"]
|
shape = result["shape"]
|
||||||
impl_data = result["implementations"]["DeepGEMM"]
|
impl_data = result["implementations"]["DeepGEMM"]
|
||||||
deepgemm_rows.append([
|
deepgemm_rows.append(
|
||||||
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
|
[
|
||||||
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
|
shape["m"],
|
||||||
])
|
shape["n"],
|
||||||
|
shape["k"],
|
||||||
|
f"{impl_data['time_us']:.1f}",
|
||||||
|
f"{impl_data['tflops']:.1f}",
|
||||||
|
f"{impl_data['gb_s']:.1f}",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
print_table(deepgemm_headers,
|
print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:")
|
||||||
deepgemm_rows,
|
|
||||||
title="DeepGEMM Implementation:")
|
|
||||||
|
|
||||||
# Print vLLM Triton table
|
# Print vLLM Triton table
|
||||||
triton_headers = [
|
triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"]
|
||||||
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
|
|
||||||
]
|
|
||||||
triton_rows = []
|
triton_rows = []
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
shape = result["shape"]
|
shape = result["shape"]
|
||||||
impl_data = result["implementations"]["vLLM Triton"]
|
impl_data = result["implementations"]["vLLM Triton"]
|
||||||
speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
|
speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
|
||||||
triton_rows.append([
|
triton_rows.append(
|
||||||
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
|
[
|
||||||
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
|
shape["m"],
|
||||||
format_speedup(speedup)
|
shape["n"],
|
||||||
])
|
shape["k"],
|
||||||
|
f"{impl_data['time_us']:.1f}",
|
||||||
|
f"{impl_data['tflops']:.1f}",
|
||||||
|
f"{impl_data['gb_s']:.1f}",
|
||||||
|
format_speedup(speedup),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
print_table(triton_headers,
|
print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:")
|
||||||
triton_rows,
|
|
||||||
title="vLLM Triton Implementation:")
|
|
||||||
|
|
||||||
# Print vLLM CUTLASS table
|
# Print vLLM CUTLASS table
|
||||||
cutlass_headers = [
|
cutlass_headers = [
|
||||||
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
|
"m",
|
||||||
"vs Triton"
|
"n",
|
||||||
|
"k",
|
||||||
|
"Time (μs)",
|
||||||
|
"TFLOPS",
|
||||||
|
"GB/s",
|
||||||
|
"vs DeepGEMM",
|
||||||
|
"vs Triton",
|
||||||
]
|
]
|
||||||
cutlass_rows = []
|
cutlass_rows = []
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
@ -331,28 +343,27 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
impl_data = result["implementations"]["vLLM CUTLASS"]
|
impl_data = result["implementations"]["vLLM CUTLASS"]
|
||||||
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
|
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
|
||||||
vs_triton = impl_data.get("speedup_vs_triton", 1.0)
|
vs_triton = impl_data.get("speedup_vs_triton", 1.0)
|
||||||
cutlass_rows.append([
|
cutlass_rows.append(
|
||||||
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
|
[
|
||||||
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
|
shape["m"],
|
||||||
format_speedup(vs_deepgemm),
|
shape["n"],
|
||||||
format_speedup(vs_triton)
|
shape["k"],
|
||||||
])
|
f"{impl_data['time_us']:.1f}",
|
||||||
|
f"{impl_data['tflops']:.1f}",
|
||||||
|
f"{impl_data['gb_s']:.1f}",
|
||||||
|
format_speedup(vs_deepgemm),
|
||||||
|
format_speedup(vs_triton),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
print_table(cutlass_headers,
|
print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:")
|
||||||
cutlass_rows,
|
|
||||||
title="vLLM CUTLASS Implementation:")
|
|
||||||
|
|
||||||
# Calculate and print averages
|
# Calculate and print averages
|
||||||
print("\n===== AVERAGE PERFORMANCE =====")
|
print("\n===== AVERAGE PERFORMANCE =====")
|
||||||
|
|
||||||
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
|
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
|
||||||
avg_metrics = {
|
avg_metrics = {
|
||||||
impl: {
|
impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations
|
||||||
"tflops": 0,
|
|
||||||
"gb_s": 0,
|
|
||||||
"time_ms": 0
|
|
||||||
}
|
|
||||||
for impl in implementations
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
@ -370,9 +381,9 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
|
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
|
||||||
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
|
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
|
||||||
avg_time = avg_metrics[impl]["time_ms"] / num_shapes
|
avg_time = avg_metrics[impl]["time_ms"] / num_shapes
|
||||||
avg_rows.append([
|
avg_rows.append(
|
||||||
impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
|
[impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"]
|
||||||
])
|
)
|
||||||
|
|
||||||
print_table(avg_headers, avg_rows)
|
print_table(avg_headers, avg_rows)
|
||||||
|
|
||||||
@ -380,21 +391,19 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
avg_speedups = {
|
avg_speedups = {
|
||||||
"DeepGEMM vs vLLM Triton": 0,
|
"DeepGEMM vs vLLM Triton": 0,
|
||||||
"DeepGEMM vs vLLM CUTLASS": 0,
|
"DeepGEMM vs vLLM CUTLASS": 0,
|
||||||
"vLLM CUTLASS vs vLLM Triton": 0
|
"vLLM CUTLASS vs vLLM Triton": 0,
|
||||||
}
|
}
|
||||||
|
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
|
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
|
||||||
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
|
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
|
||||||
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
|
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"]
|
||||||
"time_ms"]
|
|
||||||
|
|
||||||
avg_speedups[
|
avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
|
||||||
"DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
|
avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
|
||||||
avg_speedups[
|
avg_speedups["vLLM CUTLASS vs vLLM Triton"] += (
|
||||||
"DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
|
vllm_triton_time / vllm_cutlass_time
|
||||||
avg_speedups[
|
)
|
||||||
"vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
|
|
||||||
|
|
||||||
print("\n===== AVERAGE SPEEDUPS =====")
|
print("\n===== AVERAGE SPEEDUPS =====")
|
||||||
speedup_headers = ["Comparison", "Speedup"]
|
speedup_headers = ["Comparison", "Speedup"]
|
||||||
@ -412,8 +421,7 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
|
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
for impl in implementations:
|
for impl in implementations:
|
||||||
avg_diff[impl] += result["implementations"][impl]["diff"][
|
avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"]
|
||||||
"Reference"]
|
|
||||||
|
|
||||||
diff_headers = ["Implementation", "Avg Diff vs Reference"]
|
diff_headers = ["Implementation", "Avg Diff vs Reference"]
|
||||||
diff_rows = []
|
diff_rows = []
|
||||||
|
|||||||
@@ -1,49 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
-    # pycodestyle
-    "E",
-    # Pyflakes
-    "F",
-    # pyupgrade
-    "UP",
-    # flake8-bugbear
-    "B",
-    # flake8-simplify
-    "SIM",
-    # isort
-    "I",
-    # flake8-logging-format
-    "G",
-]
-ignore = [
-    # star imports
-    "F405", "F403",
-    # lambda expression assignment
-    "E731",
-    # Loop control variable not used within loop body
-    "B007",
-    # f-string format
-    "UP032",
-    # Can remove once 3.10+ is the minimum Python version
-    "UP007",
-]
-
-[tool.ruff.lint.isort]
-known-first-party = ["vllm"]
-
-[tool.ruff.format]
-docstring-code-format = true
@ -213,6 +213,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
|||||||
endif()
|
endif()
|
||||||
set(ONEDNN_AARCH64_USE_ACL "ON")
|
set(ONEDNN_AARCH64_USE_ACL "ON")
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
|
||||||
|
add_compile_definitions(VLLM_USE_ACL)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(ONEDNN_LIBRARY_TYPE "STATIC")
|
set(ONEDNN_LIBRARY_TYPE "STATIC")
|
||||||
@ -226,7 +227,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
|||||||
set(ONEDNN_ENABLE_ITT_TASKS "OFF")
|
set(ONEDNN_ENABLE_ITT_TASKS "OFF")
|
||||||
set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
|
set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
|
||||||
set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
|
set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
|
||||||
set(ONEDNN_VERBOSE "OFF")
|
set(ONEDNN_VERBOSE "ON")
|
||||||
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
|
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
|
||||||
|
|
||||||
FetchContent_MakeAvailable(oneDNN)
|
FetchContent_MakeAvailable(oneDNN)
|
||||||
|
|||||||
@ -38,7 +38,7 @@ else()
|
|||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
vllm-flash-attn
|
vllm-flash-attn
|
||||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||||
GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
|
GIT_TAG 4695e6bed5366c41e28c06cd86170166e4f43d00
|
||||||
GIT_PROGRESS TRUE
|
GIT_PROGRESS TRUE
|
||||||
# Don't share the vllm-flash-attn build between build types
|
# Don't share the vllm-flash-attn build between build types
|
||||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||||
|
|||||||
@ -16,7 +16,7 @@ import shutil
|
|||||||
|
|
||||||
from torch.utils.hipify.hipify_python import hipify
|
from torch.utils.hipify.hipify_python import hipify
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
|
|
||||||
# Project directory where all the source + include files live.
|
# Project directory where all the source + include files live.
|
||||||
@ -34,15 +34,14 @@ if __name__ == '__main__':
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Source files to convert.
|
# Source files to convert.
|
||||||
parser.add_argument("sources",
|
parser.add_argument(
|
||||||
help="Source files to hipify.",
|
"sources", help="Source files to hipify.", nargs="*", default=[]
|
||||||
nargs="*",
|
)
|
||||||
default=[])
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Limit include scope to project_dir only
|
# Limit include scope to project_dir only
|
||||||
includes = [os.path.join(args.project_dir, '*')]
|
includes = [os.path.join(args.project_dir, "*")]
|
||||||
|
|
||||||
# Get absolute path for all source files.
|
# Get absolute path for all source files.
|
||||||
extra_files = [os.path.abspath(s) for s in args.sources]
|
extra_files = [os.path.abspath(s) for s in args.sources]
|
||||||
@ -51,25 +50,31 @@ if __name__ == '__main__':
|
|||||||
# The directory might already exist to hold object files so we ignore that.
|
# The directory might already exist to hold object files so we ignore that.
|
||||||
shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
|
shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
|
||||||
|
|
||||||
hipify_result = hipify(project_directory=args.project_dir,
|
hipify_result = hipify(
|
||||||
output_directory=args.output_dir,
|
project_directory=args.project_dir,
|
||||||
header_include_dirs=[],
|
output_directory=args.output_dir,
|
||||||
includes=includes,
|
header_include_dirs=[],
|
||||||
extra_files=extra_files,
|
includes=includes,
|
||||||
show_detailed=True,
|
extra_files=extra_files,
|
||||||
is_pytorch_extension=True,
|
show_detailed=True,
|
||||||
hipify_extra_files_only=True)
|
is_pytorch_extension=True,
|
||||||
|
hipify_extra_files_only=True,
|
||||||
|
)
|
||||||
|
|
||||||
hipified_sources = []
|
hipified_sources = []
|
||||||
for source in args.sources:
|
for source in args.sources:
|
||||||
s_abs = os.path.abspath(source)
|
s_abs = os.path.abspath(source)
|
||||||
hipified_s_abs = (hipify_result[s_abs].hipified_path if
|
hipified_s_abs = (
|
||||||
(s_abs in hipify_result
|
hipify_result[s_abs].hipified_path
|
||||||
and hipify_result[s_abs].hipified_path is not None)
|
if (
|
||||||
else s_abs)
|
s_abs in hipify_result
|
||||||
|
and hipify_result[s_abs].hipified_path is not None
|
||||||
|
)
|
||||||
|
else s_abs
|
||||||
|
)
|
||||||
hipified_sources.append(hipified_s_abs)
|
hipified_sources.append(hipified_s_abs)
|
||||||
|
|
||||||
assert (len(hipified_sources) == len(args.sources))
|
assert len(hipified_sources) == len(args.sources)
|
||||||
|
|
||||||
# Print hipified source files.
|
# Print hipified source files.
|
||||||
print("\n".join(hipified_sources))
|
print("\n".join(hipified_sources))
|
||||||
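The reformatted lookup above is dense on first read: hipify() returns a result object keyed by absolute source path, and a source file is only swapped for its hipified counterpart when one was actually produced. A stripped-down sketch of that selection logic, with a plain dict standing in for the real hipify result objects:

import os

def pick_sources(sources: list[str], hipify_result: dict) -> list[str]:
    picked = []
    for source in sources:
        s_abs = os.path.abspath(source)
        entry = hipify_result.get(s_abs)
        hipified = getattr(entry, "hipified_path", None) if entry is not None else None
        # Fall back to the original file when hipify produced nothing for it.
        picked.append(hipified if hipified is not None else s_abs)
    return picked
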
|
|||||||
@ -16,9 +16,7 @@
|
|||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cfloat> // FLT_MIN
|
#include <cfloat>
|
||||||
#include <map>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#ifdef USE_ROCM
|
#ifdef USE_ROCM
|
||||||
#include <hip/hip_bf16.h>
|
#include <hip/hip_bf16.h>
|
||||||
@ -210,6 +208,20 @@ void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
|
|||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
|
// Used to copy/convert one element
|
||||||
|
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
|
||||||
|
struct CopyWithScaleOp {
|
||||||
|
float scale;
|
||||||
|
|
||||||
|
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
|
||||||
|
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||||
|
dst = static_cast<OutT>(src);
|
||||||
|
} else {
|
||||||
|
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
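The CopyWithScaleOp functor introduced above folds the two cases of the old loop body, a plain dtype copy versus an FP8 scaled conversion, into a single element-wise operation that the vectorized copy helper can apply over a contiguous run. A rough Python analogue of the per-element behaviour, with torch's E4M3 cast standing in for fp8::scaled_convert:

import torch

def copy_with_scale(src: torch.Tensor, fp8: bool, scale: float = 1.0) -> torch.Tensor:
    """Element-wise sketch: either pass the value through or quantize it to FP8 with a scale."""
    if not fp8:  # kv_dt == kAuto: plain dtype copy
        return src.clone()
    return (src.float() / scale).to(torch.float8_e4m3fn)  # scaled FP8 conversion
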
|
|
||||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||||
__global__ void reshape_and_cache_kernel(
|
__global__ void reshape_and_cache_kernel(
|
||||||
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
||||||
@ -225,59 +237,51 @@ __global__ void reshape_and_cache_kernel(
|
|||||||
const int64_t token_idx = blockIdx.x;
|
const int64_t token_idx = blockIdx.x;
|
||||||
const int64_t slot_idx = slot_mapping[token_idx];
|
const int64_t slot_idx = slot_mapping[token_idx];
|
||||||
if (slot_idx < 0) {
|
if (slot_idx < 0) {
|
||||||
// Padding token that should be ignored.
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t block_idx = slot_idx / block_size;
|
const int64_t block_idx = slot_idx / block_size;
|
||||||
const int64_t block_offset = slot_idx % block_size;
|
const int64_t block_offset = slot_idx % block_size;
|
||||||
|
const int h_block_count = head_size / x; // head_size//x
|
||||||
|
|
||||||
const int n = num_heads * head_size;
|
const int h_block_idx = threadIdx.x;
|
||||||
for (int i = threadIdx.x; i < n; i += blockDim.x) {
|
if (h_block_idx >= num_heads * h_block_count) {
|
||||||
const int64_t src_key_idx = token_idx * key_stride + i;
|
return;
|
||||||
const int64_t src_value_idx = token_idx * value_stride + i;
|
}
|
||||||
|
|
||||||
const int head_idx = i / head_size;
|
const int head_idx = h_block_idx / h_block_count;
|
||||||
const int head_offset = i % head_size;
|
const int h_block = h_block_idx % h_block_count;
|
||||||
const int x_idx = head_offset / x;
|
|
||||||
const int x_offset = head_offset % x;
|
|
||||||
|
|
||||||
const int64_t tgt_key_idx =
|
const scalar_t* __restrict__ key_src =
|
||||||
block_idx * num_heads * (head_size / x) * block_size * x +
|
key + token_idx * key_stride + head_idx * head_size + h_block * x;
|
||||||
head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
|
const int64_t src_value_start =
|
||||||
block_offset * x + x_offset;
|
token_idx * value_stride + head_idx * head_size + h_block * x;
|
||||||
const int64_t tgt_value_idx =
|
|
||||||
block_idx * num_heads * head_size * block_size +
|
cache_t* __restrict__ key_dst =
|
||||||
head_idx * head_size * block_size + head_offset * block_size +
|
key_cache + block_idx * num_heads * h_block_count * block_size * x +
|
||||||
block_offset;
|
head_idx * h_block_count * block_size * x + h_block * block_size * x +
|
||||||
scalar_t tgt_key = key[src_key_idx];
|
block_offset * x;
|
||||||
scalar_t tgt_value = value[src_value_idx];
|
const int64_t tgt_value_start =
|
||||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
block_idx * num_heads * h_block_count * x * block_size +
|
||||||
key_cache[tgt_key_idx] = tgt_key;
|
head_idx * h_block_count * x * block_size + h_block * x * block_size +
|
||||||
value_cache[tgt_value_idx] = tgt_value;
|
block_offset;
|
||||||
} else {
|
|
||||||
key_cache[tgt_key_idx] =
|
constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
|
||||||
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
|
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
|
||||||
value_cache[tgt_value_idx] =
|
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
|
||||||
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
|
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
|
||||||
}
|
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
|
||||||
|
|
||||||
|
vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, x, 0, 1, k_op);
|
||||||
|
|
||||||
|
const scalar_t* __restrict__ value_src = value + src_value_start;
|
||||||
|
cache_t* __restrict__ value_dst = value_cache + tgt_value_start;
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < x; i++) {
|
||||||
|
v_op(value_dst[i * block_size], value_src[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
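The rewrite above changes who computes the cache offsets (each thread now owns a whole x-wide chunk, and the value side is written with a stride of block_size), but the layouts themselves are unchanged: the key cache is [num_blocks, num_heads, head_size / x, block_size, x] and the value cache is [num_blocks, num_heads, head_size, block_size]. The same offset arithmetic for a single (token, head, element), written out in Python for reference:

def key_value_cache_offsets(slot_idx, head_idx, head_offset,
                            num_heads, head_size, block_size, x):
    """Flat offsets into the paged KV caches for one scalar element."""
    block_idx, block_offset = divmod(slot_idx, block_size)
    x_idx, x_offset = divmod(head_offset, x)
    # key cache: [num_blocks, num_heads, head_size // x, block_size, x]
    key_off = (((block_idx * num_heads + head_idx) * (head_size // x) + x_idx)
               * block_size + block_offset) * x + x_offset
    # value cache: [num_blocks, num_heads, head_size, block_size]
    val_off = (((block_idx * num_heads + head_idx) * head_size + head_offset)
               * block_size + block_offset)
    return key_off, val_off
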
|
|
||||||
// Used by vectorization_utils to copy/convert one element
|
|
||||||
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
|
|
||||||
struct CopyWithScaleOp {
|
|
||||||
float scale;
|
|
||||||
|
|
||||||
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
|
|
||||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
|
||||||
dst = static_cast<OutT>(src);
|
|
||||||
} else {
|
|
||||||
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||||
__global__ void reshape_and_cache_flash_kernel(
|
__global__ void reshape_and_cache_flash_kernel(
|
||||||
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
||||||
@ -424,84 +428,81 @@ __global__ void concat_and_cache_ds_mla_kernel(
|
|||||||
const int64_t dst_idx_start =
|
const int64_t dst_idx_start =
|
||||||
block_idx * block_stride + block_offset * entry_stride;
|
block_idx * block_stride + block_offset * entry_stride;
|
||||||
|
|
||||||
// Create 4 tile scales in shared memory
|
// For the NoPE part, each tile of 128 elements is handled by half of one warp
|
||||||
__shared__ float smem[20];
|
// (16 threads). There are 4 total tiles, so 2 warps (64 threads).
|
||||||
float* shard_abs_max = smem;
|
// Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
|
||||||
float* tile_scales = smem + 16;
|
// The RoPE part (last 64 elements) is handled by another 1 warp (32 threads).
|
||||||
|
// So in total, we use 3 warps (96 threads) per block.
|
||||||
// For the NoPE part, each tile of 128 elements is handled by 4 warps
|
|
||||||
// (128 threads). There are 4 total tiles, so 16 warps (512 threads).
|
|
||||||
// The first thread of the first warp in each tile writes the scale
|
|
||||||
// value for the tile. The RoPE part (last 64 elements) is handled
|
|
||||||
// by another 2 warps (64 threads).
|
|
||||||
// So in total, we use 18 warps (576 threads) per block.
|
|
||||||
|
|
||||||
// Cast kv_cache to 16_bit for RoPE values
|
// Cast kv_cache to 16_bit for RoPE values
|
||||||
scalar_t* kv_cache_16bit =
|
scalar_t* kv_cache_16bit =
|
||||||
reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]);
|
reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]);
|
||||||
|
|
||||||
// The last 64 threads handle the RoPE part
|
// The last warp handles the RoPE part
|
||||||
if (threadIdx.x >= kv_lora_rank) {
|
if (threadIdx.x >= 64) {
|
||||||
const int8_t pe_idx = threadIdx.x - kv_lora_rank;
|
// Each thread handles two elements of RoPE
|
||||||
const int64_t src_idx = token_idx * k_pe_stride + pe_idx;
|
const int8_t pe_idx_start = (threadIdx.x - 64) * 2;
|
||||||
|
const int64_t src_idx = token_idx * k_pe_stride + pe_idx_start;
|
||||||
|
// Vectorized load of two 16-bit values, performed as one 32-bit load
|
||||||
|
const int32_t vals = *reinterpret_cast<const int32_t*>(&k_pe[src_idx]);
|
||||||
// RoPE values start after the packed 8-bit NoPE values and the
|
// RoPE values start after the packed 8-bit NoPE values and the
|
||||||
// 32-bit scales
|
// 32-bit scales
|
||||||
const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx;
|
const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx_start;
|
||||||
kv_cache_16bit[dst_idx] = k_pe[src_idx];
|
// Vectorized store of two 16-bit values, performed as one 32-bit store
|
||||||
|
*reinterpret_cast<int32_t*>(&kv_cache_16bit[dst_idx]) = vals;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Determine the scale for each chunk of NoPE
|
// The first two warps handle the NoPE part
|
||||||
const int16_t tile_idx = threadIdx.x >> 7;
|
const int8_t warp_idx = threadIdx.x >> 5;
|
||||||
const int16_t warp_idx = (threadIdx.x & 127) >> 5;
|
const int8_t lane_idx = threadIdx.x & 31;
|
||||||
const int16_t lane_idx = threadIdx.x & 31;
|
const int8_t tile_idx = warp_idx * 2 + (lane_idx >> 4);
|
||||||
|
|
||||||
// Load the NoPE element for this thread into registers
|
// Each thread handles 8 elements of NoPE
|
||||||
const int64_t src_idx = token_idx * kv_c_stride + threadIdx.x;
|
// Load the NoPE elements for this thread into registers
|
||||||
const scalar_t src_val = kv_c[src_idx];
|
const int64_t src_idx_start = token_idx * kv_c_stride + (threadIdx.x * 8);
|
||||||
|
// Vectorized load of eight 16-bit values, performed as an int4 load
|
||||||
|
const int4 vals_i4 = *reinterpret_cast<const int4*>(&kv_c[src_idx_start]);
|
||||||
|
const scalar_t* vals = reinterpret_cast<const scalar_t*>(&vals_i4);
|
||||||
|
|
||||||
// Warp-level reduction to find the max absolute value in the warp
|
// Max absolute value of this thread's elements
|
||||||
float max_abs = fabsf(src_val);
|
float max_abs = fmaxf(fmaxf(fmaxf(fabsf(vals[0]), fabsf(vals[1])),
|
||||||
|
fmaxf(fabsf(vals[2]), fabsf(vals[3]))),
|
||||||
|
fmaxf(fmaxf(fabsf(vals[4]), fabsf(vals[5])),
|
||||||
|
fmaxf(fabsf(vals[6]), fabsf(vals[7]))));
|
||||||
|
|
||||||
|
// Warp-level reduction to find the max absolute value in each half-warp
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int offset = 16; offset > 0; offset /= 2) {
|
for (int offset = 8; offset > 0; offset /= 2) {
|
||||||
#ifdef USE_ROCM
|
max_abs = fmaxf(max_abs, VLLM_SHFL_XOR_SYNC_WIDTH(max_abs, offset, 16));
|
||||||
max_abs = fmaxf(max_abs, __shfl_down_sync(UINT64_MAX, max_abs, offset));
|
|
||||||
#else
|
|
||||||
max_abs = fmaxf(max_abs, __shfl_down_sync(0xFFFFFFFF, max_abs, offset));
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// The first lane of each warp in each tile writes the max_abs of this part
|
// Compute the scale for the tile
|
||||||
// of the tile to shared memory
|
float tile_scale = max_abs / 448.f;
|
||||||
if (lane_idx == 0) {
|
tile_scale = fmaxf(tile_scale, FLT_MIN);
|
||||||
shard_abs_max[tile_idx * 4 + warp_idx] = max_abs;
|
|
||||||
}
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
// The first lane of the first warp in each tile computes the scale for the
|
// The first lane of each half-warp writes the scale to kv_cache
|
||||||
// tile and writes it to shared memory and to kv_cache
|
if ((lane_idx == 0) || (lane_idx == 16)) {
|
||||||
if (warp_idx == 0 && lane_idx == 0) {
|
|
||||||
float4 shard_abs_max_vec =
|
|
||||||
reinterpret_cast<float4*>(shard_abs_max)[tile_idx];
|
|
||||||
float tile_scale = fmaxf(fmaxf(shard_abs_max_vec.x, shard_abs_max_vec.y),
|
|
||||||
fmaxf(shard_abs_max_vec.z, shard_abs_max_vec.w)) /
|
|
||||||
448.f;
|
|
||||||
|
|
||||||
// Avoid division by zero in `scaled_convert`
|
|
||||||
tile_scales[tile_idx] = fmaxf(tile_scale, FLT_MIN);
|
|
||||||
float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]);
|
float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]);
|
||||||
const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx;
|
const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx;
|
||||||
kv_cache_32bit[dst_idx] = tile_scales[tile_idx];
|
kv_cache_32bit[dst_idx] = tile_scale;
|
||||||
}
|
}
|
||||||
|
|
||||||
__syncthreads();
|
// Now all threads in the block scale and write their elements
|
||||||
|
// NoPE data is packed in the first kv_lora_rank/2 bytes (first 256 bytes)
|
||||||
|
const int64_t dst_idx_base = dst_idx_start + (threadIdx.x * 8);
|
||||||
|
|
||||||
// Now all threads in the block scale and write their element
|
uint8_t result[8];
|
||||||
const float scale_val = tile_scales[tile_idx];
|
#pragma unroll
|
||||||
const int64_t dst_idx = dst_idx_start + threadIdx.x;
|
for (int i = 0; i < 8; i++) {
|
||||||
kv_cache[dst_idx] =
|
result[i] =
|
||||||
fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
|
fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
|
||||||
src_val, scale_val);
|
vals[i], tile_scale);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Store as aligned 64-bit writes
|
||||||
|
*reinterpret_cast<uint64_t*>(&kv_cache[dst_idx_base]) =
|
||||||
|
*reinterpret_cast<const uint64_t*>(result);
|
||||||
}
|
}
|
||||||
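Stripped of the warp-shuffle choreography, the per-tile quantization this kernel applies to the NoPE part is: take a 128-element tile, find its maximum absolute value, derive one scale that maps that maximum onto the FP8 E4M3 range (448), clamp the scale away from zero, then quantize every element of the tile with that shared scale. A reference-style sketch, with torch's E4M3 cast standing in for fp8::scaled_convert:

import torch

FLT_MIN = torch.finfo(torch.float32).tiny

def quantize_nope_tile(tile: torch.Tensor):
    """tile: 128 bf16/fp16 values -> (128 fp8 values, one fp32 scale)."""
    max_abs = tile.abs().max().float()
    scale = torch.clamp(max_abs / 448.0, min=FLT_MIN)  # avoid division by zero in the convert
    q = (tile.float() / scale).to(torch.float8_e4m3fn)
    return q, scale
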
|
|
||||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||||
@ -606,9 +607,10 @@ void reshape_and_cache(
|
|||||||
|
|
||||||
int key_stride = key.stride(0);
|
int key_stride = key.stride(0);
|
||||||
int value_stride = value.stride(0);
|
int value_stride = value.stride(0);
|
||||||
|
int head_div_x = head_size / x;
|
||||||
|
|
||||||
dim3 grid(num_tokens);
|
dim3 grid(num_tokens);
|
||||||
dim3 block(std::min(num_heads * head_size, 512));
|
dim3 block(std::min(num_heads * head_div_x, 512));
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
|
||||||
@ -741,13 +743,12 @@ void concat_and_cache_mla(
|
|||||||
|
|
||||||
   if (kv_cache_dtype == "fp8_ds_mla") {
     dim3 grid(num_tokens);
-    // For the NoPE part, each tile of 128 elements is handled by 4 warps
-    // (128 threads). There are 4 total tiles, so 16 warps (512 threads).
-    // The first thread of the first warp in each tile writes the scale
-    // value for the tile. The RoPE part (last 64 elements) is handled
-    // by another 2 warps (64 threads).
-    // So in total, we use 18 warps (576 threads) per block.
-    dim3 block(576);
+    // For the NoPE part, each tile of 128 elements is handled by half of one
+    // warp (16 threads). There are 4 total tiles, so 2 warps (64 threads).
+    // Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
+    // The RoPE part (last 64 elements) is handled by another 1 warp (32
+    // threads). So in total, we use 3 warps (96 threads) per block.
+    dim3 block(96);
     DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
                                CALL_CONCAT_AND_CACHE_DS_MLA);
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@ -137,9 +137,8 @@ DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler(
|
|||||||
}
|
}
|
||||||
|
|
||||||
void DNNLMatMulPrimitiveHandler::prepack_weight(
|
void DNNLMatMulPrimitiveHandler::prepack_weight(
|
||||||
void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) {
|
void* original_b_ptr, dnnl::memory::desc original_b_md,
|
||||||
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
dnnl::memory::desc b_target_mem_desc) {
|
||||||
{b_k_stride_, b_n_stride_});
|
|
||||||
dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr);
|
dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr);
|
||||||
dnnl::memory packed_weight(b_target_mem_desc, default_engine());
|
dnnl::memory packed_weight(b_target_mem_desc, default_engine());
|
||||||
{
|
{
|
||||||
@ -250,7 +249,9 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
|
|||||||
if (a_qs_ == QuantizationStrategy::PER_TOKEN) {
|
if (a_qs_ == QuantizationStrategy::PER_TOKEN) {
|
||||||
assert(!use_azp_);
|
assert(!use_azp_);
|
||||||
};
|
};
|
||||||
prepack_weight(args.b_ptr,
|
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
||||||
|
{b_k_stride_, b_n_stride_});
|
||||||
|
prepack_weight(args.b_ptr, original_b_md,
|
||||||
create_primitive_desc(
|
create_primitive_desc(
|
||||||
MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
|
MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
|
||||||
.use_bias = false,
|
.use_bias = false,
|
||||||
@ -412,12 +413,25 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
|
|||||||
assert(ab_type_ == dnnl::memory::data_type::f32 ||
|
assert(ab_type_ == dnnl::memory::data_type::f32 ||
|
||||||
ab_type_ == dnnl::memory::data_type::bf16 ||
|
ab_type_ == dnnl::memory::data_type::bf16 ||
|
||||||
ab_type_ == dnnl::memory::data_type::f16);
|
ab_type_ == dnnl::memory::data_type::f16);
|
||||||
prepack_weight(args.b_ptr,
|
|
||||||
|
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
||||||
|
{b_k_stride_, b_n_stride_});
|
||||||
|
|
||||||
|
prepack_weight(args.b_ptr, original_b_md,
|
||||||
create_primitive_desc(
|
create_primitive_desc(
|
||||||
MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
|
MSizeCacheKey{
|
||||||
.a_m_stride = DNNL_RUNTIME_DIM_VAL,
|
#ifdef VLLM_USE_ACL
|
||||||
.use_bias = false,
|
// Arm Compute Library (ACL) backend for oneDNN does
|
||||||
.bias_type = dnnl::memory::data_type::undef},
|
// not support runtime
|
||||||
|
// dimensions, so we set M to a default value
|
||||||
|
.a_m_size = 128,
|
||||||
|
.a_m_stride = b_k_size_,
|
||||||
|
#else
|
||||||
|
.a_m_size = DNNL_RUNTIME_DIM_VAL,
|
||||||
|
.a_m_stride = DNNL_RUNTIME_DIM_VAL,
|
||||||
|
#endif
|
||||||
|
.use_bias = false,
|
||||||
|
.bias_type = dnnl::memory::data_type::undef},
|
||||||
true)
|
true)
|
||||||
.weights_desc());
|
.weights_desc());
|
||||||
init_runtime_memory_cache(args);
|
init_runtime_memory_cache(args);
|
||||||
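The constructor change above only moves where original_b_md is built (callers now pass it into prepack_weight), but the overall pattern is worth spelling out: the weight is reordered once, at handler construction, into whatever layout the selected matmul primitive reports as optimal, and later executions reuse the packed buffer together with a primitive cached per runtime M size. A simplified Python analogue of that caching idea (names are illustrative, not the oneDNN API):

class MatmulPrimitiveCache:
    """Illustrative sketch: build one primitive per distinct runtime M and reuse it."""

    def __init__(self, build_primitive):
        self._build = build_primitive
        self._cache = {}

    def get(self, m_size: int):
        if m_size not in self._cache:
            self._cache[m_size] = self._build(m_size)  # expensive: primitive creation
        return self._cache[m_size]                     # cheap: reuse on every later call
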
@ -443,13 +457,31 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
|
|||||||
c_storage->set_data_handle((void*)args.c_ptr);
|
c_storage->set_data_handle((void*)args.c_ptr);
|
||||||
c_mem_desc->dims[0] = args.a_m_size;
|
c_mem_desc->dims[0] = args.a_m_size;
|
||||||
|
|
||||||
|
#ifndef VLLM_USE_ACL
|
||||||
|
// Bias is not supported directly by the ACL backend of oneDNN; there we handle bias by:
|
||||||
|
// 1. copying it into the result tensor
|
||||||
|
// 2. attaching a fused-sum post-op to the matmul primitive
|
||||||
if (args.use_bias) {
|
if (args.use_bias) {
|
||||||
auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
|
auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
|
||||||
bias_storage->set_data_handle((void*)args.bias_ptr);
|
bias_storage->set_data_handle((void*)args.bias_ptr);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
dnnl::matmul matmul = get_matmul_cache(args);
|
dnnl::matmul matmul = get_matmul_cache(args);
|
||||||
|
|
||||||
|
// With the ACL backend of oneDNN, the required memory format might change when the
|
||||||
|
// source tensor dims change. This does not really happen in practice, so it isn't
|
||||||
|
// a performance hit, but we need to support it because the API allows for it.
|
||||||
|
#ifdef VLLM_USE_ACL
|
||||||
|
auto new_expected_wei_desc =
|
||||||
|
dnnl::matmul::primitive_desc(
|
||||||
|
const_cast<dnnl_primitive_desc_t>(matmul.get_primitive_desc()))
|
||||||
|
.weights_desc();
|
||||||
|
if (new_expected_wei_desc != b_target_mem_desc_) {
|
||||||
|
prepack_weight(memory_cache_[DNNL_ARG_WEIGHTS].get_data_handle(),
|
||||||
|
b_target_mem_desc_, new_expected_wei_desc);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
|
auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
|
||||||
scratchpad_storage->set_data_handle(
|
scratchpad_storage->set_data_handle(
|
||||||
DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
|
DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
|
||||||
@ -484,7 +516,13 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
|
|||||||
} else {
|
} else {
|
||||||
a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
|
a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
|
||||||
{key.a_m_stride, 1});
|
{key.a_m_stride, 1});
|
||||||
|
#ifdef VLLM_USE_ACL
|
||||||
|
// ACL's backend of oneDNN always expects the weight format to be "any"
|
||||||
|
b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
|
||||||
|
dnnl::memory::format_tag::any);
|
||||||
|
#else
|
||||||
b_md = b_target_mem_desc_;
|
b_md = b_target_mem_desc_;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
|
dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
|
||||||
dnnl::memory::format_tag::ab);
|
dnnl::memory::format_tag::ab);
|
||||||
@ -494,8 +532,18 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
|
|||||||
|
|
||||||
if (key.use_bias) {
|
if (key.use_bias) {
|
||||||
dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
|
dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
|
||||||
|
// Since ACL's matmuls don't support passing a bias_md, we apply the bias
|
||||||
|
// through a fused-sum post-op
|
||||||
|
#ifdef VLLM_USE_ACL
|
||||||
|
dnnl::post_ops post_ops;
|
||||||
|
post_ops.append_sum();
|
||||||
|
attr.set_post_ops(post_ops);
|
||||||
|
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
|
||||||
|
attr);
|
||||||
|
#else
|
||||||
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
|
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
|
||||||
c_md, attr);
|
c_md, attr);
|
||||||
|
#endif
|
||||||
} else {
|
} else {
|
||||||
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
|
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
|
||||||
attr);
|
attr);
|
||||||
@ -511,13 +559,23 @@ void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
|
|||||||
default_engine(), nullptr);
|
default_engine(), nullptr);
|
||||||
set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
|
set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
|
||||||
|
|
||||||
|
// ACL matmuls don't support bias_md, so we don't need these
|
||||||
|
#ifndef VLLM_USE_ACL
|
||||||
memory_cache_[DNNL_ARG_BIAS] =
|
memory_cache_[DNNL_ARG_BIAS] =
|
||||||
dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
|
dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
|
||||||
default_engine(), nullptr);
|
default_engine(), nullptr);
|
||||||
set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
|
set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
|
||||||
|
#endif
|
||||||
memory_cache_[DNNL_ARG_SCRATCHPAD] =
|
memory_cache_[DNNL_ARG_SCRATCHPAD] =
|
||||||
dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
|
dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
|
||||||
default_engine(), nullptr);
|
default_engine(), nullptr);
|
||||||
set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
|
set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool is_onednn_acl_supported() {
|
||||||
|
#ifdef VLLM_USE_ACL
|
||||||
|
return true;
|
||||||
|
#else
|
||||||
|
return false;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|||||||
@ -101,7 +101,7 @@ class DNNLMatMulPrimitiveHandler {
|
|||||||
protected:
|
protected:
|
||||||
DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type);
|
DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type);
|
||||||
|
|
||||||
void prepack_weight(void* original_b_ptr,
|
void prepack_weight(void* original_b_ptr, dnnl::memory::desc original_b_md,
|
||||||
dnnl::memory::desc b_target_mem_desc);
|
dnnl::memory::desc b_target_mem_desc);
|
||||||
|
|
||||||
void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr);
|
void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr);
|
||||||
|
|||||||
@ -527,21 +527,42 @@ void onednn_mm(torch::Tensor& c, // [M, OC], row-major
|
|||||||
MatMulPrimitiveHandler* ptr =
|
MatMulPrimitiveHandler* ptr =
|
||||||
reinterpret_cast<MatMulPrimitiveHandler*>(handler);
|
reinterpret_cast<MatMulPrimitiveHandler*>(handler);
|
||||||
|
|
||||||
|
// ACL matmuls expect contiguous source tensors
|
||||||
|
#ifdef VLLM_USE_ACL
|
||||||
|
torch::Tensor a_contig = a.contiguous();
|
||||||
|
#endif
|
||||||
|
|
||||||
MatMulPrimitiveHandler::ExecArgs exec_args;
|
MatMulPrimitiveHandler::ExecArgs exec_args;
|
||||||
|
|
||||||
|
#ifdef VLLM_USE_ACL
|
||||||
|
exec_args.a_m_size = a_contig.size(0);
|
||||||
|
exec_args.a_m_stride = a_contig.stride(0);
|
||||||
|
#else
|
||||||
exec_args.a_m_size = a.size(0);
|
exec_args.a_m_size = a.size(0);
|
||||||
exec_args.a_m_stride = a.stride(0);
|
exec_args.a_m_stride = a.stride(0);
|
||||||
|
#endif
|
||||||
VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] {
|
VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] {
|
||||||
if (bias.has_value()) {
|
if (bias.has_value()) {
|
||||||
exec_args.use_bias = true;
|
exec_args.use_bias = true;
|
||||||
exec_args.bias_type = get_dnnl_type<scalar_t>();
|
exec_args.bias_type = get_dnnl_type<scalar_t>();
|
||||||
|
#ifdef VLLM_USE_ACL
|
||||||
|
// ACL matmuls in oneDNN do not support a bias.
|
||||||
|
// We handle a matmul with bias by doing: c = bias; c += matmul(a, b)
|
||||||
|
c.copy_(bias.value());
|
||||||
|
#else
|
||||||
exec_args.bias_ptr = bias->data_ptr<scalar_t>();
|
exec_args.bias_ptr = bias->data_ptr<scalar_t>();
|
||||||
|
#endif
|
||||||
} else {
|
} else {
|
||||||
exec_args.use_bias = false;
|
exec_args.use_bias = false;
|
||||||
exec_args.bias_type = get_dnnl_type<void>();
|
exec_args.bias_type = get_dnnl_type<void>();
|
||||||
exec_args.bias_ptr = nullptr;
|
exec_args.bias_ptr = nullptr;
|
||||||
}
|
}
|
||||||
|
#ifdef VLLM_USE_ACL
|
||||||
|
exec_args.a_ptr = a_contig.data_ptr<scalar_t>();
|
||||||
|
#else
|
||||||
exec_args.a_ptr = a.data_ptr<scalar_t>();
|
exec_args.a_ptr = a.data_ptr<scalar_t>();
|
||||||
|
|
||||||
|
#endif
|
||||||
exec_args.c_ptr = c.data_ptr<scalar_t>();
|
exec_args.c_ptr = c.data_ptr<scalar_t>();
|
||||||
|
|
||||||
ptr->execute(exec_args);
|
ptr->execute(exec_args);
|
||||||
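The ACL bias path above (copy the bias into c, then attach a fused-sum post-op) relies on the identity that seeding the output with the bias and accumulating the matmul into it gives the same result as adding the bias afterwards. A quick torch check of that equivalence:

import torch

m, k, n = 4, 8, 3
a, b = torch.randn(m, k), torch.randn(k, n)
bias = torch.randn(n)

reference = a @ b + bias          # bias applied after the matmul

c = bias.expand(m, n).clone()     # 1. copy the bias into the result tensor
c += a @ b                        # 2. "fused-sum": accumulate the matmul on top
assert torch.allclose(c, reference, atol=1e-6)
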
|
|||||||
@ -27,6 +27,8 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
|
|||||||
void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
|
void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
|
||||||
const std::optional<torch::Tensor>& bias, int64_t handler);
|
const std::optional<torch::Tensor>& bias, int64_t handler);
|
||||||
|
|
||||||
|
bool is_onednn_acl_supported();
|
||||||
|
|
||||||
void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
|
void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
|
||||||
torch::Tensor& kv_cache, double scale,
|
torch::Tensor& kv_cache, double scale,
|
||||||
torch::Tensor& block_tables, torch::Tensor& seq_lens);
|
torch::Tensor& block_tables, torch::Tensor& seq_lens);
|
||||||
@ -181,6 +183,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
"int handler) -> ()");
|
"int handler) -> ()");
|
||||||
ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
|
ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
|
||||||
|
|
||||||
|
// Check if oneDNN was built with ACL backend
|
||||||
|
ops.def("is_onednn_acl_supported() -> bool", &is_onednn_acl_supported);
|
||||||
|
|
||||||
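With the registration above, the ACL capability check becomes reachable from Python through the custom-op namespace. Exactly how vLLM wraps it is not shown in this diff, so the snippet below is only a guess at a call site; in particular the `_C` library name is an assumption.

import torch

# Hypothetical call site; the op takes no arguments and returns a bool.
if torch.ops._C.is_onednn_acl_supported():
    print("oneDNN was built with the Arm Compute Library backend")
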
// Create oneDNN W8A8 handler
|
// Create oneDNN W8A8 handler
|
||||||
ops.def(
|
ops.def(
|
||||||
"create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "
|
"create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "
|
||||||
|
|||||||
@ -27,7 +27,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
|
|||||||
**{
|
**{
|
||||||
VLLMDataType.u4b8: "u4b8",
|
VLLMDataType.u4b8: "u4b8",
|
||||||
VLLMDataType.u8b128: "u8b128",
|
VLLMDataType.u8b128: "u8b128",
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||||
@ -35,7 +35,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
|||||||
**{
|
**{
|
||||||
VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
|
VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
|
||||||
VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t",
|
VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t",
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
|
VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
|
||||||
@ -43,7 +43,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
|
|||||||
**{
|
**{
|
||||||
VLLMDataType.u4b8: 4,
|
VLLMDataType.u4b8: 4,
|
||||||
VLLMDataType.u8b128: 8,
|
VLLMDataType.u8b128: 8,
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||||
@ -67,15 +67,13 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
|||||||
DataType.f32: "at::ScalarType::Float",
|
DataType.f32: "at::ScalarType::Float",
|
||||||
}
|
}
|
||||||
|
|
||||||
VLLMKernelScheduleTag: dict[Union[
|
VLLMKernelScheduleTag: dict[
|
||||||
MixedInputKernelScheduleType, KernelScheduleType], str] = {
|
Union[MixedInputKernelScheduleType, KernelScheduleType], str
|
||||||
**KernelScheduleTag, # type: ignore
|
] = {
|
||||||
**{
|
**KernelScheduleTag, # type: ignore
|
||||||
MixedInputKernelScheduleType.TmaWarpSpecialized:
|
**{
|
||||||
"cutlass::gemm::KernelTmaWarpSpecialized",
|
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501
|
||||||
MixedInputKernelScheduleType.TmaWarpSpecializedPingpong:
|
MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong", # noqa: E501
|
||||||
"cutlass::gemm::KernelTmaWarpSpecializedPingpong",
|
MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative", # noqa: E501
|
||||||
MixedInputKernelScheduleType.TmaWarpSpecializedCooperative:
|
},
|
||||||
"cutlass::gemm::KernelTmaWarpSpecializedCooperative",
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@ -17,25 +17,30 @@ FILE_HEAD = """
|
|||||||
namespace MARLIN_NAMESPACE_NAME {
|
namespace MARLIN_NAMESPACE_NAME {
|
||||||
""".strip()
|
""".strip()
|
||||||
|
|
||||||
TEMPLATE = ("template __global__ void Marlin<"
|
TEMPLATE = (
|
||||||
"{{scalar_t}}, "
|
"template __global__ void Marlin<"
|
||||||
"{{w_type_id}}, "
|
"{{scalar_t}}, "
|
||||||
"{{s_type_id}}, "
|
"{{w_type_id}}, "
|
||||||
"{{threads}}, "
|
"{{s_type_id}}, "
|
||||||
"{{thread_m_blocks}}, "
|
"{{threads}}, "
|
||||||
"{{thread_n_blocks}}, "
|
"{{thread_m_blocks}}, "
|
||||||
"{{thread_k_blocks}}, "
|
"{{thread_n_blocks}}, "
|
||||||
"{{'true' if m_block_size_8 else 'false'}}, "
|
"{{thread_k_blocks}}, "
|
||||||
"{{stages}}, "
|
"{{'true' if m_block_size_8 else 'false'}}, "
|
||||||
"{{group_blocks}}, "
|
"{{stages}}, "
|
||||||
"{{'true' if is_zp_float else 'false'}}>"
|
"{{group_blocks}}, "
|
||||||
"( MARLIN_KERNEL_PARAMS );")
|
"{{'true' if is_zp_float else 'false'}}>"
|
||||||
|
"( MARLIN_KERNEL_PARAMS );"
|
||||||
|
)
|
||||||
|
|
||||||
# int8 with zero point case (vllm::kU8) is also supported,
|
# int8 with zero point case (vllm::kU8) is also supported,
|
||||||
# we don't add it to reduce wheel size.
|
# we don't add it to reduce wheel size.
|
||||||
SCALAR_TYPES = [
|
SCALAR_TYPES = [
|
||||||
"vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
|
"vllm::kU4",
|
||||||
"vllm::kFE2M1f"
|
"vllm::kU4B8",
|
||||||
|
"vllm::kU8B128",
|
||||||
|
"vllm::kFE4M3fn",
|
||||||
|
"vllm::kFE2M1f",
|
||||||
]
|
]
|
||||||
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
|
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
|
||||||
|
|
||||||
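Each combination drawn from SCALAR_TYPES, THREAD_M_BLOCKS and THREAD_CONFIGS ultimately becomes one explicit template instantiation rendered from TEMPLATE. A minimal sketch of that rendering step with jinja2, using arbitrary example values for the parameters:

import jinja2

template = jinja2.Template(TEMPLATE)
instantiation = template.render(
    scalar_t="half", w_type_id=4, s_type_id=4, threads=256,
    thread_m_blocks=1, thread_n_blocks=8, thread_k_blocks=4,
    m_block_size_8=False, stages=4, group_blocks=8, is_zp_float=False,
)
# -> one "template __global__ void Marlin<...>( MARLIN_KERNEL_PARAMS );" line
print(instantiation)
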
@ -58,11 +63,12 @@ def generate_new_kernels():
|
|||||||
all_template_str_list = []
|
all_template_str_list = []
|
||||||
|
|
||||||
for group_blocks, m_blocks, thread_configs in itertools.product(
|
for group_blocks, m_blocks, thread_configs in itertools.product(
|
||||||
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
|
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
|
||||||
|
):
|
||||||
# act order case only support gptq-int4 and gptq-int8
|
# act order case only support gptq-int4 and gptq-int8
|
||||||
if group_blocks == 0 and scalar_type not in [
|
if group_blocks == 0 and scalar_type not in [
|
||||||
"vllm::kU4B8", "vllm::kU8B128"
|
"vllm::kU4B8",
|
||||||
|
"vllm::kU8B128",
|
||||||
]:
|
]:
|
||||||
continue
|
continue
|
||||||
if thread_configs[2] == 256:
|
if thread_configs[2] == 256:
|
||||||
|
|||||||
@ -254,7 +254,7 @@ void cutlass_moe_mm(
|
|||||||
bool per_act_token, bool per_out_ch) {
|
bool per_act_token, bool per_out_ch) {
|
||||||
int32_t version_num = get_sm_version_num();
|
int32_t version_num = get_sm_version_num();
|
||||||
#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
|
#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
|
||||||
if (version_num >= 100) {
|
if (version_num >= 100 && version_num < 110) {
|
||||||
cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
||||||
expert_offsets, problem_sizes, a_strides, b_strides,
|
expert_offsets, problem_sizes, a_strides, b_strides,
|
||||||
c_strides, per_act_token, per_out_ch);
|
c_strides, per_act_token, per_out_ch);
|
||||||
@ -262,7 +262,7 @@ void cutlass_moe_mm(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
|
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
|
||||||
if (version_num >= 90) {
|
if (version_num >= 90 && version_num < 100) {
|
||||||
cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
||||||
expert_offsets, problem_sizes, a_strides, b_strides,
|
expert_offsets, problem_sizes, a_strides, b_strides,
|
||||||
c_strides, per_act_token, per_out_ch);
|
c_strides, per_act_token, per_out_ch);
|
||||||
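The guard change above turns the SM100 and SM90 MoE paths into mutually exclusive version ranges instead of open-ended `>=` checks, so a newer architecture no longer silently falls into an older kernel. In Python terms the selection logic now reads roughly as follows (the error branch is only an analogue of what the surrounding C++ does when nothing matches):

def pick_moe_kernel(version_num: int) -> str:
    """version_num is major*10 + minor, e.g. SM90 -> 90, SM103 -> 103."""
    if 100 <= version_num < 110:
        return "cutlass_moe_mm_sm100"
    if 90 <= version_num < 100:
        return "cutlass_moe_mm_sm90"
    raise RuntimeError(f"no compiled cutlass_moe_mm kernel for SM{version_num}")
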
|
|||||||
@ -14,6 +14,8 @@
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "core/registration.h"
|
||||||
|
|
||||||
#include <torch/all.h>
|
#include <torch/all.h>
|
||||||
#include <cutlass/arch/arch.h>
|
#include <cutlass/arch/arch.h>
|
||||||
|
|
||||||
@ -418,3 +420,7 @@ void cutlass_fp4_group_mm(
|
|||||||
"12.8 or above.");
|
"12.8 or above.");
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||||
|
m.impl("cutlass_fp4_group_mm", &cutlass_fp4_group_mm);
|
||||||
|
}
|
||||||
|
|||||||
@@ -17,28 +17,32 @@ FILE_HEAD = """
 namespace MARLIN_NAMESPACE_NAME {
 """.strip()
 
-TEMPLATE = ("template __global__ void Marlin<"
-            "{{scalar_t}}, "
-            "{{w_type_id}}, "
-            "{{s_type_id}}, "
-            "{{threads}}, "
-            "{{thread_m_blocks}}, "
-            "{{thread_n_blocks}}, "
-            "{{thread_k_blocks}}, "
-            "{{'true' if m_block_size_8 else 'false'}}, "
-            "{{stages}}, "
-            "{{group_blocks}}, "
-            "{{'true' if is_zp_float else 'false'}}>"
-            "( MARLIN_KERNEL_PARAMS );")
+TEMPLATE = (
+    "template __global__ void Marlin<"
+    "{{scalar_t}}, "
+    "{{w_type_id}}, "
+    "{{s_type_id}}, "
+    "{{threads}}, "
+    "{{thread_m_blocks}}, "
+    "{{thread_n_blocks}}, "
+    "{{thread_k_blocks}}, "
+    "{{'true' if m_block_size_8 else 'false'}}, "
+    "{{stages}}, "
+    "{{group_blocks}}, "
+    "{{'true' if is_zp_float else 'false'}}>"
+    "( MARLIN_KERNEL_PARAMS );"
+)
 
 # int8 with zero point case (vllm::kU8) is also supported,
 # we don't add it to reduce wheel size.
 SCALAR_TYPES = [
-    "vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
-    "vllm::kFE2M1f"
+    "vllm::kU4",
+    "vllm::kU4B8",
+    "vllm::kU8B128",
+    "vllm::kFE4M3fn",
+    "vllm::kFE2M1f",
 ]
-THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128),
-                  (128, 64, 128)]
+THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128), (128, 64, 128)]
 
 THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
 # group_blocks:
@@ -59,11 +63,12 @@ def generate_new_kernels():
     all_template_str_list = []
 
     for group_blocks, m_blocks, thread_configs in itertools.product(
-            GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
+        GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
+    ):
         # act order case only support gptq-int4 and gptq-int8
         if group_blocks == 0 and scalar_type not in [
-                "vllm::kU4B8", "vllm::kU8B128"
+            "vllm::kU4B8",
+            "vllm::kU8B128",
         ]:
             continue
         if thread_configs[2] == 256:
@@ -93,8 +98,7 @@ def generate_new_kernels():
         c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
 
         is_zp_float_list = [False]
-        if dtype == "fp16" and scalar_type == "vllm::kU4" and \
-                group_blocks == 4:
+        if dtype == "fp16" and scalar_type == "vllm::kU4" and group_blocks == 4:
             # HQQ (is_zp_float = true) only supports
             # 4bit quantization and fp16
             is_zp_float_list.append(True)
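Both the Marlin generator above and the Machete generator that follows use the same approach: enumerate kernel configurations with `itertools.product` and render one explicit C++ template instantiation per combination from a jinja2 template. The snippet below is a minimal, self-contained sketch of that pattern; the template text, the `SCALAR_TYPES`/`THREAD_CONFIGS` values, and the `render_instantiations` helper are illustrative stand-ins, not the real scripts' API.

```python
# Minimal sketch of the "render one instantiation per config" pattern used by
# the kernel generator scripts (names and values here are illustrative only).
import itertools

import jinja2

KERNEL_TEMPLATE = jinja2.Template(
    "template __global__ void Marlin<{{ scalar_t }}, {{ threads }}>"
    "( MARLIN_KERNEL_PARAMS );"
)

SCALAR_TYPES = ["half", "nv_bfloat16"]  # placeholder type list
THREAD_CONFIGS = [64, 128, 256]  # placeholder thread counts


def render_instantiations() -> list[str]:
    # One explicit template instantiation per (type, threads) combination,
    # mirroring the itertools.product loops in the real generators.
    return [
        KERNEL_TEMPLATE.render(scalar_t=scalar_t, threads=threads)
        for scalar_t, threads in itertools.product(SCALAR_TYPES, THREAD_CONFIGS)
    ]


if __name__ == "__main__":
    for instantiation in render_instantiations():
        print(instantiation)
```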
@@ -12,20 +12,21 @@ from functools import reduce
 from typing import Optional, Union
 
 import jinja2
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm_cutlass_library_extension import (DataType, EpilogueScheduleTag,
-                                            EpilogueScheduleType,
-                                            MixedInputKernelScheduleType,
-                                            TileSchedulerTag,
-                                            TileSchedulerType, VLLMDataType,
-                                            VLLMDataTypeNames,
-                                            VLLMDataTypeSize, VLLMDataTypeTag,
-                                            VLLMDataTypeTorchDataTypeTag,
-                                            VLLMDataTypeVLLMScalarTypeTag,
-                                            VLLMKernelScheduleTag)
-# yapf: enable
+from vllm_cutlass_library_extension import (
+    DataType,
+    EpilogueScheduleTag,
+    EpilogueScheduleType,
+    MixedInputKernelScheduleType,
+    TileSchedulerTag,
+    TileSchedulerType,
+    VLLMDataType,
+    VLLMDataTypeNames,
+    VLLMDataTypeSize,
+    VLLMDataTypeTag,
+    VLLMDataTypeTorchDataTypeTag,
+    VLLMDataTypeVLLMScalarTypeTag,
+    VLLMKernelScheduleTag,
+)
 
 #
 # Generator templating
@@ -286,18 +287,23 @@ def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
     tile_shape = (
         f"{schedule_config.tile_shape_mn[0]}x{schedule_config.tile_shape_mn[1]}"
     )
-    cluster_shape = (f"{schedule_config.cluster_shape_mnk[0]}" +
-                     f"x{schedule_config.cluster_shape_mnk[1]}" +
-                     f"x{schedule_config.cluster_shape_mnk[2]}")
-    kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule]\
-        .split("::")[-1]
-    epilogue_schedule = EpilogueScheduleTag[
-        schedule_config.epilogue_schedule].split("::")[-1]
-    tile_scheduler = TileSchedulerTag[schedule_config.tile_scheduler]\
-        .split("::")[-1]
+    cluster_shape = (
+        f"{schedule_config.cluster_shape_mnk[0]}"
+        + f"x{schedule_config.cluster_shape_mnk[1]}"
+        + f"x{schedule_config.cluster_shape_mnk[2]}"
+    )
+    kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule].split(
+        "::"
+    )[-1]
+    epilogue_schedule = EpilogueScheduleTag[schedule_config.epilogue_schedule].split(
+        "::"
+    )[-1]
+    tile_scheduler = TileSchedulerTag[schedule_config.tile_scheduler].split("::")[-1]
 
-    return (f"{tile_shape}_{cluster_shape}_{kernel_schedule}" +
-            f"_{epilogue_schedule}_{tile_scheduler}")
+    return (
+        f"{tile_shape}_{cluster_shape}_{kernel_schedule}"
+        + f"_{epilogue_schedule}_{tile_scheduler}"
+    )
 
 
 # mostly unique shorter sch_sig
@@ -316,18 +322,24 @@ def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str:
 
 # unique type_name
 def generate_type_signature(kernel_types: TypeConfig):
-    return str("".join([
-        VLLMDataTypeNames[getattr(kernel_types, field.name)]
-        for field in fields(TypeConfig)
-    ]))
+    return str(
+        "".join(
+            [
+                VLLMDataTypeNames[getattr(kernel_types, field.name)]
+                for field in fields(TypeConfig)
+            ]
+        )
+    )
 
 
 def generate_type_option_name(kernel_types: TypeConfig):
-    return ", ".join([
-        f"{field.name.replace('b_', 'with_')+'_type'}=" +
-        VLLMDataTypeNames[getattr(kernel_types, field.name)]
-        for field in fields(TypeConfig)
-    ])
+    return ", ".join(
+        [
+            f"{field.name.replace('b_', 'with_') + '_type'}="
+            + VLLMDataTypeNames[getattr(kernel_types, field.name)]
+            for field in fields(TypeConfig)
+        ]
+    )
 
 
 def is_power_of_two(n):
@@ -335,7 +347,6 @@ def is_power_of_two(n):
 
 
 def to_cute_constant(value: list[int]):
-
     def _to_cute_constant(value: int):
         if is_power_of_two(value):
             return f"_{value}"
@@ -350,11 +361,11 @@ def to_cute_constant(value: list[int]):
 
 def unique_schedules(impl_configs: list[ImplConfig]):
     # Use dict over set for deterministic ordering
-    return list({
-        sch: None
-        for impl_config in impl_configs
-        for sch in impl_config.schedules
-    }.keys())
+    return list(
+        {
+            sch: None for impl_config in impl_configs for sch in impl_config.schedules
+        }.keys()
+    )
 
 
 def unsigned_type_with_bitwidth(num_bits):
@@ -380,7 +391,7 @@ template_globals = {
     "gen_type_sig": generate_type_signature,
     "unique_schedules": unique_schedules,
     "unsigned_type_with_bitwidth": unsigned_type_with_bitwidth,
-    "gen_type_option_name": generate_type_option_name
+    "gen_type_option_name": generate_type_option_name,
 }
 
 
@@ -398,23 +409,28 @@ prepack_dispatch_template = create_template(PREPACK_TEMPLATE)
 def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
     sources = []
 
-    sources.append((
-        "machete_mm_dispatch",
-        mm_dispatch_template.render(impl_configs=impl_configs),
-    ))
+    sources.append(
+        (
+            "machete_mm_dispatch",
+            mm_dispatch_template.render(impl_configs=impl_configs),
+        )
+    )
 
     prepack_types = []
     for impl_config in impl_configs:
-        convert_type = impl_config.types.a \
-            if impl_config.types.b_group_scale == DataType.void \
-            else impl_config.types.b_group_scale
+        convert_type = (
+            impl_config.types.a
+            if impl_config.types.b_group_scale == DataType.void
+            else impl_config.types.b_group_scale
+        )
         prepack_types.append(
             PrepackTypeConfig(
                 a=impl_config.types.a,
                 b_num_bits=VLLMDataTypeSize[impl_config.types.b],
                 convert=convert_type,
                 accumulator=impl_config.types.accumulator,
-            ))
+            )
+        )
 
     def prepacked_type_key(prepack_type: PrepackTypeConfig):
         # For now, we can just use the first accumulator type seen since
@@ -430,10 +446,14 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
             unique_prepack_types.append(prepack_type)
             prepack_types_seen.add(key)
 
-    sources.append((
-        "machete_prepack",
-        prepack_dispatch_template.render(types=unique_prepack_types, ),
-    ))
+    sources.append(
+        (
+            "machete_prepack",
+            prepack_dispatch_template.render(
+                types=unique_prepack_types,
+            ),
+        )
+    )
 
     # Split up impls across files
     num_impls = reduce(lambda x, y: x + len(y.schedules), impl_configs, 0)
@@ -466,10 +486,12 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
         curr_impl_in_file += len(files_impls[-1][-1].schedules)
 
     for part, file_impls in enumerate(files_impls):
-        sources.append((
-            f"machete_mm_impl_part{part+1}",
-            mm_impl_template.render(impl_configs=file_impls),
-        ))
+        sources.append(
+            (
+                f"machete_mm_impl_part{part + 1}",
+                mm_impl_template.render(impl_configs=file_impls),
+            )
+        )
 
     return sources
 
@@ -514,8 +536,7 @@ def generate():
     # For now we use the same heuristic for all types
     # Heuristic is currently tuned for H100s
     default_heuristic = [
-        (cond, ScheduleConfig(*tile_config,
-                              **sch_common_params))  # type: ignore
+        (cond, ScheduleConfig(*tile_config, **sch_common_params))  # type: ignore
        for cond, tile_config in default_tile_heuristic_config.items()
     ]
 
@@ -541,14 +562,18 @@ def generate():
             a_token_scale=DataType.void,
             out=a,
             accumulator=DataType.f32,
-        ) for b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
-        for a in (DataType.f16, DataType.bf16))
+        )
+        for b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
+        for a in (DataType.f16, DataType.bf16)
+    )
 
     impl_configs += [
         ImplConfig(x[0], x[1], x[2])
-        for x in zip(GPTQ_kernel_type_configs,
-                     itertools.repeat(get_unique_schedules(default_heuristic)),
-                     itertools.repeat(default_heuristic))
+        for x in zip(
+            GPTQ_kernel_type_configs,
+            itertools.repeat(get_unique_schedules(default_heuristic)),
+            itertools.repeat(default_heuristic),
+        )
     ]
 
     AWQ_kernel_type_configs = list(
@@ -561,14 +586,18 @@ def generate():
             a_token_scale=DataType.void,
             out=a,
             accumulator=DataType.f32,
-        ) for b in (DataType.u4, DataType.u8)
-        for a in (DataType.f16, DataType.bf16))
+        )
+        for b in (DataType.u4, DataType.u8)
+        for a in (DataType.f16, DataType.bf16)
+    )
 
     impl_configs += [
         ImplConfig(x[0], x[1], x[2])
-        for x in zip(AWQ_kernel_type_configs,
-                     itertools.repeat(get_unique_schedules(default_heuristic)),
-                     itertools.repeat(default_heuristic))
+        for x in zip(
+            AWQ_kernel_type_configs,
+            itertools.repeat(get_unique_schedules(default_heuristic)),
+            itertools.repeat(default_heuristic),
+        )
     ]
 
     # TODO: Support W4A8 when ready
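A side note on the `# Use dict over set for deterministic ordering` comment in `unique_schedules` above: dict keys preserve insertion order in Python, so the dict-comprehension idiom deduplicates while keeping the first-seen order stable across runs, which iterating over a `set` would not guarantee. A standalone illustration (not the vLLM code itself):

```python
# Deduplicate while preserving first-seen order: dict keys keep insertion
# order, so the output is deterministic, unlike iterating over a set.
def unique_preserving_order(items: list[str]) -> list[str]:
    return list({item: None for item in items}.keys())


assert unique_preserving_order(["b", "a", "b", "c", "a"]) == ["b", "a", "c"]
```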
@@ -397,7 +397,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       " Tensor a_blockscale, Tensor b_blockscales, Tensor alphas,"
       " Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()",
       {stride_tag});
-  ops.impl("cutlass_fp4_group_mm", torch::kCUDA, &cutlass_fp4_group_mm);
+  // conditionally compiled so impl registration is in source file
 
   // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
   // quantization, as well as bias
@@ -13,8 +13,13 @@ ARG PYTHON_VERSION=3.12
 # private registries that use a different repository naming conventions.
 #
 # Example:
-# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
-ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
+# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
+
+# Important: We build with an old version of Ubuntu to maintain broad
+# compatibility with other Linux OSes. The main reason for this is that the
+# glibc version is baked into the distro, and binaries built with one glibc
+# version are not backwards compatible with OSes that use an earlier version.
+ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
 # TODO: Restore to base image after FlashInfer AOT wheel fixed
 ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
 
@@ -75,15 +80,20 @@ ARG TARGETPLATFORM
 ARG INSTALL_KV_CONNECTORS=false
 ENV DEBIAN_FRONTEND=noninteractive
 
-ARG DEADSNAKES_MIRROR_URL
-ARG DEADSNAKES_GPGKEY_URL
 ARG GET_PIP_URL
 
-# Install minimal dependencies
+# Install system dependencies and uv, then create Python virtual environment
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl sudo
+    && apt-get install -y ccache software-properties-common git curl sudo python3-pip \
+    && curl -LsSf https://astral.sh/uv/install.sh | sh \
+    && $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
+    && rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
+    && ln -s /opt/venv/bin/python3 /usr/bin/python3 \
+    && ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \
+    && ln -s /opt/venv/bin/pip /usr/bin/pip \
+    && python3 --version && python3 -m pip --version
 
 ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
@@ -91,9 +101,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
 ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
 ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
 
-# Install uv and Python
-COPY --from=ghcr.io/astral-sh/uv:0.8.22 /uv /uvx /bin/
-RUN uv python install ${PYTHON_VERSION} --default --verbose
+# Activate virtual environment and add uv to PATH
+ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
+ENV VIRTUAL_ENV="/opt/venv"
 
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
@@ -122,7 +132,7 @@ WORKDIR /workspace
 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/cuda.txt \
+    uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
 # cuda arch list used by torch
@@ -152,7 +162,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 ENV UV_LINK_MODE=copy
 
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/build.txt \
+    uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
 COPY . .
@@ -249,7 +259,7 @@ COPY requirements/lint.txt requirements/lint.txt
 COPY requirements/test.txt requirements/test.txt
 COPY requirements/dev.txt requirements/dev.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/dev.txt \
+    uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 #################### DEV IMAGE ####################
 
@@ -276,12 +286,32 @@ ARG GET_PIP_URL
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
     echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
 
-# Install minimal dependencies
+# Install Python and other dependencies
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl wget sudo vim \
-    && apt-get install -y ffmpeg libsm6 libxext6 libgl1
+    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
+    && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
+        if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
+            mkdir -p -m 0755 /etc/apt/keyrings ; \
+            curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
+            sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
+            echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
+        fi ; \
+    else \
+        for i in 1 2 3; do \
+            add-apt-repository -y ppa:deadsnakes/ppa && break || \
+            { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
+        done ; \
+    fi \
+    && apt-get update -y \
+    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
+    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
+    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
+    && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
+    && python3 --version && python3 -m pip --version
 
 ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
@@ -289,9 +319,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
 ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
 ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
 
-# Install uv and Python
-COPY --from=ghcr.io/astral-sh/uv:0.8.22 /uv /uvx /bin/
-RUN uv python install ${PYTHON_VERSION} --default --verbose
+# Install uv for faster pip installs
+RUN --mount=type=cache,target=/root/.cache/uv \
+    python3 -m pip install uv
 
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
@@ -525,5 +555,5 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"]
 
 FROM vllm-openai-base AS vllm-openai
 
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["vllm", "serve"]
 #################### OPENAI API SERVER ####################
@@ -177,4 +177,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
     uv pip install dist/*.whl
 
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["vllm", "serve"]
@@ -314,4 +314,4 @@ WORKDIR /workspace/
 
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 
-ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["vllm", "serve"]
@@ -309,4 +309,4 @@ USER 2000
 WORKDIR /home/vllm
 
 # Set the default entrypoint
-ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["vllm", "serve"]
@@ -69,4 +69,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["vllm", "serve"]
Binary image file changed (not shown): 127 KiB before, 119 KiB after.
@@ -53,7 +53,7 @@ llm = LLM(model="adept/fuyu-8b",
 By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU.
 
 !!! warning
-    CUDA graph capture takes up more memory in V1 than in V0.
+    CUDA graph capture increases GPU memory usage. Adjust capture sizes if you need to conserve memory.
 
 You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage:
 
@@ -33,7 +33,7 @@ In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as re
 
 Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations.
 
-In vLLM V1, **chunked prefill is always enabled by default**. This is different from vLLM V0, where it was conditionally enabled based on model characteristics.
+In vLLM V1, **chunked prefill is always enabled by default** so that behavior is consistent across supported models.
 
 With chunked prefill enabled, the scheduling policy prioritizes decode requests. It batches all pending decode requests before scheduling any prefill operations. When there are available tokens in the `max_num_batched_tokens` budget, it schedules pending prefills. If a pending prefill request cannot fit into `max_num_batched_tokens`, it automatically chunks it.
 
@@ -49,7 +49,7 @@ You can tune the performance by adjusting `max_num_batched_tokens`:
 - Smaller values (e.g., 2048) achieve better inter-token latency (ITL) because there are fewer prefills slowing down decodes.
 - Higher values achieve better time to first token (TTFT) as you can process more prefill tokens in a batch.
 - For optimal throughput, we recommend setting `max_num_batched_tokens > 8192` especially for smaller models on large GPUs.
-- If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the V0 default scheduling policy (except that it still prioritizes decodes).
+- If `max_num_batched_tokens` is the same as `max_model_len`, the scheduler behaves similarly to the legacy policy where large prefills ran without chunking (while still prioritizing decodes).
 
 ```python
 from vllm import LLM
@@ -661,8 +661,7 @@ Benchmark the performance of multi-modal requests in vLLM.
 Start vLLM:
 
 ```bash
-python -m vllm.entrypoints.openai.api_server \
-    --model Qwen/Qwen2.5-VL-7B-Instruct \
+vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
     --dtype bfloat16 \
     --limit-mm-per-prompt '{"image": 1}' \
     --allowed-local-media-path /path/to/sharegpt4v/images
@@ -688,8 +687,7 @@ vllm bench serve \
 Start vLLM:
 
 ```bash
-python -m vllm.entrypoints.openai.api_server \
-    --model Qwen/Qwen2.5-VL-7B-Instruct \
+vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
    --dtype bfloat16 \
    --limit-mm-per-prompt '{"video": 1}' \
    --allowed-local-media-path /path/to/sharegpt4video/videos
@@ -133,8 +133,7 @@ We consider 3 different scenarios:
 For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
 The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
 For the mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
-Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
-V0-only classes and code will be removed in the very near future.
+Please avoid reintroducing legacy cache managers such as `MambaCacheManager` or any previously removed code paths from older implementations.
 The model should also be added to the `MODELS_CONFIG_MAP` dictionary in <gh-file:vllm/model_executor/models/config.py> to ensure that the runtime defaults are optimized.
 
 For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
@@ -258,17 +258,21 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
         target_width, target_height = \
             self.info.get_image_size_with_most_features()
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
             "image":
             self._get_dummy_images(width=target_width,
                                    height=target_height,
-                                   num_images=num_images)
+                                   num_images=num_images,
+                                   overrides=image_overrides)
         }
 ```
 
@@ -438,16 +442,20 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
     ) -> MultiModalDataDict:
         target_width, target_height = \
             self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
 
+        image_overrides = mm_options.get("image") if mm_options else None
+
         return {
             "image":
             self._get_dummy_images(width=target_width,
                                    height=target_height,
-                                   num_images=num_images)
+                                   num_images=num_images,
+                                   overrides=image_overrides)
         }
 ```
 
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
VLLM_TORCH_PROFILER_DIR=./vllm_profile \
|
VLLM_TORCH_PROFILER_DIR=./vllm_profile \
|
||||||
python -m vllm.entrypoints.openai.api_server \
|
vllm serve meta-llama/Meta-Llama-3-70B
|
||||||
--model meta-llama/Meta-Llama-3-70B
|
|
||||||
```
|
```
|
||||||
|
|
||||||
vllm bench command:
|
vllm bench command:
|
||||||
|
|||||||
@ -19,8 +19,7 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"
|
|||||||
1. Start the vLLM server with the supported chat completion model, e.g.
|
1. Start the vLLM server with the supported chat completion model, e.g.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m vllm.entrypoints.openai.api_server \
|
vllm serve mistralai/Mistral-7B-Instruct-v0.2
|
||||||
--model mistralai/Mistral-7B-Instruct-v0.2
|
|
||||||
```
|
```
|
||||||
|
|
||||||
1. Call it with AutoGen:
|
1. Call it with AutoGen:
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ This is the easiest way to get started with vLLM on Hugging Face Inference Endpo
 
 ### Method 2: Guided Deployment (Transformers Models)
 
-This method applies to models with the `transformers` library tag in their metadata. It allows you to deploy a model directly from the Hub UI without manual configuration.
+This method applies to models with the [`transformers` library tag](https://huggingface.co/models?library=transformers) in their metadata. It allows you to deploy a model directly from the Hub UI without manual configuration.
 
 1. Navigate to a model on [Hugging Face Hub](https://huggingface.co/models).
    For this example we will use the [`ibm-granite/granite-docling-258M`](https://huggingface.co/ibm-granite/granite-docling-258M) model. You can verify that the model is compatible by checking the front matter in the [README](https://huggingface.co/ibm-granite/granite-docling-258M/blob/main/README.md), where the library is tagged as `library: transformers`.
@@ -128,7 +128,7 @@ Some models require manual deployment because they:
 
 These models cannot be deployed using the **Deploy** button on the model card.
 
-In this guide, we demonstrate manual deployment using the [rednote-hilab/dots.ocr](https://huggingface.co/rednote-hilab/dots.ocr) model, an OCR model integrated with vLLM (see vLLM [PR](https://github.com/vllm-project/vllm/pull/24645)).
+In this guide, we demonstrate manual deployment using the [`rednote-hilab/dots.ocr`](https://huggingface.co/rednote-hilab/dots.ocr) model, an OCR model integrated with vLLM (see vLLM [PR](https://github.com/vllm-project/vllm/pull/24645)).
 
 1. Start a new deployment. Go to [Inference Endpoints](https://endpoints.huggingface.co/) and click `New`.
 
@@ -20,7 +20,7 @@ To get started with Open WebUI using vLLM, follow these steps:
 For example:
 
 ```console
-python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000
+vllm serve <model> --host 0.0.0.0 --port 8000
 ```
 
 3. Start the Open WebUI Docker container:
@@ -32,6 +32,7 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
 ports: 8081  # Expose to internet traffic.
 
 envs:
+  PYTHONUNBUFFERED: 1
   MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
   HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
 
@@ -47,9 +48,8 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
 run: |
   conda activate vllm
   echo 'Starting vllm api server...'
-  python -u -m vllm.entrypoints.openai.api_server \
+  vllm serve $MODEL_NAME \
     --port 8081 \
-    --model $MODEL_NAME \
     --trust-remote-code \
     --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
     2>&1 | tee api_server.log &
@@ -131,6 +131,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
 ports: 8081  # Expose to internet traffic.
 
 envs:
+  PYTHONUNBUFFERED: 1
   MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
   HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
 
@@ -146,9 +147,8 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
 run: |
   conda activate vllm
   echo 'Starting vllm api server...'
-  python -u -m vllm.entrypoints.openai.api_server \
+  vllm serve $MODEL_NAME \
     --port 8081 \
-    --model $MODEL_NAME \
     --trust-remote-code \
     --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
     2>&1 | tee api_server.log
@@ -243,6 +243,7 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
 ports: 8081  # Expose to internet traffic.
 
 envs:
+  PYTHONUNBUFFERED: 1
   MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
   HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
 
@@ -258,9 +259,8 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
 run: |
   conda activate vllm
   echo 'Starting vllm api server...'
-  python -u -m vllm.entrypoints.openai.api_server \
+  vllm serve $MODEL_NAME \
    --port 8081 \
-    --model $MODEL_NAME \
    --trust-remote-code \
    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
    2>&1 | tee api_server.log
docs/deployment/integrations/kaito.md (new file, 5 lines)
@@ -0,0 +1,5 @@
+# KAITO
+
+[KAITO](https://kaito-project.github.io/kaito/docs/) is a Kubernetes operator that supports deploying and serving LLMs with vLLM. It manages large models via container images, provides built-in OpenAI-compatible inference, auto-provisions GPU nodes, and ships curated model presets.
+
+Please refer to the [quick start](https://kaito-project.github.io/kaito/docs/quick-start) for more details.
@@ -55,7 +55,7 @@ sudo kubectl port-forward svc/vllm-router-service 30080:80
 And then you can send out a query to the OpenAI-compatible API to check the available models:
 
 ```bash
-curl -o- http://localhost:30080/models
+curl -o- http://localhost:30080/v1/models
 ```
 
 ??? console "Output"
@@ -78,7 +78,7 @@ curl -o- http://localhost:30080/models
 To send an actual chatting request, you can issue a curl request to the OpenAI `/completion` endpoint:
 
 ```bash
-curl -X POST http://localhost:30080/completions \
+curl -X POST http://localhost:30080/v1/completions \
   -H "Content-Type: application/json" \
  -d '{
    "model": "facebook/opt-125m",
@@ -12,6 +12,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
 
 - [Helm](frameworks/helm.md)
 - [InftyAI/llmaz](integrations/llmaz.md)
+- [KAITO](integrations/kaito.md)
 - [KServe](integrations/kserve.md)
 - [KubeRay](integrations/kuberay.md)
 - [kubernetes-sigs/lws](frameworks/lws.md)
@@ -69,6 +69,11 @@ Sometimes you may see the API server entrypoint used directly instead of via the
 python -m vllm.entrypoints.openai.api_server --model <model>
 ```
 
+!!! warning
+
+    `python -m vllm.entrypoints.openai.api_server` is deprecated
+    and may become unsupported in a future release.
+
 That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
 
 More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.
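The Dockerfile and documentation changes above standardize on the `vllm serve` entrypoint; the server remains OpenAI-compatible either way. As a hedged illustration of talking to such a server (assuming it listens on the default `http://localhost:8000`, serves `facebook/opt-125m`, and the `openai` Python package is installed):

```python
# Hedged example client for an OpenAI-compatible vLLM server started with
# `vllm serve facebook/opt-125m`; adjust base_url and model name to your setup.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.completions.create(
    model="facebook/opt-125m",
    prompt="Hello, my name is",
    max_tokens=16,
)
print(response.choices[0].text)
```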
@@ -1,12 +1,12 @@
 # Metrics
 
-Ensure the v1 LLM Engine exposes a superset of the metrics available in v0.
+vLLM exposes a rich set of metrics to support observability and capacity planning for the V1 engine.
 
 ## Objectives
 
-- Achieve parity of metrics between v0 and v1.
-- The priority use case is accessing these metrics via Prometheus, as this is what we expect to be used in production environments.
-- Logging support (i.e. printing metrics to the info log) is provided for more ad-hoc testing, debugging, development, and exploratory use cases.
+- Provide comprehensive coverage of engine and request level metrics to aid production monitoring.
+- Prioritize Prometheus integrations, as this is what we expect to be used in production environments.
+- Offer logging support (i.e. printing metrics to the info log) for ad-hoc testing, debugging, development, and exploratory use cases.
 
 ## Background
 
@@ -17,9 +17,9 @@ Metrics in vLLM can be categorized as follows:
 
 The mental model is that server-level metrics help explain the values of request-level metrics.
 
-### v0 Metrics
+### Metrics Overview
 
-In v0, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix:
+The following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix and are documented under [Inferencing and Serving -> Production Metrics](../usage/metrics.md):
 
 - `vllm:num_requests_running` (Gauge)
 - `vllm:num_requests_swapped` (Gauge)
@@ -57,8 +57,6 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics`
 - `vllm:spec_decode_num_draft_tokens_total` (Counter)
 - `vllm:spec_decode_num_emitted_tokens_total` (Counter)
 
-These are documented under [Inferencing and Serving -> Production Metrics](../usage/metrics.md).
-
 ### Grafana Dashboard
 
 vLLM also provides [a reference example](../examples/online_serving/prometheus_grafana.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
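As a quick, hedged illustration of consuming the Prometheus-compatible `/metrics` endpoint described above (assuming a vLLM server on `http://localhost:8000` and the `requests` package installed):

```python
# Fetch the /metrics endpoint and print only vLLM's own series, skipping
# comment lines and other exporters' metrics. Host and port are assumptions.
import requests

text = requests.get("http://localhost:8000/metrics", timeout=5).text
for line in text.splitlines():
    if line.startswith("vllm:"):
        print(line)
```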
@@ -86,7 +84,7 @@ See [the PR which added this Dashboard](gh-pr:2316) for interesting and useful b
 
 Prometheus support was initially added [using the aioprometheus library](gh-pr:1890), but a switch was made quickly to [prometheus_client](gh-pr:2730). The rationale is discussed in both linked PRs.
 
-With the switch to `aioprometheus`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](gh-pr:15657):
+During those migrations we briefly lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](gh-pr:15657):
 
 ```bash
 $ curl http://0.0.0.0:8000/metrics 2>/dev/null | grep -P '^http_(?!.*(_bucket|_created|_sum)).*'
@@ -97,10 +95,6 @@ http_request_duration_highr_seconds_count 201.0
 http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201.0
 ```
 
-### Multi-process Mode
-
-In v0, metrics are collected in the engine core process and we use multiprocess mode to make them available in the API server process. See <gh-pr:7279>.
-
 ### Built in Python/Process Metrics
 
 The following metrics are supported by default by `prometheus_client`, but they are not exposed when multiprocess mode is used:
@@ -116,22 +110,7 @@ The following metrics are supported by default by `prometheus_client`, but they
 - `process_open_fds`
 - `process_max_fds`
 
-This is relevant because if we move away from multiprocess mode in v1,
-we get these back. However, it's questionable how relevant these are
-if they don't aggregate these stats for all processes that make up a
-vLLM instance.
-
-### v0 PRs and Issues
-
-For background, these are some of the relevant PRs which added the v0 metrics:
-
-- <gh-pr:1890>
-- <gh-pr:2316>
-- <gh-pr:2730>
-- <gh-pr:4464>
-- <gh-pr:7279>
-
-Also note the ["Even Better Observability"](gh-issue:3616) feature where e.g. [a detailed roadmap was laid out](gh-issue:3616#issuecomment-2030858781).
+This is relevant because if we move away from multiprocess mode we get these back. However, it's questionable how relevant these are if they don't aggregate these stats for all processes that make up a vLLM instance.
 
 ## v1 Design
 
@ -396,9 +375,8 @@ recent metric is used, but only from currently running processes.
|
|||||||
|
|
||||||
This was added in <gh-pr:9477> and there is
|
This was added in <gh-pr:9477> and there is
|
||||||
[at least one known user](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/54).
|
[at least one known user](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/54).
|
||||||
If we revisit this design and deprecate the old metric, we should reduce
|
If we revisit this design and deprecate the old metric, we should
|
||||||
the need for a significant deprecation period by making the change in
|
coordinate with downstream users so they can migrate before the removal.
|
||||||
v0 also and asking this project to move to the new metric.
|
|
||||||
|
|
||||||
### Prefix Cache metrics
|
### Prefix Cache metrics
|
||||||
|
|
||||||
@ -491,7 +469,7 @@ if seq_group.is_finished():
|
|||||||
|
|
||||||
This seems duplicative, and one of them should be removed. The latter
|
This seems duplicative, and one of them should be removed. The latter
|
||||||
is used by the Grafana dashboard, so we should deprecate or remove the
|
is used by the Grafana dashboard, so we should deprecate or remove the
|
||||||
former from v0.
|
former.
|
||||||
|
|
||||||
### Prefix Cache Hit Rate
|
### Prefix Cache Hit Rate
|
||||||
|
|
||||||
@ -500,7 +478,7 @@ See above - we now expose 'queries' and 'hits' counters rather than a
|
|||||||
|
|
||||||
### KV Cache Offloading
|
### KV Cache Offloading
|
||||||
|
|
||||||
Two v0 metrics relate to a "swapped" preemption mode that is no
|
Two legacy metrics relate to a "swapped" preemption mode that is no
|
||||||
longer relevant in v1:
|
longer relevant in v1:
|
||||||
|
|
||||||
- `vllm:num_requests_swapped`
|
- `vllm:num_requests_swapped`
|
||||||
@ -511,7 +489,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU
|
|||||||
memory. This is also known as "KV cache offloading" and is configured
|
memory. This is also known as "KV cache offloading" and is configured
|
||||||
with `--swap-space` and `--preemption-mode`.
|
with `--swap-space` and `--preemption-mode`.
|
||||||
|
|
||||||
In v0, [vLLM has long supported beam search](gh-issue:6226). The
|
Historically, [vLLM has long supported beam search](gh-issue:6226). The
|
||||||
SequenceGroup encapsulated the idea of N Sequences which
|
SequenceGroup encapsulated the idea of N Sequences which
|
||||||
all shared the same prompt kv blocks. This enabled KV cache block
|
all shared the same prompt kv blocks. This enabled KV cache block
|
||||||
sharing between requests, and copy-on-write to do branching. CPU
|
sharing between requests, and copy-on-write to do branching. CPU
|
||||||
@ -524,7 +502,7 @@ and the part of the prompt that was evicted can be recomputed.
|
|||||||
|
|
||||||
SequenceGroup was removed in V1, although a replacement will be
|
SequenceGroup was removed in V1, although a replacement will be
|
||||||
required for "parallel sampling" (`n>1`).
|
required for "parallel sampling" (`n>1`).
|
||||||
[Beam search was moved out of the core (in V0)](gh-issue:8306). There was a
|
[Beam search was moved out of the core](gh-issue:8306). There was a
|
||||||
lot of complex code for a very uncommon feature.
|
lot of complex code for a very uncommon feature.
|
||||||
|
|
||||||
In V1, with prefix caching being better (zero over head) and therefore
|
In V1, with prefix caching being better (zero over head) and therefore
|
||||||
@ -535,7 +513,7 @@ better.
|
|||||||
|
|
||||||
### Parallel Sampling
|
### Parallel Sampling
|
||||||
|
|
||||||
Some v0 metrics are only relevant in the context of "parallel
|
Some legacy metrics are only relevant in the context of "parallel
|
||||||
sampling". This is where the `n` parameter in a request is used to
|
sampling". This is where the `n` parameter in a request is used to
|
||||||
request multiple completions from the same prompt.
|
request multiple completions from the same prompt.
|
||||||
|
|
||||||
@ -554,7 +532,7 @@ also add these metrics.
|
|||||||
|
|
||||||
### Speculative Decoding
|
### Speculative Decoding
|
||||||
|
|
||||||
Some v0 metrics are specific to "speculative decoding". This is where
|
Some legacy metrics are specific to "speculative decoding". This is where
|
||||||
we generate candidate tokens using a faster, approximate method or
|
we generate candidate tokens using a faster, approximate method or
|
||||||
model and then validate those tokens with the larger model.
|
model and then validate those tokens with the larger model.
|
||||||
|
|
||||||
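As a rough illustration of how this is enabled from the offline API, here is a sketch using prompt-lookup speculation; the `speculative_config` keys are assumptions and may differ between versions.

```python
from vllm import LLM

# Prompt-lookup (ngram) speculation: candidate tokens are proposed by matching
# n-grams in the prompt and then verified by the target model.
llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # example target model
    speculative_config={
        "method": "ngram",
        "num_speculative_tokens": 5,
        "prompt_lookup_max": 4,
    },
)
```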
@ -566,7 +544,7 @@ model and then validate those tokens with the larger model.
|
|||||||
|
|
||||||
There is a PR under review (<gh-pr:12193>) to add "prompt lookup (ngram)"
|
There is a PR under review (<gh-pr:12193>) to add "prompt lookup (ngram)"
|
||||||
speculative decoding to v1. Other techniques will follow. We should
|
speculative decoding to v1. Other techniques will follow. We should
|
||||||
revisit the v0 metrics in this context.
|
revisit these metrics in this context.
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
We should probably expose acceptance rate as separate accepted
|
We should probably expose acceptance rate as separate accepted
|
||||||
@ -639,7 +617,7 @@ metrics are often relatively straightforward to add:
|
|||||||
metrics are usually of very limited use unless they can be enabled
|
metrics are usually of very limited use unless they can be enabled
|
||||||
by default and in production.
|
by default and in production.
|
||||||
3. They have an impact on development and maintenance of the
|
3. They have an impact on development and maintenance of the
|
||||||
project. Every metric added to v0 has made this v1 effort more
|
project. Every metric added over time has made this effort more
|
||||||
time-consuming, and perhaps not all metrics justify this ongoing
|
time-consuming, and perhaps not all metrics justify this ongoing
|
||||||
investment in their maintenance.
|
investment in their maintenance.
|
||||||
|
|
||||||
@ -650,7 +628,7 @@ performance and health. Tracing, on the other hand, tracks individual
|
|||||||
requests as they move through different services and components. Both
|
requests as they move through different services and components. Both
|
||||||
fall under the more general heading of "Observability".
|
fall under the more general heading of "Observability".
|
||||||
|
|
||||||
v0 has support for OpenTelemetry tracing:
|
vLLM has support for OpenTelemetry tracing:
|
||||||
|
|
||||||
- Added by <gh-pr:4687>
|
- Added by <gh-pr:4687>
|
||||||
- Configured with `--otlp-traces-endpoint` and `--collect-detailed-traces`
|
- Configured with `--otlp-traces-endpoint` and `--collect-detailed-traces`
|
||||||
@ -663,11 +641,11 @@ OpenTelemetry has a
|
|||||||
[Gen AI Working Group](https://github.com/open-telemetry/community/blob/main/projects/gen-ai.md).
|
[Gen AI Working Group](https://github.com/open-telemetry/community/blob/main/projects/gen-ai.md).
|
||||||
|
|
||||||
Since metrics is a big enough topic on its own, we are going to tackle
|
Since metrics is a big enough topic on its own, we are going to tackle
|
||||||
the topic of tracing in v1 separately.
|
the topic of tracing separately.
|
||||||
|
|
||||||
### OpenTelemetry Model Forward vs Execute Time
|
### OpenTelemetry Model Forward vs Execute Time
|
||||||
|
|
||||||
In v0, we have the following two metrics:
|
The current implementation exposes the following two metrics:
|
||||||
|
|
||||||
- `vllm:model_forward_time_milliseconds` (Histogram) - The time spent
|
- `vllm:model_forward_time_milliseconds` (Histogram) - The time spent
|
||||||
in the model forward pass when this request was in the batch.
|
in the model forward pass when this request was in the batch.
|
||||||
|
|||||||
@ -93,6 +93,8 @@ To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels
|
|||||||
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
|
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
|
||||||
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
|
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
|
||||||
| marlin | standard | <sup>3</sup> | <sup>3</sup> | silu,</br>swigluoai | Y | N | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe] |
|
| marlin | standard | <sup>3</sup> | <sup>3</sup> | silu,</br>swigluoai | Y | N | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe] |
|
||||||
|
|
||||||
|
| marlin experts | standard | N/A | N/A | silu,</br>swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts] |
|
||||||
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
|
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
|
||||||
| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
|
| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
|
||||||
| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
|
| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
|
||||||
@ -114,6 +116,6 @@ The following table shows "families" of modular kernels that are intended to wor
|
|||||||
|
|
||||||
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|
||||||
|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------|
|
|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------|
|
||||||
| deepep_high_throughput,</br>pplx | `DeepEPHTPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8` |
|
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
|
||||||
| deepep_low_latency | `DeepEPLLPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8` |
|
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`|
|
||||||
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
|
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
|
||||||
|
|||||||
@ -60,30 +60,6 @@ Multiple vLLM dependencies indicate either a preference or requirement for using
|
|||||||
It is perhaps more accurate to say that there are known problems with using
|
It is perhaps more accurate to say that there are known problems with using
|
||||||
`fork` after initializing these dependencies.
|
`fork` after initializing these dependencies.
|
||||||
|
|
||||||
## Current State (v0)
|
|
||||||
|
|
||||||
The environment variable `VLLM_WORKER_MULTIPROC_METHOD` can be used to control which method is used by vLLM. The current default is `fork`.
|
|
||||||
|
|
||||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/envs.py#L339-L342>
|
|
||||||
|
|
||||||
When we know we own the process because the `vllm` command was used, we use
|
|
||||||
`spawn` because it's the most widely compatible.
|
|
||||||
|
|
||||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/scripts.py#L123-L140>
|
|
||||||
|
|
||||||
The `multiproc_xpu_executor` forces the use of `spawn`.
|
|
||||||
|
|
||||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/executor/multiproc_xpu_executor.py#L14-L18>
|
|
||||||
|
|
||||||
There are other miscellaneous places hard-coding the use of `spawn`:
|
|
||||||
|
|
||||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/distributed/device_communicators/all_reduce_utils.py#L135>
|
|
||||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/entrypoints/openai/api_server.py#L184>
|
|
||||||
|
|
||||||
Related PRs:
|
|
||||||
|
|
||||||
- <gh-pr:8823>
|
|
||||||
|
|
||||||
## Prior State in v1
|
## Prior State in v1
|
||||||
|
|
||||||
There was an environment variable to control whether multiprocessing is used in
|
There was an environment variable to control whether multiprocessing is used in
|
||||||
|
|||||||
@ -49,7 +49,7 @@ Every plugin has three parts:
|
|||||||
|
|
||||||
- **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.
|
- **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.
|
||||||
|
|
||||||
- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for poling models. The plugin function returns the IOProcessor's class fully qualified name.
|
- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the IOProcessor's class fully qualified name.
|
||||||
|
|
||||||
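To make the registration contract concrete, a platform plugin entry point could look like the sketch below. The package and platform class are hypothetical; only the return-`None`-or-fully-qualified-name convention comes from the description above.

```python
# my_vllm_plugin/__init__.py -- hypothetical package registered under the
# "vllm.platform_plugins" entry point group.


def register_my_platform() -> str | None:
    try:
        import my_accelerator_sdk  # noqa: F401  (hypothetical hardware SDK check)
    except ImportError:
        # The platform is not supported in this environment.
        return None
    # Fully qualified name of the platform class.
    return "my_vllm_plugin.platform.MyPlatform"
```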
## Guidelines for Writing Plugins
|
## Guidelines for Writing Plugins
|
||||||
|
|
||||||
|
|||||||
@ -94,9 +94,6 @@ To improve privacy in shared environments, vLLM supports isolating prefix cache
|
|||||||
|
|
||||||
With this setup, cache sharing is limited to users or requests that explicitly agree on a common salt, enabling cache reuse within a trust group while isolating others.
|
With this setup, cache sharing is limited to users or requests that explicitly agree on a common salt, enabling cache reuse within a trust group while isolating others.
|
||||||
|
|
||||||
!!! note
|
|
||||||
Cache isolation is not supported in engine V0.
|
|
||||||
|
|
||||||
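For illustration, a salted request could look like the sketch below; the `cache_salt` request field name is an assumption, so check the request schema of your vLLM version.

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Requests carrying the same salt can share cached prefix blocks with each
# other, but not with requests that use a different salt (or none).
response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # example model
    messages=[{"role": "user", "content": "Summarize our internal report."}],
    extra_body={"cache_salt": "tenant-a-v1"},
)
print(response.choices[0].message.content)
```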
## Data Structure
|
## Data Structure
|
||||||
|
|
||||||
The prefix caching in vLLM v1 is implemented in the KV cache manager. The basic building block is the “Block” data class (simplified):
|
The prefix caching in vLLM v1 is implemented in the KV cache manager. The basic building block is the “Block” data class (simplified):
|
||||||
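For orientation, a plausible simplified sketch of such a block record is shown below; the field names are assumptions rather than the actual vLLM definition.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Block:
    # Simplified sketch of a KV cache block record.
    block_id: int                      # index into the physical KV cache
    ref_cnt: int = 0                   # number of requests currently using the block
    block_hash: Optional[int] = None   # set only once the block is full and cached
```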
@ -189,7 +186,7 @@ Time 1:
|
|||||||
Cache Blocks: 0, 1, 3
|
Cache Blocks: 0, 1, 3
|
||||||
```
|
```
|
||||||
|
|
||||||
As can be seen, block 3 is a new full block and is cached. However, it is redundant with block 1, meaning that we cached the same block twice. In v0, when detecting block 3 is duplicated, we free block 3 and let Request 2 use block 1 instead, so its block table becomes `[0, 1]` in Time 1. However, the block table in vLLM v1 is append-only, meaning that changing the block table from `[0, 3]` to `[0, 1]` is not allowed. As a result, we will have duplicated blocks for the hash key E-H. This duplication will be eliminated when the request is freed.
|
As can be seen, block 3 is a new full block and is cached. However, it is redundant with block 1, meaning that we cached the same block twice. Because the block table in vLLM v1 is append-only, changing the block table from `[0, 3]` to `[0, 1]` is not allowed. As a result, we will have duplicated blocks for the hash key E-H. This duplication will be eliminated when the request is freed.
|
||||||
|
|
||||||
### Free
|
### Free
|
||||||
|
|
||||||
|
|||||||
@ -166,7 +166,7 @@ The `DummyLogitsProcessor.update_state()` implementation maintains a "sparse" re
|
|||||||
|
|
||||||
### Wrapping an Existing Request-Level Logits Processor
|
### Wrapping an Existing Request-Level Logits Processor
|
||||||
|
|
||||||
Although the vLLM engine applies logits processors at batch granularity, some users may want to use vLLM with a "request-level" logits processor implementation - an implementation which operates on individual requests. This will be especially true if your logits processor was developed for vLLM version 0, which required it to be a `Callable` (as described [here](https://docs.vllm.ai/en/v0.10.1.1/api/vllm/logits_process.html)) conforming to the following type annotation:
|
Although the vLLM engine applies logits processors at batch granularity, some users may want to use vLLM with a "request-level" logits processor implementation - an implementation which operates on individual requests. Earlier request-level processors were implemented as `Callable` objects conforming to the following type annotation:
|
||||||
|
|
||||||
``` python
|
``` python
|
||||||
RequestLogitsProcessor = Union[
|
RequestLogitsProcessor = Union[
|
||||||
|
|||||||
@ -8,6 +8,9 @@ This page teaches you how to pass multi-modal inputs to [multi-modal models][sup
|
|||||||
|
|
||||||
!!! tip
|
!!! tip
|
||||||
When serving multi-modal models, consider setting `--allowed-media-domains` to restrict the domains that vLLM can access, to prevent it from accessing arbitrary endpoints that can potentially be vulnerable to Server-Side Request Forgery (SSRF) attacks. You can provide a list of domains for this arg. For example: `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`
|
When serving multi-modal models, consider setting `--allowed-media-domains` to restrict the domains that vLLM can access, to prevent it from accessing arbitrary endpoints that can potentially be vulnerable to Server-Side Request Forgery (SSRF) attacks. You can provide a list of domains for this arg. For example: `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`
|
||||||
|
|
||||||
|
Also, consider setting `VLLM_MEDIA_URL_ALLOW_REDIRECTS=0` to prevent HTTP redirects from being followed to bypass domain restrictions.
|
||||||
|
|
||||||
This restriction is especially important if you run vLLM in a containerized environment where the vLLM pods may have unrestricted access to internal networks.
|
This restriction is especially important if you run vLLM in a containerized environment where the vLLM pods may have unrestricted access to internal networks.
|
||||||
|
|
||||||
## Offline Inference
|
## Offline Inference
|
||||||
|
|||||||
@ -64,8 +64,7 @@ To enable sleep mode in a vLLM server you need to initialize it with the flag `V
|
|||||||
Setting the flag `VLLM_SERVER_DEV_MODE=1` enables development endpoints; these endpoints should not be exposed to users.
|
Setting the flag `VLLM_SERVER_DEV_MODE=1` enables development endpoints; these endpoints should not be exposed to users.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
VLLM_SERVER_DEV_MODE=1 python -m vllm.entrypoints.openai.api_server \
|
VLLM_SERVER_DEV_MODE=1 vllm serve Qwen/Qwen3-0.6B \
|
||||||
--model Qwen/Qwen3-0.6B \
|
|
||||||
--enable-sleep-mode \
|
--enable-sleep-mode \
|
||||||
--port 8000
|
--port 8000
|
||||||
```
|
```
|
||||||
|
|||||||
@ -16,8 +16,8 @@ Speculative decoding is a technique which improves inter-token latency in memory
|
|||||||
The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.
|
The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.
|
||||||
|
|
||||||
!!! warning
|
!!! warning
|
||||||
In vllm v0.10.0, speculative decoding with a draft model is not supported.
|
Speculative decoding with a draft model requires the V1 engine.
|
||||||
If you use the following code, you will get a `NotImplementedError`.
|
Older releases that predate V1 (such as the 0.10.x series) raise a `NotImplementedError`.
|
||||||
|
|
||||||
??? code
|
??? code
|
||||||
|
|
||||||
@ -48,10 +48,9 @@ The following code configures vLLM in an offline mode to use speculative decodin
|
|||||||
To perform the same with an online mode launch the server:
|
To perform the same with an online mode launch the server:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m vllm.entrypoints.openai.api_server \
|
vllm serve facebook/opt-6.7b \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
--port 8000 \
|
--port 8000 \
|
||||||
--model facebook/opt-6.7b \
|
|
||||||
--seed 42 \
|
--seed 42 \
|
||||||
-tp 1 \
|
-tp 1 \
|
||||||
--gpu_memory_utilization 0.8 \
|
--gpu_memory_utilization 0.8 \
|
||||||
|
|||||||
@ -191,10 +191,14 @@ VLLM also provides a pythonic and JSON-based chat template for Llama 4, but pyth
|
|||||||
|
|
||||||
For Llama 4 models, use `--tool-call-parser llama4_pythonic --chat-template examples/tool_chat_template_llama4_pythonic.jinja`.
|
For Llama 4 models, use `--tool-call-parser llama4_pythonic --chat-template examples/tool_chat_template_llama4_pythonic.jinja`.
|
||||||
|
|
||||||
#### IBM Granite
|
### IBM Granite
|
||||||
|
|
||||||
Supported models:
|
Supported models:
|
||||||
|
|
||||||
|
* `ibm-granite/granite-4.0-h-small` and other Granite 4.0 models
|
||||||
|
|
||||||
|
Recommended flags: `--tool-call-parser hermes`
|
||||||
|
|
||||||
* `ibm-granite/granite-3.0-8b-instruct`
|
* `ibm-granite/granite-3.0-8b-instruct`
|
||||||
|
|
||||||
Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja`
|
Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja`
|
||||||
|
|||||||
@ -67,8 +67,7 @@ docker run -it \
|
|||||||
The XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. For **pipeline parallel**, we support it on a single node with `mp` as the backend. For example, a reference execution looks like the following:
|
The XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. For **pipeline parallel**, we support it on a single node with `mp` as the backend. For example, a reference execution looks like the following:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m vllm.entrypoints.openai.api_server \
|
vllm serve facebook/opt-13b \
|
||||||
--model=facebook/opt-13b \
|
|
||||||
--dtype=bfloat16 \
|
--dtype=bfloat16 \
|
||||||
--max_model_len=1024 \
|
--max_model_len=1024 \
|
||||||
--distributed-executor-backend=mp \
|
--distributed-executor-backend=mp \
|
||||||
|
|||||||
@ -33,8 +33,11 @@ def auto_mock(module, attr, max_mocks=50):
|
|||||||
try:
|
try:
|
||||||
# First treat attr as an attr, then as a submodule
|
# First treat attr as an attr, then as a submodule
|
||||||
with patch("importlib.metadata.version", return_value="0.0.0"):
|
with patch("importlib.metadata.version", return_value="0.0.0"):
|
||||||
return getattr(importlib.import_module(module), attr,
|
return getattr(
|
||||||
importlib.import_module(f"{module}.{attr}"))
|
importlib.import_module(module),
|
||||||
|
attr,
|
||||||
|
importlib.import_module(f"{module}.{attr}"),
|
||||||
|
)
|
||||||
except importlib.metadata.PackageNotFoundError as e:
|
except importlib.metadata.PackageNotFoundError as e:
|
||||||
raise e
|
raise e
|
||||||
except ModuleNotFoundError as e:
|
except ModuleNotFoundError as e:
|
||||||
@ -42,7 +45,8 @@ def auto_mock(module, attr, max_mocks=50):
|
|||||||
sys.modules[e.name] = PydanticMagicMock()
|
sys.modules[e.name] = PydanticMagicMock()
|
||||||
|
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
f"Failed to import {module}.{attr} after mocking {max_mocks} imports")
|
f"Failed to import {module}.{attr} after mocking {max_mocks} imports"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
latency = auto_mock("vllm.benchmarks", "latency")
|
latency = auto_mock("vllm.benchmarks", "latency")
|
||||||
@ -61,9 +65,7 @@ class MarkdownFormatter(HelpFormatter):
|
|||||||
"""Custom formatter that generates markdown for argument groups."""
|
"""Custom formatter that generates markdown for argument groups."""
|
||||||
|
|
||||||
def __init__(self, prog, starting_heading_level=3):
|
def __init__(self, prog, starting_heading_level=3):
|
||||||
super().__init__(prog,
|
super().__init__(prog, max_help_position=float("inf"), width=float("inf"))
|
||||||
max_help_position=float('inf'),
|
|
||||||
width=float('inf'))
|
|
||||||
self._section_heading_prefix = "#" * starting_heading_level
|
self._section_heading_prefix = "#" * starting_heading_level
|
||||||
self._argument_heading_prefix = "#" * (starting_heading_level + 1)
|
self._argument_heading_prefix = "#" * (starting_heading_level + 1)
|
||||||
self._markdown_output = []
|
self._markdown_output = []
|
||||||
@ -85,23 +87,19 @@ class MarkdownFormatter(HelpFormatter):
|
|||||||
|
|
||||||
def add_arguments(self, actions):
|
def add_arguments(self, actions):
|
||||||
for action in actions:
|
for action in actions:
|
||||||
if (len(action.option_strings) == 0
|
if len(action.option_strings) == 0 or "--help" in action.option_strings:
|
||||||
or "--help" in action.option_strings):
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
option_strings = f'`{"`, `".join(action.option_strings)}`'
|
option_strings = f"`{'`, `'.join(action.option_strings)}`"
|
||||||
heading_md = f"{self._argument_heading_prefix} {option_strings}\n\n"
|
heading_md = f"{self._argument_heading_prefix} {option_strings}\n\n"
|
||||||
self._markdown_output.append(heading_md)
|
self._markdown_output.append(heading_md)
|
||||||
|
|
||||||
if choices := action.choices:
|
if choices := action.choices:
|
||||||
choices = f'`{"`, `".join(str(c) for c in choices)}`'
|
choices = f"`{'`, `'.join(str(c) for c in choices)}`"
|
||||||
self._markdown_output.append(
|
self._markdown_output.append(f"Possible choices: {choices}\n\n")
|
||||||
f"Possible choices: {choices}\n\n")
|
elif (metavar := action.metavar) and isinstance(metavar, (list, tuple)):
|
||||||
elif ((metavar := action.metavar)
|
metavar = f"`{'`, `'.join(str(m) for m in metavar)}`"
|
||||||
and isinstance(metavar, (list, tuple))):
|
self._markdown_output.append(f"Possible choices: {metavar}\n\n")
|
||||||
metavar = f'`{"`, `".join(str(m) for m in metavar)}`'
|
|
||||||
self._markdown_output.append(
|
|
||||||
f"Possible choices: {metavar}\n\n")
|
|
||||||
|
|
||||||
if action.help:
|
if action.help:
|
||||||
self._markdown_output.append(f"{action.help}\n\n")
|
self._markdown_output.append(f"{action.help}\n\n")
|
||||||
@ -143,24 +141,17 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
|||||||
|
|
||||||
# Create parsers to document
|
# Create parsers to document
|
||||||
parsers = {
|
parsers = {
|
||||||
"engine_args":
|
"engine_args": create_parser(EngineArgs.add_cli_args),
|
||||||
create_parser(EngineArgs.add_cli_args),
|
"async_engine_args": create_parser(
|
||||||
"async_engine_args":
|
AsyncEngineArgs.add_cli_args, async_args_only=True
|
||||||
create_parser(AsyncEngineArgs.add_cli_args, async_args_only=True),
|
),
|
||||||
"serve":
|
"serve": create_parser(cli_args.make_arg_parser),
|
||||||
create_parser(cli_args.make_arg_parser),
|
"chat": create_parser(ChatCommand.add_cli_args),
|
||||||
"chat":
|
"complete": create_parser(CompleteCommand.add_cli_args),
|
||||||
create_parser(ChatCommand.add_cli_args),
|
"bench_latency": create_parser(latency.add_cli_args),
|
||||||
"complete":
|
"bench_throughput": create_parser(throughput.add_cli_args),
|
||||||
create_parser(CompleteCommand.add_cli_args),
|
"bench_serve": create_parser(serve.add_cli_args),
|
||||||
"bench_latency":
|
"run-batch": create_parser(run_batch.make_arg_parser),
|
||||||
create_parser(latency.add_cli_args),
|
|
||||||
"bench_throughput":
|
|
||||||
create_parser(throughput.add_cli_args),
|
|
||||||
"bench_serve":
|
|
||||||
create_parser(serve.add_cli_args),
|
|
||||||
"run-batch":
|
|
||||||
create_parser(run_batch.make_arg_parser),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Generate documentation for each parser
|
# Generate documentation for each parser
|
||||||
|
|||||||
@ -11,7 +11,7 @@ import regex as re
|
|||||||
logger = logging.getLogger("mkdocs")
|
logger = logging.getLogger("mkdocs")
|
||||||
|
|
||||||
ROOT_DIR = Path(__file__).parent.parent.parent.parent
|
ROOT_DIR = Path(__file__).parent.parent.parent.parent
|
||||||
ROOT_DIR_RELATIVE = '../../../../..'
|
ROOT_DIR_RELATIVE = "../../../../.."
|
||||||
EXAMPLE_DIR = ROOT_DIR / "examples"
|
EXAMPLE_DIR = ROOT_DIR / "examples"
|
||||||
EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples"
|
EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples"
|
||||||
|
|
||||||
@ -36,7 +36,7 @@ def fix_case(text: str) -> str:
|
|||||||
r"int\d+": lambda x: x.group(0).upper(), # e.g. int8, int16
|
r"int\d+": lambda x: x.group(0).upper(), # e.g. int8, int16
|
||||||
}
|
}
|
||||||
for pattern, repl in subs.items():
|
for pattern, repl in subs.items():
|
||||||
text = re.sub(rf'\b{pattern}\b', repl, text, flags=re.IGNORECASE)
|
text = re.sub(rf"\b{pattern}\b", repl, text, flags=re.IGNORECASE)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
@ -58,7 +58,8 @@ class Example:
|
|||||||
determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file.
|
determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file.
|
||||||
determine_title() -> str: Determines the title of the document.
|
determine_title() -> str: Determines the title of the document.
|
||||||
generate() -> str: Generates the documentation content.
|
generate() -> str: Generates the documentation content.
|
||||||
""" # noqa: E501
|
""" # noqa: E501
|
||||||
|
|
||||||
path: Path
|
path: Path
|
||||||
category: str = None
|
category: str = None
|
||||||
main_file: Path = field(init=False)
|
main_file: Path = field(init=False)
|
||||||
@ -84,9 +85,8 @@ class Example:
|
|||||||
Markdown file found in the directory.
|
Markdown file found in the directory.
|
||||||
Raises:
|
Raises:
|
||||||
IndexError: If no Markdown files are found in the directory.
|
IndexError: If no Markdown files are found in the directory.
|
||||||
""" # noqa: E501
|
""" # noqa: E501
|
||||||
return self.path if self.path.is_file() else list(
|
return self.path if self.path.is_file() else list(self.path.glob("*.md")).pop()
|
||||||
self.path.glob("*.md")).pop()
|
|
||||||
|
|
||||||
def determine_other_files(self) -> list[Path]:
|
def determine_other_files(self) -> list[Path]:
|
||||||
"""
|
"""
|
||||||
@ -98,7 +98,7 @@ class Example:
|
|||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list[Path]: A list of Path objects representing the other files in the directory.
|
list[Path]: A list of Path objects representing the other files in the directory.
|
||||||
""" # noqa: E501
|
""" # noqa: E501
|
||||||
if self.path.is_file():
|
if self.path.is_file():
|
||||||
return []
|
return []
|
||||||
is_other_file = lambda file: file.is_file() and file != self.main_file
|
is_other_file = lambda file: file.is_file() and file != self.main_file
|
||||||
@ -109,9 +109,9 @@ class Example:
|
|||||||
# Specify encoding for building on Windows
|
# Specify encoding for building on Windows
|
||||||
with open(self.main_file, encoding="utf-8") as f:
|
with open(self.main_file, encoding="utf-8") as f:
|
||||||
first_line = f.readline().strip()
|
first_line = f.readline().strip()
|
||||||
match = re.match(r'^#\s+(?P<title>.+)$', first_line)
|
match = re.match(r"^#\s+(?P<title>.+)$", first_line)
|
||||||
if match:
|
if match:
|
||||||
return match.group('title')
|
return match.group("title")
|
||||||
return fix_case(self.path.stem.replace("_", " ").title())
|
return fix_case(self.path.stem.replace("_", " ").title())
|
||||||
|
|
||||||
def fix_relative_links(self, content: str) -> str:
|
def fix_relative_links(self, content: str) -> str:
|
||||||
@ -127,7 +127,7 @@ class Example:
|
|||||||
"""
|
"""
|
||||||
# Regex to match markdown links [text](relative_path)
|
# Regex to match markdown links [text](relative_path)
|
||||||
# This matches links that don't start with http, https, ftp, or #
|
# This matches links that don't start with http, https, ftp, or #
|
||||||
link_pattern = r'\[([^\]]*)\]\((?!(?:https?|ftp)://|#)([^)]+)\)'
|
link_pattern = r"\[([^\]]*)\]\((?!(?:https?|ftp)://|#)([^)]+)\)"
|
||||||
|
|
||||||
def replace_link(match):
|
def replace_link(match):
|
||||||
link_text = match.group(1)
|
link_text = match.group(1)
|
||||||
@ -137,7 +137,7 @@ class Example:
|
|||||||
gh_file = (self.main_file.parent / relative_path).resolve()
|
gh_file = (self.main_file.parent / relative_path).resolve()
|
||||||
gh_file = gh_file.relative_to(ROOT_DIR)
|
gh_file = gh_file.relative_to(ROOT_DIR)
|
||||||
|
|
||||||
return f'[{link_text}](gh-file:{gh_file})'
|
return f"[{link_text}](gh-file:{gh_file})"
|
||||||
|
|
||||||
return re.sub(link_pattern, replace_link, content)
|
return re.sub(link_pattern, replace_link, content)
|
||||||
|
|
||||||
@ -150,9 +150,11 @@ class Example:
|
|||||||
code_fence = "``````"
|
code_fence = "``````"
|
||||||
|
|
||||||
if self.is_code:
|
if self.is_code:
|
||||||
content += (f"{code_fence}{self.main_file.suffix[1:]}\n"
|
content += (
|
||||||
f'--8<-- "{self.main_file}"\n'
|
f"{code_fence}{self.main_file.suffix[1:]}\n"
|
||||||
f"{code_fence}\n")
|
f'--8<-- "{self.main_file}"\n'
|
||||||
|
f"{code_fence}\n"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
with open(self.main_file) as f:
|
with open(self.main_file) as f:
|
||||||
# Skip the title from md snippets as it's been included above
|
# Skip the title from md snippets as it's been included above
|
||||||
|
|||||||
@ -7,7 +7,7 @@ from typing import Literal
|
|||||||
|
|
||||||
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||||
# see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
|
# see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
|
||||||
if os.getenv('READTHEDOCS_VERSION_TYPE') == "tag":
|
if os.getenv("READTHEDOCS_VERSION_TYPE") == "tag":
|
||||||
# remove the warning banner if the version is a tagged release
|
# remove the warning banner if the version is a tagged release
|
||||||
mkdocs_dir = Path(__file__).parent.parent
|
mkdocs_dir = Path(__file__).parent.parent
|
||||||
announcement_path = mkdocs_dir / "overrides/main.html"
|
announcement_path = mkdocs_dir / "overrides/main.html"
|
||||||
|
|||||||
@ -25,8 +25,9 @@ from mkdocs.structure.files import Files
|
|||||||
from mkdocs.structure.pages import Page
|
from mkdocs.structure.pages import Page
|
||||||
|
|
||||||
|
|
||||||
def on_page_markdown(markdown: str, *, page: Page, config: MkDocsConfig,
|
def on_page_markdown(
|
||||||
files: Files) -> str:
|
markdown: str, *, page: Page, config: MkDocsConfig, files: Files
|
||||||
|
) -> str:
|
||||||
"""
|
"""
|
||||||
Custom MkDocs plugin hook to rewrite special GitHub reference links
|
Custom MkDocs plugin hook to rewrite special GitHub reference links
|
||||||
in Markdown.
|
in Markdown.
|
||||||
@ -92,11 +93,11 @@ def on_page_markdown(markdown: str, *, page: Page, config: MkDocsConfig,
|
|||||||
Example:
|
Example:
|
||||||
[My issue](gh-issue:123) → [:octicons-mark-github-16: My issue](https://github.com/vllm-project/vllm/issues/123)
|
[My issue](gh-issue:123) → [:octicons-mark-github-16: My issue](https://github.com/vllm-project/vllm/issues/123)
|
||||||
"""
|
"""
|
||||||
url = f'{urls[match.group("type")]}/{match.group("path")}'
|
url = f"{urls[match.group('type')]}/{match.group('path')}"
|
||||||
if fragment := match.group("fragment"):
|
if fragment := match.group("fragment"):
|
||||||
url += f"#{fragment}"
|
url += f"#{fragment}"
|
||||||
|
|
||||||
return f'[{gh_icon} {match.group("title")}]({url})'
|
return f"[{gh_icon} {match.group('title')}]({url})"
|
||||||
|
|
||||||
def replace_auto_link(match: re.Match) -> str:
|
def replace_auto_link(match: re.Match) -> str:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -17,12 +17,12 @@ These models are what we list in [supported-text-models][supported-text-models]
|
|||||||
|
|
||||||
### Transformers
|
### Transformers
|
||||||
|
|
||||||
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <1% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
|
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
|
||||||
|
|
||||||
Currently, the Transformers backend works for the following:
|
Currently, the Transformers backend works for the following:
|
||||||
|
|
||||||
- Modalities: embedding models, language models and vision-language models*
|
- Modalities: embedding models, language models and vision-language models*
|
||||||
- Architectures: encoder-only, decoder-only
|
- Architectures: encoder-only, decoder-only, mixture-of-experts
|
||||||
- Attention types: full attention and/or sliding attention
|
- Attention types: full attention and/or sliding attention
|
||||||
|
|
||||||
_*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
|
_*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
|
||||||
@ -31,8 +31,10 @@ If the Transformers model implementation follows all the steps in [writing a cus
|
|||||||
|
|
||||||
- All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
|
- All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
|
||||||
- Any combination of the following vLLM parallelisation schemes:
|
- Any combination of the following vLLM parallelisation schemes:
|
||||||
- Pipeline parallel
|
- Data parallel
|
||||||
- Tensor parallel
|
- Tensor parallel
|
||||||
|
- Expert parallel
|
||||||
|
- Pipeline parallel
|
||||||
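For example, combining several of these schemes with the Transformers backend might look like the sketch below; the model name is only an example, and forcing the backend via `model_impl="transformers"` is an assumption to verify against your vLLM version.

```python
from vllm import LLM

# Force the Transformers backend and shard the model across 4 GPUs
# (2-way tensor parallel x 2-way pipeline parallel).
llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # example model
    model_impl="transformers",
    tensor_parallel_size=2,
    pipeline_parallel_size=2,
)
```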
|
|
||||||
Checking if the modeling backend is Transformers is as simple as:
|
Checking if the modeling backend is Transformers is as simple as:
|
||||||
|
|
||||||
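A minimal sketch of such a check, assuming the `LLM.apply_model` helper (verify against your vLLM version):

```python
from vllm import LLM

llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")  # example model

# Print the class of the loaded model; with the Transformers backend the model
# is an instance of a Transformers* wrapper class (e.g. TransformersForCausalLM).
llm.apply_model(lambda model: print(type(model)))
```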
@ -600,8 +602,9 @@ On the other hand, modalities separated by `/` are mutually exclusive.
|
|||||||
See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inputs to the model.
|
See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inputs to the model.
|
||||||
|
|
||||||
!!! important
|
!!! important
|
||||||
**To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference)
|
You can control the maximum number of multimodal inputs per prompt by setting
|
||||||
or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt:
|
`limit_mm_per_prompt` (offline inference) or `--limit-mm-per-prompt` (online
|
||||||
|
serving). For example, to enable passing up to 4 images per text prompt:
|
||||||
|
|
||||||
Offline inference:
|
Offline inference:
|
||||||
|
|
||||||
@ -620,8 +623,6 @@ See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inp
|
|||||||
vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt '{"image":4}'
|
vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt '{"image":4}'
|
||||||
```
|
```
|
||||||
|
|
||||||
**This is no longer required if you are using vLLM V1.**
|
|
||||||
|
|
||||||
!!! tip
|
!!! tip
|
||||||
For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g., `--limit-mm-per-prompt '{"image":0}'`) so that their multimodal modules will not be loaded, freeing up more GPU memory for the KV cache.
|
For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g., `--limit-mm-per-prompt '{"image":0}'`) so that their multimodal modules will not be loaded, freeing up more GPU memory for the KV cache.
|
||||||
|
|
||||||
@ -676,7 +677,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
|||||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
|
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
|
||||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
|
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
|
||||||
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
|
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
|
||||||
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||||
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||||
| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||||
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | ✅︎ |
|
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | ✅︎ |
|
||||||
@ -729,16 +730,7 @@ Some models are supported only via the [Transformers backend](#transformers). Th
|
|||||||
<sup>+</sup> Multiple items can be inputted per text prompt for this modality.
|
<sup>+</sup> Multiple items can be inputted per text prompt for this modality.
|
||||||
|
|
||||||
!!! warning
|
!!! warning
|
||||||
Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs.
|
`Gemma3ForConditionalGeneration` uses a simplified attention pattern for text + image inputs:
|
||||||
However, there are differences in how they handle text + image inputs:
|
|
||||||
|
|
||||||
V0 correctly implements the model's attention pattern:
|
|
||||||
- Uses bidirectional attention between the image tokens corresponding to the same image
|
|
||||||
- Uses causal attention for other tokens
|
|
||||||
- Implemented via (naive) PyTorch SDPA with masking tensors
|
|
||||||
- Note: May use significant memory for long prompts with image
|
|
||||||
|
|
||||||
V1 currently uses a simplified attention pattern:
|
|
||||||
- Uses causal attention for all tokens, including image tokens
|
- Uses causal attention for all tokens, including image tokens
|
||||||
- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}`
|
- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}`
|
||||||
- Will be updated in the future to support the correct behavior
|
- Will be updated in the future to support the correct behavior
|
||||||
@ -796,11 +788,11 @@ Some models are supported only via the [Transformers backend](#transformers). Th
|
|||||||
For more details, please see: <gh-pr:4087#issuecomment-2250397630>
|
For more details, please see: <gh-pr:4087#issuecomment-2250397630>
|
||||||
|
|
||||||
!!! warning
|
!!! warning
|
||||||
Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.
|
Our PaliGemma implementations currently share the same attention limitation as Gemma 3 (see above).
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
For Qwen2.5-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`)
|
For Qwen2.5-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`)
|
||||||
is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
|
is currently unsupported because overlapping modalities are not yet supported.
|
||||||
|
|
||||||
#### Transcription
|
#### Transcription
|
||||||
|
|
||||||
@ -827,6 +819,7 @@ The following table lists those that are tested in vLLM.
|
|||||||
|
|
||||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||||
|--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------|
|
|--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------|
|
||||||
|
| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | | ✅︎ |
|
||||||
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ | ✅︎ |
|
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ | ✅︎ |
|
||||||
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ | ✅︎ |
|
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ | ✅︎ |
|
||||||
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* | \* |
|
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* | \* |
|
||||||
|
|||||||
@ -1,10 +1,9 @@
|
|||||||
# Reproducibility
|
# Reproducibility
|
||||||
|
|
||||||
vLLM does not guarantee the reproducibility of the results by default, for the sake of performance. You need to do the following to achieve
|
vLLM does not guarantee the reproducibility of the results by default, for the sake of performance. You need to do the following to achieve reproducible results:
|
||||||
reproducible results:
|
|
||||||
|
|
||||||
- For V1: Turn off multiprocessing to make the scheduling deterministic by setting `VLLM_ENABLE_V1_MULTIPROCESSING=0`.
|
- Turn off multiprocessing to make the scheduling deterministic by setting `VLLM_ENABLE_V1_MULTIPROCESSING=0`.
|
||||||
- For V0: Set the global seed (see below).
|
- Optionally configure the global seed if you need to control random sampling (see below).
|
||||||
|
|
||||||
Example: <gh-file:examples/offline_inference/reproducibility.py>
|
Example: <gh-file:examples/offline_inference/reproducibility.py>
|
||||||
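A minimal offline sketch of those two steps (the model name is only an example; the environment variable must be set before vLLM starts its workers):

```python
import os

# Disable engine-core multiprocessing so scheduling is deterministic.
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"

from vllm import LLM, SamplingParams  # imported after setting the env var

llm = LLM(model="facebook/opt-125m", seed=0)  # example model; seed controls sampling
outputs = llm.generate(
    ["Hello, my name is"],
    SamplingParams(temperature=0.8, seed=0),
)
print(outputs[0].outputs[0].text)
```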
|
|
||||||
@ -30,9 +29,7 @@ However, in some cases, setting the seed will also [change the random state in u
|
|||||||
|
|
||||||
### Default Behavior
|
### Default Behavior
|
||||||
|
|
||||||
In V0, the `seed` parameter defaults to `None`. When the `seed` parameter is `None`, the random states for `random`, `np.random`, and `torch.manual_seed` are not set. This means that each run of vLLM will produce different results if `temperature > 0`, as expected.
|
The `seed` parameter defaults to `0`, which sets the random state for each worker so the results remain consistent for each vLLM run even if `temperature > 0`.
|
||||||
|
|
||||||
In V1, the `seed` parameter defaults to `0` which sets the random state for each worker, so the results will remain consistent for each vLLM run even if `temperature > 0`.
|
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
|
|
||||||
@ -43,10 +40,6 @@ In V1, the `seed` parameter defaults to `0` which sets the random state for each
|
|||||||
|
|
||||||
### Locality of random state
|
### Locality of random state
|
||||||
|
|
||||||
The random state in user code (i.e. the code that constructs [LLM][vllm.LLM] class) is updated by vLLM under the following conditions:
|
The random state in user code (i.e. the code that constructs [LLM][vllm.LLM] class) is updated by vLLM when the workers run in the same process as user code, i.e.: `VLLM_ENABLE_V1_MULTIPROCESSING=0`.
|
||||||
|
|
||||||
- For V0: The seed is specified.
|
By default, this condition is not active, so you can use vLLM without having to worry about accidentally making subsequent operations that rely on random state deterministic.
|
||||||
- For V1: The workers are run in the same process as user code, i.e.: `VLLM_ENABLE_V1_MULTIPROCESSING=0`.
|
|
||||||
|
|
||||||
By default, these conditions are not active so you can use vLLM without having to worry about
|
|
||||||
accidentally making deterministic subsequent operations that rely on random state.
|
|
||||||
|
|||||||
@ -66,6 +66,9 @@ Restrict domains that vLLM can access for media URLs by setting
|
|||||||
`--allowed-media-domains` to prevent Server-Side Request Forgery (SSRF) attacks.
|
`--allowed-media-domains` to prevent Server-Side Request Forgery (SSRF) attacks.
|
||||||
(e.g. `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`)
|
(e.g. `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`)
|
||||||
|
|
||||||
|
Also, consider setting `VLLM_MEDIA_URL_ALLOW_REDIRECTS=0` to prevent HTTP
|
||||||
|
redirects from being followed to bypass domain restrictions.
|
||||||
|
|
||||||
## Security and Firewalls: Protecting Exposed vLLM Systems
|
## Security and Firewalls: Protecting Exposed vLLM Systems
|
||||||
|
|
||||||
While vLLM is designed to allow unsafe network services to be isolated to
|
While vLLM is designed to allow unsafe network services to be isolated to
|
||||||
|
|||||||
@ -1,22 +1,16 @@
|
|||||||
# vLLM V1
|
# vLLM V1
|
||||||
|
|
||||||
!!! announcement
|
|
||||||
|
|
||||||
We have started the process of deprecating V0. Please read [RFC #18571](gh-issue:18571) for more details.
|
|
||||||
|
|
||||||
V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
|
V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
|
||||||
|
|
||||||
To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason!
|
To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason!
|
||||||
|
|
||||||
## Why vLLM V1?
|
## Why vLLM V1?
|
||||||
|
|
||||||
vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.
|
vLLM V1 re-architects the engine to reduce accumulated complexity while preserving
|
||||||
|
the stable, battle-tested components users rely on (such as models, GPU kernels,
|
||||||
Building on V0’s success, vLLM V1 retains the stable and proven components from V0
|
and supporting utilities). The scheduler, KV cache manager, worker, sampler, and
|
||||||
(such as the models, GPU kernels, and utilities). At the same time, it significantly
|
API server now operate within a cohesive framework that is easier to extend and
|
||||||
re-architects the core systems, covering the scheduler, KV cache manager, worker,
|
maintain as new capabilities are added.
|
||||||
sampler, and API server, to provide a cohesive, maintainable framework that better
|
|
||||||
accommodates continued growth and innovation.
|
|
||||||
|
|
||||||
Specifically, V1 aims to:
|
Specifically, V1 aims to:
|
||||||
|
|
||||||
@ -88,8 +82,6 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
|
|||||||
| **Mamba Models** | <nobr>🟢 (Mamba-2), 🟢 (Mamba-1)</nobr> |
|
| **Mamba Models** | <nobr>🟢 (Mamba-2), 🟢 (Mamba-1)</nobr> |
|
||||||
| **Multimodal Models** | <nobr>🟢 Functional</nobr> |
|
| **Multimodal Models** | <nobr>🟢 Functional</nobr> |
|
||||||
|
|
||||||
vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol.
|
|
||||||
|
|
||||||
!!! tip
|
!!! tip
|
||||||
|
|
||||||
This corresponds to the V1 column in our [list of supported models](../models/supported_models.md).
|
This corresponds to the V1 column in our [list of supported models](../models/supported_models.md).
|
||||||
@ -149,8 +141,8 @@ encoder and decoder (e.g., `BartForConditionalGeneration`,
|
|||||||
|
|
||||||
#### Semantic Changes to Logprobs
|
#### Semantic Changes to Logprobs
|
||||||
|
|
||||||
vLLM V1 supports logprobs and prompt logprobs. However, there are some important semantic
|
vLLM V1 supports logprobs and prompt logprobs. However, there are some important semantics
|
||||||
differences compared to V0:
|
to consider:
|
||||||
|
|
||||||
##### Logprobs Calculation
|
##### Logprobs Calculation
|
||||||
|
|
||||||
@ -175,7 +167,7 @@ As part of the major architectural rework in vLLM V1, several legacy features ha
|
|||||||
##### Sampling features
|
##### Sampling features
|
||||||
|
|
||||||
- **best_of**: This feature has been deprecated due to limited usage. See details at [RFC #13361](gh-issue:13361).
|
- **best_of**: This feature has been deprecated due to limited usage. See details at [RFC #13361](gh-issue:13361).
|
||||||
- **Per-Request Logits Processors**: In V0, users could pass custom
|
- **Per-Request Logits Processors**: Previously, users could pass custom
|
||||||
processing functions to adjust logits on a per-request basis. In vLLM V1, this
|
processing functions to adjust logits on a per-request basis. In vLLM V1, this
|
||||||
feature has been deprecated. Instead, the design is moving toward supporting **global logits
|
feature has been deprecated. Instead, the design is moving toward supporting **global logits
|
||||||
processors**, a feature the team is actively working on for future releases. See details at [RFC #13360](gh-pr:13360).
|
processors**, a feature the team is actively working on for future releases. See details at [RFC #13360](gh-pr:13360).
|
||||||
|
|||||||
@ -576,7 +576,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
|
|||||||
|
|
||||||
# Intern-S1
|
# Intern-S1
|
||||||
def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
|
def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
model_name = "internlm/Intern-S1"
|
model_name = "internlm/Intern-S1-mini"
|
||||||
|
|
||||||
engine_args = EngineArgs(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
|
|||||||
@ -309,7 +309,7 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
|
|||||||
|
|
||||||
|
|
||||||
def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
|
def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||||
model_name = "internlm/Intern-S1"
|
model_name = "internlm/Intern-S1-mini"
|
||||||
|
|
||||||
engine_args = EngineArgs(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
@@ -371,6 +371,115 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
     )


+def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "Kwai-Keye/Keye-VL-8B-Preview"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        },
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    image_data = [fetch_image(url) for url in image_urls]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
+
+
+def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "Kwai-Keye/Keye-VL-1_5-8B"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=32768,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        },
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    image_data = [fetch_image(url) for url in image_urls]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
+
+
+def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "moonshotai/Kimi-VL-A3B-Instruct"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        max_num_seqs=4,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

@@ -505,115 +614,6 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
     )


-def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "Kwai-Keye/Keye-VL-8B-Preview"
-
-    engine_args = EngineArgs(
-        model=model_name,
-        trust_remote_code=True,
-        max_model_len=8192,
-        max_num_seqs=5,
-        limit_mm_per_prompt={"image": len(image_urls)},
-    )
-
-    placeholders = [{"type": "image", "image": url} for url in image_urls]
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                *placeholders,
-                {"type": "text", "text": question},
-            ],
-        },
-    ]
-
-    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
-
-    prompt = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-
-    image_data = [fetch_image(url) for url in image_urls]
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompt=prompt,
-        image_data=image_data,
-    )
-
-
-def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "Kwai-Keye/Keye-VL-1_5-8B"
-
-    engine_args = EngineArgs(
-        model=model_name,
-        trust_remote_code=True,
-        max_model_len=8192,
-        max_num_seqs=5,
-        limit_mm_per_prompt={"image": len(image_urls)},
-    )
-
-    placeholders = [{"type": "image", "image": url} for url in image_urls]
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                *placeholders,
-                {"type": "text", "text": question},
-            ],
-        },
-    ]
-
-    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
-
-    prompt = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-
-    image_data = [fetch_image(url) for url in image_urls]
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompt=prompt,
-        image_data=image_data,
-    )
-
-
-def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "moonshotai/Kimi-VL-A3B-Instruct"
-
-    engine_args = EngineArgs(
-        model=model_name,
-        trust_remote_code=True,
-        max_model_len=4096,
-        max_num_seqs=4,
-        limit_mm_per_prompt={"image": len(image_urls)},
-    )
-
-    placeholders = [{"type": "image", "image": url} for url in image_urls]
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                *placeholders,
-                {"type": "text", "text": question},
-            ],
-        }
-    ]
-
-    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
-
-    prompt = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
-    )
-
-
 def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

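As an aside (not part of the diff): each `load_*` helper above only builds a `ModelRequestData`; a small driver then turns it into a generation call. The following is a sketch assuming vLLM's dict-style multi-modal prompt input; the real example script wires this up through its own argument parsing.

```python
# Hypothetical driver for the multi-image helpers shown in the diff above.
from dataclasses import asdict

from vllm import LLM, SamplingParams


def run(question: str, image_urls: list[str]) -> None:
    req = load_kimi_vl(question, image_urls)  # any load_* helper from the diff
    llm = LLM(**asdict(req.engine_args))
    outputs = llm.generate(
        {
            "prompt": req.prompt,
            "multi_modal_data": {"image": req.image_data},
        },
        SamplingParams(max_tokens=128),
    )
    print(outputs[0].outputs[0].text)
```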
@@ -58,6 +58,30 @@ class ModelRequestData(NamedTuple):
     documents: Optional[ScoreMultiModalParam] = None


+def run_clip(query: Query) -> ModelRequestData:
+    if query["modality"] == "text":
+        prompt = query["text"]
+        image = None
+    elif query["modality"] == "image":
+        prompt = ""  # For image input, make sure that the prompt text is empty
+        image = query["image"]
+    else:
+        modality = query["modality"]
+        raise ValueError(f"Unsupported query modality: '{modality}'")
+
+    engine_args = EngineArgs(
+        model="openai/clip-vit-base-patch32",
+        runner="pooling",
+        limit_mm_per_prompt={"image": 1},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image=image,
+    )
+
+
 def run_e5_v(query: Query) -> ModelRequestData:
     llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"  # noqa: E501

@@ -89,7 +113,7 @@ def run_e5_v(query: Query) -> ModelRequestData:
 def _get_vlm2vec_prompt_image(query: Query, image_token: str):
     if query["modality"] == "text":
         text = query["text"]
-        prompt = f"Find me an everyday image that matches the given caption: {text}"  # noqa: E501
+        prompt = f"Find me an everyday image that matches the given caption: {text}"
         image = None
     elif query["modality"] == "image":
         prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image."  # noqa: E501
@@ -146,7 +170,8 @@ def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:

     processor = AutoProcessor.from_pretrained(
         model_id,
-        # `min_pixels` and `max_pixels` are deprecated
+        # `min_pixels` and `max_pixels` are deprecated for
+        # transformers `preprocessor_config.json`
         size={"shortest_edge": 3136, "longest_edge": 12845056},
     )
     processor.chat_template = load_chat_template(
@@ -172,8 +197,10 @@ def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:
         model=merged_path,
         runner="pooling",
         max_model_len=4096,
-        trust_remote_code=True,
-        mm_processor_kwargs={"num_crops": 4},
+        mm_processor_kwargs={
+            "min_pixels": 3136,
+            "max_pixels": 12845056,
+        },
         limit_mm_per_prompt={"image": 1},
     )

@@ -299,6 +326,7 @@ def run_score(model: str, modality: QueryModality, seed: Optional[int]):


 model_example_map = {
+    "clip": run_clip,
     "e5_v": run_e5_v,
     "vlm2vec_phi3v": run_vlm2vec_phi3v,
     "vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
@@ -203,9 +203,9 @@ class Proxy:
             async with session.post(
                 url=url, json=data, headers=headers
             ) as response:
-                if 200 <= response.status < 300 or 400 <= response.status < 500:  # noqa: E501
+                if 200 <= response.status < 300 or 400 <= response.status < 500:
                     if use_chunked:
-                        async for chunk_bytes in response.content.iter_chunked(  # noqa: E501
+                        async for chunk_bytes in response.content.iter_chunked(
                             1024
                         ):
                             yield chunk_bytes
@@ -1,14 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: E501
-"""Example Python client for multimodal embedding API using vLLM API server
-NOTE:
-    start a supported multimodal embeddings model server with `vllm serve`, e.g.
-    vllm serve TIGER-Lab/VLM2Vec-Full \
-        --runner pooling \
-        --trust-remote-code \
-        --max-model-len 4096 \
-        --chat-template examples/template_vlm2vec_phi3v.jinja
+"""Example Python client for multimodal embedding API using vLLM API server.
+
+Refer to each `run_*` function for the command to run the server for that model.
 """

 import argparse
@@ -47,7 +42,58 @@ def create_chat_embeddings(
     )


+def run_clip(client: OpenAI, model: str):
+    """
+    Start the server using:
+
+    vllm serve openai/clip-vit-base-patch32 \
+        --runner pooling
+    """
+
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Image embedding output:", response.data[0].embedding)
+
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "a photo of a cat"},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Text embedding output:", response.data[0].embedding)
+
+
 def run_vlm2vec(client: OpenAI, model: str):
+    """
+    Start the server using:
+
+    vllm serve TIGER-Lab/VLM2Vec-Full \
+        --runner pooling \
+        --trust-remote-code \
+        --max-model-len 4096 \
+        --chat-template examples/template_vlm2vec_phi3v.jinja
+    """
+
     response = create_chat_embeddings(
         client,
         messages=[
@@ -103,6 +149,15 @@ def run_vlm2vec(client: OpenAI, model: str):


 def run_dse_qwen2_vl(client: OpenAI, model: str):
+    """
+    Start the server using:
+
+    vllm serve MrLight/dse-qwen2-2b-mrl-v1 \
+        --runner pooling \
+        --trust-remote-code \
+        --max-model-len 8192 \
+        --chat-template examples/template_dse_qwen2_vl.jinja
+    """
     response = create_chat_embeddings(
         client,
         messages=[
@@ -156,6 +211,7 @@ def run_dse_qwen2_vl(client: OpenAI, model: str):


 model_example_map = {
+    "clip": run_clip,
     "vlm2vec": run_vlm2vec,
     "dse_qwen2_vl": run_dse_qwen2_vl,
 }
@@ -21,4 +21,4 @@ while IFS='=' read -r key value; do
 done < <(env | grep "^${PREFIX}")

 # Pass the collected arguments to the main entrypoint
-exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}"
+exec vllm serve "${ARGS[@]}"
@@ -21,8 +21,6 @@ from vllm.utils import FlexibleArgumentParser
 logger = logging.getLogger()


-# yapf conflicts with isort for this docstring
-# yapf: disable
 """
 tensorize_vllm_model.py is a script that can be used to serialize and
 deserialize vLLM models. These models can be loaded using tensorizer
@@ -132,7 +130,8 @@ def get_parser():
         "can be loaded using tensorizer directly to the GPU "
         "extremely quickly. Tensor encryption and decryption is "
         "also supported, although libsodium must be installed to "
-        "use it.")
+        "use it."
+    )
     parser = EngineArgs.add_cli_args(parser)

     parser.add_argument(
@@ -144,13 +143,14 @@ def get_parser():
         "along with the model by instantiating a TensorizerConfig object, "
         "creating a dict from it with TensorizerConfig.to_serializable(), "
         "and passing it to LoRARequest's initializer with the kwarg "
-        "tensorizer_config_dict."
+        "tensorizer_config_dict.",
     )

-    subparsers = parser.add_subparsers(dest='command', required=True)
+    subparsers = parser.add_subparsers(dest="command", required=True)

     serialize_parser = subparsers.add_parser(
-        'serialize', help="Serialize a model to `--serialized-directory`")
+        "serialize", help="Serialize a model to `--serialized-directory`"
+    )

     serialize_parser.add_argument(
         "--suffix",
@@ -163,7 +163,9 @@ def get_parser():
         "`--suffix` is `v1`, the serialized model tensors will be "
         "saved to "
         "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
-        "If none is provided, a random UUID will be used."))
+        "If none is provided, a random UUID will be used."
+        ),
+    )
     serialize_parser.add_argument(
         "--serialized-directory",
         type=str,
@@ -175,108 +177,127 @@ def get_parser():
         "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
         "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
         "where `suffix` is given by `--suffix` or a random UUID if not "
-        "provided.")
+        "provided.",
+    )

     serialize_parser.add_argument(
         "--serialization-kwargs",
         type=tensorizer_kwargs_arg,
         required=False,
-        help=("A JSON string containing additional keyword arguments to "
-              "pass to Tensorizer's TensorSerializer during "
-              "serialization."))
+        help=(
+            "A JSON string containing additional keyword arguments to "
+            "pass to Tensorizer's TensorSerializer during "
+            "serialization."
+        ),
+    )

     serialize_parser.add_argument(
         "--keyfile",
         type=str,
         required=False,
-        help=("Encrypt the model weights with a randomly-generated binary key,"
-              " and save the key at this path"))
+        help=(
+            "Encrypt the model weights with a randomly-generated binary key,"
+            " and save the key at this path"
+        ),
+    )

     deserialize_parser = subparsers.add_parser(
-        'deserialize',
-        help=("Deserialize a model from `--path-to-tensors`"
-              " to verify it can be loaded and used."))
+        "deserialize",
+        help=(
+            "Deserialize a model from `--path-to-tensors`"
+            " to verify it can be loaded and used."
+        ),
+    )

     deserialize_parser.add_argument(
         "--path-to-tensors",
         type=str,
         required=False,
-        help="The local path or S3 URI to the model tensors to deserialize. ")
+        help="The local path or S3 URI to the model tensors to deserialize. ",
+    )

     deserialize_parser.add_argument(
         "--serialized-directory",
         type=str,
         required=False,
         help="Directory with model artifacts for loading. Assumes a "
         "model.tensors file exists therein. Can supersede "
-        "--path-to-tensors.")
+        "--path-to-tensors.",
+    )

     deserialize_parser.add_argument(
         "--keyfile",
         type=str,
         required=False,
-        help=("Path to a binary key to use to decrypt the model weights,"
-              " if the model was serialized with encryption"))
+        help=(
+            "Path to a binary key to use to decrypt the model weights,"
+            " if the model was serialized with encryption"
+        ),
+    )

     deserialize_parser.add_argument(
         "--deserialization-kwargs",
         type=tensorizer_kwargs_arg,
         required=False,
-        help=("A JSON string containing additional keyword arguments to "
-              "pass to Tensorizer's `TensorDeserializer` during "
-              "deserialization."))
+        help=(
+            "A JSON string containing additional keyword arguments to "
+            "pass to Tensorizer's `TensorDeserializer` during "
+            "deserialization."
+        ),
+    )

     TensorizerArgs.add_cli_args(deserialize_parser)

     return parser


-def merge_extra_config_with_tensorizer_config(extra_cfg: dict,
-                                              cfg: TensorizerConfig):
+def merge_extra_config_with_tensorizer_config(extra_cfg: dict, cfg: TensorizerConfig):
     for k, v in extra_cfg.items():
         if hasattr(cfg, k):
             setattr(cfg, k, v)
             logger.info(
                 "Updating TensorizerConfig with %s from "
-                "--model-loader-extra-config provided", k
+                "--model-loader-extra-config provided",
+                k,
             )


 def deserialize(args, tensorizer_config):
     if args.lora_path:
         tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
-        llm = LLM(model=args.model,
-                  load_format="tensorizer",
-                  tensor_parallel_size=args.tensor_parallel_size,
-                  model_loader_extra_config=tensorizer_config,
-                  enable_lora=True,
+        llm = LLM(
+            model=args.model,
+            load_format="tensorizer",
+            tensor_parallel_size=args.tensor_parallel_size,
+            model_loader_extra_config=tensorizer_config,
+            enable_lora=True,
         )
         sampling_params = SamplingParams(
-            temperature=0,
-            max_tokens=256,
-            stop=["[/assistant]"]
+            temperature=0, max_tokens=256, stop=["[/assistant]"]
         )

         # Truncating this as the extra text isn't necessary
-        prompts = [
-            "[user] Write a SQL query to answer the question based on ..."
-        ]
+        prompts = ["[user] Write a SQL query to answer the question based on ..."]

         # Test LoRA load
         print(
             llm.generate(
                 prompts,
                 sampling_params,
-                lora_request=LoRARequest("sql-lora",
-                                         1,
-                                         args.lora_path,
-                                         tensorizer_config_dict = tensorizer_config
-                                         .to_serializable())
+                lora_request=LoRARequest(
+                    "sql-lora",
+                    1,
+                    args.lora_path,
+                    tensorizer_config_dict=tensorizer_config.to_serializable(),
+                ),
             )
         )
     else:
-        llm = LLM(model=args.model,
-                  load_format="tensorizer",
-                  tensor_parallel_size=args.tensor_parallel_size,
-                  model_loader_extra_config=tensorizer_config
+        llm = LLM(
+            model=args.model,
+            load_format="tensorizer",
+            tensor_parallel_size=args.tensor_parallel_size,
+            model_loader_extra_config=tensorizer_config,
         )
     return llm

@@ -285,17 +306,20 @@ def main():
     parser = get_parser()
     args = parser.parse_args()

-    s3_access_key_id = (getattr(args, 's3_access_key_id', None)
-                        or os.environ.get("S3_ACCESS_KEY_ID", None))
-    s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
-                            or os.environ.get("S3_SECRET_ACCESS_KEY", None))
-    s3_endpoint = (getattr(args, 's3_endpoint', None)
-                   or os.environ.get("S3_ENDPOINT_URL", None))
+    s3_access_key_id = getattr(args, "s3_access_key_id", None) or os.environ.get(
+        "S3_ACCESS_KEY_ID", None
+    )
+    s3_secret_access_key = getattr(
+        args, "s3_secret_access_key", None
+    ) or os.environ.get("S3_SECRET_ACCESS_KEY", None)
+    s3_endpoint = getattr(args, "s3_endpoint", None) or os.environ.get(
+        "S3_ENDPOINT_URL", None
+    )

     credentials = {
         "s3_access_key_id": s3_access_key_id,
         "s3_secret_access_key": s3_secret_access_key,
-        "s3_endpoint": s3_endpoint
+        "s3_endpoint": s3_endpoint,
     }

     model_ref = args.model
@@ -309,25 +333,25 @@ def main():
     if args.model_loader_extra_config:
         extra_config = json.loads(args.model_loader_extra_config)

-    tensorizer_dir = (args.serialized_directory or
-                      extra_config.get("tensorizer_dir"))
-    tensorizer_uri = (getattr(args, "path_to_tensors", None)
-                      or extra_config.get("tensorizer_uri"))
+    tensorizer_dir = args.serialized_directory or extra_config.get("tensorizer_dir")
+    tensorizer_uri = getattr(args, "path_to_tensors", None) or extra_config.get(
+        "tensorizer_uri"
+    )

     if tensorizer_dir and tensorizer_uri:
-        parser.error("--serialized-directory and --path-to-tensors "
-                     "cannot both be provided")
+        parser.error(
+            "--serialized-directory and --path-to-tensors cannot both be provided"
+        )

     if not tensorizer_dir and not tensorizer_uri:
-        parser.error("Either --serialized-directory or --path-to-tensors "
-                     "must be provided")
+        parser.error(
+            "Either --serialized-directory or --path-to-tensors must be provided"
+        )

     if args.command == "serialize":
         engine_args = EngineArgs.from_cli_args(args)

-        input_dir = tensorizer_dir.rstrip('/')
+        input_dir = tensorizer_dir.rstrip("/")
         suffix = args.suffix if args.suffix else uuid.uuid4().hex
         base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
         if engine_args.tensor_parallel_size > 1:
@@ -339,15 +363,14 @@ def main():
             tensorizer_uri=model_path,
             encryption_keyfile=keyfile,
             serialization_kwargs=args.serialization_kwargs or {},
-            **credentials
+            **credentials,
         )

         if args.lora_path:
             tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
             tensorize_lora_adapter(args.lora_path, tensorizer_config)

-        merge_extra_config_with_tensorizer_config(extra_config,
-                                                  tensorizer_config)
+        merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
         tensorize_vllm_model(engine_args, tensorizer_config)

     elif args.command == "deserialize":
@@ -356,11 +379,10 @@ def main():
             tensorizer_dir=args.serialized_directory,
             encryption_keyfile=keyfile,
             deserialization_kwargs=args.deserialization_kwargs or {},
-            **credentials
+            **credentials,
         )

-        merge_extra_config_with_tensorizer_config(extra_config,
-                                                  tensorizer_config)
+        merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
         deserialize(args, tensorizer_config)
     else:
         raise ValueError("Either serialize or deserialize must be specified.")
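Outside of the diff itself, the control flow this script reformats stays the same: build a `TensorizerConfig`, serialize with `tensorize_vllm_model`, then load the result with `load_format="tensorizer"`. A rough sketch under those assumptions follows; paths and the model name are placeholders, and the import locations mirror the ones used by this script.

```python
# Minimal sketch of the serialize-then-load flow used by tensorize_vllm_model.py.
# Paths and the model name are placeholders; adjust to your environment.
from vllm import LLM, EngineArgs
from vllm.model_executor.model_loader.tensorizer import (
    TensorizerConfig,
    tensorize_vllm_model,
)

model_ref = "facebook/opt-125m"
tensors_path = "/tmp/vllm/opt-125m/v1/model.tensors"

# 1) Serialize the model weights to `tensors_path`.
tensorizer_config = TensorizerConfig(tensorizer_uri=tensors_path)
tensorize_vllm_model(EngineArgs(model=model_ref), tensorizer_config)

# 2) Deserialize: point the model loader at the serialized tensors.
llm = LLM(
    model=model_ref,
    load_format="tensorizer",
    model_loader_extra_config=tensorizer_config,
)
print(llm.generate(["Hello"])[0].outputs[0].text)
```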
@@ -1,54 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-exclude = [
-    # External file, leaving license intact
-    "examples/other/fp8/quantizer/quantize.py",
-    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
-]
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
-    # pycodestyle
-    "E",
-    # Pyflakes
-    "F",
-    # pyupgrade
-    "UP",
-    # flake8-bugbear
-    "B",
-    # flake8-simplify
-    "SIM",
-    # isort
-    "I",
-    # flake8-logging-format
-    "G",
-]
-ignore = [
-    # star imports
-    "F405", "F403",
-    # lambda expression assignment
-    "E731",
-    # Loop control variable not used within loop body
-    "B007",
-    # f-string format
-    "UP032",
-    # Can remove once 3.10+ is the minimum Python version
-    "UP007",
-]
-
-[tool.ruff.lint.isort]
-known-first-party = ["vllm"]
-
-[tool.ruff.format]
-docstring-code-format = true
@@ -52,27 +52,10 @@ lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:regi
 where = ["."]
 include = ["vllm*"]

-[tool.yapfignore]
-ignore_patterns = [
-    ".buildkite/**",
-    "benchmarks/**",
-    "build/**",
-    "examples/**",
-]
-
-[tool.ruff]
-# Allow lines to be as long as 80.
-line-length = 80
-
 [tool.ruff.lint.per-file-ignores]
 "vllm/third_party/**" = ["ALL"]
 "vllm/version.py" = ["F401"]
 "vllm/_version.py" = ["ALL"]
-# Python 3.8 typing - skip V0 code
-"vllm/attention/**/*.py" = ["UP006", "UP035"]
-"vllm/engine/**/*.py" = ["UP006", "UP035"]
-"vllm/executor/**/*.py" = ["UP006", "UP035"]
-"vllm/worker/**/*.py" = ["UP006", "UP035"]

 [tool.ruff.lint]
 select = [
@@ -87,7 +70,7 @@ select = [
     # flake8-simplify
     "SIM",
     # isort
-    # "I",
+    "I",
     # flake8-logging-format
     "G",
 ]
@@ -104,21 +87,15 @@ ignore = [
     "UP007",
 ]

+[tool.ruff.format]
+docstring-code-format = true
+
 [tool.mypy]
 plugins = ['pydantic.mypy']
 ignore_missing_imports = true
 check_untyped_defs = true
 follow_imports = "silent"

-[tool.isort]
-skip_glob = [
-    ".buildkite/*",
-    "benchmarks/*",
-    "examples/*",
-]
-use_parentheses = true
-skip_gitignore = true
-
 [tool.pytest.ini_options]
 markers = [
     "slow_test",
@@ -49,3 +49,4 @@ pybase64 # fast base64 implementation
 cbor2 # Required for cross-language serialization of hashable objects
 setproctitle # Used to set process names for better debugging and monitoring
 openai-harmony >= 0.0.3 # Required for gpt-oss
+gpt-oss >= 0.0.7
@@ -1,2 +1,2 @@
 lmcache
-nixl >= 0.5.1 # Required for disaggregated prefill
+nixl >= 0.6.0 # Required for disaggregated prefill
260 setup.py
@@ -34,32 +34,36 @@ logger = logging.getLogger(__name__)

 # cannot import envs directly because it depends on vllm,
 # which is not installed yet
-envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
+envs = load_module_from_path("envs", os.path.join(ROOT_DIR, "vllm", "envs.py"))

 VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE

 if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
-    logger.warning(
-        "VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
+    logger.warning("VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
     VLLM_TARGET_DEVICE = "cpu"
-elif not (sys.platform.startswith("linux")
-          or sys.platform.startswith("darwin")):
+elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin")):
     logger.warning(
         "vLLM only supports Linux platform (including WSL) and MacOS."
         "Building on %s, "
-        "so vLLM may not be able to run correctly", sys.platform)
+        "so vLLM may not be able to run correctly",
+        sys.platform,
+    )
     VLLM_TARGET_DEVICE = "empty"
-elif (sys.platform.startswith("linux") and torch.version.cuda is None
-      and os.getenv("VLLM_TARGET_DEVICE") is None
-      and torch.version.hip is None):
+elif (
+    sys.platform.startswith("linux")
+    and torch.version.cuda is None
+    and os.getenv("VLLM_TARGET_DEVICE") is None
+    and torch.version.hip is None
+):
     # if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
     # fallback to cpu
     VLLM_TARGET_DEVICE = "cpu"


 def is_sccache_available() -> bool:
-    return which("sccache") is not None and \
-        not bool(int(os.getenv("VLLM_DISABLE_SCCACHE", "0")))
+    return which("sccache") is not None and not bool(
+        int(os.getenv("VLLM_DISABLE_SCCACHE", "0"))
+    )


 def is_ccache_available() -> bool:
@@ -83,8 +87,7 @@ def is_url_available(url: str) -> bool:


 class CMakeExtension(Extension):
-
-    def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None:
+    def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None:
         super().__init__(name, sources=[], py_limited_api=True, **kwa)
         self.cmake_lists_dir = os.path.abspath(cmake_lists_dir)

@@ -121,8 +124,8 @@ class cmake_build_ext(build_ext):
         if nvcc_threads is not None:
             nvcc_threads = int(nvcc_threads)
             logger.info(
-                "Using NVCC_THREADS=%d as the number of nvcc threads.",
-                nvcc_threads)
+                "Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads
+            )
         else:
             nvcc_threads = 1
         num_jobs = max(1, num_jobs // nvcc_threads)
@@ -146,36 +149,36 @@ class cmake_build_ext(build_ext):
         cfg = envs.CMAKE_BUILD_TYPE or default_cfg

         cmake_args = [
-            '-DCMAKE_BUILD_TYPE={}'.format(cfg),
-            '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE),
+            "-DCMAKE_BUILD_TYPE={}".format(cfg),
+            "-DVLLM_TARGET_DEVICE={}".format(VLLM_TARGET_DEVICE),
         ]

         verbose = envs.VERBOSE
         if verbose:
-            cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON']
+            cmake_args += ["-DCMAKE_VERBOSE_MAKEFILE=ON"]

         if is_sccache_available():
             cmake_args += [
-                '-DCMAKE_C_COMPILER_LAUNCHER=sccache',
-                '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
-                '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
-                '-DCMAKE_HIP_COMPILER_LAUNCHER=sccache',
+                "-DCMAKE_C_COMPILER_LAUNCHER=sccache",
+                "-DCMAKE_CXX_COMPILER_LAUNCHER=sccache",
+                "-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache",
+                "-DCMAKE_HIP_COMPILER_LAUNCHER=sccache",
             ]
         elif is_ccache_available():
             cmake_args += [
-                '-DCMAKE_C_COMPILER_LAUNCHER=ccache',
-                '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache',
-                '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache',
-                '-DCMAKE_HIP_COMPILER_LAUNCHER=ccache',
+                "-DCMAKE_C_COMPILER_LAUNCHER=ccache",
+                "-DCMAKE_CXX_COMPILER_LAUNCHER=ccache",
+                "-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache",
+                "-DCMAKE_HIP_COMPILER_LAUNCHER=ccache",
             ]

         # Pass the python executable to cmake so it can find an exact
         # match.
-        cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)]
+        cmake_args += ["-DVLLM_PYTHON_EXECUTABLE={}".format(sys.executable)]

         # Pass the python path to cmake so it can reuse the build dependencies
         # on subsequent calls to python.
-        cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))]
+        cmake_args += ["-DVLLM_PYTHON_PATH={}".format(":".join(sys.path))]

         # Override the base directory for FetchContent downloads to $ROOT/.deps
         # This allows sharing dependencies between profiles,
@@ -183,7 +186,7 @@ class cmake_build_ext(build_ext):
         # To override this, set the FETCHCONTENT_BASE_DIR environment variable.
         fc_base_dir = os.path.join(ROOT_DIR, ".deps")
         fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir)
-        cmake_args += ['-DFETCHCONTENT_BASE_DIR={}'.format(fc_base_dir)]
+        cmake_args += ["-DFETCHCONTENT_BASE_DIR={}".format(fc_base_dir)]

         #
         # Setup parallelism and build tool
@@ -191,30 +194,36 @@ class cmake_build_ext(build_ext):
         num_jobs, nvcc_threads = self.compute_num_jobs()

         if nvcc_threads:
-            cmake_args += ['-DNVCC_THREADS={}'.format(nvcc_threads)]
+            cmake_args += ["-DNVCC_THREADS={}".format(nvcc_threads)]

         if is_ninja_available():
-            build_tool = ['-G', 'Ninja']
+            build_tool = ["-G", "Ninja"]
             cmake_args += [
-                '-DCMAKE_JOB_POOL_COMPILE:STRING=compile',
-                '-DCMAKE_JOB_POOLS:STRING=compile={}'.format(num_jobs),
+                "-DCMAKE_JOB_POOL_COMPILE:STRING=compile",
+                "-DCMAKE_JOB_POOLS:STRING=compile={}".format(num_jobs),
             ]
         else:
             # Default build tool to whatever cmake picks.
             build_tool = []
         # Make sure we use the nvcc from CUDA_HOME
         if _is_cuda():
-            cmake_args += [f'-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc']
+            cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
+
+        other_cmake_args = os.environ.get("CMAKE_ARGS")
+        if other_cmake_args:
+            cmake_args += other_cmake_args.split()

         subprocess.check_call(
-            ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args],
-            cwd=self.build_temp)
+            ["cmake", ext.cmake_lists_dir, *build_tool, *cmake_args],
+            cwd=self.build_temp,
+        )

     def build_extensions(self) -> None:
         # Ensure that CMake is present and working
         try:
-            subprocess.check_output(['cmake', '--version'])
+            subprocess.check_output(["cmake", "--version"])
         except OSError as e:
-            raise RuntimeError('Cannot find CMake executable') from e
+            raise RuntimeError("Cannot find CMake executable") from e

         # Create build directory if it does not exist.
         if not os.path.exists(self.build_temp):
@@ -253,13 +262,18 @@ class cmake_build_ext(build_ext):
         # CMake appends the extension prefix to the install path,
         # and outdir already contains that prefix, so we need to remove it.
         prefix = outdir
-        for _ in range(ext.name.count('.')):
+        for _ in range(ext.name.count(".")):
             prefix = prefix.parent

         # prefix here should actually be the same for all components
         install_args = [
-            "cmake", "--install", ".", "--prefix", prefix, "--component",
-            target_name(ext.name)
+            "cmake",
+            "--install",
+            ".",
+            "--prefix",
+            prefix,
+            "--component",
+            target_name(ext.name),
         ]
         subprocess.check_call(install_args, cwd=self.build_temp)

@@ -270,12 +284,15 @@ class cmake_build_ext(build_ext):
         # copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current
         # directory so that they can be included in the editable build
         import glob
-        files = glob.glob(os.path.join(self.build_lib, "vllm",
-                                       "vllm_flash_attn", "**", "*.py"),
-                          recursive=True)
+
+        files = glob.glob(
+            os.path.join(self.build_lib, "vllm", "vllm_flash_attn", "**", "*.py"),
+            recursive=True,
+        )
         for file in files:
-            dst_file = os.path.join("vllm/vllm_flash_attn",
-                                    file.split("vllm/vllm_flash_attn/")[-1])
+            dst_file = os.path.join(
+                "vllm/vllm_flash_attn", file.split("vllm/vllm_flash_attn/")[-1]
+            )
             print(f"Copying {file} to {dst_file}")
             os.makedirs(os.path.dirname(dst_file), exist_ok=True)
             self.copy_file(file, dst_file)
@@ -285,8 +302,7 @@ class precompiled_build_ext(build_ext):
     """Disables extension building when using precompiled binaries."""

     def run(self) -> None:
-        assert _is_cuda(
-        ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
+        assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"

     def build_extensions(self) -> None:
         print("Skipping build_ext: using precompiled extensions.")
@@ -307,9 +323,9 @@ class precompiled_wheel_utils:
             wheel_filename = wheel_url_or_path.split("/")[-1]
             temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
             wheel_path = os.path.join(temp_dir, wheel_filename)
-            print(f"Downloading wheel from {wheel_url_or_path} "
-                  f"to {wheel_path}")
+            print(f"Downloading wheel from {wheel_url_or_path} to {wheel_path}")
             from urllib.request import urlretrieve

             urlretrieve(wheel_url_or_path, filename=wheel_path)
         else:
             wheel_path = wheel_url_or_path
@@ -330,25 +346,29 @@ class precompiled_wheel_utils:
             ]

             compiled_regex = re.compile(
-                r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
+                r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
+            )
             file_members = list(
-                filter(lambda x: x.filename in files_to_copy,
-                       wheel.filelist))
+                filter(lambda x: x.filename in files_to_copy, wheel.filelist)
+            )
             file_members += list(
-                filter(lambda x: compiled_regex.match(x.filename),
-                       wheel.filelist))
+                filter(lambda x: compiled_regex.match(x.filename), wheel.filelist)
+            )

             for file in file_members:
                 print(f"[extract] {file.filename}")
                 target_path = os.path.join(".", file.filename)
                 os.makedirs(os.path.dirname(target_path), exist_ok=True)
-                with wheel.open(file.filename) as src, open(
-                        target_path, "wb") as dst:
+                with (
+                    wheel.open(file.filename) as src,
+                    open(target_path, "wb") as dst,
+                ):
                     shutil.copyfileobj(src, dst)

                 pkg = os.path.dirname(file.filename).replace("/", ".")
                 package_data_patch.setdefault(pkg, []).append(
-                    os.path.basename(file.filename))
+                    os.path.basename(file.filename)
+                )

             return package_data_patch
         finally:
@@ -364,10 +384,13 @@ class precompiled_wheel_utils:

         try:
             # Get the latest commit hash of the upstream main branch.
-            resp_json = subprocess.check_output([
-                "curl", "-s",
-                "https://api.github.com/repos/vllm-project/vllm/commits/main"
-            ]).decode("utf-8")
+            resp_json = subprocess.check_output(
+                [
+                    "curl",
+                    "-s",
+                    "https://api.github.com/repos/vllm-project/vllm/commits/main",
+                ]
+            ).decode("utf-8")
             upstream_main_commit = json.loads(resp_json)["sha"]

             # In Docker build context, .git may be immutable or missing.
@@ -377,25 +400,32 @@ class precompiled_wheel_utils:
             # Check if the upstream_main_commit exists in the local repo
             try:
                 subprocess.check_output(
-                    ["git", "cat-file", "-e", f"{upstream_main_commit}"])
+                    ["git", "cat-file", "-e", f"{upstream_main_commit}"]
+                )
             except subprocess.CalledProcessError:
                 # If not present, fetch it from the remote repository.
                 # Note that this does not update any local branches,
                 # but ensures that this commit ref and its history are
                 # available in our local repo.
-                subprocess.check_call([
-                    "git", "fetch", "https://github.com/vllm-project/vllm",
-                    "main"
-                ])
+                subprocess.check_call(
+                    ["git", "fetch", "https://github.com/vllm-project/vllm", "main"]
+                )

             # Then get the commit hash of the current branch that is the same as
             # the upstream main commit.
-            current_branch = subprocess.check_output(
-                ["git", "branch", "--show-current"]).decode("utf-8").strip()
+            current_branch = (
+                subprocess.check_output(["git", "branch", "--show-current"])
+                .decode("utf-8")
+                .strip()
+            )

-            base_commit = subprocess.check_output([
-                "git", "merge-base", f"{upstream_main_commit}", current_branch
-            ]).decode("utf-8").strip()
+            base_commit = (
+                subprocess.check_output(
+                    ["git", "merge-base", f"{upstream_main_commit}", current_branch]
+                )
+                .decode("utf-8")
+                .strip()
+            )
             return base_commit
         except ValueError as err:
             raise ValueError(err) from None
@@ -403,7 +433,9 @@ class precompiled_wheel_utils:
             logger.warning(
                 "Failed to get the base commit in the main branch. "
                 "Using the nightly wheel. The libraries in this "
-                "wheel may not be compatible with your dev branch: %s", err)
+                "wheel may not be compatible with your dev branch: %s",
+                err,
+            )
             return "nightly"

@@ -413,12 +445,13 @@ def _no_device() -> bool:

 def _is_cuda() -> bool:
     has_cuda = torch.version.cuda is not None
-    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda and not _is_tpu())
+    return VLLM_TARGET_DEVICE == "cuda" and has_cuda and not _is_tpu()


 def _is_hip() -> bool:
-    return (VLLM_TARGET_DEVICE == "cuda"
-            or VLLM_TARGET_DEVICE == "rocm") and torch.version.hip is not None
+    return (
+        VLLM_TARGET_DEVICE == "cuda" or VLLM_TARGET_DEVICE == "rocm"
+    ) and torch.version.hip is not None


 def _is_tpu() -> bool:
@@ -457,8 +490,12 @@ def get_rocm_version():
         minor = ctypes.c_uint32()
         patch = ctypes.c_uint32()

-        if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
-                                  ctypes.byref(patch)) == 0):
+        if (
+            get_rocm_core_version(
+                ctypes.byref(major), ctypes.byref(minor), ctypes.byref(patch)
+            )
+            == 0
+        ):
             return f"{major.value}.{minor.value}.{patch.value}"
         return None
     except Exception:
@@ -471,8 +508,9 @@ def get_nvcc_cuda_version() -> Version:
     Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
     """
     assert CUDA_HOME is not None, "CUDA_HOME is not set"
-    nvcc_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"],
-                                          universal_newlines=True)
+    nvcc_output = subprocess.check_output(
+        [CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True
+    )
     output = nvcc_output.split()
     release_idx = output.index("release") + 1
     nvcc_cuda_version = parse(output[release_idx].split(",")[0])
@@ -484,14 +522,20 @@ def get_gaudi_sw_version():
     Returns the driver version.
     """
     # Enable console printing for `hl-smi` check
-    output = subprocess.run("hl-smi",
-                            shell=True,
-                            text=True,
-                            capture_output=True,
-                            env={"ENABLE_CONSOLE": "true"})
+    output = subprocess.run(
+        "hl-smi",
+        shell=True,
+        text=True,
+        capture_output=True,
+        env={"ENABLE_CONSOLE": "true"},
+    )
     if output.returncode == 0 and output.stdout:
-        return output.stdout.split("\n")[2].replace(
-            " ", "").split(":")[1][:-1].split("-")[0]
+        return (
+            output.stdout.split("\n")[2]
+            .replace(" ", "")
+            .split(":")[1][:-1]
+            .split("-")[0]
+        )
     return "0.0.0"  # when hl-smi is not available

@@ -541,8 +585,11 @@ def get_requirements() -> list[str]:
         for line in requirements:
             if line.startswith("-r "):
                 resolved_requirements += _read_requirements(line.split()[1])
-            elif not line.startswith("--") and not line.startswith(
-                    "#") and line.strip() != "":
+            elif (
+                not line.startswith("--")
+                and not line.startswith("#")
+                and line.strip() != ""
+            ):
                 resolved_requirements.append(line)
         return resolved_requirements

@@ -553,7 +600,7 @@ def get_requirements() -> list[str]:
         cuda_major, cuda_minor = torch.version.cuda.split(".")
         modified_requirements = []
         for req in requirements:
-            if ("vllm-flash-attn" in req and cuda_major != "12"):
+            if "vllm-flash-attn" in req and cuda_major != "12":
                 # vllm-flash-attn is built only for CUDA 12.x.
                 # Skip for other versions.
                 continue
@@ -568,8 +615,7 @@ def get_requirements() -> list[str]:
     elif _is_xpu():
         requirements = _read_requirements("xpu.txt")
     else:
-        raise ValueError(
-            "Unsupported platform, please use CUDA, ROCm, or CPU.")
+        raise ValueError("Unsupported platform, please use CUDA, ROCm, or CPU.")
     return requirements


@@ -585,14 +631,13 @@ if _is_cuda():
     ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
     if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
         # FA3 requires CUDA 12.3 or later
-        ext_modules.append(
-            CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
+        ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
         # Optional since this doesn't get built (produce an .so file) when
         # not targeting a hopper system
-        ext_modules.append(
-            CMakeExtension(name="vllm._flashmla_C", optional=True))
-        ext_modules.append(
-            CMakeExtension(name="vllm._flashmla_extension_C", optional=True))
+        ext_modules.append(CMakeExtension(name="vllm._flashmla_C", optional=True))
+        ext_modules.append(
+            CMakeExtension(name="vllm._flashmla_extension_C", optional=True)
+        )
     ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))

 if _build_custom_ops():
@@ -614,6 +659,7 @@ if envs.VLLM_USE_PRECOMPILED:
         wheel_url = wheel_location
     else:
         import platform
+
         arch = platform.machine()
         if arch == "x86_64":
             wheel_tag = "manylinux1_x86_64"
@@ -623,8 +669,11 @@ if envs.VLLM_USE_PRECOMPILED:
             raise ValueError(f"Unsupported architecture: {arch}")
         base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
         wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
-        nightly_wheel_url = f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
+        nightly_wheel_url = (
+            f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
+        )
         from urllib.request import urlopen
+
         try:
             with urlopen(wheel_url) as resp:
                 if resp.status != 200:
@@ -633,8 +682,7 @@ if envs.VLLM_USE_PRECOMPILED:
             print(f"[warn] Falling back to nightly wheel: {e}")
             wheel_url = nightly_wheel_url

-    patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
-        wheel_url)
+    patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(wheel_url)
     for pkg, files in patch.items():
         package_data.setdefault(pkg, []).extend(files)

@@ -645,8 +693,9 @@ if not ext_modules:
     cmdclass = {}
 else:
     cmdclass = {
-        "build_ext":
-        precompiled_build_ext if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
+        "build_ext": precompiled_build_ext
+        if envs.VLLM_USE_PRECOMPILED
+        else cmake_build_ext
     }

 setup(
@@ -659,8 +708,11 @@ setup(
         "tensorizer": ["tensorizer==2.10.1"],
         "fastsafetensors": ["fastsafetensors >= 0.1.10"],
         "runai": ["runai-model-streamer[s3,gcs] >= 0.14.0"],
-        "audio": ["librosa", "soundfile",
-                  "mistral_common[audio]"], # Required for audio processing
+        "audio": [
+            "librosa",
+            "soundfile",
+            "mistral_common[audio]",
+        ], # Required for audio processing
         "video": [], # Kept for backwards compatibility
         # FlashInfer should be updated together with the Dockerfile
         "flashinfer": ["flashinfer-python==0.3.1"],
@ -4,6 +4,7 @@
|
|||||||
|
|
||||||
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
|
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import weakref
|
import weakref
|
||||||
from unittest.mock import Mock
|
from unittest.mock import Mock
|
||||||
@ -37,16 +38,21 @@ def test_vllm_gc_ed():
|
|||||||
|
|
||||||
|
|
||||||
def _fix_prompt_embed_outputs(
|
def _fix_prompt_embed_outputs(
|
||||||
vllm_outputs: list[tuple[list[int], str]], hf_model: HfRunner,
|
vllm_outputs: list[tuple[list[int], str]],
|
||||||
example_prompts: list[str]) -> list[tuple[list[int], str]]:
|
hf_model: HfRunner,
|
||||||
|
example_prompts: list[str],
|
||||||
|
) -> list[tuple[list[int], str]]:
|
||||||
fixed_vllm_outputs = []
|
fixed_vllm_outputs = []
|
||||||
for vllm_output, hf_input, prompt in zip(
|
for vllm_output, hf_input, prompt in zip(
|
||||||
vllm_outputs, hf_model.get_inputs(example_prompts),
|
vllm_outputs, hf_model.get_inputs(example_prompts), example_prompts
|
||||||
example_prompts):
|
):
|
||||||
hf_input_ids = hf_input["input_ids"].tolist()[0]
|
hf_input_ids = hf_input["input_ids"].tolist()[0]
|
||||||
fixed_vllm_outputs.append(
|
fixed_vllm_outputs.append(
|
||||||
(hf_input_ids + vllm_output[0][len(hf_input_ids):],
|
(
|
||||||
prompt + vllm_output[1]))
|
hf_input_ids + vllm_output[0][len(hf_input_ids) :],
|
||||||
|
prompt + vllm_output[1],
|
||||||
|
)
|
||||||
|
)
|
||||||
return fixed_vllm_outputs
|
return fixed_vllm_outputs
|
||||||
|
|
||||||
|
|
||||||
@ -69,8 +75,7 @@ def test_models(
|
|||||||
enable_prompt_embeds: bool,
|
enable_prompt_embeds: bool,
|
||||||
) -> None:
|
) -> None:
|
||||||
if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
|
if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
|
||||||
pytest.skip(
|
pytest.skip(f"{backend} does not support gemma2 with full context length.")
|
||||||
f"{backend} does not support gemma2 with full context length.")
|
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_ATTENTION_BACKEND", backend)
|
m.setenv("VLLM_ATTENTION_BACKEND", backend)
|
||||||
@ -78,34 +83,35 @@ def test_models(
|
|||||||
# 5042 tokens for gemma2
|
# 5042 tokens for gemma2
|
||||||
# gemma2 has alternating sliding window size of 4096
|
# gemma2 has alternating sliding window size of 4096
|
||||||
# we need a prompt with more than 4096 tokens to test the sliding window
|
# we need a prompt with more than 4096 tokens to test the sliding window
|
||||||
prompt = "The following numbers of the sequence " + ", ".join(
|
prompt = (
|
||||||
str(i) for i in range(1024)) + " are:"
|
"The following numbers of the sequence "
|
||||||
|
+ ", ".join(str(i) for i in range(1024))
|
||||||
|
+ " are:"
|
||||||
|
)
|
||||||
example_prompts = [prompt]
|
example_prompts = [prompt]
|
||||||
|
|
||||||
with hf_runner(model) as hf_model:
|
with hf_runner(model) as hf_model:
|
||||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||||
if enable_prompt_embeds:
|
if enable_prompt_embeds:
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
prompt_embeds = hf_model.get_prompt_embeddings(
|
prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
|
||||||
example_prompts)
|
|
||||||
|
|
||||||
with VllmRunner(
|
with VllmRunner(
|
||||||
model,
|
model,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
enforce_eager=enforce_eager,
|
enforce_eager=enforce_eager,
|
||||||
enable_prompt_embeds=enable_prompt_embeds,
|
enable_prompt_embeds=enable_prompt_embeds,
|
||||||
gpu_memory_utilization=0.7,
|
gpu_memory_utilization=0.7,
|
||||||
async_scheduling=async_scheduling,
|
async_scheduling=async_scheduling,
|
||||||
distributed_executor_backend=model_executor,
|
distributed_executor_backend=model_executor,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
if enable_prompt_embeds:
|
if enable_prompt_embeds:
|
||||||
vllm_outputs = vllm_model.generate_greedy(
|
vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
|
||||||
prompt_embeds, max_tokens)
|
|
||||||
vllm_outputs = _fix_prompt_embed_outputs(
|
vllm_outputs = _fix_prompt_embed_outputs(
|
||||||
vllm_outputs, hf_model, example_prompts)
|
vllm_outputs, hf_model, example_prompts
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
vllm_outputs = vllm_model.generate_greedy(
|
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
example_prompts, max_tokens)
|
|
||||||
|
|
||||||
check_outputs_equal(
|
check_outputs_equal(
|
||||||
outputs_0_lst=hf_outputs,
|
outputs_0_lst=hf_outputs,
|
||||||
@ -117,21 +123,18 @@ def test_models(
|
|||||||
|
|
||||||
@multi_gpu_test(num_gpus=2)
|
@multi_gpu_test(num_gpus=2)
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"model, distributed_executor_backend, attention_backend, "
|
"model, distributed_executor_backend, attention_backend, test_suite, extra_env",
|
||||||
"test_suite, extra_env", [
|
[
|
||||||
("distilbert/distilgpt2", "ray", "", "L4", {}),
|
("distilbert/distilgpt2", "ray", "", "L4", {}),
|
||||||
("distilbert/distilgpt2", "mp", "", "L4", {}),
|
("distilbert/distilgpt2", "mp", "", "L4", {}),
|
||||||
("distilbert/distilgpt2", "ray", "", "L4", {
|
("distilbert/distilgpt2", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
|
||||||
"VLLM_SLEEP_WHEN_IDLE": "1"
|
("distilbert/distilgpt2", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
|
||||||
}),
|
|
||||||
("distilbert/distilgpt2", "mp", "", "L4", {
|
|
||||||
"VLLM_SLEEP_WHEN_IDLE": "1"
|
|
||||||
}),
|
|
||||||
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
|
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
|
||||||
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
|
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
|
||||||
("distilbert/distilgpt2", "ray", "", "A100", {}),
|
("distilbert/distilgpt2", "ray", "", "A100", {}),
|
||||||
("distilbert/distilgpt2", "mp", "", "A100", {}),
|
("distilbert/distilgpt2", "mp", "", "A100", {}),
|
||||||
])
|
],
|
||||||
|
)
|
||||||
@pytest.mark.parametrize("enable_prompt_embeds", [True, False])
|
@pytest.mark.parametrize("enable_prompt_embeds", [True, False])
|
||||||
def test_models_distributed(
|
def test_models_distributed(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
@ -149,11 +152,14 @@ def test_models_distributed(
|
|||||||
pytest.skip(f"Skip test for {test_suite}")
|
pytest.skip(f"Skip test for {test_suite}")
|
||||||
|
|
||||||
with monkeypatch.context() as monkeypatch_context:
|
with monkeypatch.context() as monkeypatch_context:
|
||||||
if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
|
if (
|
||||||
|
model == "meta-llama/Llama-3.2-1B-Instruct"
|
||||||
|
and distributed_executor_backend == "ray"
|
||||||
|
and attention_backend == ""
|
||||||
|
and test_suite == "L4"
|
||||||
|
): # noqa
|
||||||
if enable_prompt_embeds:
|
if enable_prompt_embeds:
|
||||||
pytest.skip(
|
pytest.skip("enable_prompt_embeds does not work with ray compiled dag.")
|
||||||
"enable_prompt_embeds does not work with ray compiled dag."
|
|
||||||
)
|
|
||||||
monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
|
monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
|
||||||
monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
|
monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
|
||||||
|
|
||||||
@ -175,30 +181,26 @@ def test_models_distributed(
|
|||||||
# will hurt multiprocessing backend with fork method
|
# will hurt multiprocessing backend with fork method
|
||||||
# (the default method).
|
# (the default method).
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
model,
|
model,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
tensor_parallel_size=2,
|
tensor_parallel_size=2,
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
distributed_executor_backend=distributed_executor_backend,
|
||||||
enable_prompt_embeds=enable_prompt_embeds,
|
enable_prompt_embeds=enable_prompt_embeds,
|
||||||
gpu_memory_utilization=0.7,
|
gpu_memory_utilization=0.7,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
if enable_prompt_embeds:
|
if enable_prompt_embeds:
|
||||||
with hf_runner(model, dtype=dtype) as hf_model:
|
with hf_runner(model, dtype=dtype) as hf_model:
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
prompt_embeds = hf_model.get_prompt_embeddings(
|
prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
|
||||||
example_prompts)
|
vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
|
||||||
vllm_outputs = vllm_model.generate_greedy(
|
|
||||||
prompt_embeds, max_tokens)
|
|
||||||
vllm_outputs = _fix_prompt_embed_outputs(
|
vllm_outputs = _fix_prompt_embed_outputs(
|
||||||
vllm_outputs, hf_model, example_prompts)
|
vllm_outputs, hf_model, example_prompts
|
||||||
hf_outputs = hf_model.generate_greedy(
|
)
|
||||||
example_prompts, max_tokens)
|
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||||
else:
|
else:
|
||||||
vllm_outputs = vllm_model.generate_greedy(
|
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
example_prompts, max_tokens)
|
|
||||||
with hf_runner(model, dtype=dtype) as hf_model:
|
with hf_runner(model, dtype=dtype) as hf_model:
|
||||||
hf_outputs = hf_model.generate_greedy(
|
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||||
example_prompts, max_tokens)
|
|
||||||
|
|
||||||
check_outputs_equal(
|
check_outputs_equal(
|
||||||
outputs_0_lst=hf_outputs,
|
outputs_0_lst=hf_outputs,
|
||||||
@ -209,27 +211,23 @@ def test_models_distributed(
|
|||||||
|
|
||||||
|
|
||||||
def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
|
def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
|
||||||
|
|
||||||
from vllm.envs import VLLM_USE_V1
|
from vllm.envs import VLLM_USE_V1
|
||||||
|
|
||||||
if not VLLM_USE_V1:
|
if not VLLM_USE_V1:
|
||||||
pytest.skip("Skipping V0 test, dump input not supported")
|
pytest.skip("Skipping V0 test, dump input not supported")
|
||||||
|
|
||||||
# Needed to mock an error in the same process
|
# Needed to mock an error in the same process
|
||||||
monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')
|
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||||
|
|
||||||
with vllm_runner('facebook/opt-125m', enforce_eager=True) as vllm_model:
|
with vllm_runner("facebook/opt-125m", enforce_eager=True) as vllm_model:
|
||||||
if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
|
if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
|
||||||
v1_test_failed_model_execution(vllm_model)
|
v1_test_failed_model_execution(vllm_model)
|
||||||
|
|
||||||
|
|
||||||
def v1_test_failed_model_execution(vllm_model):
|
def v1_test_failed_model_execution(vllm_model):
|
||||||
|
|
||||||
engine = vllm_model.llm.llm_engine
|
engine = vllm_model.llm.llm_engine
|
||||||
mocked_execute_model = Mock(
|
mocked_execute_model = Mock(side_effect=RuntimeError("Mocked Critical Error"))
|
||||||
side_effect=RuntimeError("Mocked Critical Error"))
|
engine.engine_core.engine_core.model_executor.execute_model = mocked_execute_model
|
||||||
engine.engine_core.engine_core.model_executor.execute_model =\
|
|
||||||
mocked_execute_model
|
|
||||||
|
|
||||||
with pytest.raises(RuntimeError) as exc_info:
|
with pytest.raises(RuntimeError) as exc_info:
|
||||||
prompts = [
|
prompts = [
|
||||||
|
|||||||
@@ -5,5 +5,6 @@ from ..utils import compare_two_settings


 def test_cpu_offload():
-    compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
-                         ["--cpu-offload-gb", "1"])
+    compare_two_settings(
+        "meta-llama/Llama-3.2-1B-Instruct", [], ["--cpu-offload-gb", "1"]
+    )
|
|||||||
@ -23,13 +23,13 @@ def test_python_error():
|
|||||||
tensors = []
|
tensors = []
|
||||||
with allocator.use_memory_pool():
|
with allocator.use_memory_pool():
|
||||||
# allocate 70% of the total memory
|
# allocate 70% of the total memory
|
||||||
x = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda')
|
x = torch.empty(alloc_bytes, dtype=torch.uint8, device="cuda")
|
||||||
tensors.append(x)
|
tensors.append(x)
|
||||||
# release the memory
|
# release the memory
|
||||||
allocator.sleep()
|
allocator.sleep()
|
||||||
|
|
||||||
# allocate more memory than the total memory
|
# allocate more memory than the total memory
|
||||||
y = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda')
|
y = torch.empty(alloc_bytes, dtype=torch.uint8, device="cuda")
|
||||||
tensors.append(y)
|
tensors.append(y)
|
||||||
with pytest.raises(RuntimeError):
|
with pytest.raises(RuntimeError):
|
||||||
# when the allocator is woken up, it should raise an error
|
# when the allocator is woken up, it should raise an error
|
||||||
@ -41,17 +41,17 @@ def test_python_error():
|
|||||||
def test_basic_cumem():
|
def test_basic_cumem():
|
||||||
# some tensors from default memory pool
|
# some tensors from default memory pool
|
||||||
shape = (1024, 1024)
|
shape = (1024, 1024)
|
||||||
x = torch.empty(shape, device='cuda')
|
x = torch.empty(shape, device="cuda")
|
||||||
x.zero_()
|
x.zero_()
|
||||||
|
|
||||||
# some tensors from custom memory pool
|
# some tensors from custom memory pool
|
||||||
allocator = CuMemAllocator.get_instance()
|
allocator = CuMemAllocator.get_instance()
|
||||||
with allocator.use_memory_pool():
|
with allocator.use_memory_pool():
|
||||||
# custom memory pool
|
# custom memory pool
|
||||||
y = torch.empty(shape, device='cuda')
|
y = torch.empty(shape, device="cuda")
|
||||||
y.zero_()
|
y.zero_()
|
||||||
y += 1
|
y += 1
|
||||||
z = torch.empty(shape, device='cuda')
|
z = torch.empty(shape, device="cuda")
|
||||||
z.zero_()
|
z.zero_()
|
||||||
z += 2
|
z += 2
|
||||||
|
|
||||||
@ -74,16 +74,16 @@ def test_basic_cumem():
|
|||||||
def test_cumem_with_cudagraph():
|
def test_cumem_with_cudagraph():
|
||||||
allocator = CuMemAllocator.get_instance()
|
allocator = CuMemAllocator.get_instance()
|
||||||
with allocator.use_memory_pool():
|
with allocator.use_memory_pool():
|
||||||
weight = torch.eye(1024, device='cuda')
|
weight = torch.eye(1024, device="cuda")
|
||||||
with allocator.use_memory_pool(tag="discard"):
|
with allocator.use_memory_pool(tag="discard"):
|
||||||
cache = torch.empty(1024, 1024, device='cuda')
|
cache = torch.empty(1024, 1024, device="cuda")
|
||||||
|
|
||||||
def model(x):
|
def model(x):
|
||||||
out = x @ weight
|
out = x @ weight
|
||||||
cache[:out.size(0)].copy_(out)
|
cache[: out.size(0)].copy_(out)
|
||||||
return out + 1
|
return out + 1
|
||||||
|
|
||||||
x = torch.empty(128, 1024, device='cuda')
|
x = torch.empty(128, 1024, device="cuda")
|
||||||
|
|
||||||
# warmup
|
# warmup
|
||||||
model(x)
|
model(x)
|
||||||
@ -109,7 +109,7 @@ def test_cumem_with_cudagraph():
|
|||||||
model_graph.replay()
|
model_graph.replay()
|
||||||
|
|
||||||
# cache content is as expected
|
# cache content is as expected
|
||||||
assert torch.allclose(x, cache[:x.size(0)])
|
assert torch.allclose(x, cache[: x.size(0)])
|
||||||
|
|
||||||
# output content is as expected
|
# output content is as expected
|
||||||
assert torch.allclose(y, x + 1)
|
assert torch.allclose(y, x + 1)
|
||||||
@ -123,7 +123,8 @@ def test_cumem_with_cudagraph():
|
|||||||
("meta-llama/Llama-3.2-1B", True),
|
("meta-llama/Llama-3.2-1B", True),
|
||||||
# sleep mode with pytorch checkpoint
|
# sleep mode with pytorch checkpoint
|
||||||
("facebook/opt-125m", True),
|
("facebook/opt-125m", True),
|
||||||
])
|
],
|
||||||
|
)
|
||||||
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
|
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
assert use_v1
|
assert use_v1
|
||||||
|
|||||||
@ -10,8 +10,18 @@ MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
|
|||||||
@pytest.mark.benchmark
|
@pytest.mark.benchmark
|
||||||
def test_bench_latency():
|
def test_bench_latency():
|
||||||
command = [
|
command = [
|
||||||
"vllm", "bench", "latency", "--model", MODEL_NAME, "--input-len", "32",
|
"vllm",
|
||||||
"--output-len", "1", "--enforce-eager", "--load-format", "dummy"
|
"bench",
|
||||||
|
"latency",
|
||||||
|
"--model",
|
||||||
|
MODEL_NAME,
|
||||||
|
"--input-len",
|
||||||
|
"32",
|
||||||
|
"--output-len",
|
||||||
|
"1",
|
||||||
|
"--enforce-eager",
|
||||||
|
"--load-format",
|
||||||
|
"dummy",
|
||||||
]
|
]
|
||||||
result = subprocess.run(command, capture_output=True, text=True)
|
result = subprocess.run(command, capture_output=True, text=True)
|
||||||
print(result.stdout)
|
print(result.stdout)
|
||||||
|
|||||||
@ -7,8 +7,11 @@ import numpy as np
|
|||||||
import pytest
|
import pytest
|
||||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||||
|
|
||||||
from vllm.benchmarks.datasets import (RandomDataset, RandomMultiModalDataset,
|
from vllm.benchmarks.datasets import (
|
||||||
SampleRequest)
|
RandomDataset,
|
||||||
|
RandomMultiModalDataset,
|
||||||
|
SampleRequest,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
@ -27,11 +30,9 @@ class Params(NamedTuple):
|
|||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def random_dataset_params() -> Params:
|
def random_dataset_params() -> Params:
|
||||||
return Params(num_requests=16,
|
return Params(
|
||||||
prefix_len=7,
|
num_requests=16, prefix_len=7, range_ratio=0.3, input_len=50, output_len=20
|
||||||
range_ratio=0.3,
|
)
|
||||||
input_len=50,
|
|
||||||
output_len=20)
|
|
||||||
|
|
||||||
|
|
||||||
def _fingerprint_sample(req: SampleRequest) -> tuple[str, int, int]:
|
def _fingerprint_sample(req: SampleRequest) -> tuple[str, int, int]:
|
||||||
@ -39,13 +40,15 @@ def _fingerprint_sample(req: SampleRequest) -> tuple[str, int, int]:
|
|||||||
return (req.prompt, req.prompt_len, req.expected_output_len)
|
return (req.prompt, req.prompt_len, req.expected_output_len)
|
||||||
|
|
||||||
|
|
||||||
def _collect_samples(dataset: RandomDataset,
|
def _collect_samples(
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
dataset: RandomDataset,
|
||||||
num_requests: int = 16,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
prefix_len: int = 7,
|
num_requests: int = 16,
|
||||||
range_ratio: float = 0.3,
|
prefix_len: int = 7,
|
||||||
input_len: int = 50,
|
range_ratio: float = 0.3,
|
||||||
output_len: int = 20) -> list[tuple[str, int, int]]:
|
input_len: int = 50,
|
||||||
|
output_len: int = 20,
|
||||||
|
) -> list[tuple[str, int, int]]:
|
||||||
samples = dataset.sample(
|
samples = dataset.sample(
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
num_requests=num_requests,
|
num_requests=num_requests,
|
||||||
@ -59,8 +62,8 @@ def _collect_samples(dataset: RandomDataset,
|
|||||||
|
|
||||||
@pytest.mark.benchmark
|
@pytest.mark.benchmark
|
||||||
def test_random_dataset_same_seed(
|
def test_random_dataset_same_seed(
|
||||||
hf_tokenizer: PreTrainedTokenizerBase,
|
hf_tokenizer: PreTrainedTokenizerBase, random_dataset_params: Params
|
||||||
random_dataset_params: Params) -> None:
|
) -> None:
|
||||||
"""Same seed should yield identical outputs, even if global RNGs change.
|
"""Same seed should yield identical outputs, even if global RNGs change.
|
||||||
|
|
||||||
This guards against accidental reliance on Python's random or np.random
|
This guards against accidental reliance on Python's random or np.random
|
||||||
@ -70,13 +73,15 @@ def test_random_dataset_same_seed(
|
|||||||
common_seed = 123
|
common_seed = 123
|
||||||
dataset_a = RandomDataset(random_seed=common_seed)
|
dataset_a = RandomDataset(random_seed=common_seed)
|
||||||
dataset_b = RandomDataset(random_seed=common_seed)
|
dataset_b = RandomDataset(random_seed=common_seed)
|
||||||
a = _collect_samples(dataset_a,
|
a = _collect_samples(
|
||||||
hf_tokenizer,
|
dataset_a,
|
||||||
num_requests=p.num_requests,
|
hf_tokenizer,
|
||||||
prefix_len=p.prefix_len,
|
num_requests=p.num_requests,
|
||||||
range_ratio=p.range_ratio,
|
prefix_len=p.prefix_len,
|
||||||
input_len=p.input_len,
|
range_ratio=p.range_ratio,
|
||||||
output_len=p.output_len)
|
input_len=p.input_len,
|
||||||
|
output_len=p.output_len,
|
||||||
|
)
|
||||||
|
|
||||||
# Perturb global RNG state to ensure isolation
|
# Perturb global RNG state to ensure isolation
|
||||||
random.seed(999)
|
random.seed(999)
|
||||||
@ -84,43 +89,50 @@ def test_random_dataset_same_seed(
|
|||||||
np.random.seed(888)
|
np.random.seed(888)
|
||||||
_ = [np.random.random() for _ in range(100)]
|
_ = [np.random.random() for _ in range(100)]
|
||||||
|
|
||||||
b = _collect_samples(dataset_b,
|
b = _collect_samples(
|
||||||
hf_tokenizer,
|
dataset_b,
|
||||||
num_requests=p.num_requests,
|
hf_tokenizer,
|
||||||
prefix_len=p.prefix_len,
|
num_requests=p.num_requests,
|
||||||
range_ratio=p.range_ratio,
|
prefix_len=p.prefix_len,
|
||||||
input_len=p.input_len,
|
range_ratio=p.range_ratio,
|
||||||
output_len=p.output_len)
|
input_len=p.input_len,
|
||||||
|
output_len=p.output_len,
|
||||||
|
)
|
||||||
assert a == b
|
assert a == b
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.benchmark
|
@pytest.mark.benchmark
|
||||||
def test_random_dataset_different_seeds(
|
def test_random_dataset_different_seeds(
|
||||||
hf_tokenizer: PreTrainedTokenizerBase,
|
hf_tokenizer: PreTrainedTokenizerBase, random_dataset_params: Params
|
||||||
random_dataset_params: Params) -> None:
|
) -> None:
|
||||||
"""Different seeds should change outputs with overwhelming likelihood."""
|
"""Different seeds should change outputs with overwhelming likelihood."""
|
||||||
p = random_dataset_params
|
p = random_dataset_params
|
||||||
seed_a = 0
|
seed_a = 0
|
||||||
dataset_a = RandomDataset(random_seed=seed_a)
|
dataset_a = RandomDataset(random_seed=seed_a)
|
||||||
a = _collect_samples(dataset_a,
|
a = _collect_samples(
|
||||||
hf_tokenizer,
|
dataset_a,
|
||||||
num_requests=p.num_requests,
|
hf_tokenizer,
|
||||||
prefix_len=p.prefix_len,
|
num_requests=p.num_requests,
|
||||||
range_ratio=p.range_ratio,
|
prefix_len=p.prefix_len,
|
||||||
input_len=p.input_len,
|
range_ratio=p.range_ratio,
|
||||||
output_len=p.output_len)
|
input_len=p.input_len,
|
||||||
|
output_len=p.output_len,
|
||||||
|
)
|
||||||
|
|
||||||
seed_b = 999
|
seed_b = 999
|
||||||
dataset_b = RandomDataset(random_seed=seed_b)
|
dataset_b = RandomDataset(random_seed=seed_b)
|
||||||
# Perturb global RNG with same seed as dataset_a to ensure isolation
|
# Perturb global RNG with same seed as dataset_a to ensure isolation
|
||||||
random.seed(seed_a)
|
random.seed(seed_a)
|
||||||
np.random.seed(seed_a)
|
np.random.seed(seed_a)
|
||||||
b = _collect_samples(dataset_b,
|
b = _collect_samples(
|
||||||
hf_tokenizer,
|
dataset_b,
|
||||||
num_requests=p.num_requests,
|
hf_tokenizer,
|
||||||
prefix_len=p.prefix_len,
|
num_requests=p.num_requests,
|
||||||
range_ratio=p.range_ratio,
|
prefix_len=p.prefix_len,
|
||||||
input_len=p.input_len,
|
range_ratio=p.range_ratio,
|
||||||
output_len=p.output_len)
|
input_len=p.input_len,
|
||||||
|
output_len=p.output_len,
|
||||||
|
)
|
||||||
assert a != b
|
assert a != b
|
||||||
|
|
||||||
|
|
||||||
@ -128,6 +140,7 @@ def test_random_dataset_different_seeds(
|
|||||||
# RandomMultiModalDataset tests
|
# RandomMultiModalDataset tests
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
|
|
||||||
|
|
||||||
def _mm_fingerprint_sample(
|
def _mm_fingerprint_sample(
|
||||||
req: SampleRequest,
|
req: SampleRequest,
|
||||||
) -> tuple[str, int, int, int, list[str]]:
|
) -> tuple[str, int, int, int, list[str]]:
|
||||||
@ -152,8 +165,13 @@ def _mm_fingerprint_sample(
|
|||||||
item_prefixes.append(f"video:{url[:22]}")
|
item_prefixes.append(f"video:{url[:22]}")
|
||||||
else:
|
else:
|
||||||
item_prefixes.append("unknown:")
|
item_prefixes.append("unknown:")
|
||||||
return (req.prompt, req.prompt_len, req.expected_output_len, len(items),
|
return (
|
||||||
item_prefixes)
|
req.prompt,
|
||||||
|
req.prompt_len,
|
||||||
|
req.expected_output_len,
|
||||||
|
len(items),
|
||||||
|
item_prefixes,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _collect_mm_samples(
|
def _collect_mm_samples(
|
||||||
@ -214,6 +232,7 @@ def test_random_mm_different_seeds(
|
|||||||
fb = [_mm_fingerprint_sample(s) for s in b]
|
fb = [_mm_fingerprint_sample(s) for s in b]
|
||||||
assert fa != fb
|
assert fa != fb
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.benchmark
|
@pytest.mark.benchmark
|
||||||
def test_random_mm_respects_limits(
|
def test_random_mm_respects_limits(
|
||||||
hf_tokenizer: PreTrainedTokenizerBase,
|
hf_tokenizer: PreTrainedTokenizerBase,
|
||||||
@ -271,9 +290,9 @@ def test_random_mm_zero_items(hf_tokenizer: PreTrainedTokenizerBase) -> None:
|
|||||||
for s in samples:
|
for s in samples:
|
||||||
assert s.multi_modal_data == []
|
assert s.multi_modal_data == []
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.benchmark
|
@pytest.mark.benchmark
|
||||||
def test_random_mm_num_items_per_prompt(
|
def test_random_mm_num_items_per_prompt(hf_tokenizer: PreTrainedTokenizerBase) -> None:
|
||||||
hf_tokenizer: PreTrainedTokenizerBase) -> None:
|
|
||||||
ds = RandomMultiModalDataset(random_seed=0)
|
ds = RandomMultiModalDataset(random_seed=0)
|
||||||
# Fixed number of images per prompt
|
# Fixed number of images per prompt
|
||||||
# set num_mm_items_range_ratio to 0.0
|
# set num_mm_items_range_ratio to 0.0
|
||||||
@ -300,7 +319,6 @@ def test_random_mm_num_items_per_prompt(
|
|||||||
def test_random_mm_bucket_config_not_mutated(
|
def test_random_mm_bucket_config_not_mutated(
|
||||||
hf_tokenizer: PreTrainedTokenizerBase,
|
hf_tokenizer: PreTrainedTokenizerBase,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
ds = RandomMultiModalDataset(random_seed=0)
|
ds = RandomMultiModalDataset(random_seed=0)
|
||||||
# This bucket config is not normalized to sum to 1
|
# This bucket config is not normalized to sum to 1
|
||||||
# and has more buckets than requested images
|
# and has more buckets than requested images
|
||||||
@ -321,7 +339,6 @@ def test_random_mm_bucket_config_not_mutated(
|
|||||||
# Ensure the original dict content is unchanged
|
# Ensure the original dict content is unchanged
|
||||||
assert original == snapshot
|
assert original == snapshot
|
||||||
|
|
||||||
|
|
||||||
# Vary number of mm items per prompt
|
# Vary number of mm items per prompt
|
||||||
# set num_mm_items_range_ratio to 0.5
|
# set num_mm_items_range_ratio to 0.5
|
||||||
samples_varying_items = _collect_mm_samples(
|
samples_varying_items = _collect_mm_samples(
|
||||||
|
|||||||
@ -11,9 +11,7 @@ MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
|
|||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def server():
|
def server():
|
||||||
args = [
|
args = ["--max-model-len", "1024", "--enforce-eager", "--load-format", "dummy"]
|
||||||
"--max-model-len", "1024", "--enforce-eager", "--load-format", "dummy"
|
|
||||||
]
|
|
||||||
|
|
||||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||||
yield remote_server
|
yield remote_server
|
||||||
@ -46,6 +44,7 @@ def test_bench_serve(server):
|
|||||||
|
|
||||||
assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
|
assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.benchmark
|
@pytest.mark.benchmark
|
||||||
def test_bench_serve_chat(server):
|
def test_bench_serve_chat(server):
|
||||||
command = [
|
command = [
|
||||||
|
|||||||
@ -10,8 +10,18 @@ MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
|
|||||||
@pytest.mark.benchmark
|
@pytest.mark.benchmark
|
||||||
def test_bench_throughput():
|
def test_bench_throughput():
|
||||||
command = [
|
command = [
|
||||||
"vllm", "bench", "throughput", "--model", MODEL_NAME, "--input-len",
|
"vllm",
|
||||||
"32", "--output-len", "1", "--enforce-eager", "--load-format", "dummy"
|
"bench",
|
||||||
|
"throughput",
|
||||||
|
"--model",
|
||||||
|
MODEL_NAME,
|
||||||
|
"--input-len",
|
||||||
|
"32",
|
||||||
|
"--output-len",
|
||||||
|
"1",
|
||||||
|
"--enforce-eager",
|
||||||
|
"--load-format",
|
||||||
|
"dummy",
|
||||||
]
|
]
|
||||||
result = subprocess.run(command, capture_output=True, text=True)
|
result = subprocess.run(command, capture_output=True, text=True)
|
||||||
print(result.stdout)
|
print(result.stdout)
|
||||||
|
|||||||
@ -23,8 +23,7 @@ class LazyInitPass(InductorPass):
|
|||||||
and then immediately invoke it.
|
and then immediately invoke it.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, pass_cls: type[VllmInductorPass],
|
def __init__(self, pass_cls: type[VllmInductorPass], vllm_config: VllmConfig):
|
||||||
vllm_config: VllmConfig):
|
|
||||||
self.pass_cls = pass_cls
|
self.pass_cls = pass_cls
|
||||||
self.vllm_config = weakref.proxy(vllm_config) # avoid cycle
|
self.vllm_config = weakref.proxy(vllm_config) # avoid cycle
|
||||||
|
|
||||||
@ -45,20 +44,18 @@ class TestBackend:
|
|||||||
Inductor config is default-initialized from VllmConfig.CompilationConfig.
|
Inductor config is default-initialized from VllmConfig.CompilationConfig.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph],
|
def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph], None]]):
|
||||||
None]]):
|
|
||||||
self.custom_passes = list(passes)
|
self.custom_passes = list(passes)
|
||||||
compile_config = get_current_vllm_config().compilation_config
|
compile_config = get_current_vllm_config().compilation_config
|
||||||
self.inductor_config = compile_config.inductor_compile_config
|
self.inductor_config = compile_config.inductor_compile_config
|
||||||
self.inductor_config['force_disable_caches'] = True
|
self.inductor_config["force_disable_caches"] = True
|
||||||
self.inductor_config['post_grad_custom_post_pass'] = self.post_pass
|
self.inductor_config["post_grad_custom_post_pass"] = self.post_pass
|
||||||
|
|
||||||
def __call__(self, graph: fx.GraphModule, example_inputs):
|
def __call__(self, graph: fx.GraphModule, example_inputs):
|
||||||
self.graph_pre_compile = deepcopy(graph)
|
self.graph_pre_compile = deepcopy(graph)
|
||||||
from torch._inductor.compile_fx import compile_fx
|
from torch._inductor.compile_fx import compile_fx
|
||||||
return compile_fx(graph,
|
|
||||||
example_inputs,
|
return compile_fx(graph, example_inputs, config_patches=self.inductor_config)
|
||||||
config_patches=self.inductor_config)
|
|
||||||
|
|
||||||
@with_pattern_match_debug
|
@with_pattern_match_debug
|
||||||
def post_pass(self, graph: fx.Graph):
|
def post_pass(self, graph: fx.Graph):
|
||||||
@ -82,8 +79,7 @@ class TestBackend:
|
|||||||
assert num_pre > 0, f"Op {op.name()} not found in pre-pass graph"
|
assert num_pre > 0, f"Op {op.name()} not found in pre-pass graph"
|
||||||
assert num_pre > num_post, f"All nodes remain for op {op.name()}"
|
assert num_pre > num_post, f"All nodes remain for op {op.name()}"
|
||||||
if fully_replaced:
|
if fully_replaced:
|
||||||
assert num_post == 0, \
|
assert num_post == 0, f"Unexpected op {op.name()} in post-pass graph"
|
||||||
f"Unexpected op {op.name()} in post-pass graph"
|
|
||||||
|
|
||||||
def check_after_ops(self, ops: Sequence[OpOverload]):
|
def check_after_ops(self, ops: Sequence[OpOverload]):
|
||||||
for op in ops:
|
for op in ops:
|
||||||
|
|||||||
@@ -38,8 +38,8 @@ test_params_full_cudagraph = []
 MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"]
 for mla_backend in MLA_backends:
     test_params_full_cudagraph.append(
-        pytest.param(
-            ("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend])))
+        pytest.param(("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend]))
+    )

 # Qwen/Qwen2-1.5B-Instruct with other backends
 other_backend_configs = [
@ -47,7 +47,8 @@ other_backend_configs = [
|
|||||||
]
|
]
|
||||||
for backend_config in other_backend_configs:
|
for backend_config in other_backend_configs:
|
||||||
test_params_full_cudagraph.append(
|
test_params_full_cudagraph.append(
|
||||||
pytest.param(("Qwen/Qwen2-1.5B-Instruct", backend_config)))
|
pytest.param(("Qwen/Qwen2-1.5B-Instruct", backend_config))
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="class")
|
@pytest.fixture(scope="class")
|
||||||
@ -55,8 +56,10 @@ def llm_pair(request):
|
|||||||
model, backend_config = request.param
|
model, backend_config = request.param
|
||||||
|
|
||||||
# Dynamically skip test if GPU capability is not met
|
# Dynamically skip test if GPU capability is not met
|
||||||
if backend_config.specific_gpu_arch and backend_config.specific_gpu_arch\
|
if (
|
||||||
!= current_platform.get_device_capability():
|
backend_config.specific_gpu_arch
|
||||||
|
and backend_config.specific_gpu_arch != current_platform.get_device_capability()
|
||||||
|
):
|
||||||
if backend_config.specific_gpu_arch == (9, 0):
|
if backend_config.specific_gpu_arch == (9, 0):
|
||||||
pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
|
pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
|
||||||
elif backend_config.specific_gpu_arch == (10, 0):
|
elif backend_config.specific_gpu_arch == (10, 0):
|
||||||
@ -76,8 +79,7 @@ def llm_pair(request):
|
|||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
max_model_len=1024,
|
max_model_len=1024,
|
||||||
max_num_seqs=128,
|
max_num_seqs=128,
|
||||||
compilation_config=\
|
compilation_config=CompilationConfig(**backend_config.comp_config),
|
||||||
CompilationConfig(**backend_config.comp_config),
|
|
||||||
generation_config="vllm",
|
generation_config="vllm",
|
||||||
seed=42,
|
seed=42,
|
||||||
)
|
)
|
||||||
@ -113,20 +115,22 @@ class TestFullCUDAGraph:
|
|||||||
meaning there would be multiple LLM instances hogging memory simultaneously.
|
meaning there would be multiple LLM instances hogging memory simultaneously.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@pytest.mark.parametrize(("batch_size", "max_tokens"), [
|
@pytest.mark.parametrize(
|
||||||
(1, 10),
|
("batch_size", "max_tokens"),
|
||||||
(7, 10),
|
[
|
||||||
(16, 10),
|
(1, 10),
|
||||||
(25, 10),
|
(7, 10),
|
||||||
(32, 10),
|
(16, 10),
|
||||||
(45, 10),
|
(25, 10),
|
||||||
(64, 10),
|
(32, 10),
|
||||||
(123, 10),
|
(45, 10),
|
||||||
(8, 5),
|
(64, 10),
|
||||||
(8, 30),
|
(123, 10),
|
||||||
])
|
(8, 5),
|
||||||
def test_full_cudagraph(self, batch_size, max_tokens,
|
(8, 30),
|
||||||
llm_pair: tuple[LLM, LLM]):
|
],
|
||||||
|
)
|
||||||
|
def test_full_cudagraph(self, batch_size, max_tokens, llm_pair: tuple[LLM, LLM]):
|
||||||
"""
|
"""
|
||||||
Test various batch sizes and max_tokens to ensure that the
|
Test various batch sizes and max_tokens to ensure that the
|
||||||
full cudagraph compilation works for padded cases too.
|
full cudagraph compilation works for padded cases too.
|
||||||
@ -137,26 +141,34 @@ class TestFullCUDAGraph:
|
|||||||
prompts = ["the quick brown fox"] * batch_size
|
prompts = ["the quick brown fox"] * batch_size
|
||||||
# Use purely greedy decoding to avoid top-p truncation sensitivity
|
# Use purely greedy decoding to avoid top-p truncation sensitivity
|
||||||
# that can amplify tiny numeric differences across runtimes.
|
# that can amplify tiny numeric differences across runtimes.
|
||||||
sampling_params = SamplingParams(temperature=0.0,
|
sampling_params = SamplingParams(
|
||||||
max_tokens=max_tokens,
|
temperature=0.0, max_tokens=max_tokens, top_p=1.0
|
||||||
top_p=1.0)
|
)
|
||||||
|
|
||||||
piecewise_responses = piecewise_llm.generate(prompts, sampling_params)
|
piecewise_responses = piecewise_llm.generate(prompts, sampling_params)
|
||||||
full_responses = full_cudagraph_llm.generate(prompts, sampling_params)
|
full_responses = full_cudagraph_llm.generate(prompts, sampling_params)
|
||||||
|
|
||||||
# Check that all responses are the same
|
# Check that all responses are the same
|
||||||
for piecewise_res, full_res in zip(piecewise_responses,
|
for piecewise_res, full_res in zip(piecewise_responses, full_responses):
|
||||||
full_responses):
|
assert (
|
||||||
assert piecewise_res.outputs[0].text.lower() == \
|
piecewise_res.outputs[0].text.lower()
|
||||||
full_res.outputs[0].text.lower()
|
== full_res.outputs[0].text.lower()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
|
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
|
||||||
def test_full_cudagraph_with_invalid_backend():
|
def test_full_cudagraph_with_invalid_backend():
|
||||||
with temporary_environ({
|
with (
|
||||||
"VLLM_USE_V1": "1",
|
temporary_environ(
|
||||||
"VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION"
|
{
|
||||||
# Flex_Attention is not supported with full cuda graph
|
"VLLM_USE_V1": "1",
|
||||||
}), pytest.raises(RuntimeError):
|
"VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
|
||||||
LLM(model="Qwen/Qwen2-1.5B-Instruct",
|
# Flex_Attention is not supported with full cuda graph
|
||||||
compilation_config=CompilationConfig(cudagraph_mode="FULL"))
|
}
|
||||||
|
),
|
||||||
|
pytest.raises(RuntimeError),
|
||||||
|
):
|
||||||
|
LLM(
|
||||||
|
model="Qwen/Qwen2-1.5B-Instruct",
|
||||||
|
compilation_config=CompilationConfig(cudagraph_mode="FULL"),
|
||||||
|
)
|
||||||
|
|||||||
@ -10,10 +10,14 @@ from torch import nn
|
|||||||
|
|
||||||
from vllm.compilation.backends import set_model_tag
|
from vllm.compilation.backends import set_model_tag
|
||||||
from vllm.compilation.counter import compilation_counter
|
from vllm.compilation.counter import compilation_counter
|
||||||
from vllm.compilation.decorators import (ignore_torch_compile,
|
from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile
|
||||||
support_torch_compile)
|
from vllm.config import (
|
||||||
from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
|
CompilationConfig,
|
||||||
VllmConfig, set_current_vllm_config)
|
CompilationLevel,
|
||||||
|
CUDAGraphMode,
|
||||||
|
VllmConfig,
|
||||||
|
set_current_vllm_config,
|
||||||
|
)
|
||||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||||
|
|
||||||
# This import automatically registers `torch.ops.silly.attention`
|
# This import automatically registers `torch.ops.silly.attention`
|
||||||
@ -27,12 +31,7 @@ RANDOM_SEED = 0
|
|||||||
|
|
||||||
@support_torch_compile
|
@support_torch_compile
|
||||||
class ParentModel(nn.Module):
|
class ParentModel(nn.Module):
|
||||||
|
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None:
|
||||||
def __init__(self,
|
|
||||||
*,
|
|
||||||
vllm_config: VllmConfig,
|
|
||||||
prefix: str = '',
|
|
||||||
**kwargs) -> None:
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||||
@ -40,7 +39,6 @@ class ParentModel(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class Attention(nn.Module):
|
class Attention(nn.Module):
|
||||||
|
|
||||||
def __init__(self, mlp_size: int, hidden_size: int) -> None:
|
def __init__(self, mlp_size: int, hidden_size: int) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.pre_attn = nn.Linear(mlp_size, hidden_size, bias=False)
|
self.pre_attn = nn.Linear(mlp_size, hidden_size, bias=False)
|
||||||
@ -51,17 +49,21 @@ class Attention(nn.Module):
|
|||||||
nn.init.xavier_normal_(
|
nn.init.xavier_normal_(
|
||||||
self.pre_attn.weight.data,
|
self.pre_attn.weight.data,
|
||||||
generator=torch.Generator().manual_seed(RANDOM_SEED),
|
generator=torch.Generator().manual_seed(RANDOM_SEED),
|
||||||
gain=0.001)
|
gain=0.001,
|
||||||
|
)
|
||||||
nn.init.xavier_normal_(
|
nn.init.xavier_normal_(
|
||||||
self.post_attn.weight.data,
|
self.post_attn.weight.data,
|
||||||
generator=torch.Generator().manual_seed(RANDOM_SEED),
|
generator=torch.Generator().manual_seed(RANDOM_SEED),
|
||||||
gain=0.001)
|
gain=0.001,
|
||||||
|
)
|
||||||
|
|
||||||
def rms_norm_ref(self, x: torch.Tensor) -> torch.Tensor:
|
def rms_norm_ref(self, x: torch.Tensor) -> torch.Tensor:
|
||||||
x_f32 = x.float()
|
x_f32 = x.float()
|
||||||
return (x_f32 * torch.rsqrt(
|
return (
|
||||||
torch.mean(x_f32.square(), dim=-1, keepdim=True) + 1e-6) *
|
x_f32
|
||||||
self.rms_norm_weight).to(x.dtype)
|
* torch.rsqrt(torch.mean(x_f32.square(), dim=-1, keepdim=True) + 1e-6)
|
||||||
|
* self.rms_norm_weight
|
||||||
|
).to(x.dtype)
|
||||||
|
|
||||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||||
x = self.pre_attn(x)
|
x = self.pre_attn(x)
|
||||||
@ -76,14 +78,15 @@ class Attention(nn.Module):
|
|||||||
|
|
||||||
@support_torch_compile
|
@support_torch_compile
|
||||||
class CompiledAttention(nn.Module):
|
class CompiledAttention(nn.Module):
|
||||||
|
def __init__(
|
||||||
def __init__(self,
|
self,
|
||||||
*,
|
*,
|
||||||
mlp_size: int,
|
mlp_size: int,
|
||||||
hidden_size: int,
|
hidden_size: int,
|
||||||
vllm_config: VllmConfig,
|
vllm_config: VllmConfig,
|
||||||
prefix: str = '',
|
prefix: str = "",
|
||||||
**kwargs) -> None:
|
**kwargs,
|
||||||
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.attn = Attention(mlp_size, hidden_size)
|
self.attn = Attention(mlp_size, hidden_size)
|
||||||
|
|
||||||
@ -93,21 +96,21 @@ class CompiledAttention(nn.Module):
|
|||||||
|
|
||||||
@support_torch_compile
|
@support_torch_compile
|
||||||
class CompiledAttentionTwo(CompiledAttention):
|
class CompiledAttentionTwo(CompiledAttention):
|
||||||
|
|
||||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||||
return self.attn(x) + x
|
return self.attn(x) + x
|
||||||
|
|
||||||
|
|
||||||
@ignore_torch_compile
|
@ignore_torch_compile
|
||||||
class SimpleModelWithTwoGraphs(ParentModel):
|
class SimpleModelWithTwoGraphs(ParentModel):
|
||||||
|
def __init__(
|
||||||
def __init__(self,
|
self,
|
||||||
*,
|
*,
|
||||||
mlp_size: int,
|
mlp_size: int,
|
||||||
hidden_size: int,
|
hidden_size: int,
|
||||||
vllm_config: VllmConfig,
|
vllm_config: VllmConfig,
|
||||||
prefix: str = '',
|
prefix: str = "",
|
||||||
**kwargs) -> None:
|
**kwargs,
|
||||||
|
) -> None:
|
||||||
super().__init__(vllm_config=vllm_config, prefix=prefix)
|
super().__init__(vllm_config=vllm_config, prefix=prefix)
|
||||||
# Test will fail without set_model_tag here with error:
|
# Test will fail without set_model_tag here with error:
|
||||||
# "ValueError: too many values to unpack (expected 3)"
|
# "ValueError: too many values to unpack (expected 3)"
|
||||||
@ -142,32 +145,45 @@ class SimpleModelWithTwoGraphs(ParentModel):
|
|||||||
|
|
||||||
|
|
||||||
@torch.inference_mode
|
@torch.inference_mode
|
||||||
def run_model(vllm_config: VllmConfig, model: nn.Module, inputs: torch.Tensor,
|
def run_model(
|
||||||
cudagraph_runtime_mode: CUDAGraphMode):
|
vllm_config: VllmConfig,
|
||||||
|
model: nn.Module,
|
||||||
|
inputs: torch.Tensor,
|
||||||
|
cudagraph_runtime_mode: CUDAGraphMode,
|
||||||
|
):
|
||||||
with set_forward_context({}, vllm_config=vllm_config):
|
with set_forward_context({}, vllm_config=vllm_config):
|
||||||
# warmup for the model with cudagraph_mode NONE
|
# warmup for the model with cudagraph_mode NONE
|
||||||
model(inputs)
|
model(inputs)
|
||||||
|
|
||||||
# simulate cudagraphs capturing
|
# simulate cudagraphs capturing
|
||||||
with set_forward_context({},
|
with set_forward_context(
|
||||||
vllm_config=vllm_config,
|
{},
|
||||||
cudagraph_runtime_mode=cudagraph_runtime_mode,
|
vllm_config=vllm_config,
|
||||||
batch_descriptor=BatchDescriptor(
|
cudagraph_runtime_mode=cudagraph_runtime_mode,
|
||||||
num_tokens=2, )):
|
batch_descriptor=BatchDescriptor(
|
||||||
|
num_tokens=2,
|
||||||
|
),
|
||||||
|
):
|
||||||
model(inputs[:2])
|
model(inputs[:2])
|
||||||
with set_forward_context({},
|
with set_forward_context(
|
||||||
vllm_config=vllm_config,
|
{},
|
||||||
cudagraph_runtime_mode=cudagraph_runtime_mode,
|
vllm_config=vllm_config,
|
||||||
batch_descriptor=BatchDescriptor(
|
cudagraph_runtime_mode=cudagraph_runtime_mode,
|
||||||
num_tokens=1, )):
|
batch_descriptor=BatchDescriptor(
|
||||||
|
num_tokens=1,
|
||||||
|
),
|
||||||
|
):
|
||||||
model(inputs[:1])
|
model(inputs[:1])
|
||||||
|
|
||||||
# simulate cudagraphs replay
|
# simulate cudagraphs replay
|
||||||
with set_forward_context({},
|
with set_forward_context(
|
||||||
vllm_config=vllm_config,
|
{},
|
||||||
cudagraph_runtime_mode=cudagraph_runtime_mode,
|
vllm_config=vllm_config,
|
||||||
batch_descriptor=BatchDescriptor(
|
cudagraph_runtime_mode=cudagraph_runtime_mode,
|
||||||
num_tokens=2, )):
|
batch_descriptor=BatchDescriptor(
|
||||||
|
num_tokens=2,
|
||||||
|
),
|
||||||
|
):
|
||||||
output = model(inputs[:2])
|
output = model(inputs[:2])
|
||||||
|
|
||||||
output = output.cpu()
|
output = output.cpu()
|
||||||
@ -178,82 +194,104 @@ def test_multi_graph_piecewise_compile_outputs_equal():
|
|||||||
outputs = []
|
outputs = []
|
||||||
|
|
||||||
# piecewise compile
|
# piecewise compile
|
||||||
vllm_config = VllmConfig(compilation_config=CompilationConfig(
|
vllm_config = VllmConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
compilation_config=CompilationConfig(
|
||||||
use_cudagraph=True,
|
level=CompilationLevel.PIECEWISE,
|
||||||
splitting_ops=["silly.attention"],
|
use_cudagraph=True,
|
||||||
cudagraph_capture_sizes=[1, 2],
|
splitting_ops=["silly.attention"],
|
||||||
))
|
cudagraph_capture_sizes=[1, 2],
|
||||||
|
)
|
||||||
|
)
|
||||||
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
||||||
|
|
||||||
with set_current_vllm_config(vllm_config):
|
with set_current_vllm_config(vllm_config):
|
||||||
model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE,
|
model = (
|
||||||
hidden_size=HIDDEN_SIZE,
|
SimpleModelWithTwoGraphs(
|
||||||
vllm_config=vllm_config,
|
mlp_size=MLP_SIZE,
|
||||||
prefix='').eval().cuda()
|
hidden_size=HIDDEN_SIZE,
|
||||||
|
vllm_config=vllm_config,
|
||||||
|
prefix="",
|
||||||
|
)
|
||||||
|
.eval()
|
||||||
|
.cuda()
|
||||||
|
)
|
||||||
|
|
||||||
# Pre-allocate memory for CUDAGraph which expects
|
# Pre-allocate memory for CUDAGraph which expects
|
||||||
# static tensor addresses
|
# static tensor addresses
|
||||||
inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda()
|
inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda()
|
||||||
|
|
||||||
with compilation_counter.expect(
|
with compilation_counter.expect(
|
||||||
num_graphs_seen=2, # two graphs for the model
|
num_graphs_seen=2, # two graphs for the model
|
||||||
num_piecewise_graphs_seen=6,
|
num_piecewise_graphs_seen=6,
|
||||||
# attn_one, attn_two each has 3 piecewise graphs
|
# attn_one, attn_two each has 3 piecewise graphs
|
||||||
# (pre attn, post attn, silly_attention) each
|
# (pre attn, post attn, silly_attention) each
|
||||||
num_piecewise_capturable_graphs_seen=4,
|
num_piecewise_capturable_graphs_seen=4,
|
||||||
# attn_one, attn_two has pre attn and post attn each, total=4
|
# attn_one, attn_two has pre attn and post attn each, total=4
|
||||||
num_backend_compilations=4, # num_piecewise_capturable_graphs_seen
|
num_backend_compilations=4, # num_piecewise_capturable_graphs_seen
|
||||||
num_cudagraph_captured=8,
|
num_cudagraph_captured=8,
|
||||||
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||||
):
|
):
|
||||||
outputs.append(
|
outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
|
||||||
run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
|
|
||||||
|
|
||||||
# no compile or cudagraph
|
# no compile or cudagraph
|
||||||
vllm_config = VllmConfig(compilation_config=CompilationConfig(
|
vllm_config = VllmConfig(
|
||||||
level=CompilationLevel.NO_COMPILATION, ))
|
compilation_config=CompilationConfig(
|
||||||
|
level=CompilationLevel.NO_COMPILATION,
|
||||||
|
)
|
||||||
|
)
|
||||||
cudagraph_runtime_mode = CUDAGraphMode.NONE
|
cudagraph_runtime_mode = CUDAGraphMode.NONE
|
||||||
|
|
||||||
with set_current_vllm_config(vllm_config):
|
with set_current_vllm_config(vllm_config):
|
||||||
model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE,
|
model = (
|
||||||
hidden_size=HIDDEN_SIZE,
|
SimpleModelWithTwoGraphs(
|
||||||
vllm_config=vllm_config,
|
mlp_size=MLP_SIZE,
|
||||||
prefix='').eval().cuda()
|
hidden_size=HIDDEN_SIZE,
|
||||||
|
vllm_config=vllm_config,
|
||||||
|
prefix="",
|
||||||
|
)
|
||||||
|
.eval()
|
||||||
|
.cuda()
|
||||||
|
)
|
||||||
|
|
||||||
with compilation_counter.expect(
|
with compilation_counter.expect(
|
||||||
num_graphs_seen=0,
|
num_graphs_seen=0,
|
||||||
num_piecewise_graphs_seen=0,
|
num_piecewise_graphs_seen=0,
|
||||||
num_piecewise_capturable_graphs_seen=0,
|
num_piecewise_capturable_graphs_seen=0,
|
||||||
num_backend_compilations=0,
|
num_backend_compilations=0,
|
||||||
num_cudagraph_captured=0,
|
num_cudagraph_captured=0,
|
||||||
):
|
):
|
||||||
outputs.append(
|
outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
|
||||||
run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
|
|
||||||
|
|
||||||
     # piecewise compile without CUDA graph
-    vllm_config = VllmConfig(compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
-        use_cudagraph=False,
-        splitting_ops=["silly.attention"],
-    ))
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(
+            level=CompilationLevel.PIECEWISE,
+            use_cudagraph=False,
+            splitting_ops=["silly.attention"],
+        )
+    )
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE

     with set_current_vllm_config(vllm_config):
-        model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE,
-                                         hidden_size=HIDDEN_SIZE,
-                                         vllm_config=vllm_config,
-                                         prefix='').eval().cuda()
+        model = (
+            SimpleModelWithTwoGraphs(
+                mlp_size=MLP_SIZE,
+                hidden_size=HIDDEN_SIZE,
+                vllm_config=vllm_config,
+                prefix="",
+            )
+            .eval()
+            .cuda()
+        )

         with compilation_counter.expect(
             num_graphs_seen=2,
             num_piecewise_graphs_seen=6,
             num_piecewise_capturable_graphs_seen=4,
             num_backend_compilations=4,
             num_cudagraph_captured=0,  # no cudagraph captured
         ):
-            outputs.append(
-                run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
+            outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))

     # Generally don't expect outputs with and without inductor
     # to be bitwise equivalent
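Note: the actual output comparison is outside this hunk. A minimal, hypothetical sketch of a tolerance-based check in the spirit of the comment above (stand-in tensors, not the test's real outputs):

import torch

# Stand-ins for an eager-mode output and a compiled (inductor) output that
# differ only by small numerical noise.
eager_out = torch.randn(8, 128)
compiled_out = eager_out + 1e-5 * torch.randn_like(eager_out)

# Compare with a tolerance instead of exact bitwise equality.
torch.testing.assert_close(compiled_out, eager_out, rtol=1e-3, atol=1e-3)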
@@ -11,8 +11,13 @@ from torch import nn

 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
-                         VllmConfig, set_current_vllm_config)
+from vllm.config import (
+    CompilationConfig,
+    CompilationLevel,
+    CUDAGraphMode,
+    VllmConfig,
+    set_current_vllm_config,
+)
 from vllm.envs import VLLM_USE_V1
 from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.utils import is_torch_equal_or_newer
@@ -23,12 +28,7 @@ from ..silly_attention import get_global_counter, reset_global_counter

 @support_torch_compile
 class SillyModel(nn.Module):
-
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = '',
-                 **kwargs) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None:
         super().__init__()

     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -60,53 +60,65 @@ def _run_simple_model(
     expected_num_backend_compilations,
     expected_num_cudagraph_captured,
 ):
-    vllm_config = VllmConfig(compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
-        use_cudagraph=True,
-        use_inductor=use_inductor,
-        splitting_ops=splitting_ops,
-        use_inductor_graph_partition=use_inductor_graph_partition,
-        cudagraph_copy_inputs=True,
-        cudagraph_capture_sizes=[1, 2],
-    ))
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(
+            level=CompilationLevel.PIECEWISE,
+            use_cudagraph=True,
+            use_inductor=use_inductor,
+            splitting_ops=splitting_ops,
+            use_inductor_graph_partition=use_inductor_graph_partition,
+            cudagraph_copy_inputs=True,
+            cudagraph_capture_sizes=[1, 2],
+        )
+    )
     with set_current_vllm_config(vllm_config):
-        model = SillyModel(vllm_config=vllm_config, prefix='')
+        model = SillyModel(vllm_config=vllm_config, prefix="")

     inputs = torch.randn(100).cuda()

-    with compilation_counter.expect(
+    with (
+        compilation_counter.expect(
             num_graphs_seen=1,  # one graph for the model
             num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
-            num_piecewise_capturable_graphs_seen=
-            expected_num_piecewise_capturable_graphs_seen,
+            num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
             num_backend_compilations=expected_num_backend_compilations,
             num_cudagraph_captured=expected_num_cudagraph_captured,
-    ), set_forward_context(None,
-                           vllm_config=vllm_config):  # background context
+        ),
+        set_forward_context(None, vllm_config=vllm_config),
+    ):  # background context
         # warm up with background context
         model(inputs)

     # capturing/replaying should under context of cudagraph dispatching
     with set_forward_context(
         None,
         vllm_config=vllm_config,
         cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
-        batch_descriptor=BatchDescriptor(num_tokens=2, )):
+        batch_descriptor=BatchDescriptor(
+            num_tokens=2,
+        ),
+    ):
         model(torch.randn(2).cuda())
     with set_forward_context(
         None,
         vllm_config=vllm_config,
         cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
-        batch_descriptor=BatchDescriptor(num_tokens=1, )):
+        batch_descriptor=BatchDescriptor(
+            num_tokens=1,
+        ),
+    ):
         model(torch.randn(1).cuda())

     input = torch.zeros(2).cuda()
     reset_global_counter()
     with set_forward_context(
         None,
         vllm_config=vllm_config,
         cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
-        batch_descriptor=BatchDescriptor(num_tokens=2, )):
+        batch_descriptor=BatchDescriptor(
+            num_tokens=2,
+        ),
+    ):
         output = model(input)
     assert get_global_counter() == 2
     assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
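Note: the reformatted code groups the two context managers in a parenthesized with statement. A minimal standalone sketch of that form, using contextlib stand-ins rather than the vLLM objects:

from contextlib import contextmanager

@contextmanager
def fake_counter_check():
    # the real compilation_counter.expect() verifies counter deltas on exit
    yield

@contextmanager
def fake_forward_context():
    yield

# Parenthesized multi-context-manager form, officially supported since Python 3.10;
# equivalent to: with fake_counter_check(), fake_forward_context(): ...
with (
    fake_counter_check(),
    fake_forward_context(),
):
    pass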
@@ -120,12 +132,14 @@ def test_simple_piecewise_compile(use_inductor):
         splitting_ops=["silly.attention"],
         use_inductor_graph_partition=False,
         use_inductor=use_inductor,
-        expected_num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
-        expected_num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
-        expected_num_backend_compilations=
-        3,  # num_piecewise_capturable_graphs_seen
-        expected_num_cudagraph_captured=
-        6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        # 2 * num_layers + 1
+        expected_num_piecewise_graphs_seen=5,
+        # 1 + num_layers
+        expected_num_piecewise_capturable_graphs_seen=3,
+        # num_piecewise_capturable_graphs_seen
+        expected_num_backend_compilations=3,
+        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        expected_num_cudagraph_captured=6,
     )


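Note: the expected counts above follow from the layer count. A quick sketch of the arithmetic, assuming num_layers = 2 and the cudagraph_capture_sizes=[1, 2] configured in _run_simple_model (variable names are illustrative):

num_layers = 2
num_cudagraph_sizes = 2                      # len([1, 2])

piecewise_graphs = 2 * num_layers + 1        # 5, split at each attention op
capturable_graphs = 1 + num_layers           # 3, the attention-free pieces
backend_compilations = capturable_graphs     # 3
cudagraph_captured = num_cudagraph_sizes * capturable_graphs  # 6

assert (piecewise_graphs, capturable_graphs,
        backend_compilations, cudagraph_captured) == (5, 3, 3, 6)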
@@ -134,22 +148,19 @@ def test_simple_piecewise_compile(use_inductor):
 def test_simple_inductor_graph_partition(splitting_ops):
     assert VLLM_USE_V1
     if not is_torch_equal_or_newer("2.9.0.dev"):
-        pytest.skip("inductor graph partition is only available "
-                    "in PyTorch 2.9+")
+        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

     _run_simple_model(
-        # inductor graph partition automatically resets splitting_ops
-        # to be an empty list
+        # Inductor graph partition automatically resets splitting_ops to an empty list
         splitting_ops=splitting_ops,
         use_inductor_graph_partition=True,
         use_inductor=True,
-        expected_num_piecewise_graphs_seen=
-        1,  # since not splitting at fx graph level
-        expected_num_piecewise_capturable_graphs_seen=
-        1,  # since not splitting at fx graph level
-        expected_num_backend_compilations=
-        1,  # since not splitting at fx graph level
-        expected_num_cudagraph_captured=
-        6,  # inductor graph partition still captures 6
-        # graph, same as fx graph partition.
+        # Since not splitting at fx graph level
+        expected_num_piecewise_graphs_seen=1,
+        # Since not splitting at fx graph level
+        expected_num_piecewise_capturable_graphs_seen=1,
+        # Since not splitting at fx graph level
+        expected_num_backend_compilations=1,
+        # Inductor graph partition still captures 6 graph, same as fx graph partition
+        expected_num_cudagraph_captured=6,
     )
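Note: with inductor graph partition the fx level sees a single graph, yet the cudagraph capture count matches the fx-split case. A small sketch of that relationship (the per-partition count of 3 is an inference from 6 captures with 2 capture sizes, not stated in the diff):

num_cudagraph_sizes = 2                 # cudagraph_capture_sizes=[1, 2]
fx_level_piecewise_graphs = 1           # no splitting at the fx graph level
inductor_capturable_partitions = 3      # inferred: 6 captures / 2 sizes

assert num_cudagraph_sizes * inductor_capturable_partitions == 6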
Some files were not shown because too many files have changed in this diff.