Compare commits

..

4 Commits

Author SHA1 Message Date
6e1e31a66a add code owner
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-17 22:19:17 -07:00
50e80db4ef add mixin
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-17 22:15:10 -07:00
d3d6afb355 add mixin
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-17 22:12:41 -07:00
808fa43d76 add mixin
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-17 22:02:15 -07:00
1914 changed files with 145656 additions and 180483 deletions

View File

@ -368,7 +368,7 @@ if __name__ == "__main__":
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...", # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
# we want to turn it into "8xGPUTYPE" # we want to turn it into "8xGPUTYPE"
df["GPU"] = df["GPU"].apply( df["GPU"] = df["GPU"].apply(
lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}" lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
) )
# get markdown tables # get markdown tables

View File

@ -181,14 +181,18 @@ launch_vllm_server() {
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="vllm serve $model \ server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \ -tp $tp \
--model $model \
--port $port \ --port $port \
$server_args" $server_args"
else else
echo "Key 'fp8' does not exist in common params." echo "Key 'fp8' does not exist in common params."
server_command="vllm serve $model \ server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \ -tp $tp \
--model $model \
--port $port \ --port $port \
$server_args" $server_args"
fi fi

View File

@ -365,7 +365,8 @@ run_serving_tests() {
continue continue
fi fi
server_command="$server_envs vllm serve \ server_command="$server_envs python3 \
-m vllm.entrypoints.openai.api_server \
$server_args" $server_args"
# run the server # run the server
@ -454,6 +455,11 @@ main() {
fi fi
check_hf_token check_hf_token
# Set to v1 to run v1 benchmark
if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
export VLLM_USE_V1=1
fi
# dependencies # dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq) (which jq) || (apt-get update && apt-get -y install jq)

46
.buildkite/pyproject.toml Normal file
View File

@ -0,0 +1,46 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
[tool.ruff]
line-length = 88
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
"UP",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
"I",
# flake8-logging-format
"G",
]
ignore = [
# star imports
"F405", "F403",
# lambda expression assignment
"E731",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
]
[tool.ruff.format]
docstring-code-format = true

View File

@ -48,7 +48,7 @@ steps:
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/scripts/upload-wheels.sh"
@ -76,7 +76,7 @@ steps:
queue: arm64_cpu_queue_postmerge queue: arm64_cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# Add job to create multi-arch manifest # Add job to create multi-arch manifest
@ -150,16 +150,11 @@ steps:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64" - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64" - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64" - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64" - "docker push vllm/vllm-openai:nightly"
- "docker push vllm/vllm-openai:nightly-x86_64" - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
- "docker push vllm/vllm-openai:nightly-aarch64"
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- "docker manifest push vllm/vllm-openai:nightly"
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
# Clean up old nightly builds (keep only last 14) # Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh" - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
plugins: plugins:
@ -168,4 +163,3 @@ steps:
password-env: DOCKERHUB_TOKEN password-env: DOCKERHUB_TOKEN
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"

View File

@ -8,41 +8,20 @@ set -ex
# DockerHub API endpoint for vllm/vllm-openai repository # DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags" REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
# Get DockerHub credentials from environment # Get DockerHub token from environment
if [ -z "$DOCKERHUB_TOKEN" ]; then if [ -z "$DOCKERHUB_TOKEN" ]; then
echo "Error: DOCKERHUB_TOKEN environment variable is not set" echo "Error: DOCKERHUB_TOKEN environment variable is not set"
exit 1 exit 1
fi fi
if [ -z "$DOCKERHUB_USERNAME" ]; then
echo "Error: DOCKERHUB_USERNAME environment variable is not set"
exit 1
fi
# Get DockerHub bearer token
echo "Getting DockerHub bearer token..."
set +x
BEARER_TOKEN=$(curl -s -X POST \
-H "Content-Type: application/json" \
-d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
"https://hub.docker.com/v2/users/login" | jq -r '.token')
set -x
if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
echo "Error: Failed to get DockerHub bearer token"
exit 1
fi
# Function to get all tags from DockerHub # Function to get all tags from DockerHub
get_all_tags() { get_all_tags() {
local page=1 local page=1
local all_tags="" local all_tags=""
while true; do while true; do
set +x local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
"$REPO_API_URL?page=$page&page_size=100") "$REPO_API_URL?page=$page&page_size=100")
set -x
# Get both last_updated timestamp and tag name, separated by | # Get both last_updated timestamp and tag name, separated by |
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"') local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
@ -64,9 +43,7 @@ delete_tag() {
echo "Deleting tag: $tag_name" echo "Deleting tag: $tag_name"
local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name" local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
set +x local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
set -x
if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')" echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"

View File

@ -86,6 +86,10 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"} commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi fi
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
fi
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"} commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi fi
@ -163,6 +167,12 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
--ignore=entrypoints/llm/test_prompt_validation.py "} --ignore=entrypoints/llm/test_prompt_validation.py "}
fi fi
#Obsolete currently
##ignore certain Entrypoints/llm tests
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
#fi
# --ignore=entrypoints/openai/test_encoder_decoder.py \ # --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \ # --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py # --ignore=entrypoints/openai/test_oot_registration.py

View File

@ -58,8 +58,11 @@ function cpu_tests() {
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
pytest -x -v -s tests/models/language/generation -m cpu_model # Note: disable Bart until supports V1
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
pytest -x -v -s tests/models/language/pooling -m cpu_model pytest -x -v -s tests/models/language/pooling -m cpu_model
pytest -x -v -s tests/models/multimodal/generation \ pytest -x -v -s tests/models/multimodal/generation \

View File

@ -1,191 +0,0 @@
#!/bin/bash
# This script build the Ascend NPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Base ubuntu image with basic ascend development libraries and python installed
VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
TEST_RUN_CONFIG_FILE="vllm_test.cfg"
VLLM_ASCEND_TMP_DIR=
# Fetch the test run configuration file from the vllm-ascend repository.
#
# Shallow-clones ${VLLM_ASCEND_REPO} into a fresh temporary directory, copies
# ${CONFIG_FILE_REMOTE_PATH} from the clone to ${TEST_RUN_CONFIG_FILE} in the
# current directory, and removes the clone again.
# Globals written: VLLM_ASCEND_TMP_DIR.
# Exits the script with status 1 if the file is missing from the clone.
fetch_vllm_test_cfg() {
VLLM_ASCEND_TMP_DIR=$(mktemp -d)
# Remove the temporary clone if the script exits while the configuration
# file is still being retrieved (failed clone, missing file, ...).
cleanup() {
rm -rf "${VLLM_ASCEND_TMP_DIR}"
}
trap cleanup EXIT
# Only the latest commit is needed; GIT_TRACE/-v make clone failures easier to diagnose in CI logs.
GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
exit 1
fi
# If the file already exists locally, it is simply overwritten.
cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
# The EXIT trap is cleared right below (and a different EXIT trap is installed
# later in this script), so it will not run on the success path. Delete the
# temporary clone explicitly before dropping the trap.
rm -rf "${VLLM_ASCEND_TMP_DIR}"
trap - EXIT
}
# Load the test run configuration into the current shell.
#
# This function does not download anything: it expects ${TEST_RUN_CONFIG_FILE}
# to already exist locally, sources it, and echoes BASE_IMAGE_NAME.
# Exits the script with status 1 if the file is missing; otherwise returns 0.
# NOTE(review): BASE_IMAGE_NAME is presumably defined by the sourced file — confirm
# against the vllm-ascend configuration.
get_config() {
if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
exit 1
fi
# Sourced (not executed) so the variables it sets are visible to the rest of the script.
source "${TEST_RUN_CONFIG_FILE}"
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
return 0
}
# Fetch the test run configuration and load it into this shell.
# No explicit status check follows: the script runs under `set -e`, and both
# helpers call `exit 1` themselves on failure, so control only reaches the
# next statement when both have succeeded. (The former `$?` check after
# get_config could never fire, because get_config either exits or returns 0.)
fetch_vllm_test_cfg
get_config
# Image tag and container name are made unique per run: the image is suffixed
# with ${EPOCHSECONDS}, the container with 10 random alphanumeric characters.
# NOTE(review): EPOCHSECONDS is presumably the shell-provided epoch timestamp —
# confirm the agent's bash version supplies it.
image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards;
# awk picks the second-to-last dash-separated field, i.e. agent_idx.
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}"
# Builder name and local build-cache directory are both keyed by agent_idx.
builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p ${builder_cache_dir}
# Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
--cache-to type=local,dest=${builder_cache_dir},mode=max \
--progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME}
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
# Install pytest and modelscope up front so this docker build cache layer stays valid.
# The requirement is quoted: unquoted, the shell parses ">=6.0" as a redirection
# of stdout to a file named "=6.0" and installs pytest without any version bound.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install "pytest>=6.0" modelscope
WORKDIR /workspace/vllm
# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements/common.txt
COPY . .
# Install vLLM
RUN --mount=type=cache,target=/root/.cache/pip \
VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton
# Install vllm-ascend
WORKDIR /workspace
ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
ARG VLLM_ASCEND_TAG=main
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r /workspace/vllm-ascend/requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV VLLM_USE_MODELSCOPE=True
WORKDIR /workspace/vllm-ascend
CMD ["/bin/bash"]
EOF
# Exit hook: best-effort teardown of everything this run created.
# Every step is allowed to fail (`|| true`) so that the remaining steps still
# run and the hook itself never changes the script's outcome under `set -e`.
remove_docker_container() {
    # Force-remove the test container.
    docker rm -f "${container_name}" || true
    # Force-remove the image built for this run.
    docker image rm -f "${image_name}" || true
    # Reclaim whatever unused docker resources are left over.
    docker system prune -f || true
}
trap remove_docker_container EXIT
# Generate the `--device` arguments for docker run from a Buildkite agent name.
#
# The Ascend NPU agent name format is {hostname}-{agent_idx}-{npu_card_num}cards,
# where agent_idx starts from 1. Agent k owns the k-th consecutive group of
# npu_card_num devices, e.g. atlas-a2-001-1-2cards is the 1st agent on host
# atlas-a2-001 with 2 NPU cards and yields:
#   --device /dev/davinci0 --device /dev/davinci1
#
# Arguments:
#   $1 - agent name to parse
# Outputs:
#   The space-separated device arguments on stdout (no trailing newline).
# Returns:
#   0 on success; 1 (with "parse error" on stderr) if the name does not end in
#   "<agent_idx>-<npu_card_num>cards" or agent_idx is less than 1.
parse_and_gen_devices() {
    local input="$1"
    local index cards_num
    if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
        # Force base 10 so zero-padded fields such as "08" are not rejected
        # (or misread) as octal by shell arithmetic.
        index=$((10#${BASH_REMATCH[1]}))
        cards_num=$((10#${BASH_REMATCH[2]}))
    else
        echo "parse error" >&2
        return 1
    fi
    # agent_idx is 1-based; 0 would otherwise produce negative device numbers.
    if (( index < 1 )); then
        echo "parse error" >&2
        return 1
    fi
    local devices=""
    local i dev_idx
    # A C-style for loop is used instead of a standalone `((i++))`, whose exit
    # status is 1 when i is 0 and would abort the script under `set -e` if this
    # function were ever called outside a command substitution.
    for (( i = 0; i < cards_num; i++ )); do
        dev_idx=$(( (index - 1) * cards_num + i ))
        # Prefix a separator only once there is something to separate from,
        # so no leading space needs trimming afterwards.
        devices+="${devices:+ }--device /dev/davinci${dev_idx}"
    done
    # Emit the result for the caller to capture via command substitution.
    printf '%s' "$devices"
}
# Resolve the NPU device arguments for this agent; abort if the agent name cannot be parsed.
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
# The model cache directory is keyed by agent_idx and mounted into the container
# at /root/.cache/modelscope.
model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p ${model_cache_dir}
# ${devices} is intentionally left unquoted: it holds several "--device <path>"
# arguments in one string and must be word-split into separate arguments.
# The image entrypoint is cleared so that the inline bash script below is what runs.
docker run \
${devices} \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v ${model_cache_dir}:/root/.cache/modelscope \
--entrypoint="" \
--name "${container_name}" \
"${image_name}" \
bash -c '
set -e
pytest -v -s tests/e2e/vllm_interface/
'

View File

@ -62,11 +62,12 @@ echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 && python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---" echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1 export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH= export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---" echo "--- Hardware Information ---"
# tpu-info # tpu-info

View File

@ -62,11 +62,12 @@ echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 && python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---" echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1 export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH= export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---" echo "--- Hardware Information ---"
# tpu-info # tpu-info

View File

@ -35,15 +35,16 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests cd tests
pytest -v -s v1/core pytest -v -s v1/core
pytest -v -s v1/engine pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
pytest -v -s v1/test_metrics
pytest -v -s v1/test_serial_utils.py pytest -v -s v1/test_serial_utils.py
pytest -v -s v1/test_utils.py
pytest -v -s v1/test_metrics_reader.py
' '

View File

@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
bench_throughput_exit_code=$? bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite # run server-based benchmarks and upload the result to buildkite
vllm serve meta-llama/Llama-2-7b-chat-hf & python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
server_pid=$! server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

View File

@ -1,59 +0,0 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Setup script for Prime-RL integration tests
# This script prepares the environment for running Prime-RL tests with nightly vLLM
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory
if [ -d "${PRIME_RL_DIR}" ]; then
echo "Removing existing Prime-RL directory..."
rm -rf "${PRIME_RL_DIR}"
fi
# Install UV if not available, then load the environment file written by the
# installer so `uv` is on PATH for the rest of this script.
if ! command -v uv &> /dev/null; then
    echo "Installing UV package manager..."
    curl -LsSf https://astral.sh/uv/install.sh | sh
    # Quoted so a HOME containing spaces or glob characters is not word-split.
    source "${HOME}/.local/bin/env"
fi
# Clone Prime-RL repository at specific branch for reproducible tests
PRIME_RL_BRANCH="integ-vllm-main"
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
cd "${PRIME_RL_DIR}"
echo "Setting up UV project environment..."
export UV_PROJECT_ENVIRONMENT=/usr/local
ln -s /usr/bin/python3 /usr/local/bin/python
# Remove vllm pin from pyproject.toml
echo "Removing vllm pin from pyproject.toml..."
sed -i '/vllm==/d' pyproject.toml
# Sync Prime-RL dependencies
echo "Installing Prime-RL dependencies..."
uv sync --inexact && uv sync --inexact --all-extras
# Verify installation
echo "Verifying installations..."
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
echo "Prime-RL integration test environment setup complete!"
echo "Running Prime-RL integration tests..."
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
uv run pytest -vs tests/integration/test_rl.py -m gpu
echo "Prime-RL integration tests completed!"

View File

@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1 TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048 MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=8.7 EXPECTED_THROUGHPUT=10.0
INPUT_LEN=1800 INPUT_LEN=1800
OUTPUT_LEN=128 OUTPUT_LEN=128

View File

@ -42,7 +42,7 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG" echo "logging to $VLLM_LOG"
echo echo
vllm serve $MODEL \ VLLM_USE_V1=1 vllm serve $MODEL \
--seed 42 \ --seed 42 \
--max-num-seqs $MAX_NUM_SEQS \ --max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \

View File

@ -6,28 +6,24 @@
# to generate the final pipeline yaml file. # to generate the final pipeline yaml file.
# Documentation # Documentation
# label(str): the name of the test. emojis allowed. # label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on the fastcheck pipeline. # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline. # torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
# fast_check_only(bool): run this test on the fastcheck pipeline only # fast_check_only(bool): run this test on fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run. # optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
# command(str): the single command to run for tests. incompatible with commands. # command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for the test. incompatible with command. # commands(list): the list of commands to run for test. incompatbile with command.
# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental] # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200 # gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4. # num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host, # num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
# in this case, commands must be specified. the first command runs on the first host, the second # in this case, commands must be specified. the first command runs on first host, the second
# command runs on the second host. # command runs on the second host.
# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout. # working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB # source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
# When adding a test # When adding a test
# - If the test belongs to an existing group, add it there # - If the test belong to an existing group, add it there
# - If the test is short, add to any existing step # - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step. # - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel. # Note that all steps execute in parallel.
@ -50,28 +46,23 @@ steps:
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/multimodal - tests/async_engine
- tests/utils_
commands:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/test_inputs.py - tests/test_inputs.py
- tests/test_outputs.py - tests/test_outputs.py
- tests/multimodal - tests/multimodal
- tests/utils_
- tests/worker
- tests/standalone_tests/lazy_imports.py - tests/standalone_tests/lazy_imports.py
- tests/transformers_utils - tests/transformers_utils
no_gpu: true
commands: commands:
- python3 standalone_tests/lazy_imports.py - python3 standalone_tests/lazy_imports.py
- pytest -v -s async_engine # AsyncLLMEngine
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py - pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal - pytest -v -s multimodal
- pytest -v -s transformers_utils - pytest -v -s utils_ # Utils
- pytest -v -s worker # Worker
- pytest -v -s transformers_utils # transformers_utils
- label: Python-only Installation Test # 10min - label: Python-only Installation Test # 10min
timeout_in_minutes: 20 timeout_in_minutes: 20
@ -91,12 +82,14 @@ steps:
- vllm/ - vllm/
- tests/basic_correctness/test_basic_correctness - tests/basic_correctness/test_basic_correctness
- tests/basic_correctness/test_cpu_offload - tests/basic_correctness/test_cpu_offload
- tests/basic_correctness/test_preemption
- tests/basic_correctness/test_cumem.py - tests/basic_correctness/test_cumem.py
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py - pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Entrypoints Unit Tests # 5min - label: Entrypoints Unit Tests # 5min
timeout_in_minutes: 10 timeout_in_minutes: 10
@ -121,9 +114,10 @@ steps:
- tests/entrypoints/offline_mode - tests/entrypoints/offline_mode
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration Test (API Server) # 100min - label: Entrypoints Integration Test (API Server) # 100min
timeout_in_minutes: 130 timeout_in_minutes: 130
@ -161,6 +155,7 @@ steps:
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/distributed/ - vllm/distributed/
- vllm/core/
- tests/distributed/test_utils - tests/distributed/test_utils
- tests/distributed/test_pynccl - tests/distributed/test_pynccl
- tests/distributed/test_events - tests/distributed/test_events
@ -168,34 +163,28 @@ steps:
- examples/offline_inference/rlhf.py - examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py - examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed - tests/v1/test_async_llm_dp.py
- tests/v1/test_external_lb_dp.py
- tests/v1/test_internal_lb_dp.py
- tests/v1/test_hybrid_lb_dp.py
- tests/v1/engine/test_engine_core_client.py - tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
commands: commands:
# test with torchrun tp=2 and external_dp=2 # test with tp=2 and external_dp=2
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=2 and pp=2 # test with tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=4 and dp=1
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2, pp=2 and dp=1
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=1 and dp=4 with ep
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2 and dp=2 with ep
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp # test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py - pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py - pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py - pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests # TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests # when we have multiple distributed example tests
- pushd ../examples/offline_inference - pushd ../examples/offline_inference
@ -296,35 +285,23 @@ steps:
- tests/v1 - tests/v1
commands: commands:
# split the test to avoid interference # split the test to avoid interference
- pytest -v -s -m 'not cpu_test' v1/core - pytest -v -s v1/core
- pytest -v -s v1/executor - pytest -v -s v1/executor
- pytest -v -s v1/kv_offload
- pytest -v -s v1/sample - pytest -v -s v1/sample
- pytest -v -s v1/logits_processors - pytest -v -s v1/logits_processors
- pytest -v -s v1/worker - pytest -v -s v1/worker
- pytest -v -s v1/structured_output
- pytest -v -s v1/spec_decode - pytest -v -s v1/spec_decode
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - pytest -v -s v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics - pytest -v -s v1/metrics
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_request.py - pytest -v -s v1/test_metrics_reader.py
# Integration test for streaming correctness (requires special branch). # Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: V1 Test others (CPU) # 5 mins
source_file_dependencies:
- vllm/
- tests/v1
no_gpu: true
commands:
# split the test to avoid interference
- pytest -v -s -m 'cpu_test' v1/core
- pytest -v -s v1/structured_output
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'cpu_test' v1/metrics
- label: Examples Test # 30min - label: Examples Test # 30min
timeout_in_minutes: 45 timeout_in_minutes: 45
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
@ -343,13 +320,12 @@ steps:
- python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0 - python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py - python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py - python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py - python3 offline_inference/basic/score.py
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- label: Platform Tests (CUDA) # 4min - label: Platform Tests (CUDA) # 4min
timeout_in_minutes: 15 timeout_in_minutes: 15
@ -398,8 +374,9 @@ steps:
- pytest -v -s compile/test_pass_manager.py - pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py - pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_fusion_attn.py - pytest -v -s compile/test_fusion_attn.py
- pytest -v -s compile/test_functionalization.py
- pytest -v -s compile/test_silu_mul_quant_fusion.py - pytest -v -s compile/test_silu_mul_quant_fusion.py
- pytest -v -s compile/test_sequence_parallelism.py
- pytest -v -s compile/test_async_tp.py
- pytest -v -s compile/test_fusion_all_reduce.py - pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py - pytest -v -s compile/test_decorator.py
- pytest -v -s compile/test_noop_elimination.py - pytest -v -s compile/test_noop_elimination.py
@ -431,9 +408,8 @@ steps:
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- tests/kernels/core - tests/kernels/core
- tests/kernels/test_top_k_per_row.py
commands: commands:
- pytest -v -s kernels/core kernels/test_top_k_per_row.py - pytest -v -s kernels/core
- label: Kernels Attention Test %N # 23min - label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35 timeout_in_minutes: 35
@ -477,23 +453,33 @@ steps:
source_file_dependencies: source_file_dependencies:
- csrc/mamba/ - csrc/mamba/
- tests/kernels/mamba - tests/kernels/mamba
- vllm/model_executor/layers/mamba/ops
commands: commands:
- pytest -v -s kernels/mamba - pytest -v -s kernels/mamba
- label: Model Executor Test # 23min - label: Tensorizer Test # 14min
timeout_in_minutes: 35 timeout_in_minutes: 25
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/model_executor - vllm/model_executor/model_loader
- tests/model_executor - tests/tensorizer_loader
- tests/entrypoints/openai/test_tensorizer_entrypoint.py - tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands: commands:
- apt-get update && apt-get install -y curl libsodium23 - apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor - pytest -v -s tensorizer_loader
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
- label: Model Executor Test # 7min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor
- tests/model_executor
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor
- label: Benchmarks # 11min - label: Benchmarks # 11min
timeout_in_minutes: 20 timeout_in_minutes: 20
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
@ -527,7 +513,7 @@ steps:
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved # we can only upgrade after this is resolved
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min - label: LM Eval Small Models # 53min
timeout_in_minutes: 75 timeout_in_minutes: 75
@ -555,17 +541,10 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/tool_use - tests/tool_use
- tests/mistral_tool_use
commands: commands:
- pytest -v -s -m 'not cpu_test' tool_use - pytest -v -s tool_use
- pytest -v -s mistral_tool_use
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/tool_use
no_gpu: true
commands:
- pytest -v -s -m 'cpu_test' tool_use
##### models test ##### ##### models test #####
@ -605,19 +584,13 @@ steps:
- vllm/ - vllm/
- tests/models/test_transformers.py - tests/models/test_transformers.py
- tests/models/test_registry.py - tests/models/test_registry.py
commands:
- pytest -v -s models/test_transformers.py models/test_registry.py
- label: Basic Models Test (Other CPU) # 5min
timeout_in_minutes: 10
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/test_utils.py - tests/models/test_utils.py
- tests/models/test_vision.py - tests/models/test_vision.py
no_gpu: true
commands: commands:
- pytest -v -s models/test_utils.py models/test_vision.py - pytest -v -s models/test_transformers.py \
models/test_registry.py \
models/test_utils.py \
models/test_vision.py
- label: Language Models Tests (Standard) - label: Language Models Tests (Standard)
timeout_in_minutes: 25 timeout_in_minutes: 25
@ -787,13 +760,11 @@ steps:
commands: commands:
- pip install --upgrade git+https://github.com/huggingface/transformers - pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py - pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py - pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/audio_language.py --model-type whisper
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Blackwell Test # 38 min - label: Blackwell Test # 38 min
timeout_in_minutes: 60 timeout_in_minutes: 60
@ -828,20 +799,18 @@ steps:
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
# Fusion # Fusion
- pytest -v -s tests/compile/test_fusion_all_reduce.py - pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
- pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- label: Blackwell GPT-OSS Eval - label: GPT-OSS Eval (Blackwell)
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
gpu: b200 gpu: b200
optional: true # run on nightlies optional: true # disable while debugging
source_file_dependencies: source_file_dependencies:
- tests/evals/gpt_oss - tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py - vllm/model_executor/models/gpt_oss.py
@ -849,34 +818,7 @@ steps:
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
commands: commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5' - uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
- label: Blackwell Quantized MoE Test
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
source_file_dependencies:
- tests/quantization/test_blackwell_moe.py
- vllm/model_executor/models/deepseek_v2.py
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/models/llama4.py
- vllm/model_executor/layers/fused_moe
- vllm/model_executor/layers/quantization/compressed_tensors
- vllm/model_executor/layers/quantization/modelopt.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- pytest -s -v tests/quantization/test_blackwell_moe.py
- label: Blackwell LM Eval Small Models
timeout_in_minutes: 120
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
##### 1 GPU test ##### ##### 1 GPU test #####
##### multi gpus test ##### ##### multi gpus test #####
@ -920,58 +862,47 @@ steps:
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min - label: Distributed Tests (2 GPUs) # 110min
timeout_in_minutes: 90 timeout_in_minutes: 150
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- vllm/compilation/
- vllm/distributed/ - vllm/distributed/
- vllm/engine/ - vllm/engine/
- vllm/executor/ - vllm/executor/
- vllm/worker/worker_base.py - vllm/model_executor/models/
- vllm/v1/engine/
- vllm/v1/worker/
- tests/compile/test_basic_correctness.py
- tests/compile/test_wrapper.py
- tests/distributed/ - tests/distributed/
- tests/entrypoints/llm/test_collective_rpc.py - vllm/compilation
- tests/v1/distributed - vllm/worker/worker_base.py
- vllm/worker/worker.py
- vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py
- tests/v1/test_async_llm_dp.py
- tests/v1/test_external_lb_dp.py
- tests/v1/entrypoints/openai/test_multi_api_servers.py - tests/v1/entrypoints/openai/test_multi_api_servers.py
- tests/v1/shutdown - vllm/v1/engine/
- tests/v1/worker/test_worker_memory_snapshot.py
commands: commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py - pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- pytest -v -s distributed/test_sequence_parallel.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
- label: Distributed Model Tests (2 GPUs) # 37min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/model_executor/model_loader/sharded_state_loader.py
- vllm/model_executor/models/
- tests/basic_correctness/
- tests/model_executor/model_loader/test_sharded_state_loader.py
- tests/models/
commands:
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
# Avoid importing model tests that cause CUDA reinitialization error # Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)' - pytest models/language -v -s -m 'distributed(num_gpus=2)'
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
# TODO: investigate and fix
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s models/multimodal/generation/test_maverick.py
- label: Plugin Tests (2 GPUs) # 40min - label: Plugin Tests (2 GPUs) # 40min
timeout_in_minutes: 60 timeout_in_minutes: 60
@ -1094,8 +1025,6 @@ steps:
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_gpus: 2 num_gpus: 2
commands: commands:
- pytest -v -s tests/compile/test_async_tp.py
- pytest -v -s tests/compile/test_sequence_parallelism.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
@ -1107,16 +1036,3 @@ steps:
num_gpus: 2 num_gpus: 2
commands: commands:
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
timeout_in_minutes: 30
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh

36
.github/CODEOWNERS vendored
View File

@ -4,14 +4,19 @@
# This list covers the "core" components of vLLM that require careful review # This list covers the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson /vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/fused_moe @mgoin /vllm/model_executor/layers/fused_moe @mgoin
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep /vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn /vllm/model_executor/model_loader @22quinn
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
/vllm/v1/attention @LucasWilkinson
/vllm/v1/sample @22quinn @houseroad
/vllm/vllm_flash_attn @LucasWilkinson /vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee /vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang /vllm/reasoning @aarnphm @chaunceyjiang
@ -23,22 +28,20 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig can have a large user-facing impact, # Any change to the VllmConfig can have a large user-facing impact,
# so spam a lot of people # so spam a lot of people
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg /vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1 # vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/attention @LucasWilkinson /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/attention/backends/flashinfer.py @mgoin /vllm/v1/attention/backends/flashinfer.py @mgoin
/vllm/v1/attention/backends/triton_attn.py @tdoublep /vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/kv_cache_interface.py @heheda12345 /vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/worker/kv_cache_initializer_mixin.py @heheda12345
/vllm/v1/offloading @ApostaC /vllm/v1/offloading @ApostaC
# Test ownership # Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo /.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
/tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao /tests/distributed/test_same_node.py @youkaichao
@ -47,6 +50,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96 /tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
/tests/prefix_caching @comaniac @KuntaiDu
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/test_inputs.py @DarkLight1337 @ywang96 /tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
@ -55,35 +59,23 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/weight_loading @mgoin @youkaichao @yewentao256 /tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee /tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep /tests/models/language/generation/test_hybrid.py @tdoublep
/tests/v1/kv_connector/nixl_integration @NickLucche /tests/v1/kv_connector/nixl_integration @NickLucche
/tests/v1/kv_connector @ApostaC /tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC /tests/v1/offloading @ApostaC
# Transformers backend
/vllm/model_executor/models/transformers.py @hmellor
/tests/models/test_transformers.py @hmellor
# Docs # Docs
/docs/mkdocs @hmellor /docs @hmellor
/docs/**/*.yml @hmellor
/requirements/docs.txt @hmellor
.readthedocs.yaml @hmellor
mkdocs.yaml @hmellor mkdocs.yaml @hmellor
# Linting
.markdownlint.yaml @hmellor
.pre-commit-config.yaml @hmellor
/tools/pre_commit @hmellor
# CPU # CPU
/vllm/v1/worker/cpu* @bigPYJ1151 /vllm/v1/worker/^cpu @bigPYJ1151
/csrc/cpu @bigPYJ1151 /csrc/cpu @bigPYJ1151
/vllm/platforms/cpu.py @bigPYJ1151 /vllm/platforms/cpu.py @bigPYJ1151
/cmake/cpu_extension.cmake @bigPYJ1151 /cmake/cpu_extension.cmake @bigPYJ1151
/docker/Dockerfile.cpu @bigPYJ1151 /docker/Dockerfile.cpu @bigPYJ1151
# Intel GPU # Intel GPU
/vllm/v1/worker/xpu* @jikunshang /vllm/v1/worker/^xpu @jikunshang
/vllm/platforms/xpu.py @jikunshang /vllm/platforms/xpu.py @jikunshang
/docker/Dockerfile.xpu @jikunshang /docker/Dockerfile.xpu @jikunshang

View File

@ -43,6 +43,10 @@ body:
Any other things you would like to mention. Any other things you would like to mention.
validations: validations:
required: false required: false
- type: markdown
attributes:
value: >
Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
- type: checkboxes - type: checkboxes
id: askllm id: askllm
attributes: attributes:

54
.github/mergify.yml vendored
View File

@ -2,7 +2,6 @@ pull_request_rules:
- name: label-documentation - name: label-documentation
description: Automatically apply documentation label description: Automatically apply documentation label
conditions: conditions:
- label != stale
- or: - or:
- files~=^[^/]+\.md$ - files~=^[^/]+\.md$
- files~=^docs/ - files~=^docs/
@ -11,13 +10,10 @@ pull_request_rules:
label: label:
add: add:
- documentation - documentation
comment:
message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
- name: label-ci-build - name: label-ci-build
description: Automatically apply ci/build label description: Automatically apply ci/build label
conditions: conditions:
- label != stale
- or: - or:
- files~=^\.github/ - files~=^\.github/
- files~=\.buildkite/ - files~=\.buildkite/
@ -34,7 +30,6 @@ pull_request_rules:
- name: label-deepseek - name: label-deepseek
description: Automatically apply deepseek label description: Automatically apply deepseek label
conditions: conditions:
- label != stale
- or: - or:
- files~=^examples/.*deepseek.*\.py - files~=^examples/.*deepseek.*\.py
- files~=^tests/.*deepseek.*\.py - files~=^tests/.*deepseek.*\.py
@ -51,7 +46,6 @@ pull_request_rules:
- name: label-frontend - name: label-frontend
description: Automatically apply frontend label description: Automatically apply frontend label
conditions: conditions:
- label != stale
- files~=^vllm/entrypoints/ - files~=^vllm/entrypoints/
actions: actions:
label: label:
@ -61,7 +55,6 @@ pull_request_rules:
- name: label-llama - name: label-llama
description: Automatically apply llama label description: Automatically apply llama label
conditions: conditions:
- label != stale
- or: - or:
- files~=^examples/.*llama.*\.py - files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py - files~=^tests/.*llama.*\.py
@ -77,7 +70,6 @@ pull_request_rules:
- name: label-multi-modality - name: label-multi-modality
description: Automatically apply multi-modality label description: Automatically apply multi-modality label
conditions: conditions:
- label != stale
- or: - or:
- files~=^vllm/multimodal/ - files~=^vllm/multimodal/
- files~=^tests/multimodal/ - files~=^tests/multimodal/
@ -91,7 +83,6 @@ pull_request_rules:
- name: label-new-model - name: label-new-model
description: Automatically apply new-model label description: Automatically apply new-model label
conditions: conditions:
- label != stale
- and: - and:
- files~=^vllm/model_executor/models/ - files~=^vllm/model_executor/models/
- files=vllm/model_executor/models/registry.py - files=vllm/model_executor/models/registry.py
@ -103,7 +94,6 @@ pull_request_rules:
- name: label-performance - name: label-performance
description: Automatically apply performance label description: Automatically apply performance label
conditions: conditions:
- label != stale
- or: - or:
- files~=^benchmarks/ - files~=^benchmarks/
- files~=^vllm/benchmarks/ - files~=^vllm/benchmarks/
@ -117,7 +107,6 @@ pull_request_rules:
- name: label-qwen - name: label-qwen
description: Automatically apply qwen label description: Automatically apply qwen label
conditions: conditions:
- label != stale
- or: - or:
- files~=^examples/.*qwen.*\.py - files~=^examples/.*qwen.*\.py
- files~=^tests/.*qwen.*\.py - files~=^tests/.*qwen.*\.py
@ -132,7 +121,6 @@ pull_request_rules:
- name: label-gpt-oss - name: label-gpt-oss
description: Automatically apply gpt-oss label description: Automatically apply gpt-oss label
conditions: conditions:
- label != stale
- or: - or:
- files~=^examples/.*gpt[-_]?oss.*\.py - files~=^examples/.*gpt[-_]?oss.*\.py
- files~=^tests/.*gpt[-_]?oss.*\.py - files~=^tests/.*gpt[-_]?oss.*\.py
@ -154,7 +142,6 @@ pull_request_rules:
- name: label-rocm - name: label-rocm
description: Automatically apply rocm label description: Automatically apply rocm label
conditions: conditions:
- label != stale
- or: - or:
- files~=^csrc/rocm/ - files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm - files~=^docker/Dockerfile.rocm
@ -175,7 +162,6 @@ pull_request_rules:
- name: label-structured-output - name: label-structured-output
description: Automatically apply structured-output label description: Automatically apply structured-output label
conditions: conditions:
- label != stale
- or: - or:
- files~=^benchmarks/structured_schemas/ - files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py - files=benchmarks/benchmark_serving_structured_output.py
@ -185,7 +171,7 @@ pull_request_rules:
- files=examples/online_serving/openai_chat_completion_structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^tests/v1/structured_output/ - files~=^tests/v1/structured_output/
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py - files=tests/v1/entrypoints/llm/test_guided_generate.py
- files~=^vllm/v1/structured_output/ - files~=^vllm/v1/structured_output/
actions: actions:
label: label:
@ -195,7 +181,6 @@ pull_request_rules:
- name: label-speculative-decoding - name: label-speculative-decoding
description: Automatically apply speculative-decoding label description: Automatically apply speculative-decoding label
conditions: conditions:
- label != stale
- or: - or:
- files~=^vllm/v1/spec_decode/ - files~=^vllm/v1/spec_decode/
- files~=^tests/v1/spec_decode/ - files~=^tests/v1/spec_decode/
@ -211,7 +196,6 @@ pull_request_rules:
- name: label-v1 - name: label-v1
description: Automatically apply v1 label description: Automatically apply v1 label
conditions: conditions:
- label != stale
- or: - or:
- files~=^vllm/v1/ - files~=^vllm/v1/
- files~=^tests/v1/ - files~=^tests/v1/
@ -224,7 +208,6 @@ pull_request_rules:
description: Automatically apply tpu label description: Automatically apply tpu label
# Keep this list in sync with `label-tpu-remove` conditions # Keep this list in sync with `label-tpu-remove` conditions
conditions: conditions:
- label != stale
- or: - or:
- files~=tpu.py - files~=tpu.py
- files~=_tpu - files~=_tpu
@ -240,7 +223,6 @@ pull_request_rules:
description: Automatically remove tpu label description: Automatically remove tpu label
# Keep this list in sync with `label-tpu` conditions # Keep this list in sync with `label-tpu` conditions
conditions: conditions:
- label != stale
- and: - and:
- -files~=tpu.py - -files~=tpu.py
- -files~=_tpu - -files~=_tpu
@ -255,9 +237,9 @@ pull_request_rules:
- name: label-tool-calling - name: label-tool-calling
description: Automatically add tool-calling label description: Automatically add tool-calling label
conditions: conditions:
- label != stale
- or: - or:
- files~=^tests/tool_use/ - files~=^tests/tool_use/
- files~=^tests/mistral_tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/ - files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/ - files~=^vllm/entrypoints/openai/tool_parsers/
@ -274,9 +256,8 @@ pull_request_rules:
- name: ping author on conflicts and add 'needs-rebase' label - name: ping author on conflicts and add 'needs-rebase' label
conditions: conditions:
- label != stale - conflict
- conflict - -closed
- -closed
actions: actions:
label: label:
add: add:
@ -290,12 +271,10 @@ pull_request_rules:
- name: assign reviewer for tensorizer changes - name: assign reviewer for tensorizer changes
conditions: conditions:
- label != stale
- or:
- files~=^vllm/model_executor/model_loader/tensorizer.py - files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
- files~=^tests/model_executor/model_loader/tensorizer_loader/ - files~=^tests/tensorizer_loader/
actions: actions:
assign: assign:
users: users:
@ -303,7 +282,6 @@ pull_request_rules:
- name: assign reviewer for modelopt changes - name: assign reviewer for modelopt changes
conditions: conditions:
- label != stale
- or: - or:
- files~=^vllm/model_executor/layers/quantization/modelopt\.py$ - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
- files~=^vllm/model_executor/layers/quantization/__init__\.py$ - files~=^vllm/model_executor/layers/quantization/__init__\.py$
@ -318,27 +296,9 @@ pull_request_rules:
- name: remove 'needs-rebase' label when conflict is resolved - name: remove 'needs-rebase' label when conflict is resolved
conditions: conditions:
- -conflict - -conflict
- -closed - -closed
actions: actions:
label: label:
remove: remove:
- needs-rebase - needs-rebase
- name: label-kv-connector
description: Automatically apply kv-connector label
conditions:
- label != stale
- or:
- files~=^examples/online_serving/disaggregated[^/]*/.*
- files~=^examples/offline_inference/disaggregated[^/]*/.*
- files~=^examples/others/lmcache/
- files~=^tests/v1/kv_connector/
- files~=^vllm/distributed/kv_transfer/
- title~=(?i)\bP/?D\b
- title~=(?i)NIXL
- title~=(?i)LMCache
actions:
label:
add:
- kv-connector

View File

@ -13,7 +13,7 @@ jobs:
actions: write actions: write
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0 - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
with: with:
# Increasing this value ensures that changes to this workflow # Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months # propagate to all issues and PRs in days rather than months

View File

@ -6,18 +6,30 @@ default_stages:
- manual # Run in CI - manual # Run in CI
exclude: 'vllm/third_party/.*' exclude: 'vllm/third_party/.*'
repos: repos:
- repo: https://github.com/astral-sh/ruff-pre-commit - repo: https://github.com/google/yapf
rev: v0.14.0 rev: v0.43.0
hooks: hooks:
- id: ruff-check - id: yapf
args: [--in-place, --verbose]
# Keep the same list from yapfignore here to avoid yapf failing without any inputs
exclude: '(.buildkite|benchmarks|build|examples)/.*'
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7
hooks:
- id: ruff
args: [--output-format, github, --fix] args: [--output-format, github, --fix]
- id: ruff-format - id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos - repo: https://github.com/crate-ci/typos
rev: v1.38.1 rev: v1.35.5
hooks: hooks:
- id: typos - id: typos
- repo: https://github.com/PyCQA/isort
rev: 6.0.1
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format - repo: https://github.com/pre-commit/mirrors-clang-format
rev: v21.1.2 rev: v20.1.3
hooks: hooks:
- id: clang-format - id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
@ -34,10 +46,10 @@ repos:
hooks: hooks:
- id: actionlint - id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit - repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.9.1 rev: 0.6.17
hooks: hooks:
- id: pip-compile - id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28] args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
files: ^requirements/test\.(in|txt)$ files: ^requirements/test\.(in|txt)$
- repo: local - repo: local
hooks: hooks:
@ -48,32 +60,38 @@ repos:
files: ^requirements/test\.(in|txt)$ files: ^requirements/test\.(in|txt)$
- id: mypy-local - id: mypy-local
name: Run mypy for local Python installation name: Run mypy for local Python installation
entry: python tools/pre_commit/mypy.py 0 "local" entry: tools/mypy.sh 0 "local"
language: python
types: [python]
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
stages: [pre-commit] # Don't run in CI stages: [pre-commit] # Don't run in CI
<<: &mypy_common - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
language: python name: Run mypy for Python 3.9
types_or: [python, pyi] entry: tools/mypy.sh 1 "3.9"
require_serial: true language: python
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic] types: [python]
additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10 name: Run mypy for Python 3.10
entry: python tools/pre_commit/mypy.py 1 "3.10" entry: tools/mypy.sh 1 "3.10"
<<: *mypy_common language: python
types: [python]
additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI stages: [manual] # Only run in CI
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.11 name: Run mypy for Python 3.11
entry: python tools/pre_commit/mypy.py 1 "3.11" entry: tools/mypy.sh 1 "3.11"
<<: *mypy_common language: python
types: [python]
additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI stages: [manual] # Only run in CI
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.12 name: Run mypy for Python 3.12
entry: python tools/pre_commit/mypy.py 1 "3.12" entry: tools/mypy.sh 1 "3.12"
<<: *mypy_common language: python
stages: [manual] # Only run in CI types: [python]
- id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward additional_dependencies: *mypy_deps
name: Run mypy for Python 3.13
entry: python tools/pre_commit/mypy.py 1 "3.13"
<<: *mypy_common
stages: [manual] # Only run in CI stages: [manual] # Only run in CI
- id: shellcheck - id: shellcheck
name: Lint shell scripts name: Lint shell scripts
@ -137,15 +155,18 @@ repos:
additional_dependencies: [regex] additional_dependencies: [regex]
- id: check-pickle-imports - id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports name: Prevent new pickle/cloudpickle imports
entry: python tools/pre_commit/check_pickle_imports.py entry: python tools/check_pickle_imports.py
language: python language: python
types: [python] types: [python]
additional_dependencies: [regex] pass_filenames: false
additional_dependencies: [pathspec, regex]
- id: validate-config - id: validate-config
name: Validate configuration has default values and that each field has a docstring name: Validate configuration has default values and that each field has a docstring
entry: python tools/validate_config.py entry: python tools/validate_config.py
language: python language: python
additional_dependencies: [regex] types: [python]
pass_filenames: true
files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
# Keep `suggestion` last # Keep `suggestion` last
- id: suggestion - id: suggestion
name: Suggestion name: Suggestion

View File

@ -13,7 +13,6 @@ build:
mkdocs: mkdocs:
configuration: mkdocs.yaml configuration: mkdocs.yaml
fail_on_warning: true
# Optionally declare the Python requirements required to build your docs # Optionally declare the Python requirements required to build your docs
python: python:

View File

@ -34,10 +34,10 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Supported python versions. These versions will be searched in order, the # Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py. # first match will be selected. These should be kept in sync with setup.py.
# #
set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13") set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures. # Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151") set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
# #
# Supported/expected torch versions for CUDA/ROCm. # Supported/expected torch versions for CUDA/ROCm.
@ -86,9 +86,6 @@ find_package(Torch REQUIRED)
# Supported NVIDIA architectures. # Supported NVIDIA architectures.
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
else() else()
@ -178,15 +175,6 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif() endif()
#
# Set compression mode for CUDA >=13.x.
#
if(VLLM_GPU_LANG STREQUAL "CUDA" AND
DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
endif()
# #
# Set CUDA include flags for CXX compiler. # Set CUDA include flags for CXX compiler.
# #
@ -269,8 +257,8 @@ set(VLLM_EXT_SRC
"csrc/sampler.cu" "csrc/sampler.cu"
"csrc/cuda_view.cu" "csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/w8a8/int8/scaled_quant.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/w8a8/fp8/common.cu" "csrc/quantization/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu" "csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/activation_kernels.cu" "csrc/quantization/activation_kernels.cu"
@ -282,7 +270,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use") set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -314,13 +302,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC list(APPEND VLLM_EXT_SRC
"csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu"
"csrc/permute_cols.cu" "csrc/permute_cols.cu"
"csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu" "csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp" "csrc/cutlass_extensions/common.cpp"
"csrc/quantization/w8a8/fp8/per_token_group_quant.cu" "csrc/quantization/fp8/per_token_group_quant.cu")
"csrc/quantization/w8a8/int8/per_token_group_quant.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}" SRCS "${VLLM_EXT_SRC}"
@ -424,11 +412,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu") "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}") CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -452,16 +440,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.8 or later # CUDA 12.8 or later
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
) )
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
@ -486,16 +470,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x) # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
# require CUDA 12.8 or later # require CUDA 12.8 or later
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
) )
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
@ -526,7 +506,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# subtract out the archs that are already built for 3x # subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS) if (SCALED_MM_2X_ARCHS)
set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu") set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}") CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@ -570,11 +550,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
# CUDA 12.8 or later # CUDA 12.8 or later
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu" "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@ -593,11 +569,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
# FP4 Archs and flags # FP4 Archs and flags
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu" "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@ -619,11 +591,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
# CUTLASS MLA Archs and flags # CUTLASS MLA Archs and flags
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
set(SRCS set(SRCS
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu") "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
@ -649,7 +617,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# if it's possible to compile MoE kernels that use its output. # if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu") set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}") CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -667,13 +635,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
endif() endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu") set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}") CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -692,13 +656,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
# moe_data.cu is used by all CUTLASS MoE kernels. # moe_data.cu is used by all CUTLASS MoE kernels.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu") set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}") CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@ -715,13 +675,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
endif() endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu") set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}") CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -1007,7 +963,6 @@ endif()
# For CUDA we also build and ship some external projects. # For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA") if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake) include(cmake/external_projects/flashmla.cmake)
include(cmake/external_projects/qutlass.cmake)
# vllm-flash-attn should be last as it overwrites some CMake functions # vllm-flash-attn should be last as it overwrites some CMake functions
include(cmake/external_projects/vllm_flash_attn.cmake) include(cmake/external_projects/vllm_flash_attn.cmake)

View File

@ -21,7 +21,6 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
*Latest News* 🔥 *Latest News* 🔥
- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA). - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing). - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH). - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
@ -149,7 +148,6 @@ Compute Resources:
- Trainy - Trainy
- UC Berkeley - UC Berkeley
- UC San Diego - UC San Diego
- Volcengine
Slack Sponsor: Anyscale Slack Sponsor: Anyscale

View File

@ -74,7 +74,7 @@ start_server() {
local vllm_log=$4 local vllm_log=$4
local profile_dir=$5 local profile_dir=$5
pkill -if "vllm serve" || true pkill -if vllm
# Define the common arguments as a bash array. # Define the common arguments as a bash array.
# Each argument and its value are separate elements. # Each argument and its value are separate elements.
@ -96,22 +96,17 @@ start_server() {
# This correctly passes each element as a separate argument. # This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled # Start server with profiling enabled
VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \ VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else else
# Start server without profiling # Start server without profiling
VLLM_SERVER_DEV_MODE=1 \ VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
fi fi
local server_pid=$!
# wait for 10 minutes... # wait for 10 minutes...
server_started=0 server_started=0
for i in {1..60}; do for i in {1..60}; do
# This line checks whether the server is still alive or not,
# since that we should always have permission to send signal to the server process.
kill -0 $server_pid 2> /dev/null || break
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout) RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then if [[ "$STATUS_CODE" -eq 200 ]]; then
@ -123,7 +118,7 @@ start_server() {
done done
if (( ! server_started )); then if (( ! server_started )); then
echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log". echo "server did not start within 10 minutes. Please check server log at $vllm_log".
return 1 return 1
else else
return 0 return 0
@ -139,7 +134,7 @@ run_benchmark() {
echo "vllm_log: $vllm_log" echo "vllm_log: $vllm_log"
echo echo
rm -f $vllm_log rm -f $vllm_log
pkill -if "vllm serve" || true pkill -if vllm
echo "starting server..." echo "starting server..."
# Call start_server without a profile_dir to avoid profiling overhead # Call start_server without a profile_dir to avoid profiling overhead
@ -232,7 +227,7 @@ run_benchmark() {
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
pkill -if "vllm serve" || true pkill -if vllm
sleep 10 sleep 10
echo "====================" echo "===================="
return 0 return 0
@ -308,6 +303,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
else else
echo "No configuration met the latency requirements. Skipping final profiling run." echo "No configuration met the latency requirements. Skipping final profiling run."
fi fi
pkill -if "vllm serve" || true pkill -if vllm
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"

View File

@ -2,9 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc import gc
from benchmark_utils import TimeCollector
from tabulate import tabulate from tabulate import tabulate
from benchmark_utils import TimeCollector
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.block_pool import BlockPool

View File

@ -1,31 +1,17 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc import gc
import time
from unittest import mock
import numpy as np import numpy as np
from benchmark_utils import TimeCollector
from tabulate import tabulate from tabulate import tabulate
from vllm.config import ( from benchmark_utils import TimeCollector
CacheConfig, from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
DeviceConfig,
LoadConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
SpeculativeConfig,
VllmConfig,
)
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
from vllm.v1.spec_decode.ngram_proposer import NgramProposer from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.worker.gpu_input_batch import InputBatch
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
def benchmark_propose(args): def main(args):
rows = [] rows = []
for max_ngram in args.max_ngram: for max_ngram in args.max_ngram:
collector = TimeCollector(TimeCollector.US) collector = TimeCollector(TimeCollector.US)
@ -83,88 +69,10 @@ def benchmark_propose(args):
) )
def benchmark_batched_propose(args):
NUM_SPECULATIVE_TOKENS_NGRAM = 10
PROMPT_LOOKUP_MIN = 5
PROMPT_LOOKUP_MAX = 15
MAX_MODEL_LEN = int(1e7)
DEVICE = current_platform.device_type
model_config = ModelConfig(model="facebook/opt-125m", runner="generate")
speculative_config = SpeculativeConfig(
target_model_config=model_config,
target_parallel_config=ParallelConfig(),
method="ngram",
num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
prompt_lookup_max=PROMPT_LOOKUP_MAX,
prompt_lookup_min=PROMPT_LOOKUP_MIN,
)
vllm_config = VllmConfig(
model_config=model_config,
cache_config=CacheConfig(),
speculative_config=speculative_config,
device_config=DeviceConfig(device=current_platform.device_type),
parallel_config=ParallelConfig(),
load_config=LoadConfig(),
scheduler_config=SchedulerConfig(),
)
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
mock_pp_group = mock.MagicMock()
mock_pp_group.world_size = 1
with mock.patch(
"vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
):
runner = GPUModelRunner(vllm_config, DEVICE)
# hack max model len
runner.max_model_len = MAX_MODEL_LEN
runner.drafter.max_model_len = MAX_MODEL_LEN
dummy_input_batch = InputBatch(
max_num_reqs=args.num_req,
max_model_len=MAX_MODEL_LEN,
max_num_batched_tokens=args.num_req * args.num_token,
device=DEVICE,
pin_memory=False,
vocab_size=256000,
block_sizes=[16],
)
dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
dummy_input_batch.spec_decode_unsupported_reqs = ()
dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
dummy_input_batch.token_ids_cpu = np.random.randint(
0, 20, (args.num_req, args.num_token)
)
runner.input_batch = dummy_input_batch
sampled_token_ids = [[0]] * args.num_req
print("Starting benchmark")
# first run is warmup so ignore it
for _ in range(args.num_iteration):
start = time.time()
runner.drafter.propose(
sampled_token_ids,
dummy_input_batch.req_ids,
dummy_input_batch.num_tokens_no_spec,
dummy_input_batch.token_ids_cpu,
dummy_input_batch.spec_decode_unsupported_reqs,
)
end = time.time()
print(f"Iteration time (s): {end - start}")
def invoke_main() -> None: def invoke_main() -> None:
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the performance of N-gram speculative decode drafting" description="Benchmark the performance of N-gram speculative decode drafting"
) )
parser.add_argument(
"--batched", action="store_true", help="consider time to prepare batch"
)
parser.add_argument( parser.add_argument(
"--num-iteration", "--num-iteration",
type=int, type=int,
@ -197,17 +105,8 @@ def invoke_main() -> None:
help="Number of speculative tokens to generate", help="Number of speculative tokens to generate",
) )
args = parser.parse_args() args = parser.parse_args()
main(args)
if not args.batched:
benchmark_propose(args)
else:
benchmark_batched_propose(args)
"""
# Example command lines:
# time python3 benchmarks/benchmark_ngram_proposer.py
# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
""" # noqa: E501
if __name__ == "__main__": if __name__ == "__main__":
invoke_main() # pragma: no cover invoke_main() # pragma: no cover

View File

@ -37,13 +37,14 @@ from typing import Optional
import datasets import datasets
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
from backend_request_func import ( from backend_request_func import (
ASYNC_REQUEST_FUNCS, ASYNC_REQUEST_FUNCS,
RequestFuncInput, RequestFuncInput,
RequestFuncOutput, RequestFuncOutput,
) )
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
try: try:
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
@ -448,8 +449,7 @@ async def benchmark(
def prepare_extra_body(request) -> dict: def prepare_extra_body(request) -> dict:
extra_body = {} extra_body = {}
# Add the schema to the extra_body # Add the schema to the extra_body
extra_body["structured_outputs"] = {} extra_body[request.structure_type] = request.schema
extra_body["structured_outputs"][request.structure_type] = request.schema
return extra_body return extra_body
print("Starting initial single prompt test run...") print("Starting initial single prompt test run...")
@ -696,11 +696,11 @@ def evaluate(ret, args):
return re.match(args.regex, actual) is not None return re.match(args.regex, actual) is not None
def _eval_correctness(expected, actual): def _eval_correctness(expected, actual):
if args.structure_type == "json": if args.structure_type == "guided_json":
return _eval_correctness_json(expected, actual) return _eval_correctness_json(expected, actual)
elif args.structure_type == "regex": elif args.structure_type == "guided_regex":
return _eval_correctness_regex(expected, actual) return _eval_correctness_regex(expected, actual)
elif args.structure_type == "choice": elif args.structure_type == "guided_choice":
return _eval_correctness_choice(expected, actual) return _eval_correctness_choice(expected, actual)
else: else:
return None return None
@ -780,18 +780,18 @@ def main(args: argparse.Namespace):
) )
if args.dataset == "grammar": if args.dataset == "grammar":
args.structure_type = "grammar" args.structure_type = "guided_grammar"
elif args.dataset == "regex": elif args.dataset == "regex":
args.structure_type = "regex" args.structure_type = "guided_regex"
elif args.dataset == "choice": elif args.dataset == "choice":
args.structure_type = "choice" args.structure_type = "guided_choice"
else: else:
args.structure_type = "json" args.structure_type = "guided_json"
if args.no_structured_output: if args.no_structured_output:
args.structured_output_ratio = 0 args.structured_output_ratio = 0
if args.save_results: if args.save_results:
result_file_name = f"{args.structured_output_ratio}so" result_file_name = f"{args.structured_output_ratio}guided"
result_file_name += f"_{backend}" result_file_name += f"_{backend}"
result_file_name += f"_{args.request_rate}qps" result_file_name += f"_{args.request_rate}qps"
result_file_name += f"_{args.model.split('/')[-1]}" result_file_name += f"_{args.model.split('/')[-1]}"
@ -909,13 +909,13 @@ def create_argument_parser():
parser.add_argument( parser.add_argument(
"--tokenizer", "--tokenizer",
type=str, type=str,
help="Name or path of the tokenizer, if not using the default tokenizer.", help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
) )
parser.add_argument( parser.add_argument(
"--tokenizer-mode", "--tokenizer-mode",
type=str, type=str,
default="auto", default="auto",
help="Name or path of the tokenizer, if not using the default tokenizer.", help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
) )
parser.add_argument( parser.add_argument(
"--num-prompts", "--num-prompts",

View File

@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.model_executor.layers.quantization.utils.fp8_utils import (
w8a8_triton_block_scaled_mm, w8a8_block_fp8_matmul,
) )
from vllm.utils import FlexibleArgumentParser, cdiv from vllm.utils import FlexibleArgumentParser, cdiv
@ -158,7 +158,7 @@ def bench_fp8(
"cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16) a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
), ),
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm( "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128) a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
), ),
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm( "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(

View File

@ -55,7 +55,9 @@ benchmark() {
output_len=$2 output_len=$2
CUDA_VISIBLE_DEVICES=0 vllm serve $model \ CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \ --port 8100 \
--max-model-len 10000 \ --max-model-len 10000 \
--gpu-memory-utilization 0.6 \ --gpu-memory-utilization 0.6 \
@ -63,7 +65,9 @@ benchmark() {
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 vllm serve $model \ CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \ --port 8200 \
--max-model-len 10000 \ --max-model-len 10000 \
--gpu-memory-utilization 0.6 \ --gpu-memory-utilization 0.6 \

View File

@ -38,12 +38,16 @@ wait_for_server() {
launch_chunked_prefill() { launch_chunked_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct" model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill # disagg prefill
CUDA_VISIBLE_DEVICES=0 vllm serve $model \ CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \ --port 8100 \
--max-model-len 10000 \ --max-model-len 10000 \
--enable-chunked-prefill \ --enable-chunked-prefill \
--gpu-memory-utilization 0.6 & --gpu-memory-utilization 0.6 &
CUDA_VISIBLE_DEVICES=1 vllm serve $model \ CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \ --port 8200 \
--max-model-len 10000 \ --max-model-len 10000 \
--enable-chunked-prefill \ --enable-chunked-prefill \
@ -58,14 +62,18 @@ launch_chunked_prefill() {
launch_disagg_prefill() { launch_disagg_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct" model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill # disagg prefill
CUDA_VISIBLE_DEVICES=0 vllm serve $model \ CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \ --port 8100 \
--max-model-len 10000 \ --max-model-len 10000 \
--gpu-memory-utilization 0.6 \ --gpu-memory-utilization 0.6 \
--kv-transfer-config \ --kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 vllm serve $model \ CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \ --port 8200 \
--max-model-len 10000 \ --max-model-len 10000 \
--gpu-memory-utilization 0.6 \ --gpu-memory-utilization 0.6 \

View File

@ -1,191 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import copy
import itertools
import torch
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
from weight_shapes import WEIGHT_SHAPES
from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"mxfp4": dict(no_a_quant=False, enabled=True),
"mxfp4-noquant": dict(no_a_quant=True, enabled=True),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
return (
deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
* group_size**-0.5
)
def _quant_weight_mxfp4(
b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str
):
weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx(
b, forward_hadamard_matrix, method="abs_max"
)
weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton")
return weight_hf_e2m1, weight_hf_scale_block
def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device):
weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4(
b, forward_hadamard_matrix, device
)
alpha = torch.tensor([1.0], device="cuda")
if cfg["no_a_quant"]:
# Pre-quantize activation
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
a, forward_hadamard_matrix, method="abs_max"
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
def run():
return matmul_mxf4_bf16_tn(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
)
return run
# Quantize activation on-the-fly
def run():
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
a, forward_hadamard_matrix, method="abs_max"
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
return matmul_mxf4_bf16_tn(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
)
return run
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[
1,
4,
8,
16,
32,
64,
128,
256,
512,
1024,
2048,
4096,
8192,
16384,
24576,
32768,
],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=_enabled,
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs MXFP4 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K, had_size):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
)
else:
cfg = PROVIDER_CFGS[provider]
run_quant = build_mxfp4_runner(
cfg, a, b, forward_hadamard_matrix, dtype, device
)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), rep=200, quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
def prepare_shapes(args):
out = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
out.append(KN)
return out
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.3-70B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
args = parser.parse_args()
for K, N, model in prepare_shapes(args):
for had_size in [32, 64, 128]:
print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_mxfp4_res_n{N}_k{K}",
N=N,
K=K,
had_size=had_size,
)
print("Benchmark finished!")

View File

@ -3,7 +3,6 @@
import argparse import argparse
import copy import copy
import itertools import itertools
import os
import torch import torch
from weight_shapes import WEIGHT_SHAPES from weight_shapes import WEIGHT_SHAPES
@ -24,45 +23,21 @@ PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True), "torch-bf16": dict(enabled=True),
"nvfp4": dict(no_a_quant=False, enabled=True), "nvfp4": dict(no_a_quant=False, enabled=True),
"nvfp4-noquant": dict(no_a_quant=True, enabled=True), "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
"fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
"fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
} }
_needs_fbgemm = any(
v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
)
if _needs_fbgemm:
try:
from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
triton_scale_nvfp4_quant,
)
except ImportError:
print(
"WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
"These providers will be skipped. Please install fbgemm_gpu with: "
"'pip install fbgemm-gpu-genai' to run them."
)
# Disable FBGEMM providers so the benchmark can run.
for cfg in PROVIDER_CFGS.values():
if cfg.get("fbgemm"):
cfg["enabled"] = False
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] _enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg): def _quant_weight_nvfp4(b: torch.Tensor, device: str):
# Compute global scale for weight # Compute global scale for weight
b_amax = torch.abs(b).max().to(torch.float32) b_amax = torch.abs(b).max().to(torch.float32)
b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
if "fbgemm" in cfg and cfg["fbgemm"]: b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
else:
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
return b_fp4, scale_b_fp4, b_global_scale return b_fp4, scale_b_fp4, b_global_scale
def build_nvfp4_runner(cfg, a, b, dtype, device): def build_nvfp4_runner(cfg, a, b, dtype, device):
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg) b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)
# Compute global scale for activation # Compute global scale for activation
# NOTE: This is generally provided ahead-of-time by the model checkpoint. # NOTE: This is generally provided ahead-of-time by the model checkpoint.
@ -71,35 +46,6 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):
# Alpha for the GEMM operation # Alpha for the GEMM operation
alpha = 1.0 / (a_global_scale * b_global_scale) alpha = 1.0 / (a_global_scale * b_global_scale)
if "fbgemm" in cfg and cfg["fbgemm"]:
if cfg["no_a_quant"]:
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
def run():
return torch.ops.fbgemm.f4f4bf16(
a_fp4,
b_fp4,
scale_a_fp4,
scale_b_fp4,
global_scale=alpha,
use_mx=False,
)
return run
else:
def run():
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
return torch.ops.fbgemm.f4f4bf16(
a_fp4,
b_fp4,
scale_a_fp4,
scale_b_fp4,
global_scale=alpha,
use_mx=False,
)
return run
if cfg["no_a_quant"]: if cfg["no_a_quant"]:
# Pre-quantize activation # Pre-quantize activation
@ -184,13 +130,10 @@ if __name__ == "__main__":
for K, N, model in prepare_shapes(args): for K, N, model in prepare_shapes(args):
print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:") print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
save_dir = f"bench_nvfp4_res_n{N}_k{K}"
os.makedirs(save_dir, exist_ok=True)
benchmark.run( benchmark.run(
print_data=True, print_data=True,
show_plots=True, show_plots=True,
save_path=save_dir, save_path=f"bench_nvfp4_res_n{N}_k{K}",
N=N, N=N,
K=K, K=K,
) )

View File

@ -1,207 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import copy
import itertools
import torch
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
from vllm._custom_ops import fusedQuantizeNv
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"nvfp4": dict(no_a_quant=False, enabled=True),
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
return (
deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
* group_size**-0.5
)
def _quant_weight_nvfp4(
b: torch.Tensor,
forward_hadamard_matrix: torch.Tensor,
global_scale: torch.Tensor,
device: str,
M: int,
N: int,
K: int,
):
weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv(
b, forward_hadamard_matrix, global_scale
)
weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view(
-1, K // 16
)
return weight_hf_e2m1, weight_hf_scale_block
def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K):
    """Build the zero-argument callable that is timed for an NVFP4 provider.

    The weight ``b`` is always quantized once, up front. When
    ``cfg["no_a_quant"]`` is true the activation ``a`` is also quantized up
    front, so the returned callable runs only the FP4 GEMM; otherwise the
    callable quantizes ``a`` on every invocation before running the GEMM.

    Returns:
        A callable taking no arguments and returning the GEMM output
        (requested as ``torch.bfloat16``).
    """
    alpha = torch.tensor([1.0], device="cuda")
    global_scale = torch.tensor([1.0], device="cuda")
    weight_q, weight_scales = _quant_weight_nvfp4(
        b, forward_hadamard_matrix, global_scale, device, M, N, K
    )

    def quantize_activation():
        # Quantize ``a`` and reshape its blocked scales to K // 16 columns.
        act_q, act_raw_scales = fusedQuantizeNv(
            a, forward_hadamard_matrix, global_scale
        )
        act_scales = to_blocked(act_raw_scales, backend="triton").view(-1, K // 16)
        return act_q, act_scales

    def gemm(act_q, act_scales):
        return ops.cutlass_scaled_fp4_mm(
            act_q,
            weight_q,
            act_scales,
            weight_scales,
            alpha,
            torch.bfloat16,
        )

    if cfg["no_a_quant"]:
        # Pre-quantize the activation so only the GEMM is measured.
        prequantized = quantize_activation()

        def run():
            return gemm(*prequantized)

        return run

    # Quantize the activation on the fly, inside the measured callable.
    def run():
        return gemm(*quantize_activation())

    return run
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[
            1, 4, 8, 16, 32, 64, 128, 256,
            512, 1024, 2048, 4096, 8192, 16384, 24576, 32768,
        ],
        x_log=False,
        line_arg="provider",
        line_vals=_enabled,
        line_names=_enabled,
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs NVFP4 GEMMs",
        args={},
    )
)
def benchmark(batch_size, provider, N, K, had_size):
    """Measure one (batch_size, provider) point of the GEMM sweep.

    Random bf16 operands of shape (batch_size, K) and (N, K) are created on
    CUDA and the selected provider is timed with
    ``triton.testing.do_bench_cudagraph``.

    Returns:
        Three TFLOP/s figures derived from the 0.5, 0.8 and 0.2 timing
        quantiles, in that order.
    """
    M = batch_size
    device = "cuda"
    dtype = torch.bfloat16

    a = torch.randn((M, K), device=device, dtype=dtype)
    b = torch.randn((N, K), device=device, dtype=dtype)
    forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)

    if provider == "torch-bf16":

        def timed_fn():
            return torch.nn.functional.linear(a, b)

    else:
        timed_fn = build_nvfp4_runner(
            PROVIDER_CFGS[provider],
            a,
            b,
            forward_hadamard_matrix,
            dtype,
            device,
            M,
            N,
            K,
        )

    # Quantile order is (median, 20th percentile, 80th percentile) of the time.
    median_ms, fast_ms, slow_ms = triton.testing.do_bench_cudagraph(
        timed_fn, rep=200, quantiles=[0.5, 0.2, 0.8]
    )

    def to_tflops(t_ms):
        # 2 * M * N * K floating point operations per GEMM.
        return (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)

    return to_tflops(median_ms), to_tflops(slow_ms), to_tflops(fast_ms)
def prepare_shapes(args):
    """Expand the requested models and TP sizes into benchmark shapes.

    For every (model, tp_size) pair, each ``WEIGHT_SHAPES[model]`` entry is
    deep-copied, the dimension selected by its ``tp_dim`` is floor-divided by
    ``tp_size``, and the model name is appended to the shape list.

    Returns:
        A list of shape lists, each ending with the model name.
    """
    shapes = []
    for model in args.models:
        for tp_size in args.tp_sizes:
            # Work on a copy so the shared WEIGHT_SHAPES table is not mutated.
            for dims, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
                dims[tp_dim] //= tp_size
                dims.append(model)
                shapes.append(dims)
    return shapes
if __name__ == "__main__":
    # CLI: choose which models' weight shapes to sweep and how they are
    # sharded across tensor-parallel ranks.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=["meta-llama/Llama-3.3-70B-Instruct"],
        choices=list(WEIGHT_SHAPES.keys()),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
    args = parser.parse_args()

    for K, N, model in prepare_shapes(args):
        for had_size in [16, 32, 64, 128]:
            print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:")
            benchmark.run(
                print_data=True,
                show_plots=True,
                # The output directory includes had_size; otherwise the four
                # Hadamard-size runs for one (N, K) would write the same
                # file names into the same directory and overwrite each other.
                save_path=f"bench_nvfp4_res_n{N}_k{K}_had{had_size}",
                N=N,
                K=K,
                had_size=had_size,
            )

    print("Benchmark finished!")

View File

@ -51,7 +51,7 @@ def calculate_diff(
): ):
"""Calculate the difference between Inductor and CUDA implementations.""" """Calculate the difference between Inductor and CUDA implementations."""
device = torch.device("cuda") device = torch.device("cuda")
x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device) x = torch.rand((batch_size * hidden_size, 4096), dtype=dtype, device=device)
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False) quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)
@ -59,25 +59,23 @@ def calculate_diff(
torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x) torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
cuda_out, cuda_scale = quant_fp8.forward_cuda(x) cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
try: out_allclose = lambda o1, o2: torch.allclose(
torch.testing.assert_close( o1.to(torch.float32),
cuda_out.to(torch.float32), o2.to(torch.float32),
torch_out.to(torch.float32), rtol=1e-3,
rtol=1e-3, atol=1e-5,
atol=1e-5, )
) scale_allclose = lambda s1, s2: torch.allclose(s1, s2, rtol=1e-3, atol=1e-5)
torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
torch.testing.assert_close( if (
cuda_out.to(torch.float32), out_allclose(cuda_out, torch_out)
torch_eager_out.to(torch.float32), and scale_allclose(cuda_scale, torch_scale)
rtol=1e-3, and out_allclose(cuda_out, torch_eager_out)
atol=1e-5, and scale_allclose(cuda_scale, torch_eager_scale)
) ):
torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
print("✅ All implementations match") print("✅ All implementations match")
except AssertionError as e: else:
print("❌ Implementations differ") print("❌ Implementations differ")
print(e)
configs = [] configs = []
@ -93,7 +91,7 @@ def benchmark_quantization(
): ):
device = torch.device("cuda") device = torch.device("cuda")
x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype) x = torch.randn(batch_size * hidden_size, 4096, device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8] quantiles = [0.5, 0.2, 0.8]
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major) quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
@ -159,21 +157,21 @@ if __name__ == "__main__":
) )
parser.add_argument("-c", "--check", action="store_true") parser.add_argument("-c", "--check", action="store_true")
parser.add_argument( parser.add_argument(
"--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16" "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
) )
parser.add_argument( parser.add_argument(
"--hidden-sizes", "--hidden-sizes",
type=int, type=int,
nargs="+", nargs="+",
default=[896, 1024, 2048, 4096, 7168], default=None,
help="Hidden sizes to benchmark", help="Hidden sizes to benchmark (default: 1,16,64,128,256,512,1024,2048,4096)",
) )
parser.add_argument( parser.add_argument(
"--batch-sizes", "--batch-sizes",
type=int, type=int,
nargs="+", nargs="+",
default=[1, 16, 128, 512, 1024], default=None,
help="Batch sizes to benchmark", help="Batch sizes to benchmark (default: 1,16,32,64,128)",
) )
parser.add_argument( parser.add_argument(
"--group-sizes", "--group-sizes",
@ -194,8 +192,8 @@ if __name__ == "__main__":
dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype] dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
hidden_sizes = args.hidden_sizes hidden_sizes = args.hidden_sizes or [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
batch_sizes = args.batch_sizes batch_sizes = args.batch_sizes or [1, 16, 32, 64, 128]
if args.group_sizes is not None: if args.group_sizes is not None:
group_shapes = [] group_shapes = []

View File

@ -1,406 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe
kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
but use different quantization strategies and backends.
"""
import nvtx
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
# Weight shapes for different models: [num_experts, topk, hidden_size,
# intermediate_size]
# main() reads the four entries positionally and divides the last one
# (intermediate_size) by the tensor-parallel size.
WEIGHT_SHAPES_MOE = {
    "mixtral-8x7b": [
        [8, 2, 4096, 14336],
    ],
    "deepseek-v2": [
        [160, 6, 5120, 12288],
    ],
    "custom-small": [
        [8, 2, 2048, 7168],
    ],
    "glm45-fp8": [
        [128, 8, 4096, 1408],
    ],
    "Llama-4-Maverick-17B-128E-Instruct-FP8": [
        [128, 1, 5120, 8192],
    ],
}

# Default for --models.
DEFAULT_MODELS = [
    "mixtral-8x7b",
]

# Defaults for --batch-sizes and --tp-sizes.
DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
DEFAULT_TP_SIZES = [1]

# Quantization-mode sweeps. The CLI defaults below spell the same values
# inline rather than referencing these two names.
PER_ACT_TOKEN_OPTS = [False, True]
PER_OUT_CH_OPTS = [False, True]

# FP8 dtype reported by the current platform; used for the quantized weight
# buffers in bench_run().
FP8_DTYPE = current_platform.fp8_dtype()
def bench_run(
    results: list,
    model: str,
    num_experts: int,
    topk: int,
    per_act_token: bool,
    per_out_ch: bool,
    mkn: tuple[int, int, int],
):
    """Time the Triton and CUTLASS FP8 MoE kernels for one problem size.

    Random fp16 activations and expert weights are created, the weights are
    quantized to FP8 expert by expert, and each kernel is captured into its
    own CUDA graph that is then replayed and timed with CUDA events.

    Args:
        results: Unused; kept so existing call sites keep working.
        model: Unused; kept so existing call sites keep working.
        num_experts: Number of experts.
        topk: Number of experts selected per token.
        per_act_token: Ignored. It is overridden to False below because of
            the per-token quantization workaround.
        per_out_ch: When True the weight scales are stored with a
            per-output-channel shape. The values are still the per-tensor
            scale of each expert, broadcast across channels.
        mkn: ``(m, k, n)`` problem size; ``m`` is the batch size reported in
            the result.

    Returns:
        A dict with ``batch_size``, ``triton_time_us`` and
        ``cutlass_time_us``; the times are the average per kernel call.
    """
    (m, k, n) = mkn

    dtype = torch.half
    device = "cuda"

    # Number of kernel invocations captured into each CUDA graph
    # (mirrors benchmark_moe.py). Used again to normalise the timings.
    calls_per_graph = 10

    # Create input activations
    a = torch.randn((m, k), device=device, dtype=dtype) / 10

    # Create weights
    w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
    w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10

    # FP8 weight buffers shared by both kernels: w1 is [E, 2N, K] and
    # w2 is [E, K, N]; CUTLASS consumes them without a transpose.
    w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE)
    w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE)

    # Scale buffers shaped according to the quantization strategy
    if per_out_ch:
        # Per-channel layout
        w1_scale = torch.empty(
            (num_experts, 2 * n, 1), device=device, dtype=torch.float32
        )
        w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32)
    else:
        # Per-tensor layout
        w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
        w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)

    # Quantize weights. True per-channel quantization is not implemented, so
    # both modes quantize per tensor and differ only in how the resulting
    # scale is stored.
    for expert in range(num_experts):
        w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
        w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
        if per_out_ch:
            # Expand the per-tensor scale to the expected per-channel shape
            w1_scale[expert] = w1_scale_temp.expand(2 * n, 1)
            w2_scale[expert] = w2_scale_temp.expand(k, 1)
        else:
            # Store the per-tensor scale in the [1, 1] slot
            w1_scale[expert, 0, 0] = w1_scale_temp
            w2_scale[expert, 0, 0] = w2_scale_temp

    # Create router scores and get topk
    score = torch.randn((m, num_experts), device=device, dtype=dtype)
    topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)

    # WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization.
    # Force per-tensor activation quantization for all cases to match the
    # working e2e setup.
    a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
    a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
    per_act_token = False

    # Create stride tensors for CUTLASS
    ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
    ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
    c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
    c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)

    # Build the quantization config once, outside any CUDA graph capture
    quant_config = fp8_w8a8_moe_quant_config(
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        a1_scale=a1_scale,
        a2_scale=a2_scale,
        per_act_token_quant=per_act_token,
        per_out_ch_quant=per_out_ch,
    )

    # Capture the CUTLASS kernel (match benchmark_moe.py pattern exactly)
    cutlass_stream = torch.cuda.Stream()
    cutlass_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
        for _ in range(calls_per_graph):
            cutlass_moe_fp8(
                a=a,
                w1_q=w1_fp8q,
                w2_q=w2_fp8q,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
                ab_strides1=ab_strides1,
                ab_strides2=ab_strides2,
                c_strides1=c_strides1,
                c_strides2=c_strides2,
                quant_config=quant_config,
                activation="silu",
                global_num_experts=num_experts,
            )
    torch.cuda.synchronize()

    # Capture the Triton kernel (match benchmark_moe.py pattern exactly)
    triton_stream = torch.cuda.Stream()
    triton_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(triton_graph, stream=triton_stream):
        for _ in range(calls_per_graph):
            fused_experts(
                a,
                w1_fp8q,
                w2_fp8q,
                topk_weights,
                topk_ids,
                quant_config=quant_config,
            )
    torch.cuda.synchronize()

    def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
        """Benchmark CUDA graph using events like benchmark_moe.py"""
        # Warmup
        for _ in range(num_warmup):
            graph.replay()
        torch.cuda.synchronize()

        # Timing
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        latencies = []
        for _ in range(num_iters):
            torch.cuda.synchronize()
            start_event.record()
            graph.replay()
            end_event.record()
            end_event.synchronize()
            latencies.append(start_event.elapsed_time(end_event))

        # Each replay runs calls_per_graph kernel invocations
        return sum(latencies) / (num_iters * calls_per_graph)

    # Benchmark parameters
    num_warmup = 5
    num_iters = 100

    # Only the CUDA-graph path is benchmarked (more reliable and faster)
    triton_graph_time = bench_cuda_graph(
        triton_graph, num_warmup=num_warmup, num_iters=num_iters
    )
    cutlass_graph_time = bench_cuda_graph(
        cutlass_graph, num_warmup=num_warmup, num_iters=num_iters
    )

    # Convert ms to us and return results
    return {
        "batch_size": m,
        "triton_time_us": triton_graph_time * 1000,
        "cutlass_time_us": cutlass_graph_time * 1000,
    }
def main(args):
    """Run the benchmark sweep described by ``args`` and print result tables.

    Every combination of model, TP size, layer shape, per-act-token option
    and per-out-channel option gets one table with a row per batch size.
    Layers whose K or N is excluded by ``--limit-k`` / ``--limit-n`` are
    skipped.
    """
    print("Benchmarking models:")
    for idx, name in enumerate(args.models):
        print(f"[{idx}] {name}")

    all_results = []

    for model in args.models:
        for tp in args.tp_sizes:
            for layer in WEIGHT_SHAPES_MOE[model]:
                num_experts = layer[0]
                topk = layer[1]
                size_k = layer[2]
                size_n = layer[3] // tp

                # An empty limit list means "no restriction".
                if args.limit_k and size_k not in args.limit_k:
                    continue
                if args.limit_n and size_n not in args.limit_n:
                    continue

                for per_act_token in args.per_act_token_opts:
                    for per_out_ch in args.per_out_ch_opts:
                        print(
                            f"\n=== {model}, experts={num_experts}, topk={topk},"
                            f"per_act={per_act_token}, per_out_ch={per_out_ch} ==="
                        )

                        config_results = []
                        for size_m in args.batch_sizes:
                            outcome = bench_run(
                                [],  # Not used anymore
                                model,
                                num_experts,
                                topk,
                                per_act_token,
                                per_out_ch,
                                (size_m, size_k, size_n),
                            )
                            if outcome:
                                config_results.append(outcome)

                        # Print the results table for this configuration
                        if config_results:
                            header = (
                                f"\n{'Batch Size':<12}"
                                f"{'Triton (us)':<15}"
                                f"{'CUTLASS (us)':<15}"
                            )
                            print(header)
                            print("-" * 45)
                            for row in config_results:
                                line = (
                                    f"{row['batch_size']:<12}"
                                    f"{row['triton_time_us']:<15.2f}"
                                    f"{row['cutlass_time_us']:<15.2f}"
                                )
                                print(line)

                        all_results.extend(config_results)

    print(f"\nTotal benchmarks completed: {len(all_results)}")
if __name__ == "__main__":
    # The example in the description uses the flag names exactly as they are
    # declared below (--models, --batch-sizes).
    parser = FlexibleArgumentParser(
        description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE
        across specified models/shapes/batches
        Example usage:
        python benchmark_cutlass_moe_fp8.py \
            --models "Llama-4-Maverick-17B-128E-Instruct-FP8" \
            --tp-sizes 8 \
            --batch-sizes 2 4 8 \
            --per-act-token-opts false \
            --per-out-ch-opts false
        """
    )
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES_MOE.keys(),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
    parser.add_argument(
        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
    )
    # Empty lists mean "do not filter on K / N".
    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
    # Boolean lists: a value parses to True only when it equals "true"
    # (case-insensitive); any other string parses to False.
    parser.add_argument(
        "--per-act-token-opts",
        nargs="+",
        type=lambda x: x.lower() == "true",
        default=[False, True],
        help="Per-activation token quantization options (true/false)",
    )
    parser.add_argument(
        "--per-out-ch-opts",
        nargs="+",
        type=lambda x: x.lower() == "true",
        default=[False, True],
        help="Per-output channel quantization options (true/false)",
    )

    args = parser.parse_args()
    main(args)

View File

@ -7,10 +7,6 @@ Benchmark script for device communicators:
CustomAllreduce (oneshot, twoshot), PyNcclCommunicator, CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
and SymmMemCommunicator (multimem, two-shot). and SymmMemCommunicator (multimem, two-shot).
for NCCL symmetric memory you need to set the environment variables
NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1, otherwise NCCL does
not use fast NVLS implementation for all reduce.
Usage: Usage:
torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options] torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]
@ -30,13 +26,7 @@ import torch.distributed as dist
from torch.distributed import ProcessGroup from torch.distributed import ProcessGroup
from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
from vllm.distributed.device_communicators.pynccl import ( from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
PyNcclCommunicator,
register_nccl_symmetric_ops,
)
from vllm.distributed.device_communicators.pynccl_allocator import (
set_graph_pool_id,
)
from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
@ -108,7 +98,6 @@ class CommunicatorBenchmark:
) )
if not self.pynccl_comm.disabled: if not self.pynccl_comm.disabled:
logger.info("Rank %s: PyNcclCommunicator initialized", self.rank) logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
register_nccl_symmetric_ops(self.pynccl_comm)
else: else:
logger.info("Rank %s: PyNcclCommunicator disabled", self.rank) logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
self.pynccl_comm = None self.pynccl_comm = None
@ -205,15 +194,6 @@ class CommunicatorBenchmark:
None, # no env variable needed None, # no env variable needed
) )
) )
communicators.append(
(
"pynccl-symm",
lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
lambda t: True, # Always available if initialized
nullcontext(),
None, # no env variable needed
)
)
if self.symm_mem_comm_multimem is not None: if self.symm_mem_comm_multimem is not None:
comm = self.symm_mem_comm_multimem comm = self.symm_mem_comm_multimem
@ -291,9 +271,7 @@ class CommunicatorBenchmark:
# Capture the graph using context manager # Capture the graph using context manager
with context: with context:
graph = torch.cuda.CUDAGraph() graph = torch.cuda.CUDAGraph()
graph_pool = torch.cuda.graph_pool_handle() with torch.cuda.graph(graph):
set_graph_pool_id(graph_pool)
with torch.cuda.graph(graph, pool=graph_pool):
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES): for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
allreduce_fn(graph_input) allreduce_fn(graph_input)

View File

@ -79,9 +79,9 @@ def make_rand_lora_weight_tensor(
def make_rand_tensors( def make_rand_tensors(
a_shape: tuple[int, ...], a_shape: tuple[int],
b_shape: tuple[int, ...], b_shape: tuple[int],
c_shape: tuple[int, ...], c_shape: tuple[int],
a_dtype: torch.dtype, a_dtype: torch.dtype,
b_dtype: torch.dtype, b_dtype: torch.dtype,
c_dtype: torch.dtype, c_dtype: torch.dtype,
@ -243,7 +243,7 @@ class OpType(Enum):
lora_rank: int, lora_rank: int,
num_loras: int, num_loras: int,
num_slices: int, num_slices: int,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: ) -> tuple[tuple[int], tuple[int], tuple[int]]:
""" """
Given num_slices, return the shapes of the A, B, and C matrices Given num_slices, return the shapes of the A, B, and C matrices
in A x B = C, for the op_type in A x B = C, for the op_type

View File

@ -579,22 +579,18 @@ def main(args: argparse.Namespace):
E = config.ffn_config.moe_num_experts E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k topk = config.ffn_config.moe_top_k
intermediate_size = config.ffn_config.ffn_hidden_size intermediate_size = config.ffn_config.ffn_hidden_size
hidden_size = config.hidden_size
elif config.architectures[0] == "JambaForCausalLM": elif config.architectures[0] == "JambaForCausalLM":
E = config.num_experts E = config.num_experts
topk = config.num_experts_per_tok topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size intermediate_size = config.intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] in ( elif config.architectures[0] in (
"DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM", "DeepseekV3ForCausalLM",
"DeepseekV32ForCausalLM", "DeepseekV2ForCausalLM",
"Glm4MoeForCausalLM", "Glm4MoeForCausalLM",
): ):
E = config.n_routed_experts E = config.n_routed_experts
topk = config.num_experts_per_tok topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size intermediate_size = config.moe_intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] in ( elif config.architectures[0] in (
"Qwen2MoeForCausalLM", "Qwen2MoeForCausalLM",
"Qwen3MoeForCausalLM", "Qwen3MoeForCausalLM",
@ -603,18 +599,10 @@ def main(args: argparse.Namespace):
E = config.num_experts E = config.num_experts
topk = config.num_experts_per_tok topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size intermediate_size = config.moe_intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
text_config = config.get_text_config()
E = text_config.num_experts
topk = text_config.num_experts_per_tok
intermediate_size = text_config.moe_intermediate_size
hidden_size = text_config.hidden_size
elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"): elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
E = config.num_experts E = config.num_experts
topk = config.moe_topk[0] topk = config.moe_topk[0]
intermediate_size = config.moe_intermediate_size[0] intermediate_size = config.moe_intermediate_size[0]
hidden_size = config.hidden_size
else: else:
# Support for llama4 # Support for llama4
config = config.get_text_config() config = config.get_text_config()
@ -622,7 +610,6 @@ def main(args: argparse.Namespace):
E = config.num_local_experts E = config.num_local_experts
topk = config.num_experts_per_tok topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size intermediate_size = config.intermediate_size
hidden_size = config.hidden_size
enable_ep = bool(args.enable_expert_parallel) enable_ep = bool(args.enable_expert_parallel)
if enable_ep: if enable_ep:
ensure_divisibility(E, args.tp_size, "Number of experts") ensure_divisibility(E, args.tp_size, "Number of experts")
@ -631,6 +618,7 @@ def main(args: argparse.Namespace):
else: else:
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size") ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
shard_intermediate_size = 2 * intermediate_size // args.tp_size shard_intermediate_size = 2 * intermediate_size // args.tp_size
hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16" use_int8_w8a16 = args.dtype == "int8_w8a16"

View File

@ -1,174 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random
import time
import torch
from tabulate import tabulate
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random,
)
logger = init_logger(__name__)
@torch.inference_mode()
def run_benchmark(
    num_tokens: int,
    num_heads: int,
    head_size: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    kv_cache_dtype: str,
    num_iters: int,
    benchmark_mode: str,
    device: str = "cuda",
) -> float:
    """Return latency (seconds) for given num_tokens.

    Random key/value tensors and a random KV cache are created, every token
    is mapped to a distinct cache slot, and ``ops.reshape_and_cache`` is
    timed either directly or through a captured CUDA graph.

    Args:
        num_tokens: Number of tokens written into the cache per call.
        num_heads: Number of heads in the key/value tensors.
        head_size: Size of each head; must be a multiple of 16 when
            ``kv_cache_dtype`` is "fp8".
        block_size: Slots per cache block.
        num_blocks: Number of cache blocks.
        dtype: dtype of the key/value tensors.
        kv_cache_dtype: Cache dtype string passed through to the kernel.
        num_iters: Number of timed iterations.
        benchmark_mode: "cudagraph" captures one call into a CUDA graph and
            times replays; any other value times direct calls.
        device: Device the tensors are created on.

    Returns:
        Average wall-clock seconds per iteration.

    Raises:
        ValueError: If fp8 is requested with an incompatible ``head_size``,
            or ``num_tokens`` exceeds ``block_size * num_blocks``.
    """

    if kv_cache_dtype == "fp8" and head_size % 16:
        raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")

    # Fixed seed so every run builds the same inputs.
    current_platform.seed_everything(42)
    torch.set_default_device(device)

    # create random key / value tensors [T, H, D].
    key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
    value = torch.randn_like(key)

    # prepare the slot mapping.
    # each token is assigned a unique slot in the KV-cache.
    num_slots = block_size * num_blocks
    if num_tokens > num_slots:
        raise ValueError("num_tokens cannot exceed the total number of cache slots")
    # random.sample draws without replacement, so the slots are distinct.
    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)

    key_caches, value_caches = create_kv_caches_with_random(
        num_blocks,
        block_size,
        1,  # num_layers
        num_heads,
        head_size,
        kv_cache_dtype,
        dtype,
        device=device,
    )
    key_cache, value_cache = key_caches[0], value_caches[0]
    # to free unused memory
    del key_caches, value_caches

    # compute per-kernel scaling factors for fp8 conversion (if used).
    k_scale = (key.amax() / 64.0).to(torch.float32)
    v_scale = (value.amax() / 64.0).to(torch.float32)

    # The noqa markers silence F821 on names that are `del`-ed at the end of
    # this function but still referenced from inside the lambda.
    function_under_test = lambda: ops.reshape_and_cache(
        key,  # noqa: F821
        value,  # noqa: F821
        key_cache,  # noqa: F821
        value_cache,  # noqa: F821
        slot_mapping,  # noqa: F821
        kv_cache_dtype,
        k_scale,
        v_scale,
    )
    if benchmark_mode == "cudagraph":
        # Capture a single call, then time graph replays instead.
        g = torch.cuda.CUDAGraph()
        with torch.cuda.graph(g):
            function_under_test()
        torch.cuda.synchronize()
        function_under_test = lambda: g.replay()

    def run_cuda_benchmark(n_iters: int) -> float:
        nonlocal key, value, key_cache, value_cache, slot_mapping
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(n_iters):
            function_under_test()
            # Wait for the device after every call so the wall-clock interval
            # covers the completed work.
            torch.cuda.synchronize()
        end = time.perf_counter()
        return (end - start) / n_iters

    # warm-up
    run_cuda_benchmark(3)

    lat = run_cuda_benchmark(num_iters)

    # free tensors to mitigate OOM when sweeping
    del key, value, key_cache, value_cache, slot_mapping
    torch.cuda.empty_cache()

    return lat
def main(args):
    """Sweep num_tokens over 2**1 .. 2**16 and print a latency table.

    Each row holds the token count and the latency returned by
    ``run_benchmark`` converted to microseconds.
    """
    rows = []
    for n_tok in (2**exp for exp in range(1, 17)):
        latency_s = run_benchmark(
            num_tokens=n_tok,
            num_heads=args.num_heads,
            head_size=args.head_size,
            block_size=args.block_size,
            num_blocks=args.num_blocks,
            dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
            kv_cache_dtype=args.kv_cache_dtype,
            num_iters=args.iters,
            benchmark_mode=args.mode,
            device="cuda",
        )
        # seconds -> microseconds
        rows.append([n_tok, latency_s * 1e6])

    print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
    table = tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f")
    print(table)
if __name__ == "__main__":
    parser = FlexibleArgumentParser()

    # (flag, add_argument keyword arguments), registered in this order.
    cli_options = [
        ("--num-heads", dict(type=int, default=128)),
        (
            "--head-size",
            dict(
                type=int,
                choices=[64, 80, 96, 112, 120, 128, 192, 256],
                default=128,
            ),
        ),
        ("--block-size", dict(type=int, choices=[16, 32], default=16)),
        ("--num-blocks", dict(type=int, default=128 * 128)),
        (
            "--dtype",
            dict(
                type=str,
                choices=["half", "bfloat16", "float"],
                default="bfloat16",
            ),
        ),
        (
            "--kv-cache-dtype",
            dict(type=str, choices=["auto", "fp8"], default="auto"),
        ),
        ("--iters", dict(type=int, default=200)),
        (
            "--mode",
            dict(type=str, choices=["cudagraph", "no_graph"], default="cudagraph"),
        ),
    ]
    for flag, options in cli_options:
        parser.add_argument(flag, **options)

    args = parser.parse_args()
    main(args)

View File

@ -9,9 +9,6 @@ import torch
from tabulate import tabulate from tabulate import tabulate
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.attention.ops.triton_reshape_and_cache_flash import (
triton_reshape_and_cache_flash,
)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import ( from vllm.utils import (
@ -34,8 +31,6 @@ def run_benchmark(
kv_cache_dtype: str, kv_cache_dtype: str,
kv_cache_layout: str, kv_cache_layout: str,
num_iters: int, num_iters: int,
implementation: str,
benchmark_mode: str,
device: str = "cuda", device: str = "cuda",
) -> float: ) -> float:
"""Return latency (seconds) for given num_tokens.""" """Return latency (seconds) for given num_tokens."""
@ -43,14 +38,6 @@ def run_benchmark(
if kv_cache_dtype == "fp8" and head_size % 16: if kv_cache_dtype == "fp8" and head_size % 16:
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.") raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
if implementation not in ("cuda", "triton"):
raise ValueError(
f"Unsupported implementation: {implementation}. "
"Only 'cuda' and 'triton' are supported."
)
if implementation == "triton" and kv_cache_layout == "HND":
return float("nan") # Triton does not support HND layout yet.
current_platform.seed_everything(42) current_platform.seed_everything(42)
torch.set_default_device(device) torch.set_default_device(device)
@ -78,49 +65,27 @@ def run_benchmark(
cache_layout=kv_cache_layout, cache_layout=kv_cache_layout,
) )
key_cache, value_cache = key_caches[0], value_caches[0] key_cache, value_cache = key_caches[0], value_caches[0]
# to free unused memory
del key_caches, value_caches
# compute per-kernel scaling factors for fp8 conversion (if used). # compute per-kernel scaling factors for fp8 conversion (if used).
k_scale = (key.amax() / 64.0).to(torch.float32) k_scale = (key.amax() / 64.0).to(torch.float32)
v_scale = (value.amax() / 64.0).to(torch.float32) v_scale = (value.amax() / 64.0).to(torch.float32)
if implementation == "cuda":
function_under_test = lambda: ops.reshape_and_cache_flash(
key, # noqa: F821
value, # noqa: F821
key_cache, # noqa: F821
value_cache, # noqa: F821
slot_mapping, # noqa: F821
kv_cache_dtype,
k_scale,
v_scale,
)
else:
function_under_test = lambda: triton_reshape_and_cache_flash(
key, # noqa: F821
value, # noqa: F821
key_cache, # noqa: F821
value_cache, # noqa: F821
slot_mapping, # noqa: F821
kv_cache_dtype,
k_scale,
v_scale,
)
if benchmark_mode == "cudagraph":
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
function_under_test()
torch.cuda.synchronize()
function_under_test = lambda: g.replay()
def run_cuda_benchmark(n_iters: int) -> float: def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping nonlocal key, value, key_cache, value_cache, slot_mapping
torch.cuda.synchronize() torch.cuda.synchronize()
start = time.perf_counter() start = time.perf_counter()
for _ in range(n_iters): for _ in range(n_iters):
function_under_test() ops.reshape_and_cache_flash(
torch.cuda.synchronize() key,
value,
key_cache,
value_cache,
slot_mapping,
kv_cache_dtype,
k_scale,
v_scale,
)
torch.cuda.synchronize()
end = time.perf_counter() end = time.perf_counter()
return (end - start) / n_iters return (end - start) / n_iters
@ -151,16 +116,10 @@ def main(args):
kv_cache_dtype=args.kv_cache_dtype, kv_cache_dtype=args.kv_cache_dtype,
kv_cache_layout=layout, kv_cache_layout=layout,
num_iters=args.iters, num_iters=args.iters,
implementation=args.implementation,
benchmark_mode=args.mode,
device="cuda", device="cuda",
) )
rows.append([n_tok, layout, f"{lat * 1e6:.3f}"]) rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])
print(
f"Benchmark results for implementation {args.implementation}"
f" (measuring with {args.mode}):"
)
print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"])) print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))
@ -192,21 +151,6 @@ if __name__ == "__main__":
) )
parser.add_argument("--iters", type=int, default=100) parser.add_argument("--iters", type=int, default=100)
parser.add_argument(
"--implementation",
type=str,
choices=["cuda", "triton"],
default="cuda",
)
parser.add_argument(
"--mode",
type=str,
choices=["cudagraph", "no_graph"],
default="cudagraph",
)
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)

View File

@ -1,19 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Comprehensive 3-way SiLU Benchmark Suite
This benchmark compares three SiLU implementations:
1. SiLU V2 (CUDA) - Optimized CUDA kernel implementation
2. Triton Kernel - Triton-based implementation
The suite generates detailed performance comparisons including:
- Memory bandwidth utilization
- Speedup ratios (baseline vs optimized implementations)
- Performance across different expert configurations and token distributions
"""
from collections.abc import Callable from collections.abc import Callable
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
@ -21,7 +7,7 @@ import numpy as np
import torch import torch
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
persistent_masked_m_silu_mul_quant, silu_mul_fp8_quant_deep_gemm_cuda,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton from vllm.triton_utils import tl, triton
@ -108,7 +94,6 @@ def silu_mul_fp8_quant_deep_gemm_triton(
num_parallel_tokens, num_parallel_tokens,
group_size: int = 128, group_size: int = 128,
eps: float = 1e-10, eps: float = 1e-10,
expert_offsets: torch.Tensor = None,
) -> tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
"""Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
@ -189,7 +174,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(
# Parse generation strategies # Parse generation strategies
strategies = ["random_imbalanced", "uniform", "max_t"] strategies = ["uniform", "max_t", "first_t"]
def benchmark( def benchmark(
@ -210,27 +195,15 @@ def benchmark(
current_platform.seed_everything(42 + seed_offset) current_platform.seed_everything(42 + seed_offset)
y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous() y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
if gen_strategy == "random_imbalanced": if gen_strategy == "uniform":
r = torch.rand(size=(E,), device="cuda")
def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"):
mean = total_tokens // n_e
min_max = mean // ratio
e = torch.ones(size=(E,), dtype=torch.int64, device=device) * mean
e[0] = min_max
r = torch.rand(size=(E - 1,))
r /= r.sum()
r *= total_tokens - min_max
r = r.round().long()
e[1:] = r.to(device=device)
return e
tokens_per_expert = generate_expert_loads(E, total_tokens, 0.7, "cuda")
elif gen_strategy == "uniform":
r = torch.rand(size=(E,))
r /= r.sum() r /= r.sum()
r *= total_tokens r *= total_tokens
r = r.round().long() tokens_per_expert = r.int()
tokens_per_expert = r tokens_per_expert = torch.minimum(
tokens_per_expert,
torch.ones((E,), device=r.device, dtype=torch.int) * T,
)
elif gen_strategy == "max_t": elif gen_strategy == "max_t":
tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda") tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
tokens_per_expert.fill_(total_tokens / E) tokens_per_expert.fill_(total_tokens / E)
@ -308,34 +281,40 @@ def benchmark(
def create_comparison_plot( def create_comparison_plot(
ratios, silu_v2_times, triton_times, config_labels, strategy_name, id ratio, cuda_times, baseline_times, config_labels, strategy_name, id
): ):
fig, ax = plt.subplots(1, 1, figsize=(18, 6)) """Create a comparison plot for a specific generation strategy"""
fig, ax = plt.subplots(1, 1, figsize=(16, 6))
# Configure x-axis positions # Configure x-axis positions
x = np.arange(len(config_labels)) x = np.arange(len(config_labels))
width = 0.25 width = 0.35
# Execution Time plot (lower is better) # Execution Time plot (lower is better)
ax.bar(x, silu_v2_times, width, label="SiLU V2 (CUDA)", alpha=0.8, color="blue")
ax.bar( ax.bar(
x + width, triton_times, width, label="Triton Kernel", alpha=0.8, color="green" x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue"
)
ax.bar(
x + width / 2,
baseline_times,
width,
label="Baseline",
alpha=0.8,
color="orange",
) )
# Add speedup labels over each bar trio # Add speedup labels over each bar pair
for i in range(len(x)): for i in range(len(x)):
triton_v2_speedup = ratios[i][1] # triton/v2 speedup = ratio[i]
max_height = max(silu_v2_times[i], triton_times[i]) max_height = max(cuda_times[i], baseline_times[i])
# Triton/V2 speedup
ax.text( ax.text(
x[i] + width / 2, x[i],
max_height + max_height * 0.02, max_height + max_height * 0.02,
f"{triton_v2_speedup:.2f}x", f"{speedup:.2f}x",
ha="center", ha="center",
va="bottom", va="bottom",
fontweight="bold", fontweight="bold",
fontsize=8, fontsize=9,
) )
ax.set_xlabel("Configuration") ax.set_xlabel("Configuration")
@ -353,75 +332,56 @@ def create_comparison_plot(
def create_combined_plot(all_results): def create_combined_plot(all_results):
"""Create a combined plot with all strategies in one PNG"""
num_strategies = len(all_results) num_strategies = len(all_results)
fig, axes = plt.subplots(num_strategies, 1, figsize=(22, 7 * num_strategies)) fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies))
if num_strategies == 1: if num_strategies == 1:
axes = [axes] axes = [axes]
for idx, ( for idx, (
strategy_name, strategy_name,
all_ratios, ratio,
all_silu_v2_results, cuda_times,
all_triton_results, baseline_times,
config_labels, config_labels,
config_x_axis,
) in enumerate(all_results): ) in enumerate(all_results):
ax = axes[idx] ax = axes[idx]
# Flatten the nested results to get bandwidth percentages for plotting
silu_v2_bandwidths = []
triton_bandwidths = []
flat_ratios = []
for config_results in all_silu_v2_results:
for result in config_results:
silu_v2_bandwidths.append(result[3]) # bandwidth percentage
for config_results in all_triton_results:
for result in config_results:
triton_bandwidths.append(result[3]) # bandwidth percentage
for config_ratios in all_ratios:
for ratio in config_ratios:
flat_ratios.append(ratio)
# Configure x-axis positions # Configure x-axis positions
x = np.arange(len(config_labels)) x = np.arange(len(config_labels))
width = 0.25 width = 0.35
# Bandwidth utilization plot (higher is better) # Execution Time plot (lower is better)
ax.bar( ax.bar(
x, x - width / 2,
silu_v2_bandwidths, cuda_times,
width, width,
label="SiLU V2 (CUDA)", label="CUDA Kernel",
alpha=0.8, alpha=0.8,
color="blue", color="blue",
) )
ax.bar( ax.bar(
x + width, x + width / 2,
triton_bandwidths, baseline_times,
width, width,
label="Triton Kernel", label="Baseline",
alpha=0.8, alpha=0.8,
color="green", color="orange",
) )
# Add speedup labels over each bar trio # Add speedup labels over each bar pair
for i in range(len(x)): for i in range(len(x)):
triton_v2_speedup = flat_ratios[i] # triton/v2 speedup = ratio[i]
max_height = max(silu_v2_bandwidths[i], triton_bandwidths[i]) max_height = max(cuda_times[i], baseline_times[i])
# Triton/V2 speedup
ax.text( ax.text(
x[i] + width / 2, x[i],
max_height + max_height * 0.02, max_height + max_height * 0.02,
f"{triton_v2_speedup:.2f}x", f"{speedup:.2f}x",
ha="center", ha="center",
va="bottom", va="bottom",
fontweight="bold", fontweight="bold",
fontsize=8, fontsize=9,
) )
ax.set_xlabel("Configuration") ax.set_xlabel("Configuration")
@ -435,7 +395,7 @@ def create_combined_plot(all_results):
ax.grid(True, alpha=0.3) ax.grid(True, alpha=0.3)
plt.tight_layout() plt.tight_layout()
filename = "silu_benchmark_combined_3way.png" filename = "../../silu_bench/silu_benchmark_combined.png"
plt.savefig(filename, dpi=300, bbox_inches="tight") plt.savefig(filename, dpi=300, bbox_inches="tight")
plt.show() plt.show()
@ -445,9 +405,7 @@ def create_combined_plot(all_results):
outer_dim = 7168 outer_dim = 7168
configs = [ configs = [
# DeepSeekV3 Configs # DeepSeekV3 Configs
# (1, 56, 7168),
(8, 1024, 7168), (8, 1024, 7168),
# (32, 56, 7168),
# DeepSeekV3 Configs # DeepSeekV3 Configs
(32, 1024, 7168), (32, 1024, 7168),
# DeepSeekV3 Configs # DeepSeekV3 Configs
@ -459,7 +417,6 @@ num_warmups = 20
strategy_descriptions = { strategy_descriptions = {
"uniform": "Uniform Random", "uniform": "Uniform Random",
"random_imbalanced": "Imbalanced Random",
"max_t": "Even Assignment", "max_t": "Even Assignment",
"first_t": "experts[0] = T, experts[1:] = 0", "first_t": "experts[0] = T, experts[1:] = 0",
} }
@ -476,31 +433,28 @@ for id, strategy in enumerate(strategies):
print(f"Testing strategy: {strategy_descriptions[strategy]}") print(f"Testing strategy: {strategy_descriptions[strategy]}")
print(f"{'=' * 60}") print(f"{'=' * 60}")
# Collect benchmark data for all three algorithms # Collect benchmark data for both algorithms
config_labels = [] config_labels = []
config_x_axis = [] config_x_axis = []
all_silu_v2_results = [] all_cuda_results = []
all_triton_results = [] all_baseline_results = []
all_ratios = [] all_ratios = []
for E, T, H in configs: for E, T, H in configs:
total_tokens_config = [] total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E]
for i in [8, 16, 32, 64, 128, 256, 512]:
if i <= T:
total_tokens_config.append(i * E)
config_x_axis.append(total_tokens_config) config_x_axis.append(total_tokens_config)
silu_v2_results = [] cuda_results = []
triton_results = [] baseline_results = []
ratios = [] ratios = []
for total_tokens in total_tokens_config: for total_tokens in total_tokens_config:
config_label = f"E={E},T={T},H={H},TT={total_tokens}" config_label = f"E={E},T={T},H={H},TT={total_tokens}"
config_labels.append(config_label) config_labels.append(config_label)
# SiLU V2 (CUDA kernel) results # CUDA kernel results
time_ms_silu_v2, gflops, gbps, perc = benchmark( time_ms_cuda, gflops, gbps, perc = benchmark(
persistent_masked_m_silu_mul_quant, silu_mul_fp8_quant_deep_gemm_cuda,
E, E,
T, T,
H, H,
@ -509,9 +463,9 @@ for id, strategy in enumerate(strategies):
num_warmups=num_warmups, num_warmups=num_warmups,
gen_strategy=strategy, gen_strategy=strategy,
) )
silu_v2_results.append((time_ms_silu_v2, gflops, gbps, perc)) cuda_results.append((time_ms_cuda, gflops, gbps, perc))
# Triton kernel results # Baseline results
time_ms_triton, gflops, gbps, perc = benchmark( time_ms_triton, gflops, gbps, perc = benchmark(
silu_mul_fp8_quant_deep_gemm_triton, silu_mul_fp8_quant_deep_gemm_triton,
E, E,
@ -522,20 +476,12 @@ for id, strategy in enumerate(strategies):
num_warmups=num_warmups, num_warmups=num_warmups,
gen_strategy=strategy, gen_strategy=strategy,
) )
triton_results.append((time_ms_triton, gflops, gbps, perc)) baseline_results.append((time_ms_triton, gflops, gbps, perc))
ratios.append(time_ms_triton / time_ms_cuda)
# Calculate speedup ratios (triton baseline / implementation) print(f"Completed: {config_label}")
triton_v2_ratio = time_ms_triton / time_ms_silu_v2 all_cuda_results.append(cuda_results)
ratios.append(triton_v2_ratio) all_baseline_results.append(baseline_results)
print(
f"Completed: {config_label}:"
f" V2: {time_ms_silu_v2:.3f}ms,"
f" Triton: {time_ms_triton:.3f}ms"
)
all_silu_v2_results.append(silu_v2_results)
all_triton_results.append(triton_results)
all_ratios.append(ratios) all_ratios.append(ratios)
# Store results for combined plotting # Store results for combined plotting
@ -543,8 +489,8 @@ for id, strategy in enumerate(strategies):
( (
strategy_descriptions[strategy], strategy_descriptions[strategy],
all_ratios, all_ratios,
all_silu_v2_results, all_cuda_results,
all_triton_results, all_baseline_results,
config_labels, config_labels,
config_x_axis, config_x_axis,
) )
@ -552,18 +498,15 @@ for id, strategy in enumerate(strategies):
# Print summary table for this strategy # Print summary table for this strategy
print(f"\nSummary Table - {strategy_descriptions[strategy]}:") print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
print(f" {'V2 Time(ms)':<12} {'Triton Time(ms)':<14} {'Triton/V2':<10}") print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}")
print("-" * 90) print("-" * 60)
for i, (E, T, H) in enumerate(configs): for i, (E, T, H) in enumerate(configs):
# Get the first result for each config (simplifying for summary) speedup = baseline_results[i][0] / cuda_results[i][0]
v2_time = silu_v2_results[i][0]
triton_time = triton_results[i][0]
triton_v2_speedup = triton_time / v2_time
config_label = f"E={E:3d},T={T:4d},H={H:4d}" config_label = f"E={E:3d},T={T:4d},H={H:4d}"
print( print(
f"{config_label:<20} {v2_time:8.5f} {triton_time:10.5f} " f"{config_label:<20} {cuda_results[i][0]:8.5f} "
f"{triton_v2_speedup:8.2f}x" f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x"
) )
@ -571,14 +514,15 @@ def create_total_tokens_plot(all_results):
num_strategies = len(all_results) num_strategies = len(all_results)
num_configs = len(configs) num_configs = len(configs)
# Create side-by-side subplots: 2 columns for speedup and bandwidth percentage
fig, axs = plt.subplots( fig, axs = plt.subplots(
num_strategies, num_configs * 2, figsize=(32, 8 * num_strategies) num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies)
) )
# Add main title to the entire figure # Add main title to the entire figure
fig.suptitle( fig.suptitle(
"Performance Analysis: Speedup vs Bandwidth Utilization (SiLU V2, and Triton)", "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)",
fontsize=18, fontsize=16,
fontweight="bold", fontweight="bold",
y=0.98, y=0.98,
) )
@ -595,8 +539,8 @@ def create_total_tokens_plot(all_results):
( (
strategy_name, strategy_name,
all_ratios, all_ratios,
all_silu_v2_results, all_cuda_results,
all_triton_results, all_baseline_results,
config_labels, config_labels,
config_x_axis, config_x_axis,
) = result ) = result
@ -611,54 +555,42 @@ def create_total_tokens_plot(all_results):
ratios = all_ratios[config_idx] ratios = all_ratios[config_idx]
total_tokens_values = config_x_axis[config_idx] total_tokens_values = config_x_axis[config_idx]
# Extract speedup ratios # Extract CUDA and Triton bandwidth percentages
triton_v2_ratios = [ratio for ratio in ratios] cuda_bandwidth_percentages = [
result[3] for result in all_cuda_results[config_idx]
# Extract bandwidth percentages for all implementations
v2_bandwidth_percentages = [
result[3] for result in all_silu_v2_results[config_idx]
] ]
triton_bandwidth_percentages = [ triton_bandwidth_percentages = [
result[3] for result in all_triton_results[config_idx] result[3] for result in all_baseline_results[config_idx]
] ]
# Plot speedup ratios vs total tokens (left plot) # Plot speedup ratios vs total tokens (left plot)
ax_speedup.plot( ax_speedup.plot(
total_tokens_values, total_tokens_values, ratios, "bo-", linewidth=3, markersize=8
triton_v2_ratios,
"go-",
linewidth=3,
markersize=8,
label="Triton/V2 Speedup",
) )
ax_speedup.set_title( ax_speedup.set_title(
f"{strategy_name}\nSpeedup vs Baseline (Triton)\nE={E}, T={T}, H={H}", f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}",
fontsize=12, fontsize=12,
fontweight="bold", fontweight="bold",
) )
ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11) ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11) ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
ax_speedup.legend(prop={"weight": "bold"})
ax_speedup.grid(True, alpha=0.3) ax_speedup.grid(True, alpha=0.3)
# Plot bandwidth utilization (right plot)
ax_bandwidth.plot( ax_bandwidth.plot(
total_tokens_values, total_tokens_values,
v2_bandwidth_percentages, cuda_bandwidth_percentages,
"o-", "ro-",
linewidth=3, linewidth=3,
markersize=8, markersize=8,
label="SiLU V2", label="CUDA",
color="blue",
) )
ax_bandwidth.plot( ax_bandwidth.plot(
total_tokens_values, total_tokens_values,
triton_bandwidth_percentages, triton_bandwidth_percentages,
"o-", "go-",
linewidth=3, linewidth=3,
markersize=8, markersize=8,
label="Triton", label="Triton",
color="green",
) )
ax_bandwidth.set_title( ax_bandwidth.set_title(
f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}", f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
@ -686,12 +618,38 @@ def create_total_tokens_plot(all_results):
for label in ax.get_xticklabels() + ax.get_yticklabels(): for label in ax.get_xticklabels() + ax.get_yticklabels():
label.set_fontweight("bold") label.set_fontweight("bold")
# Add value labels on Triton/V2 speedup points # Add value labels on speedup points
for x, y in zip(total_tokens_values, triton_v2_ratios): for x, y in zip(total_tokens_values, ratios):
ax_speedup.annotate( ax_speedup.annotate(
f"{y:.2f}x", f"{y:.2f}x",
(x, y), (x, y),
textcoords="offset points", textcoords="offset points",
xytext=(0, 12),
ha="center",
fontsize=10,
fontweight="bold",
bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
)
# Add value labels on CUDA bandwidth points
for x, y in zip(total_tokens_values, cuda_bandwidth_percentages):
ax_bandwidth.annotate(
f"{y:.1f}%",
(x, y),
textcoords="offset points",
xytext=(0, 12),
ha="center",
fontsize=9,
fontweight="bold",
bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3),
)
# Add value labels on Triton bandwidth points
for x, y in zip(total_tokens_values, triton_bandwidth_percentages):
ax_bandwidth.annotate(
f"{y:.1f}%",
(x, y),
textcoords="offset points",
xytext=(0, -15), xytext=(0, -15),
ha="center", ha="center",
fontsize=9, fontsize=9,
@ -701,20 +659,17 @@ def create_total_tokens_plot(all_results):
plt.tight_layout() plt.tight_layout()
plt.subplots_adjust(top=0.93) # Make room for main title plt.subplots_adjust(top=0.93) # Make room for main title
filename = "silu_benchmark_total_tokens_3way.png" filename = "silu_benchmark_total_tokens.png"
plt.savefig(filename, dpi=300, bbox_inches="tight") plt.savefig(filename, dpi=300, bbox_inches="tight")
plt.show() plt.show()
return filename return filename
# Create comprehensive 3-way comparison plots # Create combined plot with all strategies
combined_plot_filename = create_combined_plot(all_results) combined_plot_filename = create_total_tokens_plot(all_results)
total_tokens_plot_filename = create_total_tokens_plot(all_results)
print(f"\n{'=' * 80}") print(f"\n{'=' * 60}")
print("3-Way Benchmark Suite Complete!") print("Benchmark Complete!")
print(f"Generated combined comparison plot: {combined_plot_filename}") print(f"Generated combined plot: {combined_plot_filename}")
print(f"Generated total tokens analysis plot: {total_tokens_plot_filename}") print(f"{'=' * 60}")
print("Compared: SiLU V2 (CUDA), and Triton implementations")
print(f"{'=' * 80}")

View File

@ -11,13 +11,13 @@ from datetime import datetime
from typing import Any from typing import Any
import torch import torch
import triton
from tqdm import tqdm from tqdm import tqdm
from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_w8a8_triton_block_scaled_mm, _w8a8_block_fp8_matmul,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
mp.set_start_method("spawn", force=True) mp.set_start_method("spawn", force=True)
@ -83,7 +83,7 @@ def w8a8_block_matmul(
) )
if A.dtype == torch.float8_e4m3fn: if A.dtype == torch.float8_e4m3fn:
kernel = _w8a8_triton_block_scaled_mm kernel = _w8a8_block_fp8_matmul
else: else:
raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.") raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")

View File

@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# fmt: off
# ruff: noqa: E501 # ruff: noqa: E501
import time import time
@ -7,33 +8,27 @@ import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.model_executor.layers.quantization.utils.fp8_utils import (
get_col_major_tma_aligned_tensor,
per_token_group_quant_fp8, per_token_group_quant_fp8,
w8a8_triton_block_scaled_mm, w8a8_block_fp8_matmul,
) )
from vllm.triton_utils import triton from vllm.triton_utils import triton
from vllm.utils.deep_gemm import ( from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
calc_diff,
fp8_gemm_nt,
get_col_major_tma_aligned_tensor,
per_block_cast_to_fp8,
)
def benchmark_shape( def benchmark_shape(m: int,
m: int, n: int,
n: int, k: int,
k: int, warmup: int = 100,
warmup: int = 100, repeat: int = 10000,
repeat: int = 10000, verbose: bool = False) -> dict:
verbose: bool = False,
) -> dict:
"""Benchmark all implementations for a specific (m, n, k) shape.""" """Benchmark all implementations for a specific (m, n, k) shape."""
if verbose: if verbose:
print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===") print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
# Create test tensors # Create test tensors
A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16) B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
# Reference result in BF16 # Reference result in BF16
torch.cuda.synchronize() torch.cuda.synchronize()
@ -50,39 +45,34 @@ def benchmark_shape(
# Pre-quantize A for all implementations # Pre-quantize A for all implementations
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1]) A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm) A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16) C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
A, block_size[1], column_major_scales=True A, block_size[1], column_major_scales=True)
)
# === DeepGEMM Implementation === # === DeepGEMM Implementation ===
def deepgemm_gemm(): def deepgemm_gemm():
fp8_gemm_nt( fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
(A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm (B_deepgemm, B_scale_deepgemm),
) C_deepgemm)
return C_deepgemm return C_deepgemm
# === vLLM Triton Implementation === # === vLLM Triton Implementation ===
def vllm_triton_gemm(): def vllm_triton_gemm():
return w8a8_triton_block_scaled_mm( return w8a8_block_fp8_matmul(A_vllm,
A_vllm, B_vllm,
B_vllm, A_scale_vllm,
A_scale_vllm, B_scale_vllm,
B_scale_vllm, block_size,
block_size, output_dtype=torch.bfloat16)
output_dtype=torch.bfloat16,
)
# === vLLM CUTLASS Implementation === # === vLLM CUTLASS Implementation ===
def vllm_cutlass_gemm(): def vllm_cutlass_gemm():
return ops.cutlass_scaled_mm( return ops.cutlass_scaled_mm(A_vllm_cutlass,
A_vllm_cutlass, B_vllm.T,
B_vllm.T, scale_a=A_scale_vllm_cutlass,
scale_a=A_scale_vllm_cutlass, scale_b=B_scale_vllm.T,
scale_b=B_scale_vllm.T, out_dtype=torch.bfloat16)
out_dtype=torch.bfloat16,
)
# Run correctness check first # Run correctness check first
if verbose: if verbose:
@ -99,23 +89,26 @@ def benchmark_shape(
print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}") print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}") print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}") print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
print( print("vLLM Triton vs DeepGEMM difference: "
"vLLM Triton vs DeepGEMM difference: " f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}" print("vLLM CUTLASS vs DeepGEMM difference: "
) f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")
print(
"vLLM CUTLASS vs DeepGEMM difference: "
f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
)
# Benchmark implementations # Benchmark implementations
implementations = { implementations = {
"DeepGEMM": deepgemm_gemm, "DeepGEMM": deepgemm_gemm,
"vLLM Triton": vllm_triton_gemm, "vLLM Triton": vllm_triton_gemm,
"vLLM CUTLASS": vllm_cutlass_gemm, "vLLM CUTLASS": vllm_cutlass_gemm
} }
benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}} benchmark_results = {
"shape": {
"m": m,
"n": n,
"k": k
},
"implementations": {}
}
for name, func in implementations.items(): for name, func in implementations.items():
# Warmup # Warmup
@ -143,36 +136,38 @@ def benchmark_shape(
"tflops": tflops, "tflops": tflops,
"gb_s": gb_s, "gb_s": gb_s,
"diff": { "diff": {
"DeepGEMM": 0.0 "DeepGEMM":
if name == "DeepGEMM" 0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
else calc_diff(func(), C_deepgemm), "Reference":
"Reference": deepgemm_diff deepgemm_diff if name == "DeepGEMM" else
if name == "DeepGEMM" (vllm_triton_diff
else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff), if name == "vLLM Triton" else vllm_cutlass_diff)
}, }
} }
if verbose: if verbose:
print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s") print(
f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
)
# Calculate speedups # Calculate speedups
baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"] baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
for name, data in benchmark_results["implementations"].items(): for name, data in benchmark_results["implementations"].items():
if name != "DeepGEMM": if name != "DeepGEMM":
speedup = baseline / data["time_ms"] speedup = baseline / data["time_ms"]
benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup benchmark_results["implementations"][name][
"speedup_vs_deepgemm"] = speedup
if verbose: if verbose:
print( print(f"DeepGEMM is {1/speedup:.2f}x "
f"DeepGEMM is {1 / speedup:.2f}x " f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
)
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"] vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"] "time_ms"]
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
"time_ms"]
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = ( benchmark_results["implementations"]["vLLM CUTLASS"][
cutlass_vs_triton "speedup_vs_triton"] = cutlass_vs_triton
)
if verbose: if verbose:
print( print(
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x " f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
@ -184,7 +179,8 @@ def benchmark_shape(
def format_table_row(values, widths): def format_table_row(values, widths):
"""Format a row with specified column widths.""" """Format a row with specified column widths."""
return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |" return "| " + " | ".join(f"{val:{w}}"
for val, w in zip(values, widths)) + " |"
def print_table(headers, rows, title=None): def print_table(headers, rows, title=None):
@ -292,50 +288,38 @@ def run_benchmarks(verbose: bool = False):
for result in all_results: for result in all_results:
shape = result["shape"] shape = result["shape"]
impl_data = result["implementations"]["DeepGEMM"] impl_data = result["implementations"]["DeepGEMM"]
deepgemm_rows.append( deepgemm_rows.append([
[ shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
shape["m"], f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
shape["n"], ])
shape["k"],
f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
]
)
print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:") print_table(deepgemm_headers,
deepgemm_rows,
title="DeepGEMM Implementation:")
# Print vLLM Triton table # Print vLLM Triton table
triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"] triton_headers = [
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
]
triton_rows = [] triton_rows = []
for result in all_results: for result in all_results:
shape = result["shape"] shape = result["shape"]
impl_data = result["implementations"]["vLLM Triton"] impl_data = result["implementations"]["vLLM Triton"]
speedup = impl_data.get("speedup_vs_deepgemm", 1.0) speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
triton_rows.append( triton_rows.append([
[ shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
shape["m"], f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
shape["n"], format_speedup(speedup)
shape["k"], ])
f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
format_speedup(speedup),
]
)
print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:") print_table(triton_headers,
triton_rows,
title="vLLM Triton Implementation:")
# Print vLLM CUTLASS table # Print vLLM CUTLASS table
cutlass_headers = [ cutlass_headers = [
"m", "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
"n", "vs Triton"
"k",
"Time (μs)",
"TFLOPS",
"GB/s",
"vs DeepGEMM",
"vs Triton",
] ]
cutlass_rows = [] cutlass_rows = []
for result in all_results: for result in all_results:
@ -343,27 +327,28 @@ def run_benchmarks(verbose: bool = False):
impl_data = result["implementations"]["vLLM CUTLASS"] impl_data = result["implementations"]["vLLM CUTLASS"]
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0) vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
vs_triton = impl_data.get("speedup_vs_triton", 1.0) vs_triton = impl_data.get("speedup_vs_triton", 1.0)
cutlass_rows.append( cutlass_rows.append([
[ shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
shape["m"], f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
shape["n"], format_speedup(vs_deepgemm),
shape["k"], format_speedup(vs_triton)
f"{impl_data['time_us']:.1f}", ])
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
format_speedup(vs_deepgemm),
format_speedup(vs_triton),
]
)
print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:") print_table(cutlass_headers,
cutlass_rows,
title="vLLM CUTLASS Implementation:")
# Calculate and print averages # Calculate and print averages
print("\n===== AVERAGE PERFORMANCE =====") print("\n===== AVERAGE PERFORMANCE =====")
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"] implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
avg_metrics = { avg_metrics = {
impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations impl: {
"tflops": 0,
"gb_s": 0,
"time_ms": 0
}
for impl in implementations
} }
for result in all_results: for result in all_results:
@ -381,9 +366,9 @@ def run_benchmarks(verbose: bool = False):
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
avg_time = avg_metrics[impl]["time_ms"] / num_shapes avg_time = avg_metrics[impl]["time_ms"] / num_shapes
avg_rows.append( avg_rows.append([
[impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"] impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
) ])
print_table(avg_headers, avg_rows) print_table(avg_headers, avg_rows)
@ -391,19 +376,21 @@ def run_benchmarks(verbose: bool = False):
avg_speedups = { avg_speedups = {
"DeepGEMM vs vLLM Triton": 0, "DeepGEMM vs vLLM Triton": 0,
"DeepGEMM vs vLLM CUTLASS": 0, "DeepGEMM vs vLLM CUTLASS": 0,
"vLLM CUTLASS vs vLLM Triton": 0, "vLLM CUTLASS vs vLLM Triton": 0
} }
for result in all_results: for result in all_results:
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"] deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"] vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"] vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
"time_ms"]
avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time avg_speedups[
avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
avg_speedups["vLLM CUTLASS vs vLLM Triton"] += ( avg_speedups[
vllm_triton_time / vllm_cutlass_time "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
) avg_speedups[
"vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
print("\n===== AVERAGE SPEEDUPS =====") print("\n===== AVERAGE SPEEDUPS =====")
speedup_headers = ["Comparison", "Speedup"] speedup_headers = ["Comparison", "Speedup"]
@ -421,7 +408,8 @@ def run_benchmarks(verbose: bool = False):
for result in all_results: for result in all_results:
for impl in implementations: for impl in implementations:
avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"] avg_diff[impl] += result["implementations"][impl]["diff"][
"Reference"]
diff_headers = ["Implementation", "Avg Diff vs Reference"] diff_headers = ["Implementation", "Avg Diff vs Reference"]
diff_rows = [] diff_rows = []

View File

@ -13,7 +13,7 @@ from datetime import datetime
from enum import Enum from enum import Enum
from http import HTTPStatus from http import HTTPStatus
from statistics import mean from statistics import mean
from typing import NamedTuple, Union from typing import NamedTuple, Optional, Union
import aiohttp # type: ignore import aiohttp # type: ignore
import numpy as np # type: ignore import numpy as np # type: ignore
@ -46,9 +46,9 @@ class ConversationSampling(str, Enum):
class ClientArgs(NamedTuple): class ClientArgs(NamedTuple):
seed: int seed: int
max_num_requests: int | None max_num_requests: Optional[int]
skip_first_turn: bool skip_first_turn: bool
max_turns: int | None max_turns: Optional[int]
max_active_conversations: int max_active_conversations: int
verbose: bool verbose: bool
print_content: bool print_content: bool
@ -109,9 +109,9 @@ class RequestStats(NamedTuple):
class MetricStats: class MetricStats:
def __init__(self) -> None: def __init__(self) -> None:
self.min: float | None = None self.min: Optional[float] = None
self.max: float | None = None self.max: Optional[float] = None
self.avg: float | None = None self.avg: Optional[float] = None
self.sum = 0.0 self.sum = 0.0
self.count = 0 self.count = 0
@ -143,7 +143,7 @@ class MovingAverage:
self.index = 0 self.index = 0
self.sum = 0.0 self.sum = 0.0
self.count = 0 self.count = 0
self.avg: float | None = None self.avg: Optional[float] = None
def update(self, new_value: float) -> None: def update(self, new_value: float) -> None:
if self.count < self.window_size: if self.count < self.window_size:
@ -198,6 +198,14 @@ class DebugStats:
self.logger.info("-" * 50) self.logger.info("-" * 50)
# Must support Python 3.8, we can't use str.removeprefix(prefix)
# introduced in Python 3.9
def remove_prefix(text: str, prefix: str) -> str:
    """Return ``text`` without a leading ``prefix``.

    Stand-in for ``str.removeprefix`` so the script keeps running on
    interpreters older than Python 3.9.

    Args:
        text: The string to trim.
        prefix: The leading substring to drop.

    Returns:
        ``text`` with one leading occurrence of ``prefix`` removed, or
        ``text`` unchanged when it does not start with ``prefix``.
    """
    if text.startswith(prefix):
        # Strip a single occurrence only; a repeated prefix is left in place.
        return text[len(prefix) :]
    return text
def nanosec_to_millisec(value: float) -> float: def nanosec_to_millisec(value: float) -> float:
return value / 1000000.0 return value / 1000000.0
@ -212,8 +220,8 @@ async def send_request(
chat_url: str, chat_url: str,
model: str, model: str,
stream: bool = True, stream: bool = True,
min_tokens: int | None = None, min_tokens: Optional[int] = None,
max_tokens: int | None = None, max_tokens: Optional[int] = None,
) -> ServerResponse: ) -> ServerResponse:
payload = { payload = {
"model": model, "model": model,
@ -242,9 +250,9 @@ async def send_request(
timeout = aiohttp.ClientTimeout(total=timeout_sec) timeout = aiohttp.ClientTimeout(total=timeout_sec)
valid_response = True valid_response = True
ttft: float | None = None ttft: Optional[float] = None
chunk_delay: list[int] = [] chunk_delay: list[int] = []
latency: float | None = None latency: Optional[float] = None
first_chunk = "" first_chunk = ""
generated_text = "" generated_text = ""
@ -261,7 +269,7 @@ async def send_request(
if not chunk_bytes: if not chunk_bytes:
continue continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
if chunk == "[DONE]": if chunk == "[DONE]":
# End of stream # End of stream
latency = time.perf_counter_ns() - start_time latency = time.perf_counter_ns() - start_time
@ -356,7 +364,7 @@ async def send_turn(
req_args: RequestArgs, req_args: RequestArgs,
verbose: bool, verbose: bool,
verify_output: bool, verify_output: bool,
) -> RequestStats | None: ) -> Optional[RequestStats]:
assert messages_to_use > 0 assert messages_to_use > 0
assert messages_to_use <= len(conversation_messages) assert messages_to_use <= len(conversation_messages)
@ -761,7 +769,7 @@ def get_client_config(
"Number of conversations must be equal or larger than the number of clients" "Number of conversations must be equal or larger than the number of clients"
) )
max_req_per_client: int | None = None max_req_per_client: Optional[int] = None
if args.max_num_requests is not None: if args.max_num_requests is not None:
# Max number of requests per client # Max number of requests per client
req_per_client = args.max_num_requests // args.num_clients req_per_client = args.max_num_requests // args.num_clients
@ -1024,7 +1032,7 @@ def process_statistics(
warmup_percentages: list[float], warmup_percentages: list[float],
test_params: dict, test_params: dict,
verbose: bool, verbose: bool,
gen_conv_args: GenConvArgs | None = None, gen_conv_args: Optional[GenConvArgs] = None,
excel_output: bool = False, excel_output: bool = False,
) -> None: ) -> None:
if len(client_metrics) == 0: if len(client_metrics) == 0:

49
benchmarks/pyproject.toml Normal file
View File

@ -0,0 +1,49 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
[tool.ruff]
line-length = 88
# Per-path rule exemptions. "ALL" disables every lint rule for matching files;
# F401 is the Pyflakes "imported but unused" check.
# NOTE(review): these globs are vllm/ paths while this file lives under
# benchmarks/ — confirm they are meant to match anything from here.
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
# Rule families that are enabled, followed by the individual rules that are
# switched off again in `ignore`.
[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
"UP",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
"I",
# flake8-logging-format
"G",
]
ignore = [
# star imports
"F405", "F403",
# lambda expression assignment
"E731",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
]
# Import sorting: treat `vllm` as a first-party package.
[tool.ruff.lint.isort]
known-first-party = ["vllm"]
# Formatter: also reformat code examples embedded in docstrings.
[tool.ruff.format]
docstring-code-format = true

View File

@ -101,7 +101,6 @@ else()
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
find_isa(${CPUINFO} "S390" S390_FOUND) find_isa(${CPUINFO} "S390" S390_FOUND)
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
endif() endif()
if (AVX512_FOUND AND NOT AVX512_DISABLED) if (AVX512_FOUND AND NOT AVX512_DISABLED)
@ -178,14 +177,8 @@ elseif (S390_FOUND)
"-mzvector" "-mzvector"
"-march=native" "-march=native"
"-mtune=native") "-mtune=native")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
if(RVV_FOUND)
message(FAIL_ERROR "Can't support rvv now.")
else()
list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
endif()
else() else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.") message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.")
endif() endif()
# #
@ -213,7 +206,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
endif() endif()
set(ONEDNN_AARCH64_USE_ACL "ON") set(ONEDNN_AARCH64_USE_ACL "ON")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
add_compile_definitions(VLLM_USE_ACL)
endif() endif()
set(ONEDNN_LIBRARY_TYPE "STATIC") set(ONEDNN_LIBRARY_TYPE "STATIC")
@ -227,7 +219,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
set(ONEDNN_ENABLE_ITT_TASKS "OFF") set(ONEDNN_ENABLE_ITT_TASKS "OFF")
set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
set(ONEDNN_VERBOSE "ON") set(ONEDNN_VERBOSE "OFF")
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
FetchContent_MakeAvailable(oneDNN) FetchContent_MakeAvailable(oneDNN)
@ -266,8 +258,7 @@ set(VLLM_EXT_SRC
"csrc/cpu/layernorm.cpp" "csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp" "csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp" "csrc/cpu/pos_encoding.cpp"
"csrc/cpu/torch_bindings.cpp" "csrc/cpu/torch_bindings.cpp")
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
if (AVX512_FOUND AND NOT AVX512_DISABLED) if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC set(VLLM_EXT_SRC

View File

@ -18,8 +18,8 @@ if(FLASH_MLA_SRC_DIR)
else() else()
FetchContent_Declare( FetchContent_Declare(
flashmla flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f GIT_TAG a757314c04eedd166e329e846c820eb1bdd702de
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
BUILD_COMMAND "" BUILD_COMMAND ""
@ -33,64 +33,23 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later. # The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
# Only build FlashMLA kernels if we are building for something compatible with # Only build FlashMLA kernels if we are building for something compatible with
# sm90a # sm90a
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
set(SUPPORT_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
list(APPEND SUPPORT_ARCHS 9.0a)
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
list(APPEND SUPPORT_ARCHS 10.0a)
endif()
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
if(FLASH_MLA_ARCHS)
set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
set(FlashMLA_SOURCES set(FlashMLA_SOURCES
${flashmla_SOURCE_DIR}/csrc/torch_api.cpp ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
${flashmla_SOURCE_DIR}/csrc/pybind.cpp ${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu
${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu ${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu ${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu ${flashmla_SOURCE_DIR}/csrc/kernels_fp8/flash_fwd_mla_fp8_sm90.cu)
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
)
set(FlashMLA_Extension_SOURCES
${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
)
set(FlashMLA_INCLUDES set(FlashMLA_INCLUDES
${flashmla_SOURCE_DIR}/csrc
${flashmla_SOURCE_DIR}/csrc/sm90
${flashmla_SOURCE_DIR}/csrc/cutlass/include ${flashmla_SOURCE_DIR}/csrc/cutlass/include
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include ${flashmla_SOURCE_DIR}/csrc)
)
set(FlashMLA_Extension_INCLUDES
${flashmla_SOURCE_DIR}/csrc
${flashmla_SOURCE_DIR}/csrc/sm90
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
${flashmla_SOURCE_DIR}/csrc/cutlass/include
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
)
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${FlashMLA_SOURCES}" SRCS "${FlashMLA_SOURCES}"
CUDA_ARCHS "${FLASH_MLA_ARCHS}") CUDA_ARCHS "${FLASH_MLA_ARCHS}")
set_gencode_flags_for_srcs(
SRCS "${FlashMLA_Extension_SOURCES}"
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
define_gpu_extension_target( define_gpu_extension_target(
_flashmla_C _flashmla_C
DESTINATION vllm DESTINATION vllm
@ -101,32 +60,8 @@ if(FLASH_MLA_ARCHS)
INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES} INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
USE_SABI 3 USE_SABI 3
WITH_SOABI) WITH_SOABI)
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
target_compile_options(_flashmla_C PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
define_gpu_extension_target(
_flashmla_extension_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${FlashMLA_Extension_SOURCES}
COMPILE_FLAGS ${VLLM_FLASHMLA_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${FlashMLA_Extension_INCLUDES}
USE_SABI 3
WITH_SOABI)
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
target_compile_options(_flashmla_extension_C PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
else() else()
# Create empty targets for setup.py when not targeting sm90a systems # Create an empty target for setup.py when not targeting sm90a systems
add_custom_target(_flashmla_C) add_custom_target(_flashmla_C)
add_custom_target(_flashmla_extension_C)
endif() endif()

View File

@ -1,97 +0,0 @@
# QuTLASS integration.
#
# Resolves the QuTLASS sources (a local checkout or a pinned git revision) and,
# when the toolchain and target architectures allow it, compiles the QuTLASS
# kernels directly into the `_C` extension target.
#
# Inputs read by this script:
#   QUTLASS_SRC_DIR      CMake variable or environment variable; optional path
#                        to a local QuTLASS checkout. When set, nothing is
#                        downloaded.
#   CUTLASS_INCLUDE_DIR  Optional path to a CUTLASS include/ directory. Falls
#                        back to the CUTLASS copy vendored inside QuTLASS.
#   CUDA_ARCHS           Target architectures, intersected with 12.0a / 10.0a.
#
# NOTE(review): assumes the `_C` target and the helpers
# `cuda_archs_loose_intersection` / `set_gencode_flags_for_srcs` are defined
# before this file is included — confirm against the including CMakeLists.
include(FetchContent)

set(CUTLASS_INCLUDE_DIR "${CUTLASS_INCLUDE_DIR}" CACHE PATH "Path to CUTLASS include/ directory")

# An environment override takes precedence over a -DQUTLASS_SRC_DIR value.
if(DEFINED ENV{QUTLASS_SRC_DIR})
  set(QUTLASS_SRC_DIR $ENV{QUTLASS_SRC_DIR})
endif()

if(QUTLASS_SRC_DIR)
  # Use the caller-provided checkout as-is.
  FetchContent_Declare(
    qutlass
    SOURCE_DIR ${QUTLASS_SRC_DIR}
    CONFIGURE_COMMAND ""
    BUILD_COMMAND ""
  )
else()
  # Download the pinned QuTLASS revision.
  FetchContent_Declare(
    qutlass
    GIT_REPOSITORY https://github.com/IST-DASLab/qutlass.git
    GIT_TAG 830d2c4537c7396e14a02a46fbddd18b5d107c65
    GIT_PROGRESS TRUE
    CONFIGURE_COMMAND ""
    BUILD_COMMAND ""
  )
endif()

# Populate for BOTH branches. FetchContent_Declare only records the details;
# qutlass_SOURCE_DIR is defined by the populate step. Previously the populate
# call lived only in the download branch, so a local QUTLASS_SRC_DIR always
# fell through to the FATAL_ERROR below.
FetchContent_GetProperties(qutlass)
if(NOT qutlass_POPULATED)
  FetchContent_Populate(qutlass)
endif()

if(NOT qutlass_SOURCE_DIR)
  message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
endif()
message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")

# Keep only the requested architectures that QuTLASS has kernels for.
cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}")

if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS)
  # A single TARGET_CUDA_ARCH value is compiled in; 10.0a wins when both
  # architectures are present.
  if(QUTLASS_ARCHS MATCHES "10\\.0a")
    set(QUTLASS_TARGET_CC 100)
  elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
    set(QUTLASS_TARGET_CC 120)
  else()
    message(FATAL_ERROR "[QUTLASS] internal error parsing CUDA_ARCHS='${QUTLASS_ARCHS}'.")
  endif()

  set(QUTLASS_SOURCES
    ${qutlass_SOURCE_DIR}/qutlass/csrc/bindings.cpp
    ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm.cu
    ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm_ada.cu
    ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx.cu
    ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv.cu
    ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx_sm100.cu
    ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv_sm100.cu
  )

  set(QUTLASS_INCLUDES
    ${qutlass_SOURCE_DIR}
    ${qutlass_SOURCE_DIR}/qutlass
    ${qutlass_SOURCE_DIR}/qutlass/csrc/include
    ${qutlass_SOURCE_DIR}/qutlass/csrc/include/cutlass_extensions
  )

  # Prefer an explicitly provided CUTLASS; otherwise use the copy vendored in
  # the QuTLASS tree; fail if neither has cutlass/cutlass.h.
  if(CUTLASS_INCLUDE_DIR AND EXISTS "${CUTLASS_INCLUDE_DIR}/cutlass/cutlass.h")
    list(APPEND QUTLASS_INCLUDES "${CUTLASS_INCLUDE_DIR}")
  elseif(EXISTS "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include/cutlass/cutlass.h")
    list(APPEND QUTLASS_INCLUDES "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include")
    message(STATUS "[QUTLASS] Using QuTLASS vendored CUTLASS headers (no vLLM CUTLASS detected).")
  else()
    message(FATAL_ERROR "[QUTLASS] CUTLASS headers not found. "
      "Set -DCUTLASS_INCLUDE_DIR=/path/to/cutlass/include")
  endif()

  set_gencode_flags_for_srcs(
    SRCS "${QUTLASS_SOURCES}"
    CUDA_ARCHS "${QUTLASS_ARCHS}"
  )

  # Build the QuTLASS sources as part of the `_C` extension target.
  target_sources(_C PRIVATE ${QUTLASS_SOURCES})
  target_include_directories(_C PRIVATE ${QUTLASS_INCLUDES})
  target_compile_definitions(_C PRIVATE
    QUTLASS_DISABLE_PYBIND=1
    TARGET_CUDA_ARCH=${QUTLASS_TARGET_CC}
  )

  # Extra nvcc options apply to the QuTLASS sources only, not all of `_C`.
  set_property(SOURCE ${QUTLASS_SOURCES} APPEND PROPERTY COMPILE_OPTIONS
    $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr --use_fast_math -O3>
  )
else()
  # Report why the kernels are being skipped.
  if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "12.8")
    message(STATUS
      "[QUTLASS] Skipping build: CUDA 12.8 or newer is required (found ${CMAKE_CUDA_COMPILER_VERSION}).")
  else()
    message(STATUS
      "[QUTLASS] Skipping build: no supported arch (12.0a / 10.0a) found in "
      "CUDA_ARCHS='${CUDA_ARCHS}'.")
  endif()
endif()

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare( FetchContent_Declare(
vllm-flash-attn vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 8f468e7da54a8e2f98abfa7c38636aac91c0cba1 GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types # Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -16,7 +16,7 @@ import shutil
from torch.utils.hipify.hipify_python import hipify from torch.utils.hipify.hipify_python import hipify
if __name__ == "__main__": if __name__ == '__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
# Project directory where all the source + include files live. # Project directory where all the source + include files live.
@ -34,14 +34,15 @@ if __name__ == "__main__":
) )
# Source files to convert. # Source files to convert.
parser.add_argument( parser.add_argument("sources",
"sources", help="Source files to hipify.", nargs="*", default=[] help="Source files to hipify.",
) nargs="*",
default=[])
args = parser.parse_args() args = parser.parse_args()
# Limit include scope to project_dir only # Limit include scope to project_dir only
includes = [os.path.join(args.project_dir, "*")] includes = [os.path.join(args.project_dir, '*')]
# Get absolute path for all source files. # Get absolute path for all source files.
extra_files = [os.path.abspath(s) for s in args.sources] extra_files = [os.path.abspath(s) for s in args.sources]
@ -50,31 +51,25 @@ if __name__ == "__main__":
# The directory might already exist to hold object files so we ignore that. # The directory might already exist to hold object files so we ignore that.
shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True) shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
hipify_result = hipify( hipify_result = hipify(project_directory=args.project_dir,
project_directory=args.project_dir, output_directory=args.output_dir,
output_directory=args.output_dir, header_include_dirs=[],
header_include_dirs=[], includes=includes,
includes=includes, extra_files=extra_files,
extra_files=extra_files, show_detailed=True,
show_detailed=True, is_pytorch_extension=True,
is_pytorch_extension=True, hipify_extra_files_only=True)
hipify_extra_files_only=True,
)
hipified_sources = [] hipified_sources = []
for source in args.sources: for source in args.sources:
s_abs = os.path.abspath(source) s_abs = os.path.abspath(source)
hipified_s_abs = ( hipified_s_abs = (hipify_result[s_abs].hipified_path if
hipify_result[s_abs].hipified_path (s_abs in hipify_result
if ( and hipify_result[s_abs].hipified_path is not None)
s_abs in hipify_result else s_abs)
and hipify_result[s_abs].hipified_path is not None
)
else s_abs
)
hipified_sources.append(hipified_s_abs) hipified_sources.append(hipified_s_abs)
assert len(hipified_sources) == len(args.sources) assert (len(hipified_sources) == len(args.sources))
# Print hipified source files. # Print hipified source files.
print("\n".join(hipified_sources)) print("\n".join(hipified_sources))

View File

@ -310,13 +310,13 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
list(REMOVE_DUPLICATES _PTX_ARCHS) list(REMOVE_DUPLICATES _PTX_ARCHS)
list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS) list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)
# If x.0a or x.0f is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
# remove x.0a or x.0f from SRC_CUDA_ARCHS and add x.0a or x.0f to _CUDA_ARCHS # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
set(_CUDA_ARCHS) set(_CUDA_ARCHS)
foreach(_arch ${_SRC_CUDA_ARCHS}) foreach(_arch ${_SRC_CUDA_ARCHS})
if(_arch MATCHES "[af]$") if(_arch MATCHES "\\a$")
list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}") list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
string(REGEX REPLACE "[af]$" "" _base "${_arch}") string(REPLACE "a" "" _base "${_arch}")
if ("${_base}" IN_LIST TGT_CUDA_ARCHS) if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}") list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
list(APPEND _CUDA_ARCHS "${_arch}") list(APPEND _CUDA_ARCHS "${_arch}")

View File

@ -28,10 +28,10 @@
#ifdef USE_ROCM #ifdef USE_ROCM
#include <hip/hip_bf16.h> #include <hip/hip_bf16.h>
#include "../quantization/w8a8/fp8/amd/quant_utils.cuh" #include "../quantization/fp8/amd/quant_utils.cuh"
typedef __hip_bfloat16 __nv_bfloat16; typedef __hip_bfloat16 __nv_bfloat16;
#else #else
#include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh" #include "../quantization/fp8/nvidia/quant_utils.cuh"
#endif #endif
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))

View File

@ -135,10 +135,10 @@ public:
max_splits = min(16, max_splits); max_splits = min(16, max_splits);
// TODO: This avoids a hang when the batch size larger than 1 and // TODO: This avoids a hang when the batch size larger than 1 and
// there is more than 1 kv_splits. // there is more than 4 kv_splits.
// Discuss with NVIDIA how this can be fixed. // Discuss with NVIDIA how this can be fixed.
if (B > 1) { if (B > 1) {
max_splits = min(1, max_splits); max_splits = min(2, max_splits);
} }
// printf(" max_splits = %d\n", max_splits); // printf(" max_splits = %d\n", max_splits);

View File

@ -580,22 +580,22 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
for (; tile_scheduler.is_valid(); ++tile_scheduler) { for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord(); auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape; auto problem_shape = params.problem_shape;
auto local_split_kv = params.split_kv; auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) { if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (params.ptr_split_kv != nullptr) { if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
} }
} }
if (local_split_kv <= get<3>(blk_coord)) if (local_split_kv <= get<3>(blk_coord))
continue; continue;
load_page_table( load_page_table(
blk_coord, blk_coord,
problem_shape, problem_shape,
params.mainloop, params.mainloop,
shared_storage.tensors, shared_storage.tensors,
pipeline_page_table, pipeline_pt_producer_state, pipeline_page_table, pipeline_pt_producer_state,
local_split_kv local_split_kv
); );
} }
} }
@ -604,15 +604,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
CUTLASS_PRAGMA_NO_UNROLL CUTLASS_PRAGMA_NO_UNROLL
for (; tile_scheduler.is_valid(); ++tile_scheduler) { for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord(); auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape; auto problem_shape = params.problem_shape;
auto local_split_kv = params.split_kv; auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) { if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (params.ptr_split_kv != nullptr) { if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
} }
} }
if (local_split_kv <= get<3>(blk_coord)) if (local_split_kv <= get<3>(blk_coord))
continue; continue;
load_cpasync( load_cpasync(
blk_coord, blk_coord,
@ -621,7 +621,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
params.mainloop_params, params.mainloop_params,
shared_storage.tensors, shared_storage.tensors,
pipeline_load_qk, pipeline_load_qk_producer_state, pipeline_load_qk, pipeline_load_qk_producer_state,
local_split_kv, local_split_kv,
/* must be shared pipe */ /* must be shared pipe */
pipeline_page_table, pipeline_pt_consumer_state pipeline_page_table, pipeline_pt_consumer_state
); );
@ -633,15 +633,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
CUTLASS_PRAGMA_NO_UNROLL CUTLASS_PRAGMA_NO_UNROLL
for (; tile_scheduler.is_valid(); ++tile_scheduler) { for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord(); auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape; auto problem_shape = params.problem_shape;
auto local_split_kv = params.split_kv; auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) { if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (params.ptr_split_kv != nullptr) { if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
} }
} }
if (local_split_kv <= get<3>(blk_coord)) if (local_split_kv <= get<3>(blk_coord))
continue; continue;
load_tma</* paged= */ true>( load_tma</* paged= */ true>(
blk_coord, blk_coord,
@ -651,7 +651,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
shared_storage.tensors, shared_storage.tensors,
pipeline_load_qk, pipeline_load_qk_producer_state, pipeline_load_qk, pipeline_load_qk_producer_state,
pipeline_load_qk, pipeline_load_qk_producer_state, pipeline_load_qk, pipeline_load_qk_producer_state,
local_split_kv local_split_kv
); );
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait(); cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
} }
@ -660,15 +660,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
CUTLASS_PRAGMA_NO_UNROLL CUTLASS_PRAGMA_NO_UNROLL
for (; tile_scheduler.is_valid(); ++tile_scheduler) { for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord(); auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape; auto problem_shape = params.problem_shape;
auto local_split_kv = params.split_kv; auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) { if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (params.ptr_split_kv != nullptr) { if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
} }
} }
if (local_split_kv <= get<3>(blk_coord)) if (local_split_kv <= get<3>(blk_coord))
continue; continue;
load_tma<false>( load_tma<false>(
blk_coord, blk_coord,
@ -678,7 +678,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
shared_storage.tensors, shared_storage.tensors,
pipeline_load_qk, pipeline_load_qk_producer_state, pipeline_load_qk, pipeline_load_qk_producer_state,
pipeline_load_qk, pipeline_load_qk_producer_state, pipeline_load_qk, pipeline_load_qk_producer_state,
local_split_kv local_split_kv
); );
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait(); cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
} }
@ -694,14 +694,14 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
for (; tile_scheduler.is_valid(); ++tile_scheduler) { for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord(); auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape; auto problem_shape = params.problem_shape;
auto local_split_kv = params.split_kv; auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) { if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (params.ptr_split_kv != nullptr) { if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
} }
} }
if (local_split_kv <= get<3>(blk_coord)) if (local_split_kv <= get<3>(blk_coord))
continue; continue;
mma(blk_coord, mma(blk_coord,
problem_shape, problem_shape,
@ -711,7 +711,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
pipeline_mma_s, pipeline_mma_s_producer_state, pipeline_mma_s, pipeline_mma_s_producer_state,
pipeline_p_mma, pipeline_p_mma_consumer_state, pipeline_p_mma, pipeline_p_mma_consumer_state,
pipeline_mma_o, pipeline_mma_o_producer_state, pipeline_mma_o, pipeline_mma_o_producer_state,
local_split_kv local_split_kv
); );
} }
} }
@ -726,15 +726,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
for (; tile_scheduler.is_valid(); ++tile_scheduler) { for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord(); auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape; auto problem_shape = params.problem_shape;
auto split_kv = params.split_kv; auto split_kv = params.split_kv;
auto local_split_kv = split_kv; auto local_split_kv = split_kv;
if (params.mainloop.ptr_seq != nullptr) { if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (params.ptr_split_kv != nullptr) { if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
} }
} }
if (local_split_kv <= get<3>(blk_coord)) if (local_split_kv <= get<3>(blk_coord))
continue; continue;
compute( compute(
blk_coord, blk_coord,
@ -745,7 +745,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
pipeline_mma_s, pipeline_mma_s_consumer_state, pipeline_mma_s, pipeline_mma_s_consumer_state,
pipeline_p_mma, pipeline_p_mma_producer_state, pipeline_p_mma, pipeline_p_mma_producer_state,
pipeline_mma_o, pipeline_mma_o_consumer_state, pipeline_mma_o, pipeline_mma_o_consumer_state,
local_split_kv local_split_kv
); );
} }
@ -1900,7 +1900,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
cutlass::arch::NamedBarrier( cutlass::arch::NamedBarrier(
(kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, (kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp,
kNamedBarrierEpilogue kNamedBarrierEpilogue
).arrive_and_wait(); ).arrive();
return; return;
} }

View File

@ -56,19 +56,3 @@ void cp_gather_cache(
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1] torch::Tensor const& cu_seq_lens, // [BATCH+1]
int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt); int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
// Indexer K quantization and cache function
// Declaration only. Takes the key tensor `k`, the destination `kv_cache`, a
// per-token `slot_mapping`, the quantization block size and a scale-format
// string; the expected shapes are annotated on each parameter below.
void indexer_k_quant_and_cache(
torch::Tensor& k, // [num_tokens, head_dim]
torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
torch::Tensor& slot_mapping, // [num_tokens]
int64_t quant_block_size, // quantization block size
const std::string& scale_fmt);
// Extract function to gather quantized K cache
// Declaration only. `kv_cache`, `block_table` and `cu_seq_lens` are inputs
// (const); `dst_k` and `dst_scale` are the output tensors. Expected shapes
// are annotated on each parameter below.
void cp_gather_indexer_k_quant_cache(
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
torch::Tensor& dst_k, // [num_tokens, head_dim]
torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4]
const torch::Tensor& block_table, // [batch_size, num_blocks]
const torch::Tensor& cu_seq_lens); // [batch_size + 1]

View File

@ -9,14 +9,15 @@
#include "quantization/vectorization_utils.cuh" #include "quantization/vectorization_utils.cuh"
#ifdef USE_ROCM #ifdef USE_ROCM
#include "quantization/w8a8/fp8/amd/quant_utils.cuh" #include "quantization/fp8/amd/quant_utils.cuh"
#else #else
#include "quantization/w8a8/fp8/nvidia/quant_utils.cuh" #include "quantization/fp8/nvidia/quant_utils.cuh"
#endif #endif
#include <algorithm> #include <algorithm>
#include <cassert> #include <cassert>
#include <cfloat> #include <map>
#include <vector>
#ifdef USE_ROCM #ifdef USE_ROCM
#include <hip/hip_bf16.h> #include <hip/hip_bf16.h>
@ -208,20 +209,6 @@ void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
namespace vllm { namespace vllm {
// Per-element functor: writes one source value into `dst`.
// For Fp8KVCacheDataType::kAuto the value is cast directly to OutT; for every
// other cache type it goes through fp8::scaled_convert with `scale`.
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
struct CopyWithScaleOp {
  float scale;

  __device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
    if constexpr (kv_dt != Fp8KVCacheDataType::kAuto) {
      dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
    } else {
      dst = static_cast<OutT>(src);
    }
  }
};
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt> template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void reshape_and_cache_kernel( __global__ void reshape_and_cache_kernel(
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
@ -237,51 +224,59 @@ __global__ void reshape_and_cache_kernel(
const int64_t token_idx = blockIdx.x; const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx]; const int64_t slot_idx = slot_mapping[token_idx];
if (slot_idx < 0) { if (slot_idx < 0) {
// Padding token that should be ignored.
return; return;
} }
const int64_t block_idx = slot_idx / block_size; const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size; const int64_t block_offset = slot_idx % block_size;
const int h_block_count = head_size / x; // head_size//x
const int h_block_idx = threadIdx.x; const int n = num_heads * head_size;
if (h_block_idx >= num_heads * h_block_count) { for (int i = threadIdx.x; i < n; i += blockDim.x) {
return; const int64_t src_key_idx = token_idx * key_stride + i;
} const int64_t src_value_idx = token_idx * value_stride + i;
const int head_idx = h_block_idx / h_block_count; const int head_idx = i / head_size;
const int h_block = h_block_idx % h_block_count; const int head_offset = i % head_size;
const int x_idx = head_offset / x;
const int x_offset = head_offset % x;
const scalar_t* __restrict__ key_src = const int64_t tgt_key_idx =
key + token_idx * key_stride + head_idx * head_size + h_block * x; block_idx * num_heads * (head_size / x) * block_size * x +
const int64_t src_value_start = head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
token_idx * value_stride + head_idx * head_size + h_block * x; block_offset * x + x_offset;
const int64_t tgt_value_idx =
cache_t* __restrict__ key_dst = block_idx * num_heads * head_size * block_size +
key_cache + block_idx * num_heads * h_block_count * block_size * x + head_idx * head_size * block_size + head_offset * block_size +
head_idx * h_block_count * block_size * x + h_block * block_size * x + block_offset;
block_offset * x; scalar_t tgt_key = key[src_key_idx];
const int64_t tgt_value_start = scalar_t tgt_value = value[src_value_idx];
block_idx * num_heads * h_block_count * x * block_size + if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
head_idx * h_block_count * x * block_size + h_block * x * block_size + key_cache[tgt_key_idx] = tgt_key;
block_offset; value_cache[tgt_value_idx] = tgt_value;
} else {
constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4; key_cache[tgt_key_idx] =
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale; fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val}; value_cache[tgt_value_idx] =
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale; fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val}; }
vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, x, 0, 1, k_op);
const scalar_t* __restrict__ value_src = value + src_value_start;
cache_t* __restrict__ value_dst = value_cache + tgt_value_start;
#pragma unroll
for (int i = 0; i < x; i++) {
v_op(value_dst[i * block_size], value_src[i]);
} }
} }
// Functor handed to the vectorization utilities to copy/convert one element.
// For Fp8KVCacheDataType::kAuto the value is cast directly to OutT; for every
// other cache type it goes through fp8::scaled_convert with `scale`.
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
struct CopyWithScaleOp {
  float scale;

  __device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
    if constexpr (kv_dt != Fp8KVCacheDataType::kAuto) {
      dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
    } else {
      dst = static_cast<OutT>(src);
    }
  }
};
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt> template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void reshape_and_cache_flash_kernel( __global__ void reshape_and_cache_flash_kernel(
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
@ -401,241 +396,6 @@ __global__ void concat_and_cache_mla_kernel(
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank); copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
} }
// Packs one token (blockIdx.x) into its kv_cache entry using a mixed layout:
// quantized NoPE values first, then one float scale per tile, then the RoPE
// values copied unquantized as 16-bit data.
//
// Thread roles (by threadIdx.x):
//   0..63  : each thread loads 8 kv_c elements, a 16-lane group reduces the
//            max |x| for its tile, and every thread stores 8 converted bytes.
//   >= 64  : each thread copies two consecutive k_pe elements.
//
// NOTE(review): the `scale` argument and the `kv_dt` template parameter are
// not referenced in the body; the conversion below is hard-coded to
// Fp8KVCacheDataType::kFp8E4M3 with a scale computed per tile.
// NOTE(review): the offsets (kv_lora_rank / 2 + 8, kv_lora_rank / 4) presume
// 16-bit scalar_t and a byte-addressed cache_t with 4 tiles — confirm against
// the host-side checks for the fp8_ds_mla cache type.
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void concat_and_cache_ds_mla_kernel(
const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank]
const scalar_t* __restrict__ k_pe, // [num_tokens, pe_dim]
cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank
// + pe_dim)]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int block_stride, //
const int entry_stride, //
const int kv_c_stride, //
const int k_pe_stride, //
const int kv_lora_rank, //
const int pe_dim, //
const int block_size, //
const float* scale //
) {
const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx];
// NOTE: slot_idx can be -1 if the token is padded
if (slot_idx < 0) {
return;
}
// Split the flat slot into (cache block, position inside the block).
const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size;
// Offset of this token's entry, in cache_t elements.
const int64_t dst_idx_start =
block_idx * block_stride + block_offset * entry_stride;
// For the NoPE part, each tile of 128 elements is handled by half of one warp
// (16 threads). There are 4 total tiles, so 2 warps (64 threads).
// Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
// The RoPE part (last 64 elements) is handled by another 1 warp (32 threads).
// So in total, we use 3 warps (96 threads) per block.
// Cast kv_cache to 16_bit for RoPE values
scalar_t* kv_cache_16bit =
reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]);
// The last warp handles the RoPE part
if (threadIdx.x >= 64) {
// Each thread handles two elements of RoPE
const int8_t pe_idx_start = (threadIdx.x - 64) * 2;
const int64_t src_idx = token_idx * k_pe_stride + pe_idx_start;
// Vectorized load of two 16-bit values, performed as one 32-bit load
const int32_t vals = *reinterpret_cast<const int32_t*>(&k_pe[src_idx]);
// RoPE values start after the packed 8-bit NoPE values and the
// 32-bit scales
const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx_start;
// Vectorized store of two 16-bit values, performed as one 32-bit store
*reinterpret_cast<int32_t*>(&kv_cache_16bit[dst_idx]) = vals;
return;
}
// The first two warps handle the NoPE part
const int8_t warp_idx = threadIdx.x >> 5;
const int8_t lane_idx = threadIdx.x & 31;
// Two tiles per warp: lanes 0-15 take the first, lanes 16-31 the second.
const int8_t tile_idx = warp_idx * 2 + (lane_idx >> 4);
// Each thread handles 8 elements of NoPE
// Load the NoPE elements for this thread into registers
const int64_t src_idx_start = token_idx * kv_c_stride + (threadIdx.x * 8);
// Vectorized load of eight 16-bit values, performed as an int4 load
// NOTE(review): requires &kv_c[src_idx_start] to be 16-byte aligned — confirm.
const int4 vals_i4 = *reinterpret_cast<const int4*>(&kv_c[src_idx_start]);
const scalar_t* vals = reinterpret_cast<const scalar_t*>(&vals_i4);
// Max absolute value of this thread's elements
float max_abs = fmaxf(fmaxf(fmaxf(fabsf(vals[0]), fabsf(vals[1])),
fmaxf(fabsf(vals[2]), fabsf(vals[3]))),
fmaxf(fmaxf(fabsf(vals[4]), fabsf(vals[5])),
fmaxf(fabsf(vals[6]), fabsf(vals[7]))));
// Warp-level reduction to find the max absolute value in each half-warp
#pragma unroll
for (int offset = 8; offset > 0; offset /= 2) {
max_abs = fmaxf(max_abs, VLLM_SHFL_XOR_SYNC_WIDTH(max_abs, offset, 16));
}
// Compute the scale for the tile
float tile_scale = max_abs / 448.f;
// Clamp to FLT_MIN so an all-zero tile never yields a zero scale.
tile_scale = fmaxf(tile_scale, FLT_MIN);
// The first lane of each half-warp writes the scale to kv_cache
if ((lane_idx == 0) || (lane_idx == 16)) {
float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]);
// Index is in float units from the start of the entry.
const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx;
kv_cache_32bit[dst_idx] = tile_scale;
}
// Now all threads in the block scale and write their elements
// NoPE data is packed in the first kv_lora_rank/2 bytes (first 256 bytes)
const int64_t dst_idx_base = dst_idx_start + (threadIdx.x * 8);
uint8_t result[8];
#pragma unroll
for (int i = 0; i < 8; i++) {
result[i] =
fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
vals[i], tile_scale);
}
// Store as aligned 64-bit writes
*reinterpret_cast<uint64_t*>(&kv_cache[dst_idx_base]) =
*reinterpret_cast<const uint64_t*>(result);
}
// Quantizes one token's K vector and stores both the converted values and a
// float scale into kv_cache.
//
// blockIdx.x selects the token. Each thread owns VEC_SIZE (4) consecutive
// elements starting at head_dim_idx, derived from blockIdx.y and the 2-D
// thread index. The absolute maximum is reduced across the 32 lanes of a
// warp, so one scale covers 32 * VEC_SIZE = 128 elements.
//
// NOTE(review): the reduction width is fixed by the warp size and does not
// depend on `quant_block_size`; presumably callers always use 128 — confirm.
// NOTE(review): the float2 load below fetches 8 bytes for 4 elements, which
// matches only a 2-byte scalar_t — confirm.
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void indexer_k_quant_and_cache_kernel(
const scalar_t* __restrict__ k, // [num_tokens, head_dim]
cache_t* __restrict__ kv_cache, // [num_blocks, block_size, cache_stride]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int head_dim, // dimension of each head
const int quant_block_size, // quantization block size
const int cache_block_size, // cache block size
const int cache_stride, // stride for each token in kv_cache
const bool use_ue8m0 // use ue8m0 scale format
) {
constexpr int VEC_SIZE = 4;
const int64_t token_idx = blockIdx.x;
// First element handled by this thread within the token's row.
const int64_t head_dim_idx = (blockIdx.y * blockDim.y * blockDim.x +
threadIdx.y * blockDim.x + threadIdx.x) *
VEC_SIZE;
const int64_t slot_idx = slot_mapping[token_idx];
// Computed before the padding check below; unused when slot_idx < 0.
const int64_t block_idx = slot_idx / cache_block_size;
const int64_t block_offset = slot_idx % cache_block_size;
// NOTE: slot_idx can be -1 if the token is padded
if (slot_idx < 0 || (head_dim_idx >= head_dim)) {
return;
}
// One 8-byte load, reinterpreted as VEC_SIZE scalar_t values.
float2 k_val = (reinterpret_cast<const float2*>(
k))[(token_idx * head_dim + head_dim_idx) / VEC_SIZE];
scalar_t* k_val_ptr = reinterpret_cast<scalar_t*>(&k_val);
float amax = 0.0f;
for (int i = 0; i < VEC_SIZE; i++) {
amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
}
#ifndef USE_ROCM
__syncwarp();
#endif
// Reduced amax
for (int mask = 16; mask > 0; mask /= 2) {
#ifdef USE_ROCM
amax = fmaxf(amax, __shfl_xor_sync(uint64_t(-1), amax, mask));
#else
amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
#endif
}
#ifndef USE_ROCM
__syncwarp();
#endif
// amax is floored at 1e-4 so the scale is never zero.
float scale = fmaxf(amax, 1e-4) / 448.0f;
if (use_ue8m0) {
// Round the scale up to the next power of two.
scale = exp2f(ceilf(log2f(scale)));
}
const int64_t dst_offset = block_idx * cache_block_size * cache_stride +
block_offset * head_dim + head_dim_idx;
for (int i = 0; i < VEC_SIZE; i++) {
kv_cache[dst_offset + i] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(k_val_ptr[i], scale);
}
// One lane per warp stores the scale; scales live after the
// cache_block_size * head_dim quantized values of the block.
if (threadIdx.x == 0) {
const int64_t dst_scale_idx =
block_idx * cache_block_size * cache_stride +
cache_block_size * head_dim +
(block_offset * head_dim + head_dim_idx) * 4 / quant_block_size;
// dst_scale_idx is a byte-style offset; dividing by 4 turns it into a
// float index.
reinterpret_cast<float*>(kv_cache)[dst_scale_idx / 4] = scale;
}
}
// Gathers quantized K rows and their scales from the paged kv_cache into the
// dense dst_k / dst_scale buffers.
//
// threadIdx.y (with blockIdx.x) selects the token; threadIdx.x (with
// blockIdx.y) selects a 16-byte chunk of that token's row. The threads of a
// row first search cu_seq_lens for the sequence containing the token and
// publish the result through shared memory.
//
// NOTE(review): only __syncwarp() separates the shared-memory write from the
// reads, so all threads sharing a threadIdx.y must sit in one warp — confirm
// the launch keeps blockDim.x a divisor of the warp size.
template <int BLOCK_Y_SIZE>
__global__ void cp_gather_indexer_k_quant_cache_kernel(
const char* __restrict__ kv_cache, // [num_blocks, block_size,
// cache_stride]
char* __restrict__ dst_k, // [num_tokens, head_dim]
char* __restrict__ dst_scale, // [num_tokens, head_dim / quant_block_size *
// 4]
const int* __restrict__ block_table, // [batch_size, num_blocks]
const int* __restrict__ cu_seq_lens, // [batch_size + 1]
const int batch_size, // batch size
const int64_t token_stride, // stride for each token in dst_k
const int64_t head_dim, // dimension of each head
const int64_t block_stride, // stride for each block in kv_cache
const int64_t cache_token_stride, // stride for each token in kv_cache
const int64_t cache_block_size, // num_tokens for each block in kv_cache
const int num_blocks, // number of blocks
const int num_tokens, // number of tokens
const int quant_block_size // quantization block size
) {
// Bytes moved per thread (one float4).
constexpr int VEC_SIZE = sizeof(float4) / sizeof(char);
const int token_idx = blockIdx.x * blockDim.y + threadIdx.y;
const int head_idx = (blockIdx.y * blockDim.x + threadIdx.x) * VEC_SIZE;
// Find batch index within a block
__shared__ int batch_idx[BLOCK_Y_SIZE];
// Threads of a row scan cu_seq_lens in strides of blockDim.x; the thread
// whose [seq_start, seq_end) range contains token_idx records the match.
for (int iter = 0; iter < cuda_utils::ceil_div(batch_size, int(blockDim.x));
iter++) {
int tid = iter * blockDim.x + threadIdx.x;
if (tid < batch_size) {
const int seq_start = cu_seq_lens[tid];
const int seq_end = cu_seq_lens[tid + 1];
if (token_idx >= seq_start && token_idx < seq_end) {
batch_idx[threadIdx.y] = tid;
}
}
}
#ifndef USE_ROCM
__syncwarp();
#endif
// Out-of-range threads leave only after taking part in the search above.
if (head_idx >= head_dim || token_idx >= num_tokens) {
return;
}
// Position of the token inside its own sequence.
const int inbatch_seq_idx = token_idx - cu_seq_lens[batch_idx[threadIdx.y]];
const int block_idx = block_table[batch_idx[threadIdx.y] * num_blocks +
inbatch_seq_idx / cache_block_size];
const int64_t src_block_offset = block_idx * block_stride;
const int64_t cache_inblock_offset =
(inbatch_seq_idx % cache_block_size) * head_dim + head_idx;
const int64_t src_inblock_offset = src_block_offset + cache_inblock_offset;
const int64_t dst_inblock_offset = token_idx * token_stride + head_idx;
// 16-byte vector copy of the quantized values.
reinterpret_cast<float4*>(dst_k)[dst_inblock_offset / VEC_SIZE] =
reinterpret_cast<const float4*>(kv_cache)[src_inblock_offset / VEC_SIZE];
;
// The first thread of each row copies one float scale; scales are read from
// after the cache_block_size * head_dim quantized bytes of the block.
if (threadIdx.x == 0) {
const int64_t src_scale_offset =
src_block_offset + cache_block_size * head_dim +
cache_inblock_offset * 4 / quant_block_size;
reinterpret_cast<float*>(dst_scale)[dst_inblock_offset / quant_block_size] =
reinterpret_cast<const float*>(kv_cache)[src_scale_offset / 4];
}
}
} // namespace vllm } // namespace vllm
// KV_T is the data type of key and value tensors. // KV_T is the data type of key and value tensors.
@ -671,15 +431,14 @@ void reshape_and_cache(
int key_stride = key.stride(0); int key_stride = key.stride(0);
int value_stride = value.stride(0); int value_stride = value.stride(0);
int head_div_x = head_size / x;
dim3 grid(num_tokens); dim3 grid(num_tokens);
dim3 block(std::min(num_heads * head_div_x, 512)); dim3 block(std::min(num_heads * head_size, 512));
const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype, DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype,
CALL_RESHAPE_AND_CACHE); CALL_RESHAPE_AND_CACHE)
} }
// KV_T is the data type of key and value tensors. // KV_T is the data type of key and value tensors.
@ -750,18 +509,6 @@ void reshape_and_cache_flash(
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \ kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
reinterpret_cast<const float*>(scale.data_ptr())); reinterpret_cast<const float*>(scale.data_ptr()));
// KV_T is the data type of key and value tensors.
// CACHE_T is the stored data type of kv-cache.
// Launches vllm::concat_and_cache_ds_mla_kernel. The expansion refers to the
// caller's locals by name (grid, block, stream, kv_c, k_pe, kv_cache,
// slot_mapping, scale and the stride/size integers), so it can only be used
// where all of those are in scope.
#define CALL_CONCAT_AND_CACHE_DS_MLA(KV_T, CACHE_T, KV_DTYPE) \
vllm::concat_and_cache_ds_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<KV_T*>(kv_c.data_ptr()), \
reinterpret_cast<KV_T*>(k_pe.data_ptr()), \
reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()), \
slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride, \
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
reinterpret_cast<const float*>(scale.data_ptr()));
void concat_and_cache_mla( void concat_and_cache_mla(
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank] torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
torch::Tensor& k_pe, // [num_tokens, pe_dim] torch::Tensor& k_pe, // [num_tokens, pe_dim]
@ -784,43 +531,20 @@ void concat_and_cache_mla(
int pe_dim = k_pe.size(1); int pe_dim = k_pe.size(1);
int block_size = kv_cache.size(1); int block_size = kv_cache.size(1);
if (kv_cache_dtype == "fp8_ds_mla") { TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
TORCH_CHECK(kv_lora_rank == 512, "kv_lora_rank must be 512 for fp8_ds_mla");
TORCH_CHECK(pe_dim == 64, "pe_dim must be 64 for fp8_ds_mla");
TORCH_CHECK(kv_cache.size(2) == 656 / kv_cache.itemsize(),
"kv_cache.size(2) must be 656 bytes for fp8_ds_mla");
TORCH_CHECK(kv_c.itemsize() == 2,
"kv_c.itemsize() must be 2 for fp8_ds_mla");
TORCH_CHECK(k_pe.itemsize() == 2,
"k_pe.itemsize() must be 2 for fp8_ds_mla");
} else {
TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
}
int kv_c_stride = kv_c.stride(0); int kv_c_stride = kv_c.stride(0);
int k_pe_stride = k_pe.stride(0); int k_pe_stride = k_pe.stride(0);
int block_stride = kv_cache.stride(0); int block_stride = kv_cache.stride(0);
int entry_stride = kv_cache.stride(1); int entry_stride = kv_cache.stride(1);
dim3 grid(num_tokens);
dim3 block(std::min(kv_lora_rank, 512));
const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c)); const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
if (kv_cache_dtype == "fp8_ds_mla") { DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
dim3 grid(num_tokens); CALL_CONCAT_AND_CACHE_MLA);
// For the NoPE part, each tile of 128 elements is handled by half of one
// warp (16 threads). There are 4 total tiles, so 2 warps (64 threads).
// Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
// The RoPE part (last 64 elements) is handled by another 1 warp (32
// threads). So in total, we use 3 warps (96 threads) per block.
dim3 block(96);
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
CALL_CONCAT_AND_CACHE_DS_MLA);
} else {
dim3 grid(num_tokens);
dim3 block(std::min(kv_lora_rank, 512));
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
CALL_CONCAT_AND_CACHE_MLA);
}
} }
namespace vllm { namespace vllm {
@ -1198,98 +922,3 @@ void cp_gather_cache(
TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits); TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
} }
} }
// Macro to dispatch the kernel based on the data type.
// Launches vllm::indexer_k_quant_and_cache_kernel. The expansion refers to
// the caller's locals by name (grid, block, stream, k, kv_cache,
// slot_mapping, head_dim, quant_block_size, cache_block_size, cache_stride,
// use_ue8m0), so it can only be used where all of those are in scope.
#define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \
vllm::indexer_k_quant_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<KV_T*>(k.data_ptr()), \
reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()), \
slot_mapping.data_ptr<int64_t>(), head_dim, quant_block_size, \
cache_block_size, cache_stride, use_ue8m0);
// Host entry point: validates devices and divisibility, then launches the
// indexer K quantize-and-cache kernel on the current CUDA stream.
//
// The local variable names below are consumed by
// CALL_INDEXER_K_QUANT_AND_CACHE and must not be renamed independently.
void indexer_k_quant_and_cache(
torch::Tensor& k, // [num_tokens, head_dim]
torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
torch::Tensor& slot_mapping, // [num_tokens]
int64_t quant_block_size, // quantization block size
const std::string& scale_fmt) {
int num_tokens = k.size(0);
int head_dim = k.size(1);
int cache_block_size = kv_cache.size(1);
int cache_stride = kv_cache.size(2);
// Any scale_fmt other than "ue8m0" leaves the scale unrounded.
bool use_ue8m0 = scale_fmt == "ue8m0";
TORCH_CHECK(k.device() == kv_cache.device(),
"k and kv_cache must be on the same device");
TORCH_CHECK(k.device() == slot_mapping.device(),
"k and slot_mapping must be on the same device");
TORCH_CHECK(head_dim % quant_block_size == 0,
"head_dim must be divisible by quant_block_size");
constexpr int vec_size = 4;
// grid.x: one block row per token; grid.y: ceil(head_dim / (quant_block_size
// * vec_size)). Each block is 32 x vec_size threads.
dim3 grid(num_tokens, (head_dim + quant_block_size * vec_size - 1) /
(quant_block_size * vec_size));
dim3 block(32, vec_size);
const at::cuda::OptionalCUDAGuard device_guard(device_of(k));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// The cache dtype is fixed to "fp8_e4m3" here rather than taken from a
// caller-supplied string.
DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
CALL_INDEXER_K_QUANT_AND_CACHE);
}
// Macro to dispatch the kernel based on the data amount.
// Launches vllm::cp_gather_indexer_k_quant_cache_kernel with a block of
// 8 x BLOCK_Y_SIZE threads and a grid of
// ceil(num_tokens / BLOCK_Y_SIZE) x ceil(head_dim / (8 * vec_size)).
// The expansion refers to the caller's locals by name (num_tokens, head_dim,
// vec_size, stream, batch_size, quant_block_size and the tensors), so it can
// only be used where all of those are in scope.
#define CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(BLOCK_Y_SIZE) \
vllm::cp_gather_indexer_k_quant_cache_kernel<BLOCK_Y_SIZE> \
<<<dim3((num_tokens + BLOCK_Y_SIZE - 1) / BLOCK_Y_SIZE, \
(head_dim + 8 * vec_size - 1) / (8 * vec_size)), \
dim3(8, BLOCK_Y_SIZE), 0, stream>>>( \
reinterpret_cast<char*>(kv_cache.data_ptr()), \
reinterpret_cast<char*>(dst_k.data_ptr()), \
reinterpret_cast<char*>(dst_scale.data_ptr()), \
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
batch_size, dst_k.stride(0), dst_k.size(1), kv_cache.stride(0), \
kv_cache.stride(1), kv_cache.size(1), block_table.size(1), \
num_tokens, quant_block_size);
// Host entry point: validates devices, derives the quantization block size
// from the output shapes, and launches the gather kernel on the current CUDA
// stream with a BLOCK_Y_SIZE picked from the token count.
//
// The local variable names below are consumed by
// CALL_CP_GATHER_INDEXER_K_QUANT_CACHE and must not be renamed independently.
void cp_gather_indexer_k_quant_cache(
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
torch::Tensor& dst_k, // [num_tokens, head_dim]
torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4]
const torch::Tensor& block_table, // [batch_size, num_blocks]
const torch::Tensor& cu_seq_lens // [batch_size + 1]
) {
int batch_size = block_table.size(0);
int num_tokens = dst_k.size(0);
int head_dim = dst_k.size(1);
// Inverse of the dst_scale shape annotation above.
// NOTE(review): this divides by dst_scale.size(1) before any check that it
// is non-zero — confirm callers never pass an empty scale dimension.
int quant_block_size = head_dim * 4 / dst_scale.size(1);
TORCH_CHECK(kv_cache.device() == dst_k.device(),
"kv_cache and dst_k must be on the same device");
TORCH_CHECK(kv_cache.device() == dst_scale.device(),
"kv_cache and dst_scale must be on the same device");
TORCH_CHECK(kv_cache.device() == block_table.device(),
"kv_cache and block_table must be on the same device");
TORCH_CHECK(kv_cache.device() == cu_seq_lens.device(),
"kv_cache and cu_seq_lens must be on the same device");
TORCH_CHECK(head_dim % quant_block_size == 0,
"head_dim must be divisible by quant_block_size");
// Bytes copied per thread; used by the launch macro to size grid.y.
constexpr int vec_size = 16;
const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_cache));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// More tokens -> more token rows per thread block (1, 2, 4, 8, 16 or 32).
if (num_tokens < 32) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(1);
} else if (num_tokens < 64) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(2);
} else if (num_tokens < 128) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(4);
} else if (num_tokens < 256) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(8);
} else if (num_tokens < 512) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(16);
} else {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
}
}

View File

@ -1,16 +0,0 @@
#pragma once
#include <cstdlib>
#include <string>
#include <cctype>
namespace vllm {

/// @brief Reports whether the batch-invariant kernel override is enabled.
///
/// Reads the environment variable VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT on
/// every call (the value is not cached).
///
/// @return true when the variable is set and std::atoi parses it as a
///         non-zero integer (e.g. "1"); false when it is unset, empty,
///         "0", or does not start with a number.
inline bool vllm_kernel_override_batch_invariant() {
  // Pass the literal straight to getenv: building a std::string just to call
  // c_str() would allocate on every query.
  const char* val = std::getenv("VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT");
  return val != nullptr && std::atoi(val) != 0;
}

}  // namespace vllm

View File

@ -14,12 +14,7 @@
// arm implementation // arm implementation
#include "cpu_types_arm.hpp" #include "cpu_types_arm.hpp"
#else #else
#warning "unsupported vLLM cpu implementation, vLLM will compile with scalar" #warning "unsupported vLLM cpu implementation"
#include "cpu_types_scalar.hpp"
#endif
#ifdef _OPENMP
#include <omp.h>
#endif #endif
#endif #endif

View File

@ -1,513 +0,0 @@
#include <cmath>
#include <cstdint>
#include <cstring>
#include <torch/all.h>
#include "float_convert.hpp"
namespace vec_op {
// Dispatch cases for the three floating-point scalar types handled here:
// Float, BFloat16 and Half. Each case forwards __VA_ARGS__ unchanged.
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
// Switches on TYPE (reported under NAME) over the floating-point cases above.
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
// Kernel entry/exit tracing hooks. Without CPU_OP_GUARD both macros expand to
// nothing; with it defined they print "<NAME> invoked." / "<NAME> exit." to
// std::cout.
#ifndef CPU_OP_GUARD
#define CPU_KERNEL_GUARD_IN(NAME)
#define CPU_KERNEL_GUARD_OUT(NAME)
#else
#define CPU_KERNEL_GUARD_IN(NAME) \
std::cout << #NAME << " invoked." << std::endl;
#define CPU_KERNEL_GUARD_OUT(NAME) \
std::cout << #NAME << " exit." << std::endl;
#endif
// Requests inlining via the GCC/Clang always_inline attribute.
#define FORCE_INLINE __attribute__((always_inline)) inline

// Scalar helper macros. Every argument is fully parenthesized so compound
// expressions such as `__abs(x - y)` expand correctly. Arguments may be
// evaluated more than once, so do not pass expressions with side effects.
#define __max(a, b) ((a) > (b) ? (a) : (b))
#define __min(a, b) ((a) < (b) ? (a) : (b))
#define __abs(a) ((a) < (0) ? (0 - (a)) : (a))
// Plain-array stand-ins for SIMD registers.
// 16-bit lanes are stored as raw uint16_t words.
struct f16x8_t {
  uint16_t val[8];
};

struct f16x16_t {
  uint16_t val[16];
};

struct f16x32_t {
  uint16_t val[32];
};

// 32-bit float lanes.
struct f32x4_t {
  float val[4];
};

struct f32x8_t {
  float val[8];
};

struct f32x16_t {
  float val[16];
};
namespace {
// Calls `body` once for every index in the pack, in ascending order. Each
// index is passed as a std::integral_constant so the callee can use it as a
// compile-time value.
template <typename IndexT, IndexT... idx, typename Fn>
constexpr void unroll_loop_item(std::integer_sequence<IndexT, idx...>,
                                Fn&& body) {
  (body(std::integral_constant<IndexT, idx>{}), ...);
}
}  // namespace

// Compile-time unrolled loop: invokes `f` with 0, 1, ..., count - 1.
// Participates in overload resolution only when F is callable with a T.
template <typename T, T count, typename F,
          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
constexpr void unroll_loop(F&& f) {
  using Indices = std::make_integer_sequence<T, count>;
  unroll_loop_item(Indices{}, std::forward<F>(f));
}
// CRTP base: exposes the derived type's lane count (its VEC_ELEM_NUM member)
// through a uniform static accessor.
template <typename T>
struct Vec {
  static constexpr int get_elem_num() {
    return T::VEC_ELEM_NUM;
  }
};
struct FP32Vec8;
struct FP32Vec16;
// Eight 16-bit lanes held as raw words in an f16x8_t.
struct FP16Vec8 : public Vec<FP16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;

  f16x8_t reg;

  // Loads all eight lanes from the memory at `ptr`.
  explicit FP16Vec8(const void* ptr) {
    reg = *reinterpret_cast<const f16x8_t*>(ptr);
  }

  // Conversion from fp32 lanes; declared here, defined elsewhere.
  explicit FP16Vec8(const FP32Vec8&);

  // Stores all eight lanes to the memory at `ptr`.
  void save(void* ptr) const {
    auto* out = reinterpret_cast<f16x8_t*>(ptr);
    *out = reg;
  }
};
// Sixteen 16-bit lanes held as raw words in an f16x16_t.
struct FP16Vec16 : public Vec<FP16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  f16x16_t reg;

  // Loads all sixteen lanes from the memory at `ptr`.
  explicit FP16Vec16(const void* ptr) {
    reg = *reinterpret_cast<const f16x16_t*>(ptr);
  }

  // Conversion from fp32 lanes; declared here, defined elsewhere.
  explicit FP16Vec16(const FP32Vec16&);

  // Stores all sixteen lanes to the memory at `ptr`.
  void save(void* ptr) const {
    auto* out = reinterpret_cast<f16x16_t*>(ptr);
    *out = reg;
  }

  // Stores only the first min(elem_num, VEC_ELEM_NUM) lanes.
  void save(void* ptr, const int elem_num) const {
    const int lanes = __min(elem_num, VEC_ELEM_NUM);
    std::memcpy(ptr, reg.val, lanes * sizeof(uint16_t));
  }
};
// Eight 16-bit lanes held as raw words in an f16x8_t.
struct BF16Vec8 : public Vec<BF16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;

  f16x8_t reg;

  // Loads all eight lanes from the memory at `ptr`.
  explicit BF16Vec8(const void* ptr) {
    reg = *reinterpret_cast<const f16x8_t*>(ptr);
  }

  // Conversion from fp32 lanes; declared here, defined elsewhere.
  explicit BF16Vec8(const FP32Vec8&);

  // Stores all eight lanes to the memory at `ptr`.
  void save(void* ptr) const {
    auto* out = reinterpret_cast<f16x8_t*>(ptr);
    *out = reg;
  }
};
// Sixteen 16-bit lanes held as raw words in an f16x16_t.
struct BF16Vec16 : public Vec<BF16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  f16x16_t reg;

  // Loads all sixteen lanes from the memory at `ptr`.
  explicit BF16Vec16(const void* ptr) {
    reg = *reinterpret_cast<const f16x16_t*>(ptr);
  }

  // Conversion from fp32 lanes; declared here, defined elsewhere.
  explicit BF16Vec16(const FP32Vec16&);

  // Stores all sixteen lanes to the memory at `ptr`.
  void save(void* ptr) const {
    auto* out = reinterpret_cast<f16x16_t*>(ptr);
    *out = reg;
  }

  // Stores only the first min(elem_num, VEC_ELEM_NUM) lanes.
  void save(void* ptr, const int elem_num) const {
    const int lanes = __min(elem_num, VEC_ELEM_NUM);
    std::memcpy(ptr, reg.val, lanes * sizeof(uint16_t));
  }
};
// Thirty-two 16-bit lanes held as raw words in an f16x32_t.
struct BF16Vec32 : public Vec<BF16Vec32> {
  constexpr static int VEC_ELEM_NUM = 32;

  f16x32_t reg;

  // Loads all thirty-two lanes from the memory at `ptr`.
  explicit BF16Vec32(const void* ptr) {
    reg = *reinterpret_cast<const f16x32_t*>(ptr);
  }

  // Wraps an already-populated register value.
  explicit BF16Vec32(f16x32_t data) : reg(data) {}

  // Tiles the eight source lanes across the register: lane i receives source
  // lane i % 8.
  explicit BF16Vec32(BF16Vec8& vec8_data) {
    constexpr int kSrcLanes = BF16Vec8::VEC_ELEM_NUM;
    for (int base = 0; base < VEC_ELEM_NUM; base += kSrcLanes) {
      for (int lane = 0; lane < kSrcLanes; ++lane) {
        reg.val[base + lane] = vec8_data.reg.val[lane];
      }
    }
  }

  // Stores all thirty-two lanes to the memory at `ptr`.
  void save(void* ptr) const {
    auto* out = reinterpret_cast<f16x32_t*>(ptr);
    *out = reg;
  }
};
// Four fp32 lanes held in a plain array.
struct FP32Vec4 : public Vec<FP32Vec4> {
  constexpr static int VEC_ELEM_NUM = 4;

  f32x4_t reg;

  // Broadcasts `v` to every lane.
  explicit FP32Vec4(float v) {
    for (float& lane : reg.val) {
      lane = v;
    }
  }

  // Zero-fills every lane.
  explicit FP32Vec4() {
    for (float& lane : reg.val) {
      lane = 0.0f;
    }
  }

  // Loads four floats from `ptr`.
  explicit FP32Vec4(const float* ptr) {
    reg = *reinterpret_cast<const f32x4_t*>(ptr);
  }

  // Wraps an already-populated register value.
  explicit FP32Vec4(f32x4_t data) : reg(data) {}

  // Copy construction is explicit, so it must be spelled out at call sites.
  explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {}
};
// Eight fp32 lanes held in a plain array; every operation is a scalar loop
// over the lanes.
struct FP32Vec8 : public Vec<FP32Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;

  f32x8_t reg;

  // Broadcasts `v` to every lane.
  explicit FP32Vec8(float v) {
    for (float& lane : reg.val) {
      lane = v;
    }
  }

  // Zero-fills every lane.
  explicit FP32Vec8() {
    for (float& lane : reg.val) {
      lane = 0.0f;
    }
  }

  // Loads eight floats from `ptr`.
  explicit FP32Vec8(const float* ptr) {
    reg = *reinterpret_cast<const f32x8_t*>(ptr);
  }

  // Wraps an already-populated register value.
  explicit FP32Vec8(f32x8_t data) : reg(data) {}

  // Copy construction is explicit; this is why the members below return
  // `FP32Vec8(...)` temporaries instead of named locals.
  explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {}

  // Widens each 16-bit lane with fp16_to_float.
  explicit FP32Vec8(const FP16Vec8& v) {
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      reg.val[lane] = fp16_to_float(v.reg.val[lane]);
    }
  }

  // Widens each 16-bit lane with bf16_to_float (implicit conversion allowed).
  FP32Vec8(const BF16Vec8& v) {
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      reg.val[lane] = bf16_to_float(v.reg.val[lane]);
    }
  }

  // Sum of all lanes, accumulated from lane 0 upwards.
  float reduce_sum() const {
    float total = 0;
    for (const float lane : reg.val) {
      total += lane;
    }
    return total;
  }

  // Lane-wise expf.
  FP32Vec8 exp() const {
    f32x8_t out;
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      out.val[lane] = expf(reg.val[lane]);
    }
    return FP32Vec8(out);
  }

  // Lane-wise tanhf.
  FP32Vec8 tanh() const {
    f32x8_t out;
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      out.val[lane] = tanhf(reg.val[lane]);
    }
    return FP32Vec8(out);
  }

  // Lane-wise erf.
  FP32Vec8 er() const {
    f32x8_t out;
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      out.val[lane] = erf(reg.val[lane]);
    }
    return FP32Vec8(out);
  }

  // Lane-wise product.
  FP32Vec8 operator*(const FP32Vec8& b) const {
    f32x8_t out;
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      out.val[lane] = reg.val[lane] * b.reg.val[lane];
    }
    return FP32Vec8(out);
  }

  // Lane-wise sum.
  FP32Vec8 operator+(const FP32Vec8& b) const {
    f32x8_t out;
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      out.val[lane] = reg.val[lane] + b.reg.val[lane];
    }
    return FP32Vec8(out);
  }

  // Lane-wise difference.
  FP32Vec8 operator-(const FP32Vec8& b) const {
    f32x8_t out;
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      out.val[lane] = reg.val[lane] - b.reg.val[lane];
    }
    return FP32Vec8(out);
  }

  // Lane-wise quotient.
  FP32Vec8 operator/(const FP32Vec8& b) const {
    f32x8_t out;
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      out.val[lane] = reg.val[lane] / b.reg.val[lane];
    }
    return FP32Vec8(out);
  }

  // Stores all eight lanes to the memory at `ptr`.
  void save(void* ptr) const {
    auto* out = reinterpret_cast<f32x8_t*>(ptr);
    *out = reg;
  }
};
// Sixteen-lane FP32 vector stored as a plain `f32x16_t`; every operation is a
// per-lane loop over `reg.val`, visited in index order.
struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  f32x16_t reg;

  // Broadcast: every lane receives `v`.
  explicit FP32Vec16(float v) {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = v;
    }
  }

  // All lanes start at 0.0f.
  explicit FP32Vec16() {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = 0.0f;
    }
  }

  // Reads VEC_ELEM_NUM consecutive floats starting at `ptr`.
  explicit FP32Vec16(const float* ptr)
      : reg(*reinterpret_cast<const f32x16_t*>(ptr)) {}

  explicit FP32Vec16(f32x16_t data) : reg(data) {}

  // Tiles the 4 source lanes four times (lane i takes source lane i % 4).
  FP32Vec16(const FP32Vec4& data) {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = data.reg.val[i % FP32Vec4::VEC_ELEM_NUM];
    }
  }

  // Tiles the 8 source lanes twice (lane i takes source lane i % 8).
  FP32Vec16(const FP32Vec8& data) {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = data.reg.val[i % FP32Vec8::VEC_ELEM_NUM];
    }
  }

  FP32Vec16(const FP32Vec16& data) : reg(data.reg) {}

  // Widen FP16 bit patterns lane by lane.
  explicit FP32Vec16(const FP16Vec16& v) {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = fp16_to_float(v.reg.val[i]);
    }
  }

  // Widen BF16 bit patterns lane by lane.
  explicit FP32Vec16(const BF16Vec16& v) {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = bf16_to_float(v.reg.val[i]);
    }
  }

  // 8-lane half/bfloat inputs are widened to FP32Vec8 first and then tiled
  // twice by the FP32Vec8 constructor above.
  explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
  FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}

  FP32Vec16 operator*(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = reg.val[i] * b.reg.val[i];
    }
    return result;
  }

  FP32Vec16 operator+(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = reg.val[i] + b.reg.val[i];
    }
    return result;
  }

  FP32Vec16 operator-(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = reg.val[i] - b.reg.val[i];
    }
    return result;
  }

  FP32Vec16 operator/(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = reg.val[i] / b.reg.val[i];
    }
    return result;
  }

  // Lane-wise maximum of *this and `b`.
  FP32Vec16 max(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = __max(reg.val[i], b.reg.val[i]);
    }
    return result;
  }

  // Lane-wise minimum of *this and `b`.
  FP32Vec16 min(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = __min(reg.val[i], b.reg.val[i]);
    }
    return result;
  }

  // Lane-wise absolute value.
  FP32Vec16 abs() const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = __abs(reg.val[i]);
    }
    return result;
  }

  // Sum of all lanes, accumulated from lane 0 upwards.
  float reduce_sum() const {
    float result = 0.0f;
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result += reg.val[i];
    }
    return result;
  }

  // Largest lane value. The scan starts at lane 1 because lane 0 already
  // seeds `result`.
  float reduce_max() const {
    float result = reg.val[0];
    for (int i = 1; i < VEC_ELEM_NUM; ++i) {
      result = __max(reg.val[i], result);
    }
    return result;
  }

  // Smallest lane value. The scan starts at lane 1 because lane 0 already
  // seeds `result`.
  float reduce_min() const {
    float result = reg.val[0];
    for (int i = 1; i < VEC_ELEM_NUM; ++i) {
      result = __min(reg.val[i], result);
    }
    return result;
  }

  // Sum of the `idx`-th group of `group_size` consecutive lanes, i.e. lanes
  // [idx * group_size, (idx + 1) * group_size), clipped at VEC_ELEM_NUM.
  // `idx` is expected to be non-negative; a negative value would index
  // before lane 0. Const-qualified so it can be used on const vectors like
  // the other reductions.
  template <int group_size>
  float reduce_sub_sum(int idx) const {
    static_assert(VEC_ELEM_NUM % group_size == 0);
    float sum = 0.0;
    int start = idx * group_size;
    int end = (idx + 1) * group_size;
    for (; (start < VEC_ELEM_NUM) && (start < end); ++start) {
      sum += reg.val[start];
    }
    return sum;
  }

  // Writes all VEC_ELEM_NUM lanes to the memory at `ptr`.
  void save(void* ptr) const { *reinterpret_cast<f32x16_t*>(ptr) = reg; }
};
// Maps a scalar element type to the 8-lane vector wrapper used for it.
// The primary template yields `void`, so an unmapped element type has no
// usable vector type.
template <typename T>
struct VecType {
  using vec_type = void;
};

// Shorthand for `VecType<T>::vec_type`.
template <typename T>
using vec_t = typename VecType<T>::vec_type;

// float -> FP32Vec8
template <>
struct VecType<float> {
  using vec_type = FP32Vec8;
};

// c10::Half -> FP16Vec8
template <>
struct VecType<c10::Half> {
  using vec_type = FP16Vec8;
};

// c10::BFloat16 -> BF16Vec8
template <>
struct VecType<c10::BFloat16> {
  using vec_type = BF16Vec8;
};
// Generic store: assigns the float `v` to the single element at `ptr`, using
// whatever float -> T conversion the assignment provides.
template <typename T>
void storeFP32(float v, T* ptr) {
  ptr[0] = v;
}
/*
template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
c10::Half __attribute__((__may_alias__)) *v_ptr =
reinterpret_cast<c10::Half *>(&v);
*ptr = *(v_ptr + 1);
}
*/
// Half specialization: converts `v` with float_to_fp16 and writes the
// resulting 16-bit pattern directly over the storage of `*ptr`.
template <>
inline void storeFP32<c10::Half>(float v, c10::Half* ptr) {
  uint16_t* const raw_dst = reinterpret_cast<uint16_t*>(ptr);
  *raw_dst = float_to_fp16(v);
}
// BFloat16 specialization: keeps the upper 16 bits of the float (truncation)
// and writes that bit pattern over the storage of `*ptr`.
//
// The conversion goes through float_to_bf16, which shifts the 32-bit pattern,
// instead of reading the second 16-bit word of `v` in memory. The in-memory
// read only yields the upper half on little-endian hosts; the shift yields it
// regardless of byte order. This also mirrors the c10::Half specialization.
template <>
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
  uint16_t bf16 = float_to_bf16(v);
  *reinterpret_cast<uint16_t*>(ptr) = bf16;
}
// Narrows each FP32 lane of `v` to an FP16 bit pattern via float_to_fp16.
inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
  for (int lane = 0; lane < FP16Vec16::VEC_ELEM_NUM; ++lane) {
    reg.val[lane] = float_to_fp16(v.reg.val[lane]);
  }
}
// Narrows each FP32 lane of `v` to an FP16 bit pattern via float_to_fp16.
inline FP16Vec8::FP16Vec8(const FP32Vec8& v) {
  for (int lane = 0; lane < FP16Vec8::VEC_ELEM_NUM; ++lane) {
    reg.val[lane] = float_to_fp16(v.reg.val[lane]);
  }
}
// Multiply-accumulate: acc <- acc + a * b, lane by lane. The product and the
// sum are two separate vector operations (no fused rounding is requested
// here).
inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
  const FP32Vec16 product = a * b;
  acc = acc + product;
}
// Narrows each FP32 lane of `v` to a BF16 bit pattern via float_to_bf16.
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
  for (int lane = 0; lane < BF16Vec8::VEC_ELEM_NUM; ++lane) {
    reg.val[lane] = float_to_bf16(v.reg.val[lane]);
  }
}
// Narrows each FP32 lane of `v` to a BF16 bit pattern via float_to_bf16.
inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
  for (int lane = 0; lane < BF16Vec16::VEC_ELEM_NUM; ++lane) {
    reg.val[lane] = float_to_bf16(v.reg.val[lane]);
  }
}
// Issues a prefetch hint for `addr`. Per the compiler builtin's documented
// arguments, rw = 0 requests a prefetch for reading and locality = 3 asks for
// the highest degree of temporal locality.
inline void prefetch(const void* addr) {
  __builtin_prefetch(addr, /*rw=*/0, /*locality=*/3);
}
}; // namespace vec_op

View File

@ -137,8 +137,9 @@ DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler(
} }
void DNNLMatMulPrimitiveHandler::prepack_weight( void DNNLMatMulPrimitiveHandler::prepack_weight(
void* original_b_ptr, dnnl::memory::desc original_b_md, void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) {
dnnl::memory::desc b_target_mem_desc) { dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
{b_k_stride_, b_n_stride_});
dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr); dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr);
dnnl::memory packed_weight(b_target_mem_desc, default_engine()); dnnl::memory packed_weight(b_target_mem_desc, default_engine());
{ {
@ -249,9 +250,7 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
if (a_qs_ == QuantizationStrategy::PER_TOKEN) { if (a_qs_ == QuantizationStrategy::PER_TOKEN) {
assert(!use_azp_); assert(!use_azp_);
}; };
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_, prepack_weight(args.b_ptr,
{b_k_stride_, b_n_stride_});
prepack_weight(args.b_ptr, original_b_md,
create_primitive_desc( create_primitive_desc(
MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL, MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
.use_bias = false, .use_bias = false,
@ -413,25 +412,12 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
assert(ab_type_ == dnnl::memory::data_type::f32 || assert(ab_type_ == dnnl::memory::data_type::f32 ||
ab_type_ == dnnl::memory::data_type::bf16 || ab_type_ == dnnl::memory::data_type::bf16 ||
ab_type_ == dnnl::memory::data_type::f16); ab_type_ == dnnl::memory::data_type::f16);
prepack_weight(args.b_ptr,
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
{b_k_stride_, b_n_stride_});
prepack_weight(args.b_ptr, original_b_md,
create_primitive_desc( create_primitive_desc(
MSizeCacheKey{ MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
#ifdef VLLM_USE_ACL .a_m_stride = DNNL_RUNTIME_DIM_VAL,
// Arm Compute Library (ACL) backend for oneDNN does .use_bias = false,
// not support runtime .bias_type = dnnl::memory::data_type::undef},
// dimensions, so we set M to a default value
.a_m_size = 128,
.a_m_stride = b_k_size_,
#else
.a_m_size = DNNL_RUNTIME_DIM_VAL,
.a_m_stride = DNNL_RUNTIME_DIM_VAL,
#endif
.use_bias = false,
.bias_type = dnnl::memory::data_type::undef},
true) true)
.weights_desc()); .weights_desc());
init_runtime_memory_cache(args); init_runtime_memory_cache(args);
@ -457,30 +443,12 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
c_storage->set_data_handle((void*)args.c_ptr); c_storage->set_data_handle((void*)args.c_ptr);
c_mem_desc->dims[0] = args.a_m_size; c_mem_desc->dims[0] = args.a_m_size;
#ifndef VLLM_USE_ACL
// We do not support in ACL backend of oneDNN, we handle bias by:
// 1. copying it into the result tensor
// 2. attaching a fused-sum post-op to the matmul primitive
if (args.use_bias) { if (args.use_bias) {
auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2); auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
bias_storage->set_data_handle((void*)args.bias_ptr); bias_storage->set_data_handle((void*)args.bias_ptr);
} }
#endif
dnnl::matmul matmul = get_matmul_cache(args);
// With ACL backend of oneDNN, the required memory format might change when the dnnl::matmul matmul = get_matmul_cache(args);
// source tensor dims change. This does not really happen in practice, so isn't
// a performance hit, but we need to support it because the API allows for it.
#ifdef VLLM_USE_ACL
auto new_expected_wei_desc =
dnnl::matmul::primitive_desc(
const_cast<dnnl_primitive_desc_t>(matmul.get_primitive_desc()))
.weights_desc();
if (new_expected_wei_desc != b_target_mem_desc_) {
prepack_weight(memory_cache_[DNNL_ARG_WEIGHTS].get_data_handle(),
b_target_mem_desc_, new_expected_wei_desc);
}
#endif
auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3); auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
scratchpad_storage->set_data_handle( scratchpad_storage->set_data_handle(
@ -516,13 +484,7 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
} else { } else {
a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_, a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
{key.a_m_stride, 1}); {key.a_m_stride, 1});
#ifdef VLLM_USE_ACL
// ACL's backend of oneDNN always expects the weight format to be "any"
b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
dnnl::memory::format_tag::any);
#else
b_md = b_target_mem_desc_; b_md = b_target_mem_desc_;
#endif
} }
dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_, dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
dnnl::memory::format_tag::ab); dnnl::memory::format_tag::ab);
@ -532,18 +494,8 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
if (key.use_bias) { if (key.use_bias) {
dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1}); dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
// Since ACL's matmuls don't support passing a bias_md, we apply the bias
// through a fused-sum post-op
#ifdef VLLM_USE_ACL
dnnl::post_ops post_ops;
post_ops.append_sum();
attr.set_post_ops(post_ops);
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
attr);
#else
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md, return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
c_md, attr); c_md, attr);
#endif
} else { } else {
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md, return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
attr); attr);
@ -559,23 +511,13 @@ void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
default_engine(), nullptr); default_engine(), nullptr);
set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get()); set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
// ACL matmuls don't support bias_md, so we don't need these
#ifndef VLLM_USE_ACL
memory_cache_[DNNL_ARG_BIAS] = memory_cache_[DNNL_ARG_BIAS] =
dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
default_engine(), nullptr); default_engine(), nullptr);
set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get()); set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
#endif
memory_cache_[DNNL_ARG_SCRATCHPAD] = memory_cache_[DNNL_ARG_SCRATCHPAD] =
dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
default_engine(), nullptr); default_engine(), nullptr);
set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get()); set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
} }
bool is_onednn_acl_supported() {
#ifdef VLLM_USE_ACL
return true;
#else
return false;
#endif
}

View File

@ -101,7 +101,7 @@ class DNNLMatMulPrimitiveHandler {
protected: protected:
DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type); DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type);
void prepack_weight(void* original_b_ptr, dnnl::memory::desc original_b_md, void prepack_weight(void* original_b_ptr,
dnnl::memory::desc b_target_mem_desc); dnnl::memory::desc b_target_mem_desc);
void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr); void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr);

View File

@ -527,42 +527,21 @@ void onednn_mm(torch::Tensor& c, // [M, OC], row-major
MatMulPrimitiveHandler* ptr = MatMulPrimitiveHandler* ptr =
reinterpret_cast<MatMulPrimitiveHandler*>(handler); reinterpret_cast<MatMulPrimitiveHandler*>(handler);
// ACL matmuls expect contiguous source tensors
#ifdef VLLM_USE_ACL
torch::Tensor a_contig = a.contiguous();
#endif
MatMulPrimitiveHandler::ExecArgs exec_args; MatMulPrimitiveHandler::ExecArgs exec_args;
#ifdef VLLM_USE_ACL
exec_args.a_m_size = a_contig.size(0);
exec_args.a_m_stride = a_contig.stride(0);
#else
exec_args.a_m_size = a.size(0); exec_args.a_m_size = a.size(0);
exec_args.a_m_stride = a.stride(0); exec_args.a_m_stride = a.stride(0);
#endif
VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] { VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] {
if (bias.has_value()) { if (bias.has_value()) {
exec_args.use_bias = true; exec_args.use_bias = true;
exec_args.bias_type = get_dnnl_type<scalar_t>(); exec_args.bias_type = get_dnnl_type<scalar_t>();
#ifdef VLLM_USE_ACL
// ACL matmuls in oneDNN do not support a bias.
// We handle a matmul with bias by doing: c = bias; c += matmul(a, b)
c.copy_(bias.value());
#else
exec_args.bias_ptr = bias->data_ptr<scalar_t>(); exec_args.bias_ptr = bias->data_ptr<scalar_t>();
#endif
} else { } else {
exec_args.use_bias = false; exec_args.use_bias = false;
exec_args.bias_type = get_dnnl_type<void>(); exec_args.bias_type = get_dnnl_type<void>();
exec_args.bias_ptr = nullptr; exec_args.bias_ptr = nullptr;
} }
#ifdef VLLM_USE_ACL
exec_args.a_ptr = a_contig.data_ptr<scalar_t>();
#else
exec_args.a_ptr = a.data_ptr<scalar_t>(); exec_args.a_ptr = a.data_ptr<scalar_t>();
#endif
exec_args.c_ptr = c.data_ptr<scalar_t>(); exec_args.c_ptr = c.data_ptr<scalar_t>();
ptr->execute(exec_args); ptr->execute(exec_args);

View File

@ -1,106 +0,0 @@
// Widens a bfloat16 bit pattern to float by placing it in the upper 16 bits
// of a 32-bit word (lower 16 bits zero) and reinterpreting that word.
static float bf16_to_float(uint16_t bf16) {
  uint32_t widened = bf16;
  widened <<= 16;
  float result;
  std::memcpy(&result, &widened, sizeof(result));
  return result;
}
// Narrows a float to bfloat16 by keeping only the upper 16 bits of its bit
// pattern; the discarded low bits are truncated, not rounded.
static uint16_t float_to_bf16(float fp32) {
  uint32_t pattern;
  std::memcpy(&pattern, &fp32, sizeof(fp32));
  const uint32_t upper_half = pattern >> 16;
  return static_cast<uint16_t>(upper_half);
}
/************************************************
* Copyright (c) 2015 Princeton Vision Group
* Licensed under the MIT license.
* Codes below copied from
* https://github.com/PrincetonVision/marvin/tree/master/tools/tensorIO_matlab
*************************************************/
// Converts a float to an IEEE-754 half-precision bit pattern, rounding to
// nearest-even.
//   - NaN of either sign        -> 0x7fff
//   - magnitude >= 65520 or Inf -> signed infinity (sign | 0x7c00)
//   - magnitude <= 2^-25        -> signed zero
//   - everything else           -> normal or subnormal half, rounded
static uint16_t float_to_fp16(float fp32) {
  uint16_t fp16;
  unsigned x;
  unsigned u, remainder, shift, lsb, lsb_s1, lsb_m1;
  unsigned sign, exponent, mantissa;
  std::memcpy(&x, &fp32, sizeof(fp32));
  // u is the magnitude: the float's bit pattern with the sign bit cleared.
  u = (x & 0x7fffffff);
  // Get rid of +NaN/-NaN case first.
  if (u > 0x7f800000) {
    fp16 = 0x7fffU;
    return fp16;
  }
  // Move the float sign bit (bit 31) to the half sign position (bit 15).
  sign = ((x >> 16) & 0x8000);
  // Get rid of +Inf/-Inf and of magnitudes too large for a finite half
  // (0x477ff000 is 65520.0f, which already rounds up to infinity).
  if (u > 0x477fefff) {
    fp16 = sign | 0x7c00U;
    return fp16;
  }
  // Get rid of +0/-0 and of magnitudes that round to zero
  // (0x33000000 is 2^-25, half of the smallest half subnormal).
  if (u < 0x33000001) {
    fp16 = (sign | 0x0000);
    return fp16;
  }
  exponent = ((u >> 23) & 0xff);
  mantissa = (u & 0x7fffff);
  if (exponent > 0x70) {
    // Normal half: drop 13 mantissa bits and rebias the exponent
    // (127 - 15 = 0x70).
    shift = 13;
    exponent -= 0x70;
  } else {
    // Subnormal half: shift further right and make the implicit leading 1
    // explicit; the half exponent field is 0.
    shift = 0x7e - exponent;
    exponent = 0;
    mantissa |= 0x800000;
  }
  // lsb is the weight of the last kept bit, lsb_s1 is half of it (the tie
  // point) and lsb_m1 masks the bits that will be shifted out.
  lsb = (1 << shift);
  lsb_s1 = (lsb >> 1);
  lsb_m1 = (lsb - 1);
  // Round to nearest even.
  remainder = (mantissa & lsb_m1);
  mantissa >>= shift;
  if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
    ++mantissa;
    // A zero low-10-bit field after the increment means the mantissa carried
    // out; bump the exponent and clear the mantissa.
    if (!(mantissa & 0x3ff)) {
      ++exponent;
      mantissa = 0;
    }
  }
  fp16 = (sign | (exponent << 10) | mantissa);
  return fp16;
}
// Widens an IEEE-754 half-precision bit pattern to float.
//   - Inf keeps its sign.
//   - Any NaN becomes a positive NaN with all 23 fraction bits set.
//   - Subnormal halves are renormalised into normal floats.
static float fp16_to_float(uint16_t fp16) {
  unsigned sign_bit = (fp16 >> 15) & 1;
  unsigned exp_field = (fp16 >> 10) & 0x1f;
  // Half fraction (10 bits) aligned to the top of the float fraction field.
  unsigned fraction = (fp16 & 0x3ff) << 13;

  if (exp_field == 0x1f) {
    // NaN or Inf
    if (fraction) {
      sign_bit = 0;
      fraction = 0x7fffff;
    }
    exp_field = 0xff;
  } else if (exp_field != 0) {
    // Normal number: rebias the exponent (127 - 15 = 0x70).
    exp_field += 0x70;
  } else if (fraction) {
    // Subnormal: shift left until the leading 1 leaves the fraction field,
    // lowering the exponent once per shift.
    unsigned leading;
    exp_field = 0x71;
    do {
      leading = fraction & 0x400000;
      fraction <<= 1;
      --exp_field;
    } while (!leading);
    // The leading 1 is implicit in a normal float.
    fraction &= 0x7fffff;
  }

  int packed = ((sign_bit << 31) | (exp_field << 23) | fraction);
  float result;
  std::memcpy(&result, &packed, sizeof(packed));
  return result;
}

View File

@ -27,8 +27,6 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
void onednn_mm(torch::Tensor& c, const torch::Tensor& a, void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
const std::optional<torch::Tensor>& bias, int64_t handler); const std::optional<torch::Tensor>& bias, int64_t handler);
bool is_onednn_acl_supported();
void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query, void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
torch::Tensor& kv_cache, double scale, torch::Tensor& kv_cache, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens); torch::Tensor& block_tables, torch::Tensor& seq_lens);
@ -90,18 +88,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" int tp_rank, int blocksparse_local_blocks," " int tp_rank, int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step) -> ()"); " int blocksparse_head_sliding_step) -> ()");
ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1); ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);
ops.def(
"dynamic_4bit_int_moe("
"Tensor x, Tensor topk_ids, Tensor topk_weights,"
"Tensor w13_packed, Tensor w2_packed, int H, int I, int I2,"
"int group_size, bool apply_router_weight_on_input, int activation_kind"
") -> Tensor");
ops.impl("dynamic_4bit_int_moe", torch::kCPU, &dynamic_4bit_int_moe_cpu);
// PagedAttention V2. // PagedAttention V2.
ops.def( ops.def(
"paged_attention_v2(" "paged_attention_v2("
@ -183,9 +171,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"int handler) -> ()"); "int handler) -> ()");
ops.impl("onednn_mm", torch::kCPU, &onednn_mm); ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
// Check if oneDNN was built with ACL backend
ops.def("is_onednn_acl_supported() -> bool", &is_onednn_acl_supported);
// Create oneDNN W8A8 handler // Create oneDNN W8A8 handler
ops.def( ops.def(
"create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType " "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "

View File

@ -12,7 +12,6 @@ using CubMaxOp = cub::Max;
#endif // CUB_VERSION #endif // CUB_VERSION
#else #else
#include <hipcub/hipcub.hpp> #include <hipcub/hipcub.hpp>
namespace cub = hipcub; using CubAddOp = cub::Sum;
using CubAddOp = hipcub::Sum; using CubMaxOp = cub::Max;
using CubMaxOp = hipcub::Max;
#endif // USE_ROCM #endif // USE_ROCM

View File

@ -27,7 +27,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
**{ **{
VLLMDataType.u4b8: "u4b8", VLLMDataType.u4b8: "u4b8",
VLLMDataType.u8b128: "u8b128", VLLMDataType.u8b128: "u8b128",
}, }
} }
VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
@ -35,7 +35,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
**{ **{
VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t", VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t", VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t",
}, }
} }
VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = { VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
@ -43,7 +43,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
**{ **{
VLLMDataType.u4b8: 4, VLLMDataType.u4b8: 4,
VLLMDataType.u8b128: 8, VLLMDataType.u8b128: 8,
}, }
} }
VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = { VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
@ -67,13 +67,15 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
DataType.f32: "at::ScalarType::Float", DataType.f32: "at::ScalarType::Float",
} }
VLLMKernelScheduleTag: dict[ VLLMKernelScheduleTag: dict[Union[
Union[MixedInputKernelScheduleType, KernelScheduleType], str MixedInputKernelScheduleType, KernelScheduleType], str] = {
] = { **KernelScheduleTag, # type: ignore
**KernelScheduleTag, # type: ignore **{
**{ MixedInputKernelScheduleType.TmaWarpSpecialized:
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501 "cutlass::gemm::KernelTmaWarpSpecialized",
MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong", # noqa: E501 MixedInputKernelScheduleType.TmaWarpSpecializedPingpong:
MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative", # noqa: E501 "cutlass::gemm::KernelTmaWarpSpecializedPingpong",
}, MixedInputKernelScheduleType.TmaWarpSpecializedCooperative:
} "cutlass::gemm::KernelTmaWarpSpecializedCooperative",
}
}

View File

@ -1,64 +0,0 @@
#pragma once
#include <cuda_runtime_api.h>
#include <algorithm>
// maximum blocks per SM cap
// (upper bound applied by VLLM_CLAMP_BLOCKS_PER_SM below; a definition made
// before this point takes precedence over the default of 4)
#ifndef VLLM_LAUNCH_BLOCKS_CAP
#define VLLM_LAUNCH_BLOCKS_CAP 4
#endif
// Compile-time estimate of max threads per SM for launch bounds.
// Families: 1024, 1536, 2048 threads/SM.
// A definition made before this point takes precedence over the table below.
#ifndef VLLM_MAX_THREADS_PER_SM
#ifdef __CUDA_ARCH__
/* 1024 thr/SM: Turing (sm_75) */
#if (__CUDA_ARCH__ == 750)
#define VLLM_MAX_THREADS_PER_SM 1024
/* 1536 thr/SM: Ampere GA10x (sm_86/87), Ada (sm_89),
GB20x consumer (sm_120/121), Thor (sm_101 or sm_110) */
#elif (__CUDA_ARCH__ == 860) || (__CUDA_ARCH__ == 870) || \
(__CUDA_ARCH__ == 890) || (__CUDA_ARCH__ == 1010) || \
(__CUDA_ARCH__ == 1100) || (__CUDA_ARCH__ == 1200) || \
(__CUDA_ARCH__ == 1210)
#define VLLM_MAX_THREADS_PER_SM 1536
/* 2048 thr/SM: Volta (sm_70/72), Ampere GA100 (sm_80),
Hopper (sm_90), Blackwell (sm_100/103) */
#elif (__CUDA_ARCH__ == 700) || (__CUDA_ARCH__ == 720) || \
(__CUDA_ARCH__ == 800) || (__CUDA_ARCH__ == 900) || \
(__CUDA_ARCH__ == 1000) || (__CUDA_ARCH__ == 1030)
#define VLLM_MAX_THREADS_PER_SM 2048
/* Fallback: use 2048 for unknown future CCs */
#else
#define VLLM_MAX_THREADS_PER_SM 2048
#endif
#else
/* Host pass (no __CUDA_ARCH__): neutral default */
#define VLLM_MAX_THREADS_PER_SM 2048
#endif
#endif
// compute the number of blocks per SM to request in __launch_bounds__
// VLLM_BLOCKS_DIV: threads-per-SM estimate divided by the block size
// (integer division; VAL must be non-zero).
#define VLLM_BLOCKS_DIV(VAL) (VLLM_MAX_THREADS_PER_SM / (VAL))
// VLLM_CLAMP_BLOCKS_PER_SM: clamps VAL into [1, VLLM_LAUNCH_BLOCKS_CAP].
// VAL is expanded more than once, so pass a side-effect-free expression.
#define VLLM_CLAMP_BLOCKS_PER_SM(VAL) \
(((VAL) <= 0) \
? 1 \
: (((VAL) < VLLM_LAUNCH_BLOCKS_CAP) ? (VAL) : VLLM_LAUNCH_BLOCKS_CAP))
// VLLM_BLOCKS_PER_SM: the clamped quotient for a given block size.
#define VLLM_BLOCKS_PER_SM(BLOCK_THREADS) \
VLLM_CLAMP_BLOCKS_PER_SM(VLLM_BLOCKS_DIV(BLOCK_THREADS))
// runtime helper to compute blocks/SM
// Queries the current device for its max resident threads per SM and returns
// how many blocks of `block_threads` threads fit, clamped to
// [1, VLLM_LAUNCH_BLOCKS_CAP]. A non-positive `block_threads` yields 1.
// The CUDA return codes are not inspected; the thread capacity is seeded with
// the compile-time estimate before the query.
// NOTE(review): whether the seed survives a failed query depends on the CUDA
// runtime leaving the output untouched on error - confirm.
static inline int vllm_runtime_blocks_per_sm(int block_threads) {
  int current_device = -1;
  cudaGetDevice(&current_device);

  int sm_thread_capacity = VLLM_MAX_THREADS_PER_SM;
  cudaDeviceGetAttribute(&sm_thread_capacity,
                         cudaDevAttrMaxThreadsPerMultiProcessor,
                         current_device);

  int blocks = 1;
  if (block_threads > 0) {
    blocks = sm_thread_capacity / block_threads;
  }
  return VLLM_CLAMP_BLOCKS_PER_SM(blocks);
}

View File

@ -1,7 +1,6 @@
#include "type_convert.cuh" #include "type_convert.cuh"
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "cub_helpers.h" #include "cub_helpers.h"
#include "core/batch_invariant.hpp"
#include <torch/cuda.h> #include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
@ -414,9 +413,7 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
wt_ptr % req_alignment_bytes == 0; wt_ptr % req_alignment_bytes == 0;
bool offsets_are_multiple_of_vector_width = bool offsets_are_multiple_of_vector_width =
hidden_size % vector_width == 0 && input_stride % vector_width == 0; hidden_size % vector_width == 0 && input_stride % vector_width == 0;
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant(); if (ptrs_are_aligned && offsets_are_multiple_of_vector_width) {
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
!batch_invariant_launch) {
LAUNCH_FUSED_ADD_RMS_NORM(8); LAUNCH_FUSED_ADD_RMS_NORM(8);
} else { } else {
LAUNCH_FUSED_ADD_RMS_NORM(0); LAUNCH_FUSED_ADD_RMS_NORM(0);
@ -462,8 +459,7 @@ void poly_norm(torch::Tensor& out, // [..., hidden_size]
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr()); auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr()); auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0; bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant(); if (ptrs_are_aligned && hidden_size % 8 == 0) {
if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
LAUNCH_FUSED_POLY_NORM(8); LAUNCH_FUSED_POLY_NORM(8);
} else { } else {
LAUNCH_FUSED_POLY_NORM(0); LAUNCH_FUSED_POLY_NORM(0);

View File

@ -6,10 +6,9 @@
*/ */
#include "type_convert.cuh" #include "type_convert.cuh"
#include "quantization/w8a8/fp8/common.cuh" #include "quantization/fp8/common.cuh"
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "cub_helpers.h" #include "cub_helpers.h"
#include "core/batch_invariant.hpp"
#include <torch/cuda.h> #include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
@ -241,9 +240,7 @@ void fused_add_rms_norm_static_fp8_quant(
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr()); auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
bool ptrs_are_aligned = bool ptrs_are_aligned =
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0; inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant(); if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0) {
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
!batch_invariant_launch) {
LAUNCH_FUSED_ADD_RMS_NORM(8); LAUNCH_FUSED_ADD_RMS_NORM(8);
} else { } else {
LAUNCH_FUSED_ADD_RMS_NORM(0); LAUNCH_FUSED_ADD_RMS_NORM(0);

View File

@ -1,156 +0,0 @@
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <torch/all.h>
// _dyn_quant_matmul_4bit is only available on AArch64.
#if defined(__aarch64__)
#include <ATen/ops/_dyn_quant_matmul_4bit.h>
#endif
// Multiplies activations `a` by the packed 4-bit weight `packed_w` through
// ATen's `_dyn_quant_matmul_4bit` op, forwarding the group size and the
// in/out feature counts unchanged.
// On any target other than AArch64 the op is not compiled in, so the call
// fails with a TORCH_CHECK error instead.
inline torch::Tensor mm(const torch::Tensor& a, const torch::Tensor& packed_w,
                        int64_t group_size_eff, int64_t in_features,
                        int64_t out_features) {
#if defined(__aarch64__)
  return at::_ops::_dyn_quant_matmul_4bit::call(a, packed_w, group_size_eff,
                                                in_features, out_features);
#else
  TORCH_CHECK(false,
              "dynamic 4-bit int MoE path requires AArch64 (ARM64); "
              "_dyn_quant_matmul_4bit is unavailable on this architecture");
  // Not reached when the check above raises; gives the function a return
  // statement on this branch.
  return {};
#endif
}
// Activation selector compared against the `activation_kind` argument of
// dynamic_4bit_int_moe_cpu. `g` and `u` are the first and second halves of
// the W13 projection output. Only SwiGLUOAI is tested for explicitly there;
// every other value takes the SiLU(g) * u path.
enum ActivationKind : int64_t {
  SwiGLU_Gu = 0,  // act = SiLU(g) * u
  SwiGLUOAI = 1,  // act = (clamp(u, -7, 7) + 1) * g' * sigmoid(1.702 * g'),
                  // with g' = clamp_max(g, 7)
  SiLU = 2        // SiLU; computed the same way as SwiGLU_Gu
};
// Mixture-of-experts forward pass over packed 4-bit expert weights on CPU.
//
// Arguments (shapes as used by the code below):
//   x             [T, *]  token activations (2-D); H is passed to the matmul
//                         as its input feature count.
//   topk_ids      [T, K]  expert index per (token, slot); read as int64.
//   topk_weights  [T, K]  router weight per (token, slot); converted to float.
//   w13_packed    packed gate/up weights, one entry per expert along dim 0.
//   w2_packed     packed down weights, one entry per expert along dim 0.
//   H, I, I2      hidden size, intermediate size, and 2 * I.
//   group_size    quantization group size; -1 selects the full input width
//                 (H for W13, I for W2).
//   apply_router_weight_on_input
//                 true: scale the expert input rows by the router weight;
//                 false: scale the expert output rows instead.
//   activation_kind
//                 ActivationKind value; SwiGLUOAI selects the clamped
//                 variant, anything else uses SiLU(g) * u.
//
// Returns a [T, H] tensor with x's options, holding for each token the sum of
// its K expert outputs.
//
// Raises (via TORCH_CHECK) on malformed shapes or out-of-range expert ids.
torch::Tensor dynamic_4bit_int_moe_cpu(
    torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights,
    torch::Tensor w13_packed, torch::Tensor w2_packed, int64_t H, int64_t I,
    int64_t I2, int64_t group_size, bool apply_router_weight_on_input,
    int64_t activation_kind) {
  TORCH_CHECK(x.dim() == 2, "x must be 2D");
  TORCH_CHECK(topk_ids.dim() == 2 && topk_weights.dim() == 2,
              "topk tensors must be [T, K]");
  TORCH_CHECK(
      w13_packed.size(0) == w2_packed.size(0),
      "w13_packed and w2_packed must have same number of experts in dim 0");
  TORCH_CHECK(I2 == 2 * I, "I2 must equal 2*I");

  const int64_t T = x.size(0);
  const int64_t K = topk_ids.size(1);
  const int64_t E = w13_packed.size(0);
  const int64_t N = T * K;

  // The loops below walk raw pointers over T * K entries of both top-k
  // tensors, so their shapes must really be [T, K]; otherwise those reads
  // would run past the end of the buffers.
  TORCH_CHECK(topk_ids.size(0) == T,
              "topk_ids must have one row per token in x");
  TORCH_CHECK(topk_weights.sizes() == topk_ids.sizes(),
              "topk_weights must have the same shape as topk_ids");

  auto x_c = x.contiguous();
  auto ids_c = topk_ids.contiguous();
  auto gates_c = topk_weights.to(at::kFloat).contiguous();

  // bucketing tokens -> experts
  // Pass 1: count how many (token, slot) pairs are routed to each expert.
  c10::SmallVector<int64_t, 64> counts(
      E, 0);  // Small vector uses stack allocation
  {
    const auto* ids_ptr = ids_c.data_ptr<int64_t>();
    for (int64_t i = 0; i < N; ++i) {
      const int64_t e_id = ids_ptr[i];
      TORCH_CHECK(0 <= e_id && e_id < E, "expert id out of range");
      counts[e_id]++;
    }
  }

  // Exclusive prefix sum: offsets[e] is where expert e's bucket starts.
  c10::SmallVector<int64_t, 65> offsets(E + 1, 0);  // ( E +1 )
  for (int64_t e = 0; e < E; ++e) offsets[e + 1] = offsets[e] + counts[e];

  auto expert_tokens = at::empty({offsets[E]}, ids_c.options());
  auto expert_gates = at::empty({offsets[E]}, gates_c.options());
  {
    // Pass 2: scatter token indices and router weights into their buckets.
    c10::SmallVector<int64_t, 64> cursor(E, 0);
    const auto* ids_ptr = ids_c.data_ptr<int64_t>();
    const auto* gts_ptr = gates_c.data_ptr<float>();
    auto* tok_ptr = expert_tokens.data_ptr<int64_t>();
    auto* gate_ptr = expert_gates.data_ptr<float>();

    for (int64_t t = 0; t < T; ++t) {
      const int64_t base = t * K;
      for (int64_t k = 0; k < K; ++k) {
        const int64_t idx = base + k;
        const int64_t e = ids_ptr[idx];
        const int64_t p = offsets[e] + (cursor[e]++);
        tok_ptr[p] = t;
        gate_ptr[p] = gts_ptr[idx];
      }
    }
  }

  const int64_t g_eff_13 = (group_size != -1) ? group_size : H;
  const int64_t g_eff_2 = (group_size != -1) ? group_size : I;

  // Per-expert outputs filled in parallel
  // (each iteration writes only its own y_list[e] slot).
  std::vector<torch::Tensor> y_list(E);

  at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) {
    for (int64_t e = e_begin; e < e_end; ++e) {
      const int64_t te = counts[e];
      if (te == 0) {
        // Keep an empty placeholder so the concatenation below stays aligned
        // with expert_tokens.
        y_list[e] = at::empty({0, H}, x_c.options());
        continue;
      }

      const int64_t start = offsets[e];

      auto sel_tokens =
          expert_tokens.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
      auto gates_e =
          expert_gates.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);

      auto x_e = x_c.index_select(/*dim=*/0, sel_tokens);

      if (apply_router_weight_on_input) {
        x_e = x_e.mul(gates_e.unsqueeze(1));
      }

      auto w13_e = w13_packed.select(/*dim=*/0, e);
      auto w2_e = w2_packed.select(/*dim=*/0, e);

      // W13
      auto y13 =
          mm(x_e, w13_e, g_eff_13, /*in_features=*/H, /*out_features=*/I2);

      auto g_part = y13.narrow(/*dim=*/1, /*start=*/0, /*length=*/I);
      auto u_part = y13.narrow(/*dim=*/1, /*start=*/I, /*length=*/I);

      torch::Tensor act;
      if (activation_kind == ActivationKind::SwiGLUOAI) {  // SwiGLUOAI
        constexpr double kAlpha = 1.702;                   // GPT-OSS default
        constexpr double kLimit = 7.0;                     // GPT-OSS default
        auto gate_c = at::clamp_max(g_part, kLimit);
        auto up_c = at::clamp(u_part, -kLimit, kLimit);
        auto glu = gate_c.mul(at::sigmoid(gate_c.mul(kAlpha)));
        act = up_c.add(1.0).mul(glu);
      } else {  // SiLU , SwiGLU_GU, vLLM maps silu to SiluAndMul()
        act = at::silu(g_part).mul(u_part);
      }

      // W2
      auto y = mm(act, w2_e, g_eff_2, /*in_features=*/I, /*out_features=*/H);

      if (!apply_router_weight_on_input) {
        y = y.mul(gates_e.unsqueeze(1));
      }

      // Store per-expert result
      y_list[e] = y;
    }
  });

  // Concatenate all expert outputs to match expert_tokens order
  auto Y_all = at::cat(y_list, /*dim=*/0);
  // Scatter-add each expert row back onto its source token.
  auto out = at::zeros({T, H}, x.options());
  out =
      at::index_add(out, /*dim=*/0, /*index=*/expert_tokens, /*source=*/Y_all);

  return out;
}

View File

@ -21,7 +21,6 @@
#include <torch/all.h> #include <torch/all.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <cuda_bf16.h> #include <cuda_bf16.h>
#include <cuda/std/limits>
#include <cooperative_groups.h> #include <cooperative_groups.h>
#include <cooperative_groups/reduce.h> #include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups; namespace cg = cooperative_groups;
@ -29,6 +28,7 @@ namespace cg = cooperative_groups;
namespace vllm { namespace vllm {
namespace moe { namespace moe {
constexpr float kNegInfinity = INFINITY * -1;
constexpr unsigned FULL_WARP_MASK = 0xffffffff; constexpr unsigned FULL_WARP_MASK = 0xffffffff;
constexpr int32_t WARP_SIZE = 32; constexpr int32_t WARP_SIZE = 32;
constexpr int32_t BLOCK_SIZE = 512; constexpr int32_t BLOCK_SIZE = 512;
@ -411,30 +411,14 @@ __device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
return __bfloat162float(val); return __bfloat162float(val);
} }
template <typename T>
__device__ inline T neg_inf() {
// cuda::std::numeric_limits<T>::infinity() returns `0` for [T=bf16 or fp16]
// so we need to cast from fp32
return cuda_cast<T, float>(-cuda::std::numeric_limits<float>::infinity());
}
template <typename T>
__device__ inline bool is_finite(const T val) {
#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
return cuda::std::isfinite(val);
#else
return isfinite(cuda_cast<float, T>(val));
#endif
}
template <typename T> template <typename T>
__device__ void topk_with_k2(T* output, T const* input, __device__ void topk_with_k2(T* output, T const* input,
cg::thread_block_tile<32> const& tile, cg::thread_block_tile<32> const& tile,
int32_t const lane_id, int32_t const lane_id,
int const num_experts_per_group) { int const num_experts_per_group) {
// Get the top2 per thread // Get the top2 per thread
T largest = neg_inf<T>(); T largest = -INFINITY;
T second_largest = neg_inf<T>(); T second_largest = -INFINITY;
if (num_experts_per_group > WARP_SIZE) { if (num_experts_per_group > WARP_SIZE) {
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) { for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
@ -529,8 +513,8 @@ __global__ void group_idx_and_topk_idx_kernel(
warp_id * topk; warp_id * topk;
s_topk_idx += warp_id * topk; s_topk_idx += warp_id * topk;
T value = neg_inf<T>(); T value = kNegInfinity;
T topk_group_value = neg_inf<T>(); T topk_group_value = kNegInfinity;
int32_t num_equalto_topkth_group; int32_t num_equalto_topkth_group;
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
@ -541,8 +525,11 @@ __global__ void group_idx_and_topk_idx_kernel(
if (case_id < num_tokens) { if (case_id < num_tokens) {
// calculate group_idx // calculate group_idx
int32_t target_num_min = WARP_SIZE - n_group + topk_group; int32_t target_num_min = WARP_SIZE - n_group + topk_group;
// The check is necessary to avoid abnormal input if (lane_id < n_group &&
if (lane_id < n_group && is_finite(group_scores[lane_id])) { (isfinite(cuda_cast<float, T>(
group_scores[lane_id])))) // The check is necessary to avoid
// abnormal input
{
value = group_scores[lane_id]; value = group_scores[lane_id];
} }
@ -553,11 +540,11 @@ __global__ void group_idx_and_topk_idx_kernel(
__syncwarp(); // Ensure all threads have valid data before reduction __syncwarp(); // Ensure all threads have valid data before reduction
topk_group_value = cg::reduce(tile, value, cg::greater<T>()); topk_group_value = cg::reduce(tile, value, cg::greater<T>());
if (value == topk_group_value) { if (value == topk_group_value) {
value = neg_inf<T>(); value = kNegInfinity;
} }
pre_count_equal_to_top_value = count_equal_to_top_value; pre_count_equal_to_top_value = count_equal_to_top_value;
count_equal_to_top_value = count_equal_to_top_value = __popc(__ballot_sync(
__popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf<T>()))); FULL_WARP_MASK, (value == cuda_cast<T, float>(kNegInfinity))));
} }
num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value; num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
} }
@ -565,10 +552,11 @@ __global__ void group_idx_and_topk_idx_kernel(
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t, warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
/* is_stable */ true> /* is_stable */ true>
queue((int32_t)topk, neg_inf<T>()); queue((int32_t)topk, -INFINITY);
int count_equalto_topkth_group = 0; int count_equalto_topkth_group = 0;
bool if_proceed_next_topk = topk_group_value != neg_inf<T>(); bool if_proceed_next_topk =
(topk_group_value != cuda_cast<T, float>(kNegInfinity));
if (case_id < num_tokens && if_proceed_next_topk) { if (case_id < num_tokens && if_proceed_next_topk) {
for (int i_group = 0; i_group < n_group; i_group++) { for (int i_group = 0; i_group < n_group; i_group++) {
if ((group_scores[i_group] > topk_group_value) || if ((group_scores[i_group] > topk_group_value) ||
@ -577,10 +565,11 @@ __global__ void group_idx_and_topk_idx_kernel(
int32_t offset = i_group * num_experts_per_group; int32_t offset = i_group * num_experts_per_group;
for (int32_t i = lane_id; i < align_num_experts_per_group; for (int32_t i = lane_id; i < align_num_experts_per_group;
i += WARP_SIZE) { i += WARP_SIZE) {
T candidates = (i < num_experts_per_group) && T candidates =
is_finite(scores_with_bias[offset + i]) (i < num_experts_per_group) && isfinite(cuda_cast<float, T>(
? scores_with_bias[offset + i] scores_with_bias[offset + i]))
: neg_inf<T>(); ? scores_with_bias[offset + i]
: cuda_cast<T, float>(kNegInfinity);
queue.add(candidates, offset + i); queue.add(candidates, offset + i);
} }
if (group_scores[i_group] == topk_group_value) { if (group_scores[i_group] == topk_group_value) {
@ -609,8 +598,7 @@ __global__ void group_idx_and_topk_idx_kernel(
if (i < topk) { if (i < topk) {
s_topk_value[i] = value; s_topk_value[i] = value;
} }
topk_sum += topk_sum += reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
} }
} }

View File

@ -17,30 +17,25 @@ FILE_HEAD = """
namespace MARLIN_NAMESPACE_NAME { namespace MARLIN_NAMESPACE_NAME {
""".strip() """.strip()
TEMPLATE = ( TEMPLATE = ("template __global__ void Marlin<"
"template __global__ void Marlin<" "{{scalar_t}}, "
"{{scalar_t}}, " "{{w_type_id}}, "
"{{w_type_id}}, " "{{s_type_id}}, "
"{{s_type_id}}, " "{{threads}}, "
"{{threads}}, " "{{thread_m_blocks}}, "
"{{thread_m_blocks}}, " "{{thread_n_blocks}}, "
"{{thread_n_blocks}}, " "{{thread_k_blocks}}, "
"{{thread_k_blocks}}, " "{{'true' if m_block_size_8 else 'false'}}, "
"{{'true' if m_block_size_8 else 'false'}}, " "{{stages}}, "
"{{stages}}, " "{{group_blocks}}, "
"{{group_blocks}}, " "{{'true' if is_zp_float else 'false'}}>"
"{{'true' if is_zp_float else 'false'}}>" "( MARLIN_KERNEL_PARAMS );")
"( MARLIN_KERNEL_PARAMS );"
)
# int8 with zero point case (vllm::kU8) is also supported, # int8 with zero point case (vllm::kU8) is also supported,
# we don't add it to reduce wheel size. # we don't add it to reduce wheel size.
SCALAR_TYPES = [ SCALAR_TYPES = [
"vllm::kU4", "vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
"vllm::kU4B8", "vllm::kFE2M1f"
"vllm::kU8B128",
"vllm::kFE4M3fn",
"vllm::kFE2M1f",
] ]
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)] THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
@ -63,12 +58,11 @@ def generate_new_kernels():
all_template_str_list = [] all_template_str_list = []
for group_blocks, m_blocks, thread_configs in itertools.product( for group_blocks, m_blocks, thread_configs in itertools.product(
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
):
# act order case only support gptq-int4 and gptq-int8 # act order case only support gptq-int4 and gptq-int8
if group_blocks == 0 and scalar_type not in [ if group_blocks == 0 and scalar_type not in [
"vllm::kU4B8", "vllm::kU4B8", "vllm::kU8B128"
"vllm::kU8B128",
]: ]:
continue continue
if thread_configs[2] == 256: if thread_configs[2] == 256:

View File

@ -44,9 +44,6 @@ __global__ void moe_align_block_size_kernel(
for (size_t i = tid; i < numel; i += stride) { for (size_t i = tid; i < numel; i += stride) {
int expert_id = topk_ids[i]; int expert_id = topk_ids[i];
if (expert_id >= num_experts) {
continue;
}
int warp_idx = expert_id / experts_per_warp; int warp_idx = expert_id / experts_per_warp;
int expert_offset = expert_id % experts_per_warp; int expert_offset = expert_id % experts_per_warp;
atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1); atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
@ -98,15 +95,12 @@ template <typename scalar_t>
__global__ void count_and_sort_expert_tokens_kernel( __global__ void count_and_sort_expert_tokens_kernel(
const scalar_t* __restrict__ topk_ids, const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer, int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
size_t numel, int32_t num_experts) { size_t numel) {
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x; const size_t stride = blockDim.x * gridDim.x;
for (size_t i = tid; i < numel; i += stride) { for (size_t i = tid; i < numel; i += stride) {
int32_t expert_id = topk_ids[i]; int32_t expert_id = topk_ids[i];
if (expert_id >= num_experts) {
continue;
}
int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1); int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
sorted_token_ids[rank_post_pad] = i; sorted_token_ids[rank_post_pad] = i;
} }
@ -275,7 +269,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
sort_kernel<<<actual_blocks, block_threads, 0, stream>>>( sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(), topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(), sorted_token_ids.data_ptr<int32_t>(),
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel(), num_experts); cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
} }
}); });
} }

View File

@ -21,7 +21,6 @@
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include "../cuda_compat.h" #include "../cuda_compat.h"
#include "../cub_helpers.h" #include "../cub_helpers.h"
#include "../core/batch_invariant.hpp"
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -406,8 +405,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>; using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
static constexpr int VPT = Constants::VPT; static constexpr int VPT = Constants::VPT;
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP; static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant(); const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB; const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB); dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);

View File

@ -100,11 +100,6 @@ void apply_repetition_penalties_(torch::Tensor& logits,
const torch::Tensor& output_mask, const torch::Tensor& output_mask,
const torch::Tensor& repetition_penalties); const torch::Tensor& repetition_penalties);
void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
const torch::Tensor& rowEnds, torch::Tensor& indices,
torch::Tensor& values, int64_t numRows, int64_t stride0,
int64_t stride1);
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input, void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
torch::Tensor& weight, torch::Tensor& scale, torch::Tensor& weight, torch::Tensor& scale,
double epsilon); double epsilon);
@ -138,12 +133,12 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& out,
torch::Tensor& input, torch::Tensor& input,
torch::Tensor& input_global_scale); torch::Tensor& input_global_scale);
#endif #endif
void persistent_masked_m_silu_mul_quant( void silu_mul_fp8_quant_deep_gemm_cuda(
const at::Tensor& input, // (E, T, 2*H) const at::Tensor& input, // (E, T, 2*H)
const at::Tensor& counts, // (E) const at::Tensor& counts, // (E)
at::Tensor& y_q, // (E, T, H) [OUT] at::Tensor& y_q, // (E, T, H) [OUT]
at::Tensor& y_s, // (E, T, H//group_size) [OUT] at::Tensor& y_s, // (E, T, H//group_size) [OUT]
bool use_ue8m0); int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens);
void mul_and_silu(torch::Tensor& out, torch::Tensor& input); void mul_and_silu(torch::Tensor& out, torch::Tensor& input);
@ -333,12 +328,6 @@ void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
const std::optional<torch::Tensor>& has_initial_state, const std::optional<torch::Tensor>& has_initial_state,
const torch::Tensor& ssm_states, int64_t pad_slot_id); const torch::Tensor& ssm_states, int64_t pad_slot_id);
torch::Tensor dynamic_4bit_int_moe_cpu(
torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights,
torch::Tensor w13_packed, torch::Tensor w2_packed, int64_t H, int64_t I,
int64_t I2, int64_t group_size, bool apply_router_weight_on_input,
int64_t activation_kind);
using fptr_t = int64_t; using fptr_t = int64_t;
fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs, fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs,
torch::Tensor& rank_data, int64_t rank, torch::Tensor& rank_data, int64_t rank,

View File

@ -7,7 +7,7 @@
#include "../cuda_compat.h" #include "../cuda_compat.h"
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "quantization/w8a8/fp8/common.cuh" #include "quantization/fp8/common.cuh"
#include <c10/util/Float8_e4m3fn.h> #include <c10/util/Float8_e4m3fn.h>
@ -23,14 +23,9 @@
typedef __hip_bfloat162 __nv_bfloat162; typedef __hip_bfloat162 __nv_bfloat162;
typedef __hip_bfloat16 __nv_bfloat16; typedef __hip_bfloat16 __nv_bfloat16;
typedef __hip_bfloat16_raw __nv_bfloat16_raw; typedef __hip_bfloat16_raw __nv_bfloat16_raw;
#if defined(HIP_FP8_TYPE_OCP)
typedef __hip_fp8_e4m3 __nv_fp8_e4m3; typedef __hip_fp8_e4m3 __nv_fp8_e4m3;
typedef __hip_fp8x4_e4m3 __nv_fp8x4_e4m3; typedef __hip_fp8x4_e4m3 __nv_fp8x4_e4m3;
#else
// ROCm 6.2 fallback: only *_fnuz types exist
typedef __hip_fp8_e4m3_fnuz __nv_fp8_e4m3;
typedef __hip_fp8x4_e4m3_fnuz __nv_fp8x4_e4m3;
#endif
#endif #endif
#include "core/registration.h" #include "core/registration.h"
@ -114,22 +109,13 @@ __global__ void act_and_mul_quant_kernel(
} }
__device__ __forceinline__ float silu(float x) { __device__ __forceinline__ float silu(float x) {
return __fdividef(x, (1.f + expf(-x))); return (__fdividef(x, (1.f + expf(-x))));
} }
__device__ __forceinline__ float2 silu2(float2 x) { __device__ __forceinline__ float2 silu2(float2 x) {
return make_float2(silu(x.x), silu(x.y)); return make_float2(silu(x.x), silu(x.y));
} }
__device__ __forceinline__ __nv_bfloat162 silu2_v2(float2 x) {
#ifndef USE_ROCM
return make_bfloat162(__float2bfloat16_rn(silu(x.x)),
__float2bfloat16_rn(silu(x.y)));
#else
return __float22bfloat162_rn(make_float2(silu(x.x), silu(x.y)));
#endif
}
#ifndef USE_ROCM #ifndef USE_ROCM
__device__ __forceinline__ float warp_max(float v) { __device__ __forceinline__ float warp_max(float v) {
static constexpr unsigned FULL_MASK = 0xffffffffu; static constexpr unsigned FULL_MASK = 0xffffffffu;
@ -232,308 +218,225 @@ constexpr __nv_bfloat16 get_fp8_min() {
return __nv_bfloat16(__nv_bfloat16_raw{.x = 50032}); return __nv_bfloat16(__nv_bfloat16_raw{.x = 50032});
} }
} }
#ifndef USE_ROCM
template <typename Idx_t> template <typename fp8_type, int32_t NUM_WARPS, typename Idx_t,
__device__ __forceinline__ int warp_expert_search( int NUM_PARALLEL_TOKENS, bool USE_UE8M0, int GROUP_SIZE = 128,
int idx, int n, const Idx_t* __restrict__ input, Idx_t val) {
const Idx_t* input_ptr = input + idx;
int base_offset = 0;
for (;;) {
bool move_on = (idx < n && *input_ptr <= val);
unsigned mask = __ballot_sync(0xffffffff, move_on);
if (mask != 0xffffffffu) {
int last_lane = 31 - __clz(mask);
return base_offset + last_lane;
}
input_ptr += 32;
base_offset += 32;
idx += 32;
}
}
template <int num_parallel_tokens>
__device__ __forceinline__ void token_bounds(int32_t n_tokens,
int32_t worker_id,
int32_t& n_tokens_lower,
int32_t& n_tokens_upper) {
if (n_tokens < num_parallel_tokens && worker_id < n_tokens) {
if (worker_id >= num_parallel_tokens) return;
n_tokens_lower = worker_id;
n_tokens_upper = worker_id + 1;
} else {
int32_t chunk_size = n_tokens / num_parallel_tokens;
int32_t residual = n_tokens - chunk_size * num_parallel_tokens;
auto calc_id = [&](int32_t id) {
if (id < residual)
return min(n_tokens, id * (chunk_size + 1));
else
return min(n_tokens, id * chunk_size + residual);
};
n_tokens_lower = calc_id(worker_id);
n_tokens_upper = calc_id(worker_id + 1);
}
}
template <int BLOCK_COUNT, int SMEM_SIZE_BYTES_Y, typename fp8_type,
int THREADS, typename Idx_t, bool USE_UE8M0, int GROUP_SIZE = 128,
int NUM_STAGES = 3> int NUM_STAGES = 3>
__global__ void silu_mul_fp8_quant_deep_gemm_kernel( __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q, const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q,
float* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert, float* __restrict__ _y_s, const int32_t* __restrict__ counts,
// sizes // sizes
Idx_t E, Idx_t T, Idx_t H, int H, int G,
// strides (in elements) // strides (in elements)
Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e, Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e,
Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t, Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t,
Idx_t stride_ys_g, Idx_t stride_counts_e) { Idx_t stride_ys_g, Idx_t stride_counts_e) {
#ifndef USE_ROCM
static constexpr int NUM_WARPS = THREADS / WARP_SIZE;
static constexpr int LOAD_STAGE_SIZE = 2 * GROUP_SIZE / 8;
static constexpr int LOAD_STAGE_MOD = NUM_STAGES * LOAD_STAGE_SIZE;
static constexpr int COMPUTE_STAGE_SIZE = 2 * GROUP_SIZE / 4;
static constexpr int COMPUTE_STAGE_MOD = COMPUTE_STAGE_SIZE * NUM_STAGES;
extern __shared__ __align__(16) __int128_t smem_128[];
int* s_expert_offsets =
reinterpret_cast<int*>(smem_128 + (SMEM_SIZE_BYTES_Y / 16));
static constexpr __nv_bfloat16 fp8_min = get_fp8_min<fp8_type>(); static constexpr __nv_bfloat16 fp8_min = get_fp8_min<fp8_type>();
static constexpr __nv_bfloat16 fp8_max = get_fp8_max<fp8_type>(); static constexpr __nv_bfloat16 fp8_max = get_fp8_max<fp8_type>();
// We assign EPS with it's 16-bit unsigned counterpart to allow constexpr. // We assign EPS with its 16-bit unsigned counterpart to allow constexpr.
static constexpr __nv_bfloat16 EPS = (__nv_bfloat16_raw{.x = 11996}); static constexpr __nv_bfloat16 EPS = (__nv_bfloat16_raw{.x = 11996});
int tid = threadIdx.x;
int warp_id = tid >> 5;
int lane_id = tid & 0x1f;
int running_sum{}; // We pack 8 16-bit bfloat16 values into a 128-bit __int128_t.
if (!warp_id) { static constexpr int32_t BFLOAT16_PER_GROUP = 8;
for (int i = 0; i < E; i += WARP_SIZE) {
bool valid = (i + threadIdx.x) < E;
int value =
(valid ? tokens_per_expert[i + threadIdx.x * stride_counts_e] : 0) +
(!lane_id ? running_sum : 0);
for (int offset = 1; offset < 32; offset *= 2) { // We split the shared memory in half, corresponding to gate and up matrices:
int n = __shfl_up_sync(0xFFFFFFFFu, value, offset); // [...gate_i, ...up_i] where 0 <= i < stages.
if (lane_id >= offset) value += n; static constexpr int32_t S_NUM_128 =
} 2u * (GROUP_SIZE / BFLOAT16_PER_GROUP) * NUM_WARPS * NUM_STAGES;
static constexpr auto THREAD_COUNT = NUM_WARPS * WARP_SIZE;
static constexpr int HALF_THREAD_COUNT = THREAD_COUNT / 2;
static constexpr int32_t S_NUM_64 = S_NUM_128 * 2;
__shared__ __int128_t __align__(16) s_buff_128[S_NUM_128];
if (valid) { const int32_t tid = threadIdx.x;
s_expert_offsets[i + threadIdx.x + 1] = value; const int32_t warp_id = tid / WARP_SIZE;
} const int32_t lane_id = tid % WARP_SIZE;
running_sum = __shfl_sync(0xFFFFFFFFu, value, WARP_SIZE - 1); auto s_buff_compute_32 = reinterpret_cast<__nv_bfloat162*>(s_buff_128);
}
if (!lane_id) { // block handles one (expert e, group g)
s_expert_offsets[0] = 0; int32_t pid = blockIdx.x;
} int32_t e = pid / G;
int32_t g = pid % G;
const int32_t n_tokens = counts[e * stride_counts_e];
if (!n_tokens) {
return; // Exit ASAP.
} }
__syncthreads(); const Idx_t stride_i_t_128 = stride_i_t / 8u;
int32_t total_tokens = s_expert_offsets[E]; int32_t n_tokens_lower, n_tokens_upper;
const int warp_position_yq = warp_id * (H / NUM_WARPS);
const int warp_position_scales = warp_id * (H / (GROUP_SIZE * NUM_WARPS));
// A single block will handle tokens_per_block tokens.
// Each block i iterates over tokens of a slice of n_tokens = // Each block i iterates over tokens of a slice of n_tokens =
// expert_counts[i], with the size of chunk being // expert_counts[i], with the size of chunk being
// (n_tokens / NUM_PARALLEL_TOKENS) + residual, instead of // (n_tokens / NUM_PARALLEL_TOKENS) + residual, instead of
// updiv(n_tokens, NUM_PARALLEL_TOKENS) for better scheduling. // updiv(n_tokens, NUM_PARALLEL_TOKENS) for better scheduling.
if (n_tokens < NUM_PARALLEL_TOKENS && blockIdx.y < n_tokens) {
// Each warp will get space to store its hidden dim for gate and up. // Specialize this, but can be likely fused.
__int128_t* s_hidden_load = smem_128 + warp_id * ((2 * 128 / 8) * NUM_STAGES); if (blockIdx.y >= NUM_PARALLEL_TOKENS) {
__int128_t* smem_load_ptr = s_hidden_load + lane_id; return;
}
const __nv_bfloat16 fp8_inv = __hdiv(__float2bfloat16(1.f), fp8_max); n_tokens_lower = blockIdx.y;
n_tokens_upper = blockIdx.y + 1;
int32_t compute_pipeline_offset_64 = 0;
int32_t load_stage_offset{};
const __nv_bfloat16 one_bf16 = __float2bfloat16_rn(1.f);
__int64_t* smem_compute_ptr = reinterpret_cast<__int64_t*>(smem_128) +
warp_id * (2 * (GROUP_SIZE / 4) * NUM_STAGES) +
lane_id;
__int64_t* s_gate64_ptr = smem_compute_ptr;
__int64_t* s_up64_ptr = smem_compute_ptr + GROUP_SIZE / 4;
int tokens_lower, tokens_upper;
token_bounds<BLOCK_COUNT>(total_tokens, blockIdx.x, tokens_lower,
tokens_upper);
Idx_t expert_id{}, expert_offset{}, next_expert_offset{};
int token_id = tokens_lower;
int32_t t_load{};
if (token_id < tokens_upper) {
expert_id = warp_expert_search<int>(lane_id, E, s_expert_offsets, token_id);
expert_offset = s_expert_offsets[expert_id];
next_expert_offset = s_expert_offsets[expert_id + 1];
} else { } else {
// This thread block has no work to do. auto chunk_size = n_tokens / NUM_PARALLEL_TOKENS;
auto residual = n_tokens - chunk_size * NUM_PARALLEL_TOKENS;
auto calc_id = [&](int32_t id) {
if (id < residual) {
return min(n_tokens, id * (chunk_size + 1));
} else {
return min(n_tokens, id * chunk_size + residual);
}
};
n_tokens_lower = calc_id(blockIdx.y);
n_tokens_upper = calc_id(blockIdx.y + 1);
}
if (n_tokens_lower >= n_tokens_upper) {
return; return;
} }
int t_load_bound = H / (GROUP_SIZE * NUM_WARPS); // We do calculations here, using constexpr wherever possible.
const Idx_t base_i = e * stride_i_e + NUM_WARPS * g * GROUP_SIZE * stride_i_h;
const Idx_t base_ys = e * stride_ys_e + NUM_WARPS * g * stride_ys_g;
const Idx_t base_yq =
e * stride_yq_e + NUM_WARPS * g * GROUP_SIZE * stride_yq_h;
Idx_t gate_off_128 = (base_i / static_cast<Idx_t>(8u));
auto input_128_ptr = reinterpret_cast<const __int128_t*>(_input);
auto gate_128_ptr = input_128_ptr + gate_off_128 + (tid % HALF_THREAD_COUNT) +
stride_i_t_128 * n_tokens_lower;
auto up_128_ptr = gate_128_ptr + (H * stride_i_h) / 8u;
auto y_s_ptr =
_y_s + base_ys + warp_id * stride_ys_g + n_tokens_lower * stride_ys_t;
auto y_q_ptr = _y_q + base_yq + warp_id * GROUP_SIZE +
stride_yq_t * n_tokens_lower + 4 * lane_id;
int32_t t_load = n_tokens_lower, load_stage_id = 0;
auto s_buff_gate_load_128 = s_buff_128 + (tid % HALF_THREAD_COUNT);
auto s_buff_up_load_128 = s_buff_gate_load_128 + S_NUM_128 / 2u;
int32_t stage_offset{};
Idx_t base_i = ((expert_id * stride_i_e) / 8) + static constexpr int32_t LOAD_STAGE_SIZE = (NUM_WARPS * WARP_SIZE / 2);
(token_id - expert_offset) * stride_i_t / 8; static constexpr int32_t LOAD_STAGE_MOD =
const Idx_t gate_warp_offset = NUM_STAGES * (NUM_WARPS * WARP_SIZE / 2);
warp_id * ((stride_i_h * H) / (8 * NUM_WARPS)) + (lane_id & 0b1111);
const __int128_t* input_128_ptr =
reinterpret_cast<const __int128_t*>(_input) + gate_warp_offset +
((lane_id < 16) ? 0 : ((H * stride_i_h) / 8));
__int128_t* load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
auto token_offset = token_id - expert_offset;
// Two halves of all threads in a block conduct global loads for gate and up,
// repsectively.
auto load_and_advance_y_pred = [&] { auto load_and_advance_y_pred = [&] {
if (t_load < t_load_bound) { if (t_load < n_tokens_upper) {
// Here we are simply continuing to load data auto s_gate_stage_128_staged_ptr = s_buff_gate_load_128 + stage_offset;
// from the current token. auto s_up_stage_128_staged_ptr = s_buff_up_load_128 + stage_offset;
auto smem_load_ptr_staged = smem_load_ptr + load_stage_offset;
// It is very important that LOAD_STAGE_SIZE is constexpr to avoid // It is very important that LOAD_STAGE_SIZE is constexpr to avoid
// unnecessary ALU ops. // unnecessary ALU ops.
load_stage_offset += LOAD_STAGE_SIZE; stage_offset += LOAD_STAGE_SIZE;
load_stage_offset %= LOAD_STAGE_MOD; stage_offset %= LOAD_STAGE_MOD;
cp_async4(smem_load_ptr_staged, load_ptr); if (tid < HALF_THREAD_COUNT) {
load_ptr += GROUP_SIZE / 8; cp_async4(s_gate_stage_128_staged_ptr, gate_128_ptr);
++t_load; gate_128_ptr += stride_i_t_128;
} else if (token_id + 1 < tokens_upper) {
// We loaded everything from the current token, let's move on
// to the next one, and we checked that we have more tokens to load.
++token_id;
t_load = 0;
if (token_id >= next_expert_offset) {
// We need to find the next expert.
do {
// This is a loop because it's possible
// that some experts are assigned 0 tokens.
// NOTE: We are guaranteed that there's at least
// one more token left so we don't have to check for
// expert_id bounds.
++expert_id;
// This skips 1 memory read.
expert_offset = next_expert_offset;
next_expert_offset = s_expert_offsets[expert_id + 1];
} while (next_expert_offset == expert_offset);
base_i = expert_id * (stride_i_e / 8);
token_offset = 0;
load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
} else { } else {
// We remain within the same expert, so just cp_async4(s_up_stage_128_staged_ptr, up_128_ptr);
// move by H/4 __int128_t (2 * H/8). up_128_ptr += stride_i_t_128;
base_i += stride_yq_t / 4;
token_offset++;
} }
load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
auto smem_load_ptr_staged = smem_load_ptr + load_stage_offset;
// It is very important that LOAD_STAGE_SIZE is constexpr to avoid
// unnecessary ALU ops.
load_stage_offset += LOAD_STAGE_SIZE;
load_stage_offset %= LOAD_STAGE_MOD;
cp_async4(smem_load_ptr_staged, load_ptr);
load_ptr += GROUP_SIZE / 8;
++t_load; ++t_load;
++load_stage_id;
} }
// We fence even if there is nothing to load to simplify pipelining. // We fence even if there is nothing to load to simplify pipelining.
cp_async_fence(); cp_async_fence();
}; };
// We need to warm-up the pipeline.
#pragma unroll #pragma unroll
for (int i = 0; i < NUM_STAGES - 1; i++) { for (int i = 0; i < NUM_STAGES - 1; i++) {
load_and_advance_y_pred(); load_and_advance_y_pred();
} }
__nv_fp8x4_e4m3* y_q_base_ptr = __int64_t* s_gate_ptr = reinterpret_cast<__int64_t*>(
reinterpret_cast<__nv_fp8x4_e4m3*>(_y_q) + lane_id; s_buff_compute_32 + warp_id * (GROUP_SIZE / 2)) +
auto y_scale_base_ptr = _y_s + warp_position_scales * stride_ys_g; lane_id;
__int64_t* s_up_ptr = s_gate_ptr + S_NUM_64 / 2;
for (auto j = tokens_lower; j < tokens_upper; j++) { static constexpr int32_t STAGE_SIZE = (GROUP_SIZE * NUM_WARPS) / 4u;
const Idx_t base_ys = expert_id * stride_ys_e; static constexpr int32_t STAGE_MOD = STAGE_SIZE * NUM_STAGES;
auto y_s_ptr = y_scale_base_ptr + base_ys + token_offset * stride_ys_t;
__nv_fp8x4_e4m3* y_q_ptr =
y_q_base_ptr + (expert_id * stride_yq_e + token_offset * stride_yq_t +
warp_position_yq * stride_yq_h) /
4;
const int COMPUTE_LIMIT = H / (GROUP_SIZE * NUM_WARPS);
for (int i = 0; i < COMPUTE_LIMIT; i++) { int32_t compute_pipeline_offset_64 = 0;
cp_async_wait<NUM_STAGES - 2>();
__syncthreads();
load_and_advance_y_pred();
__int64_t* gate64_ptr = s_gate64_ptr + compute_pipeline_offset_64; for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
__int64_t* up64_ptr = s_up64_ptr + compute_pipeline_offset_64; __nv_bfloat16 y_max_bf16 = EPS;
__nv_bfloat162 results_bf162[2];
// COMPUTE_STAGE_SIZE/MOD must also be constexpr! cp_async_wait<NUM_STAGES - 2>();
compute_pipeline_offset_64 += COMPUTE_STAGE_SIZE; __syncthreads();
compute_pipeline_offset_64 %= COMPUTE_STAGE_MOD;
__int64_t gate64 = *gate64_ptr; // We double-buffer pipelined loads so that the next load will
__int64_t up64 = *up64_ptr; // concurrently run with compute without overwrites.
load_and_advance_y_pred();
// Compute auto s_gate_compute_64 = s_gate_ptr + compute_pipeline_offset_64;
__nv_bfloat162 res[2]; auto s_up_compute_64 = s_up_ptr + compute_pipeline_offset_64;
__nv_bfloat162* s_up_comp = reinterpret_cast<__nv_bfloat162*>(&up64);
__nv_bfloat162* s_gate_comp = reinterpret_cast<__nv_bfloat162*>(&gate64); // STAGE_SIZE must also be constexpr!
compute_pipeline_offset_64 += STAGE_SIZE;
compute_pipeline_offset_64 %= STAGE_MOD;
// Each thread loads (gate/up) 2X 4X bfloat16 values into registers.
__int64_t gate64 = *s_gate_compute_64;
__nv_bfloat162* s_gate_compute_32 =
reinterpret_cast<__nv_bfloat162*>(&gate64);
__int64_t up64 = *s_up_compute_64;
__nv_bfloat162* s_up_compute_32 = reinterpret_cast<__nv_bfloat162*>(&up64);
#pragma unroll #pragma unroll
for (int32_t k = 0; k < 2; ++k) { for (int i = 0; i < 2; i++) {
__nv_bfloat162 gate = silu2_v2(__bfloat1622float2(s_gate_comp[k])); // For silu, we make sure that div is emitted.
res[k] = __hmul2(gate, s_up_comp[k]); float2 gate = silu2(__bfloat1622float2(s_gate_compute_32[i]));
} results_bf162[i] = __float22bfloat162_rn(gate);
}
auto _y_max2 = __hmax2(__habs2(res[0]), __habs2(res[1]));
_y_max2.x = __hmax(__hmax(_y_max2.x, _y_max2.y), EPS);
__nv_bfloat16 y_s = __hmul(warp_max(_y_max2.x), fp8_inv);
if constexpr (USE_UE8M0) {
y_s = hexp2(hceil(hlog2(y_s)));
}
__nv_bfloat16 inv_y = __hdiv(one_bf16, y_s);
auto y_s2 = make_bfloat162(inv_y, inv_y);
#pragma unroll #pragma unroll
for (int32_t k = 0; k < 2; ++k) { for (int i = 0; i < 2; i++) {
res[k] = clip(__hmul2(res[k], y_s2), __bfloat162bfloat162(fp8_min), results_bf162[i] = __hmul2(results_bf162[i], s_up_compute_32[i]);
__bfloat162bfloat162(fp8_max)); }
}
*y_q_ptr = __nv_fp8x4_e4m3(res[0], res[1]); auto _y_max2 =
y_q_ptr += WARP_SIZE * stride_yq_h; __hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
if (!lane_id) { y_max_bf16 = __hmax(_y_max2.x, _y_max2.y);
*y_s_ptr = y_s;
y_s_ptr += stride_ys_g; // An entire group is assigned to a single warp, so a simple warp reduce
} // is used.
__nv_bfloat16 y_s = warp_max(y_max_bf16) / fp8_max;
if constexpr (USE_UE8M0) {
y_s = hexp2(hceil(hlog2(y_s)));
}
auto inv_y = __float2bfloat16_rn(1.f) / y_s;
auto y_s2 = make_bfloat162(inv_y, inv_y);
#pragma unroll
for (int32_t i = 0; i < 2; ++i) {
results_bf162[i] =
clip(__hmul2(results_bf162[i], y_s2), __bfloat162bfloat162(fp8_min),
__bfloat162bfloat162(fp8_max));
}
auto fp8x4 = __nv_fp8x4_e4m3(results_bf162[0], results_bf162[1]);
*reinterpret_cast<__nv_fp8x4_e4m3*>(y_q_ptr) = fp8x4;
y_q_ptr += stride_yq_t;
if (lane_id == 0) {
*y_s_ptr = y_s;
y_s_ptr += stride_ys_t;
} }
} }
#endif
} }
#endif
} // namespace vllm } // namespace vllm
@ -568,14 +471,14 @@ void silu_and_mul_quant(torch::Tensor& out, // [..., d]
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel); LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
} }
void persistent_masked_m_silu_mul_quant( void silu_mul_fp8_quant_deep_gemm_cuda(
const at::Tensor& input, // (E, T, 2*H) const at::Tensor& input, // (E, T, 2*H)
const at::Tensor& tokens_per_expert, // (E) const at::Tensor& counts, // (E)
at::Tensor& y_q, // (E, T, H) [OUT] at::Tensor& y_q, // (E, T, H) [OUT]
at::Tensor& y_s, // (E, T, H//group_size) [OUT] at::Tensor& y_s, // (E, T, H//group_size) [OUT]
bool use_ue8m0) { int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens) {
#ifndef USE_ROCM #ifndef USE_ROCM
// This kernel relies heavily on cp.async and fp8 support.
// This kernel currently only supports H % 128 == 0 and assumes a // This kernel currently only supports H % 128 == 0 and assumes a
// fixed GROUP_SIZE of 128. // fixed GROUP_SIZE of 128.
TORCH_CHECK(input.dtype() == torch::kBFloat16); TORCH_CHECK(input.dtype() == torch::kBFloat16);
@ -584,6 +487,10 @@ void persistent_masked_m_silu_mul_quant(
TORCH_CHECK(y_s.dtype() == torch::kFloat32); TORCH_CHECK(y_s.dtype() == torch::kFloat32);
TORCH_CHECK(input.size(-1) % 256 == 0); TORCH_CHECK(input.size(-1) % 256 == 0);
// Check that num_parallel_tokens is of power of 2 and between 1 and 64.
TORCH_CHECK(1 <= num_parallel_tokens && num_parallel_tokens <= 64);
TORCH_CHECK(!(num_parallel_tokens & (num_parallel_tokens - 1)));
using Idx_t = int64_t; using Idx_t = int64_t;
Idx_t E = input.size(0); Idx_t E = input.size(0);
@ -599,54 +506,81 @@ void persistent_masked_m_silu_mul_quant(
Idx_t stride_ys_t = y_s.stride(1); Idx_t stride_ys_t = y_s.stride(1);
Idx_t stride_ys_g = y_s.stride(2); Idx_t stride_ys_g = y_s.stride(2);
Idx_t stride_counts_e = tokens_per_expert.stride(0); Idx_t stride_counts_e = counts.stride(0);
static constexpr int GROUP_SIZE = 128; static constexpr int GROUP_SIZE = 128;
#define KERNEL_FN \
if (use_ue8m0) { \
vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t, \
NUM_PARALLEL_TOKENS, true> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \
(fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(), \
reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G, \
stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g, \
stride_counts_e); \
} else { \
vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t, \
NUM_PARALLEL_TOKENS, false> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \
(fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(), \
reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G, \
stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g, \
stride_counts_e); \
}
#define KERNEL_CALL_H \
if (H % (4 * GROUP_SIZE) == 0) { \
static constexpr int NUM_WARPS = 4; \
populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
KERNEL_FN \
} else { \
static constexpr int NUM_WARPS = 1; \
populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
KERNEL_FN \
}
#define KERNEL_CALL_TOP_LEVEL \
if (num_parallel_tokens == 1) { \
static constexpr int NUM_PARALLEL_TOKENS = 1; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 2) { \
static constexpr int NUM_PARALLEL_TOKENS = 2; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 4) { \
static constexpr int NUM_PARALLEL_TOKENS = 4; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 8) { \
static constexpr int NUM_PARALLEL_TOKENS = 8; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 16) { \
static constexpr int NUM_PARALLEL_TOKENS = 16; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 32) { \
static constexpr int NUM_PARALLEL_TOKENS = 32; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 64) { \
static constexpr int NUM_PARALLEL_TOKENS = 64; \
KERNEL_CALL_H \
}
Idx_t G;
dim3 block, grid;
auto populate_launch_params = [&](int num_warps, int _num_parallel_tokens) {
G = H / Idx_t(group_size * num_warps);
grid = dim3(E * G, _num_parallel_tokens);
block = dim3(num_warps * WARP_SIZE);
};
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
#define KERNEL(BLOCK_COUNT, USE_UE8M0, THREAD_COUNT, STAGES) \ VLLM_DISPATCH_FP8_TYPES(y_q.scalar_type(),
static constexpr int NUM_WARPS = THREAD_COUNT / WARP_SIZE; \ "silu_mul_fp8_quant_deep_gemm_kernel",
int sms = SILU_V2_BLOCK_COUNT; \ [&] { KERNEL_CALL_TOP_LEVEL });
static constexpr int max_shared_mem_bytes = \
GROUP_SIZE * 2 * STAGES * NUM_WARPS * 2; \
dim3 grid(sms), block(THREAD_COUNT); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
VLLM_DISPATCH_FP8_TYPES( \
y_q.scalar_type(), "silu_mul_fp8_quant_deep_gemm_kernel", [&] { \
vllm::silu_mul_fp8_quant_deep_gemm_kernel< \
BLOCK_COUNT, max_shared_mem_bytes, fp8_t, THREAD_COUNT, Idx_t, \
USE_UE8M0, GROUP_SIZE, STAGES> \
<<<grid, block, max_shared_mem_bytes + (E + 1) * 16, stream>>>( \
reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \
(fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(), \
reinterpret_cast<int32_t*>(tokens_per_expert.data_ptr()), E, \
T, H, stride_i_e, stride_i_t, stride_i_h, stride_yq_e, \
stride_yq_t, stride_yq_h, stride_ys_e, stride_ys_t, \
stride_ys_g, stride_counts_e); \
});
static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32;
if (!use_ue8m0) {
if (H >= 4096) {
static constexpr int NUM_STAGES = 4;
static constexpr int THREAD_COUNT = 256;
KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, NUM_STAGES);
} else {
static constexpr int THREAD_COUNT = 32;
KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, 2);
}
} else {
if (H >= 4096) {
static constexpr int NUM_STAGES = 4;
static constexpr int THREAD_COUNT = 256;
KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, NUM_STAGES);
} else {
static constexpr int THREAD_COUNT = 32;
KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, 2);
}
}
#endif #endif
} }

View File

@ -1,11 +1,15 @@
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <torch/all.h> #include <torch/all.h>
#ifndef USE_ROCM
#include "../per_token_group_quant_8bit.h"
#endif
#include <cmath> #include <cmath>
#include "dispatch_utils.h" #include "../../cub_helpers.h"
#include "quantization/vectorization_utils.cuh" #include "../../dispatch_utils.h"
#include "cub_helpers.h" #include "../vectorization_utils.cuh"
static inline __device__ int8_t float_to_int8_rn(float x) { static inline __device__ int8_t float_to_int8_rn(float x) {
#ifdef USE_ROCM #ifdef USE_ROCM
@ -21,6 +25,7 @@ static inline __device__ int8_t float_to_int8_rn(float x) {
float dst = std::nearbyint(x); float dst = std::nearbyint(x);
// saturate // saturate
// See https://github.com/pytorch/pytorch/issues/127666 // See https://github.com/pytorch/pytorch/issues/127666
// See https://github.com/llvm/llvm-project/issues/95183 // See https://github.com/llvm/llvm-project/issues/95183
// hip-clang std::clamp __glibcxx_assert_fail host function when building on // hip-clang std::clamp __glibcxx_assert_fail host function when building on
@ -79,6 +84,7 @@ static inline __device__ int8_t int32_to_int8(int32_t x) {
static_cast<int32_t>(std::numeric_limits<int8_t>::max()); static_cast<int32_t>(std::numeric_limits<int8_t>::max());
// saturate // saturate
// See https://github.com/pytorch/pytorch/issues/127666 // See https://github.com/pytorch/pytorch/issues/127666
// See https://github.com/llvm/llvm-project/issues/95183 // See https://github.com/llvm/llvm-project/issues/95183
// hip-clang std::clamp __glibcxx_assert_fail host function when building on // hip-clang std::clamp __glibcxx_assert_fail host function when building on
@ -170,6 +176,7 @@ __global__ void dynamic_scaled_int8_quant_kernel(
float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax; float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax;
// 2. quantize
vectorize_with_alignment<16>( vectorize_with_alignment<16>(
row_in, row_out, hidden_size, tid, stride, row_in, row_out, hidden_size, tid, stride,
[=] __device__(int8_t& dst, const scalar_t& src) { [=] __device__(int8_t& dst, const scalar_t& src) {
@ -187,6 +194,7 @@ struct MinMax {
__host__ __device__ explicit MinMax(float v) : min(v), max(v) {} __host__ __device__ explicit MinMax(float v) : min(v), max(v) {}
// add a value to the MinMax
__host__ __device__ MinMax& operator+=(float v) { __host__ __device__ MinMax& operator+=(float v) {
min = fminf(min, v); min = fminf(min, v);
max = fmaxf(max, v); max = fmaxf(max, v);
@ -220,6 +228,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
const scalar_t* row_in = input + token_idx * hidden_size; const scalar_t* row_in = input + token_idx * hidden_size;
int8_t* row_out = output + token_idx * hidden_size; int8_t* row_out = output + token_idx * hidden_size;
// 1. calculate min & max
MinMax thread_mm; MinMax thread_mm;
vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride, vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride,
[&] __device__(const scalar_t& src) { [&] __device__(const scalar_t& src) {
@ -252,6 +261,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
const float inv_s = 1.f / scale_sh; const float inv_s = 1.f / scale_sh;
const azp_t azp = azp_sh; const azp_t azp = azp_sh;
// 2. quantize
vectorize_with_alignment<16>( vectorize_with_alignment<16>(
row_in, row_out, hidden_size, tid, stride, row_in, row_out, hidden_size, tid, stride,
[=] __device__(int8_t& dst, const scalar_t& src) { [=] __device__(int8_t& dst, const scalar_t& src) {
@ -322,4 +332,14 @@ void dynamic_scaled_int8_quant(
hidden_size); hidden_size);
} }
}); });
} }
#ifndef USE_ROCM
void per_token_group_quant_int8(const torch::Tensor& input,
torch::Tensor& output_q,
torch::Tensor& output_s, int64_t group_size,
double eps, double int8_min, double int8_max) {
per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
int8_min, int8_max);
}
#endif

View File

@ -231,7 +231,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
} else { } else {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise< cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>, OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm, Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
out, a, b, a_scales, b_scales); out, a, b, a_scales, b_scales);
} }
@ -245,7 +245,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
} else { } else {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise< cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>, OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm, Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
out, a, b, a_scales, b_scales); out, a, b, a_scales, b_scales);
} }
@ -259,7 +259,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
} else { } else {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise< cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>, OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
Shape<_2, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized2Sm, Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>( cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
out, a, b, a_scales, b_scales); out, a, b, a_scales, b_scales);
} }
@ -271,10 +271,10 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
// TMA epilogue isn't compatible with Swap A/B // TMA epilogue isn't compatible with Swap A/B
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise< cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>, OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm, Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>( cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
out, a, b, a_scales, b_scales); out, a, b, a_scales, b_scales);
} }
} }
} // namespace vllm } // namespace vllm

View File

@ -25,10 +25,7 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
if constexpr (!std::is_same_v<Int8Func, std::nullptr_t>) { if constexpr (!std::is_same_v<Int8Func, std::nullptr_t>) {
int8_func(c, a, b, a_scales, b_scales, bias); int8_func(c, a, b, a_scales, b_scales, bias);
} else { } else {
int32_t version_num = get_sm_version_num(); TORCH_CHECK(false, "Int8 not supported for this architecture");
TORCH_CHECK(
false, "Int8 not supported on SM", version_num,
". Use FP8 quantization instead, or run on older arch (SM < 100).");
} }
} }
} else { } else {

View File

@ -133,4 +133,4 @@ void cutlass_scaled_mm_sm100_fp8_epilogue(torch::Tensor& out,
} }
} }
} // namespace vllm } // namespace vllm

Some files were not shown because too many files have changed in this diff Show More