Compare commits


20 Commits

Author SHA1 Message Date
1236aebf0e Merge remote-tracking branch 'origin/main' into fp8_ep_dp 2025-06-02 14:53:27 -04:00
95c40f9b09 hacks
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-30 02:33:58 +00:00
a0efd3106c hack fix MoEConfig.quant_dtype
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-30 02:08:21 +00:00
e69879996f re-enable cudagraph+torch.compile
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-30 00:12:54 +00:00
922165cba3 fp8 + pplx tests + fixes
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-29 21:25:33 +00:00
12ea698498 pplx + fp8 test
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-29 18:50:37 +00:00
caca0b718a fixes
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-29 02:08:22 +00:00
d86e3f0172 lint
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-28 23:40:56 +00:00
3ca8322b74 lint
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-28 23:40:56 +00:00
03b41b6cad fix merge
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-28 23:40:56 +00:00
cad6447664 fix
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-28 23:40:56 +00:00
c169b05541 merge
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-28 23:40:56 +00:00
468d16654a cleanup quantization
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-28 23:40:53 +00:00
909f234faa stuff
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-28 23:40:27 +00:00
f8510587c2 tests + fix
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-28 23:40:27 +00:00
9cfebf51ba basic working test
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-28 23:40:27 +00:00
77f95b99a6 test
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-28 23:40:27 +00:00
bbe888d033 wip
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-28 23:40:27 +00:00
25ed6738d4 wip
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-28 23:40:27 +00:00
e568e401da fp8 support
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-28 23:40:27 +00:00
1628 changed files with 7799 additions and 29277 deletions

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 import sys

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import os

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path
 import pytest

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 import os

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import json

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from lmdeploy.serve.openai.api_client import APIClient

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import datetime
 import json

View File

@@ -1,6 +1,5 @@
 steps:
   - label: "Build wheel - CUDA 12.8"
-    id: build-wheel-cuda-12-8
     agents:
       queue: cpu_queue_postmerge
     commands:
@@ -12,7 +11,6 @@ steps:
       DOCKER_BUILDKIT: "1"
   - label: "Build wheel - CUDA 12.6"
-    id: build-wheel-cuda-12-6
     agents:
       queue: cpu_queue_postmerge
     commands:
@@ -30,7 +28,6 @@ steps:
   - label: "Build wheel - CUDA 11.8"
     # depends_on: block-build-cu118-wheel
-    id: build-wheel-cuda-11-8
     agents:
       queue: cpu_queue_postmerge
     commands:
@@ -47,7 +44,6 @@ steps:
   - label: "Build release image"
     depends_on: block-release-image-build
-    id: build-release-image
     agents:
       queue: cpu_queue_postmerge
     commands:
@@ -55,18 +51,6 @@ steps:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-  - label: "Annotate release workflow"
-    depends_on:
-      - build-release-image
-      - build-wheel-cuda-12-8
-      - build-wheel-cuda-12-6
-      - build-wheel-cuda-11-8
-    id: annotate-release-workflow
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "bash .buildkite/scripts/annotate-release.sh"
   - label: "Build and publish TPU release image"
     depends_on: ~
     if: build.env("NIGHTLY") == "1"
@@ -86,10 +70,9 @@ steps:
       DOCKER_BUILDKIT: "1"
   - input: "Provide Release version here"
-    id: input-release-version
     fields:
       - text: "What is the release version?"
-        key: release-version
+        key: "release-version"
   - block: "Build CPU release image"
     key: block-cpu-release-image-build

View File

@ -1,31 +0,0 @@
#!/bin/bash
set -ex
# Get release version and strip leading 'v' if present
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
if [ -z "$RELEASE_VERSION" ]; then
echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
exit 1
fi
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
docker tag vllm/vllm-openai vllm/vllm-openai:latest
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
docker push vllm/vllm-openai:latest
docker push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF
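(Illustration, not part of the diff: the version-stripping step in the script above behaves as follows, using a hypothetical tag value.)

echo "v0.9.1" | sed 's/^v//'   # prints "0.9.1" -- leading 'v' is stripped
echo "0.9.1"  | sed 's/^v//'   # prints "0.9.1" -- unchanged when there is no leading 'v'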

View File

@ -1,17 +0,0 @@
#!/bin/bash
# Usage: ./ci_clean_log.sh ci.log
# This script strips timestamps and color codes from CI log files.
# Check if argument is given
if [ $# -lt 1 ]; then
echo "Usage: $0 ci.log"
exit 1
fi
INPUT_FILE="$1"
# Strip timestamps
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
# Strip colorization
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
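(Illustration, not part of the diff: a hypothetical run of the two sed commands above on a sample log line, assuming the timestamp format matched by the first substitution.)

printf '[2025-05-28T23:40:27Z] \033[32mbuild step passed\033[0m\n' > sample.log
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' sample.log   # drop the timestamp prefix
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' sample.log                                                      # drop ANSI color codes
cat sample.log   # prints: build step passed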

View File

@@ -94,10 +94,6 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
-if [[ $commands == *"pytest -v -s lora"* ]]; then
-  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
-fi
 #ignore certain kernels tests
 if [[ $commands == *" kernels/core"* ]]; then
   commands="${commands} \

View File

@@ -7,7 +7,6 @@ set -ex
 # Setup cleanup
 remove_docker_container() {
   if [[ -n "$container_id" ]]; then
-    podman stop --all -t0
     podman rm -f "$container_id" || true
   fi
   podman system prune -f
@@ -38,7 +37,7 @@ function cpu_tests() {
   pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
   pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
   pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-  pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
+  pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
 }
 # All of CPU tests are expected to be finished less than 40 mins.

View File

@ -6,82 +6,75 @@ set -ex
# allow to bind to different cores # allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95} CORE_RANGE=${CORE_RANGE:-48-95}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1} NUMA_NODE=${NUMA_NODE:-1}
export CMAKE_BUILD_PARALLEL_LEVEL=32
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {
set -e; set -e;
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
} }
trap remove_docker_container EXIT trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Try building the docker image # Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
function cpu_tests() { function cpu_tests() {
set -e set -e
export NUMA_NODE=$2 export NUMA_NODE=$2
export BUILDKITE_BUILD_NUMBER=$3
# list packages
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
pip list"
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pip list"
# offline inference # offline inference
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
set -e set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test # Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e set -e
pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model pytest -v -s tests/kernels/test_cache.py -m cpu_model
pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
pytest -v -s tests/models/language/generation -m cpu_model pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/language/pooling -m cpu_model pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/multimodal/generation \ pytest -v -s tests/models/encoder_decoder/language -m cpu_model
--ignore=tests/models/multimodal/generation/test_mllama.py \ pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
--ignore=tests/models/multimodal/generation/test_pixtral.py \ pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
-m cpu_model"
# Run compressed-tensor test # Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e set -e
pytest -s -v \ pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test # Run AWQ test
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e set -e
VLLM_USE_V1=0 pytest -s -v \ pytest -s -v \
tests/quantization/test_ipex_quant.py" tests/quantization/test_ipex_quant.py"
# Run chunked-prefill and prefix-cache test # Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e set -e
pytest -s -v -k cpu_model \ pytest -s -v -k cpu_model \
tests/basic_correctness/test_chunked_prefill.py" tests/basic_correctness/test_chunked_prefill.py"
# online serving # online serving
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=$1
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \ python3 benchmarks/benchmark_serving.py \
--backend vllm \ --backend vllm \
--dataset-name random \ --dataset-name random \
--model facebook/opt-125m \ --model facebook/opt-125m \
@ -90,7 +83,7 @@ function cpu_tests() {
--tokenizer facebook/opt-125m" --tokenizer facebook/opt-125m"
# Run multi-lora tests # Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e set -e
pytest -s -v \ pytest -s -v \
tests/lora/test_qwen2vl.py" tests/lora/test_qwen2vl.py"
@ -98,4 +91,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 40 mins. # All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests export -f cpu_tests
timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"

View File

@@ -150,15 +150,11 @@ run_and_track_test 9 "test_multimodal.py" \
 run_and_track_test 10 "test_pallas.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
 run_and_track_test 11 "test_struct_output_generate.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
 run_and_track_test 12 "test_moe_pallas.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 13 "test_lora.py" \
   "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
-run_and_track_test 14 "test_tpu_qkv_linear.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
-run_and_track_test 15 "test_spmd_model_weight_loading.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then

View File

@ -1,18 +0,0 @@
#!/bin/bash
# Usage: ./rerun_test.sh path/to/test.py::test_name
# Check if argument is given
if [ $# -lt 1 ]; then
echo "Usage: $0 path/to/test.py::test_name"
echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
exit 1
fi
TEST=$1
COUNT=1
while pytest -sv "$TEST"; do
COUNT=$((COUNT + 1))
echo "RUN NUMBER ${COUNT}"
done

View File

@ -1,24 +0,0 @@
#!/bin/bash
set -euo pipefail
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi
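(Illustration, not part of the diff: how the disk-usage extraction in the script above parses a df row, using hypothetical numbers.)

df_line="/dev/sda1  487652352 365739264 101132032  79% /var/lib/docker"
echo "$df_line" | awk '{print $5}' | sed 's/%//'   # prints 79; since 79 > 70, the prune branch would run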

View File

@ -1,14 +0,0 @@
# Environment config
TEST_NAME=llama8b
CONTAINER_NAME=vllm-tpu
# vllm config
MODEL=meta-llama/Llama-3.1-8B-Instruct
MAX_NUM_SEQS=512
MAX_NUM_BATCHED_TOKENS=512
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=8.0
INPUT_LEN=1800
OUTPUT_LEN=128

View File

@ -1,102 +0,0 @@
#!/bin/bash
if [ ! -f "$1" ]; then
echo "Error: The env file '$1' does not exist."
exit 1 # Exit the script with a non-zero status to indicate an error
fi
ENV_FILE=$1
# For testing on local vm, use `set -a` to export all variables
source /etc/environment
source $ENV_FILE
remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
docker rm -f $CONTAINER_NAME || true;
}
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# Build docker image.
# TODO: build the image outside the script and share the image with other
# tpu test if building time is too long.
DOCKER_BUILDKIT=1 docker build \
--build-arg max_jobs=16 \
--build-arg USE_SCCACHE=1 \
--build-arg GIT_REPO_CHECK=0 \
--tag vllm/vllm-tpu-bm \
--progress plain -f docker/Dockerfile.tpu .
LOG_ROOT=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $LOG_ROOT"
if [ -z "$HF_TOKEN" ]; then
echo "Error: HF_TOKEN is not set or is empty."
exit 1
fi
# Make sure mounted disk or dir exists
if [ ! -d "$DOWNLOAD_DIR" ]; then
echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
exit 1
fi
echo "Run model $MODEL"
echo
echo "starting docker...$CONTAINER_NAME"
echo
docker run \
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
--env-file $ENV_FILE \
-e HF_TOKEN="$HF_TOKEN" \
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
-e MODEL=$MODEL \
-e WORKSPACE=/workspace \
--name $CONTAINER_NAME \
-d \
--privileged \
--network host \
-v /dev/shm:/dev/shm \
vllm/vllm-tpu-bm tail -f /dev/null
echo "run script..."
echo
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
echo "copy result back..."
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
if [ "$BUILDKITE" = "true" ]; then
echo "Running inside Buildkite"
buildkite-agent artifact upload "$VLLM_LOG"
buildkite-agent artifact upload "$BM_LOG"
else
echo "Not running inside Buildkite"
fi
#
# compare the throughput with EXPECTED_THROUGHPUT
# and assert meeting the expectation
#
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
echo "Failed to get the throughput"
exit 1
fi
if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
exit 1
fi

View File

@ -1,94 +0,0 @@
#!/bin/bash
set -euo pipefail
VLLM_LOG="$WORKSPACE/vllm_log.txt"
BM_LOG="$WORKSPACE/bm_log.txt"
if [ -n "$TARGET_COMMIT" ]; then
head_hash=$(git rev-parse HEAD)
if [ "$TARGET_COMMIT" != "$head_hash" ]; then
echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
exit 1
fi
fi
echo "model: $MODEL"
echo
#
# create a log folder
#
mkdir "$WORKSPACE/log"
# TODO: Move to image building.
pip install pandas
pip install datasets
#
# create sonnet_4x
#
echo "Create sonnet_4x.txt"
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done
#
# start vllm service in backend
#
echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
VLLM_USE_V1=1 vllm serve $MODEL \
--seed 42 \
--disable-log-requests \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
--no-enable-prefix-caching \
--download_dir $DOWNLOAD_DIR \
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
echo "wait for 20 minutes.."
echo
# sleep 1200
# wait for 10 minutes...
for i in {1..120}; do
# TODO: detect other type of errors.
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
echo "Detected RuntimeError, exiting."
exit 1
elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
echo "Application started"
break
else
echo "wait for 10 seconds..."
sleep 10
fi
done
#
# run test
#
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore-eos > "$BM_LOG"
echo "completed..."
echo
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput: $throughput"
echo

View File

@@ -145,7 +145,6 @@ steps:
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
   - tests/v1/test_async_llm_dp.py
-  - tests/v1/engine/test_engine_core_client.py
   commands:
   # test with tp=2 and external_dp=2
   - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
@@ -155,7 +154,6 @@ steps:
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
@@ -177,11 +175,6 @@ steps:
   - tests/tracing
   commands:
   - pytest -v -s metrics
-  - "pip install \
-    'opentelemetry-sdk>=1.26.0' \
-    'opentelemetry-api>=1.26.0' \
-    'opentelemetry-exporter-otlp>=1.26.0' \
-    'opentelemetry-semantic-conventions-ai>=0.4.1'"
   - pytest -v -s tracing
 ##### fast check tests #####
@@ -294,7 +287,7 @@ steps:
   - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 - label: LoRA Test %N # 15min each
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/lora
   - tests/lora
@@ -310,7 +303,6 @@ steps:
   commands:
   - pytest -v -s compile/test_pass_manager.py
   - pytest -v -s compile/test_fusion.py
-  - pytest -v -s compile/test_fusion_attn.py
   - pytest -v -s compile/test_silu_mul_quant_fusion.py
   - pytest -v -s compile/test_sequence_parallelism.py
   - pytest -v -s compile/test_async_tp.py
@@ -326,7 +318,6 @@ steps:
   # these tests need to be separated, cannot combine
   - pytest -v -s compile/piecewise/test_simple.py
   - pytest -v -s compile/piecewise/test_toy_llama.py
-  - pytest -v -s compile/piecewise/test_full_cudagraph.py
 - label: PyTorch Fullgraph Test # 18min
   mirror_hardwares: [amdexperimental, amdproduction]
@@ -430,9 +421,6 @@ steps:
   - vllm/model_executor/layers/quantization
   - tests/quantization
   commands:
-  # temporary install here since we need nightly, will move to requirements/test.in
-  # after torchao 0.12 release
-  - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 - label: LM Eval Small Models # 53min
@@ -675,7 +663,7 @@ steps:
   - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
 - label: Multi-step Tests (4 GPUs) # 36min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:

.github/CODEOWNERS
View File

@@ -10,17 +10,15 @@
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
-/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
+/vllm/model_executor/guided_decoding @mgoin @russellb
 /vllm/multimodal @DarkLight1337 @ywang96
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
-/vllm/reasoning @aarnphm
-/vllm/entrypoints @aarnphm
 CMakeLists.txt @tlrmchlsmth
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm
+/vllm/v1/structured_output @mgoin @russellb
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
@@ -29,8 +27,8 @@ CMakeLists.txt @tlrmchlsmth
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
-/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
+/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
 /tests/kernels @tlrmchlsmth @WoosukKwon
 /tests/model_executor/test_guided_processors.py @mgoin @russellb
 /tests/models @DarkLight1337 @ywang96
@@ -40,11 +38,11 @@ CMakeLists.txt @tlrmchlsmth
 /tests/quantization @mgoin @robertgshaw2-redhat
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
-/tests/v1/structured_output @mgoin @russellb @aarnphm
+/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
+/tests/v1/structured_output @mgoin @russellb
 /tests/weight_loading @mgoin @youkaichao
 /tests/lora @jeejeelee
 # Docs
 /docs @hmellor
 mkdocs.yaml @hmellor

View File

@@ -8,16 +8,6 @@ body:
   attributes:
     value: >
       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
-- type: markdown
-  attributes:
-    value: |
-      ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
-      - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
-      - Passwords or authentication credentials
-      - Private URLs or endpoints
-      - Personal or confidential data
-      Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
 - type: textarea
   attributes:
     label: Your current environment

View File

@@ -1,18 +1,6 @@
-## Essential Elements of an Effective PR Description Checklist
-- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
-- [ ] The test plan, such as providing test command.
-- [ ] The test results, such as pasting the results comparison before and after, or e2e results
-- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
-PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
-## Purpose
-## Test Plan
-## Test Result
-## (Optional) Documentation Update
+FILL IN THE PR DESCRIPTION HERE
+FIX #xxxx (*link existing issues this PR will resolve*)
 <!--- pyml disable-next-line no-emphasis-as-heading -->
 **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)

.github/mergify.yml
View File

@@ -36,20 +36,6 @@ pull_request_rules:
       add:
         - frontend
-- name: label-llama
-  description: Automatically apply llama label
-  conditions:
-    - or:
-      - files~=^examples/.*llama.*\.py
-      - files~=^tests/.*llama.*\.py
-      - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
-      - files~=^vllm/model_executor/models/.*llama.*\.py
-      - files~=^vllm/transformers_utils/configs/.*llama.*\.py
-  actions:
-    label:
-      add:
-        - llama
 - name: label-multi-modality
   description: Automatically apply multi-modality label
   conditions:
@@ -65,41 +51,6 @@ pull_request_rules:
       add:
         - multi-modality
-- name: label-qwen
-  description: Automatically apply qwen label
-  conditions:
-    - or:
-      - files~=^examples/.*qwen.*\.py
-      - files~=^tests/.*qwen.*\.py
-      - files~=^vllm/model_executor/models/.*qwen.*\.py
-      - files~=^vllm/reasoning/.*qwen.*\.py
-      - title~=(?i)Qwen
-      - body~=(?i)Qwen
-  actions:
-    label:
-      add:
-        - qwen
-- name: label-rocm
-  description: Automatically apply rocm label
-  conditions:
-    - or:
-      - files~=^csrc/rocm/
-      - files~=^docker/Dockerfile.rocm
-      - files~=^requirements/rocm.*\.txt
-      - files~=^vllm/attention/backends/rocm.*\.py
-      - files~=^vllm/attention/ops/rocm.*\.py
-      - files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
-      - files~=^vllm/v1/attention/backends/mla/rocm.*\.py
-      - files~=^tests/kernels/.*_rocm.*\.py
-      - files=vllm/platforms/rocm.py
-      - title~=(?i)AMD
-      - title~=(?i)ROCm
-  actions:
-    label:
-      add:
-        - rocm
 - name: label-structured-output
   description: Automatically apply structured-output label
   conditions:

.gitignore
View File

@@ -200,5 +200,5 @@ benchmarks/**/*.json
 actionlint
 shellcheck*/
-# Ignore moe/marlin_moe gen code
+# Ingore moe/marlin_moe gen code
 csrc/moe/marlin_moe_wna16/kernel_*

View File

@@ -11,8 +11,6 @@ repos:
   hooks:
   - id: yapf
     args: [--in-place, --verbose]
-    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
-    exclude: '(.buildkite|benchmarks|build|examples)/.*'
 - repo: https://github.com/astral-sh/ruff-pre-commit
   rev: v0.11.7
   hooks:
@@ -20,10 +18,12 @@ repos:
     args: [--output-format, github, --fix]
   - id: ruff-format
     files: ^(.buildkite|benchmarks|examples)/.*
-- repo: https://github.com/crate-ci/typos
-  rev: v1.32.0
+- repo: https://github.com/codespell-project/codespell
+  rev: v2.4.1
   hooks:
-  - id: typos
+  - id: codespell
+    additional_dependencies: ['tomli']
+    args: ['--toml', 'pyproject.toml']
 - repo: https://github.com/PyCQA/isort
   rev: 6.0.1
   hooks:
@@ -143,13 +143,6 @@ repos:
     types: [python]
     pass_filenames: false
     additional_dependencies: [regex]
-  - id: check-pickle-imports
-    name: Prevent new pickle/cloudpickle imports
-    entry: python tools/check_pickle_imports.py
-    language: python
-    types: [python]
-    pass_filenames: false
-    additional_dependencies: [pathspec, regex]
   # Keep `suggestion` last
   - id: suggestion
     name: Suggestion

View File

@@ -182,6 +182,9 @@ include(FetchContent)
 file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
 message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
+#
+# Set rocm version dev int.
+#
 if(VLLM_GPU_LANG STREQUAL "HIP")
   #
   # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
@@ -189,6 +192,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
   set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
   #
   # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
   # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
@@ -242,7 +246,6 @@ set(VLLM_EXT_SRC
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
   "csrc/layernorm_quant_kernels.cu"
-  "csrc/sampler.cu"
   "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
   "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
@@ -308,7 +311,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)
     #
@@ -420,9 +423,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
-  # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
-  # require CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
+  # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
+  # CUDA 12.8 or later
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
@@ -454,7 +457,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # kernels for the remaining archs that are not already built for 3x.
   # (Build 8.9 for FP8)
   cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
+    "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
   if (SCALED_MM_2X_ARCHS)
@@ -542,10 +545,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # CUTLASS MoE kernels
-  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
-  # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
-  # if it's possible to compile MoE kernels that use its output.
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
+  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
+  # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
+  # to compile MoE kernels that use its output.
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
       "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
@@ -684,7 +687,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)
     #

View File

@@ -58,8 +58,8 @@ vLLM is fast with:
 - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
 - Continuous batching of incoming requests
 - Fast model execution with CUDA/HIP graph
-- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8
-- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer
+- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8.
+- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
 - Speculative decoding
 - Chunked prefill
@@ -72,14 +72,14 @@ vLLM is flexible and easy to use with:
 - Tensor parallelism and pipeline parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
 - Prefix caching support
 - Multi-LoRA support
 vLLM seamlessly supports most popular open-source models on HuggingFace, including:
 - Transformer-like LLMs (e.g., Llama)
 - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
-- Embedding Models (e.g., E5-Mistral)
+- Embedding Models (e.g. E5-Mistral)
 - Multi-modal LLMs (e.g., LLaVA)
 Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
@@ -156,10 +156,10 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 - For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
 - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
-- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
+- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
 - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
 - For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
 ## Media Kit
-- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit)
+- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).

View File

@@ -10,15 +10,11 @@
 # 3. Set variables (ALL REQUIRED)
 # BASE: your directory for vllm repo
 # MODEL: the model served by vllm
-# TP: ways of tensor parallelism
 # DOWNLOAD_DIR: directory to download and load model weights.
 # INPUT_LEN: request input len
 # OUTPUT_LEN: request output len
 # MIN_CACHE_HIT_PCT: prefix cache rate
 # MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
-# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
-# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
-# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
 # 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
 # 5. The final result will be saved in RESULT file.
@@ -34,27 +30,31 @@
 TAG=$(date +"%Y_%m_%d_%H_%M")
 BASE=""
 MODEL="meta-llama/Llama-3.1-8B-Instruct"
-TP=1
 DOWNLOAD_DIR=""
 INPUT_LEN=4000
 OUTPUT_LEN=16
-MIN_CACHE_HIT_PCT=0
+MIN_CACHE_HIT_PCT_PCT=0
 MAX_LATENCY_ALLOWED_MS=100000000000
-NUM_SEQS_LIST="128 256"
-NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
-echo "result file: $RESULT"
+echo "result file$ $RESULT"
 echo "model: $MODEL"
-echo
 rm -rf $LOG_FOLDER
 mkdir -p $LOG_FOLDER
 cd "$BASE/vllm"
+# create sonnet-4x.txt so that we can sample 2048 tokens for input
+echo "" > benchmarks/sonnet_4x.txt
+for _ in {1..4}
+do
+  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
+done
-pip install -q datasets
+pip install datasets
 current_hash=$(git rev-parse HEAD)
 echo "hash:$current_hash" >> "$RESULT"
@ -64,69 +64,53 @@ best_throughput=0
best_max_num_seqs=0 best_max_num_seqs=0
best_num_batched_tokens=0 best_num_batched_tokens=0
best_goodput=0 best_goodput=0
start_server() {
local gpu_memory_utilization=$1
local max_num_seqs=$2
local max_num_batched_tokens=$3
local vllm_log=$4
pkill -f vllm
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
--disable-log-requests \
--port 8004 \
--gpu-memory-utilization $gpu_memory_utilization \
--max-num-seqs $max_num_seqs \
--max-num-batched-tokens $max_num_batched_tokens \
--tensor-parallel-size $TP \
--enable-prefix-caching \
--load-format dummy \
--download-dir "$DOWNLOAD_DIR" \
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
break
else
sleep 10
fi
done
if (( ! server_started )); then
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
return 1
else
return 0
fi
}
run_benchmark() { run_benchmark() {
local max_num_seqs=$1 local max_num_seqs=$1
local max_num_batched_tokens=$2 local max_num_batched_tokens=$2
local gpu_memory_utilization=$3
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
echo "vllm_log: $vllm_log" echo "vllm_log: $vllm_log"
echo echo
rm -f $vllm_log rm -f $vllm_log
pkill -f vllm
echo "starting server..." # start the server
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
result=$? --disable-log-requests \
if [[ "$result" -eq 1 ]]; then --port 8004 \
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" --gpu-memory-utilization 0.98 \
else --max-num-seqs $max_num_seqs \
echo "server started." --max-num-batched-tokens $max_num_batched_tokens \
fi --tensor-parallel-size 1 \
--enable-prefix-caching \
--load-format dummy \
--download-dir $DOWNLOAD_DIR \
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
echo "wait for 10 minutes.."
echo echo
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
if grep -Fq "Application startup complete" "$vllm_log"; then
echo "Application started"
server_started=1
break
else
# echo "wait for 10 seconds..."
sleep 10
fi
done
if (( ! server_started )); then
echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
echo "pkill -f vllm"
echo
pkill vllm
sleep 10
return 1
fi
echo "run benchmark test..." echo "run benchmark test..."
echo
meet_latency_requirement=0 meet_latency_requirement=0
# get a basic qps by using request-rate inf # get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
@ -134,29 +118,29 @@ run_benchmark() {
python benchmarks/benchmark_serving.py \ python benchmarks/benchmark_serving.py \
--backend vllm \ --backend vllm \
--model $MODEL \ --model $MODEL \
--dataset-name random \ --dataset-name sonnet \
--random-input-len $INPUT_LEN \ --dataset-path benchmarks/sonnet_4x.txt \
--random-output-len $OUTPUT_LEN \ --sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore-eos \ --ignore-eos \
--disable-tqdm \ --disable-tqdm \
--request-rate inf \ --request-rate inf \
--percentile-metrics ttft,tpot,itl,e2el \ --percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \ --num-prompts 100 \
--random-prefix-len $prefix_len \ --sonnet-prefix-len $prefix_len \
--port 8004 &> "$bm_log" --port 8004 > "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
meet_latency_requirement=1 meet_latency_requirement=1
request_rate=inf
fi fi
if (( ! meet_latency_requirement )); then if (( ! meet_latency_requirement )); then
# start from request-rate as int(throughput) + 1 # start from request-rate as int(through_put) + 1
request_rate=$((${throughput%.*} + 1)) request_rate=$((${through_put%.*} + 1))
while ((request_rate > 0)); do while ((request_rate > 0)); do
# clear prefix cache # clear prefix cache
curl -X POST http://0.0.0.0:8004/reset_prefix_cache curl -X POST http://0.0.0.0:8004/reset_prefix_cache
@ -165,18 +149,19 @@ run_benchmark() {
python benchmarks/benchmark_serving.py \ python benchmarks/benchmark_serving.py \
--backend vllm \ --backend vllm \
--model $MODEL \ --model $MODEL \
--dataset-name random \ --dataset-name sonnet \
--random-input-len $INPUT_LEN \ --dataset-path benchmarks/sonnet_4x.txt \
--random-output-len $OUTPUT_LEN \ --sonnet-input-len $INPUT_LEN \
--ignore-eos \ --sonnet-output-len $OUTPUT_LEN \
--ignore_eos \
--disable-tqdm \ --disable-tqdm \
--request-rate $request_rate \ --request-rate $request_rate \
--percentile-metrics ttft,tpot,itl,e2el \ --percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \ --num-prompts 100 \
--random-prefix-len $prefix_len \ --sonnet-prefix-len $prefix_len \
--port 8004 &> "$bm_log" --port 8004 > "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
@ -188,10 +173,10 @@ run_benchmark() {
fi fi
# write the results and update the best result. # write the results and update the best result.
if ((meet_latency_requirement)); then if ((meet_latency_requirement)); then
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT" echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
if (( $(echo "$throughput > $best_throughput" | bc -l) )); then if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
best_throughput=$throughput best_throughput=$through_put
best_max_num_seqs=$max_num_seqs best_max_num_seqs=$max_num_seqs
best_num_batched_tokens=$max_num_batched_tokens best_num_batched_tokens=$max_num_batched_tokens
best_goodput=$goodput best_goodput=$goodput
@ -203,39 +188,22 @@ run_benchmark() {
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "pkill -f vllm"
echo
pkill vllm pkill vllm
sleep 10 sleep 10
rm -f $vllm_log
printf '=%.0s' $(seq 1 20) printf '=%.0s' $(seq 1 20)
return 0 return 0
} }
read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
# first find out the max gpu-memory-utilization without HBM OOM. num_seqs_list="128 256"
gpu_memory_utilization=0.98 num_batched_tokens_list="512 1024 2048 4096"
find_gpu_memory_utilization=0 for num_seqs in $num_seqs_list; do
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do for num_batched_tokens in $num_batched_tokens_list; do
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" run_benchmark $num_seqs $num_batched_tokens
result=$? exit 0
if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1
break
else
gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
fi
done
if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
else
echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
exit 1
fi
for num_seqs in "${num_seqs_list[@]}"; do
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
done done
done done
echo "finish permutations" echo "finish permutations"
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io import io
import json import json
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
This module defines a framework for sampling benchmark requests from various This module defines a framework for sampling benchmark requests from various
datasets. Each dataset subclass of BenchmarkDataset must implement sample datasets. Each dataset subclass of BenchmarkDataset must implement sample
@ -865,15 +864,7 @@ class InstructCoderDataset(HuggingFaceDataset):
for item in self.data: for item in self.data:
if len(sampled_requests) >= num_requests: if len(sampled_requests) >= num_requests:
break break
prompt = f"{item['input']}\n\n{item['instruction']} Just output \ prompt = f"{item['instruction']}:\n{item['input']}"
the code, do not include any explanation."
# apply template
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids) prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append( sampled_requests.append(
SampleRequest( SampleRequest(
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark the latency of processing a single batch of requests.""" """Benchmark the latency of processing a single batch of requests."""
import argparse import argparse
@ -123,7 +122,7 @@ def main(args: argparse.Namespace):
save_to_pytorch_benchmark_format(args, results) save_to_pytorch_benchmark_format(args, results)
def create_argument_parser(): if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the latency of processing a single batch of " description="Benchmark the latency of processing a single batch of "
"requests till completion." "requests till completion."
@ -171,12 +170,6 @@ def create_argument_parser():
# V1 enables prefix caching by default which skews the latency # V1 enables prefix caching by default which skews the latency
# numbers. We need to disable prefix caching by default. # numbers. We need to disable prefix caching by default.
parser.set_defaults(enable_prefix_caching=False) parser.set_defaults(enable_prefix_caching=False)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args() args = parser.parse_args()
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
raise OSError( raise OSError(
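Across the benchmark entry points touched here, the two sides differ mainly in whether argument registration lives in a create_argument_parser() helper that is then called under the __main__ guard, or sits directly under the guard. A stand-alone sketch of the helper-based shape (illustrative arguments only; the real scripts use vLLM's FlexibleArgumentParser and hand the parsed args to main):

import argparse

def create_argument_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Benchmark the latency of processing a single batch of requests."
    )
    parser.add_argument("--num-iters", type=int, default=10)  # illustrative option
    # Prefix caching would skew latency numbers, so it is off by default here.
    parser.set_defaults(enable_prefix_caching=False)
    return parser

if __name__ == "__main__":
    args = create_argument_parser().parse_args()
    print(args)  # a real benchmark script would call main(args) here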
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
Offline benchmark to test the long document QA throughput. Offline benchmark to test the long document QA throughput.
@ -142,7 +141,7 @@ def main(args):
) )
def create_argument_parser(): if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the performance with or " description="Benchmark the performance with or "
"without automatic prefix caching." "without automatic prefix caching."
@ -192,11 +191,5 @@ def create_argument_parser():
) )
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
Benchmark the efficiency of prefix caching. Benchmark the efficiency of prefix caching.
@ -218,7 +217,7 @@ def main(args):
) )
def create_argument_parser(): if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the performance with or without " description="Benchmark the performance with or without "
"automatic prefix caching." "automatic prefix caching."
@ -268,11 +267,5 @@ def create_argument_parser():
) )
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline prioritization.""" """Benchmark offline prioritization."""
import argparse import argparse
@ -161,7 +160,7 @@ def main(args: argparse.Namespace):
json.dump(results, f, indent=4) json.dump(results, f, indent=4)
def create_argument_parser(): if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument( parser.add_argument(
"--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm" "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm"
@ -204,12 +203,6 @@ def create_argument_parser():
) )
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args() args = parser.parse_args()
if args.tokenizer is None: if args.tokenizer is None:
args.tokenizer = args.model args.tokenizer = args.model
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
r"""Benchmark online serving throughput. r"""Benchmark online serving throughput.
On the server side, run one of the following commands: On the server side, run one of the following commands:
@ -875,7 +874,7 @@ def main(args: argparse.Namespace):
save_to_pytorch_benchmark_format(args, result_json, file_name) save_to_pytorch_benchmark_format(args, result_json, file_name)
def create_argument_parser(): if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the online serving throughput." description="Benchmark the online serving throughput."
) )
@ -1225,10 +1224,6 @@ def create_argument_parser():
"script chooses a LoRA module at random.", "script chooses a LoRA module at random.",
) )
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
r"""Benchmark online serving throughput with structured outputs. r"""Benchmark online serving throughput with structured outputs.
On the server side, run one of the following commands: On the server side, run one of the following commands:
@ -12,6 +11,7 @@ On the client side, run:
--model <your_model> \ --model <your_model> \
--dataset json \ --dataset json \
--structured-output-ratio 1.0 \ --structured-output-ratio 1.0 \
--structured-output-backend auto \
--request-rate 10 \ --request-rate 10 \
--num-prompts 1000 --num-prompts 1000
@ -850,7 +850,7 @@ def main(args: argparse.Namespace):
json.dump(results, outfile, indent=4) json.dump(results, outfile, indent=4)
def create_argument_parser(): if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the online serving throughput." description="Benchmark the online serving throughput."
) )
@ -1034,10 +1034,5 @@ def create_argument_parser():
help="Ratio of Structured Outputs requests", help="Ratio of Structured Outputs requests",
) )
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline inference throughput.""" """Benchmark offline inference throughput."""
import argparse import argparse
@ -595,7 +594,7 @@ def validate_args(args):
) )
def create_argument_parser(): if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument( parser.add_argument(
"--backend", "--backend",
@ -717,12 +716,6 @@ def create_argument_parser():
) )
parser = AsyncEngineArgs.add_cli_args(parser) parser = AsyncEngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args() args = parser.parse_args()
if args.tokenizer is None: if args.tokenizer is None:
args.tokenizer = args.model args.tokenizer = args.model
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import json import json
@ -66,9 +65,4 @@ class InfEncoder(json.JSONEncoder):
def write_to_json(filename: str, records: list) -> None: def write_to_json(filename: str, records: list) -> None:
with open(filename, "w") as f: with open(filename, "w") as f:
json.dump( json.dump(records, f, cls=InfEncoder)
records,
f,
cls=InfEncoder,
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
)
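One side of this hunk passes a default= fallback to json.dump so that values the encoder cannot handle degrade to a placeholder string instead of raising. A small stand-alone illustration of that behavior:

import json

class Unserializable:
    pass

record = {"ok": 1, "bad": Unserializable()}
print(json.dumps(record,
                 default=lambda o: f"<{type(o).__name__} object is not JSON serializable>"))
# {"ok": 1, "bad": "<Unserializable object is not JSON serializable>"}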
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import copy import copy
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Cutlass bench utils # Cutlass bench utils
from collections.abc import Iterable from collections.abc import Iterable
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import copy import copy
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Weight Shapes are in the format # Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM) # ([K, N], TP_SPLIT_DIM)
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os import os
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio import asyncio
import itertools import itertools
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json import json
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pickle as pkl import pickle as pkl
import time import time
View File
@ -4,85 +4,11 @@ import copy
import itertools import itertools
import torch import torch
import triton
from weight_shapes import WEIGHT_SHAPES from weight_shapes import WEIGHT_SHAPES
from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"fp8-tensor-w-token-a": dict(
w="tensor", a="token", no_a_quant=False, enabled=False
),
"fp8-tensor-w-tensor-a": dict(
w="tensor", a="tensor", no_a_quant=False, enabled=True
),
"fp8-channel-w-token-a": dict(
w="channel", a="token", no_a_quant=False, enabled=True
),
"fp8-channel-w-tensor-a": dict(
w="channel", a="tensor", no_a_quant=False, enabled=False
),
"fp8-tensor-w-token-a-noquant": dict(
w="tensor", a="token", no_a_quant=True, enabled=False
),
"fp8-tensor-w-tensor-a-noquant": dict(
w="tensor", a="tensor", no_a_quant=True, enabled=True
),
"fp8-channel-w-token-a-noquant": dict(
w="channel", a="token", no_a_quant=True, enabled=True
),
"fp8-channel-w-tensor-a-noquant": dict(
w="channel", a="tensor", no_a_quant=True, enabled=False
),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def _quant_weight_fp8(b: torch.Tensor, w_type: str, device: str):
if w_type == "tensor":
scale_b = torch.ones(1, device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
else:
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, use_per_token_if_dynamic=True)
return b_fp8.t(), scale_b_fp8
def build_fp8_runner(cfg, a, b, dtype, device):
b_fp8, scale_b_fp8 = _quant_weight_fp8(b, cfg["w"], device)
scale_a_const = (
torch.ones(1, device=device, dtype=torch.float32)
if cfg["a"] == "tensor"
else None
)
if cfg["no_a_quant"]:
if cfg["a"] == "tensor":
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
else:
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
def run():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
return run
if cfg["a"] == "tensor":
def run():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
else:
def run():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
return run
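As a usage note for the config-driven variant above, a single provider can be timed outside the perf_report harness roughly like this (a sketch only: the tensor shapes and the chosen config key are illustrative, and it reuses PROVIDER_CFGS, build_fp8_runner and the triton import from this file):

import torch

a = torch.randn((16, 4096), device="cuda", dtype=torch.bfloat16)    # illustrative (M, K)
b = torch.randn((8192, 4096), device="cuda", dtype=torch.bfloat16)  # illustrative (N, K)
cfg = PROVIDER_CFGS["fp8-channel-w-token-a"]
run = build_fp8_runner(cfg, a, b, torch.bfloat16, "cuda")
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(run, quantiles=[0.5, 0.2, 0.8])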
@triton.testing.perf_report( @triton.testing.perf_report(
@ -91,8 +17,28 @@ def build_fp8_runner(cfg, a, b, dtype, device):
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
x_log=False, x_log=False,
line_arg="provider", line_arg="provider",
line_vals=_enabled, line_vals=[
line_names=_enabled, "torch-bf16",
# "fp8-tensor-w-token-a",
"fp8-tensor-w-tensor-a",
"fp8-channel-w-token-a",
# "fp8-channel-w-tensor-a",
# "fp8-tensor-w-token-a-noquant",
"fp8-tensor-w-tensor-a-noquant",
"fp8-channel-w-token-a-noquant",
# "fp8-channel-w-tensor-a-noquant",
],
line_names=[
"torch-bf16",
# "fp8-tensor-w-token-a",
"fp8-tensor-w-tensor-a",
"fp8-channel-w-token-a",
# "fp8-channel-w-tensor-a",
# "fp8-tensor-w-token-a-noquant",
"fp8-tensor-w-tensor-a-noquant",
"fp8-channel-w-token-a-noquant",
# "fp8-channel-w-tensor-a-noquant",
],
ylabel="TFLOP/s (larger is better)", ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs FP8 GEMMs", plot_name="BF16 vs FP8 GEMMs",
args={}, args={},
@ -103,34 +49,144 @@ def benchmark(batch_size, provider, N, K):
device = "cuda" device = "cuda"
dtype = torch.bfloat16 dtype = torch.bfloat16
# Create input tensors
a = torch.randn((M, K), device=device, dtype=dtype) a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype) b = torch.randn((N, K), device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8] quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16": if "torch-bf16" in provider:
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
) )
else:
cfg = PROVIDER_CFGS[provider] elif "fp8" in provider:
run_quant = build_fp8_runner(cfg, a, b, dtype, device) # Weights are always quantized ahead of time
if "noquant" in provider:
# For no quantization, we just measure the GEMM
if "tensor-w-token-a" in provider:
# Dynamic per-token quant for A, per-tensor quant for B
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
assert scale_b_fp8.numel() == 1
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
a, use_per_token_if_dynamic=True
)
def run_quant():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "tensor-w-tensor-a" in provider:
# Static per-tensor quantization with fixed scales
# for both A and B
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
assert scale_b_fp8.numel() == 1
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
def run_quant():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "channel-w-token-a" in provider:
# Static per-channel quantization for weights, per-token
# quant for A
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
assert scale_b_fp8.numel() == N
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
a, use_per_token_if_dynamic=True
)
def run_quant():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "channel-w-tensor-a" in provider:
# Static per-channel quantization for weights, per-tensor
# quant for A
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
assert scale_b_fp8.numel() == N
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
def run_quant():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
else:
# In these cases, we quantize the activations during the GEMM call
if "tensor-w-token-a" in provider:
# Dynamic per-token quant for A, per-tensor quant for B
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
assert scale_b_fp8.numel() == 1
def run_quant():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
a, use_per_token_if_dynamic=True
)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "tensor-w-tensor-a" in provider:
# Static per-tensor quantization with fixed scales
# for both A and B
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
assert scale_b_fp8.numel() == 1
def run_quant():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "channel-w-token-a" in provider:
# Static per-channel quantization for weights, per-token
# quant for A
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
assert scale_b_fp8.numel() == N
def run_quant():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
a, use_per_token_if_dynamic=True
)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "channel-w-tensor-a" in provider:
# Static per-channel quantization for weights, per-tensor
# quant for A
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
assert scale_b_fp8.numel() == N
def run_quant():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
b_fp8 = b_fp8.t()
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), quantiles=quantiles lambda: run_quant(), quantiles=quantiles
) )
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) # Calculate TFLOP/s, two flops per multiply-add
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3)
return tflops(ms), tflops(max_ms), tflops(min_ms)
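Both sides convert the measured latency into TFLOP/s with the same 2*M*N*K formula (two flops per multiply-add). A quick worked example with made-up numbers:

M, N, K = 16, 4096, 4096        # illustrative GEMM shape
ms = 0.05                        # illustrative measured latency in milliseconds
flops = 2 * M * N * K            # ~5.37e8 floating point operations
tflops = flops * 1e-12 / (ms * 1e-3)
print(f"{tflops:.2f} TFLOP/s")   # ~10.74 TFLOP/s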
def prepare_shapes(args): def prepare_shapes(args):
out = [] KN_model_names = []
for model, tp_size in itertools.product(args.models, args.tp_sizes): models_tps = list(itertools.product(args.models, args.tp_sizes))
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): for model, tp_size in models_tps:
KN[tp_dim] //= tp_size assert model in WEIGHT_SHAPES
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
KN.append(model) KN.append(model)
out.append(KN) KN_model_names.append(KN)
return out return KN_model_names
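In both variants, prepare_shapes divides the tensor-parallel-sharded dimension of each [K, N] weight shape by the TP size before benchmarking. A self-contained illustration with a made-up shape table (the real values come from weight_shapes.WEIGHT_SHAPES):

import copy
import itertools

WEIGHT_SHAPES_DEMO = {"demo-model": [([4096, 6144], 1), ([4096, 4096], 0)]}  # made-up

def prepare_shapes_demo(models, tp_sizes):
    out = []
    for model, tp_size in itertools.product(models, tp_sizes):
        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES_DEMO[model]):
            KN[tp_dim] //= tp_size          # split the sharded dimension across TP ranks
            KN.append(model)
            out.append(KN)
    return out

print(prepare_shapes_demo(["demo-model"], [2]))
# [[4096, 3072, 'demo-model'], [2048, 4096, 'demo-model']]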
if __name__ == "__main__": if __name__ == "__main__":
@ -140,13 +196,21 @@ if __name__ == "__main__":
nargs="+", nargs="+",
type=str, type=str,
default=["meta-llama/Llama-3.1-8B-Instruct"], default=["meta-llama/Llama-3.1-8B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()), choices=[*WEIGHT_SHAPES.keys()],
help="List of models to benchmark",
)
parser.add_argument(
"--tp-sizes",
nargs="+",
type=int,
default=[1],
help="List of tensor parallel sizes",
) )
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
args = parser.parse_args() args = parser.parse_args()
for K, N, model in prepare_shapes(args): KN_model_names = prepare_shapes(args)
print(f"{model}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:") for K, N, model_name in KN_model_names:
print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
benchmark.run( benchmark.run(
print_data=True, print_data=True,
show_plots=True, show_plots=True,
View File
@ -1,169 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools
import torch
from weight_shapes import WEIGHT_SHAPES
from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"int8-tensor-w-token-a": dict(
w="tensor", a="token", no_a_quant=False, enabled=False
),
"int8-tensor-w-tensor-a": dict(
w="tensor", a="tensor", no_a_quant=False, enabled=True
),
"int8-channel-w-token-a": dict(
w="channel", a="token", no_a_quant=False, enabled=True
),
"int8-channel-w-tensor-a": dict(
w="channel", a="tensor", no_a_quant=False, enabled=False
),
"int8-tensor-w-token-a-noquant": dict(
w="tensor", a="token", no_a_quant=True, enabled=False
),
"int8-tensor-w-tensor-a-noquant": dict(
w="tensor", a="tensor", no_a_quant=True, enabled=True
),
"int8-channel-w-token-a-noquant": dict(
w="channel", a="token", no_a_quant=True, enabled=True
),
"int8-channel-w-tensor-a-noquant": dict(
w="channel", a="tensor", no_a_quant=True, enabled=False
),
}
def _quant_weight(b, w_type, device):
if w_type == "tensor":
scale_b = torch.ones(1, device=device, dtype=torch.float32)
b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b)
assert scale_b_int8.numel() == 1
else: # channel
b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b)
assert scale_b_int8.numel() == b.shape[0]
return b_int8.t(), scale_b_int8
def build_int8_runner(cfg, a, b, dtype, device):
# quant before running the kernel
b_int8, scale_b_int8 = _quant_weight(b, cfg["w"], device)
scale_a_const = None
if cfg["a"] == "tensor":
scale_a_const = torch.ones(1, device=device, dtype=torch.float32)
# no quant, create activation ahead
if cfg["no_a_quant"]:
if cfg["a"] == "tensor":
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const)
else: # token
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
def run_quant():
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
return run_quant
# dynamic quant, create activation inside
if cfg["a"] == "tensor":
def run_quant():
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const)
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
else: # token
def run_quant():
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
return run_quant
_enabled = [k for k, v in PROVIDER_CFGS.items() if v.get("enabled")]
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=[k for k in _enabled],
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs INT8 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
)
else:
cfg = PROVIDER_CFGS[provider]
run_quant = build_int8_runner(cfg, a, b, dtype, device)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
def prepare_shapes(args):
KN_model_names = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
KN_model_names.append(KN)
return KN_model_names
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.1-8B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
help="List of models to benchmark",
)
parser.add_argument(
"--tp-sizes",
nargs="+",
type=int,
default=[1],
help="List of tensor parallel sizes",
)
args = parser.parse_args()
for K, N, model in prepare_shapes(args):
print(f"{model}, N={N} K={K}, BF16 vs INT8 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_int8_res_n{N}_k{K}",
N=N,
K=K,
)
print("Benchmark finished!")
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os import os
import sys import sys
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) Microsoft Corporation. # Copyright (c) Microsoft Corporation.
# Licensed under the MIT License. # Licensed under the MIT License.
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit
@ -91,7 +90,7 @@ def bench_run(
score = torch.randn((m, num_experts), device=device, dtype=dtype) score = torch.randn((m, num_experts), device=device, dtype=dtype)
topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False) topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
quant_blocksize = 16 quant_blocksize = 16
w1_blockscale = torch.empty( w1_blockscale = torch.empty(
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch import torch
import torch.utils.benchmark as benchmark import torch.utils.benchmark as benchmark
@ -7,8 +6,8 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import ( from vllm.model_executor.layers.fused_moe.fused_moe import (
cutlass_moe_fp8,
fused_experts, fused_experts,
fused_topk, fused_topk,
) )
@ -70,9 +69,18 @@ def bench_run(
w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
for expert in range(num_experts): for expert in range(num_experts):
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert]) w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert]) w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
w1_q_notransp = w1_q.clone()
w2_q_notransp = w2_q.clone()
w1_q = w1_q.transpose(1, 2)
w2_q = w2_q.transpose(1, 2)
score = torch.randn((m, num_experts), device="cuda", dtype=dtype) score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
@ -113,6 +121,10 @@ def bench_run(
w2_scale: torch.Tensor, w2_scale: torch.Tensor,
topk_weights: torch.Tensor, topk_weights: torch.Tensor,
topk_ids: torch.Tensor, topk_ids: torch.Tensor,
ab_strides1: torch.Tensor,
c_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides2: torch.Tensor,
num_repeats: int, num_repeats: int,
): ):
for _ in range(num_repeats): for _ in range(num_repeats):
@ -120,10 +132,14 @@ def bench_run(
a, a,
w1, w1,
w2, w2,
topk_weights,
topk_ids,
w1_scale, w1_scale,
w2_scale, w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
a1_scale=a_scale, a1_scale=a_scale,
) )
@ -136,6 +152,10 @@ def bench_run(
w2_scale: torch.Tensor, w2_scale: torch.Tensor,
topk_weights: torch.Tensor, topk_weights: torch.Tensor,
topk_ids: torch.Tensor, topk_ids: torch.Tensor,
ab_strides1: torch.Tensor,
c_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides2: torch.Tensor,
): ):
with set_current_vllm_config( with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
@ -144,10 +164,14 @@ def bench_run(
a, a,
w1_q, w1_q,
w2_q, w2_q,
topk_weights,
topk_ids,
w1_scale, w1_scale,
w2_scale, w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
a1_scale=a_scale, a1_scale=a_scale,
) )
@ -193,6 +217,10 @@ def bench_run(
w2_scale, w2_scale,
topk_weights, topk_weights,
topk_ids, topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
) )
torch.cuda.synchronize() torch.cuda.synchronize()
@ -201,8 +229,8 @@ def bench_run(
with torch.cuda.graph(triton_graph, stream=triton_stream): with torch.cuda.graph(triton_graph, stream=triton_stream):
run_triton_from_graph( run_triton_from_graph(
a, a,
w1_q, w1_q_notransp,
w2_q, w2_q_notransp,
topk_weights, topk_weights,
topk_ids, topk_ids,
w1_scale, w1_scale,
@ -221,12 +249,18 @@ def bench_run(
"w2": w2, "w2": w2,
"score": score, "score": score,
"topk": topk, "topk": topk,
"w1_q_notransp": w1_q_notransp,
"w2_q_notransp": w2_q_notransp,
# Cutlass params # Cutlass params
"a_scale": a_scale, "a_scale": a_scale,
"w1_q": w1_q, "w1_q": w1_q,
"w2_q": w2_q, "w2_q": w2_q,
"w1_scale": w1_scale, "w1_scale": w1_scale,
"w2_scale": w2_scale, "w2_scale": w2_scale,
"ab_strides1": ab_strides1,
"c_strides1": c_strides1,
"ab_strides2": ab_strides2,
"c_strides2": c_strides2,
# cuda graph params # cuda graph params
"cutlass_graph": cutlass_graph, "cutlass_graph": cutlass_graph,
"triton_graph": triton_graph, "triton_graph": triton_graph,
@ -244,8 +278,8 @@ def bench_run(
# Warmup # Warmup
run_triton_moe( run_triton_moe(
a, a,
w1_q, w1_q_notransp,
w2_q, w2_q_notransp,
topk_weights, topk_weights,
topk_ids, topk_ids,
w1_scale, w1_scale,
@ -256,7 +290,7 @@ def bench_run(
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501 stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
@ -287,12 +321,16 @@ def bench_run(
w2_scale, w2_scale,
topk_weights, topk_weights,
topk_ids, topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
num_warmup, num_warmup,
) )
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)", # noqa: E501 stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time import time
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import copy import copy
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import copy import copy
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch import torch
import torch.utils.benchmark as benchmark import torch.utils.benchmark as benchmark
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import json import json
@ -7,6 +6,7 @@ import time
from contextlib import nullcontext from contextlib import nullcontext
from datetime import datetime from datetime import datetime
from itertools import product from itertools import product
from types import SimpleNamespace
from typing import Any, TypedDict from typing import Any, TypedDict
import ray import ray
@ -42,7 +42,7 @@ def benchmark_config(
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
num_iters: int = 100, num_iters: int = 100,
block_quant_shape: list[int] = None, block_quant_shape: List[int] = None,
use_deep_gemm: bool = False, use_deep_gemm: bool = False,
) -> float: ) -> float:
init_dtype = torch.float16 if use_fp8_w8a8 else dtype init_dtype = torch.float16 if use_fp8_w8a8 else dtype
@ -399,7 +399,7 @@ class BenchmarkWorker:
dtype: torch.dtype, dtype: torch.dtype,
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
block_quant_shape: list[int] = None, block_quant_shape: List[int] = None,
use_deep_gemm: bool = False, use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]: ) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed) current_platform.seed_everything(self.seed)
@ -531,7 +531,7 @@ def save_configs(
dtype: torch.dtype, dtype: torch.dtype,
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
block_quant_shape: list[int], block_quant_shape: List[int],
) -> None: ) -> None:
dtype_str = get_config_dtype_str( dtype_str = get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
@ -562,6 +562,7 @@ def main(args: argparse.Namespace):
config = get_config(model=args.model, trust_remote_code=args.trust_remote_code) config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
if args.model_prefix: if args.model_prefix:
config = getattr(config, args.model_prefix) config = getattr(config, args.model_prefix)
config = SimpleNamespace(**config)
if config.architectures[0] == "DbrxForCausalLM": if config.architectures[0] == "DbrxForCausalLM":
E = config.ffn_config.moe_num_experts E = config.ffn_config.moe_num_experts
@ -593,7 +594,11 @@ def main(args: argparse.Namespace):
shard_intermediate_size = 2 * intermediate_size // args.tp_size shard_intermediate_size = 2 * intermediate_size // args.tp_size
hidden_size = config.hidden_size hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype dtype = (
torch.float16
if current_platform.is_rocm()
else getattr(torch, config.torch_dtype)
)
use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16" use_int8_w8a16 = args.dtype == "int8_w8a16"
block_quant_shape = get_weight_block_size_safety(config) block_quant_shape = get_weight_block_size_safety(config)
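The getattr(torch, config.torch_dtype) form appears to go together with the SimpleNamespace(**config) wrapping earlier in this hunk: once the config is a plain namespace over the raw dict, torch_dtype is just the string from config.json and has to be resolved to an actual torch dtype. A minimal stand-alone illustration (the config contents are made up):

from types import SimpleNamespace

import torch

raw = {"architectures": ["LlamaForCausalLM"], "torch_dtype": "bfloat16"}  # made-up config
config = SimpleNamespace(**raw)
dtype = getattr(torch, config.torch_dtype)   # the string "bfloat16" -> torch.bfloat16
assert dtype is torch.bfloat16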
View File
@ -1,159 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import itertools
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
moe_align_block_size_triton,
)
from vllm.triton_utils import triton
def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
return torch.stack(
[
torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk]
for _ in range(num_tokens)
]
)
def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
"""
Verifies vllm vs. Triton
"""
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
# 1. malloc space for triton and vllm
# malloc enough space (max_num_tokens_padded) for the sorted ids
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
sorted_ids_triton = torch.empty(
(max_num_tokens_padded,), dtype=torch.int32, device="cuda"
)
sorted_ids_triton.fill_(topk_ids.numel()) # fill with sentinel value
expert_ids_triton = torch.zeros(
(max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
)
num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
sorted_ids_vllm.fill_(topk_ids.numel())
expert_ids_vllm = torch.zeros_like(expert_ids_triton)
num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
# 2. run implementations
moe_align_block_size_triton(
topk_ids,
num_experts,
block_size,
sorted_ids_triton,
expert_ids_triton,
num_tokens_post_pad_triton,
)
ops.moe_align_block_size(
topk_ids,
num_experts,
block_size,
sorted_ids_vllm,
expert_ids_vllm,
num_tokens_post_pad_vllm,
)
print(f"✅ VLLM implementation works with {num_experts} experts!")
# 3. compare results
if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
num_tokens_post_pad_triton, num_tokens_post_pad_vllm
):
print("✅ Triton and VLLM implementations match.")
else:
print("❌ Triton and VLLM implementations DO NOT match.")
print("Triton expert_ids:", expert_ids_triton)
print("VLLM expert_ids:", expert_ids_vllm)
print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
# test configurations
num_tokens_range = [1, 16, 256, 4096]
num_experts_range = [16, 64, 224, 256, 280, 512]
topk_range = [1, 2, 8]
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["num_tokens", "num_experts", "topk"],
x_vals=configs,
line_arg="provider",
line_vals=["vllm", "triton"], # "triton"
line_names=["VLLM", "Triton"], # "Triton"
plot_name="moe-align-block-size-performance",
args={},
)
)
def benchmark(num_tokens, num_experts, topk, provider):
"""Benchmark function for Triton."""
block_size = 256
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
sorted_ids.fill_(topk_ids.numel())
max_num_m_blocks = max_num_tokens_padded // block_size
expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
quantiles = [0.5, 0.2, 0.8]
if provider == "vllm":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: ops.moe_align_block_size(
topk_ids,
num_experts,
block_size,
sorted_ids.clone(),
expert_ids.clone(),
num_tokens_post_pad.clone(),
),
quantiles=quantiles,
)
elif provider == "triton":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: moe_align_block_size_triton(
topk_ids,
num_experts,
block_size,
sorted_ids.clone(),
expert_ids.clone(),
num_tokens_post_pad.clone(),
),
quantiles=quantiles,
)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--num_experts",
type=int,
default=64,
choices=[8, 16, 32, 64, 128, 256],
)
parser.add_argument(
"--topk",
type=int,
default=8,
choices=[2, 4, 8],
help="Top-k value for correctness check.",
)
args = parser.parse_args()
print("Running correctness check...")
check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
benchmark.run(print_data=True, show_plots=True)
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
from typing import Any, TypedDict from typing import Any, TypedDict
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random import random
import time import time
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time import time
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools import itertools
from typing import Optional, Union from typing import Optional, Union
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from itertools import accumulate from itertools import accumulate
from typing import Optional from typing import Optional
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
WEIGHT_SHAPES = { WEIGHT_SHAPES = {
"ideal": [[4 * 256 * 32, 256 * 32]], "ideal": [[4 * 256 * 32, 256 * 32]],
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from sglang quantization/tuning_block_wise_kernel.py # Adapted from sglang quantization/tuning_block_wise_kernel.py
import argparse import argparse
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# fmt: off # fmt: off
# ruff: noqa: E501 # ruff: noqa: E501
import time import time
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math import math
import pickle import pickle
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses import dataclasses
from collections.abc import Iterable from collections.abc import Iterable
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Weight Shapes are in the format # Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM) # ([K, N], TP_SPLIT_DIM)
View File
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import cProfile import cProfile
import pstats import pstats
View File
@ -75,7 +75,6 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
else() else()
find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx2" AVX2_FOUND)
find_isa(${CPUINFO} "avx512f" AVX512_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
find_isa(${CPUINFO} "Power11" POWER11_FOUND)
find_isa(${CPUINFO} "POWER10" POWER10_FOUND) find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
find_isa(${CPUINFO} "POWER9" POWER9_FOUND) find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
@ -107,19 +106,13 @@ elseif (AVX2_FOUND)
list(APPEND CXX_COMPILE_FLAGS "-mavx2") list(APPEND CXX_COMPILE_FLAGS "-mavx2")
message(WARNING "vLLM CPU backend using AVX2 ISA") message(WARNING "vLLM CPU backend using AVX2 ISA")
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) elseif (POWER9_FOUND OR POWER10_FOUND)
message(STATUS "PowerPC detected") message(STATUS "PowerPC detected")
if (POWER9_FOUND) # Check for PowerPC VSX support
list(APPEND CXX_COMPILE_FLAGS list(APPEND CXX_COMPILE_FLAGS
"-mvsx" "-mvsx"
"-mcpu=power9" "-mcpu=native"
"-mtune=power9") "-mtune=native")
elseif (POWER10_FOUND OR POWER11_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=power10"
"-mtune=power10")
endif()
elseif (ASIMD_FOUND) elseif (ASIMD_FOUND)
message(STATUS "ARMv8 or later architecture detected") message(STATUS "ARMv8 or later architecture detected")
View File
@ -38,7 +38,7 @@ else()
FetchContent_Declare( FetchContent_Declare(
vllm-flash-attn vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 763ad155a1c826f71ff318f41edb1e4e5e376ddb GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types # Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
View File
@ -1,6 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# #
# A command line tool for running pytorch's hipify preprocessor on CUDA # A command line tool for running pytorch's hipify preprocessor on CUDA
View File
@ -122,7 +122,6 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
"-DENABLE_FP8" "-DENABLE_FP8"
"-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_CONVERSIONS__"
"-U__HIP_NO_HALF_OPERATORS__" "-U__HIP_NO_HALF_OPERATORS__"
"-Werror=unused-variable"
"-fno-gpu-rdc") "-fno-gpu-rdc")
endif() endif()
View File
@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options(
{static_cast<ElementOut*>(out.data_ptr()), stride_O, {static_cast<ElementOut*>(out.data_ptr()), stride_O,
static_cast<ElementAcc*>(nullptr), stride_LSE}, static_cast<ElementAcc*>(nullptr), stride_LSE},
hw_info, hw_info,
1, // split_kv -1, // split_kv
nullptr, // is_var_split_kv nullptr, // is_var_split_kv
}; };
// TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
View File
@ -65,6 +65,9 @@ void paged_attention_v1_launcher(
int kv_block_stride = key_cache.stride(0); int kv_block_stride = key_cache.stride(0);
int kv_head_stride = key_cache.stride(1); int kv_head_stride = key_cache.stride(1);
[[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
assert(head_size % thread_group_size == 0);
// NOTE: alibi_slopes is optional. // NOTE: alibi_slopes is optional.
const float* alibi_slopes_ptr = const float* alibi_slopes_ptr =
alibi_slopes alibi_slopes
@ -190,4 +193,4 @@ void paged_attention_v1(
#undef WARP_SIZE #undef WARP_SIZE
#undef MAX #undef MAX
#undef MIN #undef MIN
#undef DIVIDE_ROUND_UP #undef DIVIDE_ROUND_UP
View File
@ -66,6 +66,9 @@ void paged_attention_v2_launcher(
int kv_block_stride = key_cache.stride(0); int kv_block_stride = key_cache.stride(0);
int kv_head_stride = key_cache.stride(1); int kv_head_stride = key_cache.stride(1);
[[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
assert(head_size % thread_group_size == 0);
// NOTE: alibi_slopes is optional. // NOTE: alibi_slopes is optional.
const float* alibi_slopes_ptr = const float* alibi_slopes_ptr =
alibi_slopes alibi_slopes
@ -200,4 +203,4 @@ void paged_attention_v2(
#undef WARP_SIZE #undef WARP_SIZE
#undef MAX #undef MAX
#undef MIN #undef MIN
#undef DIVIDE_ROUND_UP #undef DIVIDE_ROUND_UP
View File
@ -137,8 +137,8 @@ FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
} }
template <typename T> template <typename T>
FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data, FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data,
const int size) { const int size) {
T max = max_data[0]; T max = max_data[0];
for (int i = 1; i < size; ++i) { for (int i = 1; i < size; ++i) {
max = max >= max_data[i] ? max : max_data[i]; max = max >= max_data[i] ? max : max_data[i];
@ -634,7 +634,7 @@ struct paged_attention_v2_impl {
if (partition_num == 1) continue; if (partition_num == 1) continue;
reducePartitionSoftmax( reducePartitonSoftmax(
max_logits + seq_idx * num_heads * max_num_partitions + max_logits + seq_idx * num_heads * max_num_partitions +
head_idx * max_num_partitions, head_idx * max_num_partitions,
exp_sums + seq_idx * num_heads * max_num_partitions + exp_sums + seq_idx * num_heads * max_num_partitions +
View File
@ -83,7 +83,7 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
explicit FP16Vec16(const void* ptr) explicit FP16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
// non-temporal load // non-temproal load
explicit FP16Vec16(bool, void* ptr) explicit FP16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {} : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
@ -120,7 +120,7 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
explicit BF16Vec16(const void* ptr) explicit BF16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
// non-temporal load // non-temproal load
explicit BF16Vec16(bool, void* ptr) explicit BF16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {} : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
@ -327,7 +327,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
// normal load // normal load
explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {} explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}
// non-temporal load // non-temproal load
explicit FP32Vec16(bool, void* ptr) explicit FP32Vec16(bool, void* ptr)
: reg((__m512)_mm512_stream_load_si512(ptr)) {} : reg((__m512)_mm512_stream_load_si512(ptr)) {}
@ -576,7 +576,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
// normal load // normal load
explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {} explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
// non-temporal load // non-temproal load
explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {} explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); } void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
@ -587,7 +587,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
_mm512_mask_storeu_epi8(ptr, mask, reg); _mm512_mask_storeu_epi8(ptr, mask, reg);
} }
// non-temporal save // non-temproal save
void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); } void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
}; };
#endif #endif
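The comment fixes above all touch the non-temporal (streaming) load constructors, which wrap the _mm256/_mm512 stream-load intrinsics to bypass the cache hierarchy. A standalone sketch of the 256-bit variant; it assumes an AVX2 build (compile with -mavx2) and 32-byte aligned buffers, which the streaming load requires:

```cpp
#include <immintrin.h>
#include <cstdio>

int main() {
  // _mm256_stream_load_si256 requires 32-byte alignment.
  alignas(32) int src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  alignas(32) int dst[8];

  // Non-temporal load: hints that the data should not pollute the caches.
  __m256i v = _mm256_stream_load_si256(reinterpret_cast<__m256i*>(src));
  // Regular aligned store back, just to inspect the result.
  _mm256_store_si256(reinterpret_cast<__m256i*>(dst), v);

  for (int i = 0; i < 8; ++i) std::printf("%d ", dst[i]);
  std::printf("\n");
  return 0;
}
```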


@ -54,7 +54,8 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
*(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp); *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
int page_num = numa_migrate_pages(pid, src_mask, mask); int page_num = numa_migrate_pages(pid, src_mask, mask);
if (page_num == -1) { if (page_num == -1) {
TORCH_WARN("numa_migrate_pages failed. errno: " + std::to_string(errno)); TORCH_CHECK(false,
"numa_migrate_pages failed. errno: " + std::to_string(errno));
} }
// restrict memory allocation node. // restrict memory allocation node.
@ -104,4 +105,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
return ss.str(); return ss.str();
} }
#endif #endif
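The change above turns a failed numa_migrate_pages into a hard error instead of a warning. A hedged sketch of the libnuma call and the errno check (requires Linux with libnuma, link with -lnuma; the node choices are placeholders, not vLLM's binding policy):

```cpp
#include <numa.h>
#include <unistd.h>
#include <cerrno>
#include <cstdio>

int main() {
  if (numa_available() < 0) {
    std::fprintf(stderr, "NUMA is not available on this system\n");
    return 1;
  }
  struct bitmask* src_mask = numa_allocate_nodemask();
  struct bitmask* dst_mask = numa_allocate_nodemask();
  numa_bitmask_setbit(src_mask, 0);
  numa_bitmask_setbit(dst_mask, 0);  // node 0 -> node 0: effectively a no-op

  // Returns the number of pages that could not be moved, or -1 on error.
  int not_moved = numa_migrate_pages(getpid(), src_mask, dst_mask);
  if (not_moved == -1) {
    std::fprintf(stderr, "numa_migrate_pages failed, errno=%d\n", errno);
  } else {
    std::printf("pages not moved: %d\n", not_moved);
  }
  numa_free_nodemask(src_mask);
  numa_free_nodemask(dst_mask);
  return 0;
}
```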


@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import enum import enum
from typing import Union from typing import Union


@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import glob import glob
import itertools import itertools
import os import os


@ -13,45 +13,232 @@
namespace vllm { namespace vllm {
namespace moe { namespace moe {
template <typename scalar_t> namespace {
__global__ void moe_align_block_size_kernel( __device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
const scalar_t* __restrict__ topk_ids, int32_t col) {
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, // don't worry about overflow because num_experts is relatively small
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts, return row * total_col + col;
int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size, }
size_t numel, int32_t* __restrict__ cumsum) { } // namespace
extern __shared__ int32_t shared_counts[];
const int warp_id = threadIdx.x / WARP_SIZE; template <typename scalar_t, typename token_cnts_t>
const int my_expert_start = warp_id * experts_per_warp; __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
int32_t* sorted_token_ids,
int32_t* expert_ids,
int32_t* total_tokens_post_pad,
int32_t num_experts,
int32_t block_size, size_t numel) {
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;
for (int i = 0; i < experts_per_warp; ++i) { extern __shared__ int32_t shared_mem[];
if (my_expert_start + i < padded_num_experts) { int32_t* cumsum = shared_mem; // 1d tensor with shape (num_experts + 1)
shared_counts[warp_id * experts_per_warp + i] = 0; token_cnts_t* tokens_cnts =
(token_cnts_t*)(shared_mem + num_experts +
1); // 2d tensor with shape (blockDim.x + 1, num_experts)
for (int i = 0; i < num_experts; ++i) {
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
}
/**
* In the first step we compute token_cnts[thread_index + 1][expert_index],
* which counts how many tokens in the token shard of thread_index are
* assigned to expert expert_index.
*/
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
}
__syncthreads();
// For each expert we accumulate the token counts from the different threads.
if (threadIdx.x < num_experts) {
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
for (int i = 1; i <= blockDim.x; ++i) {
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
} }
} }
__syncthreads(); __syncthreads();
const size_t tid = threadIdx.x; // We accumulate the token counts of all experts in thread 0.
const size_t stride = blockDim.x; if (threadIdx.x == 0) {
cumsum[0] = 0;
for (size_t i = tid; i < numel; i += stride) { for (int i = 1; i <= num_experts; ++i) {
int expert_id = topk_ids[i]; cumsum[i] = cumsum[i - 1] +
int warp_idx = expert_id / experts_per_warp; CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
int expert_offset = expert_id % experts_per_warp; block_size) *
atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1); block_size;
}
*total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
} }
__syncthreads(); __syncthreads();
/**
* For each expert, each thread processes the tokens of the corresponding
* blocks and stores the corresponding expert_id for each block.
*/
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[i / block_size] = threadIdx.x;
}
}
/**
* Each thread processes a token shard, calculating the index of each token
* after sorting by expert number. Given the example topk_ids =
* [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *,
* *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a
* padding value(preset in python).
*/
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int32_t expert_id = topk_ids[i];
/** The cumsum[expert_id] stores the starting index of the tokens that the
* expert with expert_id needs to process, and
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
* processed by the expert with expert_id within the current thread's token
* shard.
*/
int32_t rank_post_pad =
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
cumsum[expert_id];
sorted_token_ids[rank_post_pad] = i;
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
}
}
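Both versions of the kernel interleaved in this hunk implement the same contract, and the worked example in the comments (topk_ids = [0,1,2,1,2,3,0,3,4], block_size = 4) pins it down. A CPU reference sketch that reproduces that output; it is not the kernel, just the sequential definition of what sorted_token_ids, expert_ids and the padded total mean:

```cpp
#include <cstdio>
#include <vector>

// CPU reference for the alignment the kernel computes: tokens are grouped by
// expert and each expert's group is padded up to a multiple of block_size.
int main() {
  const std::vector<int> topk_ids = {0, 1, 2, 1, 2, 3, 0, 3, 4};  // example from the comment
  const int num_experts = 5;
  const int block_size = 4;
  const int pad = -1;  // placeholder; the real code presets this in Python

  std::vector<int> counts(num_experts, 0);
  for (int e : topk_ids) counts[e]++;

  // cumsum[e] = start offset of expert e after padding each count to block_size
  std::vector<int> cumsum(num_experts + 1, 0);
  for (int e = 0; e < num_experts; ++e) {
    int padded = (counts[e] + block_size - 1) / block_size * block_size;  // CEILDIV * block_size
    cumsum[e + 1] = cumsum[e] + padded;
  }
  int total_tokens_post_pad = cumsum[num_experts];

  std::vector<int> sorted_token_ids(total_tokens_post_pad, pad);
  std::vector<int> expert_ids(total_tokens_post_pad / block_size);
  for (int e = 0; e < num_experts; ++e)
    for (int i = cumsum[e]; i < cumsum[e + 1]; i += block_size)
      expert_ids[i / block_size] = e;

  std::vector<int> fill = cumsum;  // next free slot per expert
  for (int t = 0; t < (int)topk_ids.size(); ++t)
    sorted_token_ids[fill[topk_ids[t]]++] = t;

  for (int v : sorted_token_ids) std::printf("%d ", v);
  std::printf("\ntotal_tokens_post_pad=%d\n", total_tokens_post_pad);
  return 0;
}
```

Running it prints 0 6 -1 -1 1 3 -1 -1 2 4 -1 -1 5 7 -1 -1 8 -1 -1 -1, matching the commented example with -1 standing in for the padding value.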
// TODO(simon): this is temporarily adapted from
// https://github.com/sgl-project/sglang/commit/31548116a8dc8c6df7e146e0587335a59fc5b9d7
// we did this to unblock Deepseek V3 but there should be a better
// implementation to manage shared memory.
template <typename scalar_t>
__global__ void moe_align_block_size_global_mem_kernel(
scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
int32_t block_size, size_t numel, int32_t* tokens_cnts, int32_t* cumsum) {
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;
for (int i = 0; i < num_experts; ++i) {
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
}
/**
* In the first step we compute token_cnts[thread_index + 1][expert_index],
* which counts how many tokens in the token shard of thread_index are
* assigned to expert expert_index.
*/
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
}
__syncthreads();
// For each expert we accumulate the token counts from the different threads.
if (threadIdx.x < num_experts) {
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
for (int i = 1; i <= blockDim.x; ++i) {
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
}
}
__syncthreads();
// We accumulate the token counts of all experts in thread 0.
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
cumsum[i] = cumsum[i - 1] +
CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
block_size) *
block_size;
}
*total_tokens_post_pad = cumsum[num_experts];
}
__syncthreads();
/**
* For each expert, each thread processes the tokens of the corresponding
* blocks and stores the corresponding expert_id for each block.
*/
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[i / block_size] = threadIdx.x;
}
}
/**
* Each thread processes a token shard, calculating the index of each token
* after sorting by expert number. Given the example topk_ids =
* [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *,
* *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a
* padding value(preset in python).
*/
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int32_t expert_id = topk_ids[i];
/** The cumsum[expert_id] stores the starting index of the tokens that the
* expert with expert_id needs to process, and
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
* processed by the expert with expert_id within the current thread's token
* shard.
*/
int32_t rank_post_pad =
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
cumsum[expert_id];
sorted_token_ids[rank_post_pad] = i;
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
}
}
// taken from
// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
template <typename scalar_t>
__global__ void sgl_moe_align_block_size_kernel(
scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
int32_t block_size, size_t numel, int32_t* cumsum) {
__shared__ int32_t shared_counts[32][8];
const int warp_id = threadIdx.x / 32;
const int experts_per_warp = 8;
const int my_expert_start = warp_id * experts_per_warp;
// Initialize shared_counts for this warp's experts
for (int i = 0; i < experts_per_warp; ++i) {
if (my_expert_start + i < num_experts) {
shared_counts[warp_id][i] = 0;
}
}
__syncthreads();
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int expert_id = topk_ids[i];
int warp_idx = expert_id / experts_per_warp;
int expert_offset = expert_id % experts_per_warp;
atomicAdd(&shared_counts[warp_idx][expert_offset], 1);
}
__syncthreads();
// Single thread computes cumulative sum and total tokens
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
cumsum[0] = 0; cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) { for (int i = 1; i <= num_experts; ++i) {
int expert_count = 0; int expert_count = 0;
int warp_idx = (i - 1) / experts_per_warp; int warp_idx = (i - 1) / experts_per_warp;
int expert_offset = (i - 1) % experts_per_warp; int expert_offset = (i - 1) % experts_per_warp;
expert_count = shared_counts[warp_idx * experts_per_warp + expert_offset]; expert_count = shared_counts[warp_idx][expert_offset];
cumsum[i] = cumsum[i] =
cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size; cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size;
@ -61,6 +248,7 @@ __global__ void moe_align_block_size_kernel(
__syncthreads(); __syncthreads();
// Assign expert IDs to blocks
if (threadIdx.x < num_experts) { if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) { i += block_size) {
@ -69,11 +257,13 @@ __global__ void moe_align_block_size_kernel(
} }
} }
// taken from
// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
template <typename scalar_t> template <typename scalar_t>
__global__ void count_and_sort_expert_tokens_kernel( __global__ void sgl_moe_token_sort_kernel(scalar_t* __restrict__ topk_ids,
const scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer, int32_t* cumsum_buffer,
size_t numel) { size_t numel) {
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x; const size_t stride = blockDim.x * gridDim.x;
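The sort kernel renamed in this hunk scatters tokens into their padded slots; its body (past the shown context) claims a slot per token with an atomic increment on the per-expert cumsum buffer. A CPU sketch of that counting-sort step, using std::atomic in place of atomicAdd and reusing the padded offsets from the example above; order within an expert's group is intentionally nondeterministic, only the grouping matters:

```cpp
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  const std::vector<int> topk_ids = {0, 1, 2, 1, 2, 3, 0, 3, 4};
  // Padded per-expert start offsets (block_size = 4, 5 experts).
  std::atomic<int> cursor[5];
  const int starts[5] = {0, 4, 8, 12, 16};
  for (int e = 0; e < 5; ++e) cursor[e] = starts[e];

  std::vector<int> sorted_token_ids(20, -1);
  auto worker = [&](int begin, int end) {
    for (int t = begin; t < end; ++t) {
      // Like atomicAdd(&cumsum_buffer[expert_id], 1) in the kernel.
      int slot = cursor[topk_ids[t]].fetch_add(1);
      sorted_token_ids[slot] = t;
    }
  };
  std::thread a(worker, 0, 5), b(worker, 5, (int)topk_ids.size());
  a.join();
  b.join();

  for (int v : sorted_token_ids) std::printf("%d ", v);
  std::printf("\n");
  return 0;
}
```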
@ -100,138 +290,132 @@ __global__ void moe_sum_kernel(
} }
} }
template <typename scalar_t>
__global__ void moe_align_block_size_small_batch_expert_kernel(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
int32_t block_size, size_t numel) {
const size_t tid = threadIdx.x;
const size_t stride = blockDim.x;
extern __shared__ int32_t shared_mem[];
int32_t* cumsum = shared_mem;
int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
for (int i = 0; i < num_experts; ++i) {
tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0;
}
for (size_t i = tid; i < numel; i += stride) {
++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i]];
}
__syncthreads();
if (threadIdx.x < num_experts) {
tokens_cnts[threadIdx.x] = 0;
for (int i = 1; i <= blockDim.x; ++i) {
tokens_cnts[i * num_experts + threadIdx.x] +=
tokens_cnts[(i - 1) * num_experts + threadIdx.x];
}
}
__syncthreads();
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
cumsum[i] =
cumsum[i - 1] +
CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) *
block_size;
}
*total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
}
__syncthreads();
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[i / block_size] = threadIdx.x;
}
}
for (size_t i = tid; i < numel; i += stride) {
int32_t expert_id = topk_ids[i];
int32_t rank_post_pad =
tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
sorted_token_ids[rank_post_pad] = i;
++tokens_cnts[threadIdx.x * num_experts + expert_id];
}
}
} // namespace moe } // namespace moe
} // namespace vllm } // namespace vllm
// taken from
// https://github.com/sgl-project/sglang/blob/8b5f83ed3b7d2a49ad5c5cd5aa61c5d502f47dbc
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids, int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids, torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad) { torch::Tensor num_tokens_post_pad) {
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int64_t padded_num_experts = int device_max_shared_mem;
((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE; auto dev = topk_ids.get_device();
int experts_per_warp = WARP_SIZE; cudaDeviceGetAttribute(&device_max_shared_mem,
int threads = 1024; cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES( const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { const int32_t shared_mem_i32 =
// calc needed amount of shared mem for `cumsum` tensors ((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
auto options_int = const int32_t shared_mem_i16 =
torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device()); ((num_thread + 1) * num_experts) * sizeof(uint16_t) +
torch::Tensor cumsum_buffer = (num_experts + 1) * sizeof(int32_t);
torch::zeros({num_experts + 1}, options_int);
bool small_batch_expert_mode =
(topk_ids.numel() < 1024) && (num_experts <= 64);
if (small_batch_expert_mode) { bool use_global_memory = false;
const int32_t threads = max((int32_t)num_experts, WARP_SIZE); bool use_i16 = false; // Use uint16_t for shared memory token counts
const int32_t shared_mem_size = if (shared_mem_i32 < device_max_shared_mem) {
((threads + 1) * num_experts + (num_experts + 1)) * // Do nothing in this case. We're all set to use int32_t token counts
sizeof(int32_t); } else if (shared_mem_i16 < device_max_shared_mem &&
topk_ids.numel() <= 65535) {
// when the number of elements in topk_ids is smaller than 65535 (the max value of uint16),
// each element of token_cnts is also smaller than 65535,
// so we can use uint16 as the dtype of token_cnts
use_i16 = true;
} else {
use_global_memory = true;
}
auto small_batch_expert_kernel = if (use_global_memory) {
vllm::moe::moe_align_block_size_small_batch_expert_kernel< VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
scalar_t>; topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>( // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
// tensors
const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
auto options_int = torch::TensorOptions()
.dtype(torch::kInt)
.device(topk_ids.device());
torch::Tensor token_cnts_buffer =
torch::empty({(num_experts + 1) * num_experts}, options_int);
torch::Tensor cumsum_buffer =
torch::empty({num_experts + 1}, options_int);
auto kernel =
vllm::moe::moe_align_block_size_global_mem_kernel<scalar_t>;
kernel<<<1, num_thread, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel(), token_cnts_buffer.data_ptr<int32_t>(),
cumsum_buffer.data_ptr<int32_t>());
});
} else if (use_i16) {
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
// set dynamic shared mem
auto kernel =
vllm::moe::moe_align_block_size_kernel<scalar_t, uint16_t>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
(void*)kernel, shared_mem_i16));
kernel<<<1, num_thread, shared_mem_i16, stream>>>(
topk_ids.data_ptr<scalar_t>(), topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(), sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(), experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size, num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel()); topk_ids.numel());
} else { });
auto align_kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>; } else {
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp); topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
size_t shared_mem_size = auto kernel =
num_warps * experts_per_warp * sizeof(int32_t); vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
align_kernel<<<1, threads, shared_mem_size, stream>>>( (void*)kernel, shared_mem_i32));
kernel<<<1, num_thread, shared_mem_i32, stream>>>(
topk_ids.data_ptr<scalar_t>(), topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(), sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(), experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
padded_num_experts, experts_per_warp, block_size, topk_ids.numel());
topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>()); });
}
}
const int block_threads = std::min(256, (int)threads); void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
const int num_blocks = int64_t block_size,
(topk_ids.numel() + block_threads - 1) / block_threads; torch::Tensor sorted_token_ids,
const int max_blocks = 65535; torch::Tensor experts_ids,
const int actual_blocks = std::min(num_blocks, max_blocks); torch::Tensor num_tokens_post_pad) {
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
TORCH_CHECK(num_experts == 256,
"sgl_moe_align_block_size kernel only supports deepseek v3.");
auto sort_kernel = VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
vllm::moe::count_and_sort_expert_tokens_kernel<scalar_t>; topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] {
sort_kernel<<<actual_blocks, block_threads, 0, stream>>>( // calc needed amount of shared mem for `cumsum` tensors
topk_ids.data_ptr<scalar_t>(), auto options_int =
sorted_token_ids.data_ptr<int32_t>(), torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel()); torch::Tensor cumsum_buffer =
} torch::zeros({num_experts + 1}, options_int);
auto align_kernel =
vllm::moe::sgl_moe_align_block_size_kernel<scalar_t>;
align_kernel<<<1, 1024, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());
const int block_threads = 256;
const int num_blocks =
(topk_ids.numel() + block_threads - 1) / block_threads;
const int max_blocks = 65535;
const int actual_blocks = std::min(num_blocks, max_blocks);
auto sort_kernel = vllm::moe::sgl_moe_token_sort_kernel<scalar_t>;
sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
}); });
} }
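One side of the host code in this hunk sizes dynamic shared memory before choosing a kernel: it queries the per-block opt-in limit, then picks int32 token counts, uint16 token counts, or a global-memory fallback. A sketch of that selection policy in isolation (requires the CUDA runtime headers; num_thread, num_experts and numel are made-up inputs, not vLLM's values):

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  int dev = 0, max_shmem = 0;
  cudaGetDevice(&dev);
  cudaDeviceGetAttribute(&max_shmem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);

  const int num_experts = 256;
  const int num_thread = 1024;   // assumed launch width
  const long numel = 64 * 1024;  // assumed number of topk ids

  // Footprints mirroring shared_mem_i32 / shared_mem_i16 above.
  long shared_i32 = ((long)(num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int);
  long shared_i16 = (long)(num_thread + 1) * num_experts * sizeof(unsigned short) +
                    (num_experts + 1) * sizeof(int);

  const char* path;
  if (shared_i32 < max_shmem) {
    path = "int32 shared-memory kernel";
  } else if (shared_i16 < max_shmem && numel <= 65535) {
    // No counter can exceed numel, so uint16 counts are safe here.
    path = "uint16 shared-memory kernel";
  } else {
    path = "global-memory fallback kernel";
  }
  std::printf("max dynamic shared mem: %d bytes -> %s\n", max_shmem, path);
  return 0;
}
```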


@ -12,6 +12,12 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids, int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids, torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad); torch::Tensor num_tokens_post_pad);
void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size,
torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad);
#ifndef USE_ROCM #ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
torch::Tensor b_qweight, torch::Tensor b_scales, torch::Tensor b_qweight, torch::Tensor b_scales,
@ -24,8 +30,4 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
int64_t BLOCK_SIZE_K, int64_t bit); int64_t BLOCK_SIZE_K, int64_t bit);
#endif #endif
bool moe_permute_unpermute_supported(); bool moe_permute_unpermute_supported();
void shuffle_rows(const torch::Tensor& input_tensor,
const torch::Tensor& dst2src_map,
torch::Tensor& output_tensor);


@ -12,7 +12,7 @@ void moe_permute(
const torch::Tensor& input, // [n_token, hidden] const torch::Tensor& input, // [n_token, hidden]
const torch::Tensor& topk_weights, //[n_token, topk] const torch::Tensor& topk_weights, //[n_token, topk]
torch::Tensor& topk_ids, // [n_token, topk] torch::Tensor& topk_ids, // [n_token, topk]
const torch::Tensor& token_expert_indices, // [n_token, topk] const torch::Tensor& token_expert_indicies, // [n_token, topk]
const std::optional<torch::Tensor>& expert_map, // [n_expert] const std::optional<torch::Tensor>& expert_map, // [n_expert]
int64_t n_expert, int64_t n_local_expert, int64_t topk, int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size, const std::optional<int64_t>& align_block_size,
@ -27,15 +27,15 @@ void moe_permute(
"expert_first_token_offset must be int64"); "expert_first_token_offset must be int64");
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int, TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
"topk_ids must be int32"); "topk_ids must be int32");
TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int, TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int,
"token_expert_indices must be int32"); "token_expert_indicies must be int32");
TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int, TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
"src_row_id2dst_row_id_map must be int32"); "src_row_id2dst_row_id_map must be int32");
TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1, TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
"expert_first_token_offset shape != n_local_expert+1") "expert_first_token_offset shape != n_local_expert+1")
TORCH_CHECK( TORCH_CHECK(
src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(), src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(),
"token_expert_indices shape must be same as src_row_id2dst_row_id_map"); "token_expert_indicies shape must be same as src_row_id2dst_row_id_map");
auto n_token = input.sizes()[0]; auto n_token = input.sizes()[0];
auto n_hidden = input.sizes()[1]; auto n_hidden = input.sizes()[1];
auto align_block_size_value = auto align_block_size_value =
@ -71,7 +71,7 @@ void moe_permute(
expert_map_ptr, n_expert, stream); expert_map_ptr, n_expert, stream);
} }
// expert sort topk expert id and scan expert id get expert_first_token_offset // expert sort topk expert id and scan expert id get expert_first_token_offset
sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices), sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indicies),
get_ptr<int>(permuted_experts_id), get_ptr<int>(permuted_experts_id),
get_ptr<int>(dst_row_id2src_row_id_map), get_ptr<int>(dst_row_id2src_row_id_map),
get_ptr<int64_t>(expert_first_token_offset), n_token, get_ptr<int64_t>(expert_first_token_offset), n_token,
@ -130,67 +130,11 @@ void moe_unpermute(
}); });
} }
template <typename T>
__global__ void shuffleInputRowsKernel(const T* input,
const int32_t* dst2src_map, T* output,
int64_t num_src_rows,
int64_t num_dst_rows, int64_t num_cols) {
int64_t dest_row_idx = blockIdx.x;
int64_t const source_row_idx = dst2src_map[dest_row_idx];
if (blockIdx.x < num_dst_rows) {
// Load 128-bits per thread
constexpr int64_t ELEM_PER_THREAD = 128 / sizeof(T) / 8;
using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
// Duplicate and permute rows
auto const* source_row_ptr =
reinterpret_cast<DataElem const*>(input + source_row_idx * num_cols);
auto* dest_row_ptr =
reinterpret_cast<DataElem*>(output + dest_row_idx * num_cols);
int64_t const start_offset = threadIdx.x;
int64_t const stride = blockDim.x;
int64_t const num_elems_in_col = num_cols / ELEM_PER_THREAD;
for (int elem_index = start_offset; elem_index < num_elems_in_col;
elem_index += stride) {
dest_row_ptr[elem_index] = source_row_ptr[elem_index];
}
}
}
void shuffle_rows(const torch::Tensor& input_tensor,
const torch::Tensor& dst2src_map,
torch::Tensor& output_tensor) {
TORCH_CHECK(input_tensor.scalar_type() == output_tensor.scalar_type(),
"Input and output tensors must have the same data type");
auto stream = at::cuda::getCurrentCUDAStream().stream();
int64_t const blocks = output_tensor.size(0);
int64_t const threads = 256;
int64_t const num_dest_rows = output_tensor.size(0);
int64_t const num_src_rows = input_tensor.size(0);
int64_t const num_cols = input_tensor.size(1);
TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)),
"num_cols must be divisible by 128 / "
"sizeof(input_tensor.scalar_type()) / 8");
MOE_DISPATCH(input_tensor.scalar_type(), [&] {
shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
dst2src_map.data_ptr<int32_t>(),
reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
num_dest_rows, num_cols);
});
}
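shuffle_rows, shown on one side of this hunk, gathers rows through dst2src_map and moves each row in 128-bit chunks, which is why num_cols must split evenly into 16-byte pieces. A CPU sketch of that chunked row gather (sizes and the map are arbitrary examples, not taken from vLLM):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  using T = uint16_t;  // e.g. fp16-sized elements
  const int num_src_rows = 3, num_dst_rows = 4, num_cols = 16;
  static_assert(num_cols % (16 / sizeof(T)) == 0, "row must split into 16-byte chunks");

  std::vector<T> input(num_src_rows * num_cols);
  for (size_t i = 0; i < input.size(); ++i) input[i] = (T)i;
  std::vector<T> output(num_dst_rows * num_cols);
  const int dst2src_map[num_dst_rows] = {2, 0, 0, 1};  // duplicating a source row is allowed

  const int chunks_per_row = num_cols * sizeof(T) / 16;
  for (int d = 0; d < num_dst_rows; ++d) {
    const char* src = reinterpret_cast<const char*>(input.data() + dst2src_map[d] * num_cols);
    char* dst = reinterpret_cast<char*>(output.data() + d * num_cols);
    for (int c = 0; c < chunks_per_row; ++c)  // one 16-byte chunk per "thread"
      std::memcpy(dst + c * 16, src + c * 16, 16);
  }
  std::printf("output row 0 starts with %u (copied from source row 2)\n", (unsigned)output[0]);
  return 0;
}
```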
#else #else
void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights, void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
torch::Tensor& topk_ids, torch::Tensor& topk_ids,
const torch::Tensor& token_expert_indices, const torch::Tensor& token_expert_indicies,
const std::optional<torch::Tensor>& expert_map, const std::optional<torch::Tensor>& expert_map,
int64_t n_expert, int64_t n_local_expert, int64_t topk, int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size, const std::optional<int64_t>& align_block_size,
@ -203,7 +147,7 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
void moe_unpermute(const torch::Tensor& input, void moe_unpermute(const torch::Tensor& input,
const torch::Tensor& topk_weights, torch::Tensor& topk_ids, const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
const torch::Tensor& token_expert_indices, const torch::Tensor& token_expert_indicies,
const std::optional<torch::Tensor>& expert_map, const std::optional<torch::Tensor>& expert_map,
int64_t n_expert, int64_t n_local_expert, int64_t topk, int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size, const std::optional<int64_t>& align_block_size,


@ -14,13 +14,12 @@
__VA_ARGS__(); \ __VA_ARGS__(); \
break; \ break; \
} }
#define MOE_DISPATCH_FLOAT_CASE(...) \ #define MOE_DISPATCH_FLOAT_CASE(...) \
MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \ MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \ MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)
MOE_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
#define MOE_DISPATCH(TYPE, ...) \ #define MOE_DISPATCH(TYPE, ...) \
MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__)) MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__))
@ -40,11 +39,6 @@ template <>
struct ScalarType2CudaType<at::ScalarType::BFloat16> { struct ScalarType2CudaType<at::ScalarType::BFloat16> {
using type = __nv_bfloat16; using type = __nv_bfloat16;
}; };
// uint8 for packed fp4
template <>
struct ScalarType2CudaType<at::ScalarType::Byte> {
using type = uint8_t;
};
// #if __CUDA_ARCH__ >= 890 // #if __CUDA_ARCH__ >= 890
// fp8 // fp8
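The ScalarType2CudaType specialization touched here is the usual tag-to-type trait: the dispatcher maps a runtime scalar-type enum onto the concrete element type a kernel should instantiate, with Byte mapping to uint8_t so two packed fp4 values ride in one byte. A minimal sketch of the pattern with hypothetical names (ScalarTag and TagToType are invented for illustration, not vLLM's identifiers):

```cpp
#include <cstdint>
#include <cstdio>

enum class ScalarTag { Float32, Float16Bits, Byte };

template <ScalarTag T> struct TagToType;
template <> struct TagToType<ScalarTag::Float32> { using type = float; };
template <> struct TagToType<ScalarTag::Float16Bits> { using type = uint16_t; };
// uint8_t doubles as the storage type for two packed fp4 values.
template <> struct TagToType<ScalarTag::Byte> { using type = uint8_t; };

int main() {
  std::printf("Byte maps to a %zu-byte storage type\n",
              sizeof(TagToType<ScalarTag::Byte>::type));
  return 0;
}
```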


@ -20,6 +20,7 @@ __global__ void expandInputRowsKernel(
int expert_id = sorted_experts[expanded_dest_row]; int expert_id = sorted_experts[expanded_dest_row];
extern __shared__ int64_t smem_expert_first_token_offset[]; extern __shared__ int64_t smem_expert_first_token_offset[];
int64_t align_expanded_row_accumulate = 0;
if constexpr (ALIGN_BLOCK_SIZE) { if constexpr (ALIGN_BLOCK_SIZE) {
// load g2s // load g2s
for (int idx = threadIdx.x; idx < num_local_experts + 1; for (int idx = threadIdx.x; idx < num_local_experts + 1;
@ -62,6 +63,7 @@ __global__ void expandInputRowsKernel(
using DataElem = cutlass::Array<T, ELEM_PER_THREAD>; using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
// Duplicate and permute rows // Duplicate and permute rows
int64_t const source_k_rank = expanded_source_row / num_rows;
int64_t const source_row = expanded_source_row % num_rows; int64_t const source_row = expanded_source_row % num_rows;
auto const* source_row_ptr = auto const* source_row_ptr =
@ -158,6 +160,7 @@ __global__ void finalizeMoeRoutingKernel(
elem_index += stride) { elem_index += stride) {
ComputeElem thread_output; ComputeElem thread_output;
thread_output.fill(0); thread_output.fill(0);
float row_rescale{0.f};
for (int k_idx = 0; k_idx < k; ++k_idx) { for (int k_idx = 0; k_idx < k; ++k_idx) {
int64_t const expanded_original_row = original_row + k_idx * num_rows; int64_t const expanded_original_row = original_row + k_idx * num_rows;
int64_t const expanded_permuted_row = int64_t const expanded_permuted_row =
@ -174,6 +177,8 @@ __global__ void finalizeMoeRoutingKernel(
auto const* expanded_permuted_rows_row_ptr = auto const* expanded_permuted_rows_row_ptr =
expanded_permuted_rows_v + expanded_permuted_row * num_elems_in_col; expanded_permuted_rows_v + expanded_permuted_row * num_elems_in_col;
int64_t const expert_idx = expert_for_source_row[k_offset];
ComputeElem expert_result = arrayConvert<InputElem, ComputeElem>( ComputeElem expert_result = arrayConvert<InputElem, ComputeElem>(
expanded_permuted_rows_row_ptr[elem_index]); expanded_permuted_rows_row_ptr[elem_index]);
thread_output = thread_output + row_scale * (expert_result); thread_output = thread_output + row_scale * (expert_result);


@ -425,7 +425,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \ #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>( \ topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>( \
gating_output, nullptr, topk_weights, topk_indices, \ gating_output, nullptr, topk_weights, topk_indicies, \
token_expert_indices, num_tokens, topk, 0, num_experts, \ token_expert_indices, num_tokens, topk, 0, num_experts, \
stream); stream);
@ -433,7 +433,7 @@ template <typename IndType>
void topkGatingSoftmaxKernelLauncher( void topkGatingSoftmaxKernelLauncher(
const float* gating_output, const float* gating_output,
float* topk_weights, float* topk_weights,
IndType* topk_indices, IndType* topk_indicies,
int* token_expert_indices, int* token_expert_indices,
float* softmax_workspace, float* softmax_workspace,
const int num_tokens, const int num_tokens,
@ -476,7 +476,7 @@ void topkGatingSoftmaxKernelLauncher(
moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>( moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
gating_output, nullptr, softmax_workspace, num_experts); gating_output, nullptr, softmax_workspace, num_experts);
moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>( moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices, softmax_workspace, nullptr, topk_weights, topk_indicies, token_expert_indices,
num_experts, topk, 0, num_experts); num_experts, topk, 0, num_experts);
} }
} }
@ -516,8 +516,9 @@ void topk_softmax(
topk, topk,
stream); stream);
} }
else if (topk_indices.scalar_type() == at::ScalarType::UInt32) else
{ {
assert(topk_indices.scalar_type() == at::ScalarType::UInt32);
vllm::moe::topkGatingSoftmaxKernelLauncher( vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(), gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(), topk_weights.data_ptr<float>(),
@ -529,17 +530,4 @@ void topk_softmax(
topk, topk,
stream); stream);
} }
else {
assert(topk_indices.scalar_type() == at::ScalarType::Int64);
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int64_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
}
} }
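topk_softmax, whose index-type dispatch changes in this hunk, computes a per-token softmax over the gating logits and keeps the k largest probabilities together with their expert ids. A CPU reference sketch for a single token (the logit values are arbitrary):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  std::vector<float> gating = {0.1f, 2.0f, -1.0f, 1.5f};  // one token, 4 experts
  const int topk = 2;

  // Numerically stable softmax over the experts.
  float mx = *std::max_element(gating.begin(), gating.end());
  std::vector<float> prob(gating.size());
  float denom = 0.f;
  for (size_t e = 0; e < gating.size(); ++e) {
    prob[e] = std::exp(gating[e] - mx);
    denom += prob[e];
  }
  for (float& p : prob) p /= denom;

  // Keep the top-k (weight, expert id) pairs.
  std::vector<int> idx(gating.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::partial_sort(idx.begin(), idx.begin() + topk, idx.end(),
                    [&](int a, int b) { return prob[a] > prob[b]; });

  for (int k = 0; k < topk; ++k)
    std::printf("topk_weights[%d]=%f topk_ids[%d]=%d\n", k, prob[idx[k]], k, idx[k]);
  return 0;
}
```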


@ -22,6 +22,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor! num_tokens_post_pad) -> ()"); " Tensor! num_tokens_post_pad) -> ()");
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
// temporarily adapted from
// https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a
m.def(
"sgl_moe_align_block_size(Tensor topk_ids, int num_experts,"
" int block_size, Tensor! sorted_token_ids,"
" Tensor! experts_ids,"
" Tensor! num_tokens_post_pad) -> ()");
m.impl("sgl_moe_align_block_size", torch::kCUDA, &sgl_moe_align_block_size);
#ifndef USE_ROCM #ifndef USE_ROCM
m.def( m.def(
"moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, " "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "
@ -57,7 +66,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
m.def( m.def(
"moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids," "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
"Tensor token_expert_indices, Tensor? expert_map, int n_expert," "Tensor token_expert_indicies, Tensor? expert_map, int n_expert,"
"int n_local_expert," "int n_local_expert,"
"int topk, int? align_block_size,Tensor! permuted_input, Tensor! " "int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
"expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! " "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
@ -72,12 +81,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
m.def("moe_permute_unpermute_supported() -> bool"); m.def("moe_permute_unpermute_supported() -> bool");
m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported); m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
// Row shuffle for MoE
m.def(
"shuffle_rows(Tensor input_tensor, Tensor dst2src_map, Tensor! "
"output_tensor) -> ()");
m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows);
#endif #endif
} }
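The registrations above follow PyTorch's def/impl split: declare the op schema once, then bind a backend-specific kernel to it. A minimal sketch of that pattern using the public TORCH_LIBRARY macros with a hypothetical namespace and op (my_ops::scale_rows is invented for illustration; it builds as part of a torch C++ extension rather than as a standalone program):

```cpp
#include <torch/library.h>
#include <ATen/ATen.h>

// A trivial CPU kernel for the hypothetical op.
at::Tensor scale_rows_cpu(const at::Tensor& input, double factor) {
  return input * factor;
}

// Declare the schema once...
TORCH_LIBRARY(my_ops, m) {
  m.def("scale_rows(Tensor input, float factor) -> Tensor");
}

// ...then bind a backend-specific implementation to it.
TORCH_LIBRARY_IMPL(my_ops, CPU, m) {
  m.impl("scale_rows", &scale_rows_cpu);
}
```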


@ -92,11 +92,6 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
torch::Tensor& weight, double epsilon); torch::Tensor& weight, double epsilon);
void apply_repetition_penalties_(torch::Tensor& logits,
const torch::Tensor& prompt_mask,
const torch::Tensor& output_mask,
const torch::Tensor& repetition_penalties);
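apply_repetition_penalties_ is only declared here; a CPU sketch of the conventional penalty rule it refers to, under the assumption that it follows the common formulation (logits of already-seen tokens are divided by the penalty when positive and multiplied by it when negative):

```cpp
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> logits = {2.0f, -1.0f, 0.5f, 3.0f};
  std::vector<bool> seen = {true, true, false, false};  // prompt_mask | output_mask
  const float penalty = 1.2f;

  for (size_t t = 0; t < logits.size(); ++t) {
    if (!seen[t]) continue;
    // Penalize tokens that already appeared in the prompt or the output.
    logits[t] = logits[t] > 0 ? logits[t] / penalty : logits[t] * penalty;
  }
  for (float l : logits) std::printf("%f ", l);
  std::printf("\n");
  return 0;
}
```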
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input, void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
torch::Tensor& weight, torch::Tensor& scale, torch::Tensor& weight, torch::Tensor& scale,
double epsilon); double epsilon);
@ -236,8 +231,7 @@ void cutlass_moe_mm(
torch::Tensor const& b_tensors, torch::Tensor const& a_scales, torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
torch::Tensor const& b_strides, torch::Tensor const& c_strides, torch::Tensor const& b_strides, torch::Tensor const& c_strides);
bool per_act_token, bool per_out_ch);
void cutlass_fp4_group_mm( void cutlass_fp4_group_mm(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
@ -249,16 +243,7 @@ void get_cutlass_moe_mm_data(
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
torch::Tensor& input_permutation, torch::Tensor& output_permutation, torch::Tensor& input_permutation, torch::Tensor& output_permutation,
const int64_t num_experts, const int64_t n, const int64_t k, const int64_t num_experts, const int64_t n, const int64_t k);
const std::optional<torch::Tensor>& blockscale_offsets);
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2,
const torch::Tensor& expert_num_tokens,
const int64_t num_local_experts,
const int64_t padded_m, const int64_t n,
const int64_t k);
void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a, void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b, torch::Tensor const& b,


@ -274,6 +274,7 @@ void advance_step_flashinfer(
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
[[maybe_unused]] int block_tables_stride = block_tables.stride(0);
TORCH_CHECK((blocks * threads > num_queries), TORCH_CHECK((blocks * threads > num_queries),
"multi-step: not enough threads to map to num_queries = ", "multi-step: not enough threads to map to num_queries = ",
num_queries, " block_tables.stride(0) = ", block_tables.stride(0), num_queries, " block_tables.stride(0) = ", block_tables.stride(0),


@ -1,17 +1,15 @@
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <torch/all.h> #include <torch/all.h>
#include <cmath> #include <cmath>
#include "../../dispatch_utils.h" #include "../../dispatch_utils.h"
#include "../vectorization_utils.cuh"
#ifndef USE_ROCM #ifndef USE_ROCM
#include <cub/cub.cuh>
#include <cub/util_type.cuh> #include <cub/util_type.cuh>
#include <cub/cub.cuh>
#else #else
#include <hipcub/hipcub.hpp>
#include <hipcub/util_type.hpp> #include <hipcub/util_type.hpp>
#include <hipcub/hipcub.hpp>
#endif #endif
static inline __device__ int8_t float_to_int8_rn(float x) { static inline __device__ int8_t float_to_int8_rn(float x) {
@ -105,170 +103,134 @@ static inline __device__ int8_t int32_to_int8(int32_t x) {
namespace vllm { namespace vllm {
template <typename scalar_t, typename scale_t> template <typename scalar_t, typename scale_type>
__global__ void static_scaled_int8_quant_kernel( __global__ void static_scaled_int8_quant_kernel(
const scalar_t* __restrict__ input, int8_t* __restrict__ output, scalar_t const* __restrict__ input, int8_t* __restrict__ out,
const scale_t* scale_ptr, const int hidden_size) { scale_type const* scale_ptr, const int hidden_size) {
const int tid = threadIdx.x; int const tid = threadIdx.x;
const int stride = blockDim.x; int64_t const token_idx = blockIdx.x;
const int64_t token_idx = blockIdx.x; scale_type const scale = *scale_ptr;
const float scale = *scale_ptr;
// Must be performed using 64-bit math to avoid integer overflow. // Must be performed using 64-bit math to avoid integer overflow.
const scalar_t* row_in = input + token_idx * hidden_size; out += token_idx * hidden_size;
int8_t* row_out = output + token_idx * hidden_size; input += token_idx * hidden_size;
vectorize_with_alignment<16>( for (int i = tid; i < hidden_size; i += blockDim.x) {
row_in, row_out, hidden_size, tid, stride, out[i] = float_to_int8_rn(static_cast<float>(input[i]) / scale);
[=] __device__(int8_t& dst, const scalar_t& src) { }
dst = float_to_int8_rn(static_cast<float>(src) / scale);
});
} }
template <typename scalar_t, typename scale_t, typename azp_t> template <typename scalar_t, typename scale_type, typename azp_type>
__global__ void static_scaled_int8_azp_quant_kernel( __global__ void static_scaled_int8_azp_quant_kernel(
const scalar_t* __restrict__ input, int8_t* __restrict__ output, scalar_t const* __restrict__ input, int8_t* __restrict__ out,
const scale_t* scale_ptr, const azp_t* azp_ptr, const int hidden_size) { scale_type const* scale_ptr, azp_type const* azp_ptr,
const int tid = threadIdx.x; const int hidden_size) {
const int stride = blockDim.x; int const tid = threadIdx.x;
const int64_t token_idx = blockIdx.x; int64_t const token_idx = blockIdx.x;
const float scale = *scale_ptr; scale_type const scale = *scale_ptr;
const azp_t azp = *azp_ptr; azp_type const azp = *azp_ptr;
const float inv_s = 1.0f / scale;
// Must be performed using 64-bit math to avoid integer overflow. // Must be performed using 64-bit math to avoid integer overflow.
const scalar_t* row_in = input + token_idx * hidden_size; out += token_idx * hidden_size;
int8_t* row_out = output + token_idx * hidden_size; input += token_idx * hidden_size;
vectorize_with_alignment<16>( for (int i = tid; i < hidden_size; i += blockDim.x) {
row_in, row_out, hidden_size, tid, stride, auto const val = static_cast<float>(input[i]);
[=] __device__(int8_t& dst, const scalar_t& src) { auto const quant_val = int32_to_int8(float_to_int32_rn(val / scale) + azp);
const auto v = static_cast<float>(src) * inv_s; out[i] = quant_val;
dst = int32_to_int8(float_to_int32_rn(v) + azp); }
});
} }
template <typename scalar_t, typename scale_t> template <typename scalar_t, typename scale_type>
__global__ void dynamic_scaled_int8_quant_kernel( __global__ void dynamic_scaled_int8_quant_kernel(
const scalar_t* __restrict__ input, int8_t* __restrict__ output, scalar_t const* __restrict__ input, int8_t* __restrict__ out,
scale_t* scale_out, const int hidden_size) { scale_type* scale, const int hidden_size) {
const int tid = threadIdx.x; int const tid = threadIdx.x;
const int stride = blockDim.x; int64_t const token_idx = blockIdx.x;
const int64_t token_idx = blockIdx.x; float absmax_val = 0.0f;
float const zero = 0.0f;
// Must be performed using 64-bit math to avoid integer overflow. // Must be performed using 64-bit math to avoid integer overflow.
const scalar_t* row_in = input + token_idx * hidden_size; out += token_idx * hidden_size;
int8_t* row_out = output + token_idx * hidden_size; input += token_idx * hidden_size;
// calculate for absmax for (int i = tid; i < hidden_size; i += blockDim.x) {
float thread_max = 0.f; float val = static_cast<float>(input[i]);
for (int i = tid; i < hidden_size; i += stride) { val = val > zero ? val : -val;
const auto v = fabsf(static_cast<float>(row_in[i])); absmax_val = val > absmax_val ? val : absmax_val;
thread_max = fmaxf(thread_max, v);
} }
using BlockReduce = cub::BlockReduce<float, 256>;
__shared__ typename BlockReduce::TempStorage tmp; using BlockReduce = cub::BlockReduce<float, 1024>;
float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x); __shared__ typename BlockReduce::TempStorage reduceStorage;
__shared__ float absmax; float const block_absmax_val_maybe =
BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
__shared__ float block_absmax_val;
if (tid == 0) { if (tid == 0) {
absmax = block_max; block_absmax_val = block_absmax_val_maybe;
scale_out[blockIdx.x] = absmax / 127.f; scale[token_idx] = block_absmax_val / 127.0f;
} }
__syncthreads(); __syncthreads();
float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax; float const tmp_scale = 127.0f / block_absmax_val;
for (int i = tid; i < hidden_size; i += blockDim.x) {
// 2. quantize out[i] = float_to_int8_rn(static_cast<float>(input[i]) * tmp_scale);
vectorize_with_alignment<16>(
row_in, row_out, hidden_size, tid, stride,
[=] __device__(int8_t& dst, const scalar_t& src) {
dst = float_to_int8_rn(static_cast<float>(src) * inv_s);
});
}
// MinMax structure to hold min and max values in one go
struct MinMax {
float min, max;
__host__ __device__ MinMax()
: min(std::numeric_limits<float>::max()),
max(std::numeric_limits<float>::lowest()) {}
__host__ __device__ explicit MinMax(float v) : min(v), max(v) {}
// add a value to the MinMax
__host__ __device__ MinMax& operator+=(float v) {
min = fminf(min, v);
max = fmaxf(max, v);
return *this;
} }
// merge two MinMax objects
__host__ __device__ MinMax& operator&=(const MinMax& other) {
min = fminf(min, other.min);
max = fmaxf(max, other.max);
return *this;
}
};
__host__ __device__ inline MinMax operator+(MinMax a, float v) {
return a += v;
}
__host__ __device__ inline MinMax operator&(MinMax a, const MinMax& b) {
return a &= b;
} }
template <typename scalar_t, typename scale_t, typename azp_t> template <typename scalar_t, typename scale_type, typename azp_type>
__global__ void dynamic_scaled_int8_azp_quant_kernel( __global__ void dynamic_scaled_int8_azp_quant_kernel(
const scalar_t* __restrict__ input, int8_t* __restrict__ output, scalar_t const* __restrict__ input, int8_t* __restrict__ out,
scale_t* scale_out, azp_t* azp_out, const int hidden_size) { scale_type* scale, azp_type* azp, const int hidden_size) {
const int tid = threadIdx.x; int64_t const token_idx = blockIdx.x;
const int stride = blockDim.x;
const int64_t token_idx = blockIdx.x;
// Must be performed using 64-bit math to avoid integer overflow. // Must be performed using 64-bit math to avoid integer overflow.
const scalar_t* row_in = input + token_idx * hidden_size; out += token_idx * hidden_size;
int8_t* row_out = output + token_idx * hidden_size; input += token_idx * hidden_size;
// 1. calculate min & max // Scan for the min and max value for this token
MinMax thread_mm; float max_val = std::numeric_limits<float>::min();
for (int i = tid; i < hidden_size; i += stride) { float min_val = std::numeric_limits<float>::max();
thread_mm += static_cast<float>(row_in[i]); for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
auto val = static_cast<float>(input[i]);
max_val = std::max(max_val, val);
min_val = std::min(min_val, val);
} }
using BlockReduce = cub::BlockReduce<MinMax, 256>; // Reduce the max and min values across the block
__shared__ typename BlockReduce::TempStorage tmp; using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStorage;
max_val = BlockReduce(reduceStorage).Reduce(max_val, cub::Max{}, blockDim.x);
__syncthreads(); // Make sure min doesn't mess with max shared memory
min_val = BlockReduce(reduceStorage).Reduce(min_val, cub::Min{}, blockDim.x);
MinMax mm = BlockReduce(tmp).Reduce( __shared__ scale_type scale_sh;
thread_mm, __shared__ azp_type azp_sh;
[] __device__(MinMax a, const MinMax& b) {
a &= b;
return a;
},
blockDim.x);
__shared__ float scale_sh; // Compute the scale and zero point and store them, only on the first thread
__shared__ azp_t azp_sh; if (threadIdx.x == 0) {
if (tid == 0) { float const scale_val = (max_val - min_val) / 255.0f;
float s = (mm.max - mm.min) / 255.f; // Use rounding to even (same as torch.round)
float zp = nearbyintf(-128.f - mm.min / s); // round-to-even auto const azp_float = std::nearbyint(-128.0f - min_val / scale_val);
scale_sh = s; auto const azp_val = static_cast<azp_type>(azp_float);
azp_sh = azp_t(zp);
scale_out[blockIdx.x] = s; // Store the scale and azp into shared and global
azp_out[blockIdx.x] = azp_sh; scale[token_idx] = scale_sh = scale_val;
azp[token_idx] = azp_sh = azp_val;
} }
// Wait for the scale and azp to be computed
__syncthreads(); __syncthreads();
const float inv_s = 1.f / scale_sh; float const scale_val = scale_sh;
const azp_t azp = azp_sh; azp_type const azp_val = azp_sh;
// 2. quantize // Quantize the values
vectorize_with_alignment<16>( for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
row_in, row_out, hidden_size, tid, stride, auto const val = static_cast<float>(input[i]);
[=] __device__(int8_t& dst, const scalar_t& src) { auto const quant_val =
const auto v = static_cast<float>(src) * inv_s; int32_to_int8(float_to_int32_rn(val / scale_val) + azp_val);
dst = int32_to_int8(float_to_int32_rn(v) + azp); out[i] = quant_val;
}); }
} }
} // namespace vllm } // namespace vllm
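Both quantization variants in this file reduce per-token statistics and then requantize: the symmetric path derives the scale from the absolute maximum, the asymmetric path derives scale and zero point from the min/max range. A CPU reference sketch for one token (values arbitrary), approximately mirroring the rounding the kernels use:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> row = {-3.2f, 0.1f, 2.7f, 1.4f};

  // Symmetric: scale = absmax / 127, q = round(x / scale)
  float absmax = 0.f;
  for (float v : row) absmax = std::max(absmax, std::fabs(v));
  float scale = absmax / 127.0f;
  std::printf("symmetric scale=%f:", scale);
  for (float v : row) std::printf(" %d", (int)std::lrintf(v / scale));
  std::printf("\n");

  // Asymmetric: scale = (max - min) / 255, azp = round(-128 - min / scale)
  auto mm = std::minmax_element(row.begin(), row.end());
  float mn = *mm.first, mx = *mm.second;
  float ascale = (mx - mn) / 255.0f;
  int azp = (int)std::nearbyint(-128.0f - mn / ascale);
  std::printf("asymmetric scale=%f azp=%d:", ascale, azp);
  for (float v : row) {
    int q = (int)std::nearbyint(v / ascale) + azp;
    q = std::min(127, std::max(-128, q));  // saturate to int8 range
    std::printf(" %d", q);
  }
  std::printf("\n");
  return 0;
}
```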
@ -285,7 +247,7 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
int const hidden_size = input.size(-1); int const hidden_size = input.size(-1);
int const num_tokens = input.numel() / hidden_size; int const num_tokens = input.numel() / hidden_size;
dim3 const grid(num_tokens); dim3 const grid(num_tokens);
dim3 const block(std::min(hidden_size, 256)); dim3 const block(std::min(hidden_size, 1024));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES( VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "static_scaled_int8_quant_kernel", [&] { input.scalar_type(), "static_scaled_int8_quant_kernel", [&] {
@ -316,7 +278,7 @@ void dynamic_scaled_int8_quant(
int const hidden_size = input.size(-1); int const hidden_size = input.size(-1);
int const num_tokens = input.numel() / hidden_size; int const num_tokens = input.numel() / hidden_size;
dim3 const grid(num_tokens); dim3 const grid(num_tokens);
dim3 const block(std::min(hidden_size, 256)); dim3 const block(std::min(hidden_size, 1024));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES( VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] { input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] {


@ -9,6 +9,10 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
torch::Tensor const& b, torch::Tensor const& b,
torch::Tensor const& a_scales, torch::Tensor const& a_scales,
torch::Tensor const& b_scales) { torch::Tensor const& b_scales) {
TORCH_CHECK(
a.size(0) % 4 == 0,
"Input tensor must have a number of rows that is a multiple of 4. ",
"but got: ", a.size(0), " rows.");
if (out.dtype() == torch::kBFloat16) { if (out.dtype() == torch::kBFloat16) {
cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::bfloat16_t>( cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::bfloat16_t>(
out, a, b, a_scales, b_scales); out, a, b, a_scales, b_scales);


@ -1,6 +1,5 @@
#pragma once #pragma once
#include "cuda_utils.h"
#include "cutlass/cutlass.h" #include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h" #include "cutlass/numeric_types.h"
@ -23,49 +22,49 @@ namespace vllm {
using namespace cute; using namespace cute;
// clang-format off template <typename OutType, typename MmaTileShape, typename ScalesPerTile,
template <class OutType, int ScaleGranularityM, class ClusterShape, typename EpilogueScheduler,
int ScaleGranularityN, int ScaleGranularityK, typename MainloopScheduler>
class MmaTileShape, class ClusterShape,
class EpilogueScheduler, class MainloopScheduler,
bool swap_ab_ = false>
struct cutlass_3x_gemm_fp8_blockwise { struct cutlass_3x_gemm_fp8_blockwise {
static constexpr bool swap_ab = swap_ab_;
using ElementAB = cutlass::float_e4m3_t; using ElementAB = cutlass::float_e4m3_t;
using ElementA = ElementAB; using ElementA = ElementAB;
using LayoutA = cutlass::layout::RowMajor; using LayoutA = cutlass::layout::RowMajor;
using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
using ElementB = ElementAB; using ElementB = ElementAB;
using LayoutB = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor;
using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
using ElementC = void;
using ElementD = OutType; using ElementD = OutType;
using LayoutD = cutlass::layout::RowMajor; using LayoutD = cutlass::layout::RowMajor;
using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose<LayoutD>::type;
static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
using ElementC = void; // TODO: support bias
using LayoutC = LayoutD; using LayoutC = LayoutD;
using LayoutC_Transpose = LayoutD_Transpose;
static constexpr int AlignmentC = AlignmentD; static constexpr int AlignmentC = AlignmentD;
using ElementAccumulator = float; using ElementAccumulator = float;
using ElementCompute = float; using ElementCompute = float;
using ElementBlockScale = float; using ElementBlockScale = float;
using ScaleConfig = conditional_t<swap_ab, // MMA and Cluster Tile Shapes
cutlass::detail::Sm100BlockwiseScaleConfig< // Shape of the tile computed by tcgen05 MMA, could be across 2 SMs if Cluster
ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, // Shape %2 == 0 using MmaTileShape_MNK = Shape<_128,_128,_128>;
cute::UMMA::Major::K, cute::UMMA::Major::MN>, static constexpr int ScaleMsPerTile = size<0>(ScalesPerTile{});
cutlass::detail::Sm100BlockwiseScaleConfig< static constexpr int ScaleGranularityM =
ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, size<0>(MmaTileShape{}) / ScaleMsPerTile;
cute::UMMA::Major::MN, cute::UMMA::Major::K>>; static constexpr int ScaleGranularityN =
size<1>(MmaTileShape{}) / size<1>(ScalesPerTile{});
static constexpr int ScaleGranularityK =
size<2>(MmaTileShape{}) / size<2>(ScalesPerTile{});
// layout_SFA and layout_SFB cannot be swapped since they are deduced. // Shape of the threadblocks in a cluster
using ClusterShape_MNK = ClusterShape;
using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
cute::UMMA::Major::MN, cute::UMMA::Major::K>;
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
@ -74,6 +73,7 @@ struct cutlass_3x_gemm_fp8_blockwise {
static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest; static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
using ElementScalar = float; using ElementScalar = float;
// clang-format off
using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>; using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, ArchTag,
@@ -84,47 +84,33 @@ struct cutlass_3x_gemm_fp8_blockwise {
       ElementAccumulator,
       ElementCompute,
       ElementC,
-      conditional_t<swap_ab, LayoutC_Transpose, LayoutC>,
+      LayoutC,
       AlignmentC,
       ElementD,
-      conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
+      LayoutD,
       AlignmentD,
       EpilogueScheduler,
       DefaultOperation
   >::CollectiveOp;
   using StageCountType = cutlass::gemm::collective::StageCountAuto;
-  using CollectiveMainloop = conditional_t<swap_ab,
-      typename cutlass::gemm::collective::CollectiveBuilder<
-          ArchTag,
-          OperatorClass,
-          ElementB,
-          cute::tuple<LayoutB_Transpose, LayoutSFA>,
-          AlignmentB,
-          ElementA,
-          cute::tuple<LayoutA_Transpose, LayoutSFB>,
-          AlignmentA,
-          ElementAccumulator,
-          MmaTileShape,
-          ClusterShape,
-          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
-          MainloopScheduler
-      >::CollectiveOp,
-      typename cutlass::gemm::collective::CollectiveBuilder<
-          ArchTag,
-          OperatorClass,
-          ElementA,
-          cute::tuple<LayoutA, LayoutSFA>,
-          AlignmentA,
-          ElementB,
-          cute::tuple<LayoutB, LayoutSFB>,
-          AlignmentB,
-          ElementAccumulator,
-          MmaTileShape,
-          ClusterShape,
-          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
-          MainloopScheduler
-      >::CollectiveOp>;
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      ElementA,
+      cute::tuple<LayoutA, LayoutSFA>,
+      AlignmentA,
+      ElementB,
+      cute::tuple<LayoutB, LayoutSFB>,
+      AlignmentB,
+      ElementAccumulator,
+      MmaTileShape,
+      ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      MainloopScheduler
+  >::CollectiveOp;
+  // clang-format on
   using KernelType = enable_sm100_only<cutlass::gemm::kernel::GemmUniversal<
       Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
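
For reference, the added (+) struct derives its scale granularities from the MMA tile shape and the per-tile scale counts. A minimal standalone sketch of that arithmetic follows; the tile of 128x128x128 with 128 scales along M and one along N and K is an illustrative choice that happens to match the Shape<_128, _1, _1> ScalesPerTile used by the dispatch further down, not a value taken from this struct itself.

#include <cstdio>

// Sketch of the granularity arithmetic: one scale covers a
// ScaleGranularityM x ScaleGranularityN x ScaleGranularityK sub-block
// of the MMA tile.
int main() {
  constexpr int tile_m = 128, tile_n = 128, tile_k = 128;    // MmaTileShape
  constexpr int scales_m = 128, scales_n = 1, scales_k = 1;  // ScalesPerTile

  constexpr int granularity_m = tile_m / scales_m;  // 1   -> per-row activation scales
  constexpr int granularity_n = tile_n / scales_n;  // 128 -> one scale per 128 columns
  constexpr int granularity_k = tile_k / scales_k;  // 128 -> one scale per 128-wide K block

  std::printf("granularity (M, N, K) = (%d, %d, %d)\n",
              granularity_m, granularity_n, granularity_k);
  return 0;
}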
@@ -137,7 +123,6 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
                                    torch::Tensor const& b,
                                    torch::Tensor const& a_scales,
                                    torch::Tensor const& b_scales) {
-  static constexpr bool swap_ab = Gemm::swap_ab;
   using GemmKernel = typename Gemm::GemmKernel;
   using StrideA = typename Gemm::GemmKernel::StrideA;
   using StrideB = typename Gemm::GemmKernel::StrideB;
@@ -151,6 +136,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
   using ElementD = typename Gemm::ElementD;

   int32_t m = a.size(0), n = b.size(1), k = a.size(1);
+  auto prob_shape = cute::make_shape(m, n, k, 1);

   StrideA a_stride;
   StrideB b_stride;
@@ -160,13 +146,11 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
   b_stride =
       cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
   c_stride =
-      cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
+      cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));

-  LayoutSFA layout_SFA = swap_ab ?
-      ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) :
-      ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
-  LayoutSFB layout_SFB = swap_ab ?
-      ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
-      ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
+  LayoutSFA layout_SFA =
+      ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
+  LayoutSFB layout_SFB =
+      ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));

   auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
@@ -174,22 +158,9 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
   auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
   auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());

-  auto mainloop_args = [&](){
-    // layout_SFA and layout_SFB cannot be swapped since they are deduced.
-    if (swap_ab) {
-      return typename GemmKernel::MainloopArguments{
-          b_ptr, b_stride, a_ptr, a_stride,
-          b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB
-      };
-    }
-    else {
-      return typename GemmKernel::MainloopArguments{
-          a_ptr, a_stride, b_ptr, b_stride,
-          a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
-      };
-    }
-  }();
-  auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
+  typename GemmKernel::MainloopArguments mainloop_args{
+      a_ptr, a_stride, b_ptr, b_stride,
+      a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB};

   auto c_ptr = static_cast<ElementD*>(out.data_ptr());
   typename GemmKernel::EpilogueArguments epilogue_args{
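
The removed (-) code builds the mainloop arguments through an immediately-invoked lambda so that, when swap_ab is set, the A and B operands and their block-scale pointers trade places while the deduced scale-factor layouts stay where they are. A simplified sketch of that selection is below; Args is a stand-in for GemmKernel::MainloopArguments with strides and layouts omitted, so it is only an illustration of the swap, not the real argument struct.

// Sketch only: Args stands in for the kernel's MainloopArguments.
struct Args {
  const void* a;
  const void* b;
  const float* a_scales;
  const float* b_scales;
};

// With swap_ab the operands (and their block scales) trade places, so an
// (m, n, k) problem is launched as (n, m, k); the scale-factor layouts are
// deduced and therefore not swapped.
inline Args make_mainloop_args(bool swap_ab, const void* a, const void* b,
                               const float* a_scales, const float* b_scales) {
  return swap_ab ? Args{b, a, b_scales, a_scales}
                 : Args{a, b, a_scales, b_scales};
}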
@@ -204,74 +175,29 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
                                                torch::Tensor const& b,
                                                torch::Tensor const& a_scales,
                                                torch::Tensor const& b_scales) {
-  int32_t m = a.size(0), n = b.size(1), k = a.size(1), sms;
+  auto m = a.size(0);
+  auto k = a.size(1);
+  auto n = b.size(1);
+  int sms;
   cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
-  constexpr int TILE_K = 128;
-  // TODO: better heuristics
-  bool swap_ab = (m < 16) || (m % 4 != 0);
-  bool use_tma_epilogue = (m * n) % 4 == 0;
-  if (!swap_ab) {
-    constexpr int TILE_N = 128;
-    int tile_m = 256;
-    if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 64) <= sms) {
-      tile_m = 64;
-    }
-    else if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 128) <= sms) {
-      tile_m = 128;
-    }
-    if (tile_m == 64) {
-      if (use_tma_epilogue) {
-        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-            OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      } else {
-        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-            OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      }
-    } else if (tile_m == 128) {
-      if (use_tma_epilogue) {
-        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-            OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      } else {
-        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-            OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      }
-    } else { // tile_m == 256
-      if (use_tma_epilogue) {
-        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-            OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_2, _1, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      } else {
-        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-            OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      }
-    }
-  } else {
-    // TODO: Test more tile N configs
-    constexpr int TILE_M = 128;
-    constexpr int TILE_N = 16;
-    // TMA epilogue isn't compatible with Swap A/B
+  auto should_use_2sm = [&sms](int m, int n, int tile1SM = 128) {
+    return std::ceil(static_cast<float>(m) / tile1SM) *
+               std::ceil(static_cast<float>(n) / tile1SM) >=
+           sms;
+  };
+  bool use_2sm = should_use_2sm(m, n);
+  if (use_2sm) {
     cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-        OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
-        Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
-        cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
+        OutType, Shape<_256, _128, _128>, Shape<_256, _1, _1>,
+        Shape<_2, _2, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
+        cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
+        out, a, b, a_scales, b_scales);
+  } else {
+    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+        OutType, Shape<_128, _128, _128>, Shape<_128, _1, _1>,
+        Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
+        cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
         out, a, b, a_scales, b_scales);
   }
 }
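
Two heuristics meet in this hunk: the removed (-) path swaps A and B whenever M is small or not a multiple of 4 ((m < 16) || (m % 4 != 0)) and only uses the TMA epilogue when m * n is divisible by 4, while the added (+) path simply checks whether a 128x128 single-SM tiling would already cover every SM before choosing the 2-SM kernel. A small self-contained sketch of that second check follows; the SM count used in main is a made-up example, not a value from the diff.

#include <cmath>
#include <cstdio>

// Sketch of the should_use_2sm heuristic from the added code: if the number
// of 128x128 output tiles already covers all SMs, prefer the wider 2-SM
// (256x128) configuration; otherwise stay on the 1-SM kernel.
static bool should_use_2sm(int m, int n, int sms, int tile_1sm = 128) {
  return std::ceil(static_cast<float>(m) / tile_1sm) *
             std::ceil(static_cast<float>(n) / tile_1sm) >=
         sms;
}

int main() {
  const int sms = 148;  // hypothetical SM count, for illustration only
  // 4096x4096: 32 * 32 = 1024 tiles >= 148 -> 2-SM config.
  std::printf("4096x4096 -> %d\n", should_use_2sm(4096, 4096, sms));
  // 256x512: 2 * 4 = 8 tiles < 148 -> 1-SM config.
  std::printf("256x512   -> %d\n", should_use_2sm(256, 512, sms));
  return 0;
}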


@@ -15,59 +15,16 @@ using c3x::cutlass_gemm_caller;
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue>
 struct sm100_fp8_config_default {
-  // M in (256, inf)
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
   using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_256, _128, _128>;
+  using TileShape = Shape<_256, _128, _64>;
   using ClusterShape = Shape<_2, _2, _1>;
   using Cutlass3xGemm =
       cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
                             KernelSchedule, EpilogueSchedule>;
 };

-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_M256 {
-  // M in (128, 256]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
-  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_128, _128, _128>;
-  using ClusterShape = Shape<_2, _2, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_M128 {
-  // M in (64, 128]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
-  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_128, _128, _256>;
-  using ClusterShape = Shape<_2, _4, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_M64 {
-  // M in [1, 64]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
-  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_64, _64, _256>;
-  using ClusterShape = Shape<_1, _8, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
-};
-
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue,
           typename... EpilogueArgs>
@@ -82,34 +39,8 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
   using Cutlass3xGemmDefault =
       typename sm100_fp8_config_default<InType, OutType,
                                         Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM64 =
-      typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM128 =
-      typename sm100_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM256 =
-      typename sm100_fp8_config_M256<InType, OutType, Epilogue>::Cutlass3xGemm;
-
-  uint32_t const m = a.size(0);
-  uint32_t const mp2 =
-      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2
-
-  if (mp2 <= 64) {
-    // m in [1, 64]
-    return cutlass_gemm_caller<Cutlass3xGemmM64>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 128) {
-    // m in (64, 128]
-    return cutlass_gemm_caller<Cutlass3xGemmM128>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 256) {
-    // m in (128, 256]
-    return cutlass_gemm_caller<Cutlass3xGemmM256>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else {
-    // m in (256, inf)
-    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  }
+  return cutlass_gemm_caller<Cutlass3xGemmDefault>(
+      out, a, b, std::forward<EpilogueArgs>(args)...);
 }

 template <template <typename, typename, typename> typename Epilogue,
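
The removed (-) dispatch rounds M up to a power of two and picks one of the M64 / M128 / M256 / default configurations per bucket; the added (+) code always falls back to the default. A compact sketch of the removed bucketing follows, with next_pow_2 written out locally so the snippet stands alone; the bucket labels are just strings for illustration.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Local stand-in for next_pow_2: smallest power of two >= x (x > 0).
static uint32_t next_pow_2(uint32_t x) {
  uint32_t p = 1;
  while (p < x) p <<= 1;
  return p;
}

// Mirror of the removed bucketing: M is clamped to at least 64 and then
// routed to the M64 / M128 / M256 / default tile configurations.
static const char* pick_config(uint32_t m) {
  uint32_t mp2 = std::max(64u, next_pow_2(m));
  if (mp2 <= 64) return "M64";     // m in [1, 64]
  if (mp2 <= 128) return "M128";   // m in (64, 128]
  if (mp2 <= 256) return "M256";   // m in (128, 256]
  return "default";                // m in (256, inf)
}

int main() {
  std::printf("m=48  -> %s\n", pick_config(48));   // M64
  std::printf("m=96  -> %s\n", pick_config(96));   // M128
  std::printf("m=300 -> %s\n", pick_config(300));  // default
  return 0;
}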


@@ -84,8 +84,7 @@ void run_cutlass_moe_mm_sm90(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
   TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided.");
   TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided.");
   TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided.");
@@ -114,23 +113,19 @@ void run_cutlass_moe_mm_sm90(
   if (n >= 8192) {
     cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
   } else if (k >= 8192) {
     cutlass_group_gemm_caller<Cutlass3xGemmK8192>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
   } else if (m <= 16) {
     cutlass_group_gemm_caller<Cutlass3xGemmM16>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
   } else {
     cutlass_group_gemm_caller<Cutlass3xGemmDefault>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
   }
 }
@@ -139,18 +134,15 @@ void dispatch_moe_mm_sm90(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
   if (out_tensors.dtype() == torch::kBFloat16) {
     run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::bfloat16_t>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
   } else {
     run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::half_t>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
   }
 }
@@ -161,9 +153,8 @@ void cutlass_moe_mm_sm90(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
   dispatch_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                        expert_offsets, problem_sizes, a_strides, b_strides,
-                       c_strides, per_act_token, per_out_ch);
+                       c_strides);
 }


@@ -76,8 +76,7 @@ void cutlass_group_gemm_caller(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
   using ElementAB = typename Gemm::ElementAB;
   using ElementD = typename Gemm::ElementD;
@@ -85,6 +84,9 @@ void cutlass_group_gemm_caller(
   int k_size = a_tensors.size(1);
   int n_size = out_tensors.size(1);

+  bool per_act_token = a_scales.numel() != 1;
+  bool per_out_ch = b_scales.numel() != num_experts;
+
   auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());

   auto options_int =
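
With the per_act_token / per_out_ch flags dropped from the MoE entry points above, the grouped-GEMM caller now infers the quantization granularity from the scale tensors themselves: a one-element a_scales means per-tensor activation scaling, and a b_scales with exactly one entry per expert means per-tensor (rather than per-output-channel) weight scaling. Below is a hedged sketch of that inference using the libtorch C++ API; num_experts is taken as a parameter because its definition sits outside this hunk, and QuantMode/infer_quant_mode are illustrative names, not part of the diff.

#include <torch/torch.h>

// Sketch of the inference now done inside cutlass_group_gemm_caller:
// the scale-tensor shapes, not extra flags, decide the quantization mode.
struct QuantMode {
  bool per_act_token;  // one scale per activation row vs. one per tensor
  bool per_out_ch;     // one scale per output channel vs. one per expert
};

inline QuantMode infer_quant_mode(const torch::Tensor& a_scales,
                                  const torch::Tensor& b_scales,
                                  int64_t num_experts) {
  return QuantMode{
      /*per_act_token=*/a_scales.numel() != 1,
      /*per_out_ch=*/b_scales.numel() != num_experts,
  };
}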
