Compare commits

..

2 Commits

SHA1        Message                                                         Date
6f62c94d7e  updated  (Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>)   2025-10-03 13:47:16 -04:00
52a7d91980  debug    (Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>)   2025-10-03 13:25:00 -04:00
1687 changed files with 104465 additions and 140083 deletions

View File

@ -368,7 +368,7 @@ if __name__ == "__main__":
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
# we want to turn it into "8xGPUTYPE"
df["GPU"] = df["GPU"].apply(
lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
)
# get markdown tables
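The two lambda variants above differ only when the raw GPU string ends with a trailing newline: split('\n') then yields an extra empty element and over-counts the GPUs, while splitlines() does not. A standalone Python sketch (not part of the repo) illustrating the difference:

x = "NVIDIA H100\nNVIDIA H100\n"                      # nvidia-smi style output with a trailing newline
print(len(x.split("\n")))                             # 3 -- the trailing newline adds an empty element
print(len(x.splitlines()))                            # 2 -- splitlines() drops the trailing empty entry
print(f"{len(x.splitlines())}x{x.splitlines()[0]}")   # "2xNVIDIA H100"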

View File

@ -454,6 +454,11 @@ main() {
fi
check_hf_token
# Set to v1 to run v1 benchmark
if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
export VLLM_USE_V1=1
fi
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
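The guard added in this hunk opts into the V1 engine only when the caller sets ENGINE_VERSION=v1, defaulting to v0 otherwise. In Python terms, the bash logic is roughly equivalent to the following sketch (illustrative only, not part of the script):

import os

def maybe_enable_v1_engine() -> None:
    # Default to "v0" when ENGINE_VERSION is unset, mirroring ${ENGINE_VERSION:-v0}.
    if os.environ.get("ENGINE_VERSION", "v0") == "v1":
        os.environ["VLLM_USE_V1"] = "1"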

.buildkite/pyproject.toml  (new normal file, 46 lines)
View File

@ -0,0 +1,46 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
[tool.ruff]
line-length = 88
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
"UP",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
"I",
# flake8-logging-format
"G",
]
ignore = [
# star imports
"F405", "F403",
# lambda expression assignment
"E731",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
]
[tool.ruff.format]
docstring-code-format = true
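For illustration, a short hypothetical Python snippet (not a repo file) showing how the .buildkite ruff configuration above treats a few common patterns:

from typing import List, Optional     # List is flagged (UP006/UP035), since those ignores were removed here

def head(xs: Optional[List[int]]) -> int:
    # Optional[...] is accepted because UP007 stays in the ignore list.
    return xs[0] if xs else 0

square = lambda x: x * x              # accepted: E731 (lambda assignment) is ignored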

View File

@ -48,7 +48,7 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@ -150,16 +150,11 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
- "docker push vllm/vllm-openai:nightly-x86_64"
- "docker push vllm/vllm-openai:nightly-aarch64"
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- "docker manifest push vllm/vllm-openai:nightly"
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
- "docker push vllm/vllm-openai:nightly"
- "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
plugins:
@ -168,4 +163,3 @@ steps:
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"

View File

@ -8,41 +8,20 @@ set -ex
# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
# Get DockerHub credentials from environment
# Get DockerHub token from environment
if [ -z "$DOCKERHUB_TOKEN" ]; then
echo "Error: DOCKERHUB_TOKEN environment variable is not set"
exit 1
fi
if [ -z "$DOCKERHUB_USERNAME" ]; then
echo "Error: DOCKERHUB_USERNAME environment variable is not set"
exit 1
fi
# Get DockerHub bearer token
echo "Getting DockerHub bearer token..."
set +x
BEARER_TOKEN=$(curl -s -X POST \
-H "Content-Type: application/json" \
-d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
"https://hub.docker.com/v2/users/login" | jq -r '.token')
set -x
if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
echo "Error: Failed to get DockerHub bearer token"
exit 1
fi
# Function to get all tags from DockerHub
get_all_tags() {
local page=1
local all_tags=""
while true; do
set +x
local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
"$REPO_API_URL?page=$page&page_size=100")
set -x
# Get both last_updated timestamp and tag name, separated by |
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
@ -64,9 +43,7 @@ delete_tag() {
echo "Deleting tag: $tag_name"
local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
set +x
local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
set -x
local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"

View File

@ -25,28 +25,25 @@ function cpu_tests() {
# offline inference
podman exec -it "$container_id" bash -c "
set -xve
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test
podman exec -it "$container_id" bash -c "
set -evx
set -e
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator
# Note: disable Bart until supports V1
# pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
}
# All of CPU tests are expected to be finished less than 40 mins.
export container_id
export -f cpu_tests
timeout 120m bash -c cpu_tests
timeout 40m bash -c cpu_tests

View File

@ -64,9 +64,10 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info

View File

@ -64,9 +64,10 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info

View File

@ -44,5 +44,6 @@ docker run \
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
pytest -v -s v1/test_metrics
pytest -v -s v1/test_serial_utils.py
'

View File

@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=8.7
EXPECTED_THROUGHPUT=10.0
INPUT_LEN=1800
OUTPUT_LEN=128

View File

@ -42,7 +42,7 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
vllm serve $MODEL \
VLLM_USE_V1=1 vllm serve $MODEL \
--seed 42 \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \

View File

@ -296,7 +296,6 @@ steps:
- tests/v1
commands:
# split the test to avoid interference
- pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/kv_offload
- pytest -v -s v1/sample
@ -318,7 +317,7 @@ steps:
no_gpu: true
commands:
# split the test to avoid interference
- pytest -v -s -m 'cpu_test' v1/core
- pytest -v -s v1/core
- pytest -v -s v1/structured_output
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
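One side of this diff splits the suite with a pytest marker: the GPU job deselects CPU-only tests with -m 'not cpu_test' while the no-GPU job selects them with -m 'cpu_test'. A hedged sketch of how such a marker is typically declared and applied (the exact registration in vLLM's test config may differ):

import pytest

@pytest.mark.cpu_test                      # assumed marker name, matching the -m expressions above
def test_serialization_roundtrip():
    assert list((1, 2, 3)) == [1, 2, 3]

# Selected by:   pytest -m 'cpu_test' v1/core
# Deselected by: pytest -m 'not cpu_test' v1/core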
@ -398,12 +397,12 @@ steps:
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_fusion_attn.py
- pytest -v -s compile/test_functionalization.py
- pytest -v -s compile/test_silu_mul_quant_fusion.py
- pytest -v -s compile/test_sequence_parallelism.py
- pytest -v -s compile/test_async_tp.py
- pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py
- pytest -v -s compile/test_noop_elimination.py
- pytest -v -s compile/test_aot_compile.py
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@ -432,9 +431,8 @@ steps:
source_file_dependencies:
- csrc/
- tests/kernels/core
- tests/kernels/test_top_k_per_row.py
commands:
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
- pytest -v -s kernels/core
- label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35
@ -478,7 +476,6 @@ steps:
source_file_dependencies:
- csrc/mamba/
- tests/kernels/mamba
- vllm/model_executor/layers/mamba/ops
commands:
- pytest -v -s kernels/mamba
@ -829,20 +826,18 @@ steps:
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
# Fusion
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- label: Blackwell GPT-OSS Eval
- label: GPT-OSS Eval (Blackwell)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
optional: true # run on nightlies
optional: true # disable while debugging
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
@ -869,16 +864,6 @@ steps:
commands:
- pytest -s -v tests/quantization/test_blackwell_moe.py
- label: Blackwell LM Eval Small Models
timeout_in_minutes: 120
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
##### 1 GPU test #####
##### multi gpus test #####
@ -1095,8 +1080,6 @@ steps:
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- pytest -v -s tests/compile/test_async_tp.py
- pytest -v -s tests/compile/test_sequence_parallelism.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048

.github/CODEOWNERS  (vendored, 1 line changed)
View File

@ -23,7 +23,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat

.github/mergify.yml  (vendored, 2 lines changed)
View File

@ -11,8 +11,6 @@ pull_request_rules:
label:
add:
- documentation
comment:
message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
- name: label-ci-build
description: Automatically apply ci/build label

View File

@ -13,7 +13,7 @@ jobs:
actions: write
runs-on: ubuntu-latest
steps:
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
- uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months

View File

@ -6,18 +6,30 @@ default_stages:
- manual # Run in CI
exclude: 'vllm/third_party/.*'
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.14.0
- repo: https://github.com/google/yapf
rev: v0.43.0
hooks:
- id: ruff-check
- id: yapf
args: [--in-place, --verbose]
# Keep the same list from yapfignore here to avoid yapf failing without any inputs
exclude: '(.buildkite|benchmarks|build|examples)/.*'
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7
hooks:
- id: ruff
args: [--output-format, github, --fix]
- id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos
rev: v1.38.1
rev: v1.35.5
hooks:
- id: typos
- repo: https://github.com/PyCQA/isort
rev: 6.0.1
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v21.1.2
rev: v20.1.3
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
@ -34,7 +46,7 @@ repos:
hooks:
- id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.9.1
rev: 0.6.17
hooks:
- id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
@ -55,6 +67,11 @@ repos:
types_or: [python, pyi]
require_serial: true
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9
entry: python tools/pre_commit/mypy.py 1 "3.9"
<<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
entry: python tools/pre_commit/mypy.py 1 "3.10"
@ -70,11 +87,6 @@ repos:
entry: python tools/pre_commit/mypy.py 1 "3.12"
<<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.13
entry: python tools/pre_commit/mypy.py 1 "3.13"
<<: *mypy_common
stages: [manual] # Only run in CI
- id: shellcheck
name: Lint shell scripts
entry: tools/shellcheck.sh

View File

@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
@ -269,8 +269,8 @@ set(VLLM_EXT_SRC
"csrc/sampler.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/w8a8/int8/scaled_quant.cu"
"csrc/quantization/w8a8/fp8/common.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/activation_kernels.cu"
@ -314,13 +314,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/permute_cols.cu"
"csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp"
"csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
"csrc/quantization/w8a8/int8/per_token_group_quant.cu")
"csrc/quantization/fp8/per_token_group_quant.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
@ -424,11 +423,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -459,9 +458,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@ -493,9 +492,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@ -526,7 +525,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@ -649,7 +648,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -673,7 +672,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -698,7 +697,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@ -721,7 +720,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -1007,7 +1006,6 @@ endif()
# For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake)
include(cmake/external_projects/qutlass.cmake)
# vllm-flash-attn should be last as it overwrites some CMake functions
include(cmake/external_projects/vllm_flash_attn.cmake)

View File

@ -149,7 +149,6 @@ Compute Resources:
- Trainy
- UC Berkeley
- UC San Diego
- Volcengine
Slack Sponsor: Anyscale

View File

@ -74,7 +74,7 @@ start_server() {
local vllm_log=$4
local profile_dir=$5
pkill -if "vllm serve" || true
pkill -if vllm
# Define the common arguments as a bash array.
# Each argument and its value are separate elements.
@ -96,11 +96,11 @@ start_server() {
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
VLLM_SERVER_DEV_MODE=1 \
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
fi
local server_pid=$!
@ -139,7 +139,7 @@ run_benchmark() {
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
pkill -if "vllm serve" || true
pkill -if vllm
echo "starting server..."
# Call start_server without a profile_dir to avoid profiling overhead
@ -232,7 +232,7 @@ run_benchmark() {
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
pkill -if "vllm serve" || true
pkill -if vllm
sleep 10
echo "===================="
return 0
@ -308,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
else
echo "No configuration met the latency requirements. Skipping final profiling run."
fi
pkill -if "vllm serve" || true
pkill -if vllm
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"

View File

@ -2,9 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
from benchmark_utils import TimeCollector
from tabulate import tabulate
from benchmark_utils import TimeCollector
from vllm.utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool

View File

@ -5,9 +5,9 @@ import time
from unittest import mock
import numpy as np
from benchmark_utils import TimeCollector
from tabulate import tabulate
from benchmark_utils import TimeCollector
from vllm.config import (
CacheConfig,
DeviceConfig,
@ -164,7 +164,7 @@ def invoke_main() -> None:
)
parser.add_argument(
"--batched", action="store_true", help="consider time to prepare batch"
)
) # noqa: E501
parser.add_argument(
"--num-iteration",
type=int,

View File

@ -37,13 +37,14 @@ from typing import Optional
import datasets
import numpy as np
import pandas as pd
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
from backend_request_func import (
ASYNC_REQUEST_FUNCS,
RequestFuncInput,
RequestFuncOutput,
)
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
try:
from vllm.transformers_utils.tokenizer import get_tokenizer
@ -909,13 +910,13 @@ def create_argument_parser():
parser.add_argument(
"--tokenizer",
type=str,
help="Name or path of the tokenizer, if not using the default tokenizer.",
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
)
parser.add_argument(
"--tokenizer-mode",
type=str,
default="auto",
help="Name or path of the tokenizer, if not using the default tokenizer.",
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
)
parser.add_argument(
"--num-prompts",

View File

@ -1,191 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import copy
import itertools
import torch
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
from weight_shapes import WEIGHT_SHAPES
from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"mxfp4": dict(no_a_quant=False, enabled=True),
"mxfp4-noquant": dict(no_a_quant=True, enabled=True),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
return (
deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
* group_size**-0.5
)
def _quant_weight_mxfp4(
b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str
):
weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx(
b, forward_hadamard_matrix, method="abs_max"
)
weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton")
return weight_hf_e2m1, weight_hf_scale_block
def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device):
weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4(
b, forward_hadamard_matrix, device
)
alpha = torch.tensor([1.0], device="cuda")
if cfg["no_a_quant"]:
# Pre-quantize activation
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
a, forward_hadamard_matrix, method="abs_max"
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
def run():
return matmul_mxf4_bf16_tn(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
)
return run
# Quantize activation on-the-fly
def run():
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
a, forward_hadamard_matrix, method="abs_max"
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
return matmul_mxf4_bf16_tn(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
)
return run
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[
1,
4,
8,
16,
32,
64,
128,
256,
512,
1024,
2048,
4096,
8192,
16384,
24576,
32768,
],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=_enabled,
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs MXFP4 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K, had_size):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
)
else:
cfg = PROVIDER_CFGS[provider]
run_quant = build_mxfp4_runner(
cfg, a, b, forward_hadamard_matrix, dtype, device
)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), rep=200, quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
def prepare_shapes(args):
out = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
out.append(KN)
return out
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.3-70B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
args = parser.parse_args()
for K, N, model in prepare_shapes(args):
for had_size in [32, 64, 128]:
print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_mxfp4_res_n{N}_k{K}",
N=N,
K=K,
had_size=had_size,
)
print("Benchmark finished!")

View File

@ -1,207 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import copy
import itertools
import torch
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
from vllm._custom_ops import fusedQuantizeNv
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"nvfp4": dict(no_a_quant=False, enabled=True),
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
return (
deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
* group_size**-0.5
)
def _quant_weight_nvfp4(
b: torch.Tensor,
forward_hadamard_matrix: torch.Tensor,
global_scale: torch.Tensor,
device: str,
M: int,
N: int,
K: int,
):
weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv(
b, forward_hadamard_matrix, global_scale
)
weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view(
-1, K // 16
)
return weight_hf_e2m1, weight_hf_scale_block
def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K):
alpha = torch.tensor([1.0], device="cuda")
global_scale = torch.tensor([1.0], device="cuda")
weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4(
b, forward_hadamard_matrix, global_scale, device, M, N, K
)
if cfg["no_a_quant"]:
# Pre-quantize activation
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
a, forward_hadamard_matrix, global_scale
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
-1, K // 16
)
def run():
return ops.cutlass_scaled_fp4_mm(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
torch.bfloat16,
)
return run
# Quantize activation on-the-fly
def run():
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
a, forward_hadamard_matrix, global_scale
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
-1, K // 16
)
return ops.cutlass_scaled_fp4_mm(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
torch.bfloat16,
)
return run
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[
1,
4,
8,
16,
32,
64,
128,
256,
512,
1024,
2048,
4096,
8192,
16384,
24576,
32768,
],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=_enabled,
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs NVFP4 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K, had_size):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
)
else:
cfg = PROVIDER_CFGS[provider]
run_quant = build_nvfp4_runner(
cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K
)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), rep=200, quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
def prepare_shapes(args):
out = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
out.append(KN)
return out
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.3-70B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
args = parser.parse_args()
for K, N, model in prepare_shapes(args):
for had_size in [16, 32, 64, 128]:
print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_nvfp4_res_n{N}_k{K}",
N=N,
K=K,
had_size=had_size,
)
print("Benchmark finished!")

View File

@ -579,12 +579,10 @@ def main(args: argparse.Namespace):
E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k
intermediate_size = config.ffn_config.ffn_hidden_size
hidden_size = config.hidden_size
elif config.architectures[0] == "JambaForCausalLM":
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] in (
"DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM",
@ -594,7 +592,6 @@ def main(args: argparse.Namespace):
E = config.n_routed_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] in (
"Qwen2MoeForCausalLM",
"Qwen3MoeForCausalLM",
@ -603,18 +600,10 @@ def main(args: argparse.Namespace):
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
text_config = config.get_text_config()
E = text_config.num_experts
topk = text_config.num_experts_per_tok
intermediate_size = text_config.moe_intermediate_size
hidden_size = text_config.hidden_size
elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
E = config.num_experts
topk = config.moe_topk[0]
intermediate_size = config.moe_intermediate_size[0]
hidden_size = config.hidden_size
else:
# Support for llama4
config = config.get_text_config()
@ -622,7 +611,6 @@ def main(args: argparse.Namespace):
E = config.num_local_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
hidden_size = config.hidden_size
enable_ep = bool(args.enable_expert_parallel)
if enable_ep:
ensure_divisibility(E, args.tp_size, "Number of experts")
@ -631,6 +619,7 @@ def main(args: argparse.Namespace):
else:
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
shard_intermediate_size = 2 * intermediate_size // args.tp_size
hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"

View File

@ -1,19 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Comprehensive 3-way SiLU Benchmark Suite
This benchmark compares three SiLU implementations:
1. SiLU V2 (CUDA) - Optimized CUDA kernel implementation
2. Triton Kernel - Triton-based implementation
The suite generates detailed performance comparisons including:
- Memory bandwidth utilization
- Speedup ratios (baseline vs optimized implementations)
- Performance across different expert configurations and token distributions
"""
from collections.abc import Callable
import matplotlib.pyplot as plt
@ -21,7 +7,7 @@ import numpy as np
import torch
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
persistent_masked_m_silu_mul_quant,
silu_mul_fp8_quant_deep_gemm_cuda,
)
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
@ -108,7 +94,6 @@ def silu_mul_fp8_quant_deep_gemm_triton(
num_parallel_tokens,
group_size: int = 128,
eps: float = 1e-10,
expert_offsets: torch.Tensor = None,
) -> tuple[torch.Tensor, torch.Tensor]:
"""Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
@ -189,7 +174,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(
# Parse generation strategies
strategies = ["random_imbalanced", "uniform", "max_t"]
strategies = ["uniform", "max_t", "first_t"]
def benchmark(
@ -210,27 +195,15 @@ def benchmark(
current_platform.seed_everything(42 + seed_offset)
y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
if gen_strategy == "random_imbalanced":
def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"):
mean = total_tokens // n_e
min_max = mean // ratio
e = torch.ones(size=(E,), dtype=torch.int64, device=device) * mean
e[0] = min_max
r = torch.rand(size=(E - 1,))
r /= r.sum()
r *= total_tokens - min_max
r = r.round().long()
e[1:] = r.to(device=device)
return e
tokens_per_expert = generate_expert_loads(E, total_tokens, 0.7, "cuda")
elif gen_strategy == "uniform":
r = torch.rand(size=(E,))
if gen_strategy == "uniform":
r = torch.rand(size=(E,), device="cuda")
r /= r.sum()
r *= total_tokens
r = r.round().long()
tokens_per_expert = r
tokens_per_expert = r.int()
tokens_per_expert = torch.minimum(
tokens_per_expert,
torch.ones((E,), device=r.device, dtype=torch.int) * T,
)
elif gen_strategy == "max_t":
tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
tokens_per_expert.fill_(total_tokens / E)
@ -308,34 +281,40 @@ def benchmark(
def create_comparison_plot(
ratios, silu_v2_times, triton_times, config_labels, strategy_name, id
ratio, cuda_times, baseline_times, config_labels, strategy_name, id
):
fig, ax = plt.subplots(1, 1, figsize=(18, 6))
"""Create a comparison plot for a specific generation strategy"""
fig, ax = plt.subplots(1, 1, figsize=(16, 6))
# Configure x-axis positions
x = np.arange(len(config_labels))
width = 0.25
width = 0.35
# Execution Time plot (lower is better)
ax.bar(x, silu_v2_times, width, label="SiLU V2 (CUDA)", alpha=0.8, color="blue")
ax.bar(
x + width, triton_times, width, label="Triton Kernel", alpha=0.8, color="green"
x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue"
)
ax.bar(
x + width / 2,
baseline_times,
width,
label="Baseline",
alpha=0.8,
color="orange",
)
# Add speedup labels over each bar trio
# Add speedup labels over each bar pair
for i in range(len(x)):
triton_v2_speedup = ratios[i][1] # triton/v2
max_height = max(silu_v2_times[i], triton_times[i])
# Triton/V2 speedup
speedup = ratio[i]
max_height = max(cuda_times[i], baseline_times[i])
ax.text(
x[i] + width / 2,
x[i],
max_height + max_height * 0.02,
f"{triton_v2_speedup:.2f}x",
f"{speedup:.2f}x",
ha="center",
va="bottom",
fontweight="bold",
fontsize=8,
fontsize=9,
)
ax.set_xlabel("Configuration")
@ -353,75 +332,56 @@ def create_comparison_plot(
def create_combined_plot(all_results):
"""Create a combined plot with all strategies in one PNG"""
num_strategies = len(all_results)
fig, axes = plt.subplots(num_strategies, 1, figsize=(22, 7 * num_strategies))
fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies))
if num_strategies == 1:
axes = [axes]
for idx, (
strategy_name,
all_ratios,
all_silu_v2_results,
all_triton_results,
ratio,
cuda_times,
baseline_times,
config_labels,
config_x_axis,
) in enumerate(all_results):
ax = axes[idx]
# Flatten the nested results to get bandwidth percentages for plotting
silu_v2_bandwidths = []
triton_bandwidths = []
flat_ratios = []
for config_results in all_silu_v2_results:
for result in config_results:
silu_v2_bandwidths.append(result[3]) # bandwidth percentage
for config_results in all_triton_results:
for result in config_results:
triton_bandwidths.append(result[3]) # bandwidth percentage
for config_ratios in all_ratios:
for ratio in config_ratios:
flat_ratios.append(ratio)
# Configure x-axis positions
x = np.arange(len(config_labels))
width = 0.25
width = 0.35
# Bandwidth utilization plot (higher is better)
# Execution Time plot (lower is better)
ax.bar(
x,
silu_v2_bandwidths,
x - width / 2,
cuda_times,
width,
label="SiLU V2 (CUDA)",
label="CUDA Kernel",
alpha=0.8,
color="blue",
)
ax.bar(
x + width,
triton_bandwidths,
x + width / 2,
baseline_times,
width,
label="Triton Kernel",
label="Baseline",
alpha=0.8,
color="green",
color="orange",
)
# Add speedup labels over each bar trio
# Add speedup labels over each bar pair
for i in range(len(x)):
triton_v2_speedup = flat_ratios[i] # triton/v2
max_height = max(silu_v2_bandwidths[i], triton_bandwidths[i])
# Triton/V2 speedup
speedup = ratio[i]
max_height = max(cuda_times[i], baseline_times[i])
ax.text(
x[i] + width / 2,
x[i],
max_height + max_height * 0.02,
f"{triton_v2_speedup:.2f}x",
f"{speedup:.2f}x",
ha="center",
va="bottom",
fontweight="bold",
fontsize=8,
fontsize=9,
)
ax.set_xlabel("Configuration")
@ -435,7 +395,7 @@ def create_combined_plot(all_results):
ax.grid(True, alpha=0.3)
plt.tight_layout()
filename = "silu_benchmark_combined_3way.png"
filename = "../../silu_bench/silu_benchmark_combined.png"
plt.savefig(filename, dpi=300, bbox_inches="tight")
plt.show()
@ -445,9 +405,7 @@ def create_combined_plot(all_results):
outer_dim = 7168
configs = [
# DeepSeekV3 Configs
# (1, 56, 7168),
(8, 1024, 7168),
# (32, 56, 7168),
# DeepSeekV3 Configs
(32, 1024, 7168),
# DeepSeekV3 Configs
@ -459,7 +417,6 @@ num_warmups = 20
strategy_descriptions = {
"uniform": "Uniform Random",
"random_imbalanced": "Imbalanced Random",
"max_t": "Even Assignment",
"first_t": "experts[0] = T, experts[1:] = 0",
}
@ -476,31 +433,28 @@ for id, strategy in enumerate(strategies):
print(f"Testing strategy: {strategy_descriptions[strategy]}")
print(f"{'=' * 60}")
# Collect benchmark data for all three algorithms
# Collect benchmark data for both algorithms
config_labels = []
config_x_axis = []
all_silu_v2_results = []
all_triton_results = []
all_cuda_results = []
all_baseline_results = []
all_ratios = []
for E, T, H in configs:
total_tokens_config = []
for i in [8, 16, 32, 64, 128, 256, 512]:
if i <= T:
total_tokens_config.append(i * E)
total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E]
config_x_axis.append(total_tokens_config)
silu_v2_results = []
triton_results = []
cuda_results = []
baseline_results = []
ratios = []
for total_tokens in total_tokens_config:
config_label = f"E={E},T={T},H={H},TT={total_tokens}"
config_labels.append(config_label)
# SiLU V2 (CUDA kernel) results
time_ms_silu_v2, gflops, gbps, perc = benchmark(
persistent_masked_m_silu_mul_quant,
# CUDA kernel results
time_ms_cuda, gflops, gbps, perc = benchmark(
silu_mul_fp8_quant_deep_gemm_cuda,
E,
T,
H,
@ -509,9 +463,9 @@ for id, strategy in enumerate(strategies):
num_warmups=num_warmups,
gen_strategy=strategy,
)
silu_v2_results.append((time_ms_silu_v2, gflops, gbps, perc))
cuda_results.append((time_ms_cuda, gflops, gbps, perc))
# Triton kernel results
# Baseline results
time_ms_triton, gflops, gbps, perc = benchmark(
silu_mul_fp8_quant_deep_gemm_triton,
E,
@ -522,20 +476,12 @@ for id, strategy in enumerate(strategies):
num_warmups=num_warmups,
gen_strategy=strategy,
)
triton_results.append((time_ms_triton, gflops, gbps, perc))
baseline_results.append((time_ms_triton, gflops, gbps, perc))
ratios.append(time_ms_triton / time_ms_cuda)
# Calculate speedup ratios (triton baseline / implementation)
triton_v2_ratio = time_ms_triton / time_ms_silu_v2
ratios.append(triton_v2_ratio)
print(
f"Completed: {config_label}:"
f" V2: {time_ms_silu_v2:.3f}ms,"
f" Triton: {time_ms_triton:.3f}ms"
)
all_silu_v2_results.append(silu_v2_results)
all_triton_results.append(triton_results)
print(f"Completed: {config_label}")
all_cuda_results.append(cuda_results)
all_baseline_results.append(baseline_results)
all_ratios.append(ratios)
# Store results for combined plotting
@ -543,8 +489,8 @@ for id, strategy in enumerate(strategies):
(
strategy_descriptions[strategy],
all_ratios,
all_silu_v2_results,
all_triton_results,
all_cuda_results,
all_baseline_results,
config_labels,
config_x_axis,
)
@ -552,18 +498,15 @@ for id, strategy in enumerate(strategies):
# Print summary table for this strategy
print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
print(f" {'V2 Time(ms)':<12} {'Triton Time(ms)':<14} {'Triton/V2':<10}")
print("-" * 90)
print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}")
print("-" * 60)
for i, (E, T, H) in enumerate(configs):
# Get the first result for each config (simplifying for summary)
v2_time = silu_v2_results[i][0]
triton_time = triton_results[i][0]
triton_v2_speedup = triton_time / v2_time
speedup = baseline_results[i][0] / cuda_results[i][0]
config_label = f"E={E:3d},T={T:4d},H={H:4d}"
print(
f"{config_label:<20} {v2_time:8.5f} {triton_time:10.5f} "
f"{triton_v2_speedup:8.2f}x"
f"{config_label:<20} {cuda_results[i][0]:8.5f} "
f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x"
)
@ -571,14 +514,15 @@ def create_total_tokens_plot(all_results):
num_strategies = len(all_results)
num_configs = len(configs)
# Create side-by-side subplots: 2 columns for speedup and bandwidth percentage
fig, axs = plt.subplots(
num_strategies, num_configs * 2, figsize=(32, 8 * num_strategies)
num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies)
)
# Add main title to the entire figure
fig.suptitle(
"Performance Analysis: Speedup vs Bandwidth Utilization (SiLU V2, and Triton)",
fontsize=18,
"Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)",
fontsize=16,
fontweight="bold",
y=0.98,
)
@ -595,8 +539,8 @@ def create_total_tokens_plot(all_results):
(
strategy_name,
all_ratios,
all_silu_v2_results,
all_triton_results,
all_cuda_results,
all_baseline_results,
config_labels,
config_x_axis,
) = result
@ -611,54 +555,42 @@ def create_total_tokens_plot(all_results):
ratios = all_ratios[config_idx]
total_tokens_values = config_x_axis[config_idx]
# Extract speedup ratios
triton_v2_ratios = [ratio for ratio in ratios]
# Extract bandwidth percentages for all implementations
v2_bandwidth_percentages = [
result[3] for result in all_silu_v2_results[config_idx]
# Extract CUDA and Triton bandwidth percentages
cuda_bandwidth_percentages = [
result[3] for result in all_cuda_results[config_idx]
]
triton_bandwidth_percentages = [
result[3] for result in all_triton_results[config_idx]
result[3] for result in all_baseline_results[config_idx]
]
# Plot speedup ratios vs total tokens (left plot)
ax_speedup.plot(
total_tokens_values,
triton_v2_ratios,
"go-",
linewidth=3,
markersize=8,
label="Triton/V2 Speedup",
total_tokens_values, ratios, "bo-", linewidth=3, markersize=8
)
ax_speedup.set_title(
f"{strategy_name}\nSpeedup vs Baseline (Triton)\nE={E}, T={T}, H={H}",
f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}",
fontsize=12,
fontweight="bold",
)
ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
ax_speedup.legend(prop={"weight": "bold"})
ax_speedup.grid(True, alpha=0.3)
# Plot bandwidth utilization (right plot)
ax_bandwidth.plot(
total_tokens_values,
v2_bandwidth_percentages,
"o-",
cuda_bandwidth_percentages,
"ro-",
linewidth=3,
markersize=8,
label="SiLU V2",
color="blue",
label="CUDA",
)
ax_bandwidth.plot(
total_tokens_values,
triton_bandwidth_percentages,
"o-",
"go-",
linewidth=3,
markersize=8,
label="Triton",
color="green",
)
ax_bandwidth.set_title(
f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
@ -686,12 +618,38 @@ def create_total_tokens_plot(all_results):
for label in ax.get_xticklabels() + ax.get_yticklabels():
label.set_fontweight("bold")
# Add value labels on Triton/V2 speedup points
for x, y in zip(total_tokens_values, triton_v2_ratios):
# Add value labels on speedup points
for x, y in zip(total_tokens_values, ratios):
ax_speedup.annotate(
f"{y:.2f}x",
(x, y),
textcoords="offset points",
xytext=(0, 12),
ha="center",
fontsize=10,
fontweight="bold",
bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
)
# Add value labels on CUDA bandwidth points
for x, y in zip(total_tokens_values, cuda_bandwidth_percentages):
ax_bandwidth.annotate(
f"{y:.1f}%",
(x, y),
textcoords="offset points",
xytext=(0, 12),
ha="center",
fontsize=9,
fontweight="bold",
bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3),
)
# Add value labels on Triton bandwidth points
for x, y in zip(total_tokens_values, triton_bandwidth_percentages):
ax_bandwidth.annotate(
f"{y:.1f}%",
(x, y),
textcoords="offset points",
xytext=(0, -15),
ha="center",
fontsize=9,
@ -701,20 +659,17 @@ def create_total_tokens_plot(all_results):
plt.tight_layout()
plt.subplots_adjust(top=0.93) # Make room for main title
filename = "silu_benchmark_total_tokens_3way.png"
filename = "silu_benchmark_total_tokens.png"
plt.savefig(filename, dpi=300, bbox_inches="tight")
plt.show()
return filename
# Create comprehensive 3-way comparison plots
combined_plot_filename = create_combined_plot(all_results)
total_tokens_plot_filename = create_total_tokens_plot(all_results)
# Create combined plot with all strategies
combined_plot_filename = create_total_tokens_plot(all_results)
print(f"\n{'=' * 80}")
print("3-Way Benchmark Suite Complete!")
print(f"Generated combined comparison plot: {combined_plot_filename}")
print(f"Generated total tokens analysis plot: {total_tokens_plot_filename}")
print("Compared: SiLU V2 (CUDA), and Triton implementations")
print(f"{'=' * 80}")
print(f"\n{'=' * 60}")
print("Benchmark Complete!")
print(f"Generated combined plot: {combined_plot_filename}")
print(f"{'=' * 60}")

View File

@ -14,7 +14,7 @@ import torch
from tqdm import tqdm
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_w8a8_triton_block_scaled_mm,
_w8a8_block_fp8_matmul,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
@ -83,7 +83,7 @@ def w8a8_block_matmul(
)
if A.dtype == torch.float8_e4m3fn:
kernel = _w8a8_triton_block_scaled_mm
kernel = _w8a8_block_fp8_matmul
else:
raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")

View File

@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# fmt: off
# ruff: noqa: E501
import time
@ -19,21 +20,19 @@ from vllm.utils.deep_gemm import (
)
def benchmark_shape(
m: int,
n: int,
k: int,
warmup: int = 100,
repeat: int = 10000,
verbose: bool = False,
) -> dict:
def benchmark_shape(m: int,
n: int,
k: int,
warmup: int = 100,
repeat: int = 10000,
verbose: bool = False) -> dict:
"""Benchmark all implementations for a specific (m, n, k) shape."""
if verbose:
print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
# Create test tensors
A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
# Reference result in BF16
torch.cuda.synchronize()
@ -50,39 +49,34 @@ def benchmark_shape(
# Pre-quantize A for all implementations
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
A, block_size[1], column_major_scales=True
)
A, block_size[1], column_major_scales=True)
# === DeepGEMM Implementation ===
def deepgemm_gemm():
fp8_gemm_nt(
(A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm
)
fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
(B_deepgemm, B_scale_deepgemm),
C_deepgemm)
return C_deepgemm
# === vLLM Triton Implementation ===
def vllm_triton_gemm():
return w8a8_triton_block_scaled_mm(
A_vllm,
B_vllm,
A_scale_vllm,
B_scale_vllm,
block_size,
output_dtype=torch.bfloat16,
)
return w8a8_triton_block_scaled_mm(A_vllm,
B_vllm,
A_scale_vllm,
B_scale_vllm,
block_size,
output_dtype=torch.bfloat16)
# === vLLM CUTLASS Implementation ===
def vllm_cutlass_gemm():
return ops.cutlass_scaled_mm(
A_vllm_cutlass,
B_vllm.T,
scale_a=A_scale_vllm_cutlass,
scale_b=B_scale_vllm.T,
out_dtype=torch.bfloat16,
)
return ops.cutlass_scaled_mm(A_vllm_cutlass,
B_vllm.T,
scale_a=A_scale_vllm_cutlass,
scale_b=B_scale_vllm.T,
out_dtype=torch.bfloat16)
# Run correctness check first
if verbose:
@ -99,23 +93,26 @@ def benchmark_shape(
print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
print(
"vLLM Triton vs DeepGEMM difference: "
f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}"
)
print(
"vLLM CUTLASS vs DeepGEMM difference: "
f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
)
print("vLLM Triton vs DeepGEMM difference: "
f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
print("vLLM CUTLASS vs DeepGEMM difference: "
f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")
# Benchmark implementations
implementations = {
"DeepGEMM": deepgemm_gemm,
"vLLM Triton": vllm_triton_gemm,
"vLLM CUTLASS": vllm_cutlass_gemm,
"vLLM CUTLASS": vllm_cutlass_gemm
}
benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}}
benchmark_results = {
"shape": {
"m": m,
"n": n,
"k": k
},
"implementations": {}
}
for name, func in implementations.items():
# Warmup
@ -143,36 +140,38 @@ def benchmark_shape(
"tflops": tflops,
"gb_s": gb_s,
"diff": {
"DeepGEMM": 0.0
if name == "DeepGEMM"
else calc_diff(func(), C_deepgemm),
"Reference": deepgemm_diff
if name == "DeepGEMM"
else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff),
},
"DeepGEMM":
0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
"Reference":
deepgemm_diff if name == "DeepGEMM" else
(vllm_triton_diff
if name == "vLLM Triton" else vllm_cutlass_diff)
}
}
if verbose:
print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s")
print(
f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
)
# Calculate speedups
baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
for name, data in benchmark_results["implementations"].items():
if name != "DeepGEMM":
speedup = baseline / data["time_ms"]
benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup
benchmark_results["implementations"][name][
"speedup_vs_deepgemm"] = speedup
if verbose:
print(
f"DeepGEMM is {1 / speedup:.2f}x "
f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
)
print(f"DeepGEMM is {1/speedup:.2f}x "
f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"]
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"]
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
"time_ms"]
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
"time_ms"]
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = (
cutlass_vs_triton
)
benchmark_results["implementations"]["vLLM CUTLASS"][
"speedup_vs_triton"] = cutlass_vs_triton
if verbose:
print(
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
@ -184,7 +183,8 @@ def benchmark_shape(
def format_table_row(values, widths):
"""Format a row with specified column widths."""
return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"
return "| " + " | ".join(f"{val:{w}}"
for val, w in zip(values, widths)) + " |"
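
# A quick illustration (not from the original script) of what the helper above
# produces; the column widths here are arbitrary example values.
example_row = format_table_row(["m", "n", "Time (μs)"], [6, 6, 12])
print(example_row)  # e.g. "| m      | n      | Time (μs)    |"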
def print_table(headers, rows, title=None):
@ -292,50 +292,38 @@ def run_benchmarks(verbose: bool = False):
for result in all_results:
shape = result["shape"]
impl_data = result["implementations"]["DeepGEMM"]
deepgemm_rows.append(
[
shape["m"],
shape["n"],
shape["k"],
f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
]
)
deepgemm_rows.append([
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
])
print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:")
print_table(deepgemm_headers,
deepgemm_rows,
title="DeepGEMM Implementation:")
# Print vLLM Triton table
triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"]
triton_headers = [
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
]
triton_rows = []
for result in all_results:
shape = result["shape"]
impl_data = result["implementations"]["vLLM Triton"]
speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
triton_rows.append(
[
shape["m"],
shape["n"],
shape["k"],
f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
format_speedup(speedup),
]
)
triton_rows.append([
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
format_speedup(speedup)
])
print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:")
print_table(triton_headers,
triton_rows,
title="vLLM Triton Implementation:")
# Print vLLM CUTLASS table
cutlass_headers = [
"m",
"n",
"k",
"Time (μs)",
"TFLOPS",
"GB/s",
"vs DeepGEMM",
"vs Triton",
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
"vs Triton"
]
cutlass_rows = []
for result in all_results:
@ -343,27 +331,28 @@ def run_benchmarks(verbose: bool = False):
impl_data = result["implementations"]["vLLM CUTLASS"]
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
vs_triton = impl_data.get("speedup_vs_triton", 1.0)
cutlass_rows.append(
[
shape["m"],
shape["n"],
shape["k"],
f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
format_speedup(vs_deepgemm),
format_speedup(vs_triton),
]
)
cutlass_rows.append([
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
format_speedup(vs_deepgemm),
format_speedup(vs_triton)
])
print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:")
print_table(cutlass_headers,
cutlass_rows,
title="vLLM CUTLASS Implementation:")
# Calculate and print averages
print("\n===== AVERAGE PERFORMANCE =====")
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
avg_metrics = {
impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations
impl: {
"tflops": 0,
"gb_s": 0,
"time_ms": 0
}
for impl in implementations
}
for result in all_results:
@ -381,9 +370,9 @@ def run_benchmarks(verbose: bool = False):
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
avg_time = avg_metrics[impl]["time_ms"] / num_shapes
avg_rows.append(
[impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"]
)
avg_rows.append([
impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
])
print_table(avg_headers, avg_rows)
@ -391,19 +380,21 @@ def run_benchmarks(verbose: bool = False):
avg_speedups = {
"DeepGEMM vs vLLM Triton": 0,
"DeepGEMM vs vLLM CUTLASS": 0,
"vLLM CUTLASS vs vLLM Triton": 0,
"vLLM CUTLASS vs vLLM Triton": 0
}
for result in all_results:
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"]
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
"time_ms"]
avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
avg_speedups["vLLM CUTLASS vs vLLM Triton"] += (
vllm_triton_time / vllm_cutlass_time
)
avg_speedups[
"DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
avg_speedups[
"DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
avg_speedups[
"vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
print("\n===== AVERAGE SPEEDUPS =====")
speedup_headers = ["Comparison", "Speedup"]
@ -421,7 +412,8 @@ def run_benchmarks(verbose: bool = False):
for result in all_results:
for impl in implementations:
avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"]
avg_diff[impl] += result["implementations"][impl]["diff"][
"Reference"]
diff_headers = ["Implementation", "Avg Diff vs Reference"]
diff_rows = []
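
The benchmark above drives all three kernels from the same per-token-group quantized activations. Below is a minimal sketch of that shared quantization step; it is not part of the diff, the import path, sizes, and printed shapes are assumptions, and the per-block weight quantization used for B is omitted because it is not shown here.
import torch
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    per_token_group_quant_fp8,  # assumed import path
)
m, k, group_size = 64, 7168, 128  # assumed sizes; group_size matches block_size[1]
A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
# One FP8 value per element and one float32 scale per 128-wide group of each row.
A_q, A_scale = per_token_group_quant_fp8(A, group_size)
print(A_q.shape, A_q.dtype)          # expected: (64, 7168), float8_e4m3fn
print(A_scale.shape, A_scale.dtype)  # expected: (64, 56), float32
# The CUTLASS path additionally requests column-major scales, as in the benchmark:
# per_token_group_quant_fp8(A, group_size, column_major_scales=True)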

View File

@ -13,7 +13,7 @@ from datetime import datetime
from enum import Enum
from http import HTTPStatus
from statistics import mean
from typing import NamedTuple, Union
from typing import NamedTuple, Optional, Union
import aiohttp # type: ignore
import numpy as np # type: ignore
@ -46,9 +46,9 @@ class ConversationSampling(str, Enum):
class ClientArgs(NamedTuple):
seed: int
max_num_requests: int | None
max_num_requests: Optional[int]
skip_first_turn: bool
max_turns: int | None
max_turns: Optional[int]
max_active_conversations: int
verbose: bool
print_content: bool
@ -109,9 +109,9 @@ class RequestStats(NamedTuple):
class MetricStats:
def __init__(self) -> None:
self.min: float | None = None
self.max: float | None = None
self.avg: float | None = None
self.min: Optional[float] = None
self.max: Optional[float] = None
self.avg: Optional[float] = None
self.sum = 0.0
self.count = 0
@ -143,7 +143,7 @@ class MovingAverage:
self.index = 0
self.sum = 0.0
self.count = 0
self.avg: float | None = None
self.avg: Optional[float] = None
def update(self, new_value: float) -> None:
if self.count < self.window_size:
@ -198,6 +198,14 @@ class DebugStats:
self.logger.info("-" * 50)
# Must support Python 3.8; we can't use str.removeprefix(prefix),
# which was introduced in Python 3.9
def remove_prefix(text: str, prefix: str) -> str:
if text.startswith(prefix):
return text[len(prefix) :]
return text
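
# A tiny sanity check of the shim above (not part of the original script),
# mirroring how it is used on SSE chunks later in this file:
assert remove_prefix("data: [DONE]", "data: ") == "[DONE]"
assert remove_prefix("[DONE]", "data: ") == "[DONE]"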
def nanosec_to_millisec(value: float) -> float:
return value / 1000000.0
@ -212,8 +220,8 @@ async def send_request(
chat_url: str,
model: str,
stream: bool = True,
min_tokens: int | None = None,
max_tokens: int | None = None,
min_tokens: Optional[int] = None,
max_tokens: Optional[int] = None,
) -> ServerResponse:
payload = {
"model": model,
@ -242,9 +250,9 @@ async def send_request(
timeout = aiohttp.ClientTimeout(total=timeout_sec)
valid_response = True
ttft: float | None = None
ttft: Optional[float] = None
chunk_delay: list[int] = []
latency: float | None = None
latency: Optional[float] = None
first_chunk = ""
generated_text = ""
@ -261,7 +269,7 @@ async def send_request(
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
if chunk == "[DONE]":
# End of stream
latency = time.perf_counter_ns() - start_time
@ -356,7 +364,7 @@ async def send_turn(
req_args: RequestArgs,
verbose: bool,
verify_output: bool,
) -> RequestStats | None:
) -> Optional[RequestStats]:
assert messages_to_use > 0
assert messages_to_use <= len(conversation_messages)
@ -761,7 +769,7 @@ def get_client_config(
"Number of conversations must be equal or larger than the number of clients"
)
max_req_per_client: int | None = None
max_req_per_client: Optional[int] = None
if args.max_num_requests is not None:
# Max number of requests per client
req_per_client = args.max_num_requests // args.num_clients
@ -1024,7 +1032,7 @@ def process_statistics(
warmup_percentages: list[float],
test_params: dict,
verbose: bool,
gen_conv_args: GenConvArgs | None = None,
gen_conv_args: Optional[GenConvArgs] = None,
excel_output: bool = False,
) -> None:
if len(client_metrics) == 0:

49
benchmarks/pyproject.toml Normal file
View File

@ -0,0 +1,49 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
[tool.ruff]
line-length = 88
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
"UP",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
"I",
# flake8-logging-format
"G",
]
ignore = [
# star imports
"F405", "F403",
# lambda expression assignment
"E731",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
]
[tool.ruff.lint.isort]
known-first-party = ["vllm"]
[tool.ruff.format]
docstring-code-format = true
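
For instance, with UP007 left in the ignore list above, ruff will not rewrite pre-PEP 604 annotations in the benchmark scripts, so code like the following (a hypothetical snippet, not from this repository) passes lint unchanged:
from typing import Optional

def max_turns_or_default(value: Optional[int], default: int = 8) -> int:
    # With UP007 ignored, Optional[int] is not flagged for rewriting to
    # "int | None", keeping annotations usable on pre-3.10 interpreters.
    return default if value is None else value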

View File

@ -198,24 +198,13 @@ else()
endif()
if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
FetchContent_Declare(
oneDNN
SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
)
else()
message(STATUS "Downloading oneDNN from GitHub")
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.9
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
endif()
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.9
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
if(USE_ACL)
find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
@ -224,7 +213,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
endif()
set(ONEDNN_AARCH64_USE_ACL "ON")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
add_compile_definitions(VLLM_USE_ACL)
endif()
set(ONEDNN_LIBRARY_TYPE "STATIC")
@ -320,4 +308,4 @@ define_gpu_extension_target(
WITH_SOABI
)
message(STATUS "Enabling C extension.")
message(STATUS "Enabling C extension.")

View File

@ -1,97 +0,0 @@
include(FetchContent)
set(CUTLASS_INCLUDE_DIR "${CUTLASS_INCLUDE_DIR}" CACHE PATH "Path to CUTLASS include/ directory")
if(DEFINED ENV{QUTLASS_SRC_DIR})
set(QUTLASS_SRC_DIR $ENV{QUTLASS_SRC_DIR})
endif()
if(QUTLASS_SRC_DIR)
FetchContent_Declare(
qutlass
SOURCE_DIR ${QUTLASS_SRC_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
else()
FetchContent_Declare(
qutlass
GIT_REPOSITORY https://github.com/IST-DASLab/qutlass.git
GIT_TAG 830d2c4537c7396e14a02a46fbddd18b5d107c65
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
FetchContent_Populate(qutlass)
set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
endif()
if(NOT qutlass_SOURCE_DIR)
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
endif()
message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")
cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS)
if(QUTLASS_ARCHS MATCHES "10\\.0a")
set(QUTLASS_TARGET_CC 100)
elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
set(QUTLASS_TARGET_CC 120)
else()
message(FATAL_ERROR "[QUTLASS] internal error parsing CUDA_ARCHS='${QUTLASS_ARCHS}'.")
endif()
set(QUTLASS_SOURCES
${qutlass_SOURCE_DIR}/qutlass/csrc/bindings.cpp
${qutlass_SOURCE_DIR}/qutlass/csrc/gemm.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/gemm_ada.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx_sm100.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv_sm100.cu
)
set(QUTLASS_INCLUDES
${qutlass_SOURCE_DIR}
${qutlass_SOURCE_DIR}/qutlass
${qutlass_SOURCE_DIR}/qutlass/csrc/include
${qutlass_SOURCE_DIR}/qutlass/csrc/include/cutlass_extensions
)
if(CUTLASS_INCLUDE_DIR AND EXISTS "${CUTLASS_INCLUDE_DIR}/cutlass/cutlass.h")
list(APPEND QUTLASS_INCLUDES "${CUTLASS_INCLUDE_DIR}")
elseif(EXISTS "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include/cutlass/cutlass.h")
list(APPEND QUTLASS_INCLUDES "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include")
message(STATUS "[QUTLASS] Using QuTLASS vendored CUTLASS headers (no vLLM CUTLASS detected).")
else()
message(FATAL_ERROR "[QUTLASS] CUTLASS headers not found. "
"Set -DCUTLASS_INCLUDE_DIR=/path/to/cutlass/include")
endif()
set_gencode_flags_for_srcs(
SRCS "${QUTLASS_SOURCES}"
CUDA_ARCHS "${QUTLASS_ARCHS}"
)
target_sources(_C PRIVATE ${QUTLASS_SOURCES})
target_include_directories(_C PRIVATE ${QUTLASS_INCLUDES})
target_compile_definitions(_C PRIVATE
QUTLASS_DISABLE_PYBIND=1
TARGET_CUDA_ARCH=${QUTLASS_TARGET_CC}
)
set_property(SOURCE ${QUTLASS_SOURCES} APPEND PROPERTY COMPILE_OPTIONS
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr --use_fast_math -O3>
)
else()
if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "12.8")
message(STATUS
"[QUTLASS] Skipping build: CUDA 12.8 or newer is required (found ${CMAKE_CUDA_COMPILER_VERSION}).")
else()
message(STATUS
"[QUTLASS] Skipping build: no supported arch (12.0a / 10.0a) found in "
"CUDA_ARCHS='${CUDA_ARCHS}'.")
endif()
endif()

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 8f468e7da54a8e2f98abfa7c38636aac91c0cba1
GIT_TAG 4695e6bed5366c41e28c06cd86170166e4f43d00
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -16,7 +16,7 @@ import shutil
from torch.utils.hipify.hipify_python import hipify
if __name__ == "__main__":
if __name__ == '__main__':
parser = argparse.ArgumentParser()
# Project directory where all the source + include files live.
@ -34,14 +34,15 @@ if __name__ == "__main__":
)
# Source files to convert.
parser.add_argument(
"sources", help="Source files to hipify.", nargs="*", default=[]
)
parser.add_argument("sources",
help="Source files to hipify.",
nargs="*",
default=[])
args = parser.parse_args()
# Limit include scope to project_dir only
includes = [os.path.join(args.project_dir, "*")]
includes = [os.path.join(args.project_dir, '*')]
# Get absolute path for all source files.
extra_files = [os.path.abspath(s) for s in args.sources]
@ -50,31 +51,25 @@ if __name__ == "__main__":
# The directory might already exist to hold object files so we ignore that.
shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
hipify_result = hipify(
project_directory=args.project_dir,
output_directory=args.output_dir,
header_include_dirs=[],
includes=includes,
extra_files=extra_files,
show_detailed=True,
is_pytorch_extension=True,
hipify_extra_files_only=True,
)
hipify_result = hipify(project_directory=args.project_dir,
output_directory=args.output_dir,
header_include_dirs=[],
includes=includes,
extra_files=extra_files,
show_detailed=True,
is_pytorch_extension=True,
hipify_extra_files_only=True)
hipified_sources = []
for source in args.sources:
s_abs = os.path.abspath(source)
hipified_s_abs = (
hipify_result[s_abs].hipified_path
if (
s_abs in hipify_result
and hipify_result[s_abs].hipified_path is not None
)
else s_abs
)
hipified_s_abs = (hipify_result[s_abs].hipified_path if
(s_abs in hipify_result
and hipify_result[s_abs].hipified_path is not None)
else s_abs)
hipified_sources.append(hipified_s_abs)
assert len(hipified_sources) == len(args.sources)
assert (len(hipified_sources) == len(args.sources))
# Print hipified source files.
print("\n".join(hipified_sources))

View File

@ -28,10 +28,10 @@
#ifdef USE_ROCM
#include <hip/hip_bf16.h>
#include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
#include "../quantization/fp8/amd/quant_utils.cuh"
typedef __hip_bfloat16 __nv_bfloat16;
#else
#include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh"
#include "../quantization/fp8/nvidia/quant_utils.cuh"
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))

View File

@ -64,11 +64,3 @@ void indexer_k_quant_and_cache(
torch::Tensor& slot_mapping, // [num_tokens]
int64_t quant_block_size, // quantization block size
const std::string& scale_fmt);
// Extract function to gather quantized K cache
void cp_gather_indexer_k_quant_cache(
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
torch::Tensor& dst_k, // [num_tokens, head_dim]
torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4]
const torch::Tensor& block_table, // [batch_size, num_blocks]
const torch::Tensor& cu_seq_lens); // [batch_size + 1]

View File

@ -9,14 +9,14 @@
#include "quantization/vectorization_utils.cuh"
#ifdef USE_ROCM
#include "quantization/w8a8/fp8/amd/quant_utils.cuh"
#include "quantization/fp8/amd/quant_utils.cuh"
#else
#include "quantization/w8a8/fp8/nvidia/quant_utils.cuh"
#include "quantization/fp8/nvidia/quant_utils.cuh"
#endif
#include <algorithm>
#include <cassert>
#include <cfloat>
#include <cfloat> // FLT_MIN
#ifdef USE_ROCM
#include <hip/hip_bf16.h>
@ -479,7 +479,6 @@ __global__ void concat_and_cache_ds_mla_kernel(
// Compute the scale for the tile
float tile_scale = max_abs / 448.f;
tile_scale = fmaxf(tile_scale, FLT_MIN);
// The first lane of each half-warp writes the scale to kv_cache
if ((lane_idx == 0) || (lane_idx == 16)) {
@ -572,70 +571,6 @@ __global__ void indexer_k_quant_and_cache_kernel(
}
}
template <int BLOCK_Y_SIZE>
__global__ void cp_gather_indexer_k_quant_cache_kernel(
const char* __restrict__ kv_cache, // [num_blocks, block_size,
// cache_stride]
char* __restrict__ dst_k, // [num_tokens, head_dim]
char* __restrict__ dst_scale, // [num_tokens, head_dim / quant_block_size *
// 4]
const int* __restrict__ block_table, // [batch_size, num_blocks]
const int* __restrict__ cu_seq_lens, // [batch_size + 1]
const int batch_size, // batch size
const int64_t token_stride, // stride for each token in dst_k
const int64_t head_dim, // dimension of each head
const int64_t block_stride, // stride for each block in kv_cache
const int64_t cache_token_stride, // stride for each token in kv_cache
const int64_t cache_block_size, // num_tokens for each block in kv_cache
const int num_blocks, // number of blocks
const int num_tokens, // number of tokens
const int quant_block_size // quantization block size
) {
constexpr int VEC_SIZE = sizeof(float4) / sizeof(char);
const int token_idx = blockIdx.x * blockDim.y + threadIdx.y;
const int head_idx = (blockIdx.y * blockDim.x + threadIdx.x) * VEC_SIZE;
// Find batch index within a block
__shared__ int batch_idx[BLOCK_Y_SIZE];
for (int iter = 0; iter < cuda_utils::ceil_div(batch_size, int(blockDim.x));
iter++) {
int tid = iter * blockDim.x + threadIdx.x;
if (tid < batch_size) {
const int seq_start = cu_seq_lens[tid];
const int seq_end = cu_seq_lens[tid + 1];
if (token_idx >= seq_start && token_idx < seq_end) {
batch_idx[threadIdx.y] = tid;
}
}
}
#ifndef USE_ROCM
__syncwarp();
#endif
if (head_idx >= head_dim || token_idx >= num_tokens) {
return;
}
const int inbatch_seq_idx = token_idx - cu_seq_lens[batch_idx[threadIdx.y]];
const int block_idx = block_table[batch_idx[threadIdx.y] * num_blocks +
inbatch_seq_idx / cache_block_size];
const int64_t src_block_offset = block_idx * block_stride;
const int64_t cache_inblock_offset =
(inbatch_seq_idx % cache_block_size) * head_dim + head_idx;
const int64_t src_inblock_offset = src_block_offset + cache_inblock_offset;
const int64_t dst_inblock_offset = token_idx * token_stride + head_idx;
reinterpret_cast<float4*>(dst_k)[dst_inblock_offset / VEC_SIZE] =
reinterpret_cast<const float4*>(kv_cache)[src_inblock_offset / VEC_SIZE];
if (threadIdx.x == 0) {
const int64_t src_scale_offset =
src_block_offset + cache_block_size * head_dim +
cache_inblock_offset * 4 / quant_block_size;
reinterpret_cast<float*>(dst_scale)[dst_inblock_offset / quant_block_size] =
reinterpret_cast<const float*>(kv_cache)[src_scale_offset / 4];
}
}
} // namespace vllm
// KV_T is the data type of key and value tensors.
@ -1237,59 +1172,3 @@ void indexer_k_quant_and_cache(
DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
CALL_INDEXER_K_QUANT_AND_CACHE);
}
// Macro to dispatch the kernel based on the data amount.
#define CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(BLOCK_Y_SIZE) \
vllm::cp_gather_indexer_k_quant_cache_kernel<BLOCK_Y_SIZE> \
<<<dim3((num_tokens + BLOCK_Y_SIZE - 1) / BLOCK_Y_SIZE, \
(head_dim + 8 * vec_size - 1) / (8 * vec_size)), \
dim3(8, BLOCK_Y_SIZE), 0, stream>>>( \
reinterpret_cast<char*>(kv_cache.data_ptr()), \
reinterpret_cast<char*>(dst_k.data_ptr()), \
reinterpret_cast<char*>(dst_scale.data_ptr()), \
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
batch_size, dst_k.stride(0), dst_k.size(1), kv_cache.stride(0), \
kv_cache.stride(1), kv_cache.size(1), block_table.size(1), \
num_tokens, quant_block_size);
void cp_gather_indexer_k_quant_cache(
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
torch::Tensor& dst_k, // [num_tokens, head_dim]
torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4]
const torch::Tensor& block_table, // [batch_size, num_blocks]
const torch::Tensor& cu_seq_lens // [batch_size + 1]
) {
int batch_size = block_table.size(0);
int num_tokens = dst_k.size(0);
int head_dim = dst_k.size(1);
int quant_block_size = head_dim * 4 / dst_scale.size(1);
TORCH_CHECK(kv_cache.device() == dst_k.device(),
"kv_cache and dst_k must be on the same device");
TORCH_CHECK(kv_cache.device() == dst_scale.device(),
"kv_cache and dst_scale must be on the same device");
TORCH_CHECK(kv_cache.device() == block_table.device(),
"kv_cache and block_table must be on the same device");
TORCH_CHECK(kv_cache.device() == cu_seq_lens.device(),
"kv_cache and cu_seq_lens must be on the same device");
TORCH_CHECK(head_dim % quant_block_size == 0,
"head_dim must be divisible by quant_block_size");
constexpr int vec_size = 16;
const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_cache));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
if (num_tokens < 32) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(1);
} else if (num_tokens < 64) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(2);
} else if (num_tokens < 128) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(4);
} else if (num_tokens < 256) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(8);
} else if (num_tokens < 512) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(16);
} else {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
}
}

View File

@ -8,12 +8,9 @@ namespace vllm {
// vllm_kernel_override_batch_invariant(); returns true
// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
inline bool vllm_kernel_override_batch_invariant() {
static bool cached = []() {
std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
const char* val = std::getenv(env_key.c_str());
return (val && std::atoi(val) != 0) ? 1 : 0;
}();
return cached;
std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
const char* val = std::getenv(env_key.c_str());
return (val && std::atoi(val) != 0) ? 1 : 0;
}
} // namespace vllm

View File

@ -137,8 +137,9 @@ DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler(
}
void DNNLMatMulPrimitiveHandler::prepack_weight(
void* original_b_ptr, dnnl::memory::desc original_b_md,
dnnl::memory::desc b_target_mem_desc) {
void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) {
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
{b_k_stride_, b_n_stride_});
dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr);
dnnl::memory packed_weight(b_target_mem_desc, default_engine());
{
@ -249,9 +250,7 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
if (a_qs_ == QuantizationStrategy::PER_TOKEN) {
assert(!use_azp_);
};
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
{b_k_stride_, b_n_stride_});
prepack_weight(args.b_ptr, original_b_md,
prepack_weight(args.b_ptr,
create_primitive_desc(
MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
.use_bias = false,
@ -413,25 +412,12 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
assert(ab_type_ == dnnl::memory::data_type::f32 ||
ab_type_ == dnnl::memory::data_type::bf16 ||
ab_type_ == dnnl::memory::data_type::f16);
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
{b_k_stride_, b_n_stride_});
prepack_weight(args.b_ptr, original_b_md,
prepack_weight(args.b_ptr,
create_primitive_desc(
MSizeCacheKey{
#ifdef VLLM_USE_ACL
// Arm Compute Library (ACL) backend for oneDNN does
// not support runtime
// dimensions, so we set M to a default value
.a_m_size = 128,
.a_m_stride = b_k_size_,
#else
.a_m_size = DNNL_RUNTIME_DIM_VAL,
.a_m_stride = DNNL_RUNTIME_DIM_VAL,
#endif
.use_bias = false,
.bias_type = dnnl::memory::data_type::undef},
MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
.a_m_stride = DNNL_RUNTIME_DIM_VAL,
.use_bias = false,
.bias_type = dnnl::memory::data_type::undef},
true)
.weights_desc());
init_runtime_memory_cache(args);
@ -457,30 +443,12 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
c_storage->set_data_handle((void*)args.c_ptr);
c_mem_desc->dims[0] = args.a_m_size;
#ifndef VLLM_USE_ACL
// Bias is not supported directly by the ACL backend of oneDNN; we handle it by:
// 1. copying it into the result tensor
// 2. attaching a fused-sum post-op to the matmul primitive
if (args.use_bias) {
auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
bias_storage->set_data_handle((void*)args.bias_ptr);
}
#endif
dnnl::matmul matmul = get_matmul_cache(args);
// With the ACL backend of oneDNN, the required memory format might change when
// the source tensor dims change. This does not really happen in practice, so it
// isn't a performance hit, but we need to support it because the API allows it.
#ifdef VLLM_USE_ACL
auto new_expected_wei_desc =
dnnl::matmul::primitive_desc(
const_cast<dnnl_primitive_desc_t>(matmul.get_primitive_desc()))
.weights_desc();
if (new_expected_wei_desc != b_target_mem_desc_) {
prepack_weight(memory_cache_[DNNL_ARG_WEIGHTS].get_data_handle(),
b_target_mem_desc_, new_expected_wei_desc);
}
#endif
dnnl::matmul matmul = get_matmul_cache(args);
auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
scratchpad_storage->set_data_handle(
@ -516,13 +484,7 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
} else {
a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
{key.a_m_stride, 1});
#ifdef VLLM_USE_ACL
// ACL's backend of oneDNN always expects the weight format to be "any"
b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
dnnl::memory::format_tag::any);
#else
b_md = b_target_mem_desc_;
#endif
}
dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
dnnl::memory::format_tag::ab);
@ -532,18 +494,8 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
if (key.use_bias) {
dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
// Since ACL's matmuls don't support passing a bias_md, we apply the bias
// through a fused-sum post-op
#ifdef VLLM_USE_ACL
dnnl::post_ops post_ops;
post_ops.append_sum();
attr.set_post_ops(post_ops);
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
attr);
#else
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
c_md, attr);
#endif
} else {
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
attr);
@ -559,23 +511,13 @@ void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
default_engine(), nullptr);
set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
// ACL matmuls don't support bias_md, so we don't need these
#ifndef VLLM_USE_ACL
memory_cache_[DNNL_ARG_BIAS] =
dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
default_engine(), nullptr);
set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
#endif
memory_cache_[DNNL_ARG_SCRATCHPAD] =
dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
default_engine(), nullptr);
set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
}
bool is_onednn_acl_supported() {
#ifdef VLLM_USE_ACL
return true;
#else
return false;
#endif
}

View File

@ -101,7 +101,7 @@ class DNNLMatMulPrimitiveHandler {
protected:
DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type);
void prepack_weight(void* original_b_ptr, dnnl::memory::desc original_b_md,
void prepack_weight(void* original_b_ptr,
dnnl::memory::desc b_target_mem_desc);
void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr);

View File

@ -527,42 +527,21 @@ void onednn_mm(torch::Tensor& c, // [M, OC], row-major
MatMulPrimitiveHandler* ptr =
reinterpret_cast<MatMulPrimitiveHandler*>(handler);
// ACL matmuls expect contiguous source tensors
#ifdef VLLM_USE_ACL
torch::Tensor a_contig = a.contiguous();
#endif
MatMulPrimitiveHandler::ExecArgs exec_args;
#ifdef VLLM_USE_ACL
exec_args.a_m_size = a_contig.size(0);
exec_args.a_m_stride = a_contig.stride(0);
#else
exec_args.a_m_size = a.size(0);
exec_args.a_m_stride = a.stride(0);
#endif
VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] {
if (bias.has_value()) {
exec_args.use_bias = true;
exec_args.bias_type = get_dnnl_type<scalar_t>();
#ifdef VLLM_USE_ACL
// ACL matmuls in oneDNN do not support a bias.
// We handle a matmul with bias by doing: c = bias; c += matmul(a, b)
c.copy_(bias.value());
#else
exec_args.bias_ptr = bias->data_ptr<scalar_t>();
#endif
} else {
exec_args.use_bias = false;
exec_args.bias_type = get_dnnl_type<void>();
exec_args.bias_ptr = nullptr;
}
#ifdef VLLM_USE_ACL
exec_args.a_ptr = a_contig.data_ptr<scalar_t>();
#else
exec_args.a_ptr = a.data_ptr<scalar_t>();
#endif
exec_args.c_ptr = c.data_ptr<scalar_t>();
ptr->execute(exec_args);

View File

@ -27,8 +27,6 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
const std::optional<torch::Tensor>& bias, int64_t handler);
bool is_onednn_acl_supported();
void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
torch::Tensor& kv_cache, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens);
@ -183,9 +181,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"int handler) -> ()");
ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
// Check if oneDNN was built with ACL backend
ops.def("is_onednn_acl_supported() -> bool", &is_onednn_acl_supported);
// Create oneDNN W8A8 handler
ops.def(
"create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "

View File

@ -12,7 +12,6 @@ using CubMaxOp = cub::Max;
#endif // CUB_VERSION
#else
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
using CubAddOp = hipcub::Sum;
using CubMaxOp = hipcub::Max;
using CubAddOp = cub::Sum;
using CubMaxOp = cub::Max;
#endif // USE_ROCM

View File

@ -27,7 +27,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
**{
VLLMDataType.u4b8: "u4b8",
VLLMDataType.u8b128: "u8b128",
},
}
}
VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
@ -35,7 +35,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
**{
VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t",
},
}
}
VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
@ -43,7 +43,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
**{
VLLMDataType.u4b8: 4,
VLLMDataType.u8b128: 8,
},
}
}
VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
@ -67,13 +67,15 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
DataType.f32: "at::ScalarType::Float",
}
VLLMKernelScheduleTag: dict[
Union[MixedInputKernelScheduleType, KernelScheduleType], str
] = {
**KernelScheduleTag, # type: ignore
**{
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501
MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong", # noqa: E501
MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative", # noqa: E501
},
}
VLLMKernelScheduleTag: dict[Union[
MixedInputKernelScheduleType, KernelScheduleType], str] = {
**KernelScheduleTag, # type: ignore
**{
MixedInputKernelScheduleType.TmaWarpSpecialized:
"cutlass::gemm::KernelTmaWarpSpecialized",
MixedInputKernelScheduleType.TmaWarpSpecializedPingpong:
"cutlass::gemm::KernelTmaWarpSpecializedPingpong",
MixedInputKernelScheduleType.TmaWarpSpecializedCooperative:
"cutlass::gemm::KernelTmaWarpSpecializedCooperative",
}
}

View File

@ -6,7 +6,7 @@
*/
#include "type_convert.cuh"
#include "quantization/w8a8/fp8/common.cuh"
#include "quantization/fp8/common.cuh"
#include "dispatch_utils.h"
#include "cub_helpers.h"
#include "core/batch_invariant.hpp"

View File

@ -17,30 +17,25 @@ FILE_HEAD = """
namespace MARLIN_NAMESPACE_NAME {
""".strip()
TEMPLATE = (
"template __global__ void Marlin<"
"{{scalar_t}}, "
"{{w_type_id}}, "
"{{s_type_id}}, "
"{{threads}}, "
"{{thread_m_blocks}}, "
"{{thread_n_blocks}}, "
"{{thread_k_blocks}}, "
"{{'true' if m_block_size_8 else 'false'}}, "
"{{stages}}, "
"{{group_blocks}}, "
"{{'true' if is_zp_float else 'false'}}>"
"( MARLIN_KERNEL_PARAMS );"
)
TEMPLATE = ("template __global__ void Marlin<"
"{{scalar_t}}, "
"{{w_type_id}}, "
"{{s_type_id}}, "
"{{threads}}, "
"{{thread_m_blocks}}, "
"{{thread_n_blocks}}, "
"{{thread_k_blocks}}, "
"{{'true' if m_block_size_8 else 'false'}}, "
"{{stages}}, "
"{{group_blocks}}, "
"{{'true' if is_zp_float else 'false'}}>"
"( MARLIN_KERNEL_PARAMS );")
# int8 with zero point case (vllm::kU8) is also supported,
# we don't add it to reduce wheel size.
SCALAR_TYPES = [
"vllm::kU4",
"vllm::kU4B8",
"vllm::kU8B128",
"vllm::kFE4M3fn",
"vllm::kFE2M1f",
"vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
"vllm::kFE2M1f"
]
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
@ -63,12 +58,11 @@ def generate_new_kernels():
all_template_str_list = []
for group_blocks, m_blocks, thread_configs in itertools.product(
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
):
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
# act order case only support gptq-int4 and gptq-int8
if group_blocks == 0 and scalar_type not in [
"vllm::kU4B8",
"vllm::kU8B128",
"vllm::kU4B8", "vllm::kU8B128"
]:
continue
if thread_configs[2] == 256:

View File

@ -100,11 +100,6 @@ void apply_repetition_penalties_(torch::Tensor& logits,
const torch::Tensor& output_mask,
const torch::Tensor& repetition_penalties);
void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
const torch::Tensor& rowEnds, torch::Tensor& indices,
torch::Tensor& values, int64_t numRows, int64_t stride0,
int64_t stride1);
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
torch::Tensor& weight, torch::Tensor& scale,
double epsilon);
@ -138,12 +133,12 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& input_global_scale);
#endif
void persistent_masked_m_silu_mul_quant(
void silu_mul_fp8_quant_deep_gemm_cuda(
const at::Tensor& input, // (E, T, 2*H)
const at::Tensor& counts, // (E)
at::Tensor& y_q, // (E, T, H) [OUT]
at::Tensor& y_s, // (E, T, H//group_size) [OUT]
bool use_ue8m0);
int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens);
void mul_and_silu(torch::Tensor& out, torch::Tensor& input);

View File

@ -7,7 +7,7 @@
#include "../cuda_compat.h"
#include "dispatch_utils.h"
#include "quantization/w8a8/fp8/common.cuh"
#include "quantization/fp8/common.cuh"
#include <c10/util/Float8_e4m3fn.h>
@ -114,22 +114,13 @@ __global__ void act_and_mul_quant_kernel(
}
__device__ __forceinline__ float silu(float x) {
return __fdividef(x, (1.f + expf(-x)));
return (__fdividef(x, (1.f + expf(-x))));
}
__device__ __forceinline__ float2 silu2(float2 x) {
return make_float2(silu(x.x), silu(x.y));
}
__device__ __forceinline__ __nv_bfloat162 silu2_v2(float2 x) {
#ifndef USE_ROCM
return make_bfloat162(__float2bfloat16_rn(silu(x.x)),
__float2bfloat16_rn(silu(x.y)));
#else
return __float22bfloat162_rn(make_float2(silu(x.x), silu(x.y)));
#endif
}
#ifndef USE_ROCM
__device__ __forceinline__ float warp_max(float v) {
static constexpr unsigned FULL_MASK = 0xffffffffu;
@ -232,308 +223,224 @@ constexpr __nv_bfloat16 get_fp8_min() {
return __nv_bfloat16(__nv_bfloat16_raw{.x = 50032});
}
}
template <typename Idx_t>
__device__ __forceinline__ int warp_expert_search(
int idx, int n, const Idx_t* __restrict__ input, Idx_t val) {
const Idx_t* input_ptr = input + idx;
int base_offset = 0;
for (;;) {
bool move_on = (idx < n && *input_ptr <= val);
unsigned mask = __ballot_sync(0xffffffff, move_on);
if (mask != 0xffffffffu) {
int last_lane = 31 - __clz(mask);
return base_offset + last_lane;
}
input_ptr += 32;
base_offset += 32;
idx += 32;
}
}
template <int num_parallel_tokens>
__device__ __forceinline__ void token_bounds(int32_t n_tokens,
int32_t worker_id,
int32_t& n_tokens_lower,
int32_t& n_tokens_upper) {
if (n_tokens < num_parallel_tokens && worker_id < n_tokens) {
if (worker_id >= num_parallel_tokens) return;
n_tokens_lower = worker_id;
n_tokens_upper = worker_id + 1;
} else {
int32_t chunk_size = n_tokens / num_parallel_tokens;
int32_t residual = n_tokens - chunk_size * num_parallel_tokens;
auto calc_id = [&](int32_t id) {
if (id < residual)
return min(n_tokens, id * (chunk_size + 1));
else
return min(n_tokens, id * chunk_size + residual);
};
n_tokens_lower = calc_id(worker_id);
n_tokens_upper = calc_id(worker_id + 1);
}
}
template <int BLOCK_COUNT, int SMEM_SIZE_BYTES_Y, typename fp8_type,
int THREADS, typename Idx_t, bool USE_UE8M0, int GROUP_SIZE = 128,
#ifndef USE_ROCM
template <typename fp8_type, int32_t NUM_WARPS, typename Idx_t,
int NUM_PARALLEL_TOKENS, bool USE_UE8M0, int GROUP_SIZE = 128,
int NUM_STAGES = 3>
__global__ void silu_mul_fp8_quant_deep_gemm_kernel(
const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q,
float* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert,
float* __restrict__ _y_s, const int32_t* __restrict__ counts,
// sizes
Idx_t E, Idx_t T, Idx_t H,
int H, int G,
// strides (in elements)
Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e,
Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t,
Idx_t stride_ys_g, Idx_t stride_counts_e) {
#ifndef USE_ROCM
static constexpr int NUM_WARPS = THREADS / WARP_SIZE;
static constexpr int LOAD_STAGE_SIZE = 2 * GROUP_SIZE / 8;
static constexpr int LOAD_STAGE_MOD = NUM_STAGES * LOAD_STAGE_SIZE;
static constexpr int COMPUTE_STAGE_SIZE = 2 * GROUP_SIZE / 4;
static constexpr int COMPUTE_STAGE_MOD = COMPUTE_STAGE_SIZE * NUM_STAGES;
extern __shared__ __align__(16) __int128_t smem_128[];
int* s_expert_offsets =
reinterpret_cast<int*>(smem_128 + (SMEM_SIZE_BYTES_Y / 16));
static constexpr __nv_bfloat16 fp8_min = get_fp8_min<fp8_type>();
static constexpr __nv_bfloat16 fp8_max = get_fp8_max<fp8_type>();
// We assign EPS with it's 16-bit unsigned counterpart to allow constexpr.
// We assign EPS with its 16-bit unsigned counterpart to allow constexpr.
static constexpr __nv_bfloat16 EPS = (__nv_bfloat16_raw{.x = 11996});
int tid = threadIdx.x;
int warp_id = tid >> 5;
int lane_id = tid & 0x1f;
int running_sum{};
if (!warp_id) {
for (int i = 0; i < E; i += WARP_SIZE) {
bool valid = (i + threadIdx.x) < E;
int value =
(valid ? tokens_per_expert[i + threadIdx.x * stride_counts_e] : 0) +
(!lane_id ? running_sum : 0);
// We pack 8 16-bit bfloat16 values into a 128-bit __int128_t.
static constexpr int32_t BFLOAT16_PER_GROUP = 8;
for (int offset = 1; offset < 32; offset *= 2) {
int n = __shfl_up_sync(0xFFFFFFFFu, value, offset);
if (lane_id >= offset) value += n;
}
// We split the shared memory in half, corresponding to gate and up matrices:
// [...gate_i, ...up_i] where 0 <= i < stages.
static constexpr int32_t S_NUM_128 =
2u * (GROUP_SIZE / BFLOAT16_PER_GROUP) * NUM_WARPS * NUM_STAGES;
static constexpr auto THREAD_COUNT = NUM_WARPS * WARP_SIZE;
static constexpr int HALF_THREAD_COUNT = THREAD_COUNT / 2;
static constexpr int32_t S_NUM_64 = S_NUM_128 * 2;
__shared__ __int128_t __align__(16) s_buff_128[S_NUM_128];
if (valid) {
s_expert_offsets[i + threadIdx.x + 1] = value;
}
const int32_t tid = threadIdx.x;
const int32_t warp_id = tid / WARP_SIZE;
const int32_t lane_id = tid % WARP_SIZE;
running_sum = __shfl_sync(0xFFFFFFFFu, value, WARP_SIZE - 1);
}
auto s_buff_compute_32 = reinterpret_cast<__nv_bfloat162*>(s_buff_128);
if (!lane_id) {
s_expert_offsets[0] = 0;
}
// block handles one (expert e, group g)
int32_t pid = blockIdx.x;
int32_t e = pid / G;
int32_t g = pid % G;
const int32_t n_tokens = counts[e * stride_counts_e];
if (!n_tokens) {
return; // Exit ASAP.
}
__syncthreads();
const Idx_t stride_i_t_128 = stride_i_t / 8u;
int32_t total_tokens = s_expert_offsets[E];
int32_t n_tokens_lower, n_tokens_upper;
const int warp_position_yq = warp_id * (H / NUM_WARPS);
const int warp_position_scales = warp_id * (H / (GROUP_SIZE * NUM_WARPS));
// A single block will handle tokens_per_block tokens.
// Each block i iterates over tokens of a slice of n_tokens =
// expert_counts[i], with the size of chunk being
// (n_tokens / NUM_PARALLEL_TOKENS) + residual, instead of
// updiv(n_tokens, NUM_PARALLEL_TOKENS) for better scheduling.
// Each warp will get space to store its hidden dim for gate and up.
__int128_t* s_hidden_load = smem_128 + warp_id * ((2 * 128 / 8) * NUM_STAGES);
__int128_t* smem_load_ptr = s_hidden_load + lane_id;
const __nv_bfloat16 fp8_inv = __hdiv(__float2bfloat16(1.f), fp8_max);
int32_t compute_pipeline_offset_64 = 0;
int32_t load_stage_offset{};
const __nv_bfloat16 one_bf16 = __float2bfloat16_rn(1.f);
__int64_t* smem_compute_ptr = reinterpret_cast<__int64_t*>(smem_128) +
warp_id * (2 * (GROUP_SIZE / 4) * NUM_STAGES) +
lane_id;
__int64_t* s_gate64_ptr = smem_compute_ptr;
__int64_t* s_up64_ptr = smem_compute_ptr + GROUP_SIZE / 4;
int tokens_lower, tokens_upper;
token_bounds<BLOCK_COUNT>(total_tokens, blockIdx.x, tokens_lower,
tokens_upper);
Idx_t expert_id{}, expert_offset{}, next_expert_offset{};
int token_id = tokens_lower;
int32_t t_load{};
if (token_id < tokens_upper) {
expert_id = warp_expert_search<int>(lane_id, E, s_expert_offsets, token_id);
expert_offset = s_expert_offsets[expert_id];
next_expert_offset = s_expert_offsets[expert_id + 1];
if (n_tokens < NUM_PARALLEL_TOKENS && blockIdx.y < n_tokens) {
// Specialize this, but it can likely be fused.
if (blockIdx.y >= NUM_PARALLEL_TOKENS) {
return;
}
n_tokens_lower = blockIdx.y;
n_tokens_upper = blockIdx.y + 1;
} else {
// This thread block has no work to do.
auto chunk_size = n_tokens / NUM_PARALLEL_TOKENS;
auto residual = n_tokens - chunk_size * NUM_PARALLEL_TOKENS;
auto calc_id = [&](int32_t id) {
if (id < residual) {
return min(n_tokens, id * (chunk_size + 1));
} else {
return min(n_tokens, id * chunk_size + residual);
}
};
n_tokens_lower = calc_id(blockIdx.y);
n_tokens_upper = calc_id(blockIdx.y + 1);
}
if (n_tokens_lower >= n_tokens_upper) {
return;
}
int t_load_bound = H / (GROUP_SIZE * NUM_WARPS);
// We do calculations here, using constexpr wherever possible.
const Idx_t base_i = e * stride_i_e + NUM_WARPS * g * GROUP_SIZE * stride_i_h;
const Idx_t base_ys = e * stride_ys_e + NUM_WARPS * g * stride_ys_g;
const Idx_t base_yq =
e * stride_yq_e + NUM_WARPS * g * GROUP_SIZE * stride_yq_h;
Idx_t gate_off_128 = (base_i / static_cast<Idx_t>(8u));
auto input_128_ptr = reinterpret_cast<const __int128_t*>(_input);
auto gate_128_ptr = input_128_ptr + gate_off_128 + (tid % HALF_THREAD_COUNT) +
stride_i_t_128 * n_tokens_lower;
auto up_128_ptr = gate_128_ptr + (H * stride_i_h) / 8u;
auto y_s_ptr =
_y_s + base_ys + warp_id * stride_ys_g + n_tokens_lower * stride_ys_t;
auto y_q_ptr = _y_q + base_yq + warp_id * GROUP_SIZE +
stride_yq_t * n_tokens_lower + 4 * lane_id;
int32_t t_load = n_tokens_lower, load_stage_id = 0;
auto s_buff_gate_load_128 = s_buff_128 + (tid % HALF_THREAD_COUNT);
auto s_buff_up_load_128 = s_buff_gate_load_128 + S_NUM_128 / 2u;
int32_t stage_offset{};
Idx_t base_i = ((expert_id * stride_i_e) / 8) +
(token_id - expert_offset) * stride_i_t / 8;
const Idx_t gate_warp_offset =
warp_id * ((stride_i_h * H) / (8 * NUM_WARPS)) + (lane_id & 0b1111);
const __int128_t* input_128_ptr =
reinterpret_cast<const __int128_t*>(_input) + gate_warp_offset +
((lane_id < 16) ? 0 : ((H * stride_i_h) / 8));
__int128_t* load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
auto token_offset = token_id - expert_offset;
static constexpr int32_t LOAD_STAGE_SIZE = (NUM_WARPS * WARP_SIZE / 2);
static constexpr int32_t LOAD_STAGE_MOD =
NUM_STAGES * (NUM_WARPS * WARP_SIZE / 2);
// Two halves of all threads in a block conduct global loads for gate and up,
// respectively.
auto load_and_advance_y_pred = [&] {
if (t_load < t_load_bound) {
// Here we are simply continuing to load data
// from the current token.
auto smem_load_ptr_staged = smem_load_ptr + load_stage_offset;
if (t_load < n_tokens_upper) {
auto s_gate_stage_128_staged_ptr = s_buff_gate_load_128 + stage_offset;
auto s_up_stage_128_staged_ptr = s_buff_up_load_128 + stage_offset;
// It is very important that LOAD_STAGE_SIZE is constexpr to avoid
// unnecessary ALU ops.
load_stage_offset += LOAD_STAGE_SIZE;
load_stage_offset %= LOAD_STAGE_MOD;
stage_offset += LOAD_STAGE_SIZE;
stage_offset %= LOAD_STAGE_MOD;
cp_async4(smem_load_ptr_staged, load_ptr);
load_ptr += GROUP_SIZE / 8;
++t_load;
} else if (token_id + 1 < tokens_upper) {
// We loaded everything from the current token, let's move on
// to the next one, and we checked that we have more tokens to load.
++token_id;
t_load = 0;
if (token_id >= next_expert_offset) {
// We need to find the next expert.
do {
// This is a loop because it's possible
// that some experts are assigned 0 tokens.
// NOTE: We are guaranteed that there's at least
// one more token left so we don't have to check for
// expert_id bounds.
++expert_id;
// This skips 1 memory read.
expert_offset = next_expert_offset;
next_expert_offset = s_expert_offsets[expert_id + 1];
} while (next_expert_offset == expert_offset);
base_i = expert_id * (stride_i_e / 8);
token_offset = 0;
load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
if (tid < HALF_THREAD_COUNT) {
cp_async4(s_gate_stage_128_staged_ptr, gate_128_ptr);
gate_128_ptr += stride_i_t_128;
} else {
// We remain within the same expert, so just
// move by H/4 __int128_t (2 * H/8).
base_i += stride_yq_t / 4;
token_offset++;
cp_async4(s_up_stage_128_staged_ptr, up_128_ptr);
up_128_ptr += stride_i_t_128;
}
load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
auto smem_load_ptr_staged = smem_load_ptr + load_stage_offset;
// It is very important that LOAD_STAGE_SIZE is constexpr to avoid
// unnecessary ALU ops.
load_stage_offset += LOAD_STAGE_SIZE;
load_stage_offset %= LOAD_STAGE_MOD;
cp_async4(smem_load_ptr_staged, load_ptr);
load_ptr += GROUP_SIZE / 8;
++t_load;
++load_stage_id;
}
// We fence even if there is nothing to load to simplify pipelining.
cp_async_fence();
};
// We need to warm-up the pipeline.
#pragma unroll
for (int i = 0; i < NUM_STAGES - 1; i++) {
load_and_advance_y_pred();
}
__nv_fp8x4_e4m3* y_q_base_ptr =
reinterpret_cast<__nv_fp8x4_e4m3*>(_y_q) + lane_id;
auto y_scale_base_ptr = _y_s + warp_position_scales * stride_ys_g;
__int64_t* s_gate_ptr = reinterpret_cast<__int64_t*>(
s_buff_compute_32 + warp_id * (GROUP_SIZE / 2)) +
lane_id;
__int64_t* s_up_ptr = s_gate_ptr + S_NUM_64 / 2;
for (auto j = tokens_lower; j < tokens_upper; j++) {
const Idx_t base_ys = expert_id * stride_ys_e;
auto y_s_ptr = y_scale_base_ptr + base_ys + token_offset * stride_ys_t;
__nv_fp8x4_e4m3* y_q_ptr =
y_q_base_ptr + (expert_id * stride_yq_e + token_offset * stride_yq_t +
warp_position_yq * stride_yq_h) /
4;
const int COMPUTE_LIMIT = H / (GROUP_SIZE * NUM_WARPS);
static constexpr int32_t STAGE_SIZE = (GROUP_SIZE * NUM_WARPS) / 4u;
static constexpr int32_t STAGE_MOD = STAGE_SIZE * NUM_STAGES;
for (int i = 0; i < COMPUTE_LIMIT; i++) {
cp_async_wait<NUM_STAGES - 2>();
__syncthreads();
load_and_advance_y_pred();
int32_t compute_pipeline_offset_64 = 0;
__int64_t* gate64_ptr = s_gate64_ptr + compute_pipeline_offset_64;
__int64_t* up64_ptr = s_up64_ptr + compute_pipeline_offset_64;
for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
__nv_bfloat162 results_bf162[2];
// COMPUTE_STAGE_SIZE/MOD must also be constexpr!
compute_pipeline_offset_64 += COMPUTE_STAGE_SIZE;
compute_pipeline_offset_64 %= COMPUTE_STAGE_MOD;
cp_async_wait<NUM_STAGES - 2>();
__syncthreads();
__int64_t gate64 = *gate64_ptr;
__int64_t up64 = *up64_ptr;
// We double-buffer pipelined loads so that the next load will
// concurrently run with compute without overwrites.
load_and_advance_y_pred();
// Compute
__nv_bfloat162 res[2];
__nv_bfloat162* s_up_comp = reinterpret_cast<__nv_bfloat162*>(&up64);
__nv_bfloat162* s_gate_comp = reinterpret_cast<__nv_bfloat162*>(&gate64);
auto s_gate_compute_64 = s_gate_ptr + compute_pipeline_offset_64;
auto s_up_compute_64 = s_up_ptr + compute_pipeline_offset_64;
// STAGE_SIZE must also be constexpr!
compute_pipeline_offset_64 += STAGE_SIZE;
compute_pipeline_offset_64 %= STAGE_MOD;
// Each thread loads (gate/up) 2X 4X bfloat16 values into registers.
__int64_t gate64 = *s_gate_compute_64;
__nv_bfloat162* s_gate_compute_32 =
reinterpret_cast<__nv_bfloat162*>(&gate64);
__int64_t up64 = *s_up_compute_64;
__nv_bfloat162* s_up_compute_32 = reinterpret_cast<__nv_bfloat162*>(&up64);
#pragma unroll
for (int32_t k = 0; k < 2; ++k) {
__nv_bfloat162 gate = silu2_v2(__bfloat1622float2(s_gate_comp[k]));
res[k] = __hmul2(gate, s_up_comp[k]);
}
auto _y_max2 = __hmax2(__habs2(res[0]), __habs2(res[1]));
_y_max2.x = __hmax(__hmax(_y_max2.x, _y_max2.y), EPS);
__nv_bfloat16 y_s = __hmul(warp_max(_y_max2.x), fp8_inv);
if constexpr (USE_UE8M0) {
y_s = hexp2(hceil(hlog2(y_s)));
}
__nv_bfloat16 inv_y = __hdiv(one_bf16, y_s);
auto y_s2 = make_bfloat162(inv_y, inv_y);
for (int i = 0; i < 2; i++) {
// For silu, we make sure that div is emitted.
float2 gate = silu2(__bfloat1622float2(s_gate_compute_32[i]));
results_bf162[i] = __float22bfloat162_rn(gate);
}
#pragma unroll
for (int32_t k = 0; k < 2; ++k) {
res[k] = clip(__hmul2(res[k], y_s2), __bfloat162bfloat162(fp8_min),
__bfloat162bfloat162(fp8_max));
}
for (int i = 0; i < 2; i++) {
results_bf162[i] = __hmul2(results_bf162[i], s_up_compute_32[i]);
}
*y_q_ptr = __nv_fp8x4_e4m3(res[0], res[1]);
y_q_ptr += WARP_SIZE * stride_yq_h;
auto _y_max2 =
__hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
if (!lane_id) {
*y_s_ptr = y_s;
y_s_ptr += stride_ys_g;
}
__nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y));
// An entire group is assigned to a single warp, so a simple warp reduce
// is used.
__nv_bfloat16 y_s = warp_max(y_max_bf16) / fp8_max;
if constexpr (USE_UE8M0) {
y_s = hexp2(hceil(hlog2(y_s)));
}
auto inv_y = __float2bfloat16_rn(1.f) / y_s;
auto y_s2 = make_bfloat162(inv_y, inv_y);
#pragma unroll
for (int32_t i = 0; i < 2; ++i) {
results_bf162[i] =
clip(__hmul2(results_bf162[i], y_s2), __bfloat162bfloat162(fp8_min),
__bfloat162bfloat162(fp8_max));
}
auto fp8x4 = __nv_fp8x4_e4m3(results_bf162[0], results_bf162[1]);
*reinterpret_cast<__nv_fp8x4_e4m3*>(y_q_ptr) = fp8x4;
y_q_ptr += stride_yq_t;
if (lane_id == 0) {
*y_s_ptr = y_s;
y_s_ptr += stride_ys_t;
}
}
#endif
}
#endif
} // namespace vllm
@ -568,14 +475,14 @@ void silu_and_mul_quant(torch::Tensor& out, // [..., d]
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
}
void persistent_masked_m_silu_mul_quant(
const at::Tensor& input, // (E, T, 2*H)
const at::Tensor& tokens_per_expert, // (E)
at::Tensor& y_q, // (E, T, H) [OUT]
at::Tensor& y_s, // (E, T, H//group_size) [OUT]
bool use_ue8m0) {
void silu_mul_fp8_quant_deep_gemm_cuda(
const at::Tensor& input, // (E, T, 2*H)
const at::Tensor& counts, // (E)
at::Tensor& y_q, // (E, T, H) [OUT]
at::Tensor& y_s, // (E, T, H//group_size) [OUT]
int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens) {
#ifndef USE_ROCM
// This kernel relies heavily on cp.async and fp8 support.
// This kernel currently only supports H % 128 == 0 and assumes a
// fixed GROUP_SIZE of 128.
TORCH_CHECK(input.dtype() == torch::kBFloat16);
@ -584,6 +491,10 @@ void persistent_masked_m_silu_mul_quant(
TORCH_CHECK(y_s.dtype() == torch::kFloat32);
TORCH_CHECK(input.size(-1) % 256 == 0);
// Check that num_parallel_tokens is of power of 2 and between 1 and 64.
TORCH_CHECK(1 <= num_parallel_tokens && num_parallel_tokens <= 64);
TORCH_CHECK(!(num_parallel_tokens & (num_parallel_tokens - 1)));
using Idx_t = int64_t;
Idx_t E = input.size(0);
@ -599,54 +510,81 @@ void persistent_masked_m_silu_mul_quant(
Idx_t stride_ys_t = y_s.stride(1);
Idx_t stride_ys_g = y_s.stride(2);
Idx_t stride_counts_e = tokens_per_expert.stride(0);
Idx_t stride_counts_e = counts.stride(0);
static constexpr int GROUP_SIZE = 128;
#define KERNEL_FN \
if (use_ue8m0) { \
vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t, \
NUM_PARALLEL_TOKENS, true> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \
(fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(), \
reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G, \
stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g, \
stride_counts_e); \
} else { \
vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t, \
NUM_PARALLEL_TOKENS, false> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \
(fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(), \
reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G, \
stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g, \
stride_counts_e); \
}
#define KERNEL_CALL_H \
if (H % (4 * GROUP_SIZE) == 0) { \
static constexpr int NUM_WARPS = 4; \
populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
KERNEL_FN \
} else { \
static constexpr int NUM_WARPS = 1; \
populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
KERNEL_FN \
}
#define KERNEL_CALL_TOP_LEVEL \
if (num_parallel_tokens == 1) { \
static constexpr int NUM_PARALLEL_TOKENS = 1; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 2) { \
static constexpr int NUM_PARALLEL_TOKENS = 2; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 4) { \
static constexpr int NUM_PARALLEL_TOKENS = 4; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 8) { \
static constexpr int NUM_PARALLEL_TOKENS = 8; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 16) { \
static constexpr int NUM_PARALLEL_TOKENS = 16; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 32) { \
static constexpr int NUM_PARALLEL_TOKENS = 32; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 64) { \
static constexpr int NUM_PARALLEL_TOKENS = 64; \
KERNEL_CALL_H \
}
Idx_t G;
dim3 block, grid;
auto populate_launch_params = [&](int num_warps, int _num_parallel_tokens) {
G = H / Idx_t(group_size * num_warps);
grid = dim3(E * G, _num_parallel_tokens);
block = dim3(num_warps * WARP_SIZE);
};
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
#define KERNEL(BLOCK_COUNT, USE_UE8M0, THREAD_COUNT, STAGES) \
static constexpr int NUM_WARPS = THREAD_COUNT / WARP_SIZE; \
int sms = SILU_V2_BLOCK_COUNT; \
static constexpr int max_shared_mem_bytes = \
GROUP_SIZE * 2 * STAGES * NUM_WARPS * 2; \
dim3 grid(sms), block(THREAD_COUNT); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
VLLM_DISPATCH_FP8_TYPES( \
y_q.scalar_type(), "silu_mul_fp8_quant_deep_gemm_kernel", [&] { \
vllm::silu_mul_fp8_quant_deep_gemm_kernel< \
BLOCK_COUNT, max_shared_mem_bytes, fp8_t, THREAD_COUNT, Idx_t, \
USE_UE8M0, GROUP_SIZE, STAGES> \
<<<grid, block, max_shared_mem_bytes + (E + 1) * 16, stream>>>( \
reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \
(fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(), \
reinterpret_cast<int32_t*>(tokens_per_expert.data_ptr()), E, \
T, H, stride_i_e, stride_i_t, stride_i_h, stride_yq_e, \
stride_yq_t, stride_yq_h, stride_ys_e, stride_ys_t, \
stride_ys_g, stride_counts_e); \
});
static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32;
if (!use_ue8m0) {
if (H >= 4096) {
static constexpr int NUM_STAGES = 4;
static constexpr int THREAD_COUNT = 256;
KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, NUM_STAGES);
} else {
static constexpr int THREAD_COUNT = 32;
KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, 2);
}
} else {
if (H >= 4096) {
static constexpr int NUM_STAGES = 4;
static constexpr int THREAD_COUNT = 256;
KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, NUM_STAGES);
} else {
static constexpr int THREAD_COUNT = 32;
KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, 2);
}
}
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
VLLM_DISPATCH_FP8_TYPES(y_q.scalar_type(),
"silu_mul_fp8_quant_deep_gemm_kernel",
[&] { KERNEL_CALL_TOP_LEVEL });
#endif
}
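For the launch geometry in the silu_mul_fp8_quant_deep_gemm_cuda variant above: each block carries num_warps warps with one 128-element group per warp, and num_parallel_tokens blocks cooperate along the token dimension, which is why it must be a power of two between 1 and 64. A small Python sketch of the same arithmetic (WARP_SIZE assumed to be 32, names illustrative):

WARP_SIZE = 32  # assumed NVIDIA warp size

def launch_params(E, H, group_size, num_warps, num_parallel_tokens):
    # Mirror of populate_launch_params: grid = (E * G, num_parallel_tokens),
    # block = num_warps * WARP_SIZE, with G group-chunks per token row.
    assert 1 <= num_parallel_tokens <= 64
    assert num_parallel_tokens & (num_parallel_tokens - 1) == 0  # power of two, as TORCH_CHECKed
    G = H // (group_size * num_warps)
    grid = (E * G, num_parallel_tokens)
    block = (num_warps * WARP_SIZE,)
    return grid, block

# e.g. H = 7168 with group_size 128 and 4 warps gives G = 14 chunks per token row
print(launch_params(E=8, H=7168, group_size=128, num_warps=4, num_parallel_tokens=2))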

View File

@ -1,11 +1,15 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/all.h>
#ifndef USE_ROCM
#include "../per_token_group_quant_8bit.h"
#endif
#include <cmath>
#include "dispatch_utils.h"
#include "quantization/vectorization_utils.cuh"
#include "cub_helpers.h"
#include "../../cub_helpers.h"
#include "../../dispatch_utils.h"
#include "../vectorization_utils.cuh"
static inline __device__ int8_t float_to_int8_rn(float x) {
#ifdef USE_ROCM
@ -21,6 +25,7 @@ static inline __device__ int8_t float_to_int8_rn(float x) {
float dst = std::nearbyint(x);
// saturate
// See https://github.com/pytorch/pytorch/issues/127666
// See https://github.com/llvm/llvm-project/issues/95183
// hip-clang std::clamp __glibcxx_assert_fail host function when building on
@ -79,6 +84,7 @@ static inline __device__ int8_t int32_to_int8(int32_t x) {
static_cast<int32_t>(std::numeric_limits<int8_t>::max());
// saturate
// See https://github.com/pytorch/pytorch/issues/127666
// See https://github.com/llvm/llvm-project/issues/95183
// hip-clang std::clamp __glibcxx_assert_fail host function when building on
@ -170,6 +176,7 @@ __global__ void dynamic_scaled_int8_quant_kernel(
float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax;
// 2. quantize
vectorize_with_alignment<16>(
row_in, row_out, hidden_size, tid, stride,
[=] __device__(int8_t& dst, const scalar_t& src) {
@ -187,6 +194,7 @@ struct MinMax {
__host__ __device__ explicit MinMax(float v) : min(v), max(v) {}
// add a value to the MinMax
__host__ __device__ MinMax& operator+=(float v) {
min = fminf(min, v);
max = fmaxf(max, v);
@ -220,6 +228,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
const scalar_t* row_in = input + token_idx * hidden_size;
int8_t* row_out = output + token_idx * hidden_size;
// 1. calculate min & max
MinMax thread_mm;
vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride,
[&] __device__(const scalar_t& src) {
@ -252,6 +261,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
const float inv_s = 1.f / scale_sh;
const azp_t azp = azp_sh;
// 2. quantize
vectorize_with_alignment<16>(
row_in, row_out, hidden_size, tid, stride,
[=] __device__(int8_t& dst, const scalar_t& src) {
@ -322,4 +332,14 @@ void dynamic_scaled_int8_quant(
hidden_size);
}
});
}
}
#ifndef USE_ROCM
void per_token_group_quant_int8(const torch::Tensor& input,
torch::Tensor& output_q,
torch::Tensor& output_s, int64_t group_size,
double eps, double int8_min, double int8_max) {
per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
int8_min, int8_max);
}
#endif
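The dynamic int8 path in this file derives a per-token scale from the row's absolute maximum and then quantizes with round-to-nearest-even and saturation. A hedged NumPy equivalent of the symmetric (no zero point) case:

import numpy as np

def dynamic_scaled_int8_quant(row: np.ndarray):
    # Per-token symmetric int8 quantization: inv_s = 127 / absmax, q = round(x * inv_s)
    absmax = float(np.abs(row).max())
    inv_s = 0.0 if absmax == 0.0 else 127.0 / absmax   # matches the kernel's inv_s
    q = np.clip(np.rint(row * inv_s), -128, 127).astype(np.int8)
    scale = 0.0 if absmax == 0.0 else absmax / 127.0   # per-token scale written back
    return q, scale

q, s = dynamic_scaled_int8_quant(np.array([0.5, -1.25, 3.0], dtype=np.float32))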

View File

@ -5,7 +5,7 @@
#include <hip/hip_bf16.h>
#include <hip/hip_bfloat16.h>
#include "../../../../attention/attention_dtypes.h"
#include "../../../attention/attention_dtypes.h"
namespace vllm {
#ifdef USE_ROCM

View File

@ -1,7 +1,7 @@
#include "common.cuh"
#include "dispatch_utils.h"
#include "cub_helpers.h"
#include "quantization/vectorization_utils.cuh"
#include "../../cub_helpers.h"
#include "../vectorization_utils.cuh"
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/Exceptions.h>

View File

@ -1,6 +1,6 @@
#pragma once
#include "../../../../attention/attention_dtypes.h"
#include "../../../attention/attention_dtypes.h"
#include <assert.h>
#include <float.h>
#include <stdint.h>

View File

@ -1,6 +1,6 @@
#include <ATen/cuda/CUDAContext.h>
#include "quantization/w8a8/per_token_group_quant_8bit.h"
#include "../per_token_group_quant_8bit.h"
#include <cmath>
@ -8,9 +8,9 @@
#include <torch/all.h>
#include "quantization/vectorization.cuh"
#include "quantization/vectorization_utils.cuh"
#include "dispatch_utils.h"
#include "../vectorization.cuh"
#include "../vectorization_utils.cuh"
#include "../../dispatch_utils.h"
__device__ __forceinline__ float GroupReduceMax(float val) {
unsigned mask = threadIdx.x % 32 >= 16 ? 0xffff0000 : 0x0000ffff;
@ -212,4 +212,4 @@ void per_token_group_quant_fp8(const torch::Tensor& input,
double fp8_max, bool scale_ue8m0) {
per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
fp8_min, fp8_max, scale_ue8m0);
}
}

View File

@ -6,7 +6,7 @@
#include "quantization/vectorization.cuh"
// TODO(luka/varun):refactor common.cuh to use this file instead
#include "quantization/w8a8/fp8/common.cuh"
#include "quantization/fp8/common.cuh"
namespace vllm {

View File

@ -17,32 +17,28 @@ FILE_HEAD = """
namespace MARLIN_NAMESPACE_NAME {
""".strip()
TEMPLATE = (
"template __global__ void Marlin<"
"{{scalar_t}}, "
"{{w_type_id}}, "
"{{s_type_id}}, "
"{{threads}}, "
"{{thread_m_blocks}}, "
"{{thread_n_blocks}}, "
"{{thread_k_blocks}}, "
"{{'true' if m_block_size_8 else 'false'}}, "
"{{stages}}, "
"{{group_blocks}}, "
"{{'true' if is_zp_float else 'false'}}>"
"( MARLIN_KERNEL_PARAMS );"
)
TEMPLATE = ("template __global__ void Marlin<"
"{{scalar_t}}, "
"{{w_type_id}}, "
"{{s_type_id}}, "
"{{threads}}, "
"{{thread_m_blocks}}, "
"{{thread_n_blocks}}, "
"{{thread_k_blocks}}, "
"{{'true' if m_block_size_8 else 'false'}}, "
"{{stages}}, "
"{{group_blocks}}, "
"{{'true' if is_zp_float else 'false'}}>"
"( MARLIN_KERNEL_PARAMS );")
# int8 with zero point case (vllm::kU8) is also supported,
# we don't add it to reduce wheel size.
SCALAR_TYPES = [
"vllm::kU4",
"vllm::kU4B8",
"vllm::kU8B128",
"vllm::kFE4M3fn",
"vllm::kFE2M1f",
"vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
"vllm::kFE2M1f"
]
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128), (128, 64, 128)]
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128),
(128, 64, 128)]
THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
# group_blocks:
@ -63,12 +59,11 @@ def generate_new_kernels():
all_template_str_list = []
for group_blocks, m_blocks, thread_configs in itertools.product(
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
):
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
# act order case only support gptq-int4 and gptq-int8
if group_blocks == 0 and scalar_type not in [
"vllm::kU4B8",
"vllm::kU8B128",
"vllm::kU4B8", "vllm::kU8B128"
]:
continue
if thread_configs[2] == 256:
@ -98,7 +93,8 @@ def generate_new_kernels():
c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
is_zp_float_list = [False]
if dtype == "fp16" and scalar_type == "vllm::kU4" and group_blocks == 4:
if dtype == "fp16" and scalar_type == "vllm::kU4" and \
group_blocks == 4:
# HQQ (is_zp_float = true) only supports
# 4bit quantization and fp16
is_zp_float_list.append(True)
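The TEMPLATE string in this generator is a Jinja2 snippet rendered once per (scalar type, thread config, m-blocks, group-blocks) combination to emit an explicit Marlin instantiation. A minimal sketch of one such render; the parameter values below are made up purely for illustration:

import jinja2

TEMPLATE = ("template __global__ void Marlin<"
            "{{scalar_t}}, {{w_type_id}}, {{s_type_id}}, {{threads}}, "
            "{{thread_m_blocks}}, {{thread_n_blocks}}, {{thread_k_blocks}}, "
            "{{'true' if m_block_size_8 else 'false'}}, {{stages}}, "
            "{{group_blocks}}, {{'true' if is_zp_float else 'false'}}>"
            "( MARLIN_KERNEL_PARAMS );")

print(jinja2.Template(TEMPLATE).render(
    scalar_t="half", w_type_id="vllm::kU4B8.id()", s_type_id="vllm::kFloat16.id()",
    threads=256, thread_m_blocks=1, thread_n_blocks=8, thread_k_blocks=4,
    m_block_size_8=False, stages=4, group_blocks=-1, is_zp_float=False))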

View File

@ -12,21 +12,20 @@ from functools import reduce
from typing import Optional, Union
import jinja2
from vllm_cutlass_library_extension import (
DataType,
EpilogueScheduleTag,
EpilogueScheduleType,
MixedInputKernelScheduleType,
TileSchedulerTag,
TileSchedulerType,
VLLMDataType,
VLLMDataTypeNames,
VLLMDataTypeSize,
VLLMDataTypeTag,
VLLMDataTypeTorchDataTypeTag,
VLLMDataTypeVLLMScalarTypeTag,
VLLMKernelScheduleTag,
)
# yapf conflicts with isort for this block
# yapf: disable
from vllm_cutlass_library_extension import (DataType, EpilogueScheduleTag,
EpilogueScheduleType,
MixedInputKernelScheduleType,
TileSchedulerTag,
TileSchedulerType, VLLMDataType,
VLLMDataTypeNames,
VLLMDataTypeSize, VLLMDataTypeTag,
VLLMDataTypeTorchDataTypeTag,
VLLMDataTypeVLLMScalarTypeTag,
VLLMKernelScheduleTag)
# yapf: enable
#
# Generator templating
@ -287,23 +286,18 @@ def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
tile_shape = (
f"{schedule_config.tile_shape_mn[0]}x{schedule_config.tile_shape_mn[1]}"
)
cluster_shape = (
f"{schedule_config.cluster_shape_mnk[0]}"
+ f"x{schedule_config.cluster_shape_mnk[1]}"
+ f"x{schedule_config.cluster_shape_mnk[2]}"
)
kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule].split(
"::"
)[-1]
epilogue_schedule = EpilogueScheduleTag[schedule_config.epilogue_schedule].split(
"::"
)[-1]
tile_scheduler = TileSchedulerTag[schedule_config.tile_scheduler].split("::")[-1]
cluster_shape = (f"{schedule_config.cluster_shape_mnk[0]}" +
f"x{schedule_config.cluster_shape_mnk[1]}" +
f"x{schedule_config.cluster_shape_mnk[2]}")
kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule]\
.split("::")[-1]
epilogue_schedule = EpilogueScheduleTag[
schedule_config.epilogue_schedule].split("::")[-1]
tile_scheduler = TileSchedulerTag[schedule_config.tile_scheduler]\
.split("::")[-1]
return (
f"{tile_shape}_{cluster_shape}_{kernel_schedule}"
+ f"_{epilogue_schedule}_{tile_scheduler}"
)
return (f"{tile_shape}_{cluster_shape}_{kernel_schedule}" +
f"_{epilogue_schedule}_{tile_scheduler}")
# mostly unique shorter sch_sig
@ -322,24 +316,18 @@ def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str:
# unique type_name
def generate_type_signature(kernel_types: TypeConfig):
return str(
"".join(
[
VLLMDataTypeNames[getattr(kernel_types, field.name)]
for field in fields(TypeConfig)
]
)
)
return str("".join([
VLLMDataTypeNames[getattr(kernel_types, field.name)]
for field in fields(TypeConfig)
]))
def generate_type_option_name(kernel_types: TypeConfig):
return ", ".join(
[
f"{field.name.replace('b_', 'with_') + '_type'}="
+ VLLMDataTypeNames[getattr(kernel_types, field.name)]
for field in fields(TypeConfig)
]
)
return ", ".join([
f"{field.name.replace('b_', 'with_')+'_type'}=" +
VLLMDataTypeNames[getattr(kernel_types, field.name)]
for field in fields(TypeConfig)
])
def is_power_of_two(n):
@ -347,6 +335,7 @@ def is_power_of_two(n):
def to_cute_constant(value: list[int]):
def _to_cute_constant(value: int):
if is_power_of_two(value):
return f"_{value}"
@ -361,11 +350,11 @@ def to_cute_constant(value: list[int]):
def unique_schedules(impl_configs: list[ImplConfig]):
# Use dict over set for deterministic ordering
return list(
{
sch: None for impl_config in impl_configs for sch in impl_config.schedules
}.keys()
)
return list({
sch: None
for impl_config in impl_configs
for sch in impl_config.schedules
}.keys())
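unique_schedules leans on the fact that dict keys deduplicate while preserving first-seen insertion order (guaranteed since Python 3.7), which a set would not do. A tiny illustration with made-up schedule names:

schedules = ["128x16_1x1x1", "256x16_1x1x1", "128x16_1x1x1", "128x32_1x1x1"]

# dict keys drop duplicates but keep first-seen order, so the result is deterministic
unique = list({sch: None for sch in schedules}.keys())
assert unique == ["128x16_1x1x1", "256x16_1x1x1", "128x32_1x1x1"]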
def unsigned_type_with_bitwidth(num_bits):
@ -391,7 +380,7 @@ template_globals = {
"gen_type_sig": generate_type_signature,
"unique_schedules": unique_schedules,
"unsigned_type_with_bitwidth": unsigned_type_with_bitwidth,
"gen_type_option_name": generate_type_option_name,
"gen_type_option_name": generate_type_option_name
}
@ -409,28 +398,23 @@ prepack_dispatch_template = create_template(PREPACK_TEMPLATE)
def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
sources = []
sources.append(
(
"machete_mm_dispatch",
mm_dispatch_template.render(impl_configs=impl_configs),
)
)
sources.append((
"machete_mm_dispatch",
mm_dispatch_template.render(impl_configs=impl_configs),
))
prepack_types = []
for impl_config in impl_configs:
convert_type = (
impl_config.types.a
if impl_config.types.b_group_scale == DataType.void
else impl_config.types.b_group_scale
)
convert_type = impl_config.types.a \
if impl_config.types.b_group_scale == DataType.void \
else impl_config.types.b_group_scale
prepack_types.append(
PrepackTypeConfig(
a=impl_config.types.a,
b_num_bits=VLLMDataTypeSize[impl_config.types.b],
convert=convert_type,
accumulator=impl_config.types.accumulator,
)
)
))
def prepacked_type_key(prepack_type: PrepackTypeConfig):
# For now, we can just use the first accumulator type seen since
@ -446,14 +430,10 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
unique_prepack_types.append(prepack_type)
prepack_types_seen.add(key)
sources.append(
(
"machete_prepack",
prepack_dispatch_template.render(
types=unique_prepack_types,
),
)
)
sources.append((
"machete_prepack",
prepack_dispatch_template.render(types=unique_prepack_types, ),
))
# Split up impls across files
num_impls = reduce(lambda x, y: x + len(y.schedules), impl_configs, 0)
@ -486,12 +466,10 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
curr_impl_in_file += len(files_impls[-1][-1].schedules)
for part, file_impls in enumerate(files_impls):
sources.append(
(
f"machete_mm_impl_part{part + 1}",
mm_impl_template.render(impl_configs=file_impls),
)
)
sources.append((
f"machete_mm_impl_part{part+1}",
mm_impl_template.render(impl_configs=file_impls),
))
return sources
@ -536,7 +514,8 @@ def generate():
# For now we use the same heuristic for all types
# Heuristic is currently tuned for H100s
default_heuristic = [
(cond, ScheduleConfig(*tile_config, **sch_common_params)) # type: ignore
(cond, ScheduleConfig(*tile_config,
**sch_common_params)) # type: ignore
for cond, tile_config in default_tile_heuristic_config.items()
]
@ -562,18 +541,14 @@ def generate():
a_token_scale=DataType.void,
out=a,
accumulator=DataType.f32,
)
for b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
for a in (DataType.f16, DataType.bf16)
)
) for b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
for a in (DataType.f16, DataType.bf16))
impl_configs += [
ImplConfig(x[0], x[1], x[2])
for x in zip(
GPTQ_kernel_type_configs,
itertools.repeat(get_unique_schedules(default_heuristic)),
itertools.repeat(default_heuristic),
)
for x in zip(GPTQ_kernel_type_configs,
itertools.repeat(get_unique_schedules(default_heuristic)),
itertools.repeat(default_heuristic))
]
AWQ_kernel_type_configs = list(
@ -586,18 +561,14 @@ def generate():
a_token_scale=DataType.void,
out=a,
accumulator=DataType.f32,
)
for b in (DataType.u4, DataType.u8)
for a in (DataType.f16, DataType.bf16)
)
) for b in (DataType.u4, DataType.u8)
for a in (DataType.f16, DataType.bf16))
impl_configs += [
ImplConfig(x[0], x[1], x[2])
for x in zip(
AWQ_kernel_type_configs,
itertools.repeat(get_unique_schedules(default_heuristic)),
itertools.repeat(default_heuristic),
)
for x in zip(AWQ_kernel_type_configs,
itertools.repeat(get_unique_schedules(default_heuristic)),
itertools.repeat(default_heuristic))
]
# TODO: Support W4A8 when ready

View File

@ -1,6 +1,7 @@
#pragma once
#include <torch/all.h>
// TODO(wentao): refactor the folder to 8bit, then include fp8 and int8 folders
// 8-bit per-token-group quantization helper used by both FP8 and INT8
void per_token_group_quant_8bit(const torch::Tensor& input,
torch::Tensor& output_q,

View File

@ -1,12 +0,0 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/all.h>
#include "quantization/w8a8/per_token_group_quant_8bit.h"
void per_token_group_quant_int8(const torch::Tensor& input,
torch::Tensor& output_q,
torch::Tensor& output_s, int64_t group_size,
double eps, double int8_min, double int8_max) {
per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
int8_min, int8_max);
}

View File

@ -23,7 +23,7 @@
#include <algorithm>
#include "../attention/dtype_fp8.cuh"
#include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
#include "../quantization/fp8/amd/quant_utils.cuh"
// ROCm 6.2 compatibility: map OCP fp8 types to FNUZ variants if OCP is absent
#if !defined(HIP_FP8_TYPE_OCP)

View File

@ -11,7 +11,7 @@
#include "../cuda_compat.h"
#include "dispatch_utils.h"
#include "quantization/w8a8/fp8/common.cuh"
#include "quantization/fp8/common.cuh"
#if defined(__HIPCC__) && \
(defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))

View File

@ -44,245 +44,6 @@ __global__ void apply_repetition_penalties_kernel(
}
}
static inline __device__ uint16_t extractBinIdx(float x) {
union {
__half h;
uint16_t u16;
} tmp;
tmp.h = __float2half_rn(x);
tmp.u16 = (x < 0.f) ? (~tmp.u16 & 0xffff) : (tmp.u16 | 0x8000);
return 511 - (tmp.u16 >> 7);
}
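extractBinIdx maps a logit to one of 512 histogram bins so that larger values land in lower-numbered bins: the float is narrowed to fp16, the sign fixup makes the 16-bit pattern increase monotonically with the value, and the top 9 bits pick the bin. A NumPy sketch of the same mapping:

import numpy as np

def extract_bin_idx(x: float) -> int:
    # 512-bin index; bin 0 collects the largest logits
    u16 = int(np.array(x, dtype=np.float16).view(np.uint16))
    # make the 16-bit pattern monotonically increasing in x, then keep the top 9 bits
    u16 = (~u16 & 0xFFFF) if x < 0.0 else (u16 | 0x8000)
    return 511 - (u16 >> 7)

# larger logits map to smaller bin indices
assert extract_bin_idx(10.0) < extract_bin_idx(0.5) < extract_bin_idx(-3.0)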
template <int kNumThreadsPerBlock = 512>
static __global__ void topKPerRow(const float* logits, const int* rowStarts,
const int* rowEnds, int* outIndices,
float* outLogits, int stride0, int stride1) {
// The number of bins in the histogram.
static constexpr int kNumBins = 512;
// The top-k width.
static constexpr int kTopK = 2048;
// The number of elements per thread for the final top-k sort.
static constexpr int kNumTopKItemsPerThread = kTopK / kNumThreadsPerBlock;
// The class to sort the elements during the final top-k sort.
using TopKSort = cub::BlockRadixSort<float, kNumThreadsPerBlock,
kNumTopKItemsPerThread, int>;
// The number of slots for the final pass.
static constexpr int kNumFinalItems = 3072;
// The number of elements per thread for the final sort.
static constexpr int kNumFinalItemsPerThread =
kNumFinalItems / kNumThreadsPerBlock;
// The class to sort the elements during the final pass.
using FinalSort = cub::BlockRadixSort<float, kNumThreadsPerBlock,
kNumFinalItemsPerThread, int>;
// The class to compute the inclusive prefix-sum over the histogram.
using Scan = cub::BlockScan<int, kNumThreadsPerBlock>;
// Shared memory to compute the block scan.
__shared__ typename Scan::TempStorage smemScan;
// The structure to store the final items (for the final pass).
struct FinalItems {
// Shared memory to store the indices for the final pass.
int indices[kNumFinalItems];
// Shared memory to store the logits for the final pass.
float logits[kNumFinalItems];
};
// Shared memory to compute the block sort.
__shared__ union {
FinalItems items;
typename FinalSort::TempStorage finalSort;
typename TopKSort::TempStorage topKSort;
} smemFinal;
// Shared memory to store the histogram.
__shared__ int smemHistogram[kNumBins];
// Shared memory to store the selected indices.
__shared__ int smemIndices[kTopK];
// Shared memory to store the selected logits.
__shared__ float smemLogits[kTopK];
// Shared memory to store the threshold bin.
__shared__ int smemThresholdBinIdx[1];
// Shared memory counter to register the candidates for the final phase.
__shared__ int smemFinalDstIdx[1];
// The row computed by this block.
int rowIdx = blockIdx.x;
// The range of logits within the row.
int rowStart = rowStarts[rowIdx], rowEnd = rowEnds[rowIdx];
// The length of the row.
int rowLen = rowEnd - rowStart;
// Shortcut if the length of the row is smaller than Top-K. Indices are not
// sorted by their corresponding logit.
if (rowLen <= kTopK) {
for (int rowIt = threadIdx.x; rowIt < rowLen;
rowIt += kNumThreadsPerBlock) {
int idx = rowStart + rowIt;
outIndices[rowIdx * kTopK + rowIt] = idx - rowStart;
outLogits[rowIdx * kTopK + rowIt] =
logits[rowIdx * stride0 + idx * stride1];
}
for (int rowIt = rowLen + threadIdx.x; rowIt < kTopK;
rowIt += kNumThreadsPerBlock) {
outIndices[rowIdx * kTopK + rowIt] = -1;
outLogits[rowIdx * kTopK + rowIt] = -FLT_MAX;
}
return;
}
// Clear the histogram.
if (threadIdx.x < kNumBins) {
smemHistogram[threadIdx.x] = 0;
}
// Make sure the histogram is ready.
__syncthreads();
// Fetch elements one-by-one.
for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd;
rowIt += kNumThreadsPerBlock) {
uint16_t idx = extractBinIdx(logits[rowIdx * stride0 + rowIt * stride1]);
atomicAdd(&smemHistogram[idx], 1);
}
// Make sure the histogram is ready.
__syncthreads();
// Read the values from SMEM.
int binCount{0};
if (threadIdx.x < kNumBins) {
binCount = smemHistogram[threadIdx.x];
}
// Make sure each thread has read its value.
__syncthreads();
// Compute the prefix sum.
int prefixSum{0}, totalSum{0};
Scan(smemScan).ExclusiveSum(binCount, prefixSum, totalSum);
// Update the histogram with the prefix sums.
if (threadIdx.x < kNumBins) {
smemHistogram[threadIdx.x] = prefixSum;
}
// Make sure the data is in shared memory.
__syncthreads();
// Find the last valid bin.
if (threadIdx.x < kNumBins) {
int nextPrefixSum =
threadIdx.x == kNumBins - 1 ? totalSum : smemHistogram[threadIdx.x + 1];
if (prefixSum < kTopK && nextPrefixSum >= kTopK) {
smemThresholdBinIdx[0] = threadIdx.x;
}
}
// Clear the counter to store the items for the final phase.
if (threadIdx.x == 0) {
smemFinalDstIdx[0] = 0;
}
// Make sure the data is in shared memory.
__syncthreads();
// The threshold bin.
int thresholdBinIdx = smemThresholdBinIdx[0];
// Fetch elements one-by-one and populate the shared memory buffers.
for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd;
rowIt += kNumThreadsPerBlock) {
float logit = logits[rowIdx * stride0 + rowIt * stride1];
uint16_t idx = extractBinIdx(logit);
if (idx < thresholdBinIdx) {
int dstIdx = atomicAdd(&smemHistogram[idx], 1);
smemLogits[dstIdx] = logit;
smemIndices[dstIdx] = rowIt;
} else if (idx == thresholdBinIdx) {
int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
if (dstIdx < kNumFinalItems) {
smemFinal.items.logits[dstIdx] = logit;
smemFinal.items.indices[dstIdx] = rowIt;
}
}
}
// Make sure the elements are in shared memory.
__syncthreads();
// The logits of the elements to be sorted in the final pass.
float finalLogits[kNumFinalItemsPerThread];
// The indices of the elements to be sorted in the final pass.
int finalIndices[kNumFinalItemsPerThread];
// Init.
#pragma unroll
for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
finalLogits[ii] = -FLT_MAX;
}
// Read the elements from SMEM.
#pragma unroll
for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
if (srcIdx < smemFinalDstIdx[0]) {
finalLogits[ii] = smemFinal.items.logits[srcIdx];
finalIndices[ii] = smemFinal.items.indices[srcIdx];
}
}
// Make sure the shared memory has been read.
__syncthreads();
// Sort the elements.
FinalSort(smemFinal.finalSort)
.SortDescendingBlockedToStriped(finalLogits, finalIndices);
// Copy the data back to the shared memory storage.
int baseIdx = thresholdBinIdx > 0 ? smemHistogram[thresholdBinIdx - 1] : 0;
#pragma unroll
for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
int dstIdx = baseIdx + srcIdx;
if (dstIdx < kTopK) {
smemLogits[dstIdx] = finalLogits[ii];
smemIndices[dstIdx] = finalIndices[ii];
}
}
// Make sure the data is in shared memory.
__syncthreads();
// The topK logits.
float topKLogits[kNumTopKItemsPerThread];
// The topK indices.
int topKIndices[kNumTopKItemsPerThread];
// Load from shared memory.
#pragma unroll
for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
topKLogits[ii] = smemLogits[ii * kNumThreadsPerBlock + threadIdx.x];
topKIndices[ii] = smemIndices[ii * kNumThreadsPerBlock + threadIdx.x];
}
// Sort the elements.
TopKSort(smemFinal.topKSort)
.SortDescendingBlockedToStriped(topKLogits, topKIndices);
// Store to global memory.
#pragma unroll
for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
int offset = rowIdx * kTopK + ii * kNumThreadsPerBlock + threadIdx.x;
outIndices[offset] = topKIndices[ii] - rowStart;
outLogits[offset] = topKLogits[ii];
}
}
} // namespace vllm
void apply_repetition_penalties_(
@ -324,20 +85,4 @@ void apply_repetition_penalties_(
repetition_penalties.data_ptr<scalar_t>(), num_seqs, vocab_size,
tile_size);
});
}
void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
const torch::Tensor& rowEnds, torch::Tensor& indices,
torch::Tensor& values, int64_t numRows, int64_t stride0,
int64_t stride1) {
// Compute the results on the device.
constexpr int kNumThreadsPerBlock = 512;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
vllm::topKPerRow<kNumThreadsPerBlock>
<<<numRows, kNumThreadsPerBlock, 0, stream>>>(
logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
values.data_ptr<float>(), static_cast<int>(stride0),
static_cast<int>(stride1));
}
}
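Taken together, topKPerRow selects the top 2048 logits of a row without fully sorting it: histogram all logits into 512 coarse bins, prefix-sum the counts to locate the bin containing the k-th largest value, copy everything from strictly better bins verbatim, and only sort the candidates inside that threshold bin. A hedged NumPy sketch of the same selection strategy (single row, generic monotone binning instead of the fp16 bit trick, no overflow handling):

import numpy as np

def top_k_by_histogram(logits: np.ndarray, k: int, num_bins: int = 512) -> np.ndarray:
    # Coarse monotone binning: larger logits get smaller bin numbers.
    lo, hi = float(logits.min()), float(logits.max())
    bins = num_bins - 1 - ((logits - lo) / (hi - lo + 1e-12) * (num_bins - 1)).astype(np.int64)
    counts = np.bincount(bins, minlength=num_bins)
    inclusive = np.cumsum(counts)
    thr = int(np.searchsorted(inclusive, k))             # threshold bin holding the k-th value
    keep = np.flatnonzero(bins < thr)                    # strictly better bins: taken as-is
    border = np.flatnonzero(bins == thr)                 # only these need an actual sort
    border = border[np.argsort(-logits[border])][: k - keep.size]
    return np.concatenate([keep, border])

x = np.random.randn(8192).astype(np.float32)
idx = top_k_by_histogram(x, k=2048)
assert np.array_equal(np.sort(x[idx]), np.sort(x)[-2048:])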

View File

@ -33,11 +33,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
#endif
ops.def(
"persistent_masked_m_silu_mul_quant(Tensor input, Tensor counts, Tensor! "
"y_q, Tensor! y_s,"
"bool use_ue8m0) -> ()");
ops.impl("persistent_masked_m_silu_mul_quant", torch::kCUDA,
&persistent_masked_m_silu_mul_quant);
"silu_mul_fp8_quant_deep_gemm_cuda(Tensor input, Tensor counts, Tensor! "
"y_q, Tensor! y_s, int group_size, "
"bool use_ue8m0, int num_parallel_tokens) -> ()");
ops.impl("silu_mul_fp8_quant_deep_gemm_cuda", torch::kCUDA,
&silu_mul_fp8_quant_deep_gemm_cuda);
ops.def("weak_ref_tensor(Tensor input) -> Tensor");
ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
@ -188,13 +188,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("apply_repetition_penalties_", torch::kCUDA,
&apply_repetition_penalties_);
// Optimized top-k per row operation
ops.def(
"top_k_per_row(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
"Tensor! indices, Tensor! values, int numRows, int stride0, "
"int stride1) -> ()");
ops.impl("top_k_per_row", torch::kCUDA, &top_k_per_row);
// Layernorm-quant
// Apply Root Mean Square (RMS) Normalization to the input tensor.
ops.def(
@ -727,12 +720,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
"int quant_block_size, str kv_cache_dtype) -> ()");
cache_ops.impl("indexer_k_quant_and_cache", torch::kCUDA,
&indexer_k_quant_and_cache);
cache_ops.def(
"cp_gather_indexer_k_quant_cache(Tensor kv_cache, Tensor! dst_k, Tensor! "
"dst_scale, Tensor block_table, Tensor cu_seq_lens) -> ()");
cache_ops.impl("cp_gather_indexer_k_quant_cache", torch::kCUDA,
&cp_gather_indexer_k_quant_cache);
}
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {

View File

@ -15,7 +15,7 @@ ARG PYTHON_VERSION=3.12
# Example:
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# Important: We build with an old version of Ubuntu to maintain broad
# Important: We build with an old version of Ubuntu to maintain broad
# compatibility with other Linux OSes. The main reason for this is that the
# glibc version is baked into the distro, and binaries built with one glibc
# version are not backwards compatible with OSes that use an earlier version.
@ -356,14 +356,75 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
uv pip install --system dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# Install FlashInfer pre-compiled kernel cache and binaries
# https://docs.flashinfer.ai/installation.html
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system flashinfer-cubin==0.4.0 \
&& uv pip install --system flashinfer-jit-cache==0.4.0 \
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
&& flashinfer show-config
# If we need to build FlashInfer wheel before its release:
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0'
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
# $ cd flashinfer
# $ git checkout v0.2.6.post1
# $ python -m flashinfer.aot
# $ python -m build --no-isolation --wheel
# $ ls -la dist
# -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# Install FlashInfer from source
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with "flashinfer" extra in setup.py
ARG FLASHINFER_GIT_REF="v0.3.1"
# Flag to control whether to compile FlashInfer AOT kernels
# Set to "true" to enable AOT compilation:
# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
ARG FLASHINFER_AOT_COMPILE=false
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
. /etc/environment
git clone --depth 1 --recursive --shallow-submodules \
--branch ${FLASHINFER_GIT_REF} \
${FLASHINFER_GIT_REPO} flashinfer
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
if [[ "${CUDA_VERSION}" == 11.* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
else
# CUDA 12.8+ supports 10.0a and 12.0
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
fi
pushd flashinfer
if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ]; then
# NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
echo "🏗️ Installing FlashInfer from pre-compiled wheel"
uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
# Download pre-compiled cubins
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
fi
elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
# HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1)
# Build AOT kernels
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
python3 -m flashinfer.aot
# Install with no-build-isolation since we already built AOT kernels
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
uv pip install --system --no-build-isolation . \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# Download pre-compiled cubins
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
else
echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode"
uv pip install --system . \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
fi
popd
rm -rf flashinfer
BASH
COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
@ -400,7 +461,7 @@ RUN set -eux; \
# Install EP kernels(pplx-kernels and DeepEP)
COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh
ENV CUDA_HOME=/usr/local/cuda
RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a 10.0a+PTX}" \
RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a+PTX}" \
&& bash install_python_libraries.sh
# CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
@ -481,7 +542,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
else \
BITSANDBYTES_VERSION="0.46.1"; \
fi; \
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.14.0'
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3]>=0.14.0'
ENV VLLM_USAGE_SOURCE production-docker-image

Some files were not shown because too many files have changed in this diff