Compare commits

..

4 Commits

Author SHA1 Message Date
4c42267293 updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-03-28 02:26:20 +00:00
24f68342b4 updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-03-28 02:17:42 +00:00
c5d963835b updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-03-28 01:54:01 +00:00
b313220727 updates
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-27 23:51:36 +00:00
717 changed files with 12609 additions and 45991 deletions

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.31
- name: "exact_match,flexible-extract"
value: 0.47
limit: 1319
num_fewshot: 5

View File

@ -4,7 +4,7 @@ Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml Minitron-4B-Base-FP8.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml Meta-Llama-3-8B-QQQ.yaml

View File

@ -10,24 +10,15 @@ set -x
set -o pipefail set -o pipefail
check_gpus() { check_gpus() {
if command -v nvidia-smi; then # check the number of GPUs and GPU type.
# check the number of GPUs and GPU type. declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
elif command -v amd-smi; then
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
fi
if [[ $gpu_count -gt 0 ]]; then if [[ $gpu_count -gt 0 ]]; then
echo "GPU found." echo "GPU found."
else else
echo "Need at least 1 GPU to run benchmarking." echo "Need at least 1 GPU to run benchmarking."
exit 1 exit 1
fi fi
if command -v nvidia-smi; then declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
elif command -v amd-smi; then
declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
fi
echo "GPU type is $gpu_type" echo "GPU type is $gpu_type"
} }
@ -99,15 +90,9 @@ kill_gpu_processes() {
# wait until GPU memory usage smaller than 1GB # wait until GPU memory usage smaller than 1GB
if command -v nvidia-smi; then while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do sleep 1
sleep 1 done
done
elif command -v amd-smi; then
while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
sleep 1
done
fi
# remove vllm config file # remove vllm config file
rm -rf ~/.config/vllm rm -rf ~/.config/vllm

View File

@ -64,11 +64,9 @@
"disable_log_requests": "", "disable_log_requests": "",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"swap_space": 16, "swap_space": 16,
"speculative_config": { "speculative_model": "turboderp/Qwama-0.5B-Instruct",
"model": "turboderp/Qwama-0.5B-Instruct", "num_speculative_tokens": 4,
"num_speculative_tokens": 4, "speculative_draft_tensor_parallel_size": 1
"draft_tensor_parallel_size": 1
}
}, },
"client_parameters": { "client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",

View File

@ -3,10 +3,10 @@ steps:
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/upload-wheels.sh"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
@ -14,10 +14,10 @@ steps:
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/upload-wheels.sh"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
@ -31,10 +31,10 @@ steps:
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/upload-wheels.sh"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
@ -48,7 +48,7 @@ steps:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build and publish TPU release image" - label: "Build and publish TPU release image"
@ -57,7 +57,7 @@ steps:
agents: agents:
queue: tpu_queue_postmerge queue: tpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly" - "docker push vllm/vllm-tpu:nightly"
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
plugins: plugins:
@ -82,7 +82,7 @@ steps:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"

View File

@ -105,33 +105,19 @@ fi
if [[ $commands == *" entrypoints/openai "* ]]; then if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \ commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \ --ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_chat.py \
--ignore=entrypoints/openai/test_shutdown.py \ --ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \ --ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_sleep.py \ --ignore=entrypoints/openai/test_sleep.py \
--ignore=entrypoints/openai/test_models.py \ --ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "} --ignore=entrypoints/openai/test_prompt_validation.py "}
fi fi
#ignore certain Entrypoints/llm tests #ignore certain Entrypoints/llm tests
if [[ $commands == *" entrypoints/llm "* ]]; then if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
commands=${commands//" entrypoints/llm "/" entrypoints/llm \ commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_generate_multiple_loras.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi fi
#Obsolete currently
##ignore certain Entrypoints/llm tests
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
#fi
# --ignore=entrypoints/openai/test_encoder_decoder.py \ # --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \ # --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py # --ignore=entrypoints/openai/test_oot_registration.py

View File

@ -5,8 +5,8 @@
set -ex set -ex
set -o pipefail set -o pipefail
# cd 2 levels into the working directory # cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/../.." cd "$(dirname "${BASH_SOURCE[0]}")/.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)

View File

@ -10,4 +10,5 @@ trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Try building the docker image # Try building the docker image
docker build -t cpu-test -f docker/Dockerfile.s390x . docker build -t cpu-test -f Dockerfile.ppc64le .

View File

@ -8,19 +8,15 @@ set -ex
CORE_RANGE=${CORE_RANGE:-48-95} CORE_RANGE=${CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1} NUMA_NODE=${NUMA_NODE:-1}
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
set -e;
docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
}
trap remove_docker_container EXIT trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
@ -40,6 +36,8 @@ function cpu_tests() {
# Run basic model test # Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e set -e
pip install -r vllm/requirements/test.txt
pip install -r vllm/requirements/cpu.txt
pytest -v -s tests/kernels/test_cache.py -m cpu_model pytest -v -s tests/kernels/test_cache.py -m cpu_model
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
pytest -v -s tests/models/decoder_only/language -m cpu_model pytest -v -s tests/models/decoder_only/language -m cpu_model

View File

@ -9,7 +9,6 @@ python3 use_existing_torch.py
# Try building the docker image # Try building the docker image
DOCKER_BUILDKIT=1 docker build . \ DOCKER_BUILDKIT=1 docker build . \
--file docker/Dockerfile \
--target vllm-openai \ --target vllm-openai \
--platform "linux/arm64" \ --platform "linux/arm64" \
-t gh200-test \ -t gh200-test \

View File

@ -5,7 +5,7 @@
set -ex set -ex
# Try building the docker image # Try building the docker image
docker build -t hpu-test-env -f docker/Dockerfile.hpu . docker build -t hpu-test-env -f Dockerfile.hpu .
# Setup cleanup # Setup cleanup
# certain versions of HPU software stack have a bug that can # certain versions of HPU software stack have a bug that can

View File

@ -3,7 +3,7 @@
set -euox pipefail set -euox pipefail
if [[ $# -lt 4 ]]; then if [[ $# -lt 4 ]]; then
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1 exit 1
fi fi

View File

@ -35,7 +35,7 @@ else
date "+%s" > /tmp/neuron-docker-build-timestamp date "+%s" > /tmp/neuron-docker-build-timestamp
fi fi
docker build -t "${image_name}" -f docker/Dockerfile.neuron . docker build -t "${image_name}" -f Dockerfile.neuron .
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {

View File

@ -1,9 +1,9 @@
#!/bin/bash #!/bin/bash
set -xue set -e
# Build the docker image. # Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu . docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup. # Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; } remove_docker_container() { docker rm -f tpu-test || true; }
@ -17,14 +17,10 @@ source /etc/environment
docker run --privileged --net host --shm-size=16G -it \ docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest tpu-info \ && python3 -m pip install pytest \
&& python3 -m pip install lm_eval[api]==0.4.4 \ && python3 -m pip install lm_eval[api]==0.4.4 \
&& export VLLM_USE_V1=1 \ && export VLLM_USE_V1=1 \
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \ && export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo HARDWARE \
&& tpu-info \
&& echo TEST_0 \
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
&& echo TEST_1 \ && echo TEST_1 \
&& pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \ && pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
&& echo TEST_2 \ && echo TEST_2 \
@ -36,14 +32,11 @@ docker run --privileged --net host --shm-size=16G -it \
&& echo TEST_5 \ && echo TEST_5 \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py \ && python3 /workspace/vllm/examples/offline_inference/tpu.py \
&& echo TEST_6 \ && echo TEST_6 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \ && pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py \
&& echo TEST_7 \ && echo TEST_7 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py \ && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \
&& echo TEST_8 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
&& echo TEST_9 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" \
# TODO: This test fails because it uses RANDOM_SEED sampling # TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ # && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Try building the docker image # Try building the docker image
docker build -t ${image_name} -f docker/Dockerfile.xpu . docker build -t ${image_name} -f Dockerfile.xpu .
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {

View File

@ -1,38 +0,0 @@
#!/bin/bash
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Setup cleanup
remove_docker_container() { podman rm -f cpu-test-ubi9-ppc || true; podman system prune -f; }
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
# Run the image
podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test-ubi9-ppc cpu-test-ubi9-ppc
function cpu_tests() {
# offline inference
podman exec cpu-test-ubi9-ppc bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test
podman exec cpu-test-ubi9-ppc bash -c "
set -e
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator
pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
}
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 40m bash -c cpu_tests

View File

@ -104,7 +104,7 @@ steps:
- label: Entrypoints Test # 40min - label: Entrypoints Test # 40min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
fast_check: true fast_check: true
#mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/entrypoints/llm - tests/entrypoints/llm
@ -118,7 +118,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
@ -150,12 +150,11 @@ steps:
# TODO: create a dedicated test section for multi-GPU example tests # TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests # when we have multiple distributed example tests
- pushd ../examples/offline_inference - pushd ../examples/offline_inference
- python3 rlhf.py - VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py
- RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd - popd
- label: Metrics, Tracing Test # 10min - label: Metrics, Tracing Test # 10min
mirror_hardwares: [amd]
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -163,13 +162,18 @@ steps:
- tests/tracing - tests/tracing
commands: commands:
- pytest -v -s metrics - pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0,<1.27.0' \
'opentelemetry-api>=1.26.0,<1.27.0' \
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
- pytest -v -s tracing - pytest -v -s tracing
##### fast check tests ##### ##### fast check tests #####
##### 1 GPU test ##### ##### 1 GPU test #####
- label: Regression Test # 5min - label: Regression Test # 5min
#mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/test_regression - tests/test_regression
@ -200,6 +204,7 @@ steps:
commands: commands:
# split the test to avoid interference # split the test to avoid interference
- pytest -v -s v1/core - pytest -v -s v1/core
- pytest -v -s v1/entrypoints
- pytest -v -s v1/engine - pytest -v -s v1/engine
- pytest -v -s v1/entrypoints - pytest -v -s v1/entrypoints
- pytest -v -s v1/sample - pytest -v -s v1/sample
@ -280,21 +285,13 @@ steps:
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
- label: LoRA Test %N # 15min each - label: LoRA Test %N # 15min each
#mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
- tests/lora - tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
parallelism: 4 parallelism: 4
- label: PyTorch Compilation Unit Tests
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
- label: PyTorch Fullgraph Smoke Test # 9min - label: PyTorch Fullgraph Smoke Test # 9min
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -304,6 +301,7 @@ steps:
# these tests need to be separated, cannot combine # these tests need to be separated, cannot combine
- pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py - pytest -v -s compile/piecewise/test_toy_llama.py
- pytest -v -s compile/test_pass_manager.py
- label: PyTorch Fullgraph Test # 18min - label: PyTorch Fullgraph Test # 18min
source_file_dependencies: source_file_dependencies:
@ -313,7 +311,7 @@ steps:
- pytest -v -s compile/test_full_graph.py - pytest -v -s compile/test_full_graph.py
- label: Kernels Test %N # 1h each - label: Kernels Test %N # 1h each
# mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/attention - vllm/attention
@ -323,7 +321,7 @@ steps:
parallelism: 4 parallelism: 4
- label: Tensorizer Test # 11min - label: Tensorizer Test # 11min
# mirror_hardwares: [amd] mirror_hardwares: [amd]
soft_fail: true soft_fail: true
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/model_loader - vllm/model_executor/model_loader
@ -339,14 +337,7 @@ steps:
source_file_dependencies: source_file_dependencies:
- benchmarks/ - benchmarks/
commands: commands:
- bash scripts/run-benchmarks.sh - bash run-benchmarks.sh
- label: Benchmarks CLI Test # 10min
source_file_dependencies:
- vllm/
- tests/benchmarks/
commands:
- pytest -v -s benchmarks/
- label: Quantization Test # 33min - label: Quantization Test # 33min
source_file_dependencies: source_file_dependencies:
@ -381,14 +372,12 @@ steps:
- label: OpenAI-Compatible Tool Use # 20 min - label: OpenAI-Compatible Tool Use # 20 min
fast_check: false fast_check: false
#mirror_hardwares: [ amd ] mirror_hardwares: [ amd ]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/tool_use - tests/tool_use
- tests/mistral_tool_use
commands: commands:
- pytest -v -s tool_use - pytest -v -s tool_use
- pytest -v -s mistral_tool_use
##### models test ##### ##### models test #####
@ -400,9 +389,7 @@ steps:
- pytest -v -s models/test_transformers.py - pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py - pytest -v -s models/test_registry.py
# V1 Test: https://github.com/vllm-project/vllm/issues/14531 # V1 Test: https://github.com/vllm-project/vllm/issues/14531
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2' - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
- label: Language Models Test (Standard) # 32min - label: Language Models Test (Standard) # 32min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
@ -412,8 +399,6 @@ steps:
- tests/models/embedding/language - tests/models/embedding/language
- tests/models/encoder_decoder/language - tests/models/encoder_decoder/language
commands: commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install causal-conv1d
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
- pytest -v -s models/embedding/language -m core_model - pytest -v -s models/embedding/language -m core_model
@ -425,8 +410,6 @@ steps:
- tests/models/embedding/language - tests/models/embedding/language
- tests/models/encoder_decoder/language - tests/models/encoder_decoder/language
commands: commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install causal-conv1d
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/language -m 'not core_model' - pytest -v -s models/embedding/language -m 'not core_model'
@ -443,12 +426,11 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal - pytest -v -s models/multimodal
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model' - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
- pytest -v -s models/embedding/vision_language -m core_model - pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/audio_language -m core_model - pytest -v -s models/encoder_decoder/audio_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model
- pytest -v -s models/decoder_only/vision_language/test_interleaved.py
- label: Multi-Modal Models Test (Extended) 1 # 48m - label: Multi-Modal Models Test (Extended) 1 # 48m
optional: true optional: true
@ -462,7 +444,10 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model' - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model' # HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/vision_language -m 'not core_model' - pytest -v -s models/embedding/vision_language -m 'not core_model'
- pytest -v -s models/encoder_decoder/language -m 'not core_model' - pytest -v -s models/encoder_decoder/language -m 'not core_model'
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model' - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
@ -478,7 +463,6 @@ steps:
# This test is used only in PR development phase to test individual models and should never run on main # This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test - label: Custom Models Test
mirror_hardwares: [amd]
optional: true optional: true
commands: commands:
- echo 'Testing custom models...' - echo 'Testing custom models...'
@ -490,7 +474,6 @@ steps:
##### multi gpus test ##### ##### multi gpus test #####
- label: Distributed Comm Ops Test # 7min - label: Distributed Comm Ops Test # 7min
mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
@ -537,7 +520,7 @@ steps:
- vllm/v1/engine/ - vllm/v1/engine/
commands: commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py - VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py - pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@ -552,7 +535,6 @@ steps:
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- label: Plugin Tests (2 GPUs) # 40min - label: Plugin Tests (2 GPUs) # 40min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
@ -619,6 +601,8 @@ steps:
# requires multi-GPU testing for validation. # requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py - pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_minicpmv_tp.py
- pytest -v -s -x lora/test_transfomers_model.py
- label: Weight Loading Multiple GPU Test # 33min - label: Weight Loading Multiple GPU Test # 33min

View File

@ -14,7 +14,7 @@ body:
description: | description: |
Please run the following and paste the output below. Please run the following and paste the output below.
```sh ```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it. # For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py python collect_env.py
``` ```

View File

@ -14,7 +14,7 @@ body:
description: | description: |
Please run the following and paste the output below. Please run the following and paste the output below.
```sh ```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it. # For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py python collect_env.py
``` ```

View File

@ -14,7 +14,7 @@ body:
description: | description: |
Please run the following and paste the output below. Please run the following and paste the output below.
```sh ```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it. # For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py python collect_env.py
``` ```

View File

@ -9,7 +9,7 @@ body:
value: > value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model. #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
- type: textarea - type: textarea
attributes: attributes:
label: The model to consider. label: The model to consider.

View File

@ -35,7 +35,7 @@ body:
description: | description: |
Please run the following and paste the output below. Please run the following and paste the output below.
```sh ```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it. # For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py python collect_env.py
``` ```

View File

@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE
FIX #xxxx (*link existing issues this PR will resolve*) FIX #xxxx (*link existing issues this PR will resolve*)
<!--- pyml disable-next-line no-emphasis-as-heading --> <!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions) **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**

2
.github/mergify.yml vendored
View File

@ -19,7 +19,7 @@ pull_request_rules:
- files~=\.buildkite/ - files~=\.buildkite/
- files~=^cmake/ - files~=^cmake/
- files=CMakeLists.txt - files=CMakeLists.txt
- files~=^docker/Dockerfile - files~=^Dockerfile
- files~=^requirements.*\.txt - files~=^requirements.*\.txt
- files=setup.py - files=setup.py
actions: actions:

View File

@ -50,7 +50,7 @@ jobs:
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0 uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
- name: Build the Docker image vllm cpu - name: Build the Docker image vllm cpu
run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env . run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
- name: Configuration of docker images, network and namespace for the kind cluster - name: Configuration of docker images, network and namespace for the kind cluster
run: | run: |

3
.gitignore vendored
View File

@ -203,6 +203,3 @@ benchmarks/**/*.json
# Linting # Linting
actionlint actionlint
shellcheck*/ shellcheck*/
# Ingore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*

View File

@ -1,6 +1,3 @@
default_install_hook_types:
- pre-commit
- commit-msg
default_stages: default_stages:
- pre-commit # Run locally - pre-commit # Run locally
- manual # Run in CI - manual # Run in CI
@ -11,6 +8,7 @@ repos:
hooks: hooks:
- id: yapf - id: yapf
args: [--in-place, --verbose] args: [--in-place, --verbose]
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
- repo: https://github.com/astral-sh/ruff-pre-commit - repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.3 rev: v0.9.3
hooks: hooks:
@ -121,12 +119,6 @@ repos:
language: system language: system
always_run: true always_run: true
pass_filenames: false pass_filenames: false
- id: update-dockerfile-graph
name: Update Dockerfile dependency graph
entry: tools/update-dockerfile-graph.sh
language: script
files: ^docker/Dockerfile$
pass_filenames: false
# Keep `suggestion` last # Keep `suggestion` last
- id: suggestion - id: suggestion
name: Suggestion name: Suggestion

View File

@ -34,7 +34,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
# Supported AMD GPU architectures. # Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
# #
# Supported/expected torch versions for CUDA/ROCm. # Supported/expected torch versions for CUDA/ROCm.
@ -44,7 +44,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
# #
# Note: the CUDA torch version is derived from pyproject.toml and various # Note: the CUDA torch version is derived from pyproject.toml and various
# requirements.txt files and should be kept consistent. The ROCm torch # requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm # versions are derived from Dockerfile.rocm
# #
set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0") set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0") set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
@ -230,12 +230,10 @@ set(VLLM_EXT_SRC
"csrc/cache_kernels.cu" "csrc/cache_kernels.cu"
"csrc/attention/paged_attention_v1.cu" "csrc/attention/paged_attention_v1.cu"
"csrc/attention/paged_attention_v2.cu" "csrc/attention/paged_attention_v2.cu"
"csrc/attention/merge_attn_states.cu"
"csrc/pos_encoding_kernels.cu" "csrc/pos_encoding_kernels.cu"
"csrc/activation_kernels.cu" "csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu" "csrc/layernorm_kernels.cu"
"csrc/layernorm_quant_kernels.cu" "csrc/layernorm_quant_kernels.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu" "csrc/quantization/fp8/common.cu"
@ -243,7 +241,6 @@ set(VLLM_EXT_SRC
"csrc/quantization/gguf/gguf_kernel.cu" "csrc/quantization/gguf/gguf_kernel.cu"
"csrc/cuda_utils_kernels.cu" "csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu" "csrc/prepare_inputs/advance_step.cu"
"csrc/custom_all_reduce.cu"
"csrc/torch_bindings.cpp") "csrc/torch_bindings.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
@ -285,6 +282,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/mamba/causal_conv1d/causal_conv1d.cu" "csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu" "csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu" "csrc/quantization/fp4/nvfp4_quant_entry.cu"
@ -609,51 +607,21 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS) if (MARLIN_MOE_ARCHS)
set(MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
"csrc/moe/marlin_moe_ops.cu")
#
# For the Marlin MOE kernels we automatically generate sources for various
# preselected input type pairs and schedules.
# Generate sources:
set(MOE_MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
RESULT_VARIABLE moe_marlin_generation_result
OUTPUT_VARIABLE moe_marlin_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
)
if (NOT moe_marlin_generation_result EQUAL 0)
message(FATAL_ERROR "Marlin MOE generation failed."
" Result: \"${moe_marlin_generation_result}\""
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
else()
set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
message(STATUS "Marlin MOE generation completed successfully.")
endif()
else()
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
endif()
file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MOE_WNAA16_MARLIN_SRC}" SRCS "${MARLIN_MOE_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}") CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC}) list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else() else()
message(STATUS "Not building Marlin MOE kernels as no compatible archs found" message(STATUS "Not building Marlin MOE kernels as no compatible archs found"

View File

@ -240,8 +240,6 @@ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \ uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
fi fi
COPY examples examples COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
# Although we build Flashinfer with AOT mode, there's still # Although we build Flashinfer with AOT mode, there's still
# some issues w.r.t. JIT compilation. Therefore we need to # some issues w.r.t. JIT compilation. Therefore we need to

69
Dockerfile.cpu Normal file
View File

@ -0,0 +1,69 @@
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
FROM ubuntu:22.04 AS cpu-test-1
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-openmp==2025.0.1
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install intel_extension_for_pytorch==2.6.0
WORKDIR /workspace
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
pip install --upgrade pip && \
pip install -r requirements/build.txt
FROM cpu-test-1 AS build
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
pip install -v -r requirements/cpu.txt
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl && \
rm -rf dist
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -e tests/vllm_test_utils
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -1,4 +1,4 @@
FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
COPY ./ /workspace/vllm COPY ./ /workspace/vllm

View File

@ -1,6 +1,6 @@
# default base image # default base image
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04" ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"
FROM $BASE_IMAGE FROM $BASE_IMAGE
@ -21,9 +21,9 @@ VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}/vllm WORKDIR ${APP_MOUNT}/vllm
RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.48.0 -U RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
RUN python3 -m pip install neuronx-cc==2.17.194.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install pytest RUN python3 -m pip install pytest
# uninstall transformers-neuronx package explicitly to avoid version conflict # uninstall transformers-neuronx package explicitly to avoid version conflict

View File

@ -38,7 +38,7 @@ RUN microdnf install -y openssl-devel dnf \
&& ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \ && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
&& python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \ && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
&& python -m pip install -U pip uv \ && python -m pip install -U pip uv \
&& uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python 'cmake<4' ninja cython scikit_build_core scikit_build \ && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python cmake ninja cython scikit_build_core scikit_build \
&& curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \ && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
&& curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \ && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
&& cd /tmp && touch control && cd /tmp && touch control
@ -126,16 +126,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
FROM base-builder AS cv-builder FROM base-builder AS cv-builder
ARG MAX_JOBS ARG MAX_JOBS
ARG OPENCV_VERSION=86 ARG OPENCV_VERSION=84
# patch for version 4.11.0.86
ARG OPENCV_PATCH=97f3f39
ARG ENABLE_HEADLESS=1 ARG ENABLE_HEADLESS=1
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \ source /opt/rh/gcc-toolset-13/enable && \
git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \ git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \
cd opencv-python && \ cd opencv-python && \
sed -i -E -e 's/"setuptools.+",/"setuptools",/g' pyproject.toml && \ sed -i 's/"setuptools==59.2.0",/"setuptools<70.0",/g' pyproject.toml && \
cd opencv && git cherry-pick --no-commit $OPENCV_PATCH && cd .. && \
python -m build --wheel --installer=uv --outdir /opencvwheels/ python -m build --wheel --installer=uv --outdir /opencvwheels/
############################################################### ###############################################################
@ -151,15 +148,9 @@ COPY --from=arrow-builder /tmp/control /dev/null
COPY --from=cv-builder /tmp/control /dev/null COPY --from=cv-builder /tmp/control /dev/null
ARG VLLM_TARGET_DEVICE=cpu ARG VLLM_TARGET_DEVICE=cpu
ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
# this step installs vllm and populates uv cache # this step installs vllm and populates uv cache
# with all the transitive dependencies # with all the transitive dependencies
RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \
git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \
uv pip install maturin && \
uv build --wheel --out-dir /hf_wheels/
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \ --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
--mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \ --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
@ -168,7 +159,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \ source /opt/rh/gcc-toolset-13/enable && \
uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \ uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \ sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \ uv pip install pandas pythran pybind11 && \
# sentencepiece.pc is in some pkgconfig inside uv cache # sentencepiece.pc is in some pkgconfig inside uv cache
export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \ export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \ uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
@ -247,7 +238,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
&& python -m pip install -U pip uv --no-cache \ && python -m pip install -U pip uv --no-cache \
&& curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \ && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
&& make -C /numactl install \ && make -C /numactl install \
&& uv pip install 'cmake<4' \ && uv pip install cmake \
&& cmake --install /lapack/build \ && cmake --install /lapack/build \
&& uv pip uninstall cmake && uv pip uninstall cmake
@ -256,9 +247,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \ --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
--mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \ --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
--mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \ --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
--mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \
--mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \ --mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \
HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /vllmwheel/*.whl
COPY ./ /workspace/vllm COPY ./ /workspace/vllm
WORKDIR /workspace/vllm WORKDIR /workspace/vllm

View File

@ -1,18 +1,18 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
ARG HIPBLASLT_BRANCH="db8e93b4" ARG HIPBLASLT_BRANCH="4d40e36"
ARG HIPBLAS_COMMON_BRANCH="7c1566b" ARG HIPBLAS_COMMON_BRANCH="7c1566b"
ARG LEGACY_HIPBLASLT_OPTION= ARG LEGACY_HIPBLASLT_OPTION=
ARG RCCL_BRANCH="648a58d" ARG RCCL_BRANCH="648a58d"
ARG RCCL_REPO="https://github.com/ROCm/rccl" ARG RCCL_REPO="https://github.com/ROCm/rccl"
ARG TRITON_BRANCH="e5be006" ARG TRITON_BRANCH="e5be006"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git" ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG PYTORCH_BRANCH="295f2ed4" ARG PYTORCH_BRANCH="3a585126"
ARG PYTORCH_VISION_BRANCH="v0.21.0" ARG PYTORCH_VISION_BRANCH="v0.19.1"
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="1a7f4dfa" ARG FA_BRANCH="b7d29fb"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
ARG AITER_BRANCH="8970b25b" ARG AITER_BRANCH="21d47a9"
ARG AITER_REPO="https://github.com/ROCm/aiter.git" ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base FROM ${BASE_IMAGE} AS base
@ -20,7 +20,7 @@ FROM ${BASE_IMAGE} AS base
ENV PATH=/opt/rocm/llvm/bin:$PATH ENV PATH=/opt/rocm/llvm/bin:$PATH
ENV ROCM_PATH=/opt/rocm ENV ROCM_PATH=/opt/rocm
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201 ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
@ -31,7 +31,7 @@ ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies # Install Python and other dependencies
RUN apt-get update -y \ RUN apt-get update -y \
&& apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \ && apt-get install -y software-properties-common git curl sudo vim less \
&& add-apt-repository ppa:deadsnakes/ppa \ && add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
@ -42,7 +42,7 @@ RUN apt-get update -y \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version && python3 --version && python3 -m pip --version
RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
FROM base AS build_hipblaslt FROM base AS build_hipblaslt
ARG HIPBLASLT_BRANCH ARG HIPBLASLT_BRANCH
@ -60,8 +60,7 @@ RUN cd hipBLAS-common \
RUN git clone https://github.com/ROCm/hipBLASLt RUN git clone https://github.com/ROCm/hipBLASLt
RUN cd hipBLASLt \ RUN cd hipBLASLt \
&& git checkout ${HIPBLASLT_BRANCH} \ && git checkout ${HIPBLASLT_BRANCH} \
&& apt-get install -y llvm-dev \ && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
&& ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
&& cd build/release \ && cd build/release \
&& make package && make package
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
@ -111,24 +110,11 @@ RUN git clone ${FA_REPO}
RUN cd flash-attention \ RUN cd flash-attention \
&& git checkout ${FA_BRANCH} \ && git checkout ${FA_BRANCH} \
&& git submodule update --init \ && git submodule update --init \
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install \ && cp /app/vision/dist/*.whl /app/install \
&& cp /app/flash-attention/dist/*.whl /app/install && cp /app/flash-attention/dist/*.whl /app/install
FROM base AS build_aiter
ARG AITER_BRANCH
ARG AITER_REPO
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
RUN git clone --recursive ${AITER_REPO}
RUN cd aiter \
&& git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
FROM base AS final FROM base AS final
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
dpkg -i /install/*deb \ dpkg -i /install/*deb \
@ -144,12 +130,19 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
pip install /install/*.whl pip install /install/*.whl
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl pip install /install/*.whl
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
pip install /install/*.whl ARG AITER_REPO
ARG AITER_BRANCH
RUN git clone --recursive ${AITER_REPO}
RUN cd aiter \
&& git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt \
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter
ARG BASE_IMAGE ARG BASE_IMAGE
ARG HIPBLAS_COMMON_BRANCH
ARG HIPBLASLT_BRANCH ARG HIPBLASLT_BRANCH
ARG HIPBLAS_COMMON_BRANCH
ARG LEGACY_HIPBLASLT_OPTION ARG LEGACY_HIPBLASLT_OPTION
ARG RCCL_BRANCH ARG RCCL_BRANCH
ARG RCCL_REPO ARG RCCL_REPO
@ -161,8 +154,6 @@ ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO ARG PYTORCH_VISION_REPO
ARG FA_BRANCH ARG FA_BRANCH
ARG FA_REPO ARG FA_REPO
ARG AITER_BRANCH
ARG AITER_REPO
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \ && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
&& echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \ && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
@ -176,5 +167,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt

View File

@ -58,7 +58,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
cd ../../python && \ cd ../../python && \
export PYARROW_PARALLEL=4 && \ export PYARROW_PARALLEL=4 && \
export ARROW_BUILD_TYPE=release && \ export ARROW_BUILD_TYPE=release && \
uv pip install -r requirements-build.txt && \ uv pip install -r requirements/build.txt && \
python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel
FROM python-install AS numa-build FROM python-install AS numa-build
@ -96,22 +96,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \ uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \
python setup.py bdist_wheel python setup.py bdist_wheel
FROM python-install AS hf-xet-builder
# Install hf-xet
WORKDIR /tmp
ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
git clone https://github.com/huggingface/xet-core.git && \
cd xet-core/hf_xet/ && \
uv pip install maturin patchelf && \
python -m maturin build --release --out dist && \
mkdir -p /tmp/hf-xet/dist && \
cp dist/*.whl /tmp/hf-xet/dist/
# Final build stage # Final build stage
FROM python-install AS vllm-cpu FROM python-install AS vllm-cpu
ARG PYTHON_VERSION ARG PYTHON_VERSION
@ -136,15 +120,12 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
--mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \ --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
--mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \ --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
--mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
sed -i '/^torch/d' requirements/build.txt && \ sed -i '/^torch/d' requirements/build.txt && \
ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \ ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \ VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \
uv pip install -v \ uv pip install -v \
$ARROW_WHL_FILE \ $ARROW_WHL_FILE \
$VISION_WHL_FILE \ $VISION_WHL_FILE \
$HF_XET_WHL_FILE \
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \ --extra-index-url https://download.pytorch.org/whl/nightly/cpu \
--index-strategy unsafe-best-match \ --index-strategy unsafe-best-match \
-r requirements/build.txt \ -r requirements/build.txt \
@ -169,4 +150,3 @@ WORKDIR /home/vllm
# Set the default entrypoint # Set the default entrypoint
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -10,14 +10,19 @@ Easy, fast, and cheap LLM serving for everyone
</h3> </h3>
<p align="center"> <p align="center">
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> | | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
</p> </p>
--- ---
[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
---
*Latest News* 🔥 *Latest News* 🔥
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing). - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0). - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted. - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
@ -98,7 +103,7 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
## Contributing ## Contributing
We welcome and value any contributions and collaborations. We welcome and value any contributions and collaborations.
Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved. Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
## Sponsors ## Sponsors
@ -121,7 +126,6 @@ Compute Resources:
- Databricks - Databricks
- DeepInfra - DeepInfra
- Google Cloud - Google Cloud
- Intel
- Lambda Lab - Lambda Lab
- Nebius - Nebius
- Novita AI - Novita AI

View File

@ -41,39 +41,29 @@ become available.
<td><code>synthetic</code></td> <td><code>synthetic</code></td>
</tr> </tr>
<tr> <tr>
<td><strong>HuggingFace-VisionArena</strong></td> <td><strong>HuggingFace</strong></td>
<td style="text-align: center;"></td> <td style="text-align: center;">🟡</td>
<td style="text-align: center;"></td> <td style="text-align: center;">🟡</td>
<td><code>lmarena-ai/VisionArena-Chat</code></td> <td>Specify your dataset path on HuggingFace</td>
</tr> </tr>
<tr> <tr>
<td><strong>HuggingFace-InstructCoder</strong></td> <td><strong>VisionArena</strong></td>
<td style="text-align: center;"></td> <td style="text-align: center;"></td>
<td style="text-align: center;"></td> <td style="text-align: center;"></td>
<td><code>likaixin/InstructCoder</code></td> <td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
</tr>
<tr>
<td><strong>HuggingFace-AIMO</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
</tr>
<tr>
<td><strong>HuggingFace-Other</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
</tr> </tr>
</tbody> </tbody>
</table> </table>
✅: supported ✅: supported
🟡: Partial support
🚧: to be supported 🚧: to be supported
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf` 🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
If you need support for other dataset formats, please consider contributing.
**Note**: VisionArenas `dataset-name` should be set to `hf`
--- ---
## Example - Online Benchmark ## Example - Online Benchmark
@ -81,7 +71,8 @@ become available.
First start serving your model First start serving your model
```bash ```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
vllm serve ${MODEL_NAME} --disable-log-requests
``` ```
Then run the benchmarking script Then run the benchmarking script
@ -89,13 +80,12 @@ Then run the benchmarking script
```bash ```bash
# download dataset # download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_serving.py \ MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
--backend vllm \ NUM_PROMPTS=10
--model NousResearch/Hermes-3-Llama-3.1-8B \ BACKEND="vllm"
--endpoint /v1/completions \ DATASET_NAME="sharegpt"
--dataset-name sharegpt \ DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \ python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
--num-prompts 10
``` ```
If successful, you will see the following output If successful, you will see the following output
@ -132,105 +122,88 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
``` ```
```bash ```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \ python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \ --backend "${BACKEND}" \
--model Qwen/Qwen2-VL-7B-Instruct \ --model "${MODEL_NAME}" \
--endpoint /v1/chat/completions \ --endpoint "/v1/chat/completions" \
--dataset-name hf \ --dataset-name "${DATASET_NAME}" \
--dataset-path lmarena-ai/VisionArena-Chat \ --dataset-path "${DATASET_PATH}" \
--hf-split train \ --hf-split "${DATASET_SPLIT}" \
--num-prompts 1000 --num-prompts "${NUM_PROMPTS}"
``` ```
### InstructCoder Benchmark with Speculative Decoding ### HuggingFaceDataset Examples
``` bash Currently, HuggingFaceDataset only supports dataset formats
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
--speculative-model "[ngram]" \ formats, please consider contributing.
--ngram_prompt_lookup_min 2 \
--ngram-prompt-lookup-max 5 \
--num_speculative_tokens 5
```
``` bash
python3 benchmarks/benchmark_serving.py \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name hf \
--dataset-path likaixin/InstructCoder \
--num-prompts 2048
```
### Other HuggingFaceDataset Examples
```bash ```bash
# need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
``` ```
**`lmms-lab/LLaVA-OneVision-Data`** **`lmms-lab/LLaVA-OneVision-Data`**
```bash ```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
DATASET_SPLIT='train'
DATASET_SUBSET='chart2text(cauldron)'
python3 vllm/benchmarks/benchmark_serving.py \ python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \ --backend "${BACKEND}" \
--model Qwen/Qwen2-VL-7B-Instruct \ --model "${MODEL_NAME}" \
--endpoint /v1/chat/completions \ --endpoint "/v1/chat/completions" \
--dataset-name hf \ --dataset-name "${DATASET_NAME}" \
--dataset-path lmms-lab/LLaVA-OneVision-Data \ --dataset-path "${DATASET_PATH}" \
--hf-split train \ --hf-split "${DATASET_SPLIT}" \
--hf-subset "chart2text(cauldron)" \ --num-prompts "${NUM_PROMPTS}" \
--num-prompts 10 --hf-subset "${DATASET_SUBSET}"
``` ```
**`Aeala/ShareGPT_Vicuna_unfiltered`** **`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash ```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \ python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \ --backend "${BACKEND}" \
--model Qwen/Qwen2-VL-7B-Instruct \ --model "${MODEL_NAME}" \
--endpoint /v1/chat/completions \ --endpoint "/v1/chat/completions" \
--dataset-name hf \ --dataset-name "${DATASET_NAME}" \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ --dataset-path "${DATASET_PATH}" \
--hf-split train \ --hf-split "${DATASET_SPLIT}" \
--num-prompts 10 --num-prompts "${NUM_PROMPTS}" \
```
**`AI-MO/aimo-validation-aime`**
``` bash
python3 vllm/benchmarks/benchmark_serving.py \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--num-prompts 10 \
--seed 42
```
### Running With Sampling Parameters
When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command:
```bash
python3 vllm/benchmarks/benchmark_serving.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--top-k 10 \
--top-p 0.9 \
--temperature 0.5 \
--num-prompts 10
``` ```
--- ---
## Example - Offline Throughput Benchmark ## Example - Offline Throughput Benchmark
```bash ```bash
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10
DATASET_NAME="sonnet"
DATASET_PATH="vllm/benchmarks/sonnet.txt"
python3 vllm/benchmarks/benchmark_throughput.py \ python3 vllm/benchmarks/benchmark_throughput.py \
--model NousResearch/Hermes-3-Llama-3.1-8B \ --model "${MODEL_NAME}" \
--dataset-name sonnet \ --dataset-name "${DATASET_NAME}" \
--dataset-path vllm/benchmarks/sonnet.txt \ --dataset-path "${DATASET_PATH}" \
--num-prompts 10 --num-prompts "${NUM_PROMPTS}"
``` ```
If successful, you will see the following output If successful, you will see the following output
@ -244,13 +217,19 @@ Total num output tokens: 1500
### VisionArena Benchmark for Vision Language Models ### VisionArena Benchmark for Vision Language Models
``` bash ``` bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT="train"
python3 vllm/benchmarks/benchmark_throughput.py \ python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \ --model "${MODEL_NAME}" \
--backend vllm-chat \ --backend "vllm-chat" \
--dataset-name hf \ --dataset-name "${DATASET_NAME}" \
--dataset-path lmarena-ai/VisionArena-Chat \ --dataset-path "${DATASET_PATH}" \
--num-prompts 1000 \ --num-prompts "${NUM_PROMPTS}" \
--hf-split train --hf-split "${DATASET_SPLIT}"
``` ```
The `num prompt tokens` now includes image token counts The `num prompt tokens` now includes image token counts
@ -261,83 +240,29 @@ Total num prompt tokens: 14527
Total num output tokens: 1280 Total num output tokens: 1280
``` ```
### InstructCoder Benchmark with Speculative Decoding
``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \
python3 vllm/benchmarks/benchmark_throughput.py \
--dataset-name=hf \
--dataset-path=likaixin/InstructCoder \
--model=meta-llama/Meta-Llama-3-8B-Instruct \
--input-len=1000 \
--output-len=100 \
--num-prompts=2048 \
--async-engine \
--speculative-model="[ngram]" \
--ngram_prompt_lookup_min=2 \
--ngram-prompt-lookup-max=5 \
--num_speculative_tokens=5
```
```
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
Total num prompt tokens: 261136
Total num output tokens: 204800
```
### Other HuggingFaceDataset Examples
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
```
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
```
**`AI-MO/aimo-validation-aime`**
```bash
python3 benchmarks/benchmark_throughput.py \
--model Qwen/QwQ-32B \
--backend vllm \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--hf-split train \
--num-prompts 10
```
### Benchmark with LoRA Adapters ### Benchmark with LoRA Adapters
``` bash ``` bash
# download dataset # download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="meta-llama/Llama-2-7b-hf"
BACKEND="vllm"
DATASET_NAME="sharegpt"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
NUM_PROMPTS=10
MAX_LORAS=2
MAX_LORA_RANK=8
ENABLE_LORA="--enable-lora"
LORA_PATH="yard1/llama-2-7b-sql-lora-test"
python3 vllm/benchmarks/benchmark_throughput.py \ python3 vllm/benchmarks/benchmark_throughput.py \
--model meta-llama/Llama-2-7b-hf \ --model "${MODEL_NAME}" \
--backend vllm \ --backend "${BACKEND}" \
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \ --dataset_path "${DATASET_PATH}" \
--dataset_name sharegpt \ --dataset_name "${DATASET_NAME}" \
--num-prompts 10 \ --num-prompts "${NUM_PROMPTS}" \
--max-loras 2 \ --max-loras "${MAX_LORAS}" \
--max-lora-rank 8 \ --max-lora-rank "${MAX_LORA_RANK}" \
--enable-lora \ ${ENABLE_LORA} \
--lora-path yard1/llama-2-7b-sql-lora-test --lora-path "${LORA_PATH}"
``` ```

View File

@ -219,15 +219,7 @@ async def async_request_deepspeed_mii(
if response.status == 200: if response.status == 200:
parsed_resp = await response.json() parsed_resp = await response.json()
output.latency = time.perf_counter() - st output.latency = time.perf_counter() - st
if "choices" in parsed_resp: output.generated_text = parsed_resp["text"][0]
output.generated_text = parsed_resp["choices"][0][
"text"]
elif "text" in parsed_resp:
output.generated_text = parsed_resp["text"][0]
else:
output.error = ("Unexpected response format: "
"neither 'choices' nor 'text' found")
output.success = False
output.success = True output.success = True
else: else:
output.error = response.reason or "" output.error = response.reason or ""
@ -497,9 +489,3 @@ ASYNC_REQUEST_FUNCS = {
"scalellm": async_request_openai_completions, "scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions, "sglang": async_request_openai_completions,
} }
OPENAI_COMPATIBLE_BACKENDS = [
k for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions,
async_request_openai_chat_completions)
]

View File

@ -23,8 +23,7 @@ from abc import ABC, abstractmethod
from collections.abc import Mapping from collections.abc import Mapping
from dataclasses import dataclass from dataclasses import dataclass
from functools import cache from functools import cache
from io import BytesIO from typing import Any, Optional, Union
from typing import Any, Callable, Optional, Union
import numpy as np import numpy as np
import pandas as pd import pandas as pd
@ -240,24 +239,21 @@ def process_image(image: Any) -> Mapping[str, Any]:
""" """
Process a single image input and return a multimedia content dictionary. Process a single image input and return a multimedia content dictionary.
Supports three input types: For a PIL.Image.Image input:
- Converts the image to RGB.
- Saves the image as a JPEG in-memory.
- Encodes the JPEG data as a base64 string.
- Returns a dictionary with the image as a base64 data URL.
1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key For a string input:
containing raw image data. - Loads the bytes as a PIL.Image.Image. - Treats the string as a URL or file path.
- Prepends "file://" if the string doesn't start with "http://" or
2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as "file://".
a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns - Returns a dictionary with the image URL.
a dictionary with the image as a base64 data URL.
3. String input: - Treats the string as a URL or local file path. -
Prepends "file://" if the string doesn't start with "http://" or
"file://". - Returns a dictionary with the image URL.
Raises: Raises:
ValueError: If the input is not a supported type. ValueError: If the input is neither a PIL.Image.Image nor a string.
""" """
if isinstance(image, dict) and 'bytes' in image:
image = Image.open(BytesIO(image['bytes']))
if isinstance(image, Image.Image): if isinstance(image, Image.Image):
image = image.convert("RGB") image = image.convert("RGB")
with io.BytesIO() as image_data: with io.BytesIO() as image_data:
@ -276,8 +272,8 @@ def process_image(image: Any) -> Mapping[str, Any]:
("http://", "file://")) else f"file://{image}") ("http://", "file://")) else f"file://{image}")
return {"type": "image_url", "image_url": {"url": image_url}} return {"type": "image_url", "image_url": {"url": image_url}}
raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image" raise ValueError(
" or str or dictionary with raw image bytes.") f"Invalid image input {image}. Must be a PIL.Image.Image or str.")
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
@ -288,7 +284,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
class RandomDataset(BenchmarkDataset): class RandomDataset(BenchmarkDataset):
# Default values copied from benchmark_serving.py for the random dataset. # Default values copied from benchmark_serving.py for the random dataset.
DEFAULT_PREFIX_LEN = 0 DEFAULT_PREFIX_LEN = 0
DEFAULT_RANGE_RATIO = 0.0 DEFAULT_RANGE_RATIO = 1.0
DEFAULT_INPUT_LEN = 1024 DEFAULT_INPUT_LEN = 1024
DEFAULT_OUTPUT_LEN = 128 DEFAULT_OUTPUT_LEN = 128
@ -308,32 +304,19 @@ class RandomDataset(BenchmarkDataset):
output_len: int = DEFAULT_OUTPUT_LEN, output_len: int = DEFAULT_OUTPUT_LEN,
**kwargs, **kwargs,
) -> list[SampleRequest]: ) -> list[SampleRequest]:
# Enforce range_ratio < 1
assert range_ratio < 1.0, (
"random_range_ratio must be < 1.0 to ensure a valid sampling range"
)
vocab_size = tokenizer.vocab_size vocab_size = tokenizer.vocab_size
prefix_token_ids = (np.random.randint( prefix_token_ids = (np.random.randint(
0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else []) 0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
# New sampling logic: [X * (1 - b), X * (1 + b)] input_low = int(input_len * range_ratio)
input_low = int(input_len * (1 - range_ratio)) output_low = int(output_len * range_ratio)
input_high = int(input_len * (1 + range_ratio))
output_low = int(output_len * (1 - range_ratio))
output_high = int(output_len * (1 + range_ratio))
# Add logging for debugging
logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
logger.info("Sampling output_len from [%s, %s]", output_low,
output_high)
input_lens = np.random.randint(input_low, input_lens = np.random.randint(input_low,
input_high + 1, input_len + 1,
size=num_requests) size=num_requests)
output_lens = np.random.randint(output_low, output_lens = np.random.randint(output_low,
output_high + 1, output_len + 1,
size=num_requests) size=num_requests)
offsets = np.random.randint(0, vocab_size, size=num_requests) offsets = np.random.randint(0, vocab_size, size=num_requests)
@ -485,11 +468,11 @@ class SonnetDataset(BenchmarkDataset):
# Determine how many poem lines to use. # Determine how many poem lines to use.
num_input_lines = round((input_len - base_offset) / avg_len) num_input_lines = round((input_len - base_offset) / avg_len)
num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0) num_prefix_lines = round((prefix_len - base_offset) / avg_len)
prefix_lines = self.data[:num_prefix_lines] prefix_lines = self.data[:num_prefix_lines]
samples = [] samples = []
while len(samples) < num_requests: for _ in range(num_requests):
extra_lines = random.choices(self.data, extra_lines = random.choices(self.data,
k=num_input_lines - num_prefix_lines) k=num_input_lines - num_prefix_lines)
prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}" prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
@ -497,14 +480,13 @@ class SonnetDataset(BenchmarkDataset):
prompt_formatted = tokenizer.apply_chat_template( prompt_formatted = tokenizer.apply_chat_template(
msg, add_generation_prompt=True, tokenize=False) msg, add_generation_prompt=True, tokenize=False)
prompt_len = len(tokenizer(prompt_formatted).input_ids) prompt_len = len(tokenizer(prompt_formatted).input_ids)
if prompt_len <= input_len: samples.append(
samples.append( SampleRequest(
SampleRequest( prompt=prompt_formatted
prompt=prompt_formatted if return_prompt_formatted else prompt,
if return_prompt_formatted else prompt, prompt_len=prompt_len,
prompt_len=prompt_len, expected_output_len=output_len,
expected_output_len=output_len, ))
))
return samples return samples
@ -580,47 +562,48 @@ class BurstGPTDataset(BenchmarkDataset):
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# HuggingFace Dataset Base Implementation # HuggingFace Dataset Implementation
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
class HuggingFaceDataset(BenchmarkDataset):
"""Base class for datasets hosted on HuggingFace."""
SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
class HuggingFaceDataset(BenchmarkDataset):
"""
Dataset class for processing a HuggingFace dataset with conversation data
and optional images.
"""
def __init__( def __init__(
self, self,
dataset_path: str,
dataset_split: str, dataset_split: str,
dataset_subset: Optional[str] = None, dataset_subset: Optional[str] = None,
**kwargs, **kwargs,
) -> None: ) -> None:
super().__init__(dataset_path=dataset_path, **kwargs) super().__init__(**kwargs)
self.dataset_split = dataset_split self.dataset_split = dataset_split
self.dataset_subset = dataset_subset self.dataset_subset = dataset_subset
self.load_data() self.load_data()
def load_data(self) -> None: def load_data(self) -> None:
"""Load data from HuggingFace datasets.""" if not self.dataset_path:
raise ValueError("dataset_path must be provided for loading data.")
self.data = load_dataset( self.data = load_dataset(
self.dataset_path, self.dataset_path,
name=self.dataset_subset, name=self.dataset_subset,
split=self.dataset_split, split=self.dataset_split,
streaming=True, streaming=True,
) )
self.data = self.data.shuffle(seed=self.random_seed) if self.data.features is None or "conversations" \
not in self.data.features:
raise ValueError(
# ----------------------------------------------------------------------------- "HuggingFaceDataset currently only supports datasets with "
# Conversation Dataset Implementation "a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
# ----------------------------------------------------------------------------- "Please consider contributing if you would like to add "
"support for additional dataset formats.")
# Shuffle and filter examples with at least 2 conversations.
class ConversationDataset(HuggingFaceDataset): self.data = self.data.shuffle(seed=self.random_seed).filter(
"""Dataset for conversation data with multimodal support.""" lambda x: len(x["conversations"]) >= 2)
SUPPORTED_DATASET_PATHS = {
'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
}
def sample(self, def sample(self,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
@ -628,13 +611,10 @@ class ConversationDataset(HuggingFaceDataset):
output_len: Optional[int] = None, output_len: Optional[int] = None,
enable_multimodal_chat: bool = False, enable_multimodal_chat: bool = False,
**kwargs) -> list: **kwargs) -> list:
# Filter examples with at least 2 conversations
filtered_data = self.data.filter(
lambda x: len(x["conversations"]) >= 2)
sampled_requests = [] sampled_requests = []
dynamic_output = output_len is None dynamic_output = output_len is None
for item in filtered_data: for item in self.data:
if len(sampled_requests) >= num_requests: if len(sampled_requests) >= num_requests:
break break
conv = item["conversations"] conv = item["conversations"]
@ -679,12 +659,29 @@ class VisionArenaDataset(HuggingFaceDataset):
""" """
DEFAULT_OUTPUT_LEN = 128 DEFAULT_OUTPUT_LEN = 128
SUPPORTED_DATASET_PATHS = { VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1"
"lmarena-ai/VisionArena-Chat":
lambda x: x["conversation"][0][0]["content"], def __init__(
"lmarena-ai/vision-arena-bench-v0.1": self,
lambda x: x["turns"][0][0]["content"] **kwargs,
} ) -> None:
super().__init__(**kwargs)
if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
raise ValueError(f"Only support Vision Arena dataset.\
This data path {self.dataset_path} is not valid.")
if self.dataset_subset is None and self.dataset_split != "train":
raise ValueError("Dataset split must be 'train'.")
self.load_data()
def load_data(self) -> None:
dataset = load_dataset(
self.dataset_path,
name=self.dataset_subset,
split=self.dataset_split,
streaming=True,
)
self.data = dataset.shuffle(seed=self.random_seed)
def sample( def sample(
self, self,
@ -700,11 +697,7 @@ class VisionArenaDataset(HuggingFaceDataset):
for item in self.data: for item in self.data:
if len(sampled_requests) >= num_requests: if len(sampled_requests) >= num_requests:
break break
parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path) prompt = item["turns"][0][0]["content"]
if parser_fn is None:
raise ValueError(
f"Unsupported dataset path: {self.dataset_path}")
prompt = parser_fn(item)
mm_content = process_image(item["images"][0]) mm_content = process_image(item["images"][0])
prompt_len = len(tokenizer(prompt).input_ids) prompt_len = len(tokenizer(prompt).input_ids)
if enable_multimodal_chat: if enable_multimodal_chat:
@ -722,96 +715,3 @@ class VisionArenaDataset(HuggingFaceDataset):
)) ))
self.maybe_oversample_requests(sampled_requests, num_requests) self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests return sampled_requests
# -----------------------------------------------------------------------------
# Instruct Coder Dataset Implementation
# -----------------------------------------------------------------------------
class InstructCoderDataset(HuggingFaceDataset):
"""
InstructCoder Dataset.
https://huggingface.co/datasets/likaixin/InstructCoder
InstructCoder is the dataset designed for general code editing. It consists
of 114,239 instruction-input-output triplets, and covers multiple distinct
code editing scenario.
"""
DEFAULT_OUTPUT_LEN = 200 # this is the average default output length
SUPPORTED_DATASET_PATHS = {
"likaixin/InstructCoder",
}
def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
output_len = (output_len
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
sampled_requests = []
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt = f"{item['instruction']}:\n{item['input']}"
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
))
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
# -----------------------------------------------------------------------------
# AIMO Dataset Implementation
# -----------------------------------------------------------------------------
class AIMODataset(HuggingFaceDataset):
"""
Dataset class for processing a AIMO dataset with reasoning questions.
"""
SUPPORTED_DATASET_PATHS = {
"AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5",
"AI-MO/NuminaMath-CoT"
}
def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
**kwargs) -> list:
sampled_requests = []
dynamic_output = output_len is None
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt, completion = item['problem'], item["solution"]
prompt_ids = tokenizer(prompt).input_ids
completion_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_ids)
completion_len = len(completion_ids)
output_len = completion_len if dynamic_output else output_len
assert isinstance(output_len, int) and output_len > 0
if dynamic_output and not is_valid_sequence(prompt_len,
completion_len,
max_prompt_len=2048,
max_total_len=32000):
continue
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=None,
))
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests

View File

@ -7,6 +7,9 @@ On the server side, run one of the following commands:
--swap-space 16 \ --swap-space 16 \
--disable-log-requests --disable-log-requests
(TGI backend)
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
On the client side, run: On the client side, run:
python benchmarks/benchmark_serving.py \ python benchmarks/benchmark_serving.py \
--backend <backend> \ --backend <backend> \
@ -34,8 +37,7 @@ from datetime import datetime
from typing import Any, Optional from typing import Any, Optional
import numpy as np import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
RequestFuncOutput) RequestFuncOutput)
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
@ -50,11 +52,9 @@ try:
except ImportError: except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser from argparse import ArgumentParser as FlexibleArgumentParser
from benchmark_dataset import (AIMODataset, BurstGPTDataset, from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
ConversationDataset, HuggingFaceDataset, RandomDataset, SampleRequest, ShareGPTDataset,
InstructCoderDataset, RandomDataset, SonnetDataset, VisionArenaDataset)
SampleRequest, ShareGPTDataset, SonnetDataset,
VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
MILLISECONDS_TO_SECONDS_CONVERSION = 1000 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@ -156,7 +156,7 @@ def calculate_metrics(
if outputs[i].success: if outputs[i].success:
output_len = outputs[i].output_tokens output_len = outputs[i].output_tokens
if not output_len: if output_len is None:
# We use the tokenizer to count the number of output tokens # We use the tokenizer to count the number of output tokens
# for some serving backends instead of looking at # for some serving backends instead of looking at
# len(outputs[i].itl) since multiple output tokens may be # len(outputs[i].itl) since multiple output tokens may be
@ -261,7 +261,6 @@ async def benchmark(
goodput_config_dict: dict[str, float], goodput_config_dict: dict[str, float],
max_concurrency: Optional[int], max_concurrency: Optional[int],
lora_modules: Optional[Iterable[str]], lora_modules: Optional[Iterable[str]],
extra_body: Optional[dict],
): ):
if backend in ASYNC_REQUEST_FUNCS: if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend] request_func = ASYNC_REQUEST_FUNCS[backend]
@ -289,7 +288,6 @@ async def benchmark(
logprobs=logprobs, logprobs=logprobs,
multi_modal_content=test_mm_content, multi_modal_content=test_mm_content,
ignore_eos=ignore_eos, ignore_eos=ignore_eos,
extra_body=extra_body,
) )
test_output = await request_func(request_func_input=test_input) test_output = await request_func(request_func_input=test_input)
@ -316,8 +314,7 @@ async def benchmark(
output_len=test_output_len, output_len=test_output_len,
logprobs=logprobs, logprobs=logprobs,
multi_modal_content=test_mm_content, multi_modal_content=test_mm_content,
ignore_eos=ignore_eos, ignore_eos=ignore_eos)
extra_body=extra_body)
profile_output = await request_func(request_func_input=profile_input) profile_output = await request_func(request_func_input=profile_input)
if profile_output.success: if profile_output.success:
print("Profiler started") print("Profiler started")
@ -367,8 +364,7 @@ async def benchmark(
output_len=output_len, output_len=output_len,
logprobs=logprobs, logprobs=logprobs,
multi_modal_content=mm_content, multi_modal_content=mm_content,
ignore_eos=ignore_eos, ignore_eos=ignore_eos)
extra_body=extra_body)
tasks.append( tasks.append(
asyncio.create_task( asyncio.create_task(
limited_request_func(request_func_input=request_func_input, limited_request_func(request_func_input=request_func_input,
@ -590,39 +586,19 @@ def main(args: argparse.Namespace):
return_prompt_formatted=True) return_prompt_formatted=True)
elif args.dataset_name == "hf": elif args.dataset_name == "hf":
# all following datasets are implemented from the # Choose between VisionArenaDataset
# HuggingFaceDataset base class # and HuggingFaceDataset based on provided parameters.
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: dataset_class = (VisionArenaDataset if args.dataset_path
dataset_class = VisionArenaDataset == VisionArenaDataset.VISION_ARENA_DATASET_PATH
args.hf_split = "train" and args.hf_subset is None else HuggingFaceDataset)
args.hf_subset = None
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
dataset_class = InstructCoderDataset
args.hf_split = "train"
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
dataset_class = ConversationDataset
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
dataset_class = AIMODataset
args.hf_split = "train"
else:
supported_datasets = set([
dataset_name for cls in HuggingFaceDataset.__subclasses__()
for dataset_name in cls.SUPPORTED_DATASET_PATHS
])
raise ValueError(
f"Unsupported dataset path: {args.dataset_path}. "
"Huggingface dataset only supports dataset_path"
f" from one of following: {supported_datasets}. "
"Please consider contributing if you would "
"like to add support for additional dataset formats.")
input_requests = dataset_class( input_requests = dataset_class(
dataset_path=args.dataset_path, dataset_path=args.dataset_path,
dataset_subset=args.hf_subset, dataset_subset=args.hf_subset,
dataset_split=args.hf_split, dataset_split=args.hf_split,
random_seed=args.seed,
).sample( ).sample(
num_requests=args.num_prompts, num_requests=args.num_prompts,
tokenizer=tokenizer, tokenizer=tokenizer,
random_seed=args.seed,
output_len=args.hf_output_len, output_len=args.hf_output_len,
) )
@ -657,26 +633,6 @@ def main(args: argparse.Namespace):
raise ValueError(f"Unknown dataset: {args.dataset_name}") from err raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
goodput_config_dict = check_goodput_args(args) goodput_config_dict = check_goodput_args(args)
# Collect the sampling parameters.
sampling_params = {
k: v
for k, v in {
"top_p": args.top_p,
"top_k": args.top_k,
"min_p": args.min_p,
"temperature": args.temperature
}.items() if v is not None
}
# Sampling parameters are only supported by openai-compatible backend.
if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
raise ValueError(
"Sampling parameters are only supported by openai-compatible "
"backends.")
if "temperature" not in sampling_params:
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
# Avoid GC processing "static" data - reduce pause times. # Avoid GC processing "static" data - reduce pause times.
gc.collect() gc.collect()
gc.freeze() gc.freeze()
@ -703,7 +659,6 @@ def main(args: argparse.Namespace):
goodput_config_dict=goodput_config_dict, goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency, max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules, lora_modules=args.lora_modules,
extra_body=sampling_params,
)) ))
# Save config and results to json # Save config and results to json
@ -921,7 +876,7 @@ if __name__ == "__main__":
"--percentile-metrics", "--percentile-metrics",
type=str, type=str,
default="ttft,tpot,itl", default="ttft,tpot,itl",
help="Comma-separated list of selected metrics to report percentils. " help="Comma-seperated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. " "This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".") "Default value is \"ttft,tpot,itl\".")
@ -929,7 +884,7 @@ if __name__ == "__main__":
"--metric-percentiles", "--metric-percentiles",
type=str, type=str,
default="99", default="99",
help="Comma-separated list of percentiles for selected metrics. " help="Comma-seperated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". " "Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.", "Use \"--percentile-metrics\" to select metrics.",
@ -996,23 +951,18 @@ if __name__ == "__main__":
random_group.add_argument( random_group.add_argument(
"--random-range-ratio", "--random-range-ratio",
type=float, type=float,
default=0.0, default=1.0,
help="Range ratio for sampling input/output length, " help="Range of sampled ratio of input/output length, "
"used only for random sampling. Must be in the range [0, 1) to define " "used only for random sampling.",
"a symmetric sampling range"
"[length * (1 - range_ratio), length * (1 + range_ratio)].",
) )
random_group.add_argument( random_group.add_argument(
"--random-prefix-len", "--random-prefix-len",
type=int, type=int,
default=0, default=0,
help=("Number of fixed prefix tokens before the random context " help="Number of fixed prefix tokens before random "
"in a request. " " context. The length range of context in a random "
"The total input length is the sum of `random-prefix-len` and " " request is [random-prefix-len, "
"a random " " random-prefix-len + random-prefix-len * random-range-ratio).")
"context length sampled from [input_len * (1 - range_ratio), "
"input_len * (1 + range_ratio)]."),
)
hf_group = parser.add_argument_group("hf dataset options") hf_group = parser.add_argument_group("hf dataset options")
hf_group.add_argument("--hf-subset", hf_group.add_argument("--hf-subset",
@ -1031,33 +981,6 @@ if __name__ == "__main__":
"from the sampled HF dataset.", "from the sampled HF dataset.",
) )
sampling_group = parser.add_argument_group("sampling parameters")
sampling_group.add_argument(
"--top-p",
type=float,
default=None,
help="Top-p sampling parameter. Only has effect on openai-compatible "
"backends.")
sampling_group.add_argument(
"--top-k",
type=int,
default=None,
help="Top-k sampling parameter. Only has effect on openai-compatible "
"backends.")
sampling_group.add_argument(
"--min-p",
type=float,
default=None,
help="Min-p sampling parameter. Only has effect on openai-compatible "
"backends.")
sampling_group.add_argument(
"--temperature",
type=float,
default=None,
help="Temperature sampling parameter. Only has effect on "
"openai-compatible backends. If not specified, default to greedy "
"decoding (i.e. temperature==0.0).")
parser.add_argument( parser.add_argument(
'--tokenizer-mode', '--tokenizer-mode',
type=str, type=str,

View File

@ -5,13 +5,16 @@ On the server side, run one of the following commands:
(vLLM OpenAI API server) (vLLM OpenAI API server)
vllm serve <your_model> --disable-log-requests vllm serve <your_model> --disable-log-requests
(TGI backend)
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
On the client side, run: On the client side, run:
python benchmarks/benchmark_serving_structured_output.py \ python benchmarks/benchmark_serving_structured_output.py \
--backend <backend> \ --backend <backend> \
--model <your_model> \ --model <your_model> \
--dataset json \ --dataset json \
--structured-output-ratio 1.0 \ --structured-output-ratio 1.0 \
--structured-output-backend auto \ --structured-output-backend xgrammar \
--request-rate 10 \ --request-rate 10 \
--num-prompts 1000 --num-prompts 1000
@ -130,11 +133,10 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
"description": "description":
"An unique optional field to avoid cached schemas" "An unique optional field to avoid cached schemas"
} }
else:
json_schemas = [schema] * args.num_prompts
def gen_prompt(index: int): def gen_prompt(index: int):
return f"Generate an example of a user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501 schema = json_schemas[index % len(json_schemas)]
return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501
def get_schema(index: int): def get_schema(index: int):
return json_schemas[index % len(json_schemas)] return json_schemas[index % len(json_schemas)]
@ -964,7 +966,7 @@ if __name__ == "__main__":
"--percentile-metrics", "--percentile-metrics",
type=str, type=str,
default="ttft,tpot,itl", default="ttft,tpot,itl",
help="Comma-separated list of selected metrics to report percentils. " help="Comma-seperated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. " "This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".") "Default value is \"ttft,tpot,itl\".")
@ -972,7 +974,7 @@ if __name__ == "__main__":
"--metric-percentiles", "--metric-percentiles",
type=str, type=str,
default="99", default="99",
help="Comma-separated list of percentiles for selected metrics. " help="Comma-seperated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". " "Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.", "Use \"--percentile-metrics\" to select metrics.",
@ -997,14 +999,12 @@ if __name__ == "__main__":
type=float, type=float,
default=1.0, default=1.0,
help="Ratio of Structured Outputs requests") help="Ratio of Structured Outputs requests")
parser.add_argument("--structured-output-backend", parser.add_argument(
type=str, "--structured-output-backend",
choices=[ type=str,
"outlines", "lm-format-enforcer", "xgrammar", choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"],
"guidance", "auto" default="xgrammar",
], help="Backend to use for structured outputs")
default="auto",
help="Backend to use for structured outputs")
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)

View File

@ -11,8 +11,7 @@ from typing import Any, Optional, Union
import torch import torch
import uvloop import uvloop
from benchmark_dataset import (AIMODataset, BurstGPTDataset, from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
ConversationDataset, InstructCoderDataset,
RandomDataset, SampleRequest, ShareGPTDataset, RandomDataset, SampleRequest, ShareGPTDataset,
SonnetDataset, VisionArenaDataset) SonnetDataset, VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@ -213,17 +212,14 @@ def run_hf(
max_prompt_len = 0 max_prompt_len = 0
max_output_len = 0 max_output_len = 0
for i in range(len(requests)): for i in range(len(requests)):
prompt = requests[i].prompt prompt, prompt_len, output_len = requests[i]
prompt_len = requests[i].prompt_len
output_len = requests[i].expected_output_len
# Add the prompt to the batch. # Add the prompt to the batch.
batch.append(prompt) batch.append(prompt)
max_prompt_len = max(max_prompt_len, prompt_len) max_prompt_len = max(max_prompt_len, prompt_len)
max_output_len = max(max_output_len, output_len) max_output_len = max(max_output_len, output_len)
if len(batch) < max_batch_size and i != len(requests) - 1: if len(batch) < max_batch_size and i != len(requests) - 1:
# Check if we can add more requests to the batch. # Check if we can add more requests to the batch.
next_prompt_len = requests[i + 1].prompt_len _, next_prompt_len, next_output_len = requests[i + 1]
next_output_len = requests[i + 1].expected_output_len
if (max(max_prompt_len, next_prompt_len) + if (max(max_prompt_len, next_prompt_len) +
max(max_output_len, next_output_len)) <= 2048: max(max_output_len, next_output_len)) <= 2048:
# We can add more requests to the batch. # We can add more requests to the batch.
@ -304,7 +300,6 @@ def get_requests(args, tokenizer):
"input_len": args.input_len, "input_len": args.input_len,
"output_len": args.output_len, "output_len": args.output_len,
} }
if args.dataset_path is None or args.dataset_name == "random": if args.dataset_path is None or args.dataset_name == "random":
sample_kwargs["range_ratio"] = args.random_range_ratio sample_kwargs["range_ratio"] = args.random_range_ratio
sample_kwargs["prefix_len"] = args.prefix_len sample_kwargs["prefix_len"] = args.prefix_len
@ -322,23 +317,18 @@ def get_requests(args, tokenizer):
elif args.dataset_name == "burstgpt": elif args.dataset_name == "burstgpt":
dataset_cls = BurstGPTDataset dataset_cls = BurstGPTDataset
elif args.dataset_name == "hf": elif args.dataset_name == "hf":
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: if args.backend != "vllm-chat":
dataset_cls = VisionArenaDataset raise ValueError(
common_kwargs['dataset_subset'] = None "hf datasets only are supported by vllm-chat backend")
common_kwargs['dataset_split'] = "train" # Choose between VisionArenaDataset and HuggingFaceDataset based on
sample_kwargs["enable_multimodal_chat"] = True # provided parameters.
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: dataset_cls = (VisionArenaDataset if args.dataset_path
dataset_cls = InstructCoderDataset == VisionArenaDataset.VISION_ARENA_DATASET_PATH
common_kwargs['dataset_split'] = "train" and args.hf_subset is None else HuggingFaceDataset)
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: common_kwargs['dataset_subset'] = args.hf_subset
dataset_cls = ConversationDataset common_kwargs['dataset_split'] = args.hf_split
common_kwargs['dataset_subset'] = args.hf_subset sample_kwargs["enable_multimodal_chat"] = True
common_kwargs['dataset_split'] = args.hf_split
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
dataset_cls = AIMODataset
common_kwargs['dataset_subset'] = None
common_kwargs['dataset_split'] = "train"
else: else:
raise ValueError(f"Unknown dataset name: {args.dataset_name}") raise ValueError(f"Unknown dataset name: {args.dataset_name}")
# Remove None values # Remove None values
@ -472,17 +462,9 @@ def validate_args(args):
warnings.warn("--hf-subset and --hf-split will be ignored \ warnings.warn("--hf-subset and --hf-split will be ignored \
since --dataset-name is not 'hf'.", since --dataset-name is not 'hf'.",
stacklevel=2) stacklevel=2)
elif args.dataset_name == "hf": elif args.dataset_name == "hf" and args.backend != "vllm-chat":
if args.dataset_path in ( raise ValueError(
VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys() "When --dataset-name is 'hf', backend must be 'vllm-chat'")
| ConversationDataset.SUPPORTED_DATASET_PATHS):
assert args.backend == "vllm-chat", f"{args.dataset_path} needs to use vllm-chat as the backend." #noqa: E501
elif args.dataset_path in (InstructCoderDataset.SUPPORTED_DATASET_PATHS
| AIMODataset.SUPPORTED_DATASET_PATHS):
assert args.backend == "vllm", f"{args.dataset_path} needs to use vllm as the backend." #noqa: E501
else:
raise ValueError(
f"{args.dataset_path} is not supported by hf dataset.")
# --random-range-ratio: only used when dataset_name is 'random' # --random-range-ratio: only used when dataset_name is 'random'
if args.dataset_name != 'random' and args.random_range_ratio is not None: if args.dataset_name != 'random' and args.random_range_ratio is not None:
@ -594,30 +576,18 @@ if __name__ == "__main__":
default=None, default=None,
help="Path to the lora adapters to use. This can be an absolute path, " help="Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier.") "a relative path, or a Hugging Face model identifier.")
parser.add_argument( parser.add_argument("--prefix-len",
"--prefix-len", type=int,
type=int, default=None,
default=None, help="Number of prefix tokens per request."
help=f"Number of prefix tokens to be used in RandomDataset " "This is for the RandomDataset and SonnetDataset")
"and SonnetDataset. For RandomDataset, the total input "
"length is the sum of prefix-len (default: "
f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
"sampled from [input_len * (1 - range_ratio), "
"input_len * (1 + range_ratio)]. For SonnetDataset, "
f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
"controls how much of the input is fixed lines versus "
"random lines, but the total input length remains approximately "
"input_len tokens.")
# random dataset # random dataset
parser.add_argument( parser.add_argument(
"--random-range-ratio", "--random-range-ratio",
type=float, type=float,
default=None, default=None,
help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) " help="Range of sampled ratio of input/output length, "
"for sampling input/output length, " "used only for RandomDataSet.",
"used only for RandomDataset. Must be in the range [0, 1) to "
"define a symmetric sampling range "
"[length * (1 - range_ratio), length * (1 + range_ratio)].",
) )
# hf dtaset # hf dtaset

View File

@ -30,18 +30,19 @@ class BenchmarkConfig(TypedDict):
num_stages: int num_stages: int
def benchmark_config(config: BenchmarkConfig, def benchmark_config(
num_tokens: int, config: BenchmarkConfig,
num_experts: int, num_tokens: int,
shard_intermediate_size: int, num_experts: int,
hidden_size: int, shard_intermediate_size: int,
topk: int, hidden_size: int,
dtype: torch.dtype, topk: int,
use_fp8_w8a8: bool, dtype: torch.dtype,
use_int8_w8a16: bool, use_fp8_w8a8: bool,
num_iters: int = 100, use_int8_w8a16: bool,
block_quant_shape: List[int] = None, num_iters: int = 100,
use_deep_gemm: bool = False) -> float: block_quant_shape: List[int] = None,
) -> float:
init_dtype = torch.float16 if use_fp8_w8a8 else dtype init_dtype = torch.float16 if use_fp8_w8a8 else dtype
x = torch.randn(num_tokens, hidden_size, dtype=dtype) x = torch.randn(num_tokens, hidden_size, dtype=dtype)
if use_int8_w8a16: if use_int8_w8a16:
@ -114,41 +115,22 @@ def benchmark_config(config: BenchmarkConfig,
def run(): def run():
from vllm.model_executor.layers.fused_moe import override_config from vllm.model_executor.layers.fused_moe import override_config
with override_config(config): with override_config(config):
if use_deep_gemm: fused_moe(
topk_weights, topk_ids = fused_topk(x, input_gating, topk, x,
False) w1,
return fused_experts( w2,
x, input_gating,
w1, topk,
w2, renormalize=True,
topk_weights, inplace=True,
topk_ids, use_fp8_w8a8=use_fp8_w8a8,
inplace=True, use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8, w1_scale=w1_scale,
w1_scale=w1_scale, w2_scale=w2_scale,
w2_scale=w2_scale, a1_scale=a1_scale,
a1_scale=a1_scale, a2_scale=a2_scale,
a2_scale=a2_scale, block_shape=block_quant_shape,
block_shape=block_quant_shape, )
allow_deep_gemm=True,
)
else:
fused_moe(
x,
w1,
w2,
input_gating,
topk,
renormalize=True,
inplace=True,
use_fp8_w8a8=use_fp8_w8a8,
use_int8_w8a16=use_int8_w8a16,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
)
# JIT compilation & warmup # JIT compilation & warmup
run() run()
@ -384,7 +366,6 @@ class BenchmarkWorker:
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
block_quant_shape: List[int] = None, block_quant_shape: List[int] = None,
use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]: ) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed) current_platform.seed_everything(self.seed)
dtype_str = get_config_dtype_str(dtype, dtype_str = get_config_dtype_str(dtype,
@ -415,8 +396,7 @@ class BenchmarkWorker:
use_fp8_w8a8, use_fp8_w8a8,
use_int8_w8a16, use_int8_w8a16,
num_iters=100, num_iters=100,
block_quant_shape=block_quant_shape, block_quant_shape=block_quant_shape)
use_deep_gemm=use_deep_gemm)
return config, kernel_time return config, kernel_time
def tune( def tune(
@ -431,7 +411,6 @@ class BenchmarkWorker:
use_int8_w8a16: bool, use_int8_w8a16: bool,
search_space: list[dict[str, int]], search_space: list[dict[str, int]],
block_quant_shape: list[int], block_quant_shape: list[int],
use_deep_gemm: bool,
) -> dict[str, int]: ) -> dict[str, int]:
best_config = None best_config = None
best_time = float("inf") best_time = float("inf")
@ -457,8 +436,7 @@ class BenchmarkWorker:
use_fp8_w8a8, use_fp8_w8a8,
use_int8_w8a16, use_int8_w8a16,
num_iters=20, num_iters=20,
block_quant_shape=block_quant_shape, block_quant_shape=block_quant_shape)
use_deep_gemm=use_deep_gemm)
except triton.runtime.autotuner.OutOfResources: except triton.runtime.autotuner.OutOfResources:
# Some configurations may be invalid and fail to compile. # Some configurations may be invalid and fail to compile.
continue continue
@ -553,9 +531,6 @@ def main(args: argparse.Namespace):
intermediate_size = config.moe_intermediate_size intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size shard_intermediate_size = 2 * intermediate_size // args.tp_size
else: else:
if not hasattr(config, "hidden_size"):
# Support for llama4
config = config.text_config
# Default: Mixtral. # Default: Mixtral.
E = config.num_local_experts E = config.num_local_experts
topk = config.num_experts_per_tok topk = config.num_experts_per_tok
@ -575,8 +550,6 @@ def main(args: argparse.Namespace):
else: else:
batch_sizes = [args.batch_size] batch_sizes = [args.batch_size]
use_deep_gemm = bool(args.use_deep_gemm)
ray.init() ray.init()
num_gpus = int(ray.available_resources()["GPU"]) num_gpus = int(ray.available_resources()["GPU"])
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
@ -599,10 +572,10 @@ def main(args: argparse.Namespace):
start = time.time() start = time.time()
configs = _distribute( configs = _distribute(
"tune", [(batch_size, E, shard_intermediate_size, hidden_size, "tune",
topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space, [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
block_quant_shape, use_deep_gemm) use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape)
for batch_size in batch_sizes]) for batch_size in batch_sizes])
best_configs = { best_configs = {
M: sort_config(config) M: sort_config(config)
for M, config in zip(batch_sizes, configs) for M, config in zip(batch_sizes, configs)
@ -616,7 +589,7 @@ def main(args: argparse.Namespace):
outputs = _distribute( outputs = _distribute(
"benchmark", "benchmark",
[(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype, [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
use_fp8_w8a8, use_int8_w8a16, block_quant_shape, use_deep_gemm) use_fp8_w8a8, use_int8_w8a16, block_quant_shape)
for batch_size in batch_sizes]) for batch_size in batch_sizes])
for batch_size, (config, kernel_time) in zip(batch_sizes, outputs): for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
@ -638,7 +611,6 @@ if __name__ == "__main__":
type=str, type=str,
choices=["auto", "fp8_w8a8", "int8_w8a16"], choices=["auto", "fp8_w8a8", "int8_w8a16"],
default="auto") default="auto")
parser.add_argument("--use-deep-gemm", action="store_true")
parser.add_argument("--seed", type=int, default=0) parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False) parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--tune", action="store_true") parser.add_argument("--tune", action="store_true")

16
benchmarks/launch_tgi_server.sh Executable file
View File

@ -0,0 +1,16 @@
#!/bin/bash
PORT=8000
MODEL=$1
TOKENS=$2
docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \
-v "$PWD/data:/data" \
ghcr.io/huggingface/text-generation-inference:2.2.0 \
--model-id "$MODEL" \
--sharded false \
--max-input-length 1024 \
--max-total-tokens 2048 \
--max-best-of 5 \
--max-concurrent-requests 5000 \
--max-batch-total-tokens "$TOKENS"

View File

@ -33,6 +33,8 @@ endif()
if(MACOSX_FOUND) if(MACOSX_FOUND)
list(APPEND CXX_COMPILE_FLAGS list(APPEND CXX_COMPILE_FLAGS
"-Xpreprocessor"
"-fopenmp"
"-DVLLM_CPU_EXTENSION") "-DVLLM_CPU_EXTENSION")
else() else()
list(APPEND CXX_COMPILE_FLAGS list(APPEND CXX_COMPILE_FLAGS
@ -195,7 +197,6 @@ set(VLLM_EXT_SRC
if (AVX512_FOUND AND NOT AVX512_DISABLED) if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC set(VLLM_EXT_SRC
"csrc/cpu/quant.cpp" "csrc/cpu/quant.cpp"
"csrc/cpu/shm.cpp"
${VLLM_EXT_SRC}) ${VLLM_EXT_SRC})
endif() endif()

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare( FetchContent_Declare(
vllm-flash-attn vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 0a721daebe4fa7149f06ecf3d3eabeb6dcd0f1fa GIT_TAG dc9d410b3e2d6534a4c70724c2515f4def670a22
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types # Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -105,14 +105,8 @@ def run(command):
else: else:
enc = locale.getpreferredencoding() enc = locale.getpreferredencoding()
output = raw_output.decode(enc) output = raw_output.decode(enc)
if command == 'nvidia-smi topo -m':
# don't remove the leading whitespace of `nvidia-smi topo -m`
# because they are meaningful
output = output.rstrip()
else:
output = output.strip()
err = raw_err.decode(enc) err = raw_err.decode(enc)
return rc, output, err.strip() return rc, output.strip(), err.strip()
def run_and_read_all(run_lambda, command): def run_and_read_all(run_lambda, command):
@ -283,13 +277,12 @@ def get_vllm_version():
if __version__ == "dev": if __version__ == "dev":
return "N/A (dev)" return "N/A (dev)"
if len(__version_tuple__) == 4: # dev build if len(__version_tuple__) == 4: # dev build
git_sha = __version_tuple__[-1][1:] # type: ignore git_sha = __version_tuple__[-1][1:] # type: ignore
return f"{__version__} (git sha: {git_sha}" return f"{__version__} (git sha: {git_sha}"
return __version__ return __version__
def summarize_vllm_build_flags(): def summarize_vllm_build_flags():
# This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc. # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
return 'CUDA Archs: {}; ROCm: {}; Neuron: {}'.format( return 'CUDA Archs: {}; ROCm: {}; Neuron: {}'.format(
@ -489,30 +482,16 @@ def get_pip_packages(run_lambda, patterns=None):
if patterns is None: if patterns is None:
patterns = DEFAULT_PIP_PATTERNS patterns = DEFAULT_PIP_PATTERNS
def run_with_pip(): # People generally have `pip` as `pip` or `pip3`
try: # But here it is invoked as `python -mpip`
import importlib.util def run_with_pip(pip):
pip_spec = importlib.util.find_spec('pip') out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
pip_available = pip_spec is not None
except ImportError:
pip_available = False
if pip_available:
cmd = [sys.executable, '-mpip', 'list', '--format=freeze']
elif os.environ.get("UV") is not None:
print("uv is set")
cmd = ["uv", "pip", "list", "--format=freeze"]
else:
raise RuntimeError(
"Could not collect pip list output (pip or uv module not available)"
)
out = run_and_read_all(run_lambda, cmd)
return "\n".join(line for line in out.splitlines() return "\n".join(line for line in out.splitlines()
if any(name in line for name in patterns)) if any(name in line for name in patterns))
pip_version = 'pip3' if sys.version[0] == '3' else 'pip' pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
out = run_with_pip() out = run_with_pip([sys.executable, '-mpip'])
return pip_version, out return pip_version, out
@ -538,12 +517,13 @@ def is_xnnpack_available():
else: else:
return "N/A" return "N/A"
def get_env_vars(): def get_env_vars():
env_vars = '' env_vars = ''
secret_terms = ('secret', 'token', 'api', 'access', 'password') secret_terms=('secret', 'token', 'api', 'access', 'password')
report_prefix = ("TORCH", "NCCL", "PYTORCH", "CUDA", "CUBLAS", "CUDNN", report_prefix = ("TORCH", "NCCL", "PYTORCH",
"OMP_", "MKL_", "NVIDIA") "CUDA", "CUBLAS", "CUDNN",
"OMP_", "MKL_",
"NVIDIA")
for k, v in os.environ.items(): for k, v in os.environ.items():
if any(term in k.lower() for term in secret_terms): if any(term in k.lower() for term in secret_terms):
continue continue
@ -554,7 +534,6 @@ def get_env_vars():
return env_vars return env_vars
def get_env_info(): def get_env_info():
run_lambda = run run_lambda = run
pip_version, pip_list_output = get_pip_packages(run_lambda) pip_version, pip_list_output = get_pip_packages(run_lambda)

View File

@ -1,178 +0,0 @@
#include <optional>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <algorithm>
#include "attention_dtypes.h"
#include "attention_utils.cuh"
namespace vllm {
// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
// can be used to combine partial attention results (in the split-KV case)
template <typename scalar_t, const uint NUM_THREADS>
__global__ void merge_attn_states_kernel(
scalar_t* output, float* output_lse, const scalar_t* prefix_output,
const float* prefix_lse, const scalar_t* suffix_output,
const float* suffix_lse, const uint num_tokens, const uint num_heads,
const uint head_size) {
using pack_128b_t = uint4;
const uint pack_size = 16 / sizeof(scalar_t);
const uint threads_per_head = head_size / pack_size;
const uint global_idx = blockIdx.x * NUM_THREADS + threadIdx.x;
const uint token_head_threads = num_tokens * num_heads * threads_per_head;
if (global_idx >= token_head_threads) return;
// global_idx -> token_idx + head_idx + pack_idx
const uint token_head_idx = global_idx / threads_per_head;
const uint pack_idx = global_idx % threads_per_head;
const uint token_idx = token_head_idx / num_heads;
const uint head_idx = token_head_idx % num_heads;
const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc.
const uint head_offset =
token_idx * num_heads * head_size + head_idx * head_size;
const scalar_t* prefix_head_ptr = prefix_output + head_offset;
const scalar_t* suffix_head_ptr = suffix_output + head_offset;
scalar_t* output_head_ptr = output + head_offset;
float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
p_lse = std::isinf(p_lse) ? -std::numeric_limits<float>::infinity() : p_lse;
s_lse = std::isinf(s_lse) ? -std::numeric_limits<float>::infinity() : s_lse;
const float max_lse = fmaxf(p_lse, s_lse);
p_lse = p_lse - max_lse;
s_lse = s_lse - max_lse;
const float p_se = expf(p_lse);
const float s_se = expf(s_lse);
const float out_se = p_se + s_se;
const float p_scale = p_se / out_se;
const float s_scale = s_se / out_se;
if (pack_offset < head_size) {
// Pack 128b load
pack_128b_t p_out_pack = reinterpret_cast<const pack_128b_t*>(
prefix_head_ptr)[pack_offset / pack_size];
pack_128b_t s_out_pack = reinterpret_cast<const pack_128b_t*>(
suffix_head_ptr)[pack_offset / pack_size];
pack_128b_t o_out_pack;
#pragma unroll
for (uint i = 0; i < pack_size; ++i) {
// Always use float for FMA to keep high precision.
// half(uint16_t), bfloat16, float -> float.
const float p_out_f =
vllm::to_float(reinterpret_cast<const scalar_t*>(&p_out_pack)[i]);
const float s_out_f =
vllm::to_float(reinterpret_cast<const scalar_t*>(&s_out_pack)[i]);
// fma: a * b + c = p_out_f * p_scale + (s_out_f * s_scale)
const float o_out_f = p_out_f * p_scale + (s_out_f * s_scale);
// float -> half(uint16_t), bfloat16, float.
vllm::from_float(reinterpret_cast<scalar_t*>(&o_out_pack)[i], o_out_f);
}
// Pack 128b storage
reinterpret_cast<pack_128b_t*>(output_head_ptr)[pack_offset / pack_size] =
o_out_pack;
}
// We only need to write to output_lse once per head.
if (output_lse != nullptr && pack_idx == 0) {
float out_lse = logf(out_se) + max_lse;
output_lse[head_idx * num_tokens + token_idx] = out_lse;
}
}
} // namespace vllm
// The following macro is used to dispatch the conversion function based on
// the output data type. The FN is a macro that calls a function with
// template<typename scalar_t>.
#define DISPATCH_BY_SCALAR_DTYPE(scalar_dtype, fn) \
{ \
if (scalar_dtype == at::ScalarType::Float) { \
fn(float); \
} else if (scalar_dtype == at::ScalarType::Half) { \
fn(uint16_t); \
} else if (scalar_dtype == at::ScalarType::BFloat16) { \
fn(__nv_bfloat16); \
} else { \
TORCH_CHECK(false, "Unsupported data type of O: ", scalar_dtype); \
} \
}
#define LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS) \
{ \
vllm::merge_attn_states_kernel<scalar_t, NUM_THREADS> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<scalar_t*>(output.data_ptr()), output_lse_ptr, \
reinterpret_cast<scalar_t*>(prefix_output.data_ptr()), \
reinterpret_cast<float*>(prefix_lse.data_ptr()), \
reinterpret_cast<scalar_t*>(suffix_output.data_ptr()), \
reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens, \
num_heads, head_size); \
}
/*@brief Merges the attention states from prefix and suffix
* into the output tensor. NUM_TOKENS: n, NUM_HEADS: h, HEAD_SIZE: d
*
* @param output [n,h,d] The output tensor to store the merged attention states.
* @param output_lse [h,d] Optional tensor to store the log-sum-exp values.
* @param prefix_output [n,h,d] The prefix attention states.
* @param prefix_lse [h,n] The log-sum-exp values for the prefix attention
* states.
* @param suffix_output [n,h,d] The suffix attention states.
* @param suffix_lse [h,n] The log-sum-exp values for the suffix attention
* states.
*/
template <typename scalar_t>
void merge_attn_states_launcher(torch::Tensor& output,
std::optional<torch::Tensor> output_lse,
const torch::Tensor& prefix_output,
const torch::Tensor& prefix_lse,
const torch::Tensor& suffix_output,
const torch::Tensor& suffix_lse) {
constexpr uint NUM_THREADS = 128;
const uint num_tokens = output.size(0);
const uint num_heads = output.size(1);
const uint head_size = output.size(2);
const uint pack_size = 16 / sizeof(scalar_t);
TORCH_CHECK(head_size % pack_size == 0,
"headsize must be multiple of pack_size:", pack_size);
float* output_lse_ptr = nullptr;
if (output_lse.has_value()) {
output_lse_ptr = output_lse.value().data_ptr<float>();
}
// Process one pack elements per thread. for float, the
// pack_size is 4 for half/bf16, the pack_size is 8.
const uint threads_per_head = head_size / pack_size;
const uint total_threads = num_tokens * num_heads * threads_per_head;
dim3 block(NUM_THREADS);
dim3 grid((total_threads + NUM_THREADS - 1) / NUM_THREADS);
const c10::cuda::OptionalCUDAGuard device_guard(prefix_output.device());
auto stream = at::cuda::getCurrentCUDAStream();
LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS);
}
#define CALL_MERGE_ATTN_STATES_LAUNCHER(scalar_t) \
{ \
merge_attn_states_launcher<scalar_t>(output, output_lse, prefix_output, \
prefix_lse, suffix_output, \
suffix_lse); \
}
void merge_attn_states(torch::Tensor& output,
std::optional<torch::Tensor> output_lse,
const torch::Tensor& prefix_output,
const torch::Tensor& prefix_lse,
const torch::Tensor& suffix_output,
const torch::Tensor& suffix_lse) {
DISPATCH_BY_SCALAR_DTYPE(output.dtype(), CALL_MERGE_ATTN_STATES_LAUNCHER);
}

View File

@ -78,14 +78,9 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
__m256i reg; __m256i reg;
// normal load
explicit FP16Vec16(const void* ptr) explicit FP16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
// non-temproal load
explicit FP16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
explicit FP16Vec16(const FP32Vec16&); explicit FP16Vec16(const FP32Vec16&);
void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
@ -115,14 +110,9 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
__m256i reg; __m256i reg;
// normal load
explicit BF16Vec16(const void* ptr) explicit BF16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
// non-temproal load
explicit BF16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
explicit BF16Vec16(const FP32Vec16&); explicit BF16Vec16(const FP32Vec16&);
void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
@ -323,13 +313,8 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}
// normal load
explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {} explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}
// non-temproal load
explicit FP32Vec16(bool, void* ptr)
: reg((__m512)_mm512_stream_load_si512(ptr)) {}
explicit FP32Vec16(__m512 data) : reg(data) {} explicit FP32Vec16(__m512 data) : reg(data) {}
explicit FP32Vec16(const FP32Vec4& data) explicit FP32Vec16(const FP32Vec4& data)
@ -562,33 +547,6 @@ struct INT8Vec16 : public Vec<INT8Vec16> {
_mm_mask_storeu_epi8(ptr, mask, reg); _mm_mask_storeu_epi8(ptr, mask, reg);
} }
}; };
struct INT8Vec64 : public Vec<INT8Vec64> {
constexpr static int VEC_ELEM_NUM = 64;
union AliasReg {
__m512i reg;
int8_t values[VEC_ELEM_NUM];
};
__m512i reg;
// normal load
explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
// non-temproal load
explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
void save(int8_t* ptr, const int elem_num) const {
constexpr uint64_t M = 0xFFFFFFFFFFFFFFFF;
__mmask64 mask = _cvtu64_mask64(M >> (64 - elem_num));
_mm512_mask_storeu_epi8(ptr, mask, reg);
}
// non-temproal save
void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
};
#endif #endif
template <typename T> template <typename T>
@ -699,22 +657,6 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); } inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); }
#ifdef __AVX512F__
inline void non_temporal_save(FP16Vec16& vec, void* ptr) {
_mm256_stream_si256((__m256i*)ptr, vec.reg);
}
inline void non_temporal_save(BF16Vec32& vec, void* ptr) {
_mm512_stream_si512((__m512i*)ptr, vec.reg);
}
inline void non_temporal_save(BF16Vec16& vec, void* ptr) {
_mm256_stream_si256((__m256i*)ptr, vec.reg);
}
inline void non_temporal_save(FP32Vec16& vec, void* ptr) {
_mm512_stream_ps((float*)ptr, vec.reg);
}
#endif
inline void mem_barrier() { _mm_mfence(); }
}; // namespace vec_op }; // namespace vec_op
#endif #endif

View File

@ -1,781 +0,0 @@
#include "cpu/cpu_types.hpp"
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
namespace {
#define MAX_SHM_RANK_NUM 8
#define MAX_THREAD_NUM 12
#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024)
#define MIN_THREAD_PROCESS_SIZE (8 * 1024)
#define MAX_P2P_SEND_TENSOR_NUM 8
template <typename scalar_t>
struct KernelVecType {
using scalar_vec_t = void;
};
template <>
struct KernelVecType<float> {
using scalar_vec_t = vec_op::FP32Vec16;
};
template <>
struct KernelVecType<c10::BFloat16> {
using scalar_vec_t = vec_op::BF16Vec16;
};
template <>
struct KernelVecType<c10::Half> {
using scalar_vec_t = vec_op::FP16Vec16;
};
enum class ThreadSHMStat : char { THREAD_READY = 0, SHM_DATA_READY, DONE };
struct ThreadSHMContext {
volatile ThreadSHMStat thread_stats[MAX_SHM_RANK_NUM];
int thread_id;
int thread_num;
int rank;
int group_size;
size_t _spinning_count;
int swizzled_ranks[MAX_SHM_RANK_NUM];
void* thread_shm_ptrs[MAX_SHM_RANK_NUM];
ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM];
ThreadSHMContext(const int thread_id, const int thread_num, const int rank,
const int group_size, void* thread_shm_ptr)
: thread_id(thread_id),
thread_num(thread_num),
rank(rank),
group_size(group_size),
_spinning_count(0) {
static_assert(sizeof(ThreadSHMContext) % 64 == 0);
TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
TORCH_CHECK((size_t)this % 64 == 0);
TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
shm_contexts[i] = nullptr;
thread_shm_ptrs[i] = nullptr;
swizzled_ranks[i] = (i + rank) % group_size;
thread_stats[i] = ThreadSHMStat::DONE;
}
set_context(rank, this, thread_shm_ptr);
}
void set_context(int rank, ThreadSHMContext* ptr, void* thread_shm_ptr) {
TORCH_CHECK(rank < MAX_SHM_RANK_NUM);
TORCH_CHECK(ptr);
TORCH_CHECK(thread_shm_ptr);
TORCH_CHECK_EQ(ptr->thread_num, thread_num);
TORCH_CHECK_EQ(ptr->thread_id, thread_id);
shm_contexts[rank] = ptr;
thread_shm_ptrs[rank] = thread_shm_ptr;
}
template <typename T>
T* get_thread_shm_ptr(int rank) {
return reinterpret_cast<T*>(thread_shm_ptrs[rank]);
}
int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; }
void wait_for_all(ThreadSHMStat prev_stat) {
for (int idx = 0; idx < group_size; ++idx) {
int rank = get_swizzled_rank(idx);
while (thread_stats[rank] == prev_stat) {
++_spinning_count;
_mm_pause();
}
}
vec_op::mem_barrier();
}
void wait_for_one(int rank, ThreadSHMStat prev_stat) {
while (thread_stats[rank] == prev_stat) {
++_spinning_count;
_mm_pause();
}
vec_op::mem_barrier();
}
void set_thread_stat(ThreadSHMStat stat) {
for (int idx = 0; idx < group_size; ++idx) {
int rank = get_swizzled_rank(idx);
shm_contexts[rank]->thread_stats[this->rank] = stat;
}
}
void set_thread_stat(int target_rank, ThreadSHMStat stat) {
for (int idx = 0; idx < group_size; ++idx) {
int rank = get_swizzled_rank(idx);
shm_contexts[rank]->thread_stats[target_rank] = stat;
}
}
// barrier for all ranks in the group, used for all2all ops
// DONE -> THREAD_READY -> SHM_DATA_READY -> DONE -> ...
void barrier(ThreadSHMStat next_stat) {
if (next_stat == ThreadSHMStat::THREAD_READY) {
set_thread_stat(ThreadSHMStat::THREAD_READY);
wait_for_all(ThreadSHMStat::DONE);
} else if (next_stat == ThreadSHMStat::SHM_DATA_READY) {
set_thread_stat(ThreadSHMStat::SHM_DATA_READY);
wait_for_all(ThreadSHMStat::THREAD_READY);
} else if (next_stat == ThreadSHMStat::DONE) {
set_thread_stat(ThreadSHMStat::DONE);
wait_for_all(ThreadSHMStat::SHM_DATA_READY);
} else {
TORCH_CHECK(false, "Invalid next_stat to barrier.");
}
}
std::string to_string() const {
std::stringstream ss;
ss << "SHMContext:";
ss << "\nrank: " << rank;
ss << "\ngroup_size: " << group_size;
ss << "\nthread_num: " << thread_num;
ss << "\nthread_id: " << thread_id;
ss << "\nshm_ctx_stat_loop_seq: [";
for (int i = 0; i < group_size; ++i) {
ss << swizzled_ranks[i] << ", ";
}
ss << "]";
ss << "\nshm_contexts: [";
for (int i = 0; i < group_size; ++i) {
if (shm_contexts[i]) {
ss << shm_contexts[i]->rank << ", ";
}
}
ss << "]";
return ss.str();
}
};
class SHMManager {
public:
explicit SHMManager(const std::string& name, const int rank,
const int group_size)
: _rank(rank),
_group_size(group_size),
_thread_num(std::min(torch::get_num_threads(), MAX_THREAD_NUM)),
_shm_names({""}),
_shared_mem_ptrs({nullptr}),
_shm_ctx(nullptr) {
_shm_names[rank] = get_shm_name(name, rank);
_shared_mem_ptrs[rank] = init_shm(rank);
_shm_ctx = reinterpret_cast<ThreadSHMContext*>(_shared_mem_ptrs[rank]);
for (int i = 0; i < _thread_num; ++i) {
ThreadSHMContext* ctx = new (_shm_ctx + i)
ThreadSHMContext(i, _thread_num, _rank, _group_size,
compute_thread_shm_ptr(_shm_ctx, i));
}
}
void join(const std::string& name) {
for (int rank_idx = 0; rank_idx < _group_size; ++rank_idx) {
if (rank_idx != _rank) {
TORCH_CHECK(_shm_names[rank_idx].empty());
TORCH_CHECK(_shared_mem_ptrs[rank_idx] == nullptr);
_shm_names[rank_idx] = get_shm_name(name, rank_idx);
_shared_mem_ptrs[rank_idx] = init_shm(rank_idx);
ThreadSHMContext* target_ctx =
reinterpret_cast<ThreadSHMContext*>(_shared_mem_ptrs[rank_idx]);
for (int thread_idx = 0; thread_idx < _thread_num; ++thread_idx) {
_shm_ctx[thread_idx].set_context(
rank_idx, target_ctx + thread_idx,
compute_thread_shm_ptr(target_ctx, thread_idx));
}
}
}
}
~SHMManager() { destroy_shm(); }
ThreadSHMContext* get_shm_ctx() const { return _shm_ctx; }
static std::string get_shm_name(const std::string& name, int rank) {
return name + "_" + std::to_string(rank);
}
static int64_t create_singleton_instance(const std::string& name,
const int group_size,
const int rank) {
std::lock_guard<std::mutex> guard(SingletonInstancesLock);
SingletonInstances.emplace_back(
std::make_unique<SHMManager>(name, rank, group_size));
return static_cast<int64_t>(SingletonInstances.size() - 1);
}
static SHMManager* get_singleton_instance(int64_t handle) {
return SingletonInstances[handle].get();
}
protected:
static std::vector<std::unique_ptr<SHMManager>> SingletonInstances;
static std::mutex SingletonInstancesLock;
private:
static size_t round_to_alignment(size_t num) {
return ((num + 63) / 64) * 64;
}
int8_t* compute_thread_shm_ptr(ThreadSHMContext* ctx, int thread_id) {
int8_t* thread_shm_ptr =
reinterpret_cast<int8_t*>(ctx) +
round_to_alignment(_thread_num * sizeof(ThreadSHMContext));
return thread_shm_ptr +
thread_id * round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES);
}
size_t compute_shm_size() {
const size_t rounded_rank_buffer_size =
round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES) * _thread_num;
const size_t rounded_thread_shm_ctx_size =
round_to_alignment(_thread_num * sizeof(ThreadSHMContext));
const size_t shm_size =
rounded_thread_shm_ctx_size + rounded_rank_buffer_size;
return shm_size;
}
void* init_shm(int target_rank) {
const std::string& shm_name = _shm_names[target_rank];
const int local_rank = _rank;
const size_t shm_size = compute_shm_size();
int fd = -1;
if (local_rank == target_rank) {
fd = shm_open(shm_name.c_str(), O_CREAT | O_EXCL | O_RDWR,
S_IRUSR | S_IWUSR);
if (fd == -1)
TORCH_CHECK(false, "create shm in SHMManager failed. errno: " +
std::to_string(errno));
if (ftruncate(fd, shm_size) == -1)
TORCH_CHECK(false, "ftruncate in SHMManager failed. errno: " +
std::to_string(errno));
} else {
fd = shm_open(shm_name.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
if (fd == -1)
TORCH_CHECK(false, "open shm in SHMManager failed. errno: " +
std::to_string(errno));
}
void* shm_ptr = mmap(nullptr, shm_size, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE, fd, 0);
if (shm_ptr == MAP_FAILED) {
TORCH_CHECK(false,
"mmap in SHMManager failed. errno: " + std::to_string(errno));
}
if (close(fd) != 0) {
TORCH_CHECK(
false, "close in SHMManager failed. errno: " + std::to_string(errno));
}
TORCH_CHECK((size_t)shm_ptr % 64 == 0);
return shm_ptr;
}
void destroy_shm() {
std::stringstream ss;
ss << "local rank " << _rank << ": [";
for (int thread_id = 0; thread_id < _thread_num; ++thread_id) {
ss << _shm_ctx[thread_id]._spinning_count << ", ";
}
ss << "]\n";
for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
if (_shared_mem_ptrs[i] != nullptr) {
munmap(_shared_mem_ptrs[i], compute_shm_size());
}
if (!_shm_names[i].empty()) {
shm_unlink(_shm_names[i].c_str());
}
}
}
int _rank;
int _group_size;
int _thread_num;
std::array<std::string, MAX_SHM_RANK_NUM> _shm_names;
std::array<void*, MAX_SHM_RANK_NUM> _shared_mem_ptrs;
ThreadSHMContext* _shm_ctx;
};
namespace shm_cc_ops {
template <typename scalar_t, typename F>
void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) {
int thread_num = ctx->thread_num;
int64_t total_bytes = elem_num * sizeof(scalar_t);
int64_t total_units_num =
(total_bytes + MIN_THREAD_PROCESS_SIZE - 1) / MIN_THREAD_PROCESS_SIZE;
int64_t per_thread_units_num =
(total_units_num + thread_num - 1) / thread_num;
int64_t per_unit_elem_num = MIN_THREAD_PROCESS_SIZE / sizeof(scalar_t);
int64_t max_per_thread_iteration_elem_num =
PER_THREAD_SHM_BUFFER_BYTES / sizeof(scalar_t);
int64_t per_thread_elem_num = per_unit_elem_num * per_thread_units_num;
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < thread_num; ++i) {
int64_t offset = i * per_thread_elem_num;
int64_t end = std::min(elem_num, offset + per_thread_elem_num);
int64_t curr_elem_num =
std::min(max_per_thread_iteration_elem_num, end - offset);
ThreadSHMContext* thread_ctx = ctx + i;
while (curr_elem_num > 0) {
inner_func(thread_ctx, offset, curr_elem_num);
offset += max_per_thread_iteration_elem_num;
curr_elem_num = std::min(max_per_thread_iteration_elem_num, end - offset);
}
}
}
}; // namespace shm_cc_ops
namespace shm_cc_ops {
void memcpy_from_shm(void* dst, void* src, const int64_t bytes) {
const int64_t aligned_bytes = ((bytes >> 6) << 6); // 64 bytes aligned
int64_t i = 0;
#pragma GCC unroll 4
for (; i < aligned_bytes; i += 64) {
vec_op::INT8Vec64 data(
true, (int8_t*)src + i); // stream loading shm to avoid caching
data.save((int8_t*)dst + i);
}
if (aligned_bytes < bytes) {
vec_op::INT8Vec64 data(true, (int8_t*)src + aligned_bytes);
data.save((int8_t*)dst + aligned_bytes, bytes - aligned_bytes);
}
}
void memcpy_to_shm(void* dst, void* src, const int64_t bytes) {
#pragma GCC unroll 4
for (int64_t i = 0; i < bytes; i += 64) {
vec_op::INT8Vec64 data((int8_t*)src + i);
data.nt_save((int8_t*)dst + i);
}
}
void memcpy(void* dst, void* src, const int64_t bytes) {
const int64_t aligned_bytes = ((bytes >> 6) << 6); // 64 bytes aligned
int64_t i = 0;
#pragma GCC unroll 4
for (; i < aligned_bytes; i += 64) {
vec_op::INT8Vec64 data((int8_t*)src + i);
data.save((int8_t*)dst + i);
}
if (aligned_bytes < bytes) {
vec_op::INT8Vec64 data((int8_t*)src + aligned_bytes);
data.save((int8_t*)dst + aligned_bytes, bytes - aligned_bytes);
}
}
template <typename scalar_t, int RANKS>
void all_reduce_sum_impl(ThreadSHMContext* ctx, scalar_t* data,
size_t elem_num) {
CPU_KERNEL_GUARD_IN(all_reduce_sum_impl)
using vec_t = typename KernelVecType<scalar_t>::scalar_vec_t;
constexpr int64_t vec_elem_num = vec_t::get_elem_num();
const int worldsize = ctx->group_size;
shm_cc_ops::shm_cc_loop<scalar_t>(
ctx, elem_num,
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
int64_t data_elem_num) {
int rank = thread_ctx->rank;
scalar_t* thread_shm_ptr =
thread_ctx->get_thread_shm_ptr<scalar_t>(rank);
scalar_t* thread_data_ptr = data + data_offset;
int64_t thread_data_elem_num = data_elem_num * sizeof(scalar_t);
scalar_t* remote_data_ptrs[RANKS - 1];
vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
remote_data_ptrs[idx] = thread_ctx->get_thread_shm_ptr<scalar_t>(
thread_ctx->get_swizzled_rank(idx + 1));
});
thread_ctx->barrier(ThreadSHMStat::THREAD_READY);
shm_cc_ops::memcpy_to_shm(thread_shm_ptr, thread_data_ptr,
thread_data_elem_num);
thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY);
int64_t aligned_data_elem_num =
(data_elem_num / vec_elem_num) * vec_elem_num;
int64_t i = 0;
#pragma GCC unroll 4
for (; i < aligned_data_elem_num; i += vec_elem_num) {
vec_t local_data(thread_data_ptr + i); // load from cache
vec_op::FP32Vec16 local_data_fp32(local_data);
vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
vec_t remote_data(
true, remote_data_ptrs[idx] + i); // stream load from shm
vec_op::FP32Vec16 remote_data_fp32(remote_data);
local_data_fp32 = local_data_fp32 + remote_data_fp32; // sum reduce
});
vec_t reduced_data(local_data_fp32);
reduced_data.save(thread_data_ptr + i);
}
if (i < data_elem_num) {
vec_t local_data(thread_data_ptr + i); // load from cache
vec_op::FP32Vec16 local_data_fp32(local_data);
vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
vec_t remote_data(
true, remote_data_ptrs[idx] + i); // stream load from shm
vec_op::FP32Vec16 remote_data_fp32(remote_data);
local_data_fp32 = local_data_fp32 + remote_data_fp32; // sum reduce
});
vec_t reduced_data(local_data_fp32);
reduced_data.save(thread_data_ptr + i,
data_elem_num - aligned_data_elem_num);
}
thread_ctx->barrier(ThreadSHMStat::DONE);
});
return;
}
}; // namespace shm_cc_ops
std::vector<std::unique_ptr<SHMManager>> SHMManager::SingletonInstances = {};
std::mutex SHMManager::SingletonInstancesLock = {};
template <typename scalar_t>
void shm_allreduce_sum(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num) {
switch (ctx->group_size) {
case 2:
shm_cc_ops::all_reduce_sum_impl<scalar_t, 2>(ctx, data, elem_num);
break;
case 3:
shm_cc_ops::all_reduce_sum_impl<scalar_t, 3>(ctx, data, elem_num);
break;
case 4:
shm_cc_ops::all_reduce_sum_impl<scalar_t, 4>(ctx, data, elem_num);
break;
case 8:
shm_cc_ops::all_reduce_sum_impl<scalar_t, 8>(ctx, data, elem_num);
break;
default:
TORCH_CHECK(false,
"Invalid world size: " + std::to_string(ctx->group_size));
}
}
template <typename scalar_t>
void shm_gather_impl(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num,
scalar_t** outputs, const int dst) {
CPU_KERNEL_GUARD_IN(shm_gather_impl)
const int worldsize = ctx->group_size;
TORCH_CHECK_LT(dst, worldsize);
shm_cc_ops::shm_cc_loop<scalar_t>(
ctx, elem_num,
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
int64_t data_elem_num) {
int rank = thread_ctx->rank;
scalar_t* thread_shm_ptr =
thread_ctx->get_thread_shm_ptr<scalar_t>(rank);
thread_ctx->barrier(ThreadSHMStat::THREAD_READY);
shm_cc_ops::memcpy_to_shm(thread_shm_ptr, data + data_offset,
data_elem_num * sizeof(scalar_t));
thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY);
if (rank == dst) {
shm_cc_ops::memcpy(outputs[rank] + data_offset, data + data_offset,
data_elem_num * sizeof(scalar_t));
for (int i = 1; i < worldsize; ++i) {
int src_rank = thread_ctx->get_swizzled_rank(i);
scalar_t* src_ptr =
thread_ctx->get_thread_shm_ptr<scalar_t>(src_rank); // shm
scalar_t* dst_ptr = outputs[src_rank] + data_offset;
shm_cc_ops::memcpy_from_shm(dst_ptr, src_ptr,
data_elem_num * sizeof(scalar_t));
}
}
thread_ctx->barrier(ThreadSHMStat::DONE);
});
return;
}
struct MemPiece {
void* ptr;
int64_t size;
template <typename T>
T* data_ptr() {
return reinterpret_cast<T*>(ptr);
}
};
struct TensorListMeta {
int64_t tensor_bytes[MAX_P2P_SEND_TENSOR_NUM];
torch::ScalarType tensor_types[MAX_P2P_SEND_TENSOR_NUM];
int64_t tensor_num;
int64_t total_bytes;
TensorListMeta() : tensor_num(0), total_bytes(0) {
static_assert(sizeof(TensorListMeta) % 64 == 0);
static_assert(sizeof(TensorListMeta) <
MIN_THREAD_PROCESS_SIZE); // To ensure the metadata always
// hold by the thread 0
for (int i = 0; i < MAX_P2P_SEND_TENSOR_NUM; ++i) {
tensor_bytes[i] = 0;
tensor_ptrs[i] = nullptr;
tensor_types[i] = torch::ScalarType::Undefined;
}
}
// For send and recv
void bind_tensor_list(std::vector<torch::Tensor>& tensor_list) {
TORCH_CHECK(tensor_types[0] == torch::ScalarType::Undefined,
"Re-bind TensorListMeta is not allowed.")
TORCH_CHECK_LE(tensor_list.size(), MAX_P2P_SEND_TENSOR_NUM);
tensor_num = tensor_list.size();
int64_t bytes_sum = 0;
for (int i = 0; i < tensor_list.size(); ++i) {
torch::Tensor& t = tensor_list[i];
TORCH_CHECK(t.is_contiguous());
tensor_bytes[i] = t.nbytes();
tensor_types[i] = t.scalar_type();
tensor_ptrs[i] = t.data_ptr();
bytes_sum += t.nbytes();
}
total_bytes = bytes_sum;
}
// For recv
std::vector<torch::Tensor> generate_tensor_list() {
std::vector<torch::Tensor> tensor_list;
tensor_list.reserve(tensor_num);
for (int i = 0; i < tensor_num; ++i) {
int64_t bytes = tensor_bytes[i];
auto type = tensor_types[i];
int64_t elem_bytes = torch::elementSize(type);
TORCH_CHECK_EQ(bytes % elem_bytes, 0);
int64_t elem_num = bytes / elem_bytes;
auto options = torch::TensorOptions().dtype(type).device(torch::kCPU);
tensor_list.emplace_back(torch::empty({elem_num}, options));
}
return tensor_list;
}
MemPiece get_data(int64_t offset) {
for (int i = 0; i < tensor_num; ++i) {
if (offset < tensor_bytes[i]) {
return {reinterpret_cast<int8_t*>(tensor_ptrs[i]) + offset,
tensor_bytes[i] - offset};
}
offset -= tensor_bytes[i];
}
return {nullptr, 0};
}
private:
void* tensor_ptrs[MAX_P2P_SEND_TENSOR_NUM];
int8_t _padding[40];
};
void shm_send_tensor_list_impl(ThreadSHMContext* ctx,
const std::vector<torch::Tensor>& tensor_list) {
CPU_KERNEL_GUARD_IN(shm_send_tensor_list_impl)
std::vector<torch::Tensor> tensor_list_with_metadata;
tensor_list_with_metadata.reserve(1 + tensor_list.size());
auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
tensor_list_with_metadata.emplace_back(
torch::empty({sizeof(TensorListMeta)}, options));
tensor_list_with_metadata.insert(tensor_list_with_metadata.end(),
tensor_list.begin(), tensor_list.end());
torch::Tensor& metadata_tensor = tensor_list_with_metadata[0];
TORCH_CHECK_EQ(metadata_tensor.nbytes(), sizeof(TensorListMeta));
TensorListMeta* metadata = new (metadata_tensor.data_ptr()) TensorListMeta();
metadata->bind_tensor_list(tensor_list_with_metadata);
shm_cc_ops::shm_cc_loop<int8_t>(
ctx, metadata->total_bytes,
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
int64_t data_elem_num) {
int rank = thread_ctx->rank;
// Wait until the receiver set the stat to DONE
thread_ctx->wait_for_one(rank, ThreadSHMStat::SHM_DATA_READY);
int64_t curr_shm_offset = 0;
while (curr_shm_offset < data_elem_num) {
MemPiece frag = metadata->get_data(data_offset + curr_shm_offset);
frag.size = std::min(frag.size, data_elem_num - curr_shm_offset);
shm_cc_ops::memcpy(
thread_ctx->get_thread_shm_ptr<int8_t>(rank) + curr_shm_offset,
frag.ptr, frag.size);
curr_shm_offset += frag.size;
}
thread_ctx->set_thread_stat(rank, ThreadSHMStat::SHM_DATA_READY);
});
}
std::vector<torch::Tensor> shm_recv_tensor_list_impl(ThreadSHMContext* ctx,
int64_t src) {
CPU_KERNEL_GUARD_IN(shm_recv_tensor_list_impl)
auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
torch::Tensor metadata_tensor =
torch::empty({sizeof(TensorListMeta)}, options);
// Wait until the sender set the stat of the thread 0 to SHM_DATA_READY
ctx->wait_for_one(src, ThreadSHMStat::DONE);
shm_cc_ops::memcpy(metadata_tensor.data_ptr(),
ctx->get_thread_shm_ptr<void>(src),
sizeof(TensorListMeta));
TensorListMeta* src_metadata =
reinterpret_cast<TensorListMeta*>(metadata_tensor.data_ptr());
std::vector<torch::Tensor> tensor_list_with_metadata =
src_metadata->generate_tensor_list();
TensorListMeta metadata;
metadata.bind_tensor_list(tensor_list_with_metadata);
TORCH_CHECK_EQ(metadata.tensor_num, src_metadata->tensor_num);
TORCH_CHECK_EQ(metadata.total_bytes, src_metadata->total_bytes);
shm_cc_ops::shm_cc_loop<int8_t>(
ctx, metadata.total_bytes,
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
int64_t data_elem_num) {
// Wait until the sender set the stat to SHM_DATA_READY
thread_ctx->wait_for_one(src, ThreadSHMStat::DONE);
int64_t curr_shm_offset = 0;
while (curr_shm_offset < data_elem_num) {
MemPiece frag = metadata.get_data(data_offset + curr_shm_offset);
frag.size = std::min(frag.size, data_elem_num - curr_shm_offset);
shm_cc_ops::memcpy(
frag.ptr,
thread_ctx->get_thread_shm_ptr<int8_t>(src) + curr_shm_offset,
frag.size);
curr_shm_offset += frag.size;
}
thread_ctx->set_thread_stat(src, ThreadSHMStat::DONE);
});
std::vector<torch::Tensor> tensor_list;
tensor_list.reserve(metadata.tensor_num - 1);
tensor_list.insert(tensor_list.begin(), tensor_list_with_metadata.begin() + 1,
tensor_list_with_metadata.end());
return tensor_list;
}
} // namespace
void shm_gather(int64_t handle, torch::Tensor& data,
const std::optional<std::vector<torch::Tensor>>& outputs,
int64_t dst) {
TORCH_CHECK(data.is_contiguous())
VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_gather_impl", [&] {
CPU_KERNEL_GUARD_IN(shm_gather_impl)
if (outputs.has_value()) {
TORCH_CHECK_LE(outputs->size(), MAX_SHM_RANK_NUM);
scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr};
for (int i = 0; i < outputs->size(); ++i) {
output_ptrs[i] = outputs->at(i).data_ptr<scalar_t>();
}
shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
data.data_ptr<scalar_t>(), data.numel(), output_ptrs,
dst);
} else {
shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
data.data_ptr<scalar_t>(), data.numel(), (scalar_t**)(0),
dst);
}
CPU_KERNEL_GUARD_OUT(shm_gather_impl)
});
}
void shm_all_gather(int64_t handle, const torch::Tensor& data,
torch::Tensor& output) {
TORCH_CHECK(data.is_contiguous())
TORCH_CHECK(output.is_contiguous())
const int64_t input_elem_num = data.numel();
const int64_t output_elem_num = output.numel();
TORCH_CHECK_EQ(output_elem_num % input_elem_num, 0);
const int world_size = output_elem_num / input_elem_num;
VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_all_gather_impl", [&] {
CPU_KERNEL_GUARD_IN(shm_all_gather_impl)
auto ctx = SHMManager::get_singleton_instance(handle)->get_shm_ctx();
TORCH_CHECK_EQ(ctx->group_size, world_size);
scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr};
for (int i = 0; i < world_size; ++i) {
output_ptrs[i] = output.data_ptr<scalar_t>() + i * input_elem_num;
}
shm_gather_impl(ctx, data.data_ptr<scalar_t>(), data.numel(), output_ptrs,
ctx->rank);
CPU_KERNEL_GUARD_OUT(shm_all_gather_impl)
});
}
void shm_allreduce(int64_t handle, torch::Tensor& data) {
TORCH_CHECK(data.is_contiguous())
VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_allreduce_sum", [&] {
CPU_KERNEL_GUARD_IN(shm_allreduce_sum)
shm_allreduce_sum(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
data.data_ptr<scalar_t>(), data.numel());
CPU_KERNEL_GUARD_OUT(shm_allreduce_sum)
});
}
void shm_send_tensor_list(int64_t handle,
const std::vector<torch::Tensor>& tensor_list,
int64_t dst) {
CPU_KERNEL_GUARD_IN(shm_send_tensor_list)
shm_send_tensor_list_impl(
SHMManager::get_singleton_instance(handle)->get_shm_ctx(), tensor_list);
CPU_KERNEL_GUARD_OUT(shm_send_tensor_list)
}
std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src) {
CPU_KERNEL_GUARD_IN(shm_recv_tensor_list)
auto tensor_list = shm_recv_tensor_list_impl(
SHMManager::get_singleton_instance(handle)->get_shm_ctx(), src);
CPU_KERNEL_GUARD_OUT(shm_recv_tensor_list)
return tensor_list;
}
int64_t init_shm_manager(const std::string& name, const int64_t group_size,
const int64_t rank) {
return SHMManager::create_singleton_instance(name, group_size, rank);
}
std::string join_shm_manager(int64_t handle, const std::string& name) {
auto shm_manager = SHMManager::get_singleton_instance(handle);
TORCH_CHECK(shm_manager);
shm_manager->join(name);
return shm_manager->get_shm_ctx()->to_string();
}

View File

@ -22,26 +22,6 @@ void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
torch::Tensor& kv_cache, double scale, torch::Tensor& kv_cache, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens); torch::Tensor& block_tables, torch::Tensor& seq_lens);
int64_t init_shm_manager(const std::string& name, const int64_t group_size,
const int64_t rank);
std::string join_shm_manager(int64_t handle, const std::string& name);
void shm_allreduce(int64_t handle, torch::Tensor& data);
void shm_gather(int64_t handle, torch::Tensor& data,
const std::optional<std::vector<torch::Tensor>>& outputs,
int64_t dst);
void shm_all_gather(int64_t handle, const torch::Tensor& data,
torch::Tensor& output);
void shm_send_tensor_list(int64_t handle,
const std::vector<torch::Tensor>& tensor_list,
int64_t dst);
std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src);
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// vLLM custom ops // vLLM custom ops
@ -151,29 +131,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor? azp, Tensor? bias) -> ()"); " Tensor? azp, Tensor? bias) -> ()");
ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
#endif #endif
// SHM CCL
#ifdef __AVX512F__
ops.def("init_shm_manager(str name, int group_size, int rank) -> int",
&init_shm_manager);
ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager);
ops.def("shm_allreduce(int handle, Tensor! data) -> ()");
ops.impl("shm_allreduce", torch::kCPU, &shm_allreduce);
ops.def(
"shm_gather(int handle, Tensor data, Tensor[](a!)? outputs, int dst) -> "
"()");
ops.impl("shm_gather", torch::kCPU, &shm_gather);
ops.def(
"shm_all_gather(int handle, Tensor data, Tensor! output) -> "
"()");
ops.impl("shm_all_gather", torch::kCPU, &shm_all_gather);
ops.def(
"shm_send_tensor_list(int handle, Tensor[](a) tensor_list, int dst) -> "
"()");
ops.impl("shm_send_tensor_list", torch::kCPU, &shm_send_tensor_list);
ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)",
&shm_recv_tensor_list);
#endif
} }
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {

View File

@ -4,11 +4,6 @@
#include <string> #include <string>
#include <sched.h> #include <sched.h>
#endif #endif
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 30
#include <unistd.h>
#include <sys/syscall.h>
#define gettid() syscall(SYS_gettid)
#endif
#include "cpu_types.hpp" #include "cpu_types.hpp"
@ -23,7 +18,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
#ifndef VLLM_NUMA_DISABLED #ifndef VLLM_NUMA_DISABLED
std::string init_cpu_threads_env(const std::string& cpu_ids) { std::string init_cpu_threads_env(const std::string& cpu_ids) {
bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str()); bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
TORCH_CHECK(omp_cpu_mask->size > 0); TORCH_CHECK(omp_cpu_mask->size > 0);
std::vector<int> omp_cpu_ids; std::vector<int> omp_cpu_ids;
omp_cpu_ids.reserve(omp_cpu_mask->size); omp_cpu_ids.reserve(omp_cpu_mask->size);

View File

@ -1,39 +0,0 @@
#include <torch/all.h>
#include <torch/cuda.h>
#include <cuda_runtime.h>
// This function assumes that `cpu_tensor` is a CPU tensor allocated with pinned
// memory, and that UVA (Unified Virtual Addressing) is enabled.
torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) {
TORCH_CHECK(cpu_tensor.device().is_cpu(), "Input tensor must be on CPU");
// Get raw host pointer from CPU tensor
void* host_ptr = cpu_tensor.data_ptr();
// Get a device pointer corresponding to the pinned host memory
void* device_ptr = nullptr;
cudaError_t err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0);
TORCH_CHECK(err == cudaSuccess,
"cudaHostGetDevicePointer failed: ", cudaGetErrorString(err));
// We'll use the same sizes, strides, and dtype as the CPU tensor.
// TODO: check if layout is respected.
auto sizes = cpu_tensor.sizes();
auto strides = cpu_tensor.strides();
auto options = cpu_tensor.options().device(torch::kCUDA);
// from_blob signature: from_blob(void *data, IntArrayRef sizes, ..., Deleter,
// const TensorOptions &) Provide a no-op deleter. The CPU tensor holds the
// memory, so we don't free it here.
auto deleter = [](void*) {
// no-op, since the memory is owned by the original CPU tensor
};
torch::Tensor cuda_tensor =
torch::from_blob(device_ptr, sizes, strides, deleter, options);
TORCH_CHECK(cuda_tensor.device().is_cuda(),
"Resulting tensor is not on CUDA device");
return cuda_tensor;
}

View File

@ -12,7 +12,7 @@ static_assert(sizeof(void*) == sizeof(fptr_t));
fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs, fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
torch::Tensor& rank_data, int64_t rank, torch::Tensor& rank_data, int64_t rank,
bool fully_connected) { bool full_nvlink) {
int world_size = fake_ipc_ptrs.size(); int world_size = fake_ipc_ptrs.size();
if (world_size > 8) if (world_size > 8)
throw std::invalid_argument("world size > 8 is not supported"); throw std::invalid_argument("world size > 8 is not supported");
@ -27,7 +27,7 @@ fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
} }
return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(), return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(),
rank_data.numel(), rank, world_size, rank_data.numel(), rank, world_size,
fully_connected); full_nvlink);
} }
/** /**
@ -142,48 +142,3 @@ void register_graph_buffers(fptr_t _fa,
bytes.reserve(handles.size()); bytes.reserve(handles.size());
fa->register_graph_buffers(bytes, offsets); fa->register_graph_buffers(bytes, offsets);
} }
std::tuple<fptr_t, torch::Tensor> allocate_shared_buffer_and_handle(
int64_t size) {
auto device_index = c10::cuda::current_device();
at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index));
void* buffer;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
auto stream = c10::cuda::getCurrentCUDAStream().stream();
AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));
// Allocate buffer
#if defined(USE_ROCM)
// data buffers need to be "uncached" for signal on MI200
AT_CUDA_CHECK(
hipExtMallocWithFlags((void**)&buffer, size, hipDeviceMallocUncached));
#else
AT_CUDA_CHECK(cudaMalloc((void**)&buffer, size));
#endif
AT_CUDA_CHECK(cudaMemsetAsync(buffer, 0, size, stream));
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));
// Create IPC memhandle for the allocated buffer.
// Will use it in open_mem_handle.
auto options =
torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
auto handle =
torch::empty({static_cast<int64_t>(sizeof(cudaIpcMemHandle_t))}, options);
AT_CUDA_CHECK(
cudaIpcGetMemHandle((cudaIpcMemHandle_t*)handle.data_ptr(), buffer));
return std::make_tuple(reinterpret_cast<fptr_t>(buffer), handle);
}
fptr_t open_mem_handle(torch::Tensor& mem_handle) {
void* ipc_ptr;
AT_CUDA_CHECK(cudaIpcOpenMemHandle(
(void**)&ipc_ptr, *((const cudaIpcMemHandle_t*)mem_handle.data_ptr()),
cudaIpcMemLazyEnablePeerAccess));
return reinterpret_cast<fptr_t>(ipc_ptr);
}
void free_shared_buffer(fptr_t buffer) {
AT_CUDA_CHECK(cudaFree(reinterpret_cast<void*>(buffer)));
}

View File

@ -5,10 +5,6 @@
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#if defined(USE_ROCM)
typedef __hip_bfloat16 nv_bfloat16;
#endif
#include <iostream> #include <iostream>
#include <array> #include <array>
#include <limits> #include <limits>
@ -16,7 +12,6 @@ typedef __hip_bfloat16 nv_bfloat16;
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
namespace vllm {
#define CUDACHECK(cmd) \ #define CUDACHECK(cmd) \
do { \ do { \
cudaError_t e = cmd; \ cudaError_t e = cmd; \
@ -27,37 +22,24 @@ namespace vllm {
} \ } \
} while (0) } while (0)
// Maximal number of blocks in allreduce kernel. namespace vllm {
constexpr int kMaxBlocks = 36; constexpr int kMaxBlocks = 36;
// Default number of blocks in allreduce kernel.
#ifndef USE_ROCM
const int defaultBlockLimit = 36;
CUpointer_attribute rangeStartAddrAttr = CU_POINTER_ATTRIBUTE_RANGE_START_ADDR;
#else
const int defaultBlockLimit = 16;
hipPointer_attribute rangeStartAddrAttr =
HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR;
#endif
// Counter may overflow, but it's fine since unsigned int overflow is // Counter may overflow, but it's fine since unsigned int overflow is
// well-defined behavior. // well-defined behavior.
using FlagType = uint32_t; using FlagType = uint32_t;
// Two sets of peer counters are needed for two syncs: starting and ending an
// operation. The reason is that it's possible for peer GPU block to arrive at
// the second sync point while the current GPU block haven't passed the first
// sync point. Thus, peer GPU may write counter+1 while current GPU is busy
// waiting for counter. We use alternating counter array to avoid this
// possibility.
struct Signal { struct Signal {
alignas(128) FlagType start[kMaxBlocks][8]; alignas(128) FlagType self_counter[kMaxBlocks][8];
alignas(128) FlagType end[kMaxBlocks][8]; // Two sets of peer counters are needed for two syncs. The reason is that
alignas(128) FlagType _flag[kMaxBlocks]; // incremental flags for each rank // it's possible for peer GPU block to arrive at the second sync point while
// the current GPU block haven't passed the first sync point. Thus, peer GPU
// may write counter+1 while current GPU is busy waiting for counter. We use
// alternating counter array to avoid this possibility.
alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
}; };
struct __align__(16) RankData { struct __align__(16) RankData {
const void* ptrs[8]; const void* __restrict__ ptrs[8];
}; };
struct __align__(16) RankSignals { struct __align__(16) RankSignals {
@ -152,29 +134,27 @@ DINLINE O downcast(array_t<float, O::size> val) {
} }
} }
#if !defined(USE_ROCM)
static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) { static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag), asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag),
"l"(flag_addr)); "l"(flag_addr));
#else #else
asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag), asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag),
"l"(flag_addr)); "l"(flag_addr));
#endif #endif
} }
static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) { static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) {
FlagType flag; FlagType flag;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
asm volatile("ld.acquire.sys.global.u32 %0, [%1];" asm volatile("ld.acquire.sys.global.u32 %0, [%1];"
: "=r"(flag) : "=r"(flag)
: "l"(flag_addr)); : "l"(flag_addr));
#else #else
asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;" asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;"
: "=r"(flag) : "=r"(flag)
: "l"(flag_addr)); : "l"(flag_addr));
#endif #endif
return flag; return flag;
} }
@ -190,99 +170,37 @@ static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) {
return flag; return flag;
} }
// This function is meant to be used as the first synchronization in the all // is_start: whether this is the very first synchronization barrier.
// reduce kernel. Thus, it doesn't need to make any visibility guarantees for // need_fence: whether a memory fence is needed. If true, a release-acquire
// prior memory accesses. Note: volatile writes will not be reordered against // semantic is used to enforce memory access order before and after this
// other volatile writes. // barrier.
template <int ngpus> template <int ngpus, bool is_start, bool need_fence = false>
DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg, DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg,
int rank) { int rank) {
uint32_t flag = self_sg->_flag[blockIdx.x] + 1; if constexpr (!is_start) __syncthreads();
static_assert(
!(is_start && need_fence)); // Start barrier shouldn't need fence.
if (threadIdx.x < ngpus) { if (threadIdx.x < ngpus) {
auto peer_counter_ptr = &sg.signals[threadIdx.x]->start[blockIdx.x][rank]; // Increment the counter. Technically we only need one counter, but we use
auto self_counter_ptr = &self_sg->start[blockIdx.x][threadIdx.x]; // multiple per block to eliminate the need to share the counter via smem.
// Write the expected counter value to peer and wait for correct value auto val = self_sg->self_counter[blockIdx.x][threadIdx.x] += 1;
// from peer.
st_flag_volatile(peer_counter_ptr, flag);
while (ld_flag_volatile(self_counter_ptr) != flag);
}
__syncthreads();
// use one thread to update flag
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
}
// This function is meant to be used as the second or the final
// synchronization barrier in the all reduce kernel. If it's the final
// synchronization barrier, we don't need to make any visibility guarantees
// for prior memory accesses.
template <int ngpus, bool final_sync = false>
DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) {
__syncthreads();
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
if (threadIdx.x < ngpus) {
auto peer_counter_ptr = &sg.signals[threadIdx.x]->end[blockIdx.x][rank];
auto self_counter_ptr = &self_sg->end[blockIdx.x][threadIdx.x];
// Write the expected counter value to peer and wait for correct value from // Write the expected counter value to peer and wait for correct value from
// peer. // peer.
if constexpr (!final_sync) { auto peer_counter_ptr =
st_flag_release(peer_counter_ptr, flag); &sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank];
while (ld_flag_acquire(self_counter_ptr) != flag); auto self_counter_ptr =
&self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x];
if constexpr (need_fence) {
st_flag_release(peer_counter_ptr, val);
while (ld_flag_acquire(self_counter_ptr) != val);
} else { } else {
st_flag_volatile(peer_counter_ptr, flag); st_flag_volatile(peer_counter_ptr, val);
while (ld_flag_volatile(self_counter_ptr) != flag); while (ld_flag_volatile(self_counter_ptr) != val);
} }
} }
if constexpr (!final_sync) __syncthreads(); if constexpr (is_start || need_fence) __syncthreads();
// use one thread to update flag
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
} }
#else
template <int ngpus>
DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg,
int rank) {
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
if (threadIdx.x < ngpus) {
// simultaneously write to the corresponding flag of all ranks.
// Latency = 1 p2p write
__scoped_atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank],
flag, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
// wait until we got true from all ranks
while (__scoped_atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x],
__ATOMIC_RELAXED,
__MEMORY_SCOPE_DEVICE) < flag);
}
__syncthreads();
// use one thread to update flag
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
}
template <int ngpus, bool final_sync = false>
DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) {
__syncthreads();
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
if (threadIdx.x < ngpus) {
// simultaneously write to the corresponding flag of all ranks.
// Latency = 1 p2p write
__scoped_atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank],
flag,
final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE,
__MEMORY_SCOPE_SYSTEM);
// wait until we got true from all ranks
while (
__scoped_atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x],
final_sync ? __ATOMIC_RELAXED : __ATOMIC_ACQUIRE,
__MEMORY_SCOPE_DEVICE) < flag);
}
if constexpr (!final_sync) __syncthreads();
// use one thread to update flag
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
}
#endif
template <typename P, int ngpus, typename A> template <typename P, int ngpus, typename A>
DINLINE P packed_reduce(const P* ptrs[], int idx) { DINLINE P packed_reduce(const P* ptrs[], int idx) {
A tmp = upcast(ptrs[0][idx]); A tmp = upcast(ptrs[0][idx]);
@ -302,13 +220,13 @@ __global__ void __launch_bounds__(512, 1)
// note: we don't reorder the address so the accumulation order is the same // note: we don't reorder the address so the accumulation order is the same
// for all ranks, ensuring bitwise identical results // for all ranks, ensuring bitwise identical results
auto dp = *_dp; auto dp = *_dp;
barrier_at_start<ngpus>(sg, self_sg, rank); multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
// do the actual reduction // do the actual reduction
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
idx += gridDim.x * blockDim.x) { idx += gridDim.x * blockDim.x) {
((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx); ((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
} }
barrier_at_end<ngpus, true>(sg, self_sg, rank); multi_gpu_barrier<ngpus, false>(sg, self_sg, rank);
} }
template <typename P> template <typename P>
@ -337,20 +255,18 @@ __global__ void __launch_bounds__(512, 1)
tmps[i] = get_tmp_buf<P>(sg.signals[target]); tmps[i] = get_tmp_buf<P>(sg.signals[target]);
} }
auto tmp_out = tmps[0]; auto tmp_out = tmps[0];
barrier_at_start<ngpus>(sg, self_sg, rank); multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
// stage 1: reduce scatter // stage 1: reduce scatter
for (int idx = start + tid; idx < end; idx += stride) { for (int idx = start + tid; idx < end; idx += stride) {
tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx); tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
} }
barrier_at_end<ngpus>(sg, self_sg, rank); multi_gpu_barrier<ngpus, false, true>(sg, self_sg, rank);
// stage 2: allgather. Note: it's important to match the tid between // stage 2: allgather. Note: it's important to match the tid between
// the two stages, because visibility across devices is only guaranteed // the two stages, because visibility across devices is only guaranteed
// between threads that have the same tid. If thread i computes the sum of // between threads that have the same tid. If thread i computes the sum of
// start + i in the first stage, then thread i also gathers start + i from // start + i in the first stage, then thread i also gathers start + i from all
// all ranks. // ranks.
for (int idx = tid; idx < largest_part; idx += stride) { for (int idx = tid; idx < largest_part; idx += stride) {
#pragma unroll #pragma unroll
for (int i = 0; i < ngpus; i++) { for (int i = 0; i < ngpus; i++) {
@ -371,22 +287,21 @@ class CustomAllreduce {
public: public:
int rank_; int rank_;
int world_size_; int world_size_;
// Full NVLink or xGMI connection between GPUs. bool full_nvlink_;
bool fully_connected_;
RankSignals sg_; RankSignals sg_;
// Stores a map from a pointer to its peer pointers from all ranks. // Stores an map from a pointer to its peer pointters from all ranks.
std::unordered_map<void*, RankData*> buffers_; std::unordered_map<void*, RankData*> buffers_;
Signal* self_sg_; Signal* self_sg_;
// Stores rank data from all ranks. This is mainly for cuda graph purposes. // Stores rank data from all ranks. This is mainly for cuda graph purposes.
// For cuda graph to work, all kernel arguments must be fixed during graph // For cuda graph to work, all kernel arguments must be fixed during graph
// capture time. However, the peer pointers are not known during graph // capture time. However, the peer pointers are not known during graph capture
// capture time. Therefore, during capture, we increment the rank data // time. Therefore, during capture, we increment the rank data pointer and use
// pointer and use that as the argument to the kernel. The kernel arguments // that as the argument to the kernel. The kernel arguments are stored in
// are stored in graph_unreg_buffers_. The actual peer pointers will be // graph_unreg_buffers_. The actual peer pointers will be filled in at the
// filled in at the memory pointed to by the pointers in // memory pointed to by the pointers in graph_unreg_buffers_ when
// graph_unreg_buffers_ when the IPC handles are exchanged between ranks. // the IPC handles are exchanged between ranks.
// //
// The overall process looks like this: // The overall process looks like this:
// 1. Graph capture. // 1. Graph capture.
@ -404,18 +319,17 @@ class CustomAllreduce {
* Signals are an array of ipc-enabled buffers from all ranks. * Signals are an array of ipc-enabled buffers from all ranks.
* For each of the buffer, the layout is as follows: * For each of the buffer, the layout is as follows:
* | -- sizeof(Signal) -- | ------ a few MB ----- | * | -- sizeof(Signal) -- | ------ a few MB ----- |
* The first section is for allreduce synchronization, and the second * The first section is for allreduce synchronization, and the second section
* section is for storing the intermediate results required by some * is for storing the intermediate results required by some allreduce algos.
* allreduce algos.
* *
* Note: this class does not own any device memory. Any required buffers * Note: this class does not own any device memory. Any required buffers
* are passed in from the constructor. * are passed in from the constructor.
*/ */
CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz, CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz,
int rank, int world_size, bool fully_connected = true) int rank, int world_size, bool full_nvlink = true)
: rank_(rank), : rank_(rank),
world_size_(world_size), world_size_(world_size),
fully_connected_(fully_connected), full_nvlink_(full_nvlink),
self_sg_(signals[rank]), self_sg_(signals[rank]),
d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)), d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)),
d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) { d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
@ -447,7 +361,8 @@ class CustomAllreduce {
void* base_ptr; void* base_ptr;
// note: must share the base address of each allocation, or we get wrong // note: must share the base address of each allocation, or we get wrong
// address // address
if (cuPointerGetAttribute(&base_ptr, rangeStartAddrAttr, if (cuPointerGetAttribute(&base_ptr,
CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
(CUdeviceptr)ptr) != CUDA_SUCCESS) (CUdeviceptr)ptr) != CUDA_SUCCESS)
throw std::runtime_error("failed to get pointer attr"); throw std::runtime_error("failed to get pointer attr");
CUDACHECK(cudaIpcGetMemHandle( CUDACHECK(cudaIpcGetMemHandle(
@ -481,11 +396,11 @@ class CustomAllreduce {
// Note: when registering graph buffers, we intentionally choose to not // Note: when registering graph buffers, we intentionally choose to not
// deduplicate the addresses. That means if the allocator reuses some // deduplicate the addresses. That means if the allocator reuses some
// addresses, they will be registered again. This is to account for the // addresses, they will be registered again. This is to account for the remote
// remote possibility of different allocation patterns between ranks. For // possibility of different allocation patterns between ranks. For example,
// example, rank 1 may get the same input address for the second allreduce, // rank 1 may get the same input address for the second allreduce, but rank 2
// but rank 2 got a different address. IPC handles have internal reference // got a different address. IPC handles have internal reference counting
// counting mechanism so overhead should be small. // mechanism so overhead should be small.
void register_graph_buffers( void register_graph_buffers(
const std::vector<std::string>& handles, const std::vector<std::string>& handles,
const std::vector<std::vector<int64_t>>& offsets) { const std::vector<std::vector<int64_t>>& offsets) {
@ -516,15 +431,15 @@ class CustomAllreduce {
/** /**
* Performs allreduce, assuming input has already been registered. * Performs allreduce, assuming input has already been registered.
* *
* Block and grid default configs are results after careful grid search. * Block and grid default configs are results after careful grid search. Using
* Using 36 blocks give the best or close to the best runtime on the devices * 36 blocks give the best or close to the best runtime on the devices I
* I tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also * tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only
* only take a small amount of SMs. Not quite sure the underlying reason, * take a small amount of SMs. Not quite sure the underlying reason, but my
* but my guess is that too many SMs will cause contention on NVLink bus. * guess is that too many SMs will cause contention on NVLink bus.
*/ */
template <typename T> template <typename T>
void allreduce(cudaStream_t stream, T* input, T* output, int size, void allreduce(cudaStream_t stream, T* input, T* output, int size,
int threads = 512, int block_limit = defaultBlockLimit) { int threads = 512, int block_limit = 36) {
auto d = packed_t<T>::P::size; auto d = packed_t<T>::P::size;
if (size % d != 0) if (size % d != 0)
throw std::runtime_error( throw std::runtime_error(
@ -558,11 +473,13 @@ class CustomAllreduce {
#define KL(ngpus, name) \ #define KL(ngpus, name) \
name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \ name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
rank_, size); rank_, size);
// TODO(hanzhi713): Threshold is different for A100 and H100.
// Add per device threshold.
#define REDUCE_CASE(ngpus) \ #define REDUCE_CASE(ngpus) \
case ngpus: { \ case ngpus: { \
if (world_size_ == 2) { \ if (world_size_ == 2) { \
KL(ngpus, cross_device_reduce_1stage); \ KL(ngpus, cross_device_reduce_1stage); \
} else if (fully_connected_) { \ } else if (full_nvlink_) { \
if ((world_size_ <= 4 && bytes < 512 * 1024) || \ if ((world_size_ <= 4 && bytes < 512 * 1024) || \
(world_size_ <= 8 && bytes < 256 * 1024)) { \ (world_size_ <= 8 && bytes < 256 * 1024)) { \
KL(ngpus, cross_device_reduce_1stage); \ KL(ngpus, cross_device_reduce_1stage); \
@ -580,8 +497,7 @@ class CustomAllreduce {
REDUCE_CASE(8) REDUCE_CASE(8)
default: default:
throw std::runtime_error( throw std::runtime_error(
"custom allreduce only supports num gpus in (2,4,6,8). Actual " "custom allreduce only supports num gpus in (2,4,6,8). Actual num "
"num "
"gpus = " + "gpus = " +
std::to_string(world_size_)); std::to_string(world_size_));
} }
@ -595,10 +511,9 @@ class CustomAllreduce {
} }
} }
}; };
/** /**
* To inspect PTX/SASS, copy paste this header file to compiler explorer and * To inspect PTX/SASS, copy paste this header file to compiler explorer and add
add a template instantiation: a template instantiation:
* template void vllm::CustomAllreduce::allreduce<half>(cudaStream_t, half *, * template void vllm::CustomAllreduce::allreduce<half>(cudaStream_t, half *,
half *, int, int, int); half *, int, int, int);
*/ */

View File

@ -1,9 +1,9 @@
/** /**
* This is a standalone test for custom allreduce. * This is a standalone test for custom allreduce.
* To compile, make sure you have MPI and NCCL installed in your system. * To compile, make sure you have MPI and NCCL installed in your system.
* export MPI_HOME=XXX * export MPI_HOME=xxx
* nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o * nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o
* custom_all_reduce_test -lnccl -I${MPI_HOME}/include -lmpi * custom_all_reduce_test -lnccl -I${MPI_HOME} -lmpi
* *
* Warning: this C++ test is not designed to be very readable and was used * Warning: this C++ test is not designed to be very readable and was used
* during the rapid prototyping process. * during the rapid prototyping process.
@ -22,15 +22,7 @@
#include "cuda_profiler_api.h" #include "cuda_profiler_api.h"
#include "custom_all_reduce.cuh" #include "custom_all_reduce.cuh"
#include "mpi.h" #include "mpi.h"
#ifdef USE_ROCM #include "nccl.h"
#include <hip/hip_bf16.h>
typedef __hip_bfloat16 nv_bfloat16;
#include "rccl/rccl.h"
#include "custom_all_reduce_hip.cuh"
#else
#include "nccl.h"
#include "custom_all_reduce.cuh"
#endif
#define MPICHECK(cmd) \ #define MPICHECK(cmd) \
do { \ do { \
@ -51,29 +43,16 @@ typedef __hip_bfloat16 nv_bfloat16;
} \ } \
} while (0) } while (0)
#ifdef USE_ROCM
__global__ void dummy_kernel() { __global__ void dummy_kernel() {
for (int i = 0; i < 100; i++) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
uint64_t start = wall_clock64();
uint64_t cycles_elapsed;
do {
cycles_elapsed = wall_clock64() - start;
} while (cycles_elapsed < 100);
}
for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms
}
#else #else
__global__ void dummy_kernel() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms
#else
for (int i = 0; i < 100; i++) { for (int i = 0; i < 100; i++) {
long long int start = clock64(); long long int start = clock64();
while (clock64() - start < 150000000); // approximately 98.4ms on P40 while (clock64() - start < 150000000); // approximately 98.4ms on P40
} }
#endif
}
#endif #endif
}
template <typename T> template <typename T>
__global__ void set_data(T* data, int size, int myRank) { __global__ void set_data(T* data, int size, int myRank) {
@ -142,14 +121,8 @@ void run(int myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit,
* registration, they are allocated and registered together in the test for * registration, they are allocated and registered together in the test for
* convenience. * convenience.
*/ */
#ifdef USE_ROCM
CUDACHECK(hipExtMallocWithFlags(
(void**)&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal),
hipDeviceMallocUncached));
#else
CUDACHECK( CUDACHECK(
cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal))); cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
#endif
CUDACHECK( CUDACHECK(
cudaMemset(buffer, 0, 2 * data_size * sizeof(T) + sizeof(vllm::Signal))); cudaMemset(buffer, 0, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
CUDACHECK(cudaMalloc(&self_data_copy, data_size * sizeof(T))); CUDACHECK(cudaMalloc(&self_data_copy, data_size * sizeof(T)));
@ -338,18 +311,13 @@ int main(int argc, char** argv) {
bool performance_test = true; bool performance_test = true;
cudaProfilerStart(); cudaProfilerStart();
// Uncomment to scan through different block size configs. // Uncomment to scan through different block size configs.
// for (int threads : {256, 512, 1024}) { // for (int threads : {256, 512, 1024}) {
// for (int block_limit = 16; block_limit < 112; block_limit += 4) { // for (int block_limit = 16; block_limit < 112; block_limit += 4) {
// run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024, // run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024,
// performance_test); // performance_test);
// } // }
// } // }
#ifdef USE_ROCM
const int block_limit = 16;
#else
const int block_limit = 36;
#endif
// Scan through different sizes to test performance. // Scan through different sizes to test performance.
for (int sz = 512; sz <= (8 << 20); sz *= 2) { for (int sz = 512; sz <= (8 << 20); sz *= 2) {
run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test); run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test);

View File

@ -422,7 +422,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize); int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize);
// in case the final state is separated between the last "smem_exchange" and // in case the final state is separated between the last "smem_exchange" and
// and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2), // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
// (which occurs when `final_state_position` is a non-positive index) // (which occurs when `final_state_position` is a non-positivie index)
// we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){ if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){
input_t vals_load[kNElts] = {0}; input_t vals_load[kNElts] = {0};

View File

@ -1,103 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import glob
import itertools
import os
import subprocess
import jinja2
FILE_HEAD = """
// auto generated by generate.py
// clang-format off
#include "kernel.h"
#include "marlin_template.h"
namespace MARLIN_NAMESPACE_NAME {
""".strip()
TEMPLATE = ("template __global__ void Marlin<"
"{{scalar_t}}, "
"{{w_type_id}}, "
"{{threads}}, "
"{{thread_m_blocks}}, "
"{{thread_n_blocks}}, "
"{{thread_k_blocks}}, "
"{{'true' if m_block_size_8 else 'false'}}, "
"{{stages}}, "
"{{'true' if has_act_order else 'false'}}, "
"{{'true' if has_zp else 'false'}}, "
"{{group_blocks}}, "
"{{'true' if is_zp_float else 'false'}}>"
"( MARLIN_KERNEL_PARAMS );")
# int8 with zero point case (vllm::kU8) is also supported,
# we don't add it to reduce wheel size.
SCALAR_TYPES = ["vllm::kU4", "vllm::kU4B8", "vllm::kU8B128"]
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
# group_blocks:
# = 0 : act order case
# = -1 : channelwise quantization
# > 0 : group_size=16*group_blocks
GROUP_BLOCKS = [0, -1, 2, 4, 8]
DTYPES = ["fp16", "bf16"]
def remove_old_kernels():
for filename in glob.glob(os.path.dirname(__file__) + "/kernel_*.cu"):
subprocess.call(["rm", "-f", filename])
def generate_new_kernels():
for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES):
has_zp = "B" not in scalar_type
all_template_str_list = []
for group_blocks, m_blocks, thread_configs in itertools.product(
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
has_act_order = group_blocks == 0
if has_zp and has_act_order:
continue
if thread_configs[2] == 256:
if m_blocks <= 1 and thread_configs[0] != 128:
continue
if m_blocks > 1 and thread_configs[0] != 64:
continue
k_blocks = thread_configs[0] // 16
n_blocks = thread_configs[1] // 16
threads = thread_configs[2]
c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
template_str = jinja2.Template(TEMPLATE).render(
scalar_t=c_dtype,
w_type_id=scalar_type + ".id()",
threads=threads,
thread_m_blocks=max(m_blocks, 1),
thread_n_blocks=n_blocks,
thread_k_blocks=k_blocks,
m_block_size_8=m_blocks == 0.5,
stages="pipe_stages",
has_act_order=has_act_order,
has_zp=has_zp,
group_blocks=group_blocks,
is_zp_float=False,
)
all_template_str_list.append(template_str)
file_content = FILE_HEAD + "\n\n"
file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
filename = f"kernel_{dtype}_{scalar_type[6:].lower()}.cu"
with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
f.write(file_content)
if __name__ == "__main__":
remove_old_kernels()
generate_new_kernels()

View File

@ -1,44 +0,0 @@
#ifndef MARLIN_NAMESPACE_NAME
#define MARLIN_NAMESPACE_NAME marlin_moe_wna16
#endif
#include "quantization/gptq_marlin/marlin.cuh"
#include "quantization/gptq_marlin/marlin_dtypes.cuh"
#include "core/scalar_type.hpp"
#define MARLIN_KERNEL_PARAMS \
const int4 *__restrict__ A, const int4 *__restrict__ B, \
int4 *__restrict__ C, int4 *__restrict__ C_tmp, \
const int4 *__restrict__ scales_ptr, const int4 *__restrict__ zp_ptr, \
const int *__restrict__ g_idx, \
const int32_t *__restrict__ sorted_token_ids_ptr, \
const int32_t *__restrict__ expert_ids_ptr, \
const int32_t *__restrict__ num_tokens_past_padded_ptr, \
const float *__restrict__ topk_weights_ptr, int top_k, \
bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \
int prob_n, int prob_k, int *locks, bool use_atomic_add, \
bool use_fp32_reduce
namespace MARLIN_NAMESPACE_NAME {
template <typename scalar_t, // compute dtype, half or nv_float16
const vllm::ScalarTypeId w_type_id, // weight ScalarType id
const int threads, // number of threads in a threadblock
const int thread_m_blocks, // number of 16x16 blocks in the m
// dimension (batchsize) of the
// threadblock
const int thread_n_blocks, // same for n dimension (output)
const int thread_k_blocks, // same for k dimension (reduction)
const bool m_block_size_8, // whether m_block_size == 8
// only works when thread_m_blocks == 1
const int stages, // number of stages for the async global->shared
// fetch pipeline
const bool has_act_order, // whether act_order is enabled
const bool has_zp, // whether zero-points are enabled
const int group_blocks, // number of consecutive 16x16 blocks
// with a separate quantization scale
const bool is_zp_float // is zero point of float16 type?
>
__global__ void Marlin(MARLIN_KERNEL_PARAMS);
}

File diff suppressed because it is too large Load Diff

View File

@ -1,927 +0,0 @@
/*
* Modified by Neural Magic
* Copyright (C) Marlin.2024 Elias Frantar
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Adapted from https://github.com/IST-DASLab/marlin
*/
#ifndef MARLIN_NAMESPACE_NAME
#define MARLIN_NAMESPACE_NAME marlin_moe_wna16
#endif
#include "kernel.h"
#include "core/registration.h"
#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \
static_assert(std::is_same<scalar_t, half>::value || \
std::is_same<scalar_t, nv_bfloat16>::value, \
"only float16 and bfloat16 is supported");
namespace MARLIN_NAMESPACE_NAME {
__global__ void MarlinDefault(MARLIN_KERNEL_PARAMS){};
using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
template <int moe_block_size>
__global__ void permute_cols_kernel(
int4 const* __restrict__ a_int4_ptr, int const* __restrict__ perm_int_ptr,
int4* __restrict__ out_int4_ptr,
const int32_t* __restrict__ sorted_token_ids_ptr,
const int32_t* __restrict__ expert_ids_ptr,
const int32_t* __restrict__ num_tokens_past_padded_ptr, int size_m,
int size_k, int top_k) {};
} // namespace marlin
torch::Tensor moe_wna16_marlin_gemm(
torch::Tensor& a, std::optional<torch::Tensor> const& c_or_none,
torch::Tensor& b_q_weight, torch::Tensor& b_scales,
std::optional<torch::Tensor> const& b_zeros_or_none,
std::optional<torch::Tensor> const& g_idx_or_none,
std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids,
torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights,
int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep,
vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
bool is_zp_float) {
TORCH_CHECK_NOT_IMPLEMENTED(false,
"marlin_gemm(..) requires CUDA_ARCH >= 8.0");
return torch::empty({1, 1});
}
#else
// For a given "a" of size [M,K] performs a permutation of the K columns based
// on the given "perm" indices.
template <int moe_block_size>
__global__ void permute_cols_kernel(
int4 const* __restrict__ a_int4_ptr, int const* __restrict__ perm_int_ptr,
int4* __restrict__ out_int4_ptr,
const int32_t* __restrict__ sorted_token_ids_ptr,
const int32_t* __restrict__ expert_ids_ptr,
const int32_t* __restrict__ num_tokens_past_padded_ptr, int size_m,
int size_k, int top_k) {
int num_tokens_past_padded = num_tokens_past_padded_ptr[0];
int num_moe_blocks = div_ceil(num_tokens_past_padded, moe_block_size);
int32_t block_sorted_ids[moe_block_size];
int block_num_valid_tokens = 0;
int64_t old_expert_id = 0;
int64_t expert_id = 0;
int row_stride = size_k * sizeof(half) / 16;
auto read_moe_block_data = [&](int block_id) {
block_num_valid_tokens = moe_block_size;
int4* tmp_block_sorted_ids = reinterpret_cast<int4*>(block_sorted_ids);
for (int i = 0; i < moe_block_size / 4; i++) {
tmp_block_sorted_ids[i] =
((int4*)sorted_token_ids_ptr)[block_id * moe_block_size / 4 + i];
}
for (int i = 0; i < moe_block_size; i++) {
if (block_sorted_ids[i] >= size_m * top_k) {
block_num_valid_tokens = i;
break;
};
}
};
auto permute_row = [&](int row) {
int iters = size_k / default_threads;
int rest = size_k % default_threads;
int in_offset = (row / top_k) * row_stride;
int out_offset = row * row_stride;
half const* a_row_half =
reinterpret_cast<half const*>(a_int4_ptr + in_offset);
half* out_half = reinterpret_cast<half*>(out_int4_ptr + out_offset);
int base_k = 0;
for (int i = 0; i < iters; i++) {
int cur_k = base_k + threadIdx.x;
int src_pos = perm_int_ptr[cur_k];
out_half[cur_k] = a_row_half[src_pos];
base_k += default_threads;
}
if (rest) {
if (threadIdx.x < rest) {
int cur_k = base_k + threadIdx.x;
int src_pos = perm_int_ptr[cur_k];
out_half[cur_k] = a_row_half[src_pos];
}
}
};
for (int index = blockIdx.x; index < num_moe_blocks; index += gridDim.x) {
old_expert_id = expert_id;
int tmp_expert_id = expert_ids_ptr[index];
if (tmp_expert_id == -1) continue;
expert_id = tmp_expert_id;
perm_int_ptr += (expert_id - old_expert_id) * size_k;
read_moe_block_data(index);
for (int i = 0; i < block_num_valid_tokens; i++)
permute_row(block_sorted_ids[i]);
}
}
typedef struct {
int thread_k;
int thread_n;
int num_threads;
} thread_config_t;
thread_config_t small_batch_thread_configs[] = {
// Ordered by priority
// thread_k, thread_n, num_threads
{128, 128, 256},
{64, 128, 128}};
thread_config_t large_batch_thread_configs[] = {
// Ordered by priority
// thread_k, thread_n, num_threads
{64, 256, 256},
{64, 128, 128}};
typedef struct {
int blocks_per_sm;
thread_config_t tb_cfg;
} exec_config_t;
int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
int prob_n, int prob_k, int num_bits, int group_size,
bool has_act_order, bool is_k_full) {
bool cache_scales_chunk = has_act_order && !is_k_full;
int tb_n = th_config.thread_n;
int tb_k = th_config.thread_k;
// Get max scale groups per thread-block
int tb_groups;
if (group_size == -1) {
tb_groups = 1;
} else if (group_size == 0) {
tb_groups = div_ceil(tb_k, 32); // Worst case is 32 group size
} else {
tb_groups = div_ceil(tb_k, group_size);
}
if (cache_scales_chunk) {
int load_groups =
tb_groups * pipe_stages * 2; // Chunk size is 2x pipeline over dim K
load_groups = max(load_groups, 32); // We load at least 32 scale groups
return load_groups * tb_n * 2;
} else {
int tb_scales = tb_groups * tb_n * 2;
return tb_scales * pipe_stages;
}
}
int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks,
int prob_m, int prob_n, int prob_k, int num_bits,
int group_size, bool has_act_order, bool is_k_full,
int has_zp, int is_zp_float) {
int pack_factor = 32 / num_bits;
// Get B size
int tb_k = th_config.thread_k;
int tb_n = th_config.thread_n;
int tb_m = thread_m_blocks * 16;
// shm size for block_sorted_ids/block_topk_weights
// both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32)
int sh_block_meta_size = tb_m * 4 * 2;
int sh_a_size = pipe_stages * (tb_m * tb_k) * 2;
int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
int sh_s_size =
get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
group_size, has_act_order, is_k_full);
int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0;
int sh_zp_size = 0;
if (has_zp) {
if (is_zp_float)
sh_zp_size = sh_s_size;
else if (num_bits == 4)
sh_zp_size = sh_s_size / 4;
else if (num_bits == 8)
sh_zp_size = sh_s_size / 2;
}
int total_size = sh_a_size + sh_b_size + sh_s_size + sh_zp_size +
sh_g_idx_size + sh_block_meta_size;
return total_size;
}
bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks,
int prob_m, int prob_n, int prob_k, int num_bits,
int group_size, bool has_act_order, bool is_k_full,
int has_zp, int is_zp_float, int max_shared_mem) {
// Sanity
if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
th_config.num_threads == -1) {
return false;
}
// Verify K/N are divisible by thread K/N
if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) {
return false;
}
// Verify min for thread K/N
if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) {
return false;
}
// num_threads must be at least 128 (= 4 warps)
if (th_config.num_threads < 128) {
return false;
}
// Check that pipeline fits into cache
int cache_size = get_kernel_cache_size(
th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size,
has_act_order, is_k_full, has_zp, is_zp_float);
return cache_size <= max_shared_mem;
}
#define __GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
M_BLOCK_SIZE_8, HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS, \
NUM_THREADS, IS_ZP_FLOAT) \
else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \
thread_n_blocks == THREAD_N_BLOCKS && \
thread_k_blocks == THREAD_K_BLOCKS && \
m_block_size_8 == M_BLOCK_SIZE_8 && \
has_act_order == HAS_ACT_ORDER && has_zp == HAS_ZP && \
group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \
is_zp_float == IS_ZP_FLOAT) { \
kernel = Marlin<scalar_t, W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS, \
THREAD_N_BLOCKS, THREAD_K_BLOCKS, M_BLOCK_SIZE_8, \
pipe_stages, HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS, \
IS_ZP_FLOAT>; \
}
#define GPTQ_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, true, false, 0, NUM_THREADS, \
false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, false, 0, \
NUM_THREADS, false) \
\
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, 8, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, 8, \
NUM_THREADS, false)
#define GPTQ_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, false, 0, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, false, 0, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, false, 0, \
NUM_THREADS, false) \
\
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, 8, \
NUM_THREADS, false) \
\
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, 8, \
NUM_THREADS, false) \
\
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, 8, \
NUM_THREADS, false)
#define AWQ_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 2, NUM_THREADS, \
false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 4, NUM_THREADS, \
false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 8, NUM_THREADS, \
false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 8, \
NUM_THREADS, false)
#define AWQ_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 8, \
NUM_THREADS, false) \
\
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 8, \
NUM_THREADS, false) \
\
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 8, \
NUM_THREADS, false)
// We currently have 4-bit models only with group_blocks == 4
#define HQQ_GET_IF(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 4, NUM_THREADS, \
true) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 4, \
NUM_THREADS, true) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 4, \
NUM_THREADS, true) \
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 4, \
NUM_THREADS, true) \
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 4, \
NUM_THREADS, true)
template <typename scalar_t>
MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type,
int thread_m_blocks, int thread_n_blocks,
int thread_k_blocks, bool m_block_size_8,
bool has_act_order, bool has_zp,
int group_blocks, int num_threads,
bool is_zp_float) {
int num_bits = q_type.size_bits();
auto kernel = MarlinDefault;
if (false) {
}
GPTQ_GET_IF_M1(vllm::kU4B8, 8, 8, 256)
GPTQ_GET_IF_M1(vllm::kU4B8, 8, 4, 128)
GPTQ_GET_IF_M234(vllm::kU4B8, 16, 4, 256)
GPTQ_GET_IF_M234(vllm::kU4B8, 8, 4, 128)
GPTQ_GET_IF_M1(vllm::kU8B128, 8, 8, 256)
GPTQ_GET_IF_M1(vllm::kU8B128, 8, 4, 128)
GPTQ_GET_IF_M234(vllm::kU8B128, 16, 4, 256)
GPTQ_GET_IF_M234(vllm::kU8B128, 8, 4, 128)
AWQ_GET_IF_M1(vllm::kU4, 8, 8, 256)
AWQ_GET_IF_M1(vllm::kU4, 8, 4, 128)
AWQ_GET_IF_M234(vllm::kU4, 16, 4, 256)
AWQ_GET_IF_M234(vllm::kU4, 8, 4, 128)
return kernel;
}
template <typename scalar_t>
exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m,
int prob_n, int prob_k, int thread_m_blocks,
bool m_block_size_8, int num_bits,
int group_size, bool has_act_order,
bool is_k_full, bool has_zp,
bool is_zp_float, int max_shared_mem) {
exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
thread_config_t* thread_configs = thread_m_blocks > 1
? large_batch_thread_configs
: small_batch_thread_configs;
int thread_configs_size =
thread_m_blocks > 1
? sizeof(large_batch_thread_configs) / sizeof(thread_config_t)
: sizeof(small_batch_thread_configs) / sizeof(thread_config_t);
int count = 0;
constexpr int device_max_reg_size = 255 * 1024;
for (int i = 0; i < thread_configs_size; i++) {
thread_config_t th_config = thread_configs[i];
if (!is_valid_config(th_config, thread_m_blocks, prob_m, prob_n, prob_k,
num_bits, group_size, has_act_order, is_k_full, has_zp,
is_zp_float, max_shared_mem)) {
continue;
}
int cache_size = get_kernel_cache_size(
th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits,
group_size, has_act_order, is_k_full, has_zp, is_zp_float);
int group_blocks = 0;
if (!has_act_order) {
group_blocks = group_size == -1 ? -1 : group_size / 16;
}
auto kernel = get_marlin_kernel<scalar_t>(
q_type, thread_m_blocks, th_config.thread_n / 16,
th_config.thread_k / 16, m_block_size_8, has_act_order, has_zp,
group_blocks, th_config.num_threads, is_zp_float);
if (kernel == MarlinDefault) continue;
if (thread_m_blocks > 1) {
exec_cfg = {1, th_config};
break;
} else {
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr, kernel);
int reg_size = max(attr.numRegs, 1) * th_config.num_threads * 4;
int allow_count = min(device_max_reg_size / reg_size,
max_shared_mem / (cache_size + 1024));
allow_count = max(min(allow_count, 4), 1);
if (allow_count > count) {
count = allow_count;
exec_cfg = {count, th_config};
};
}
}
return exec_cfg;
}
template <typename scalar_t>
void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
void* zp, void* g_idx, void* perm, void* a_tmp,
void* sorted_token_ids, void* expert_ids,
void* num_tokens_past_padded, void* topk_weights,
int moe_block_size, int top_k, bool mul_topk_weights, bool is_ep,
int prob_m, int prob_n, int prob_k, void* workspace,
vllm::ScalarType const& q_type, bool has_act_order,
bool is_k_full, bool has_zp, int num_groups, int group_size,
int dev, cudaStream_t stream, int thread_k, int thread_n,
int sms, bool use_atomic_add, bool use_fp32_reduce,
bool is_zp_float) {
int thread_m_blocks = div_ceil(moe_block_size, 16);
bool m_block_size_8 = moe_block_size == 8;
if (has_zp) {
TORCH_CHECK(
q_type == vllm::kU4 || q_type == vllm::kU8,
"q_type must be u4 or u8 when has_zp = True. Got = ", q_type.str());
} else {
TORCH_CHECK(
q_type == vllm::kU4B8 || q_type == vllm::kU8B128,
"q_type must be uint4b8 or uint8b128 when has_zp = False. Got = ",
q_type.str());
}
TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
", ", prob_n, ", ", prob_k, "]");
int group_blocks = 0;
if (has_act_order) {
if (is_k_full) {
TORCH_CHECK(group_size != -1);
group_blocks = group_size / 16;
TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
" is not divisible by group_blocks = ", group_blocks);
} else {
TORCH_CHECK(group_size == 0);
group_blocks = 0;
}
} else {
if (group_size == -1) {
group_blocks = -1;
} else {
group_blocks = group_size / 16;
TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
" is not divisible by group_blocks = ", group_blocks);
}
}
int num_bits = q_type.size_bits();
const int4* A_ptr = (const int4*)A;
const int4* B_ptr = (const int4*)B;
int4* C_ptr = (int4*)C;
int4* C_tmp_ptr = (int4*)C_tmp;
const int4* s_ptr = (const int4*)s;
const int4* zp_ptr = (const int4*)zp;
const int* g_idx_ptr = (const int*)g_idx;
const int* perm_ptr = (const int*)perm;
int4* a_tmp_ptr = (int4*)a_tmp;
const int32_t* sorted_token_ids_ptr = (const int32_t*)sorted_token_ids;
const int32_t* expert_ids_ptr = (const int32_t*)expert_ids;
const int32_t* num_tokens_past_padded_ptr =
(const int32_t*)num_tokens_past_padded;
const float* topk_weights_ptr = (const float*)topk_weights;
int* locks = (int*)workspace;
if (has_act_order) {
// Permute A columns
auto kernel = permute_cols_kernel<8>;
if (moe_block_size == 8) {
} else if (moe_block_size == 16)
kernel = permute_cols_kernel<16>;
else if (moe_block_size == 32)
kernel = permute_cols_kernel<32>;
else if (moe_block_size == 48)
kernel = permute_cols_kernel<48>;
else if (moe_block_size == 64)
kernel = permute_cols_kernel<64>;
else
TORCH_CHECK(false, "unsupported moe_block_size ", moe_block_size);
// avoid ">>>" being formatted to "> > >"
// clang-format off
kernel<<<sms, default_threads, 0, stream>>>(
A_ptr, perm_ptr, a_tmp_ptr, sorted_token_ids_ptr, expert_ids_ptr,
num_tokens_past_padded_ptr, prob_m, prob_k, top_k);
// clang-format on
A_ptr = a_tmp_ptr;
prob_m = prob_m * top_k;
top_k = 1;
// If we have a full K, then we can run the non-act-order version of Marlin
// (since the weight rows are reordered by increasing group ids, and by
// having a full K, we have full original groups)
if (is_k_full) has_act_order = false;
}
int max_shared_mem = 0;
cudaDeviceGetAttribute(&max_shared_mem,
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
TORCH_CHECK(max_shared_mem > 0);
// Set thread config
exec_config_t exec_cfg;
thread_config_t thread_tfg;
if (thread_k != -1 && thread_n != -1) {
thread_tfg = thread_config_t{thread_k, thread_n, default_threads};
exec_cfg = exec_config_t{1, thread_tfg};
TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n,
" is not divisible by thread_n = ", thread_n);
TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
" is not divisible by thread_k = ", thread_k);
} else {
// Auto config
exec_cfg = determine_exec_config<scalar_t>(
q_type, prob_m, prob_n, prob_k, thread_m_blocks, m_block_size_8,
num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float,
max_shared_mem);
thread_tfg = exec_cfg.tb_cfg;
}
int num_threads = thread_tfg.num_threads;
thread_k = thread_tfg.thread_k;
thread_n = thread_tfg.thread_n;
int blocks = sms * exec_cfg.blocks_per_sm;
if (exec_cfg.blocks_per_sm > 1)
max_shared_mem = max_shared_mem / exec_cfg.blocks_per_sm - 1024;
int thread_k_blocks = thread_k / 16;
int thread_n_blocks = thread_n / 16;
TORCH_CHECK(is_valid_config(thread_tfg, thread_m_blocks, prob_m, prob_n,
prob_k, num_bits, group_size, has_act_order,
is_k_full, has_zp, is_zp_float, max_shared_mem),
"Invalid thread config: thread_m_blocks = ", thread_m_blocks,
", thread_k = ", thread_tfg.thread_k,
", thread_n = ", thread_tfg.thread_n,
", num_threads = ", thread_tfg.num_threads, " for MKN = [",
prob_m, ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits,
", group_size = ", group_size,
", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full,
", has_zp = ", has_zp, ", is_zp_float = ", is_zp_float,
", max_shared_mem = ", max_shared_mem);
auto kernel = get_marlin_kernel<scalar_t>(
q_type, thread_m_blocks, thread_n_blocks, thread_k_blocks, m_block_size_8,
has_act_order, has_zp, group_blocks, num_threads, is_zp_float);
if (kernel == MarlinDefault) {
TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
", ", prob_k, "]", ", has_act_order = ", has_act_order,
", num_groups = ", num_groups, ", group_size = ", group_size,
", thread_m_blocks = ", thread_m_blocks,
", thread_n_blocks = ", thread_n_blocks,
", thread_k_blocks = ", thread_k_blocks,
", num_bits = ", num_bits);
}
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
max_shared_mem);
// avoid ">>>" being formatted to "> > >"
// clang-format off
kernel<<<blocks, num_threads, max_shared_mem, stream>>>(
A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr,
sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr,
topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m,
prob_n, prob_k, locks, use_atomic_add, use_fp32_reduce);
// clang-format on
}
} // namespace MARLIN_NAMESPACE_NAME
torch::Tensor moe_wna16_marlin_gemm(
torch::Tensor& a, std::optional<torch::Tensor> const& c_or_none,
torch::Tensor& b_q_weight, torch::Tensor& b_scales,
std::optional<torch::Tensor> const& b_zeros_or_none,
std::optional<torch::Tensor> const& g_idx_or_none,
std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids,
torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights,
int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep,
vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
bool is_zp_float) {
vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id);
int pack_factor = 32 / b_q_type.size_bits();
if (moe_block_size != 8) {
TORCH_CHECK(moe_block_size % 16 == 0,
"unsupported moe_block_size=", moe_block_size);
TORCH_CHECK(moe_block_size >= 16 && moe_block_size <= 64,
"unsupported moe_block_size=", moe_block_size);
}
// Verify A
TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0),
", size_m = ", size_m);
TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1),
", size_k = ", size_k);
// Verify B
TORCH_CHECK(
size_k % MARLIN_NAMESPACE_NAME::tile_size == 0, "size_k = ", size_k,
" is not divisible by tile_size = ", MARLIN_NAMESPACE_NAME::tile_size);
TORCH_CHECK((size_k / MARLIN_NAMESPACE_NAME::tile_size) == b_q_weight.size(1),
"Shape mismatch: b_q_weight.size(1) = ", b_q_weight.size(1),
", size_k = ", size_k,
", tile_size = ", MARLIN_NAMESPACE_NAME::tile_size);
TORCH_CHECK(
b_q_weight.size(2) % MARLIN_NAMESPACE_NAME::tile_size == 0,
"b_q_weight.size(2) = ", b_q_weight.size(2),
" is not divisible by tile_size = ", MARLIN_NAMESPACE_NAME::tile_size);
int actual_size_n =
(b_q_weight.size(2) / MARLIN_NAMESPACE_NAME::tile_size) * pack_factor;
TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n,
", actual_size_n = ", actual_size_n);
// Verify device and strides
TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
TORCH_CHECK(a.is_contiguous(), "A is not contiguous");
TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
// thread_k: `k` size of a thread_tile in `weights` (can usually be left as
// auto -1)
int thread_k = -1;
// thread_n: `n` size of a thread_tile in `weights` (can usually be left as
// auto -1)
int thread_n = -1;
// sms: number of SMs to use for the kernel
int sms = -1;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
// Alloc buffers
const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
torch::Tensor c;
if (c_or_none.has_value()) {
c = c_or_none.value();
TORCH_CHECK(c.device().is_cuda(), "c is not on GPU");
TORCH_CHECK(c.is_contiguous(), "c is not contiguous");
TORCH_CHECK(c.size(0) == size_m * top_k,
"Shape mismatch: c.size(0) = ", c.size(0),
", size_m * topk = ", size_m * top_k);
TORCH_CHECK(c.size(1) == size_n, "Shape mismatch: c.size(1) = ", c.size(1),
", size_n = ", size_n);
} else {
c = torch::empty({size_m * top_k, size_n}, options);
}
// Alloc C tmp buffer that is going to be used for the global reduce
torch::Tensor c_tmp;
auto options_fp32 =
torch::TensorOptions().dtype(at::kFloat).device(a.device());
if (use_fp32_reduce && !use_atomic_add) {
// max num of threadblocks is sms * 4
long max_c_tmp_size = min(
(long)size_n * sorted_token_ids.size(0),
(long)sms * 4 * moe_block_size * MARLIN_NAMESPACE_NAME::max_thread_n);
if (moe_block_size == 8) max_c_tmp_size *= 2;
c_tmp = torch::empty({max_c_tmp_size}, options_fp32);
} else {
c_tmp = torch::empty({0}, options_fp32);
}
// Detect groupsize and act_order
int num_groups = -1;
int group_size = -1;
int rank = b_scales.sizes().size();
TORCH_CHECK(rank == 3, "b_scales rank = ", rank, " is not 3");
TORCH_CHECK(b_scales.size(2) == size_n, "b_scales dim 2 = ", b_scales.size(2),
" is not size_n = ", size_n);
num_groups = b_scales.size(1);
torch::Tensor g_idx, perm, a_tmp;
;
if (g_idx_or_none.has_value() && perm_or_none.has_value()) {
g_idx = g_idx_or_none.value();
perm = perm_or_none.value();
TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU");
TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous");
TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
// Verify g_idx and perm
TORCH_CHECK((g_idx.size(-1) == 0 && perm.size(-1) == 0) ||
(g_idx.size(-1) == size_k && perm.size(-1) == size_k),
"Unexpected g_idx.size(-1) = ", g_idx.size(-1),
" and perm.size(-1) = ", perm.size(-1),
", where size_k = ", size_k);
} else {
g_idx = torch::empty({0}, options);
perm = torch::empty({0}, options);
a_tmp = torch::empty({0}, options);
}
bool has_act_order = g_idx.size(-1) > 0 && perm.size(-1) > 0;
if (has_act_order) {
a_tmp = torch::empty({size_m * top_k, size_k}, options);
if (is_k_full) {
TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1");
TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k,
", is not divisible by num_groups = ", num_groups);
group_size = size_k / num_groups;
} else {
group_size = 0;
}
} else {
a_tmp = torch::empty({0}, options);
if (num_groups > 1) {
TORCH_CHECK(
size_k % num_groups == 0, "size_k = ", size_k,
", is not divisible by b_scales.size(1) = ", b_scales.size(1));
group_size = size_k / num_groups;
} else {
group_size = -1;
}
}
torch::Tensor b_zeros;
if (b_zeros_or_none.has_value()) {
b_zeros = b_zeros_or_none.value();
TORCH_CHECK(b_zeros.device().is_cuda(), "b_zeros is not on GPU");
TORCH_CHECK(b_zeros.is_contiguous(), "b_zeros is not contiguous");
} else {
b_zeros = torch::empty({0}, options);
}
bool has_zp = b_zeros.size(-1) > 0;
if (has_zp) {
TORCH_CHECK(
b_q_type == vllm::kU4,
"b_q_type must be u4 when has_zp = True. Got = ", b_q_type.str());
} else {
TORCH_CHECK(
b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128,
"b_q_type must be uint4b8 or uint8b128 when has_zp = False. Got = ",
b_q_type.str());
}
if (has_zp && is_zp_float) {
TORCH_CHECK(a.scalar_type() == at::ScalarType::Half,
"Computation type must be float16 (half) when using float zero "
"points.");
}
// Verify b_zeros
if (has_zp) {
int rank = b_zeros.sizes().size();
TORCH_CHECK(rank == 3, "b_zeros rank = ", rank, " is not 3");
if (is_zp_float) {
TORCH_CHECK(b_zeros.size(2) == size_n,
"b_zeros dim 2 = ", b_zeros.size(2),
" is not size_n = ", size_n);
TORCH_CHECK(num_groups == b_zeros.size(1),
"b_zeros dim 1 = ", b_zeros.size(1),
" is not num_groups = ", num_groups);
TORCH_CHECK(num_groups != -1, "num_groups must be != -1");
} else {
TORCH_CHECK(b_zeros.size(1) == num_groups,
"b_zeros dim 1 = ", b_zeros.size(1),
" is not num_groups = ", num_groups);
TORCH_CHECK(b_zeros.size(2) == size_n / pack_factor,
"b_zeros dim 2 = ", b_zeros.size(2),
" is not size_n / pack_factor = ", size_n / pack_factor);
}
}
// Verify workspace size
TORCH_CHECK(size_n % MARLIN_NAMESPACE_NAME::min_thread_n == 0,
"size_n = ", size_n, ", is not divisible by min_thread_n = ",
MARLIN_NAMESPACE_NAME::min_thread_n);
int max_n_tiles = size_n / MARLIN_NAMESPACE_NAME::min_thread_n;
int min_workspace_size = min(
max_n_tiles * (int)(sorted_token_ids.size(0) / moe_block_size), sms * 4);
TORCH_CHECK(workspace.numel() >= min_workspace_size,
"workspace.numel = ", workspace.numel(),
" is below min_workspace_size = ", min_workspace_size);
int dev = a.get_device();
if (a.scalar_type() == at::ScalarType::Half) {
MARLIN_NAMESPACE_NAME::marlin_mm<half>(
a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
c_tmp.data_ptr<float>(), b_scales.data_ptr<at::Half>(),
b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(),
a_tmp.data_ptr<at::Half>(), sorted_token_ids.data_ptr(),
expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(),
topk_weights.data_ptr(), moe_block_size, top_k, mul_topk_weights, is_ep,
size_m, size_n, size_k, workspace.data_ptr(), b_q_type, has_act_order,
is_k_full, has_zp, num_groups, group_size, dev,
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
use_atomic_add, use_fp32_reduce, is_zp_float);
} else if (a.scalar_type() == at::ScalarType::BFloat16) {
MARLIN_NAMESPACE_NAME::marlin_mm<nv_bfloat16>(
a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(),
b_scales.data_ptr<at::BFloat16>(), b_zeros.data_ptr(), g_idx.data_ptr(),
perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
sorted_token_ids.data_ptr(), expert_ids.data_ptr(),
num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(),
moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k,
workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
thread_k, thread_n, sms, use_atomic_add, use_fp32_reduce, is_zp_float);
} else {
TORCH_CHECK(false,
"moe_wna16_marlin_gemm only supports bfloat16 and float16");
}
return c;
}
#endif
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm);
}

View File

@ -13,6 +13,7 @@
template <typename scalar_t, int bit, int GROUPS> template <typename scalar_t, int bit, int GROUPS>
__global__ void moe_wna16_gemm_kernel( __global__ void moe_wna16_gemm_kernel(
const scalar_t* __restrict__ input, scalar_t* __restrict__ output, const scalar_t* __restrict__ input, scalar_t* __restrict__ output,
const uint32_t* __restrict__ qweight, const scalar_t* __restrict__ scales, const uint32_t* __restrict__ qweight, const scalar_t* __restrict__ scales,
const uint32_t* __restrict__ qzeros, const uint32_t* __restrict__ qzeros,
@ -53,6 +54,8 @@ __global__ void moe_wna16_gemm_kernel(
if (token_index / top_k >= size_m) break; if (token_index / top_k >= size_m) break;
num_valid_tokens = m + 1; num_valid_tokens = m + 1;
if (blockIdx.z == 0 && offset_n < size_n)
output[token_index * size_n + offset_n] = Dtype::int2num(0);
if (expert_id != -1) { if (expert_id != -1) {
int k_per_thread = DIVIDE(BLOCK_SIZE_K, BLOCK_SIZE_N); int k_per_thread = DIVIDE(BLOCK_SIZE_K, BLOCK_SIZE_N);
@ -281,7 +284,8 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N, int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
int64_t BLOCK_SIZE_K, int64_t bit) { int64_t BLOCK_SIZE_K, int64_t bit) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
output.zero_(); auto options =
torch::TensorOptions().dtype(input.dtype()).device(input.device());
const int num_experts = b_qweight.size(0); const int num_experts = b_qweight.size(0);
const int size_m = input.size(0); const int size_m = input.size(0);
@ -298,9 +302,9 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
const uint32_t* b_qzeros_ptr; const uint32_t* b_qzeros_ptr;
if (b_qzeros.has_value()) if (b_qzeros.has_value())
b_qzeros_ptr = (const uint32_t*)b_qzeros.value().data_ptr<uint8_t>(); b_qzeros_ptr = (const uint32_t*)b_qzeros.value().data_ptr<uint8_t>();
const float* topk_weights_ptr = nullptr; const float* topk_weights_ptr;
if (topk_weights.has_value()) if (topk_weights.has_value())
topk_weights_ptr = (const float*)topk_weights.value().data_ptr<float>(); topk_weights_ptr = (const float*)topk_weights.value().data_ptr();
int groups_per_block_row = BLOCK_SIZE_K / group_size; int groups_per_block_row = BLOCK_SIZE_K / group_size;
TORCH_CHECK(bit == 4 || bit == 8, "bit must be 4 or 8"); TORCH_CHECK(bit == 4 || bit == 8, "bit must be 4 or 8");

View File

@ -43,17 +43,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
m.impl("moe_wna16_gemm", torch::kCUDA, &moe_wna16_gemm); m.impl("moe_wna16_gemm", torch::kCUDA, &moe_wna16_gemm);
m.def( m.def(
"moe_wna16_marlin_gemm(Tensor! a, Tensor? c_or_none," "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
"Tensor! b_q_weight, Tensor! b_scales, Tensor? b_zeros_or_none," "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
"Tensor? g_idx_or_none, Tensor? perm_or_none, Tensor! workspace," "b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, "
"Tensor sorted_token_ids," "int b_q_type, SymInt size_m, "
"Tensor! expert_ids, Tensor! num_tokens_past_padded," "SymInt size_n, SymInt size_k, bool is_k_full, int num_experts, int "
"Tensor! topk_weights, int moe_block_size, int top_k, " "topk, "
"bool mul_topk_weights, bool is_ep, int b_q_type_id," "int moe_block_size, bool replicate_input, bool apply_weights)"
"int size_m, int size_n, int size_k," " -> Tensor");
"bool is_full_k, bool use_atomic_add,"
"bool use_fp32_reduce, bool is_zp_float) -> Tensor");
// conditionally compiled so impl registration is in source file // conditionally compiled so impl registration is in source file
#endif #endif

View File

@ -52,15 +52,6 @@ void paged_attention_v2(
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
const int64_t blocksparse_head_sliding_step); const int64_t blocksparse_head_sliding_step);
#ifndef USE_ROCM
void merge_attn_states(torch::Tensor& output,
std::optional<torch::Tensor> output_lse,
const torch::Tensor& prefix_output,
const torch::Tensor& prefix_lse,
const torch::Tensor& suffix_output,
const torch::Tensor& suffix_lse);
#endif
void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
double epsilon); double epsilon);
@ -128,8 +119,6 @@ void advance_step_flashinfer(
torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr, torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds); torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);
torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor);
#ifndef USE_ROCM #ifndef USE_ROCM
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
const torch::Tensor& codebooks, const torch::Tensor& codebooks,
@ -154,8 +143,7 @@ torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm);
#endif #endif
torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m, torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
int64_t n, int64_t n);
std::optional<at::ScalarType> const& dtype);
torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X, torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X,
int64_t type, int64_t row); int64_t type, int64_t row);
@ -277,10 +265,10 @@ void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
const std::optional<at::Tensor>& has_initial_state, const std::optional<at::Tensor>& has_initial_state,
bool silu_activation, int64_t pad_slot_id); bool silu_activation, int64_t pad_slot_id);
#ifndef USE_ROCM
using fptr_t = int64_t; using fptr_t = int64_t;
fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs, fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs,
torch::Tensor& rank_data, int64_t rank, torch::Tensor& rank_data, int64_t rank, bool full_nvlink);
bool fully_connected);
void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
fptr_t reg_buffer, int64_t reg_buffer_sz_bytes); fptr_t reg_buffer, int64_t reg_buffer_sz_bytes);
void dispose(fptr_t _fa); void dispose(fptr_t _fa);
@ -291,7 +279,4 @@ get_graph_buffer_ipc_meta(fptr_t _fa);
void register_graph_buffers(fptr_t _fa, void register_graph_buffers(fptr_t _fa,
const std::vector<std::vector<int64_t>>& handles, const std::vector<std::vector<int64_t>>& handles,
const std::vector<std::vector<int64_t>>& offsets); const std::vector<std::vector<int64_t>>& offsets);
std::tuple<int64_t, torch::Tensor> allocate_shared_buffer_and_handle( #endif
int64_t size);
int64_t open_mem_handle(torch::Tensor& mem_handle);
void free_shared_buffer(int64_t buffer);

View File

@ -30,6 +30,9 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
fp8_type* __restrict__ out, float* __restrict__ scale, fp8_type* __restrict__ out, float* __restrict__ scale,
scalar_t const* __restrict__ input, float const* __restrict__ scale_ub, scalar_t const* __restrict__ input, float const* __restrict__ scale_ub,
const int hidden_size) { const int hidden_size) {
float const min_scaling_factor =
1.0f / (fp8_e4m3_adjusted_max_v<fp8_type> * 512.f);
int const tid = threadIdx.x; int const tid = threadIdx.x;
int const token_idx = blockIdx.x; int const token_idx = blockIdx.x;
@ -64,8 +67,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
token_scale = block_absmax_val_maybe; token_scale = block_absmax_val_maybe;
} }
// token scale computation // token scale computation
token_scale = max(token_scale / quant_type_max_v<fp8_type>, token_scale = max(token_scale / fp8_e4m3_adjusted_max_v<fp8_type>,
min_scaling_factor<fp8_type>::val()); min_scaling_factor);
scale[token_idx] = token_scale; scale[token_idx] = token_scale;
} }
__syncthreads(); __syncthreads();

View File

@ -1,12 +1,20 @@
#pragma once #pragma once
#include "quantization/vectorization.cuh" #include "quantization/vectorization.cuh"
#include "quantization/utils.cuh"
#include <cmath> #include <cmath>
#include <c10/core/ScalarType.h>
#ifdef USE_ROCM #ifndef USE_ROCM
#include <c10/util/Float8_e4m3fn.h>
#define MAYBE_HOST_DEVICE C10_HOST_DEVICE
#else
#include <ATen/hip/HIPContext.h>
#include <c10/util/Float8_e4m3fn.h>
#include <c10/util/Float8_e4m3fnuz.h>
#include "amd/quant_utils.cuh" #include "amd/quant_utils.cuh"
// ROCm doesn't seem to need C10_HOST_DEVICE for static constexpr
#define MAYBE_HOST_DEVICE
#endif #endif
// Determines the preferred FP8 type for the current platform. // Determines the preferred FP8 type for the current platform.
@ -23,6 +31,29 @@ static bool is_fp8_ocp() {
#endif #endif
} }
template <typename T>
struct fp8_e4m3_adjusted_max;
template <>
struct fp8_e4m3_adjusted_max<c10::Float8_e4m3fn> {
static constexpr c10::Float8_e4m3fn val() {
return std::numeric_limits<c10::Float8_e4m3fn>::max();
}
};
// Using the default max value from pytorch (240.0 0x7F) will cause accuracy
// issues when running dynamic quantization. Here use 224.0 0x7E for rocm.
template <>
struct fp8_e4m3_adjusted_max<c10::Float8_e4m3fnuz> {
static constexpr c10::Float8_e4m3fnuz val() {
return c10::Float8_e4m3fnuz(0x7E, c10::Float8_e4m3fnuz::from_bits());
}
};
template <typename T>
MAYBE_HOST_DEVICE static constexpr T fp8_e4m3_adjusted_max_v =
fp8_e4m3_adjusted_max<T>::val();
namespace vllm { namespace vllm {
__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) { __device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
@ -45,8 +76,8 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val,
x = val / scale; x = val / scale;
} }
float r = float r = fmax(-fp8_e4m3_adjusted_max_v<fp8_type>,
fmax(-quant_type_max_v<fp8_type>, fmin(x, quant_type_max_v<fp8_type>)); fmin(x, fp8_e4m3_adjusted_max_v<fp8_type>));
#ifndef USE_ROCM #ifndef USE_ROCM
return static_cast<fp8_type>(r); return static_cast<fp8_type>(r);
#else #else
@ -92,7 +123,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale,
// Finally, since cache[0] contains the maximum for this thread block, // Finally, since cache[0] contains the maximum for this thread block,
// atomically write the max to the target location // atomically write the max to the target location
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
atomicMaxFloat(scale, cache[0] / quant_type_max_v<fp8_type>); atomicMaxFloat(scale, cache[0] / fp8_e4m3_adjusted_max_v<fp8_type>);
} }
} }

View File

@ -14,7 +14,8 @@ __device__ void rms_norm_dynamic_per_token_quant_vec(
float* __restrict__ scales, // [num_tokens] float* __restrict__ scales, // [num_tokens]
scalar_t const* __restrict__ input, // [..., hidden_size] scalar_t const* __restrict__ input, // [..., hidden_size]
scalar_t const* __restrict__ weight, // [hidden_size] scalar_t const* __restrict__ weight, // [hidden_size]
float const* scale_ub, float const var_epsilon, int32_t const hidden_size, float const* scale_ub, float const var_epsilon,
float const min_scaling_factor, int32_t const hidden_size,
scalar_t* __restrict__ residual = nullptr) { scalar_t* __restrict__ residual = nullptr) {
float rms = 0.0f; float rms = 0.0f;
float token_scale = 0.0f; float token_scale = 0.0f;
@ -26,8 +27,8 @@ __device__ void rms_norm_dynamic_per_token_quant_vec(
// Compute scale // Compute scale
vllm::vectorized::compute_dynamic_per_token_scales<scalar_t, scalar_out_t, vllm::vectorized::compute_dynamic_per_token_scales<scalar_t, scalar_out_t,
has_residual>( has_residual>(
&token_scale, scales, input, weight, rms, scale_ub, hidden_size, &token_scale, scales, input, weight, rms, scale_ub, min_scaling_factor,
residual); hidden_size, residual);
// RMS Norm + Quant // RMS Norm + Quant
if constexpr (std::is_same_v<scalar_out_t, int8_t>) { if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
@ -49,7 +50,8 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel(
float* __restrict__ scales, // [num_tokens] float* __restrict__ scales, // [num_tokens]
scalar_t const* __restrict__ input, // [..., hidden_size] scalar_t const* __restrict__ input, // [..., hidden_size]
scalar_t const* __restrict__ weight, // [hidden_size] scalar_t const* __restrict__ weight, // [hidden_size]
float const* scale_ub, float const var_epsilon, int32_t const hidden_size, float const* scale_ub, float const var_epsilon,
float const min_scaling_factor, int32_t const hidden_size,
scalar_t* __restrict__ residual = nullptr) { scalar_t* __restrict__ residual = nullptr) {
// For vectorization, token_input and token_output pointers need to be // For vectorization, token_input and token_output pointers need to be
// aligned at 8-byte and 4-byte addresses respectively. // aligned at 8-byte and 4-byte addresses respectively.
@ -58,8 +60,8 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel(
if (can_vectorize) { if (can_vectorize) {
return rms_norm_dynamic_per_token_quant_vec<scalar_t, scalar_out_t, return rms_norm_dynamic_per_token_quant_vec<scalar_t, scalar_out_t,
has_residual>( has_residual>(
out, scales, input, weight, scale_ub, var_epsilon, hidden_size, out, scales, input, weight, scale_ub, var_epsilon, min_scaling_factor,
residual); hidden_size, residual);
} }
float rms = 0.0f; float rms = 0.0f;
@ -70,8 +72,8 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel(
var_epsilon, residual); var_epsilon, residual);
// Compute Scale // Compute Scale
vllm::compute_dynamic_per_token_scales<scalar_t, scalar_out_t, has_residual>( vllm::compute_dynamic_per_token_scales<scalar_t, scalar_out_t, has_residual>(
&token_scale, scales, input, weight, rms, scale_ub, hidden_size, &token_scale, scales, input, weight, rms, scale_ub, min_scaling_factor,
residual); hidden_size, residual);
// RMS Norm + Quant // RMS Norm + Quant
if constexpr (std::is_same_v<scalar_out_t, int8_t>) { if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
@ -103,6 +105,11 @@ void rms_norm_dynamic_per_token_quant_dispatch(
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const float min_scaling_factor =
out.dtype() == torch::kInt8
? std::numeric_limits<float>::epsilon()
: 1.0f / (std::numeric_limits<c10::Float8_e4m3fn>::max() * 512.f);
if (residual.has_value()) { if (residual.has_value()) {
VLLM_DISPATCH_QUANT_TYPES( VLLM_DISPATCH_QUANT_TYPES(
out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] { out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] {
@ -112,7 +119,8 @@ void rms_norm_dynamic_per_token_quant_dispatch(
out.data_ptr<scalar_t>(), scales.data_ptr<float>(), out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(), input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr, scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
var_epsilon, hidden_size, residual->data_ptr<scalar_in_t>()); var_epsilon, min_scaling_factor, hidden_size,
residual->data_ptr<scalar_in_t>());
}); });
} else { } else {
@ -124,7 +132,7 @@ void rms_norm_dynamic_per_token_quant_dispatch(
out.data_ptr<scalar_t>(), scales.data_ptr<float>(), out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(), input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr, scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
var_epsilon, hidden_size, nullptr); var_epsilon, min_scaling_factor, hidden_size, nullptr);
}); });
} }
} }

View File

@ -5,7 +5,6 @@
*/ */
#include "quantization/vectorization.cuh" #include "quantization/vectorization.cuh"
#include "quantization/utils.cuh"
#include "quant_conversions.cuh" #include "quant_conversions.cuh"
#ifndef USE_ROCM #ifndef USE_ROCM
@ -52,11 +51,11 @@ __device__ void compute_dynamic_per_token_scales(
float* __restrict__ token_scale, float* __restrict__ all_token_scales, float* __restrict__ token_scale, float* __restrict__ all_token_scales,
scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight, scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
float const rms, float const* __restrict__ scale_ub, float const rms, float const* __restrict__ scale_ub,
int32_t const hidden_size, float const min_scaling_factor, int32_t const hidden_size,
scalar_t const* __restrict__ residual = nullptr) { scalar_t const* __restrict__ residual = nullptr) {
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size); int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
; ;
constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>}; constexpr scalar_out_t qmax{std::numeric_limits<scalar_out_t>::max()};
float block_absmax_val_maybe = 0.0f; float block_absmax_val_maybe = 0.0f;
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) { for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
@ -84,7 +83,7 @@ __device__ void compute_dynamic_per_token_scales(
scale = block_absmax_val_maybe; scale = block_absmax_val_maybe;
} }
// token scale computation // token scale computation
scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val()); scale = max(scale / qmax, min_scaling_factor);
s_token_scale = scale; // Shared memory store s_token_scale = scale; // Shared memory store
all_token_scales[blockIdx.x] = scale; // Global output store all_token_scales[blockIdx.x] = scale; // Global output store
} }
@ -185,7 +184,7 @@ __device__ void compute_dynamic_per_token_scales(
float* __restrict__ token_scale, float* __restrict__ all_token_scales, float* __restrict__ token_scale, float* __restrict__ all_token_scales,
scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight, scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
float const rms, float const* __restrict__ scale_ub, float const rms, float const* __restrict__ scale_ub,
int32_t const hidden_size, float const min_scaling_factor, int32_t const hidden_size,
scalar_t const* __restrict__ residual = nullptr) { scalar_t const* __restrict__ residual = nullptr) {
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size); int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
; ;
@ -201,7 +200,7 @@ __device__ void compute_dynamic_per_token_scales(
reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]); reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
} }
constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>}; constexpr scalar_out_t qmax{std::numeric_limits<scalar_out_t>::max()};
int32_t const num_vec_elems = hidden_size >> 2; int32_t const num_vec_elems = hidden_size >> 2;
float block_absmax_val_maybe = 0.0f; float block_absmax_val_maybe = 0.0f;
@ -249,7 +248,7 @@ __device__ void compute_dynamic_per_token_scales(
scale = block_absmax_val_maybe; scale = block_absmax_val_maybe;
} }
// token scale computation // token scale computation
scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val()); scale = max(scale / qmax, min_scaling_factor);
s_token_scale = scale; // shared memory store s_token_scale = scale; // shared memory store
all_token_scales[blockIdx.x] = scale; // global output store all_token_scales[blockIdx.x] = scale; // global output store
} }

View File

@ -33,8 +33,8 @@ static __device__ __forceinline__ int8_t float_to_int8_rn(float const x) {
template <typename fp8_type> template <typename fp8_type>
static __device__ __forceinline__ fp8_type float_to_fp8(float const x) { static __device__ __forceinline__ fp8_type float_to_fp8(float const x) {
float const r = float const r = fmax(-fp8_e4m3_adjusted_max_v<fp8_type>,
fmax(-quant_type_max_v<fp8_type>, fmin(x, quant_type_max_v<fp8_type>)); fmin(x, fp8_e4m3_adjusted_max_v<fp8_type>));
return static_cast<fp8_type>(r); return static_cast<fp8_type>(r);
} }

View File

@ -94,8 +94,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
dfloat2 v; dfloat2 v;
dequantize_kernel(vx, ib, iqs, v); dequantize_kernel(vx, ib, iqs, v);
y[iybs + iqs + 0] = convert_from_half<dst_t>(v.x); y[iybs + iqs + 0] = v.x;
y[iybs + iqs + y_offset] = convert_from_half<dst_t>(v.y); y[iybs + iqs + y_offset] = v.y;
} }
template<typename dst_t> template<typename dst_t>
@ -114,10 +114,10 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
half dall = __low2half(x[i].dm); half dall = __low2half(x[i].dm);
half dmin = __high2half(x[i].dm); half dmin = __high2half(x[i].dm);
y[l+ 0] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+0] & 0xF) * ((q >> 0) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+0] >> 4)))); y[l+ 0] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+0] & 0xF) * ((q >> 0) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+0] >> 4)));
y[l+32] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+2] & 0xF) * ((q >> 2) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+2] >> 4)))); y[l+32] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+2] & 0xF) * ((q >> 2) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+2] >> 4)));
y[l+64] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+4] & 0xF) * ((q >> 4) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+4] >> 4)))); y[l+64] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+4] & 0xF) * ((q >> 4) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+4] >> 4)));
y[l+96] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+6] & 0xF) * ((q >> 6) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+6] >> 4)))); y[l+96] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+6] & 0xF) * ((q >> 6) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+6] >> 4)));
} }
template<typename dst_t> template<typename dst_t>
@ -148,9 +148,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
const uint8_t * q = x[i].qs + 32*n; const uint8_t * q = x[i].qs + 32*n;
const uint8_t * hm = x[i].hmask; const uint8_t * hm = x[i].hmask;
for (int l = l0; l < l0+4; ++l) { for (int l = l0; l < l0+4; ++l) y[l] = __hmul(dl, __int2half_rn((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)));
y[l] = convert_from_half<dst_t>(__hmul(dl, __int2half_rn((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4))));
}
} }
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
@ -190,8 +188,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
const half d2 = __hmul(dall, __int2half_rn(sc)); const half d2 = __hmul(dall, __int2half_rn(sc));
const half m2 = __hmul(dmin, __int2half_rn(m)); const half m2 = __hmul(dmin, __int2half_rn(m));
for (int l = 0; l < n; ++l) { for (int l = 0; l < n; ++l) {
y[l + 0] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn(q[l] & 0xF)), m1)); y[l + 0] = __hsub(__hmul(d1, __int2half_rn(q[l] & 0xF)), m1);
y[l +32] = convert_from_half<dst_t>(__hsub(__hmul(d2, __int2half_rn(q[l] >> 4)), m2)); y[l +32] = __hsub(__hmul(d2, __int2half_rn(q[l] >> 4)), m2);
} }
} }
@ -222,11 +220,11 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
const half d2 = __hmul(dall, __int2half_rn(sc)); const half m2 = __hmul(dmin, __int2half_rn(m)); const half d2 = __hmul(dall, __int2half_rn(sc)); const half m2 = __hmul(dmin, __int2half_rn(m));
uint8_t hm = 1 << (2*il); uint8_t hm = 1 << (2*il);
y[ 0] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0))), m1)); y[ 0] = __hsub(__hmul(d1, __int2half_rn((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0))), m1);
y[ 1] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0))), m1)); y[ 1] = __hsub(__hmul(d1, __int2half_rn((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0))), m1);
hm <<= 1; hm <<= 1;
y[32] = convert_from_half<dst_t>(__hsub(__hmul(d2, __int2half_rn((ql[0] >> 4) + (qh[0] & hm ? 16 : 0))), m2)); y[32] = __hsub(__hmul(d2, __int2half_rn((ql[0] >> 4) + (qh[0] & hm ? 16 : 0))), m2);
y[33] = convert_from_half<dst_t>(__hsub(__hmul(d2, __int2half_rn((ql[1] >> 4) + (qh[1] & hm ? 16 : 0))), m2)); y[33] = __hsub(__hmul(d2, __int2half_rn((ql[1] >> 4) + (qh[1] & hm ? 16 : 0))), m2);
} }
template<typename dst_t> template<typename dst_t>
@ -249,10 +247,10 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
const uint8_t qh = x[i].qh[32*ip + il]; const uint8_t qh = x[i].qh[32*ip + il];
const int8_t * sc = x[i].scales + is; const int8_t * sc = x[i].scales + is;
y[ 0] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32)))); y[ 0] = __hmul(d, __int2half_rn(sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32)));
y[32] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32)))); y[32] = __hmul(d, __int2half_rn(sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32)));
y[64] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32)))); y[64] = __hmul(d, __int2half_rn(sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32)));
y[96] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32)))); y[96] = __hmul(d, __int2half_rn(sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32)));
} }
template<typename dst_t> template<typename dst_t>
@ -271,7 +269,7 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
const uint32_t aux32 = q2[2] | (q2[3] << 16); const uint32_t aux32 = q2[2] | (q2[3] << 16);
const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.25f; const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.25f;
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));
} }
template<typename dst_t> template<typename dst_t>
@ -288,7 +286,7 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511)); const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));
} }
@ -305,7 +303,7 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300))); const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));
} }
template<typename dst_t> template<typename dst_t>
@ -326,8 +324,8 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.5f; const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.5f;
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
for (int j = 0; j < 4; ++j) { for (int j = 0; j < 4; ++j) {
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); y[j+0] = __float2half(d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f));
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); y[j+4] = __float2half(d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f));
} }
} }
@ -347,8 +345,8 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f; const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f;
const uint8_t signs = x[i].signs[4*ib + il]; const uint8_t signs = x[i].signs[4*ib + il];
for (int j = 0; j < 4; ++j) { for (int j = 0; j < 4; ++j) {
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); y[j+0] = __float2half(d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f));
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); y[j+4] = __float2half(d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f));
} }
} }
@ -369,7 +367,7 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
grid32[0] &= 0x0f0f0f0f; grid32[0] &= 0x0f0f0f0f;
for (int j = 0; j < 8; ++j) { for (int j = 0; j < 8; ++j) {
y[j] = d * (q[j] + delta); y[j] = __float2half(d * (q[j] + delta));
} }
} }
@ -394,7 +392,7 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
grid32[0] &= 0x0f0f0f0f; grid32[0] &= 0x0f0f0f0f;
for (int j = 0; j < 8; ++j) { for (int j = 0; j < 8; ++j) {
y[j] = d * (q[j] + delta); y[j] = __float2half(d * (q[j] + delta));
} }
} }
@ -411,8 +409,8 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
const uint8_t * q4 = x[ib].qs + 4*il; const uint8_t * q4 = x[ib].qs + 4*il;
const float d = __half2float(x[ib].d); const float d = __half2float(x[ib].d);
for (int j = 0; j < 4; ++j) { for (int j = 0; j < 4; ++j) {
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; y[j+ 0] = __float2half(d * kvalues_iq4nl[q4[j] & 0xf]);
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; y[j+16] = __float2half(d * kvalues_iq4nl[q4[j] >> 4]);
} }
} }
@ -429,8 +427,8 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
const uint8_t * q4 = x[i].qs + 16*ib + 4*il; const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
const float d = __half2float(x[i].d) * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32); const float d = __half2float(x[i].d) * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
for (int j = 0; j < 4; ++j) { for (int j = 0; j < 4; ++j) {
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; y[j+ 0] = __float2half(d * kvalues_iq4nl[q4[j] & 0xf]);
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; y[j+16] = __float2half(d * kvalues_iq4nl[q4[j] >> 4]);
} }
} }
@ -524,8 +522,7 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k,
dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y); dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
} }
template<typename dst_t> static to_fp16_cuda_t ggml_get_to_fp16_cuda(int64_t type) {
static to_cuda_ggml_t<dst_t> ggml_get_to_cuda(int64_t type) {
switch (type) { switch (type) {
case 2: case 2:
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>; return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;

View File

@ -1063,8 +1063,7 @@ static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -
typedef half dfloat; // dequantize float typedef half dfloat; // dequantize float
typedef half2 dfloat2; typedef half2 dfloat2;
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
template<typename dst_t> typedef void (*to_fp16_cuda_t)(const void * __restrict__ x, dfloat * __restrict__ y, int k, cudaStream_t stream);
using to_cuda_ggml_t = void (*)(const void * __restrict__ x, dst_t * __restrict__ y, int k, cudaStream_t stream);
typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc); typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
typedef void (*load_tiles_cuda_t)( typedef void (*load_tiles_cuda_t)(
@ -1076,25 +1075,6 @@ typedef float (*vec_dot_q_mul_mat_cuda_t)(
// Utility function // Utility function
template<typename dst_t>
static __device__ __forceinline__ dst_t convert_from_half(half val) {
return val;
}
template<>
__device__ __forceinline__ c10::BFloat16 convert_from_half<c10::BFloat16>(half val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
return __float2bfloat16(__half2float(val));
#else
return __half2float(val);
#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
}
template<>
__device__ __forceinline__ float convert_from_half<float>(half val) {
return __half2float(val);
}
#if defined(USE_ROCM) #if defined(USE_ROCM)
#ifndef __has_builtin #ifndef __has_builtin

View File

@ -71,19 +71,14 @@ static void quantize_row_q8_1_cuda(const scalar_t* x, void* vy, const int kx,
} }
torch::Tensor ggml_dequantize(torch::Tensor W, // quant weight torch::Tensor ggml_dequantize(torch::Tensor W, // quant weight
int64_t type, int64_t m, int64_t n, int64_t type, int64_t m, int64_t n) {
std::optional<at::ScalarType> const& dtype) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(W)); const at::cuda::OptionalCUDAGuard device_guard(device_of(W));
auto dtype_ = dtype.value_or(torch::kFloat16); auto options =
auto options = torch::TensorOptions().dtype(dtype_).device(W.device()); torch::TensorOptions().dtype(torch::kFloat16).device(W.device());
at::Tensor DW = torch::empty({m, n}, options); at::Tensor DW = torch::empty({m, n}, options);
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(type);
VLLM_DISPATCH_FLOATING_TYPES(DW.scalar_type(), "ggml_dequantize", [&] { to_fp16_cuda((void*)W.data_ptr(), (half*)DW.data_ptr(), m * n, stream);
auto to_cuda = ggml_get_to_cuda<scalar_t>(type);
to_cuda((void*)W.data_ptr(), (scalar_t*)DW.data_ptr(), m * n, stream);
});
return DW; return DW;
} }

View File

@ -129,7 +129,7 @@ static __device__ __forceinline__ void moe_q(
} }
#if defined(USE_ROCM) #if defined(USE_ROCM)
#define MOE_X_Q4_0 8 #define MOE_X_Q4_0 64
#define MOE_Y_Q4_0 128 #define MOE_Y_Q4_0 128
#define NWARPS_Q4_0 8 #define NWARPS_Q4_0 8
#else #else
@ -190,7 +190,7 @@ static void ggml_moe_q4_0_q8_1_cuda(
} }
#if defined(USE_ROCM) #if defined(USE_ROCM)
#define MOE_X_Q4_1 8 #define MOE_X_Q4_1 64
#define MOE_Y_Q4_1 128 #define MOE_Y_Q4_1 128
#define NWARPS_Q4_1 8 #define NWARPS_Q4_1 8
#else #else
@ -251,7 +251,7 @@ static void ggml_moe_q4_1_q8_1_cuda(
} }
#if defined(USE_ROCM) #if defined(USE_ROCM)
#define MOE_X_Q5_0 8 #define MOE_X_Q5_0 64
#define MOE_Y_Q5_0 128 #define MOE_Y_Q5_0 128
#define NWARPS_Q5_0 8 #define NWARPS_Q5_0 8
#else #else
@ -312,7 +312,7 @@ static void ggml_moe_q5_0_q8_1_cuda(
} }
#if defined(USE_ROCM) #if defined(USE_ROCM)
#define MOE_X_Q5_1 8 #define MOE_X_Q5_1 64
#define MOE_Y_Q5_1 128 #define MOE_Y_Q5_1 128
#define NWARPS_Q5_1 8 #define NWARPS_Q5_1 8
#else #else
@ -373,7 +373,7 @@ static void ggml_moe_q5_1_q8_1_cuda(
} }
#if defined(USE_ROCM) #if defined(USE_ROCM)
#define MOE_X_Q8_0 8 #define MOE_X_Q8_0 64
#define MOE_Y_Q8_0 128 #define MOE_Y_Q8_0 128
#define NWARPS_Q8_0 8 #define NWARPS_Q8_0 8
#else #else
@ -434,7 +434,7 @@ static void ggml_moe_q8_0_q8_1_cuda(
} }
#if defined(USE_ROCM) #if defined(USE_ROCM)
#define MOE_X_Q2_K 8 #define MOE_X_Q2_K 64
#define MOE_Y_Q2_K 128 #define MOE_Y_Q2_K 128
#define NWARPS_Q2_K 8 #define NWARPS_Q2_K 8
#else #else
@ -495,7 +495,7 @@ static void ggml_moe_q2_K_q8_1_cuda(
} }
#if defined(USE_ROCM) #if defined(USE_ROCM)
#define MOE_X_Q3_K 8 #define MOE_X_Q3_K 64
#define MOE_Y_Q3_K 128 #define MOE_Y_Q3_K 128
#define NWARPS_Q3_K 8 #define NWARPS_Q3_K 8
#else #else
@ -556,7 +556,7 @@ static void ggml_moe_q3_K_q8_1_cuda(
} }
#if defined(USE_ROCM) #if defined(USE_ROCM)
#define MOE_X_Q4_K 8 #define MOE_X_Q4_K 64
#define MOE_Y_Q4_K 128 #define MOE_Y_Q4_K 128
#define NWARPS_Q4_K 8 #define NWARPS_Q4_K 8
#else #else
@ -617,7 +617,7 @@ static void ggml_moe_q4_K_q8_1_cuda(
} }
#if defined(USE_ROCM) #if defined(USE_ROCM)
#define MOE_X_Q5_K 8 #define MOE_X_Q5_K 64
#define MOE_Y_Q5_K 128 #define MOE_Y_Q5_K 128
#define NWARPS_Q5_K 8 #define NWARPS_Q5_K 8
#else #else
@ -678,7 +678,7 @@ static void ggml_moe_q5_K_q8_1_cuda(
} }
#if defined(USE_ROCM) #if defined(USE_ROCM)
#define MOE_X_Q6_K 8 #define MOE_X_Q6_K 64
#define MOE_Y_Q6_K 128 #define MOE_Y_Q6_K 128
#define NWARPS_Q6_K 8 #define NWARPS_Q6_K 8
#else #else

View File

@ -1785,7 +1785,7 @@ __global__ void Marlin(
<<<blocks, NUM_THREADS, max_shared_mem, stream>>>( \ <<<blocks, NUM_THREADS, max_shared_mem, stream>>>( \
A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \ A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \
num_groups, prob_m, prob_n, prob_k, lda, locks, \ num_groups, prob_m, prob_n, prob_k, lda, locks, \
part_use_atomic_add, use_fp32_reduce); \ use_atomic_add, use_fp32_reduce); \
} \ } \
} }
@ -2215,10 +2215,6 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
thread_m_blocks = exec_cfg.max_m_blocks; thread_m_blocks = exec_cfg.max_m_blocks;
} }
// atomic add reduce have better performance only when m * n is small
bool part_use_atomic_add =
use_atomic_add && div_ceil(prob_m, 64) * prob_n <= 2048;
if (false) { if (false) {
} }
GPTQ_CALL_IF(vllm::kU4B8, 16, 4, 256) GPTQ_CALL_IF(vllm::kU4B8, 16, 4, 256)

View File

@ -9,11 +9,7 @@
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <iostream> #include <iostream>
#ifndef MARLIN_NAMESPACE_NAME namespace marlin {
#define MARLIN_NAMESPACE_NAME marlin
#endif
namespace MARLIN_NAMESPACE_NAME {
// Marlin params // Marlin params
@ -27,7 +23,6 @@ static constexpr int pipe_stages =
static constexpr int min_thread_n = 64; static constexpr int min_thread_n = 64;
static constexpr int min_thread_k = 64; static constexpr int min_thread_k = 64;
static constexpr int max_thread_n = 256;
static constexpr int tile_size = 16; static constexpr int tile_size = 16;
static constexpr int max_par = 16; static constexpr int max_par = 16;
@ -89,4 +84,4 @@ __device__ inline void cp_async_wait() {
#endif #endif
} // namespace MARLIN_NAMESPACE_NAME } // namespace marlin

View File

@ -5,11 +5,7 @@
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <cuda_bf16.h> #include <cuda_bf16.h>
#ifndef MARLIN_NAMESPACE_NAME namespace marlin {
#define MARLIN_NAMESPACE_NAME marlin
#endif
namespace MARLIN_NAMESPACE_NAME {
template <typename scalar_t> template <typename scalar_t>
class ScalarType {}; class ScalarType {};
@ -58,7 +54,7 @@ class ScalarType<nv_bfloat16> {
using FragS = Vec<nv_bfloat162, 1>; using FragS = Vec<nv_bfloat162, 1>;
using FragZP = Vec<nv_bfloat162, 4>; using FragZP = Vec<nv_bfloat162, 4>;
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
static __device__ float inline num2float(const nv_bfloat16 x) { static __device__ float inline num2float(const nv_bfloat16 x) {
return __bfloat162float(x); return __bfloat162float(x);
} }
@ -78,6 +74,6 @@ class ScalarType<nv_bfloat16> {
#endif #endif
}; };
} // namespace MARLIN_NAMESPACE_NAME } // namespace marlin
#endif #endif

View File

@ -1,59 +0,0 @@
#pragma once
/**
* Quantization utilities including:
* Adjusted maximum values for qtypes.
* Minimum scaling factors for qtypes.
*/
#include <cmath>
#include <torch/types.h>
#ifndef USE_ROCM
#include <c10/util/Float8_e4m3fn.h>
#define MAYBE_HOST_DEVICE C10_HOST_DEVICE
#else
#include <ATen/hip/HIPContext.h>
#include <c10/util/Float8_e4m3fn.h>
#include <c10/util/Float8_e4m3fnuz.h>
// ROCm doesn't seem to need C10_HOST_DEVICE for static constexpr
#define MAYBE_HOST_DEVICE
#endif
template <typename T,
typename = std::enable_if_t<std::is_same_v<T, c10::Float8_e4m3fn> ||
std::is_same_v<T, c10::Float8_e4m3fnuz> ||
std::is_same_v<T, int8_t>>>
struct quant_type_max {
static constexpr T val() { return std::numeric_limits<T>::max(); }
};
// Using the default max value from pytorch (240.0 0x7F) will cause accuracy
// issues when running dynamic quantization. Here use 224.0 0x7E for rocm.
template <>
struct quant_type_max<c10::Float8_e4m3fnuz> {
static constexpr c10::Float8_e4m3fnuz val() {
return c10::Float8_e4m3fnuz(0x7E, c10::Float8_e4m3fnuz::from_bits());
}
};
template <typename T>
MAYBE_HOST_DEVICE static constexpr T quant_type_max_v =
quant_type_max<T>::val();
template <typename T,
typename = std::enable_if_t<std::is_same_v<T, c10::Float8_e4m3fn> ||
std::is_same_v<T, c10::Float8_e4m3fnuz> ||
std::is_same_v<T, int8_t>>>
struct min_scaling_factor {
C10_DEVICE C10_ALWAYS_INLINE static float val() {
return 1.0f / (quant_type_max_v<T> * 512.0f);
}
};
template <>
struct min_scaling_factor<int8_t> {
C10_DEVICE C10_ALWAYS_INLINE static float val() {
return std::numeric_limits<float>::epsilon();
}
};

View File

@ -272,7 +272,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
const float scale, const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ context_lens, // [num_seqs] const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_blocks_per_seq, const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads] const float* __restrict__ alibi_slopes, // [num_heads]
const int q_stride, const int q_stride,
@ -292,13 +291,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
const int rowid = laneid / 16; const int rowid = laneid / 16;
const auto seq_idx = blockIdx.x; const auto seq_idx = blockIdx.x;
// NOTE queries with sequence len > 1 are prefills and taken care by another
// kernel.
if (query_start_loc_ptr != nullptr &&
(query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx]) != 1) {
return;
}
const auto partition_idx = blockIdx.y; const auto partition_idx = blockIdx.y;
constexpr int T_PAR_SIZE = 256; // token partition size set to 256 constexpr int T_PAR_SIZE = 256; // token partition size set to 256
@ -385,10 +377,9 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
// fetch Q in shared across warps and then write to registers // fetch Q in shared across warps and then write to registers
const int local_qhead_idx = 4 * warpid + rowid; const int local_qhead_idx = 4 * warpid + rowid;
const int global_qhead_idx = wg_start_head_idx + local_qhead_idx; const int global_qhead_idx = wg_start_head_idx + local_qhead_idx;
const int64_t query_start_off = static_cast<int64_t>( const int64_t seq_idx64 = static_cast<int64_t>(seq_idx);
query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
const scalar_t* q_ptr = const scalar_t* q_ptr =
q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE; q + seq_idx64 * q_stride + global_qhead_idx * HEAD_SIZE;
const int qhead_element = lane16id * CONTIGUOUS_SCALAR_ELEMS_16B; const int qhead_element = lane16id * CONTIGUOUS_SCALAR_ELEMS_16B;
if ((local_qhead_idx < GQA_RATIO) && (qhead_element < HEAD_SIZE)) { if ((local_qhead_idx < GQA_RATIO) && (qhead_element < HEAD_SIZE)) {
@ -786,7 +777,6 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
const float scale, const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ context_lens, // [num_seqs] const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_blocks_per_seq, const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads] const float* __restrict__ alibi_slopes, // [num_heads]
const int q_stride, const int q_stride,
@ -804,12 +794,6 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
const int lane4id = laneid % 4; const int lane4id = laneid % 4;
const auto seq_idx = blockIdx.x; const auto seq_idx = blockIdx.x;
// NOTE queries with sequence len > 1 are prefills and taken care by another
// kernel.
if (query_start_loc_ptr != nullptr &&
(query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) {
return;
}
const auto partition_idx = blockIdx.y; const auto partition_idx = blockIdx.y;
const auto partition_size = blockDim.x; const auto partition_size = blockDim.x;
const auto max_num_partitions = gridDim.y; const auto max_num_partitions = gridDim.y;
@ -898,11 +882,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
} }
// fetch q elements // fetch q elements
// every 4 lanes fetch 8 elems, so warp fetches 8*16 = 128 elemsc // every 4 lanes fetch 8 elems, so warp fetches 8*16 = 128 elems
const int64_t query_start_off = static_cast<int64_t>(
query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
const scalar_t* q_ptr = const scalar_t* q_ptr =
q + query_start_off * q_stride + wg_start_head_idx * HEAD_SIZE; q + seq_idx * q_stride + wg_start_head_idx * HEAD_SIZE;
const _B16x8* q_ptrh8 = reinterpret_cast<const _B16x8*>(q_ptr); const _B16x8* q_ptrh8 = reinterpret_cast<const _B16x8*>(q_ptr);
const int qhead_elemh8 = laneid / 4; const int qhead_elemh8 = laneid / 4;
@ -1285,19 +1267,10 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
// max_num_partitions, head_size] // max_num_partitions, head_size]
const int* __restrict__ context_lens, // [num_seqs] const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_partitions) { const int max_num_partitions) {
const auto num_heads = gridDim.x; const auto num_heads = gridDim.x;
const auto head_idx = blockIdx.x; const auto head_idx = blockIdx.x;
const auto seq_idx = blockIdx.y; const auto seq_idx = blockIdx.y;
// NOTE queries with sequence len > 1 are prefills and taken care by another
// kernel.
if (query_start_loc_ptr != nullptr &&
(query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) {
return;
}
const int context_len = context_lens[seq_idx]; const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
[[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
@ -1466,9 +1439,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
__fdividef(1.0f, shared_global_exp_sum + 1e-6f); __fdividef(1.0f, shared_global_exp_sum + 1e-6f);
acc *= inv_global_exp_sum; acc *= inv_global_exp_sum;
const int64_t query_start_off = static_cast<int64_t>( OUTT* out_ptr = out + static_cast<int64_t>(seq_idx) * num_heads * HEAD_SIZE +
query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
OUTT* out_ptr = out + query_start_off * num_heads * HEAD_SIZE +
static_cast<int64_t>(head_idx) * HEAD_SIZE; static_cast<int64_t>(head_idx) * HEAD_SIZE;
if constexpr (std::is_same<OUTT, bit8_t>::value) { if constexpr (std::is_same<OUTT, bit8_t>::value) {
out_ptr[threadIdx.x] = out_ptr[threadIdx.x] =
@ -1495,7 +1466,6 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel(
const float scale, const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ context_lens, // [num_seqs] const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_blocks_per_seq, const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads] const float* __restrict__ alibi_slopes, // [num_heads]
const int q_stride, const int q_stride,
@ -1522,7 +1492,6 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
const float scale, const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ context_lens, // [num_seqs] const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_blocks_per_seq, const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads] const float* __restrict__ alibi_slopes, // [num_heads]
const int q_stride, const int q_stride,
@ -1546,7 +1515,6 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
const int* __restrict__ context_lens, // [num_seqs] const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_partitions) { const int max_num_partitions) {
UNREACHABLE_CODE UNREACHABLE_CODE
} }
@ -1554,34 +1522,34 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support #endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
#define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO) \ #define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO) \
paged_attention_ll4mi_QKV_mfma16_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE, \ paged_attention_ll4mi_QKV_mfma16_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE, \
HEAD_SIZE, NTHR, ALIBI_ENABLED, \ HEAD_SIZE, NTHR, ALIBI_ENABLED, \
GQA_RATIO> \ GQA_RATIO> \
<<<grid, block, 0, stream>>>( \ <<<grid, block, 0, stream>>>( \
query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \ block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq, \
max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \ alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \
kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \ exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, max_ctx_blocks, \
max_ctx_blocks, k_scale_ptr, v_scale_ptr); k_scale_ptr, v_scale_ptr);
#define LAUNCH_CUSTOM_ATTENTION_MFMA4(GQA_RATIO) \ #define LAUNCH_CUSTOM_ATTENTION_MFMA4(GQA_RATIO) \
paged_attention_ll4mi_QKV_mfma4_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE, \ paged_attention_ll4mi_QKV_mfma4_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE, \
HEAD_SIZE, NTHR, ALIBI_ENABLED, \ HEAD_SIZE, NTHR, ALIBI_ENABLED, \
GQA_RATIO> \ GQA_RATIO> \
<<<grid, block, 0, stream>>>( \ <<<grid, block, 0, stream>>>( \
query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \ block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq, \
max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \ alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \
kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \ exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, max_ctx_blocks, \
max_ctx_blocks, k_scale_ptr, v_scale_ptr); k_scale_ptr, v_scale_ptr);
#define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS) \ #define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS) \
paged_attention_ll4mi_reduce_kernel<T, OUTT, HEAD_SIZE, HEAD_SIZE, \ paged_attention_ll4mi_reduce_kernel<T, OUTT, HEAD_SIZE, HEAD_SIZE, \
PARTITION_SIZE, NPAR_LOOPS> \ PARTITION_SIZE, NPAR_LOOPS> \
<<<reduce_grid, reduce_block, 0, stream>>>( \ <<<reduce_grid, reduce_block, 0, stream>>>( \
out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, \ out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, \
context_lens_ptr, query_start_loc_ptr, max_num_partitions); context_lens_ptr, max_num_partitions);
template <typename T, typename KVT, vllm::Fp8KVCacheDataType KV_DTYPE, template <typename T, typename KVT, vllm::Fp8KVCacheDataType KV_DTYPE,
int BLOCK_SIZE, int HEAD_SIZE, typename OUTT, int PARTITION_SIZE_OLD, int BLOCK_SIZE, int HEAD_SIZE, typename OUTT, int PARTITION_SIZE_OLD,
@ -1591,10 +1559,9 @@ void paged_attention_custom_launcher(
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, const int num_kv_heads, float scale, torch::Tensor& value_cache, const int num_kv_heads, float scale,
torch::Tensor& block_tables, torch::Tensor& context_lens, torch::Tensor& block_tables, torch::Tensor& context_lens,
const std::optional<torch::Tensor>& query_start_loc, int max_context_len, int max_context_len, const std::optional<torch::Tensor>& alibi_slopes,
const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale, torch::Tensor& k_scale, torch::Tensor& v_scale) {
torch::Tensor& v_scale) { int num_seqs = query.size(0);
int num_seqs = block_tables.size(0);
int num_heads = query.size(1); int num_heads = query.size(1);
int head_size = query.size(2); int head_size = query.size(2);
int max_num_blocks_per_seq = block_tables.size(1); int max_num_blocks_per_seq = block_tables.size(1);
@ -1602,13 +1569,6 @@ void paged_attention_custom_launcher(
int kv_block_stride = key_cache.stride(0); int kv_block_stride = key_cache.stride(0);
int kv_head_stride = key_cache.stride(1); int kv_head_stride = key_cache.stride(1);
// NOTE: query start location is optional for V0 decode should not be used.
// If batch contains mix of prefills and decode, prefills should be skipped.
const int* query_start_loc_ptr =
query_start_loc
? reinterpret_cast<const int*>(query_start_loc.value().data_ptr())
: nullptr;
// NOTE: alibi_slopes is optional. // NOTE: alibi_slopes is optional.
const float* alibi_slopes_ptr = const float* alibi_slopes_ptr =
alibi_slopes alibi_slopes
@ -1740,8 +1700,8 @@ void paged_attention_custom_launcher(
paged_attention_custom_launcher<T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, T, \ paged_attention_custom_launcher<T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, T, \
PSIZE, ALIBI_ENABLED>( \ PSIZE, ALIBI_ENABLED>( \
out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
num_kv_heads, scale, block_tables, context_lens, query_start_loc, \ num_kv_heads, scale, block_tables, context_lens, max_context_len, \
max_context_len, alibi_slopes, k_scale, v_scale); alibi_slopes, k_scale, v_scale);
#define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \ #define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \
PSIZE) \ PSIZE) \
@ -1790,7 +1750,6 @@ void paged_attention(
double scale, double scale,
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
torch::Tensor& context_lens, // [num_seqs] torch::Tensor& context_lens, // [num_seqs]
const std::optional<torch::Tensor>& query_start_loc, // [num_seqs]
int64_t block_size, int64_t max_context_len, int64_t block_size, int64_t max_context_len,
const std::optional<torch::Tensor>& alibi_slopes, const std::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, torch::Tensor& k_scale, const std::string& kv_cache_dtype, torch::Tensor& k_scale,

View File

@ -7,9 +7,8 @@ void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int64_t num_kv_heads, torch::Tensor& value_cache, int64_t num_kv_heads,
double scale, torch::Tensor& block_tables, double scale, torch::Tensor& block_tables,
torch::Tensor& context_lens, torch::Tensor& context_lens, int64_t block_size,
const std::optional<torch::Tensor>& query_start_loc, int64_t max_context_len,
int64_t block_size, int64_t max_context_len,
const std::optional<torch::Tensor>& alibi_slopes, const std::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, torch::Tensor& k_scale, const std::string& kv_cache_dtype, torch::Tensor& k_scale,
torch::Tensor& v_scale); torch::Tensor& v_scale);

View File

@ -23,9 +23,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
" Tensor query, Tensor key_cache," " Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads," " Tensor value_cache, int num_kv_heads,"
" float scale, Tensor block_tables," " float scale, Tensor block_tables,"
" Tensor context_lens," " Tensor context_lens, int block_size,"
" Tensor? query_start_loc,"
" int block_size,"
" int max_context_len," " int max_context_len,"
" Tensor? alibi_slopes," " Tensor? alibi_slopes,"
" str kv_cache_dtype," " str kv_cache_dtype,"

View File

@ -31,10 +31,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("weak_ref_tensor(Tensor input) -> Tensor"); ops.def("weak_ref_tensor(Tensor input) -> Tensor");
ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor); ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
ops.def("get_cuda_view_from_cpu_tensor(Tensor cpu_tensor) -> Tensor");
ops.impl("get_cuda_view_from_cpu_tensor", torch::kCPU,
&get_cuda_view_from_cpu_tensor);
// Attention ops // Attention ops
// Compute the attention between an input query and the cached // Compute the attention between an input query and the cached
// keys/values using PagedAttention. // keys/values using PagedAttention.
@ -64,21 +60,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" int blocksparse_head_sliding_step) -> ()"); " int blocksparse_head_sliding_step) -> ()");
ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2); ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2);
#ifndef USE_ROCM
// Merge attn states
// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
// can be used to combine partial attention results (in the split-KV case)
ops.def(
"merge_attn_states("
" Tensor! output,"
" Tensor!? output_lse,"
" Tensor prefix_output,"
" Tensor prefix_lse,"
" Tensor suffix_output,"
" Tensor suffix_lse) -> ()");
ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states);
#endif
// Activation ops // Activation ops
// Activation function used in SwiGLU. // Activation function used in SwiGLU.
ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()"); ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
@ -310,9 +291,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
#endif #endif
// Dequantization for GGML. // Dequantization for GGML.
ops.def( ops.def("ggml_dequantize(Tensor W, int type, SymInt m, SymInt n) -> Tensor");
"ggml_dequantize(Tensor W, int type, SymInt m, SymInt n, ScalarType? "
"dtype) -> Tensor");
ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize); ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);
// mmvq kernel for GGML. // mmvq kernel for GGML.
@ -631,11 +610,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
&get_max_shared_memory_per_block_device_attribute); &get_max_shared_memory_per_block_device_attribute);
} }
#ifndef USE_ROCM
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
// Custom all-reduce kernels // Custom all-reduce kernels
custom_ar.def( custom_ar.def(
"init_custom_ar(int[] ipc_tensors, Tensor rank_data, " "init_custom_ar(int[] ipc_tensors, Tensor rank_data, "
"int rank, bool fully_connected) -> int"); "int rank, bool full_nvlink) -> int");
custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar); custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
custom_ar.def( custom_ar.def(
"all_reduce(int fa, Tensor inp, Tensor! out, int reg_buffer, " "all_reduce(int fa, Tensor inp, Tensor! out, int reg_buffer, "
@ -648,13 +628,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
custom_ar.def("register_buffer", &register_buffer); custom_ar.def("register_buffer", &register_buffer);
custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta); custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
custom_ar.def("register_graph_buffers", &register_graph_buffers); custom_ar.def("register_graph_buffers", &register_graph_buffers);
custom_ar.def("allocate_shared_buffer_and_handle",
&allocate_shared_buffer_and_handle);
custom_ar.def("open_mem_handle(Tensor mem_handle) -> int", &open_mem_handle);
custom_ar.impl("open_mem_handle", torch::kCPU, &open_mem_handle);
custom_ar.def("free_shared_buffer", &free_shared_buffer);
} }
#endif
REGISTER_EXTENSION(TORCH_EXTENSION_NAME) REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

View File

@ -1,142 +0,0 @@
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
#
# Build targets:
# vllm-openai (default): used for serving deployment
# vllm-test: used for CI tests
# vllm-dev: used for development
#
# Build arguments:
# PYTHON_VERSION=3.12 (default)|3.11|3.10|3.9
# VLLM_CPU_DISABLE_AVX512=false (default)|true
#
######################### BASE IMAGE #########################
FROM ubuntu:22.04 AS base
WORKDIR /workspace/
ARG PYTHON_VERSION=3.12
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV LD_PRELOAD=""
# Install minimal dependencies and uv
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get update -y \
&& apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
&& curl -LsSf https://astral.sh/uv/install.sh | sh
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
ENV PATH="/root/.local/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv"
ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV UV_HTTP_TIMEOUT=500
# Install Python dependencies
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
ENV UV_INDEX_STRATEGY="unsafe-best-match"
ENV UV_LINK_MODE="copy"
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
uv pip install --upgrade pip && \
uv pip install -r requirements/cpu.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install intel-openmp==2024.2.1 intel_extension_for_pytorch==2.6.0
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
RUN echo 'ulimit -c 0' >> ~/.bashrc
######################### BUILD IMAGE #########################
FROM base AS vllm-build
ARG GIT_REPO_CHECK=0
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
uv pip install -r requirements/build.txt
COPY . .
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
######################### DEV IMAGE #########################
FROM vllm-build AS vllm-dev
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get install -y --no-install-recommends vim numactl
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py develop
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/dev.txt && \
pre-commit install --hook-type pre-commit --hook-type commit-msg
ENTRYPOINT ["bash"]
######################### TEST IMAGE #########################
FROM base AS vllm-test
WORKDIR /workspace/
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/test.txt,target=requirements/test.txt \
uv pip install -r requirements/test.txt
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
uv pip install dist/*.whl
ADD ./tests/ ./tests/
ADD ./examples/ ./examples/
ADD ./benchmarks/ ./benchmarks/
ADD ./vllm/collect_env.py .
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils
ENTRYPOINT ["bash"]
######################### RELEASE IMAGE #########################
FROM base AS vllm-openai
WORKDIR /workspace/
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
uv pip install dist/*.whl
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -2,42 +2,19 @@
## Build the docs ## Build the docs
- Make sure in `docs` directory
```bash
cd docs
```
- Install the dependencies:
```bash ```bash
# Install dependencies.
pip install -r ../requirements/docs.txt pip install -r ../requirements/docs.txt
```
- Clean the previous build (optional but recommended): # Build the docs.
```bash
make clean make clean
```
- Generate the HTML documentation:
```bash
make html make html
``` ```
## Open the docs with your browser ## Open the docs with your browser
- Serve the documentation locally:
```bash ```bash
python -m http.server -d build/html/ python -m http.server -d build/html/
``` ```
This will start a local server at http://localhost:8000. You can now open your browser and view the documentation. Launch your browser and open localhost:8000.
If port 8000 is already in use, you can specify a different port, for example:
```bash
python -m http.server 3000 -d build/html/
```

Binary file not shown.

Before

Width:  |  Height:  |  Size: 68 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 47 KiB

After

Width:  |  Height:  |  Size: 34 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 50 KiB

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 59 KiB

After

Width:  |  Height:  |  Size: 41 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 54 KiB

After

Width:  |  Height:  |  Size: 39 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 54 KiB

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 55 KiB

After

Width:  |  Height:  |  Size: 32 KiB

View File

@ -4,9 +4,6 @@
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
- [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama), March 27th 2025. [[Slides]](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [The first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg), March 16th 2025. [[Slides]](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0) - [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0)
- [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) - [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing)
- [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing) - [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing)

View File

@ -22,7 +22,6 @@ Compute Resources:
- Databricks - Databricks
- DeepInfra - DeepInfra
- Google Cloud - Google Cloud
- Intel
- Lambda Lab - Lambda Lab
- Nebius - Nebius
- Novita AI - Novita AI

View File

@ -104,7 +104,7 @@ myst_url_schemes = {
"classes": ["github"], "classes": ["github"],
}, },
"gh-project": { "gh-project": {
"url": "https://github.com/orgs/vllm-project/projects/{{path}}", "url": "https://github.com/vllm-project/projects/{{path}}",
"title": "Project #{{path}}", "title": "Project #{{path}}",
"classes": ["github"], "classes": ["github"],
}, },

View File

@ -1,6 +1,6 @@
# Dockerfile # Dockerfile
We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM. We provide a <gh-file:Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
More information about deploying with Docker can be found [here](#deployment-docker). More information about deploying with Docker can be found [here](#deployment-docker).
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
@ -28,7 +28,7 @@ The edges of the build graph represent:
> Commands to regenerate the build graph (make sure to run it **from the \`root\` directory of the vLLM repository** where the dockerfile is present): > Commands to regenerate the build graph (make sure to run it **from the \`root\` directory of the vLLM repository** where the dockerfile is present):
> >
> ```bash > ```bash
> dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename docker/Dockerfile > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile
> ``` > ```
> >
> or in case you want to run it directly with the docker image: > or in case you want to run it directly with the docker image:
@ -43,7 +43,7 @@ The edges of the build graph represent:
> --output png \ > --output png \
> --dpi 200 \ > --dpi 200 \
> --max-label-length 50 \ > --max-label-length 50 \
> --filename docker/Dockerfile \ > --filename Dockerfile \
> --legend > --legend
> ``` > ```
> >

View File

@ -79,17 +79,6 @@ Further update the model as follows:
return inputs_embeds return inputs_embeds
``` ```
- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model` getter to provide stable access to the underlying language model.
```python
class YourModelForImage2Seq(nn.Module):
...
def get_language_model(self) -> torch.nn.Module:
# Change `language_model` according to your implementation.
return self.language_model
```
- Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. - Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
```diff ```diff
@ -121,21 +110,17 @@ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None, "video": 1} return {"image": None, "video": 1}
``` ```
## 3. Specify dummy inputs ### Maximum number of placeholder feature tokens
Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for Also, override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`
HF processing as well as memory profiling. to return the maximum number of placeholder feature tokens per input item for each modality.
### For memory profiling When calling the model, the output embeddings from the visual encoder are assigned to the input positions
containing placeholder feature tokens. Therefore, the number of placeholder feature tokens should be equal
to the size of the output embeddings.
Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs` :::::{tab-set}
to construct dummy inputs for memory profiling. This dummy input should result in the worst-case memory usage of ::::{tab-item} Basic example: LLaVA
the model so that vLLM can reserve the correct amount of memory for it.
Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens.
::::{tab-set}
:::{tab-item} Basic example: LLaVA
:sync: llava :sync: llava
Looking at the code of HF's `LlavaForConditionalGeneration`: Looking at the code of HF's `LlavaForConditionalGeneration`:
@ -244,7 +229,7 @@ def get_num_image_tokens(
``` ```
Notice that the number of image tokens doesn't depend on the image width and height. Notice that the number of image tokens doesn't depend on the image width and height.
We can simply use a dummy `image_size`: So, we can calculate the maximum number of image tokens using any image size:
```python ```python
def get_image_size_with_most_features(self) -> ImageSize: def get_image_size_with_most_features(self) -> ImageSize:
@ -252,35 +237,33 @@ def get_image_size_with_most_features(self) -> ImageSize:
width = height = hf_config.image_size width = height = hf_config.image_size
return ImageSize(width=width, height=height) return ImageSize(width=width, height=height)
def get_dummy_processor_inputs( def get_max_image_tokens(self) -> int:
self, target_width, target_height = self.get_image_size_with_most_features()
seq_len: int,
mm_counts: Mapping[str, int],
) -> ProcessorInputs:
num_images = mm_counts.get("image", 0)
processor = self.info.get_hf_processor() return self.get_num_image_tokens(
image_token = processor.image_token image_width=target_width,
image_height=target_height,
hf_config = self.get_hf_config()
target_width, target_height = self.info.get_image_size_with_most_features()
mm_data = {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images)
}
return ProcessorInputs(
prompt_text=image_token * num_images,
mm_data=mm_data,
) )
``` ```
And thus, we can override the method as:
```python
def get_mm_max_tokens_per_item(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> Mapping[str, int]:
return {"image": self.get_max_image_tokens()}
```
:::{note}
Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP.
::: :::
:::{tab-item} No input placeholders: Fuyu ::::
::::{tab-item} Non-consecutive feature tokens: Fuyu
:sync: fuyu :sync: fuyu
Looking at the code of HF's `FuyuForCausalLM`: Looking at the code of HF's `FuyuForCausalLM`:
@ -400,16 +383,188 @@ num_patches_per_dim_w = image_width // patch_width
num_patches = num_patches_per_dim_h * num_patches_per_dim_w num_patches = num_patches_per_dim_h * num_patches_per_dim_w
``` ```
These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized We can calculate this in vLLM using this code:
to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.
```python
def get_num_image_patches(
self,
*,
image_width: int,
image_height: int,
) -> int:
image_processor = self.get_image_processor()
target_width = image_processor.size["width"]
target_height = image_processor.size["height"]
patch_width = image_processor.patch_size["width"]
patch_height = image_processor.patch_size["height"]
if not (image_width <= target_width and image_height <= target_height):
height_scale_factor = target_height / image_height
width_scale_factor = target_width / image_width
optimal_scale_factor = min(height_scale_factor, width_scale_factor)
image_height = int(image_height * optimal_scale_factor)
image_width = int(image_width * optimal_scale_factor)
ncols = math.ceil(image_width / patch_width)
nrows = math.ceil(image_height / patch_height)
return ncols * nrows
```
These image patches correspond to placeholder tokens (`|SPEAKER|`). However, the processor also
inserts newline tokens (`|NEWLINE|`) as shown here:
```python
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L654-L670
tensor_of_image_ids = torch.full(
[num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
)
patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
assert num_patches == patches.shape[0]
if variable_sized:
# Now terminate each line with |NEWLINE|.
tensor_of_image_ids = tensor_of_image_ids.reshape(-1, image_width // patch_width)
newline_ids = torch.full(
[tensor_of_image_ids.shape[0], 1],
image_newline_id,
dtype=torch.int32,
device=image_input.device,
)
tensor_of_image_ids = torch.cat([tensor_of_image_ids, newline_ids], dim=1)
tensor_of_image_ids = tensor_of_image_ids.reshape(-1)
```
So, the layout of tokens for an image is:
```
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
...
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
```
This makes the placeholder tokens non-consecutive in the prompt.
Since vLLM requires the feature tokens to be consecutive, **we also treat the newline tokens as feature tokens**.
So overall, the total number of feature tokens is
```python
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
) -> int:
image_processor = self.get_image_processor()
target_width = image_processor.size["width"]
target_height = image_processor.size["height"]
patch_width = image_processor.patch_size["width"]
patch_height = image_processor.patch_size["height"]
if not (image_width <= target_width and image_height <= target_height):
height_scale_factor = target_height / image_height
width_scale_factor = target_width / image_width
optimal_scale_factor = min(height_scale_factor, width_scale_factor)
image_height = int(image_height * optimal_scale_factor)
image_width = int(image_width * optimal_scale_factor)
ncols = math.ceil(image_width / patch_width)
nrows = math.ceil(image_height / patch_height)
return (ncols + 1) * nrows
```
To calculate the maximum number of image tokens, recall that input images are first resized
to fit within `image_processor.size`. The maximum possible dimensions of the image before
being converted into patches is therefore equal to `image_processor.size`.
```python ```python
def get_image_size_with_most_features(self) -> ImageSize: def get_image_size_with_most_features(self) -> ImageSize:
image_processor = self.get_image_processor() image_processor = self.get_image_processor()
return ImageSize(width=image_processor.size["width"], return ImageSize(width=image_processor.size["width"],
height=image_processor.size["height"]) height=image_processor.size["height"])
def get_max_image_tokens(self) -> int:
target_width, target_height = self.get_image_size_with_most_features()
return self.get_num_image_tokens(
image_width=target_width,
image_height=target_height,
)
``` ```
And thus, we can override the method as:
```python
def get_mm_max_tokens_per_item(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> Mapping[str, int]:
return {"image": self.get_max_image_tokens()}
```
:::{note}
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) returns `ncols` and `nrows` directly instead of the total token count.
This is because `ncols` and `nrows` are used to specify the layout of the feature tokens (as shown in Step 4 of this guide).
:::
::::
:::::
## 3. Specify dummy inputs
Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for
HF processing as well as memory profiling.
### For memory profiling
Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`
to construct dummy inputs for memory profiling. This dummy input should result in the worst-case memory usage of
the model so that vLLM can reserve the correct amount of memory for it.
Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed based
on the code for {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`.
::::{tab-set}
:::{tab-item} Basic example: LLaVA
:sync: llava
Making use of the `get_image_size_with_most_features` method implemented in Step 2:
```python
def get_dummy_processor_inputs(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> ProcessorInputs:
num_images = mm_counts.get("image", 0)
processor = self.info.get_hf_processor()
image_token = processor.image_token
hf_config = self.get_hf_config()
target_width, target_height = self.info.get_image_size_with_most_features()
mm_data = {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images)
}
return ProcessorInputs(
prompt_text=image_token * num_images,
mm_data=mm_data,
)
```
:::
:::{tab-item} No input placeholders: Fuyu
:sync: fuyu
Fuyu does not expect image placeholders in the inputs to HF processor, so Fuyu does not expect image placeholders in the inputs to HF processor, so
the dummy prompt text is empty regardless of the number of images. the dummy prompt text is empty regardless of the number of images.
Otherwise, the logic of this method is very similar to LLaVA: Otherwise, the logic of this method is very similar to LLaVA:
@ -705,8 +860,8 @@ prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
) )
``` ```
To assign the vision embeddings to only the image tokens, instead of a string To accommodate this, instead of a string you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`
you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`: with different `full` and `feature` attributes:
```python ```python
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()
@ -724,9 +879,9 @@ def get_replacement_fuyu(item_idx: int):
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows [_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id( return PromptUpdateDetails(
image_tokens + [bos_token_id], full=image_tokens + [bos_token_id],
embed_token_id=_IMAGE_TOKEN_ID, features=image_tokens,
) )
``` ```
@ -759,9 +914,9 @@ def _get_prompt_updates(
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows [_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id( return PromptUpdateDetails(
image_tokens + [bos_token_id], full=image_tokens + [bos_token_id],
embed_token_id=_IMAGE_TOKEN_ID, features=image_tokens,
) )
return [ return [

Some files were not shown because too many files have changed in this diff Show More