Compare commits
4 Commits
wye-refact
...
split_kv_c
| Author | SHA1 | Date | |
|---|---|---|---|
| 6e1e31a66a | |||
| 50e80db4ef | |||
| d3d6afb355 | |||
| 808fa43d76 |
@ -181,14 +181,18 @@ launch_vllm_server() {
|
|||||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
||||||
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
||||||
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
||||||
server_command="vllm serve $model \
|
server_command="python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
-tp $tp \
|
-tp $tp \
|
||||||
|
--model $model \
|
||||||
--port $port \
|
--port $port \
|
||||||
$server_args"
|
$server_args"
|
||||||
else
|
else
|
||||||
echo "Key 'fp8' does not exist in common params."
|
echo "Key 'fp8' does not exist in common params."
|
||||||
server_command="vllm serve $model \
|
server_command="python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
-tp $tp \
|
-tp $tp \
|
||||||
|
--model $model \
|
||||||
--port $port \
|
--port $port \
|
||||||
$server_args"
|
$server_args"
|
||||||
fi
|
fi
|
||||||
|
|||||||
@ -365,7 +365,8 @@ run_serving_tests() {
|
|||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
server_command="$server_envs vllm serve \
|
server_command="$server_envs python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
$server_args"
|
$server_args"
|
||||||
|
|
||||||
# run the server
|
# run the server
|
||||||
|
|||||||
@ -76,7 +76,7 @@ steps:
|
|||||||
queue: arm64_cpu_queue_postmerge
|
queue: arm64_cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||||
|
|
||||||
# Add job to create multi-arch manifest
|
# Add job to create multi-arch manifest
|
||||||
|
|||||||
@ -86,6 +86,10 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
|
|||||||
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
|
||||||
|
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
|
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
|
||||||
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
|
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
|
||||||
fi
|
fi
|
||||||
@ -163,6 +167,12 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
|
|||||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
#Obsolete currently
|
||||||
|
##ignore certain Entrypoints/llm tests
|
||||||
|
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
|
||||||
|
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
|
||||||
|
#fi
|
||||||
|
|
||||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
||||||
# --ignore=entrypoints/openai/test_embedding.py \
|
# --ignore=entrypoints/openai/test_embedding.py \
|
||||||
# --ignore=entrypoints/openai/test_oot_registration.py
|
# --ignore=entrypoints/openai/test_oot_registration.py
|
||||||
|
|||||||
@ -58,8 +58,11 @@ function cpu_tests() {
|
|||||||
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
||||||
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
||||||
|
|
||||||
pytest -x -v -s tests/models/language/generation -m cpu_model
|
# Note: disable Bart until supports V1
|
||||||
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
|
pytest -x -v -s tests/models/language/generation -m cpu_model \
|
||||||
|
--ignore=tests/models/language/generation/test_bart.py
|
||||||
|
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
|
||||||
|
--ignore=tests/models/language/generation/test_bart.py
|
||||||
|
|
||||||
pytest -x -v -s tests/models/language/pooling -m cpu_model
|
pytest -x -v -s tests/models/language/pooling -m cpu_model
|
||||||
pytest -x -v -s tests/models/multimodal/generation \
|
pytest -x -v -s tests/models/multimodal/generation \
|
||||||
|
|||||||
@ -1,191 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script build the Ascend NPU docker image and run the offline inference inside the container.
|
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Base ubuntu image with basic ascend development libraries and python installed
|
|
||||||
VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
|
|
||||||
CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
|
|
||||||
TEST_RUN_CONFIG_FILE="vllm_test.cfg"
|
|
||||||
VLLM_ASCEND_TMP_DIR=
|
|
||||||
# Get the test run configuration file from the vllm-ascend repository
|
|
||||||
fetch_vllm_test_cfg() {
|
|
||||||
VLLM_ASCEND_TMP_DIR=$(mktemp -d)
|
|
||||||
# Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
|
|
||||||
cleanup() {
|
|
||||||
rm -rf "${VLLM_ASCEND_TMP_DIR}"
|
|
||||||
}
|
|
||||||
trap cleanup EXIT
|
|
||||||
|
|
||||||
GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
|
|
||||||
if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
|
|
||||||
echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# If the file already exists locally, just overwrite it
|
|
||||||
cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
|
|
||||||
echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
|
|
||||||
|
|
||||||
# Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources
|
|
||||||
# when the trap is abnormal has been completed, so the temporary resources are manually deleted here.
|
|
||||||
rm -rf "${VLLM_ASCEND_TMP_DIR}"
|
|
||||||
trap - EXIT
|
|
||||||
}
|
|
||||||
|
|
||||||
# Downloads test run configuration file from a remote URL.
|
|
||||||
# Loads the configuration into the current script environment.
|
|
||||||
get_config() {
|
|
||||||
if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
|
|
||||||
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
source "${TEST_RUN_CONFIG_FILE}"
|
|
||||||
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
# get test running configuration.
|
|
||||||
fetch_vllm_test_cfg
|
|
||||||
get_config
|
|
||||||
# Check if the function call was successful. If not, exit the script.
|
|
||||||
if [ $? -ne 0 ]; then
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
|
|
||||||
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
|
||||||
|
|
||||||
# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
|
|
||||||
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
|
|
||||||
echo "agent_idx: ${agent_idx}"
|
|
||||||
builder_name="cachebuilder${agent_idx}"
|
|
||||||
builder_cache_dir="/mnt/docker-cache${agent_idx}"
|
|
||||||
mkdir -p ${builder_cache_dir}
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
|
|
||||||
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
|
|
||||||
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
|
|
||||||
--cache-to type=local,dest=${builder_cache_dir},mode=max \
|
|
||||||
--progress=plain --load -t ${image_name} -f - .
|
|
||||||
FROM ${BASE_IMAGE_NAME}
|
|
||||||
|
|
||||||
# Define environments
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
|
||||||
|
|
||||||
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
|
|
||||||
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
|
|
||||||
apt-get update -y && \
|
|
||||||
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
|
|
||||||
rm -rf /var/cache/apt/* && \
|
|
||||||
rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# Install for pytest to make the docker build cache layer always valid
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install pytest>=6.0 modelscope
|
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
|
||||||
|
|
||||||
# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
|
|
||||||
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install -r requirements/common.txt
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
# Install vLLM
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
|
||||||
python3 -m pip uninstall -y triton
|
|
||||||
|
|
||||||
# Install vllm-ascend
|
|
||||||
WORKDIR /workspace
|
|
||||||
ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
|
|
||||||
ARG VLLM_ASCEND_TAG=main
|
|
||||||
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
|
|
||||||
git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
|
|
||||||
|
|
||||||
# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install -r /workspace/vllm-ascend/requirements.txt
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
|
||||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
|
||||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
|
||||||
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
|
||||||
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
|
|
||||||
|
|
||||||
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
ENV VLLM_USE_MODELSCOPE=True
|
|
||||||
|
|
||||||
WORKDIR /workspace/vllm-ascend
|
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
|
||||||
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
remove_docker_container() {
|
|
||||||
docker rm -f "${container_name}" || true;
|
|
||||||
docker image rm -f "${image_name}" || true;
|
|
||||||
docker system prune -f || true;
|
|
||||||
}
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
|
|
||||||
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
|
|
||||||
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
|
|
||||||
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
|
|
||||||
# returns --device /dev/davinci0 --device /dev/davinci1
|
|
||||||
parse_and_gen_devices() {
|
|
||||||
local input="$1"
|
|
||||||
local index cards_num
|
|
||||||
if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
|
|
||||||
index="${BASH_REMATCH[1]}"
|
|
||||||
cards_num="${BASH_REMATCH[2]}"
|
|
||||||
else
|
|
||||||
echo "parse error" >&2
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
local devices=""
|
|
||||||
local i=0
|
|
||||||
while (( i < cards_num )); do
|
|
||||||
local dev_idx=$(((index - 1)*cards_num + i ))
|
|
||||||
devices="$devices --device /dev/davinci${dev_idx}"
|
|
||||||
((i++))
|
|
||||||
done
|
|
||||||
|
|
||||||
# trim leading space
|
|
||||||
devices="${devices#"${devices%%[![:space:]]*}"}"
|
|
||||||
# Output devices: assigned to the caller variable
|
|
||||||
printf '%s' "$devices"
|
|
||||||
}
|
|
||||||
|
|
||||||
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
|
|
||||||
|
|
||||||
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
|
|
||||||
# This test checks whether the OOT platform interface is functioning properly in conjunction with
|
|
||||||
# the hardware plugin vllm-ascend.
|
|
||||||
model_cache_dir=/mnt/modelscope${agent_idx}
|
|
||||||
mkdir -p ${model_cache_dir}
|
|
||||||
docker run \
|
|
||||||
${devices} \
|
|
||||||
--device /dev/davinci_manager \
|
|
||||||
--device /dev/devmm_svm \
|
|
||||||
--device /dev/hisi_hdc \
|
|
||||||
-v /usr/local/dcmi:/usr/local/dcmi \
|
|
||||||
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
|
||||||
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
|
||||||
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
|
||||||
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
|
||||||
-v ${model_cache_dir}:/root/.cache/modelscope \
|
|
||||||
--entrypoint="" \
|
|
||||||
--name "${container_name}" \
|
|
||||||
"${image_name}" \
|
|
||||||
bash -c '
|
|
||||||
set -e
|
|
||||||
pytest -v -s tests/e2e/vllm_interface/
|
|
||||||
'
|
|
||||||
@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---"
|
|||||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
||||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
&& python3 -m pip install --progress-bar off hf-transfer
|
||||||
echo "--- Python dependencies installed ---"
|
echo "--- Python dependencies installed ---"
|
||||||
export VLLM_USE_V1=1
|
export VLLM_USE_V1=1
|
||||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||||
|
|||||||
@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---"
|
|||||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
||||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
&& python3 -m pip install --progress-bar off hf-transfer
|
||||||
echo "--- Python dependencies installed ---"
|
echo "--- Python dependencies installed ---"
|
||||||
export VLLM_USE_V1=1
|
export VLLM_USE_V1=1
|
||||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||||
|
|||||||
@ -35,15 +35,16 @@ docker run \
|
|||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||||
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||||
cd tests
|
cd tests
|
||||||
pytest -v -s v1/core
|
pytest -v -s v1/core
|
||||||
pytest -v -s v1/engine
|
pytest -v -s v1/engine
|
||||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
||||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||||
pytest -v -s v1/structured_output
|
pytest -v -s v1/structured_output
|
||||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
|
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
|
||||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
|
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
|
||||||
pytest -v -s v1/test_metrics
|
|
||||||
pytest -v -s v1/test_serial_utils.py
|
pytest -v -s v1/test_serial_utils.py
|
||||||
|
pytest -v -s v1/test_utils.py
|
||||||
|
pytest -v -s v1/test_metrics_reader.py
|
||||||
'
|
'
|
||||||
|
|||||||
@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
|
|||||||
bench_throughput_exit_code=$?
|
bench_throughput_exit_code=$?
|
||||||
|
|
||||||
# run server-based benchmarks and upload the result to buildkite
|
# run server-based benchmarks and upload the result to buildkite
|
||||||
vllm serve meta-llama/Llama-2-7b-chat-hf &
|
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
|
||||||
server_pid=$!
|
server_pid=$!
|
||||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
|
|
||||||
|
|||||||
@ -1,59 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
# Setup script for Prime-RL integration tests
|
|
||||||
# This script prepares the environment for running Prime-RL tests with nightly vLLM
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
|
||||||
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
|
|
||||||
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
|
|
||||||
|
|
||||||
echo "Setting up Prime-RL integration test environment..."
|
|
||||||
|
|
||||||
# Clean up any existing Prime-RL directory
|
|
||||||
if [ -d "${PRIME_RL_DIR}" ]; then
|
|
||||||
echo "Removing existing Prime-RL directory..."
|
|
||||||
rm -rf "${PRIME_RL_DIR}"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Install UV if not available
|
|
||||||
if ! command -v uv &> /dev/null; then
|
|
||||||
echo "Installing UV package manager..."
|
|
||||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
||||||
source $HOME/.local/bin/env
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Clone Prime-RL repository at specific branch for reproducible tests
|
|
||||||
PRIME_RL_BRANCH="integ-vllm-main"
|
|
||||||
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
|
|
||||||
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
|
|
||||||
cd "${PRIME_RL_DIR}"
|
|
||||||
|
|
||||||
echo "Setting up UV project environment..."
|
|
||||||
export UV_PROJECT_ENVIRONMENT=/usr/local
|
|
||||||
ln -s /usr/bin/python3 /usr/local/bin/python
|
|
||||||
|
|
||||||
# Remove vllm pin from pyproject.toml
|
|
||||||
echo "Removing vllm pin from pyproject.toml..."
|
|
||||||
sed -i '/vllm==/d' pyproject.toml
|
|
||||||
|
|
||||||
# Sync Prime-RL dependencies
|
|
||||||
echo "Installing Prime-RL dependencies..."
|
|
||||||
uv sync --inexact && uv sync --inexact --all-extras
|
|
||||||
|
|
||||||
# Verify installation
|
|
||||||
echo "Verifying installations..."
|
|
||||||
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
|
|
||||||
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
|
|
||||||
|
|
||||||
echo "Prime-RL integration test environment setup complete!"
|
|
||||||
|
|
||||||
echo "Running Prime-RL integration tests..."
|
|
||||||
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
|
|
||||||
uv run pytest -vs tests/integration/test_rl.py -m gpu
|
|
||||||
|
|
||||||
echo "Prime-RL integration tests completed!"
|
|
||||||
@ -6,28 +6,24 @@
|
|||||||
# to generate the final pipeline yaml file.
|
# to generate the final pipeline yaml file.
|
||||||
|
|
||||||
# Documentation
|
# Documentation
|
||||||
# label(str): the name of the test. emojis allowed.
|
# label(str): the name of the test. emoji allowed.
|
||||||
# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
|
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
||||||
# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
|
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
|
||||||
# fast_check_only(bool): run this test on the fastcheck pipeline only
|
# fast_check_only(bool): run this test on fastcheck pipeline only
|
||||||
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
|
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
|
||||||
# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
|
|
||||||
# command(str): the single command to run for tests. incompatible with commands.
|
# command(str): the single command to run for tests. incompatible with commands.
|
||||||
# commands(list): the list of commands to run for the test. incompatible with command.
|
# commands(list): the list of commands to run for test. incompatbile with command.
|
||||||
# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
|
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
|
||||||
# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
|
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
|
||||||
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
|
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
|
||||||
# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
|
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
|
||||||
# in this case, commands must be specified. the first command runs on the first host, the second
|
# in this case, commands must be specified. the first command runs on first host, the second
|
||||||
# command runs on the second host.
|
# command runs on the second host.
|
||||||
# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
|
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
|
||||||
# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
|
# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
|
||||||
# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
|
|
||||||
# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
|
|
||||||
# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
|
|
||||||
|
|
||||||
# When adding a test
|
# When adding a test
|
||||||
# - If the test belongs to an existing group, add it there
|
# - If the test belong to an existing group, add it there
|
||||||
# - If the test is short, add to any existing step
|
# - If the test is short, add to any existing step
|
||||||
# - If the test takes more than 10min, then it is okay to create a new step.
|
# - If the test takes more than 10min, then it is okay to create a new step.
|
||||||
# Note that all steps execute in parallel.
|
# Note that all steps execute in parallel.
|
||||||
@ -50,28 +46,23 @@ steps:
|
|||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/multimodal
|
- tests/async_engine
|
||||||
- tests/utils_
|
|
||||||
commands:
|
|
||||||
- pytest -v -s -m 'not cpu_test' multimodal
|
|
||||||
- pytest -v -s utils_
|
|
||||||
|
|
||||||
- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
|
|
||||||
timeout_in_minutes: 10
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/test_inputs.py
|
- tests/test_inputs.py
|
||||||
- tests/test_outputs.py
|
- tests/test_outputs.py
|
||||||
- tests/multimodal
|
- tests/multimodal
|
||||||
|
- tests/utils_
|
||||||
|
- tests/worker
|
||||||
- tests/standalone_tests/lazy_imports.py
|
- tests/standalone_tests/lazy_imports.py
|
||||||
- tests/transformers_utils
|
- tests/transformers_utils
|
||||||
no_gpu: true
|
|
||||||
commands:
|
commands:
|
||||||
- python3 standalone_tests/lazy_imports.py
|
- python3 standalone_tests/lazy_imports.py
|
||||||
|
- pytest -v -s async_engine # AsyncLLMEngine
|
||||||
- pytest -v -s test_inputs.py
|
- pytest -v -s test_inputs.py
|
||||||
- pytest -v -s test_outputs.py
|
- pytest -v -s test_outputs.py
|
||||||
- pytest -v -s -m 'cpu_test' multimodal
|
- pytest -v -s multimodal
|
||||||
- pytest -v -s transformers_utils
|
- pytest -v -s utils_ # Utils
|
||||||
|
- pytest -v -s worker # Worker
|
||||||
|
- pytest -v -s transformers_utils # transformers_utils
|
||||||
|
|
||||||
- label: Python-only Installation Test # 10min
|
- label: Python-only Installation Test # 10min
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
@ -91,12 +82,14 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/basic_correctness/test_basic_correctness
|
- tests/basic_correctness/test_basic_correctness
|
||||||
- tests/basic_correctness/test_cpu_offload
|
- tests/basic_correctness/test_cpu_offload
|
||||||
|
- tests/basic_correctness/test_preemption
|
||||||
- tests/basic_correctness/test_cumem.py
|
- tests/basic_correctness/test_cumem.py
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s basic_correctness/test_cumem.py
|
- pytest -v -s basic_correctness/test_cumem.py
|
||||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||||
|
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||||
|
|
||||||
- label: Entrypoints Unit Tests # 5min
|
- label: Entrypoints Unit Tests # 5min
|
||||||
timeout_in_minutes: 10
|
timeout_in_minutes: 10
|
||||||
@ -121,9 +114,10 @@ steps:
|
|||||||
- tests/entrypoints/offline_mode
|
- tests/entrypoints/offline_mode
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||||
|
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||||
|
|
||||||
- label: Entrypoints Integration Test (API Server) # 100min
|
- label: Entrypoints Integration Test (API Server) # 100min
|
||||||
timeout_in_minutes: 130
|
timeout_in_minutes: 130
|
||||||
@ -161,6 +155,7 @@ steps:
|
|||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed/
|
- vllm/distributed/
|
||||||
|
- vllm/core/
|
||||||
- tests/distributed/test_utils
|
- tests/distributed/test_utils
|
||||||
- tests/distributed/test_pynccl
|
- tests/distributed/test_pynccl
|
||||||
- tests/distributed/test_events
|
- tests/distributed/test_events
|
||||||
@ -168,34 +163,28 @@ steps:
|
|||||||
- examples/offline_inference/rlhf.py
|
- examples/offline_inference/rlhf.py
|
||||||
- examples/offline_inference/rlhf_colocate.py
|
- examples/offline_inference/rlhf_colocate.py
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
- tests/examples/offline_inference/data_parallel.py
|
||||||
- tests/v1/distributed
|
- tests/v1/test_async_llm_dp.py
|
||||||
|
- tests/v1/test_external_lb_dp.py
|
||||||
|
- tests/v1/test_internal_lb_dp.py
|
||||||
|
- tests/v1/test_hybrid_lb_dp.py
|
||||||
- tests/v1/engine/test_engine_core_client.py
|
- tests/v1/engine/test_engine_core_client.py
|
||||||
- tests/distributed/test_symm_mem_allreduce.py
|
|
||||||
commands:
|
commands:
|
||||||
# test with torchrun tp=2 and external_dp=2
|
# test with tp=2 and external_dp=2
|
||||||
|
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||||
# test with torchrun tp=2 and pp=2
|
# test with tp=2 and pp=2
|
||||||
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||||
# test with torchrun tp=4 and dp=1
|
|
||||||
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
|
||||||
# test with torchrun tp=2, pp=2 and dp=1
|
|
||||||
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
|
||||||
# test with torchrun tp=1 and dp=4 with ep
|
|
||||||
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
|
||||||
# test with torchrun tp=2 and dp=2 with ep
|
|
||||||
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
|
||||||
# test with internal dp
|
# test with internal dp
|
||||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
|
||||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
|
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
|
||||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
|
||||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||||
- pytest -v -s distributed/test_utils.py
|
- pytest -v -s distributed/test_utils.py
|
||||||
- pytest -v -s compile/test_basic_correctness.py
|
- pytest -v -s compile/test_basic_correctness.py
|
||||||
- pytest -v -s distributed/test_pynccl.py
|
- pytest -v -s distributed/test_pynccl.py
|
||||||
- pytest -v -s distributed/test_events.py
|
- pytest -v -s distributed/test_events.py
|
||||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
|
||||||
# TODO: create a dedicated test section for multi-GPU example tests
|
# TODO: create a dedicated test section for multi-GPU example tests
|
||||||
# when we have multiple distributed example tests
|
# when we have multiple distributed example tests
|
||||||
- pushd ../examples/offline_inference
|
- pushd ../examples/offline_inference
|
||||||
@ -296,34 +285,23 @@ steps:
|
|||||||
- tests/v1
|
- tests/v1
|
||||||
commands:
|
commands:
|
||||||
# split the test to avoid interference
|
# split the test to avoid interference
|
||||||
|
- pytest -v -s v1/core
|
||||||
- pytest -v -s v1/executor
|
- pytest -v -s v1/executor
|
||||||
- pytest -v -s v1/kv_offload
|
|
||||||
- pytest -v -s v1/sample
|
- pytest -v -s v1/sample
|
||||||
- pytest -v -s v1/logits_processors
|
- pytest -v -s v1/logits_processors
|
||||||
- pytest -v -s v1/worker
|
- pytest -v -s v1/worker
|
||||||
|
- pytest -v -s v1/structured_output
|
||||||
- pytest -v -s v1/spec_decode
|
- pytest -v -s v1/spec_decode
|
||||||
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
- pytest -v -s v1/kv_connector/unit
|
||||||
- pytest -v -s -m 'not cpu_test' v1/metrics
|
- pytest -v -s v1/metrics
|
||||||
|
- pytest -v -s v1/test_serial_utils.py
|
||||||
|
- pytest -v -s v1/test_utils.py
|
||||||
- pytest -v -s v1/test_oracle.py
|
- pytest -v -s v1/test_oracle.py
|
||||||
- pytest -v -s v1/test_request.py
|
- pytest -v -s v1/test_metrics_reader.py
|
||||||
# Integration test for streaming correctness (requires special branch).
|
# Integration test for streaming correctness (requires special branch).
|
||||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||||
|
|
||||||
- label: V1 Test others (CPU) # 5 mins
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/v1
|
|
||||||
no_gpu: true
|
|
||||||
commands:
|
|
||||||
# split the test to avoid interference
|
|
||||||
- pytest -v -s v1/core
|
|
||||||
- pytest -v -s v1/structured_output
|
|
||||||
- pytest -v -s v1/test_serial_utils.py
|
|
||||||
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
|
|
||||||
- pytest -v -s -m 'cpu_test' v1/metrics
|
|
||||||
|
|
||||||
|
|
||||||
- label: Examples Test # 30min
|
- label: Examples Test # 30min
|
||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@ -342,13 +320,12 @@ steps:
|
|||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language_pooling.py --seed 0
|
- python3 offline_inference/vision_language_pooling.py --seed 0
|
||||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 offline_inference/basic/classify.py
|
||||||
- python3 offline_inference/basic/embed.py
|
- python3 offline_inference/basic/embed.py
|
||||||
- python3 offline_inference/basic/score.py
|
- python3 offline_inference/basic/score.py
|
||||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
||||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
|
||||||
|
|
||||||
- label: Platform Tests (CUDA) # 4min
|
- label: Platform Tests (CUDA) # 4min
|
||||||
timeout_in_minutes: 15
|
timeout_in_minutes: 15
|
||||||
@ -479,19 +456,30 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/mamba
|
- pytest -v -s kernels/mamba
|
||||||
|
|
||||||
- label: Model Executor Test # 23min
|
- label: Tensorizer Test # 14min
|
||||||
timeout_in_minutes: 35
|
timeout_in_minutes: 25
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor
|
- vllm/model_executor/model_loader
|
||||||
- tests/model_executor
|
- tests/tensorizer_loader
|
||||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||||
commands:
|
commands:
|
||||||
- apt-get update && apt-get install -y curl libsodium23
|
- apt-get update && apt-get install -y curl libsodium23
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s model_executor
|
- pytest -v -s tensorizer_loader
|
||||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||||
|
|
||||||
|
- label: Model Executor Test # 7min
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/model_executor
|
||||||
|
- tests/model_executor
|
||||||
|
commands:
|
||||||
|
- apt-get update && apt-get install -y curl libsodium23
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
|
- pytest -v -s model_executor
|
||||||
|
|
||||||
- label: Benchmarks # 11min
|
- label: Benchmarks # 11min
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@ -525,7 +513,7 @@ steps:
|
|||||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||||
# we can only upgrade after this is resolved
|
# we can only upgrade after this is resolved
|
||||||
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
|
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
|
||||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
|
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||||
|
|
||||||
- label: LM Eval Small Models # 53min
|
- label: LM Eval Small Models # 53min
|
||||||
timeout_in_minutes: 75
|
timeout_in_minutes: 75
|
||||||
@ -553,17 +541,10 @@ steps:
|
|||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/tool_use
|
- tests/tool_use
|
||||||
|
- tests/mistral_tool_use
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s -m 'not cpu_test' tool_use
|
- pytest -v -s tool_use
|
||||||
|
- pytest -v -s mistral_tool_use
|
||||||
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
|
|
||||||
timeout_in_minutes: 10
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/tool_use
|
|
||||||
no_gpu: true
|
|
||||||
commands:
|
|
||||||
- pytest -v -s -m 'cpu_test' tool_use
|
|
||||||
|
|
||||||
##### models test #####
|
##### models test #####
|
||||||
|
|
||||||
@ -603,19 +584,13 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/test_transformers.py
|
- tests/models/test_transformers.py
|
||||||
- tests/models/test_registry.py
|
- tests/models/test_registry.py
|
||||||
commands:
|
|
||||||
- pytest -v -s models/test_transformers.py models/test_registry.py
|
|
||||||
|
|
||||||
- label: Basic Models Test (Other CPU) # 5min
|
|
||||||
timeout_in_minutes: 10
|
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/models/test_utils.py
|
- tests/models/test_utils.py
|
||||||
- tests/models/test_vision.py
|
- tests/models/test_vision.py
|
||||||
no_gpu: true
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/test_utils.py models/test_vision.py
|
- pytest -v -s models/test_transformers.py \
|
||||||
|
models/test_registry.py \
|
||||||
|
models/test_utils.py \
|
||||||
|
models/test_vision.py
|
||||||
|
|
||||||
- label: Language Models Tests (Standard)
|
- label: Language Models Tests (Standard)
|
||||||
timeout_in_minutes: 25
|
timeout_in_minutes: 25
|
||||||
@ -785,13 +760,11 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pip install --upgrade git+https://github.com/huggingface/transformers
|
- pip install --upgrade git+https://github.com/huggingface/transformers
|
||||||
- pytest -v -s tests/models/test_initialization.py
|
- pytest -v -s tests/models/test_initialization.py
|
||||||
- pytest -v -s tests/models/test_transformers.py
|
|
||||||
- pytest -v -s tests/models/multimodal/processing/
|
- pytest -v -s tests/models/multimodal/processing/
|
||||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/offline_inference/basic/chat.py
|
||||||
|
- python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||||
# Whisper needs spawn method to avoid deadlock
|
|
||||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
|
||||||
|
|
||||||
- label: Blackwell Test # 38 min
|
- label: Blackwell Test # 38 min
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
@ -845,24 +818,7 @@ steps:
|
|||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
commands:
|
commands:
|
||||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
|
||||||
|
|
||||||
- label: Blackwell Quantized MoE Test
|
|
||||||
timeout_in_minutes: 60
|
|
||||||
working_dir: "/vllm-workspace/"
|
|
||||||
gpu: b200
|
|
||||||
source_file_dependencies:
|
|
||||||
- tests/quantization/test_blackwell_moe.py
|
|
||||||
- vllm/model_executor/models/deepseek_v2.py
|
|
||||||
- vllm/model_executor/models/gpt_oss.py
|
|
||||||
- vllm/model_executor/models/llama4.py
|
|
||||||
- vllm/model_executor/layers/fused_moe
|
|
||||||
- vllm/model_executor/layers/quantization/compressed_tensors
|
|
||||||
- vllm/model_executor/layers/quantization/modelopt.py
|
|
||||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
|
||||||
commands:
|
|
||||||
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
|
||||||
|
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
@ -906,58 +862,47 @@ steps:
|
|||||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||||
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||||
|
|
||||||
- label: Distributed Tests (2 GPUs) # 68min
|
- label: Distributed Tests (2 GPUs) # 110min
|
||||||
timeout_in_minutes: 90
|
timeout_in_minutes: 150
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/compilation/
|
|
||||||
- vllm/distributed/
|
- vllm/distributed/
|
||||||
- vllm/engine/
|
- vllm/engine/
|
||||||
- vllm/executor/
|
- vllm/executor/
|
||||||
- vllm/worker/worker_base.py
|
- vllm/model_executor/models/
|
||||||
- vllm/v1/engine/
|
|
||||||
- vllm/v1/worker/
|
|
||||||
- tests/compile/test_basic_correctness.py
|
|
||||||
- tests/compile/test_wrapper.py
|
|
||||||
- tests/distributed/
|
- tests/distributed/
|
||||||
- tests/entrypoints/llm/test_collective_rpc.py
|
- vllm/compilation
|
||||||
- tests/v1/distributed
|
- vllm/worker/worker_base.py
|
||||||
|
- vllm/worker/worker.py
|
||||||
|
- vllm/worker/model_runner.py
|
||||||
|
- entrypoints/llm/test_collective_rpc.py
|
||||||
|
- tests/v1/test_async_llm_dp.py
|
||||||
|
- tests/v1/test_external_lb_dp.py
|
||||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||||
- tests/v1/shutdown
|
- vllm/v1/engine/
|
||||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
|
||||||
commands:
|
commands:
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
|
||||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||||
- pytest -v -s ./compile/test_basic_correctness.py
|
- pytest -v -s ./compile/test_basic_correctness.py
|
||||||
- pytest -v -s ./compile/test_wrapper.py
|
- pytest -v -s ./compile/test_wrapper.py
|
||||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- pytest -v -s distributed/test_sequence_parallel.py
|
|
||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
|
||||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
|
||||||
|
|
||||||
- label: Distributed Model Tests (2 GPUs) # 37min
|
|
||||||
timeout_in_minutes: 50
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
|
||||||
num_gpus: 2
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/model_executor/model_loader/sharded_state_loader.py
|
|
||||||
- vllm/model_executor/models/
|
|
||||||
- tests/basic_correctness/
|
|
||||||
- tests/model_executor/model_loader/test_sharded_state_loader.py
|
|
||||||
- tests/models/
|
|
||||||
commands:
|
|
||||||
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
|
|
||||||
# Avoid importing model tests that cause CUDA reinitialization error
|
# Avoid importing model tests that cause CUDA reinitialization error
|
||||||
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
||||||
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
|
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
|
||||||
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
|
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
|
||||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
|
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
|
||||||
|
# test sequence parallel
|
||||||
|
- pytest -v -s distributed/test_sequence_parallel.py
|
||||||
|
# this test fails consistently.
|
||||||
|
# TODO: investigate and fix
|
||||||
|
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||||
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||||
|
- pytest -v -s models/multimodal/generation/test_maverick.py
|
||||||
|
|
||||||
- label: Plugin Tests (2 GPUs) # 40min
|
- label: Plugin Tests (2 GPUs) # 40min
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
@ -1091,16 +1036,3 @@ steps:
|
|||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
|
||||||
|
|
||||||
##### RL Integration Tests #####
|
|
||||||
- label: Prime-RL Integration Test # 15min
|
|
||||||
timeout_in_minutes: 30
|
|
||||||
optional: true
|
|
||||||
num_gpus: 2
|
|
||||||
working_dir: "/vllm-workspace"
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- .buildkite/scripts/run-prime-rl-test.sh
|
|
||||||
commands:
|
|
||||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
|
||||||
|
|||||||
35
.github/CODEOWNERS
vendored
35
.github/CODEOWNERS
vendored
@ -4,14 +4,19 @@
|
|||||||
# This lists cover the "core" components of vLLM that require careful review
|
# This lists cover the "core" components of vLLM that require careful review
|
||||||
/vllm/attention @LucasWilkinson
|
/vllm/attention @LucasWilkinson
|
||||||
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
|
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
|
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||||
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||||
|
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
/vllm/model_executor/layers/fused_moe @mgoin
|
/vllm/model_executor/layers/fused_moe @mgoin
|
||||||
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
|
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
|
||||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
|
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
|
||||||
/vllm/model_executor/layers/mamba @tdoublep
|
/vllm/model_executor/layers/mamba @tdoublep
|
||||||
/vllm/model_executor/model_loader @22quinn
|
/vllm/model_executor/model_loader @22quinn
|
||||||
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
|
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
|
||||||
|
/vllm/v1/attention @LucasWilkinson
|
||||||
|
/vllm/v1/sample @22quinn @houseroad
|
||||||
/vllm/vllm_flash_attn @LucasWilkinson
|
/vllm/vllm_flash_attn @LucasWilkinson
|
||||||
/vllm/lora @jeejeelee
|
/vllm/lora @jeejeelee
|
||||||
/vllm/reasoning @aarnphm @chaunceyjiang
|
/vllm/reasoning @aarnphm @chaunceyjiang
|
||||||
@ -26,18 +31,17 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
|||||||
|
|
||||||
# vLLM V1
|
# vLLM V1
|
||||||
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
||||||
/vllm/v1/attention @LucasWilkinson
|
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
||||||
|
/vllm/v1/spec_decode @benchislett @luccafong
|
||||||
/vllm/v1/attention/backends/flashinfer.py @mgoin
|
/vllm/v1/attention/backends/flashinfer.py @mgoin
|
||||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||||
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
|
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
|
||||||
/vllm/v1/sample @22quinn @houseroad @njhill
|
|
||||||
/vllm/v1/spec_decode @benchislett @luccafong
|
|
||||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
|
||||||
/vllm/v1/kv_cache_interface.py @heheda12345
|
/vllm/v1/kv_cache_interface.py @heheda12345
|
||||||
|
/vllm/v1/worker/kv_cache_initializer_mixin.py @heheda12345
|
||||||
/vllm/v1/offloading @ApostaC
|
/vllm/v1/offloading @ApostaC
|
||||||
|
|
||||||
# Test ownership
|
# Test ownership
|
||||||
/.buildkite/lm-eval-harness @mgoin @simon-mo
|
/.buildkite/lm-eval-harness @mgoin @simon-mo
|
||||||
|
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
|
||||||
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
||||||
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
||||||
/tests/distributed/test_same_node.py @youkaichao
|
/tests/distributed/test_same_node.py @youkaichao
|
||||||
@ -46,6 +50,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
|||||||
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
|
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
|
||||||
/tests/models @DarkLight1337 @ywang96
|
/tests/models @DarkLight1337 @ywang96
|
||||||
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
|
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
|
||||||
|
/tests/prefix_caching @comaniac @KuntaiDu
|
||||||
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
|
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
|
||||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||||
@ -54,35 +59,23 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
|||||||
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
||||||
/tests/lora @jeejeelee
|
/tests/lora @jeejeelee
|
||||||
/tests/models/language/generation/test_hybrid.py @tdoublep
|
/tests/models/language/generation/test_hybrid.py @tdoublep
|
||||||
/tests/v1/kv_connector/nixl_integration @NickLucche
|
/tests/v1/kv_connector/nixl_integration @NickLucche
|
||||||
/tests/v1/kv_connector @ApostaC
|
/tests/v1/kv_connector @ApostaC
|
||||||
/tests/v1/offloading @ApostaC
|
/tests/v1/offloading @ApostaC
|
||||||
|
|
||||||
# Transformers backend
|
|
||||||
/vllm/model_executor/models/transformers.py @hmellor
|
|
||||||
/tests/models/test_transformers.py @hmellor
|
|
||||||
|
|
||||||
# Docs
|
# Docs
|
||||||
/docs/mkdocs @hmellor
|
/docs @hmellor
|
||||||
/docs/**/*.yml @hmellor
|
|
||||||
/requirements/docs.txt @hmellor
|
|
||||||
.readthedocs.yaml @hmellor
|
|
||||||
mkdocs.yaml @hmellor
|
mkdocs.yaml @hmellor
|
||||||
|
|
||||||
# Linting
|
|
||||||
.markdownlint.yaml @hmellor
|
|
||||||
.pre-commit-config.yaml @hmellor
|
|
||||||
/tools/pre_commit @hmellor
|
|
||||||
|
|
||||||
# CPU
|
# CPU
|
||||||
/vllm/v1/worker/cpu* @bigPYJ1151
|
/vllm/v1/worker/^cpu @bigPYJ1151
|
||||||
/csrc/cpu @bigPYJ1151
|
/csrc/cpu @bigPYJ1151
|
||||||
/vllm/platforms/cpu.py @bigPYJ1151
|
/vllm/platforms/cpu.py @bigPYJ1151
|
||||||
/cmake/cpu_extension.cmake @bigPYJ1151
|
/cmake/cpu_extension.cmake @bigPYJ1151
|
||||||
/docker/Dockerfile.cpu @bigPYJ1151
|
/docker/Dockerfile.cpu @bigPYJ1151
|
||||||
|
|
||||||
# Intel GPU
|
# Intel GPU
|
||||||
/vllm/v1/worker/xpu* @jikunshang
|
/vllm/v1/worker/^xpu @jikunshang
|
||||||
/vllm/platforms/xpu.py @jikunshang
|
/vllm/platforms/xpu.py @jikunshang
|
||||||
/docker/Dockerfile.xpu @jikunshang
|
/docker/Dockerfile.xpu @jikunshang
|
||||||
|
|
||||||
|
|||||||
4
.github/ISSUE_TEMPLATE/750-RFC.yml
vendored
4
.github/ISSUE_TEMPLATE/750-RFC.yml
vendored
@ -43,6 +43,10 @@ body:
|
|||||||
Any other things you would like to mention.
|
Any other things you would like to mention.
|
||||||
validations:
|
validations:
|
||||||
required: false
|
required: false
|
||||||
|
- type: markdown
|
||||||
|
attributes:
|
||||||
|
value: >
|
||||||
|
Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
|
||||||
- type: checkboxes
|
- type: checkboxes
|
||||||
id: askllm
|
id: askllm
|
||||||
attributes:
|
attributes:
|
||||||
|
|||||||
52
.github/mergify.yml
vendored
52
.github/mergify.yml
vendored
@ -2,7 +2,6 @@ pull_request_rules:
|
|||||||
- name: label-documentation
|
- name: label-documentation
|
||||||
description: Automatically apply documentation label
|
description: Automatically apply documentation label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^[^/]+\.md$
|
- files~=^[^/]+\.md$
|
||||||
- files~=^docs/
|
- files~=^docs/
|
||||||
@ -15,7 +14,6 @@ pull_request_rules:
|
|||||||
- name: label-ci-build
|
- name: label-ci-build
|
||||||
description: Automatically apply ci/build label
|
description: Automatically apply ci/build label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^\.github/
|
- files~=^\.github/
|
||||||
- files~=\.buildkite/
|
- files~=\.buildkite/
|
||||||
@ -32,7 +30,6 @@ pull_request_rules:
|
|||||||
- name: label-deepseek
|
- name: label-deepseek
|
||||||
description: Automatically apply deepseek label
|
description: Automatically apply deepseek label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^examples/.*deepseek.*\.py
|
- files~=^examples/.*deepseek.*\.py
|
||||||
- files~=^tests/.*deepseek.*\.py
|
- files~=^tests/.*deepseek.*\.py
|
||||||
@ -49,7 +46,6 @@ pull_request_rules:
|
|||||||
- name: label-frontend
|
- name: label-frontend
|
||||||
description: Automatically apply frontend label
|
description: Automatically apply frontend label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- files~=^vllm/entrypoints/
|
- files~=^vllm/entrypoints/
|
||||||
actions:
|
actions:
|
||||||
label:
|
label:
|
||||||
@ -59,7 +55,6 @@ pull_request_rules:
|
|||||||
- name: label-llama
|
- name: label-llama
|
||||||
description: Automatically apply llama label
|
description: Automatically apply llama label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^examples/.*llama.*\.py
|
- files~=^examples/.*llama.*\.py
|
||||||
- files~=^tests/.*llama.*\.py
|
- files~=^tests/.*llama.*\.py
|
||||||
@ -75,7 +70,6 @@ pull_request_rules:
|
|||||||
- name: label-multi-modality
|
- name: label-multi-modality
|
||||||
description: Automatically apply multi-modality label
|
description: Automatically apply multi-modality label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^vllm/multimodal/
|
- files~=^vllm/multimodal/
|
||||||
- files~=^tests/multimodal/
|
- files~=^tests/multimodal/
|
||||||
@ -89,7 +83,6 @@ pull_request_rules:
|
|||||||
- name: label-new-model
|
- name: label-new-model
|
||||||
description: Automatically apply new-model label
|
description: Automatically apply new-model label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- and:
|
- and:
|
||||||
- files~=^vllm/model_executor/models/
|
- files~=^vllm/model_executor/models/
|
||||||
- files=vllm/model_executor/models/registry.py
|
- files=vllm/model_executor/models/registry.py
|
||||||
@ -101,7 +94,6 @@ pull_request_rules:
|
|||||||
- name: label-performance
|
- name: label-performance
|
||||||
description: Automatically apply performance label
|
description: Automatically apply performance label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^benchmarks/
|
- files~=^benchmarks/
|
||||||
- files~=^vllm/benchmarks/
|
- files~=^vllm/benchmarks/
|
||||||
@ -115,7 +107,6 @@ pull_request_rules:
|
|||||||
- name: label-qwen
|
- name: label-qwen
|
||||||
description: Automatically apply qwen label
|
description: Automatically apply qwen label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^examples/.*qwen.*\.py
|
- files~=^examples/.*qwen.*\.py
|
||||||
- files~=^tests/.*qwen.*\.py
|
- files~=^tests/.*qwen.*\.py
|
||||||
@ -130,7 +121,6 @@ pull_request_rules:
|
|||||||
- name: label-gpt-oss
|
- name: label-gpt-oss
|
||||||
description: Automatically apply gpt-oss label
|
description: Automatically apply gpt-oss label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^examples/.*gpt[-_]?oss.*\.py
|
- files~=^examples/.*gpt[-_]?oss.*\.py
|
||||||
- files~=^tests/.*gpt[-_]?oss.*\.py
|
- files~=^tests/.*gpt[-_]?oss.*\.py
|
||||||
@ -152,7 +142,6 @@ pull_request_rules:
|
|||||||
- name: label-rocm
|
- name: label-rocm
|
||||||
description: Automatically apply rocm label
|
description: Automatically apply rocm label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^csrc/rocm/
|
- files~=^csrc/rocm/
|
||||||
- files~=^docker/Dockerfile.rocm
|
- files~=^docker/Dockerfile.rocm
|
||||||
@ -173,7 +162,6 @@ pull_request_rules:
|
|||||||
- name: label-structured-output
|
- name: label-structured-output
|
||||||
description: Automatically apply structured-output label
|
description: Automatically apply structured-output label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^benchmarks/structured_schemas/
|
- files~=^benchmarks/structured_schemas/
|
||||||
- files=benchmarks/benchmark_serving_structured_output.py
|
- files=benchmarks/benchmark_serving_structured_output.py
|
||||||
@ -183,7 +171,7 @@ pull_request_rules:
|
|||||||
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
|
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
|
||||||
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
|
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
|
||||||
- files~=^tests/v1/structured_output/
|
- files~=^tests/v1/structured_output/
|
||||||
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
|
- files=tests/v1/entrypoints/llm/test_guided_generate.py
|
||||||
- files~=^vllm/v1/structured_output/
|
- files~=^vllm/v1/structured_output/
|
||||||
actions:
|
actions:
|
||||||
label:
|
label:
|
||||||
@ -193,7 +181,6 @@ pull_request_rules:
|
|||||||
- name: label-speculative-decoding
|
- name: label-speculative-decoding
|
||||||
description: Automatically apply speculative-decoding label
|
description: Automatically apply speculative-decoding label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^vllm/v1/spec_decode/
|
- files~=^vllm/v1/spec_decode/
|
||||||
- files~=^tests/v1/spec_decode/
|
- files~=^tests/v1/spec_decode/
|
||||||
@ -209,7 +196,6 @@ pull_request_rules:
|
|||||||
- name: label-v1
|
- name: label-v1
|
||||||
description: Automatically apply v1 label
|
description: Automatically apply v1 label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^vllm/v1/
|
- files~=^vllm/v1/
|
||||||
- files~=^tests/v1/
|
- files~=^tests/v1/
|
||||||
@ -222,7 +208,6 @@ pull_request_rules:
|
|||||||
description: Automatically apply tpu label
|
description: Automatically apply tpu label
|
||||||
# Keep this list in sync with `label-tpu-remove` conditions
|
# Keep this list in sync with `label-tpu-remove` conditions
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=tpu.py
|
- files~=tpu.py
|
||||||
- files~=_tpu
|
- files~=_tpu
|
||||||
@ -238,7 +223,6 @@ pull_request_rules:
|
|||||||
description: Automatically remove tpu label
|
description: Automatically remove tpu label
|
||||||
# Keep this list in sync with `label-tpu` conditions
|
# Keep this list in sync with `label-tpu` conditions
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- and:
|
- and:
|
||||||
- -files~=tpu.py
|
- -files~=tpu.py
|
||||||
- -files~=_tpu
|
- -files~=_tpu
|
||||||
@ -253,9 +237,9 @@ pull_request_rules:
|
|||||||
- name: label-tool-calling
|
- name: label-tool-calling
|
||||||
description: Automatically add tool-calling label
|
description: Automatically add tool-calling label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^tests/tool_use/
|
- files~=^tests/tool_use/
|
||||||
|
- files~=^tests/mistral_tool_use/
|
||||||
- files~=^tests/entrypoints/openai/tool_parsers/
|
- files~=^tests/entrypoints/openai/tool_parsers/
|
||||||
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
|
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
|
||||||
- files~=^vllm/entrypoints/openai/tool_parsers/
|
- files~=^vllm/entrypoints/openai/tool_parsers/
|
||||||
@ -272,9 +256,8 @@ pull_request_rules:
|
|||||||
|
|
||||||
- name: ping author on conflicts and add 'needs-rebase' label
|
- name: ping author on conflicts and add 'needs-rebase' label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
- conflict
|
||||||
- conflict
|
- -closed
|
||||||
- -closed
|
|
||||||
actions:
|
actions:
|
||||||
label:
|
label:
|
||||||
add:
|
add:
|
||||||
@ -288,12 +271,10 @@ pull_request_rules:
|
|||||||
|
|
||||||
- name: assign reviewer for tensorizer changes
|
- name: assign reviewer for tensorizer changes
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
|
||||||
- files~=^vllm/model_executor/model_loader/tensorizer.py
|
- files~=^vllm/model_executor/model_loader/tensorizer.py
|
||||||
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
|
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
|
||||||
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||||
- files~=^tests/model_executor/model_loader/tensorizer_loader/
|
- files~=^tests/tensorizer_loader/
|
||||||
actions:
|
actions:
|
||||||
assign:
|
assign:
|
||||||
users:
|
users:
|
||||||
@ -301,7 +282,6 @@ pull_request_rules:
|
|||||||
|
|
||||||
- name: assign reviewer for modelopt changes
|
- name: assign reviewer for modelopt changes
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^vllm/model_executor/layers/quantization/modelopt\.py$
|
- files~=^vllm/model_executor/layers/quantization/modelopt\.py$
|
||||||
- files~=^vllm/model_executor/layers/quantization/__init__\.py$
|
- files~=^vllm/model_executor/layers/quantization/__init__\.py$
|
||||||
@ -316,27 +296,9 @@ pull_request_rules:
|
|||||||
|
|
||||||
- name: remove 'needs-rebase' label when conflict is resolved
|
- name: remove 'needs-rebase' label when conflict is resolved
|
||||||
conditions:
|
conditions:
|
||||||
- -conflict
|
- -conflict
|
||||||
- -closed
|
- -closed
|
||||||
actions:
|
actions:
|
||||||
label:
|
label:
|
||||||
remove:
|
remove:
|
||||||
- needs-rebase
|
- needs-rebase
|
||||||
|
|
||||||
- name: label-kv-connector
|
|
||||||
description: Automatically apply kv-connector label
|
|
||||||
conditions:
|
|
||||||
- label != stale
|
|
||||||
- or:
|
|
||||||
- files~=^examples/online_serving/disaggregated[^/]*/.*
|
|
||||||
- files~=^examples/offline_inference/disaggregated[^/]*/.*
|
|
||||||
- files~=^examples/others/lmcache/
|
|
||||||
- files~=^tests/v1/kv_connector/
|
|
||||||
- files~=^vllm/distributed/kv_transfer/
|
|
||||||
- title~=(?i)\bP/?D\b
|
|
||||||
- title~=(?i)NIXL
|
|
||||||
- title~=(?i)LMCache
|
|
||||||
actions:
|
|
||||||
label:
|
|
||||||
add:
|
|
||||||
- kv-connector
|
|
||||||
@ -49,7 +49,7 @@ repos:
|
|||||||
rev: 0.6.17
|
rev: 0.6.17
|
||||||
hooks:
|
hooks:
|
||||||
- id: pip-compile
|
- id: pip-compile
|
||||||
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
|
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
|
||||||
files: ^requirements/test\.(in|txt)$
|
files: ^requirements/test\.(in|txt)$
|
||||||
- repo: local
|
- repo: local
|
||||||
hooks:
|
hooks:
|
||||||
@ -60,32 +60,38 @@ repos:
|
|||||||
files: ^requirements/test\.(in|txt)$
|
files: ^requirements/test\.(in|txt)$
|
||||||
- id: mypy-local
|
- id: mypy-local
|
||||||
name: Run mypy for local Python installation
|
name: Run mypy for local Python installation
|
||||||
entry: python tools/pre_commit/mypy.py 0 "local"
|
entry: tools/mypy.sh 0 "local"
|
||||||
|
language: python
|
||||||
|
types: [python]
|
||||||
|
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
|
||||||
stages: [pre-commit] # Don't run in CI
|
stages: [pre-commit] # Don't run in CI
|
||||||
<<: &mypy_common
|
|
||||||
language: python
|
|
||||||
types_or: [python, pyi]
|
|
||||||
require_serial: true
|
|
||||||
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
|
|
||||||
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||||
name: Run mypy for Python 3.9
|
name: Run mypy for Python 3.9
|
||||||
entry: python tools/pre_commit/mypy.py 1 "3.9"
|
entry: tools/mypy.sh 1 "3.9"
|
||||||
<<: *mypy_common
|
language: python
|
||||||
|
types: [python]
|
||||||
|
additional_dependencies: *mypy_deps
|
||||||
stages: [manual] # Only run in CI
|
stages: [manual] # Only run in CI
|
||||||
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||||
name: Run mypy for Python 3.10
|
name: Run mypy for Python 3.10
|
||||||
entry: python tools/pre_commit/mypy.py 1 "3.10"
|
entry: tools/mypy.sh 1 "3.10"
|
||||||
<<: *mypy_common
|
language: python
|
||||||
|
types: [python]
|
||||||
|
additional_dependencies: *mypy_deps
|
||||||
stages: [manual] # Only run in CI
|
stages: [manual] # Only run in CI
|
||||||
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||||
name: Run mypy for Python 3.11
|
name: Run mypy for Python 3.11
|
||||||
entry: python tools/pre_commit/mypy.py 1 "3.11"
|
entry: tools/mypy.sh 1 "3.11"
|
||||||
<<: *mypy_common
|
language: python
|
||||||
|
types: [python]
|
||||||
|
additional_dependencies: *mypy_deps
|
||||||
stages: [manual] # Only run in CI
|
stages: [manual] # Only run in CI
|
||||||
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||||
name: Run mypy for Python 3.12
|
name: Run mypy for Python 3.12
|
||||||
entry: python tools/pre_commit/mypy.py 1 "3.12"
|
entry: tools/mypy.sh 1 "3.12"
|
||||||
<<: *mypy_common
|
language: python
|
||||||
|
types: [python]
|
||||||
|
additional_dependencies: *mypy_deps
|
||||||
stages: [manual] # Only run in CI
|
stages: [manual] # Only run in CI
|
||||||
- id: shellcheck
|
- id: shellcheck
|
||||||
name: Lint shell scripts
|
name: Lint shell scripts
|
||||||
@ -149,15 +155,18 @@ repos:
|
|||||||
additional_dependencies: [regex]
|
additional_dependencies: [regex]
|
||||||
- id: check-pickle-imports
|
- id: check-pickle-imports
|
||||||
name: Prevent new pickle/cloudpickle imports
|
name: Prevent new pickle/cloudpickle imports
|
||||||
entry: python tools/pre_commit/check_pickle_imports.py
|
entry: python tools/check_pickle_imports.py
|
||||||
language: python
|
language: python
|
||||||
types: [python]
|
types: [python]
|
||||||
additional_dependencies: [regex]
|
pass_filenames: false
|
||||||
|
additional_dependencies: [pathspec, regex]
|
||||||
- id: validate-config
|
- id: validate-config
|
||||||
name: Validate configuration has default values and that each field has a docstring
|
name: Validate configuration has default values and that each field has a docstring
|
||||||
entry: python tools/validate_config.py
|
entry: python tools/validate_config.py
|
||||||
language: python
|
language: python
|
||||||
additional_dependencies: [regex]
|
types: [python]
|
||||||
|
pass_filenames: true
|
||||||
|
files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
|
||||||
# Keep `suggestion` last
|
# Keep `suggestion` last
|
||||||
- id: suggestion
|
- id: suggestion
|
||||||
name: Suggestion
|
name: Suggestion
|
||||||
|
|||||||
@ -13,7 +13,6 @@ build:
|
|||||||
|
|
||||||
mkdocs:
|
mkdocs:
|
||||||
configuration: mkdocs.yaml
|
configuration: mkdocs.yaml
|
||||||
fail_on_warning: true
|
|
||||||
|
|
||||||
# Optionally declare the Python requirements required to build your docs
|
# Optionally declare the Python requirements required to build your docs
|
||||||
python:
|
python:
|
||||||
|
|||||||
106
CMakeLists.txt
106
CMakeLists.txt
@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
|||||||
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
|
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
|
||||||
|
|
||||||
# Supported AMD GPU architectures.
|
# Supported AMD GPU architectures.
|
||||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
|
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
|
||||||
|
|
||||||
#
|
#
|
||||||
# Supported/expected torch versions for CUDA/ROCm.
|
# Supported/expected torch versions for CUDA/ROCm.
|
||||||
@ -86,9 +86,6 @@ find_package(Torch REQUIRED)
|
|||||||
# Supported NVIDIA architectures.
|
# Supported NVIDIA architectures.
|
||||||
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
|
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
|
||||||
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
|
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
|
||||||
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
|
|
||||||
set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
|
|
||||||
elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
|
|
||||||
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
|
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
|
||||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
|
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
|
||||||
else()
|
else()
|
||||||
@ -178,15 +175,6 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
|
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
#
|
|
||||||
# Set compression mode for CUDA >=13.x.
|
|
||||||
#
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA" AND
|
|
||||||
DEFINED CMAKE_CUDA_COMPILER_VERSION AND
|
|
||||||
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
|
|
||||||
list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Set CUDA include flags for CXX compiler.
|
# Set CUDA include flags for CXX compiler.
|
||||||
#
|
#
|
||||||
@ -269,8 +257,8 @@ set(VLLM_EXT_SRC
|
|||||||
"csrc/sampler.cu"
|
"csrc/sampler.cu"
|
||||||
"csrc/cuda_view.cu"
|
"csrc/cuda_view.cu"
|
||||||
"csrc/quantization/gptq/q_gemm.cu"
|
"csrc/quantization/gptq/q_gemm.cu"
|
||||||
"csrc/quantization/w8a8/int8/scaled_quant.cu"
|
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
||||||
"csrc/quantization/w8a8/fp8/common.cu"
|
"csrc/quantization/fp8/common.cu"
|
||||||
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
|
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
|
||||||
"csrc/quantization/gguf/gguf_kernel.cu"
|
"csrc/quantization/gguf/gguf_kernel.cu"
|
||||||
"csrc/quantization/activation_kernels.cu"
|
"csrc/quantization/activation_kernels.cu"
|
||||||
@ -282,7 +270,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
||||||
|
|
||||||
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
|
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
|
||||||
set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
|
set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
|
||||||
|
|
||||||
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
||||||
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
||||||
@ -314,13 +302,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
list(APPEND VLLM_EXT_SRC
|
list(APPEND VLLM_EXT_SRC
|
||||||
"csrc/quantization/awq/gemm_kernels.cu"
|
"csrc/quantization/awq/gemm_kernels.cu"
|
||||||
"csrc/permute_cols.cu"
|
"csrc/permute_cols.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
||||||
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
|
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
|
||||||
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
|
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
|
||||||
|
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
|
||||||
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
|
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
|
||||||
"csrc/cutlass_extensions/common.cpp"
|
"csrc/cutlass_extensions/common.cpp"
|
||||||
"csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
|
"csrc/quantization/fp8/per_token_group_quant.cu")
|
||||||
"csrc/quantization/w8a8/int8/per_token_group_quant.cu")
|
|
||||||
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${VLLM_EXT_SRC}"
|
SRCS "${VLLM_EXT_SRC}"
|
||||||
@ -424,11 +412,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||||
@ -452,16 +440,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
|
|
||||||
# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
|
# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
|
||||||
# CUDA 12.8 or later
|
# CUDA 12.8 or later
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
|
||||||
)
|
)
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
@ -486,16 +470,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
|
|
||||||
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
|
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
|
||||||
# require CUDA 12.8 or later
|
# require CUDA 12.8 or later
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
|
||||||
)
|
)
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
@ -526,7 +506,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# subtract out the archs that are already built for 3x
|
# subtract out the archs that are already built for 3x
|
||||||
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
|
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
|
||||||
if (SCALED_MM_2X_ARCHS)
|
if (SCALED_MM_2X_ARCHS)
|
||||||
set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
|
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
|
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
|
||||||
@ -570,11 +550,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
|
|
||||||
# The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
|
# The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
|
||||||
# CUDA 12.8 or later
|
# CUDA 12.8 or later
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||||
@ -593,11 +569,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
# FP4 Archs and flags
|
# FP4 Archs and flags
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||||
@ -619,11 +591,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
# CUTLASS MLA Archs and flags
|
# CUTLASS MLA Archs and flags
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
|
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
|
||||||
@ -649,7 +617,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# if it's possible to compile MoE kernels that use its output.
|
# if it's possible to compile MoE kernels that use its output.
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||||
@ -667,13 +635,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||||
@ -692,13 +656,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
# moe_data.cu is used by all CUTLASS MoE kernels.
|
# moe_data.cu is used by all CUTLASS MoE kernels.
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
|
||||||
set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
|
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
|
||||||
@ -715,13 +675,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||||
|
|||||||
@ -21,7 +21,6 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
|
|||||||
|
|
||||||
*Latest News* 🔥
|
*Latest News* 🔥
|
||||||
|
|
||||||
- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
|
|
||||||
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
|
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
|
||||||
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
|
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
|
||||||
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
|
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
|
||||||
|
|||||||
@ -103,15 +103,10 @@ start_server() {
|
|||||||
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
|
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
|
||||||
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
||||||
fi
|
fi
|
||||||
local server_pid=$!
|
|
||||||
|
|
||||||
# wait for 10 minutes...
|
# wait for 10 minutes...
|
||||||
server_started=0
|
server_started=0
|
||||||
for i in {1..60}; do
|
for i in {1..60}; do
|
||||||
# This line checks whether the server is still alive or not,
|
|
||||||
# since that we should always have permission to send signal to the server process.
|
|
||||||
kill -0 $server_pid 2> /dev/null || break
|
|
||||||
|
|
||||||
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
|
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
|
||||||
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
|
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
|
||||||
if [[ "$STATUS_CODE" -eq 200 ]]; then
|
if [[ "$STATUS_CODE" -eq 200 ]]; then
|
||||||
@ -123,7 +118,7 @@ start_server() {
|
|||||||
done
|
done
|
||||||
|
|
||||||
if (( ! server_started )); then
|
if (( ! server_started )); then
|
||||||
echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
|
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
|
||||||
return 1
|
return 1
|
||||||
else
|
else
|
||||||
return 0
|
return 0
|
||||||
|
|||||||
@ -1,31 +1,17 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import gc
|
import gc
|
||||||
import time
|
|
||||||
from unittest import mock
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from tabulate import tabulate
|
from tabulate import tabulate
|
||||||
|
|
||||||
from benchmark_utils import TimeCollector
|
from benchmark_utils import TimeCollector
|
||||||
from vllm.config import (
|
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
|
||||||
CacheConfig,
|
|
||||||
DeviceConfig,
|
|
||||||
LoadConfig,
|
|
||||||
ModelConfig,
|
|
||||||
ParallelConfig,
|
|
||||||
SchedulerConfig,
|
|
||||||
SpeculativeConfig,
|
|
||||||
VllmConfig,
|
|
||||||
)
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
|
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
|
||||||
from vllm.v1.worker.gpu_input_batch import InputBatch
|
|
||||||
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
|
|
||||||
|
|
||||||
|
|
||||||
def benchmark_propose(args):
|
def main(args):
|
||||||
rows = []
|
rows = []
|
||||||
for max_ngram in args.max_ngram:
|
for max_ngram in args.max_ngram:
|
||||||
collector = TimeCollector(TimeCollector.US)
|
collector = TimeCollector(TimeCollector.US)
|
||||||
@ -83,88 +69,10 @@ def benchmark_propose(args):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def benchmark_batched_propose(args):
|
|
||||||
NUM_SPECULATIVE_TOKENS_NGRAM = 10
|
|
||||||
PROMPT_LOOKUP_MIN = 5
|
|
||||||
PROMPT_LOOKUP_MAX = 15
|
|
||||||
MAX_MODEL_LEN = int(1e7)
|
|
||||||
DEVICE = current_platform.device_type
|
|
||||||
|
|
||||||
model_config = ModelConfig(model="facebook/opt-125m", runner="generate")
|
|
||||||
|
|
||||||
speculative_config = SpeculativeConfig(
|
|
||||||
target_model_config=model_config,
|
|
||||||
target_parallel_config=ParallelConfig(),
|
|
||||||
method="ngram",
|
|
||||||
num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
|
|
||||||
prompt_lookup_max=PROMPT_LOOKUP_MAX,
|
|
||||||
prompt_lookup_min=PROMPT_LOOKUP_MIN,
|
|
||||||
)
|
|
||||||
|
|
||||||
vllm_config = VllmConfig(
|
|
||||||
model_config=model_config,
|
|
||||||
cache_config=CacheConfig(),
|
|
||||||
speculative_config=speculative_config,
|
|
||||||
device_config=DeviceConfig(device=current_platform.device_type),
|
|
||||||
parallel_config=ParallelConfig(),
|
|
||||||
load_config=LoadConfig(),
|
|
||||||
scheduler_config=SchedulerConfig(),
|
|
||||||
)
|
|
||||||
|
|
||||||
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
|
|
||||||
mock_pp_group = mock.MagicMock()
|
|
||||||
mock_pp_group.world_size = 1
|
|
||||||
with mock.patch(
|
|
||||||
"vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
|
|
||||||
):
|
|
||||||
runner = GPUModelRunner(vllm_config, DEVICE)
|
|
||||||
|
|
||||||
# hack max model len
|
|
||||||
runner.max_model_len = MAX_MODEL_LEN
|
|
||||||
runner.drafter.max_model_len = MAX_MODEL_LEN
|
|
||||||
|
|
||||||
dummy_input_batch = InputBatch(
|
|
||||||
max_num_reqs=args.num_req,
|
|
||||||
max_model_len=MAX_MODEL_LEN,
|
|
||||||
max_num_batched_tokens=args.num_req * args.num_token,
|
|
||||||
device=DEVICE,
|
|
||||||
pin_memory=False,
|
|
||||||
vocab_size=256000,
|
|
||||||
block_sizes=[16],
|
|
||||||
)
|
|
||||||
dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
|
|
||||||
dummy_input_batch.spec_decode_unsupported_reqs = ()
|
|
||||||
dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
|
|
||||||
dummy_input_batch.token_ids_cpu = np.random.randint(
|
|
||||||
0, 20, (args.num_req, args.num_token)
|
|
||||||
)
|
|
||||||
|
|
||||||
runner.input_batch = dummy_input_batch
|
|
||||||
|
|
||||||
sampled_token_ids = [[0]] * args.num_req
|
|
||||||
|
|
||||||
print("Starting benchmark")
|
|
||||||
# first run is warmup so ignore it
|
|
||||||
for _ in range(args.num_iteration):
|
|
||||||
start = time.time()
|
|
||||||
runner.drafter.propose(
|
|
||||||
sampled_token_ids,
|
|
||||||
dummy_input_batch.req_ids,
|
|
||||||
dummy_input_batch.num_tokens_no_spec,
|
|
||||||
dummy_input_batch.token_ids_cpu,
|
|
||||||
dummy_input_batch.spec_decode_unsupported_reqs,
|
|
||||||
)
|
|
||||||
end = time.time()
|
|
||||||
print(f"Iteration time (s): {end - start}")
|
|
||||||
|
|
||||||
|
|
||||||
def invoke_main() -> None:
|
def invoke_main() -> None:
|
||||||
parser = FlexibleArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description="Benchmark the performance of N-gram speculative decode drafting"
|
description="Benchmark the performance of N-gram speculative decode drafting"
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
|
||||||
"--batched", action="store_true", help="consider time to prepare batch"
|
|
||||||
) # noqa: E501
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--num-iteration",
|
"--num-iteration",
|
||||||
type=int,
|
type=int,
|
||||||
@ -197,17 +105,8 @@ def invoke_main() -> None:
|
|||||||
help="Number of speculative tokens to generate",
|
help="Number of speculative tokens to generate",
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
main(args)
|
||||||
if not args.batched:
|
|
||||||
benchmark_propose(args)
|
|
||||||
else:
|
|
||||||
benchmark_batched_propose(args)
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
# Example command lines:
|
|
||||||
# time python3 benchmarks/benchmark_ngram_proposer.py
|
|
||||||
# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
|
|
||||||
""" # noqa: E501
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
invoke_main() # pragma: no cover
|
invoke_main() # pragma: no cover
|
||||||
|
|||||||
@ -449,8 +449,7 @@ async def benchmark(
|
|||||||
def prepare_extra_body(request) -> dict:
|
def prepare_extra_body(request) -> dict:
|
||||||
extra_body = {}
|
extra_body = {}
|
||||||
# Add the schema to the extra_body
|
# Add the schema to the extra_body
|
||||||
extra_body["structured_outputs"] = {}
|
extra_body[request.structure_type] = request.schema
|
||||||
extra_body["structured_outputs"][request.structure_type] = request.schema
|
|
||||||
return extra_body
|
return extra_body
|
||||||
|
|
||||||
print("Starting initial single prompt test run...")
|
print("Starting initial single prompt test run...")
|
||||||
@ -697,11 +696,11 @@ def evaluate(ret, args):
|
|||||||
return re.match(args.regex, actual) is not None
|
return re.match(args.regex, actual) is not None
|
||||||
|
|
||||||
def _eval_correctness(expected, actual):
|
def _eval_correctness(expected, actual):
|
||||||
if args.structure_type == "json":
|
if args.structure_type == "guided_json":
|
||||||
return _eval_correctness_json(expected, actual)
|
return _eval_correctness_json(expected, actual)
|
||||||
elif args.structure_type == "regex":
|
elif args.structure_type == "guided_regex":
|
||||||
return _eval_correctness_regex(expected, actual)
|
return _eval_correctness_regex(expected, actual)
|
||||||
elif args.structure_type == "choice":
|
elif args.structure_type == "guided_choice":
|
||||||
return _eval_correctness_choice(expected, actual)
|
return _eval_correctness_choice(expected, actual)
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
@ -781,18 +780,18 @@ def main(args: argparse.Namespace):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if args.dataset == "grammar":
|
if args.dataset == "grammar":
|
||||||
args.structure_type = "grammar"
|
args.structure_type = "guided_grammar"
|
||||||
elif args.dataset == "regex":
|
elif args.dataset == "regex":
|
||||||
args.structure_type = "regex"
|
args.structure_type = "guided_regex"
|
||||||
elif args.dataset == "choice":
|
elif args.dataset == "choice":
|
||||||
args.structure_type = "choice"
|
args.structure_type = "guided_choice"
|
||||||
else:
|
else:
|
||||||
args.structure_type = "json"
|
args.structure_type = "guided_json"
|
||||||
|
|
||||||
if args.no_structured_output:
|
if args.no_structured_output:
|
||||||
args.structured_output_ratio = 0
|
args.structured_output_ratio = 0
|
||||||
if args.save_results:
|
if args.save_results:
|
||||||
result_file_name = f"{args.structured_output_ratio}so"
|
result_file_name = f"{args.structured_output_ratio}guided"
|
||||||
result_file_name += f"_{backend}"
|
result_file_name += f"_{backend}"
|
||||||
result_file_name += f"_{args.request_rate}qps"
|
result_file_name += f"_{args.request_rate}qps"
|
||||||
result_file_name += f"_{args.model.split('/')[-1]}"
|
result_file_name += f"_{args.model.split('/')[-1]}"
|
||||||
|
|||||||
@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES
|
|||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||||
w8a8_triton_block_scaled_mm,
|
w8a8_block_fp8_matmul,
|
||||||
)
|
)
|
||||||
from vllm.utils import FlexibleArgumentParser, cdiv
|
from vllm.utils import FlexibleArgumentParser, cdiv
|
||||||
|
|
||||||
@ -158,7 +158,7 @@ def bench_fp8(
|
|||||||
"cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
|
"cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
|
||||||
a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
|
a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
|
||||||
),
|
),
|
||||||
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
|
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
|
||||||
a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
|
a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
|
||||||
),
|
),
|
||||||
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
|
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
|
||||||
|
|||||||
@ -55,7 +55,9 @@ benchmark() {
|
|||||||
output_len=$2
|
output_len=$2
|
||||||
|
|
||||||
|
|
||||||
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=0 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8100 \
|
--port 8100 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--gpu-memory-utilization 0.6 \
|
--gpu-memory-utilization 0.6 \
|
||||||
@ -63,7 +65,9 @@ benchmark() {
|
|||||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||||
|
|
||||||
|
|
||||||
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=1 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8200 \
|
--port 8200 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--gpu-memory-utilization 0.6 \
|
--gpu-memory-utilization 0.6 \
|
||||||
|
|||||||
@ -38,12 +38,16 @@ wait_for_server() {
|
|||||||
launch_chunked_prefill() {
|
launch_chunked_prefill() {
|
||||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||||
# disagg prefill
|
# disagg prefill
|
||||||
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=0 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8100 \
|
--port 8100 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--enable-chunked-prefill \
|
--enable-chunked-prefill \
|
||||||
--gpu-memory-utilization 0.6 &
|
--gpu-memory-utilization 0.6 &
|
||||||
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=1 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8200 \
|
--port 8200 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--enable-chunked-prefill \
|
--enable-chunked-prefill \
|
||||||
@ -58,14 +62,18 @@ launch_chunked_prefill() {
|
|||||||
launch_disagg_prefill() {
|
launch_disagg_prefill() {
|
||||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||||
# disagg prefill
|
# disagg prefill
|
||||||
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=0 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8100 \
|
--port 8100 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--gpu-memory-utilization 0.6 \
|
--gpu-memory-utilization 0.6 \
|
||||||
--kv-transfer-config \
|
--kv-transfer-config \
|
||||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||||
|
|
||||||
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=1 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8200 \
|
--port 8200 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--gpu-memory-utilization 0.6 \
|
--gpu-memory-utilization 0.6 \
|
||||||
|
|||||||
@ -3,7 +3,6 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import copy
|
import copy
|
||||||
import itertools
|
import itertools
|
||||||
import os
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from weight_shapes import WEIGHT_SHAPES
|
from weight_shapes import WEIGHT_SHAPES
|
||||||
@ -24,45 +23,21 @@ PROVIDER_CFGS = {
|
|||||||
"torch-bf16": dict(enabled=True),
|
"torch-bf16": dict(enabled=True),
|
||||||
"nvfp4": dict(no_a_quant=False, enabled=True),
|
"nvfp4": dict(no_a_quant=False, enabled=True),
|
||||||
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
|
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
|
||||||
"fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
|
|
||||||
"fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
_needs_fbgemm = any(
|
|
||||||
v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
|
|
||||||
)
|
|
||||||
if _needs_fbgemm:
|
|
||||||
try:
|
|
||||||
from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
|
|
||||||
triton_scale_nvfp4_quant,
|
|
||||||
)
|
|
||||||
except ImportError:
|
|
||||||
print(
|
|
||||||
"WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
|
|
||||||
"These providers will be skipped. Please install fbgemm_gpu with: "
|
|
||||||
"'pip install fbgemm-gpu-genai' to run them."
|
|
||||||
)
|
|
||||||
# Disable FBGEMM providers so the benchmark can run.
|
|
||||||
for cfg in PROVIDER_CFGS.values():
|
|
||||||
if cfg.get("fbgemm"):
|
|
||||||
cfg["enabled"] = False
|
|
||||||
|
|
||||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
||||||
|
|
||||||
|
|
||||||
def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg):
|
def _quant_weight_nvfp4(b: torch.Tensor, device: str):
|
||||||
# Compute global scale for weight
|
# Compute global scale for weight
|
||||||
b_amax = torch.abs(b).max().to(torch.float32)
|
b_amax = torch.abs(b).max().to(torch.float32)
|
||||||
b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
|
b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
|
||||||
if "fbgemm" in cfg and cfg["fbgemm"]:
|
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
|
||||||
b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
|
|
||||||
else:
|
|
||||||
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
|
|
||||||
return b_fp4, scale_b_fp4, b_global_scale
|
return b_fp4, scale_b_fp4, b_global_scale
|
||||||
|
|
||||||
|
|
||||||
def build_nvfp4_runner(cfg, a, b, dtype, device):
|
def build_nvfp4_runner(cfg, a, b, dtype, device):
|
||||||
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg)
|
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)
|
||||||
|
|
||||||
# Compute global scale for activation
|
# Compute global scale for activation
|
||||||
# NOTE: This is generally provided ahead-of-time by the model checkpoint.
|
# NOTE: This is generally provided ahead-of-time by the model checkpoint.
|
||||||
@ -71,35 +46,6 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):
|
|||||||
|
|
||||||
# Alpha for the GEMM operation
|
# Alpha for the GEMM operation
|
||||||
alpha = 1.0 / (a_global_scale * b_global_scale)
|
alpha = 1.0 / (a_global_scale * b_global_scale)
|
||||||
if "fbgemm" in cfg and cfg["fbgemm"]:
|
|
||||||
if cfg["no_a_quant"]:
|
|
||||||
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
|
|
||||||
|
|
||||||
def run():
|
|
||||||
return torch.ops.fbgemm.f4f4bf16(
|
|
||||||
a_fp4,
|
|
||||||
b_fp4,
|
|
||||||
scale_a_fp4,
|
|
||||||
scale_b_fp4,
|
|
||||||
global_scale=alpha,
|
|
||||||
use_mx=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
return run
|
|
||||||
else:
|
|
||||||
|
|
||||||
def run():
|
|
||||||
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
|
|
||||||
return torch.ops.fbgemm.f4f4bf16(
|
|
||||||
a_fp4,
|
|
||||||
b_fp4,
|
|
||||||
scale_a_fp4,
|
|
||||||
scale_b_fp4,
|
|
||||||
global_scale=alpha,
|
|
||||||
use_mx=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
return run
|
|
||||||
|
|
||||||
if cfg["no_a_quant"]:
|
if cfg["no_a_quant"]:
|
||||||
# Pre-quantize activation
|
# Pre-quantize activation
|
||||||
@ -184,13 +130,10 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
for K, N, model in prepare_shapes(args):
|
for K, N, model in prepare_shapes(args):
|
||||||
print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
|
print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
|
||||||
save_dir = f"bench_nvfp4_res_n{N}_k{K}"
|
|
||||||
os.makedirs(save_dir, exist_ok=True)
|
|
||||||
|
|
||||||
benchmark.run(
|
benchmark.run(
|
||||||
print_data=True,
|
print_data=True,
|
||||||
show_plots=True,
|
show_plots=True,
|
||||||
save_path=save_dir,
|
save_path=f"bench_nvfp4_res_n{N}_k{K}",
|
||||||
N=N,
|
N=N,
|
||||||
K=K,
|
K=K,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -51,7 +51,7 @@ def calculate_diff(
|
|||||||
):
|
):
|
||||||
"""Calculate the difference between Inductor and CUDA implementations."""
|
"""Calculate the difference between Inductor and CUDA implementations."""
|
||||||
device = torch.device("cuda")
|
device = torch.device("cuda")
|
||||||
x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device)
|
x = torch.rand((batch_size * hidden_size, 4096), dtype=dtype, device=device)
|
||||||
|
|
||||||
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)
|
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)
|
||||||
|
|
||||||
@ -59,25 +59,23 @@ def calculate_diff(
|
|||||||
torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
|
torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
|
||||||
cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
|
cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
|
||||||
|
|
||||||
try:
|
out_allclose = lambda o1, o2: torch.allclose(
|
||||||
torch.testing.assert_close(
|
o1.to(torch.float32),
|
||||||
cuda_out.to(torch.float32),
|
o2.to(torch.float32),
|
||||||
torch_out.to(torch.float32),
|
rtol=1e-3,
|
||||||
rtol=1e-3,
|
atol=1e-5,
|
||||||
atol=1e-5,
|
)
|
||||||
)
|
scale_allclose = lambda s1, s2: torch.allclose(s1, s2, rtol=1e-3, atol=1e-5)
|
||||||
torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
|
|
||||||
torch.testing.assert_close(
|
if (
|
||||||
cuda_out.to(torch.float32),
|
out_allclose(cuda_out, torch_out)
|
||||||
torch_eager_out.to(torch.float32),
|
and scale_allclose(cuda_scale, torch_scale)
|
||||||
rtol=1e-3,
|
and out_allclose(cuda_out, torch_eager_out)
|
||||||
atol=1e-5,
|
and scale_allclose(cuda_scale, torch_eager_scale)
|
||||||
)
|
):
|
||||||
torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
|
|
||||||
print("✅ All implementations match")
|
print("✅ All implementations match")
|
||||||
except AssertionError as e:
|
else:
|
||||||
print("❌ Implementations differ")
|
print("❌ Implementations differ")
|
||||||
print(e)
|
|
||||||
|
|
||||||
|
|
||||||
configs = []
|
configs = []
|
||||||
@ -93,7 +91,7 @@ def benchmark_quantization(
|
|||||||
):
|
):
|
||||||
device = torch.device("cuda")
|
device = torch.device("cuda")
|
||||||
|
|
||||||
x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype)
|
x = torch.randn(batch_size * hidden_size, 4096, device=device, dtype=dtype)
|
||||||
|
|
||||||
quantiles = [0.5, 0.2, 0.8]
|
quantiles = [0.5, 0.2, 0.8]
|
||||||
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
|
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
|
||||||
@ -159,21 +157,21 @@ if __name__ == "__main__":
|
|||||||
)
|
)
|
||||||
parser.add_argument("-c", "--check", action="store_true")
|
parser.add_argument("-c", "--check", action="store_true")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
|
"--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--hidden-sizes",
|
"--hidden-sizes",
|
||||||
type=int,
|
type=int,
|
||||||
nargs="+",
|
nargs="+",
|
||||||
default=[896, 1024, 2048, 4096, 7168],
|
default=None,
|
||||||
help="Hidden sizes to benchmark",
|
help="Hidden sizes to benchmark (default: 1,16,64,128,256,512,1024,2048,4096)",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--batch-sizes",
|
"--batch-sizes",
|
||||||
type=int,
|
type=int,
|
||||||
nargs="+",
|
nargs="+",
|
||||||
default=[1, 16, 128, 512, 1024],
|
default=None,
|
||||||
help="Batch sizes to benchmark",
|
help="Batch sizes to benchmark (default: 1,16,32,64,128)",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--group-sizes",
|
"--group-sizes",
|
||||||
@ -194,8 +192,8 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
|
dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
|
||||||
|
|
||||||
hidden_sizes = args.hidden_sizes
|
hidden_sizes = args.hidden_sizes or [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
|
||||||
batch_sizes = args.batch_sizes
|
batch_sizes = args.batch_sizes or [1, 16, 32, 64, 128]
|
||||||
|
|
||||||
if args.group_sizes is not None:
|
if args.group_sizes is not None:
|
||||||
group_shapes = []
|
group_shapes = []
|
||||||
|
|||||||
@ -1,406 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
"""
|
|
||||||
Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe
|
|
||||||
kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
|
|
||||||
but use different quantization strategies and backends.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import nvtx
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
|
||||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
|
||||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
|
|
||||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.utils import FlexibleArgumentParser
|
|
||||||
|
|
||||||
# Weight shapes for different models: [num_experts, topk, hidden_size,
|
|
||||||
# intermediate_size]
|
|
||||||
WEIGHT_SHAPES_MOE = {
|
|
||||||
"mixtral-8x7b": [
|
|
||||||
[8, 2, 4096, 14336],
|
|
||||||
],
|
|
||||||
"deepseek-v2": [
|
|
||||||
[160, 6, 5120, 12288],
|
|
||||||
],
|
|
||||||
"custom-small": [
|
|
||||||
[8, 2, 2048, 7168],
|
|
||||||
],
|
|
||||||
"glm45-fp8": [
|
|
||||||
[128, 8, 4096, 1408],
|
|
||||||
],
|
|
||||||
"Llama-4-Maverick-17B-128E-Instruct-FP8": [
|
|
||||||
[128, 1, 5120, 8192],
|
|
||||||
],
|
|
||||||
}
|
|
||||||
|
|
||||||
DEFAULT_MODELS = [
|
|
||||||
"mixtral-8x7b",
|
|
||||||
]
|
|
||||||
|
|
||||||
DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
|
|
||||||
DEFAULT_TP_SIZES = [1]
|
|
||||||
|
|
||||||
PER_ACT_TOKEN_OPTS = [False, True]
|
|
||||||
PER_OUT_CH_OPTS = [False, True]
|
|
||||||
|
|
||||||
FP8_DTYPE = current_platform.fp8_dtype()
|
|
||||||
|
|
||||||
|
|
||||||
def bench_run(
|
|
||||||
results: list,
|
|
||||||
model: str,
|
|
||||||
num_experts: int,
|
|
||||||
topk: int,
|
|
||||||
per_act_token: bool,
|
|
||||||
per_out_ch: bool,
|
|
||||||
mkn: tuple[int, int, int],
|
|
||||||
):
|
|
||||||
(m, k, n) = mkn
|
|
||||||
|
|
||||||
dtype = torch.half
|
|
||||||
device = "cuda"
|
|
||||||
|
|
||||||
# Create input activations
|
|
||||||
a = torch.randn((m, k), device=device, dtype=dtype) / 10
|
|
||||||
|
|
||||||
# Create weights
|
|
||||||
w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
|
|
||||||
w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10
|
|
||||||
|
|
||||||
# Create FP8 quantized weights and scales for both kernels
|
|
||||||
w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE)
|
|
||||||
w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE)
|
|
||||||
|
|
||||||
# Create scales based on quantization strategy
|
|
||||||
if per_out_ch:
|
|
||||||
# Per-channel quantization
|
|
||||||
w1_scale = torch.empty(
|
|
||||||
(num_experts, 2 * n, 1), device=device, dtype=torch.float32
|
|
||||||
)
|
|
||||||
w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32)
|
|
||||||
else:
|
|
||||||
# Per-tensor quantization
|
|
||||||
w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
|
|
||||||
w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
|
|
||||||
|
|
||||||
# Quantize weights
|
|
||||||
for expert in range(num_experts):
|
|
||||||
if per_out_ch:
|
|
||||||
# Per-channel quantization - not yet implemented properly
|
|
||||||
# For now, fall back to per-tensor quantization
|
|
||||||
w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
|
|
||||||
w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
|
|
||||||
# Expand scalar scales to the expected per-channel shape
|
|
||||||
w1_scale[expert] = w1_scale_temp.expand(2 * n, 1)
|
|
||||||
w2_scale[expert] = w2_scale_temp.expand(k, 1)
|
|
||||||
else:
|
|
||||||
# Per-tensor quantization
|
|
||||||
w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
|
|
||||||
w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
|
|
||||||
# Store scalar scales in [1, 1] tensors
|
|
||||||
w1_scale[expert, 0, 0] = w1_scale_temp
|
|
||||||
w2_scale[expert, 0, 0] = w2_scale_temp
|
|
||||||
|
|
||||||
# Prepare weights for CUTLASS (no transpose needed)
|
|
||||||
w1_fp8q_cutlass = w1_fp8q # Keep original [E, 2N, K]
|
|
||||||
w2_fp8q_cutlass = w2_fp8q # Keep original [E, K, N]
|
|
||||||
|
|
||||||
# Create router scores and get topk
|
|
||||||
score = torch.randn((m, num_experts), device=device, dtype=dtype)
|
|
||||||
topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
|
|
||||||
|
|
||||||
# WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization
|
|
||||||
# Force per-tensor quantization for all cases to match working e2e setup
|
|
||||||
a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
|
|
||||||
a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
|
|
||||||
|
|
||||||
# Force per-tensor quantization for all cases
|
|
||||||
per_act_token = False
|
|
||||||
|
|
||||||
# Create stride tensors for CUTLASS
|
|
||||||
ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
|
|
||||||
ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
|
|
||||||
c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
|
|
||||||
c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
|
|
||||||
|
|
||||||
def run_triton_moe(
|
|
||||||
a: torch.Tensor,
|
|
||||||
w1: torch.Tensor,
|
|
||||||
w2: torch.Tensor,
|
|
||||||
topk_weights: torch.Tensor,
|
|
||||||
topk_ids: torch.Tensor,
|
|
||||||
w1_scale: torch.Tensor,
|
|
||||||
w2_scale: torch.Tensor,
|
|
||||||
a1_scale: torch.Tensor,
|
|
||||||
a2_scale: torch.Tensor,
|
|
||||||
num_repeats: int,
|
|
||||||
):
|
|
||||||
quant_config = fp8_w8a8_moe_quant_config(
|
|
||||||
w1_scale=w1_scale,
|
|
||||||
w2_scale=w2_scale,
|
|
||||||
a1_scale=a1_scale,
|
|
||||||
a2_scale=a2_scale,
|
|
||||||
per_act_token_quant=per_act_token,
|
|
||||||
per_out_ch_quant=per_out_ch,
|
|
||||||
)
|
|
||||||
|
|
||||||
for _ in range(num_repeats):
|
|
||||||
fused_experts(
|
|
||||||
a,
|
|
||||||
w1,
|
|
||||||
w2,
|
|
||||||
topk_weights,
|
|
||||||
topk_ids,
|
|
||||||
quant_config=quant_config,
|
|
||||||
)
|
|
||||||
|
|
||||||
def run_cutlass_moe_fp8(
|
|
||||||
a: torch.Tensor,
|
|
||||||
w1: torch.Tensor,
|
|
||||||
w2: torch.Tensor,
|
|
||||||
topk_weights: torch.Tensor,
|
|
||||||
topk_ids: torch.Tensor,
|
|
||||||
ab_strides1: torch.Tensor,
|
|
||||||
ab_strides2: torch.Tensor,
|
|
||||||
c_strides1: torch.Tensor,
|
|
||||||
c_strides2: torch.Tensor,
|
|
||||||
w1_scale: torch.Tensor,
|
|
||||||
w2_scale: torch.Tensor,
|
|
||||||
a1_scale: torch.Tensor,
|
|
||||||
a2_scale: torch.Tensor,
|
|
||||||
num_repeats: int,
|
|
||||||
):
|
|
||||||
quant_config = fp8_w8a8_moe_quant_config(
|
|
||||||
w1_scale=w1_scale,
|
|
||||||
w2_scale=w2_scale,
|
|
||||||
a1_scale=a1_scale,
|
|
||||||
a2_scale=a2_scale,
|
|
||||||
per_act_token_quant=per_act_token,
|
|
||||||
per_out_ch_quant=per_out_ch,
|
|
||||||
)
|
|
||||||
|
|
||||||
for _ in range(num_repeats):
|
|
||||||
with nvtx.annotate("cutlass_moe_fp8", color="blue"):
|
|
||||||
cutlass_moe_fp8(
|
|
||||||
a=a,
|
|
||||||
w1_q=w1,
|
|
||||||
w2_q=w2,
|
|
||||||
topk_weights=topk_weights,
|
|
||||||
topk_ids=topk_ids,
|
|
||||||
ab_strides1=ab_strides1,
|
|
||||||
ab_strides2=ab_strides2,
|
|
||||||
c_strides1=c_strides1,
|
|
||||||
c_strides2=c_strides2,
|
|
||||||
quant_config=quant_config,
|
|
||||||
activation="silu",
|
|
||||||
global_num_experts=num_experts,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Pre-create quantization config to avoid creating it inside CUDA graph
|
|
||||||
quant_config = fp8_w8a8_moe_quant_config(
|
|
||||||
w1_scale=w1_scale,
|
|
||||||
w2_scale=w2_scale,
|
|
||||||
a1_scale=a1_scale,
|
|
||||||
a2_scale=a2_scale,
|
|
||||||
per_act_token_quant=per_act_token,
|
|
||||||
per_out_ch_quant=per_out_ch,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
|
|
||||||
cutlass_stream = torch.cuda.Stream()
|
|
||||||
cutlass_graph = torch.cuda.CUDAGraph()
|
|
||||||
with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
|
|
||||||
# Capture 10 invocations like benchmark_moe.py
|
|
||||||
for _ in range(10):
|
|
||||||
cutlass_moe_fp8(
|
|
||||||
a=a,
|
|
||||||
w1_q=w1_fp8q_cutlass,
|
|
||||||
w2_q=w2_fp8q_cutlass,
|
|
||||||
topk_weights=topk_weights,
|
|
||||||
topk_ids=topk_ids,
|
|
||||||
ab_strides1=ab_strides1,
|
|
||||||
ab_strides2=ab_strides2,
|
|
||||||
c_strides1=c_strides1,
|
|
||||||
c_strides2=c_strides2,
|
|
||||||
quant_config=quant_config,
|
|
||||||
activation="silu",
|
|
||||||
global_num_experts=num_experts,
|
|
||||||
)
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
|
|
||||||
triton_stream = torch.cuda.Stream()
|
|
||||||
triton_graph = torch.cuda.CUDAGraph()
|
|
||||||
with torch.cuda.graph(triton_graph, stream=triton_stream):
|
|
||||||
# Capture 10 invocations like benchmark_moe.py
|
|
||||||
for _ in range(10):
|
|
||||||
fused_experts(
|
|
||||||
a,
|
|
||||||
w1_fp8q,
|
|
||||||
w2_fp8q,
|
|
||||||
topk_weights,
|
|
||||||
topk_ids,
|
|
||||||
quant_config=quant_config,
|
|
||||||
)
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
|
|
||||||
"""Benchmark CUDA graph using events like benchmark_moe.py"""
|
|
||||||
# Warmup
|
|
||||||
for _ in range(num_warmup):
|
|
||||||
graph.replay()
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
# Timing
|
|
||||||
start_event = torch.cuda.Event(enable_timing=True)
|
|
||||||
end_event = torch.cuda.Event(enable_timing=True)
|
|
||||||
|
|
||||||
latencies = []
|
|
||||||
for _ in range(num_iters):
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
start_event.record()
|
|
||||||
graph.replay()
|
|
||||||
end_event.record()
|
|
||||||
end_event.synchronize()
|
|
||||||
latencies.append(start_event.elapsed_time(end_event))
|
|
||||||
|
|
||||||
# Divide by 10 since graph contains 10 calls
|
|
||||||
return sum(latencies) / (num_iters * 10)
|
|
||||||
|
|
||||||
# Benchmark parameters
|
|
||||||
num_warmup = 5
|
|
||||||
num_iters = 100
|
|
||||||
|
|
||||||
# Benchmark only CUDA graphs (more reliable and faster)
|
|
||||||
# Benchmark Triton MoE with CUDA graphs
|
|
||||||
triton_graph_time = bench_cuda_graph(
|
|
||||||
triton_graph, num_warmup=num_warmup, num_iters=num_iters
|
|
||||||
)
|
|
||||||
|
|
||||||
# Benchmark CUTLASS MoE with CUDA graphs
|
|
||||||
cutlass_graph_time = bench_cuda_graph(
|
|
||||||
cutlass_graph, num_warmup=num_warmup, num_iters=num_iters
|
|
||||||
)
|
|
||||||
|
|
||||||
# Convert ms to us and return results
|
|
||||||
triton_time_us = triton_graph_time * 1000
|
|
||||||
cutlass_time_us = cutlass_graph_time * 1000
|
|
||||||
|
|
||||||
return {
|
|
||||||
"batch_size": m,
|
|
||||||
"triton_time_us": triton_time_us,
|
|
||||||
"cutlass_time_us": cutlass_time_us,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def main(args):
    """Sweep the requested models/shapes and print Triton vs CUTLASS timings.

    For every model in ``args.models``, every TP size in ``args.tp_sizes`` and
    every layer entry of ``WEIGHT_SHAPES_MOE[model]``, the entry is read as
    ``(num_experts, topk, size_k, size_n)`` and ``size_n`` is integer-divided
    by the TP size. ``args.limit_k`` / ``args.limit_n`` filter shapes; an empty
    list disables the corresponding filter. Each surviving shape is benchmarked
    for every combination of ``args.per_act_token_opts`` and
    ``args.per_out_ch_opts`` and for every batch size in ``args.batch_sizes``.

    Args:
        args: Parsed command-line namespace providing the attributes above.
    """
    print("Benchmarking models:")
    for i, model in enumerate(args.models):
        print(f"[{i}] {model}")

    all_results = []

    for model in args.models:
        for tp in args.tp_sizes:
            for layer in WEIGHT_SHAPES_MOE[model]:
                num_experts = layer[0]
                topk = layer[1]
                size_k = layer[2]
                size_n = layer[3] // tp

                if len(args.limit_k) > 0 and size_k not in args.limit_k:
                    continue

                if len(args.limit_n) > 0 and size_n not in args.limit_n:
                    continue

                for per_act_token in args.per_act_token_opts:
                    for per_out_ch in args.per_out_ch_opts:
                        # The two f-strings are concatenated verbatim, so the
                        # separator after "topk=..." must be spelled out here.
                        print(
                            f"\n=== {model}, experts={num_experts}, topk={topk}, "
                            f"per_act={per_act_token}, per_out_ch={per_out_ch} ==="
                        )

                        config_results = []
                        for size_m in args.batch_sizes:
                            mkn = (size_m, size_k, size_n)
                            result = bench_run(
                                [],  # Not used anymore
                                model,
                                num_experts,
                                topk,
                                per_act_token,
                                per_out_ch,
                                mkn,
                            )
                            # Falsy results (e.g. None) are left out of the table.
                            if result:
                                config_results.append(result)

                        # Print results table for this configuration
                        if config_results:
                            print(
                                f"\n{'Batch Size':<12}"
                                f"{'Triton (us)':<15}"
                                f"{'CUTLASS (us)':<15}"
                            )
                            print("-" * 45)
                            for result in config_results:
                                print(
                                    f"{result['batch_size']:<12}"
                                    f"{result['triton_time_us']:<15.2f}"
                                    f"{result['cutlass_time_us']:<15.2f}"
                                )

                        all_results.extend(config_results)

    print(f"\nTotal benchmarks completed: {len(all_results)}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Command-line entry point: declare the sweep options, parse them, and
    # hand the namespace to main().

    def _str_to_bool(text):
        """Parse a true/false command-line token.

        Raises ValueError on anything unrecognised so that argparse reports a
        usage error instead of silently treating a typo as False.
        """
        lowered = text.lower()
        if lowered in ("true", "1", "yes"):
            return True
        if lowered in ("false", "0", "no"):
            return False
        raise ValueError(f"expected true/false, got {text!r}")

    # The example below uses the exact flag names declared further down
    # (--models / --batch-sizes).
    parser = FlexibleArgumentParser(
        description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE
    across specified models/shapes/batches

    Example usage:
    python benchmark_cutlass_moe_fp8.py  \
        --models "Llama-4-Maverick-17B-128E-Instruct-FP8"  \
        --tp-sizes 8 \
        --batch-sizes 2 4 8  \
        --per-act-token-opts false \
        --per-out-ch-opts false

    """
    )
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES_MOE.keys(),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
    parser.add_argument(
        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
    )
    # Empty lists mean "no filtering" in main().
    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
    parser.add_argument(
        "--per-act-token-opts",
        nargs="+",
        type=_str_to_bool,
        default=[False, True],
        help="Per-activation token quantization options (true/false)",
    )
    parser.add_argument(
        "--per-out-ch-opts",
        nargs="+",
        type=_str_to_bool,
        default=[False, True],
        help="Per-output channel quantization options (true/false)",
    )

    args = parser.parse_args()
    main(args)
|
|
||||||
@ -7,10 +7,6 @@ Benchmark script for device communicators:
|
|||||||
CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
|
CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
|
||||||
and SymmMemCommunicator (multimem, two-shot).
|
and SymmMemCommunicator (multimem, two-shot).
|
||||||
|
|
||||||
for NCCL symmetric memory you need to set the environment variables
|
|
||||||
NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1, otherwise NCCL does
|
|
||||||
not use fast NVLS implementation for all reduce.
|
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]
|
torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]
|
||||||
|
|
||||||
@ -30,13 +26,7 @@ import torch.distributed as dist
|
|||||||
from torch.distributed import ProcessGroup
|
from torch.distributed import ProcessGroup
|
||||||
|
|
||||||
from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
|
from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
|
||||||
from vllm.distributed.device_communicators.pynccl import (
|
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
|
||||||
PyNcclCommunicator,
|
|
||||||
register_nccl_symmetric_ops,
|
|
||||||
)
|
|
||||||
from vllm.distributed.device_communicators.pynccl_allocator import (
|
|
||||||
set_graph_pool_id,
|
|
||||||
)
|
|
||||||
from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
|
from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
@ -108,7 +98,6 @@ class CommunicatorBenchmark:
|
|||||||
)
|
)
|
||||||
if not self.pynccl_comm.disabled:
|
if not self.pynccl_comm.disabled:
|
||||||
logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
|
logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
|
||||||
register_nccl_symmetric_ops(self.pynccl_comm)
|
|
||||||
else:
|
else:
|
||||||
logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
|
logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
|
||||||
self.pynccl_comm = None
|
self.pynccl_comm = None
|
||||||
@ -205,15 +194,6 @@ class CommunicatorBenchmark:
|
|||||||
None, # no env variable needed
|
None, # no env variable needed
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
communicators.append(
|
|
||||||
(
|
|
||||||
"pynccl-symm",
|
|
||||||
lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
|
|
||||||
lambda t: True, # Always available if initialized
|
|
||||||
nullcontext(),
|
|
||||||
None, # no env variable needed
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.symm_mem_comm_multimem is not None:
|
if self.symm_mem_comm_multimem is not None:
|
||||||
comm = self.symm_mem_comm_multimem
|
comm = self.symm_mem_comm_multimem
|
||||||
@ -291,9 +271,7 @@ class CommunicatorBenchmark:
|
|||||||
# Capture the graph using context manager
|
# Capture the graph using context manager
|
||||||
with context:
|
with context:
|
||||||
graph = torch.cuda.CUDAGraph()
|
graph = torch.cuda.CUDAGraph()
|
||||||
graph_pool = torch.cuda.graph_pool_handle()
|
with torch.cuda.graph(graph):
|
||||||
set_graph_pool_id(graph_pool)
|
|
||||||
with torch.cuda.graph(graph, pool=graph_pool):
|
|
||||||
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
|
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
|
||||||
allreduce_fn(graph_input)
|
allreduce_fn(graph_input)
|
||||||
|
|
||||||
|
|||||||
@ -79,9 +79,9 @@ def make_rand_lora_weight_tensor(
|
|||||||
|
|
||||||
|
|
||||||
def make_rand_tensors(
|
def make_rand_tensors(
|
||||||
a_shape: tuple[int, ...],
|
a_shape: tuple[int],
|
||||||
b_shape: tuple[int, ...],
|
b_shape: tuple[int],
|
||||||
c_shape: tuple[int, ...],
|
c_shape: tuple[int],
|
||||||
a_dtype: torch.dtype,
|
a_dtype: torch.dtype,
|
||||||
b_dtype: torch.dtype,
|
b_dtype: torch.dtype,
|
||||||
c_dtype: torch.dtype,
|
c_dtype: torch.dtype,
|
||||||
@ -243,7 +243,7 @@ class OpType(Enum):
|
|||||||
lora_rank: int,
|
lora_rank: int,
|
||||||
num_loras: int,
|
num_loras: int,
|
||||||
num_slices: int,
|
num_slices: int,
|
||||||
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
|
) -> tuple[tuple[int], tuple[int], tuple[int]]:
|
||||||
"""
|
"""
|
||||||
Given num_slices, return the shapes of the A, B, and C matrices
|
Given num_slices, return the shapes of the A, B, and C matrices
|
||||||
in A x B = C, for the op_type
|
in A x B = C, for the op_type
|
||||||
|
|||||||
@ -584,9 +584,8 @@ def main(args: argparse.Namespace):
|
|||||||
topk = config.num_experts_per_tok
|
topk = config.num_experts_per_tok
|
||||||
intermediate_size = config.intermediate_size
|
intermediate_size = config.intermediate_size
|
||||||
elif config.architectures[0] in (
|
elif config.architectures[0] in (
|
||||||
"DeepseekV2ForCausalLM",
|
|
||||||
"DeepseekV3ForCausalLM",
|
"DeepseekV3ForCausalLM",
|
||||||
"DeepseekV32ForCausalLM",
|
"DeepseekV2ForCausalLM",
|
||||||
"Glm4MoeForCausalLM",
|
"Glm4MoeForCausalLM",
|
||||||
):
|
):
|
||||||
E = config.n_routed_experts
|
E = config.n_routed_experts
|
||||||
|
|||||||
@ -1,174 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import random
|
|
||||||
import time
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from tabulate import tabulate
|
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
|
||||||
from vllm.logger import init_logger
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.utils import (
|
|
||||||
STR_DTYPE_TO_TORCH_DTYPE,
|
|
||||||
FlexibleArgumentParser,
|
|
||||||
create_kv_caches_with_random,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
@torch.inference_mode()
def run_benchmark(
    num_tokens: int,
    num_heads: int,
    head_size: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    kv_cache_dtype: str,
    num_iters: int,
    benchmark_mode: str,
    device: str = "cuda",
) -> float:
    """Return latency (seconds) of ``ops.reshape_and_cache`` for given num_tokens.

    Args:
        num_tokens: Number of key/value rows written per call.
        num_heads: Second dimension of the random key/value tensors.
        head_size: Last dimension of the random key/value tensors.
        block_size: Slots per cache block.
        num_blocks: Number of cache blocks; ``block_size * num_blocks`` is the
            total slot count.
        dtype: Torch dtype of the random key/value tensors.
        kv_cache_dtype: Cache dtype string forwarded to the cache factory and
            to the op under test.
        num_iters: Number of timed calls; three extra warm-up calls run first.
        benchmark_mode: ``"cudagraph"`` to capture one call in a CUDA graph
            and time replays, ``"no_graph"`` to time direct calls.
        device: Torch device string used for every tensor.

    Returns:
        Mean wall-clock seconds per call over ``num_iters`` calls.

    Raises:
        ValueError: if fp8 is requested with ``head_size`` not a multiple of
            16, if ``benchmark_mode`` is not a supported mode, or if
            ``num_tokens`` exceeds the number of cache slots.
    """
    # Validate all arguments up front so that a bad call fails before any
    # device memory is allocated.
    if kv_cache_dtype == "fp8" and head_size % 16:
        raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")

    if benchmark_mode not in ("cudagraph", "no_graph"):
        raise ValueError(
            f"Unsupported benchmark_mode: {benchmark_mode}. "
            "Only 'cudagraph' and 'no_graph' are supported."
        )

    num_slots = block_size * num_blocks
    if num_tokens > num_slots:
        raise ValueError("num_tokens cannot exceed the total number of cache slots")

    current_platform.seed_everything(42)
    torch.set_default_device(device)

    # create random key / value tensors [T, H, D].
    key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
    value = torch.randn_like(key)

    # prepare the slot mapping.
    # random.sample draws without replacement, so each token gets a unique slot.
    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)

    key_caches, value_caches = create_kv_caches_with_random(
        num_blocks,
        block_size,
        1,  # num_layers
        num_heads,
        head_size,
        kv_cache_dtype,
        dtype,
        device=device,
    )
    key_cache, value_cache = key_caches[0], value_caches[0]
    # to free unused memory
    del key_caches, value_caches

    # compute per-kernel scaling factors for fp8 conversion (if used).
    k_scale = (key.amax() / 64.0).to(torch.float32)
    v_scale = (value.amax() / 64.0).to(torch.float32)

    function_under_test = lambda: ops.reshape_and_cache(
        key,  # noqa: F821
        value,  # noqa: F821
        key_cache,  # noqa: F821
        value_cache,  # noqa: F821
        slot_mapping,  # noqa: F821
        kv_cache_dtype,
        k_scale,
        v_scale,
    )

    if benchmark_mode == "cudagraph":
        # Capture a single call once; the timed loop then only replays it.
        g = torch.cuda.CUDAGraph()
        with torch.cuda.graph(g):
            function_under_test()
        torch.cuda.synchronize()
        function_under_test = lambda: g.replay()

    def run_cuda_benchmark(n_iters: int) -> float:
        # Synchronize on both sides so the wall-clock window covers exactly
        # the n_iters launches and their completion.
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(n_iters):
            function_under_test()
        torch.cuda.synchronize()
        end = time.perf_counter()
        return (end - start) / n_iters

    # warm-up
    run_cuda_benchmark(3)

    lat = run_cuda_benchmark(num_iters)

    # free tensors to mitigate OOM when sweeping
    del key, value, key_cache, value_cache, slot_mapping
    torch.cuda.empty_cache()

    return lat
|
|
||||||
|
|
||||||
|
|
||||||
def main(args):
    """Benchmark token counts 2**1 .. 2**16 and print a latency table.

    Every run reuses the head/block/dtype settings from ``args``; only the
    token count changes. ``run_benchmark`` reports seconds, which are
    converted to microseconds for display.
    """
    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]

    table = []
    for n_tok in (2**exp for exp in range(1, 17)):
        latency_s = run_benchmark(
            num_tokens=n_tok,
            num_heads=args.num_heads,
            head_size=args.head_size,
            block_size=args.block_size,
            num_blocks=args.num_blocks,
            dtype=torch_dtype,
            kv_cache_dtype=args.kv_cache_dtype,
            num_iters=args.iters,
            benchmark_mode=args.mode,
            device="cuda",
        )
        # seconds -> microseconds
        table.append([n_tok, latency_s * 1e6])

    print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
    print(tabulate(table, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Command-line entry point. Options are declared as (flag, settings)
    # pairs and registered in the order listed.
    parser = FlexibleArgumentParser()

    option_table = [
        ("--num-heads", dict(type=int, default=128)),
        (
            "--head-size",
            dict(
                type=int,
                choices=[64, 80, 96, 112, 120, 128, 192, 256],
                default=128,
            ),
        ),
        ("--block-size", dict(type=int, choices=[16, 32], default=16)),
        ("--num-blocks", dict(type=int, default=128 * 128)),
        (
            "--dtype",
            dict(
                type=str,
                choices=["half", "bfloat16", "float"],
                default="bfloat16",
            ),
        ),
        (
            "--kv-cache-dtype",
            dict(type=str, choices=["auto", "fp8"], default="auto"),
        ),
        ("--iters", dict(type=int, default=200)),
        (
            "--mode",
            dict(
                type=str,
                choices=["cudagraph", "no_graph"],
                default="cudagraph",
            ),
        ),
    ]
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)

    args = parser.parse_args()

    main(args)
|
|
||||||
@ -9,9 +9,6 @@ import torch
|
|||||||
from tabulate import tabulate
|
from tabulate import tabulate
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.attention.ops.triton_reshape_and_cache_flash import (
|
|
||||||
triton_reshape_and_cache_flash,
|
|
||||||
)
|
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils import (
|
from vllm.utils import (
|
||||||
@ -34,8 +31,6 @@ def run_benchmark(
|
|||||||
kv_cache_dtype: str,
|
kv_cache_dtype: str,
|
||||||
kv_cache_layout: str,
|
kv_cache_layout: str,
|
||||||
num_iters: int,
|
num_iters: int,
|
||||||
implementation: str,
|
|
||||||
benchmark_mode: str,
|
|
||||||
device: str = "cuda",
|
device: str = "cuda",
|
||||||
) -> float:
|
) -> float:
|
||||||
"""Return latency (seconds) for given num_tokens."""
|
"""Return latency (seconds) for given num_tokens."""
|
||||||
@ -43,14 +38,6 @@ def run_benchmark(
|
|||||||
if kv_cache_dtype == "fp8" and head_size % 16:
|
if kv_cache_dtype == "fp8" and head_size % 16:
|
||||||
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
|
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
|
||||||
|
|
||||||
if implementation not in ("cuda", "triton"):
|
|
||||||
raise ValueError(
|
|
||||||
f"Unsupported implementation: {implementation}. "
|
|
||||||
"Only 'cuda' and 'triton' are supported."
|
|
||||||
)
|
|
||||||
if implementation == "triton" and kv_cache_layout == "HND":
|
|
||||||
return float("nan") # Triton does not support HND layout yet.
|
|
||||||
|
|
||||||
current_platform.seed_everything(42)
|
current_platform.seed_everything(42)
|
||||||
torch.set_default_device(device)
|
torch.set_default_device(device)
|
||||||
|
|
||||||
@ -78,49 +65,27 @@ def run_benchmark(
|
|||||||
cache_layout=kv_cache_layout,
|
cache_layout=kv_cache_layout,
|
||||||
)
|
)
|
||||||
key_cache, value_cache = key_caches[0], value_caches[0]
|
key_cache, value_cache = key_caches[0], value_caches[0]
|
||||||
# to free unused memory
|
|
||||||
del key_caches, value_caches
|
|
||||||
|
|
||||||
# compute per-kernel scaling factors for fp8 conversion (if used).
|
# compute per-kernel scaling factors for fp8 conversion (if used).
|
||||||
k_scale = (key.amax() / 64.0).to(torch.float32)
|
k_scale = (key.amax() / 64.0).to(torch.float32)
|
||||||
v_scale = (value.amax() / 64.0).to(torch.float32)
|
v_scale = (value.amax() / 64.0).to(torch.float32)
|
||||||
|
|
||||||
if implementation == "cuda":
|
|
||||||
function_under_test = lambda: ops.reshape_and_cache_flash(
|
|
||||||
key, # noqa: F821
|
|
||||||
value, # noqa: F821
|
|
||||||
key_cache, # noqa: F821
|
|
||||||
value_cache, # noqa: F821
|
|
||||||
slot_mapping, # noqa: F821
|
|
||||||
kv_cache_dtype,
|
|
||||||
k_scale,
|
|
||||||
v_scale,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
function_under_test = lambda: triton_reshape_and_cache_flash(
|
|
||||||
key, # noqa: F821
|
|
||||||
value, # noqa: F821
|
|
||||||
key_cache, # noqa: F821
|
|
||||||
value_cache, # noqa: F821
|
|
||||||
slot_mapping, # noqa: F821
|
|
||||||
kv_cache_dtype,
|
|
||||||
k_scale,
|
|
||||||
v_scale,
|
|
||||||
)
|
|
||||||
if benchmark_mode == "cudagraph":
|
|
||||||
g = torch.cuda.CUDAGraph()
|
|
||||||
with torch.cuda.graph(g):
|
|
||||||
function_under_test()
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
function_under_test = lambda: g.replay()
|
|
||||||
|
|
||||||
def run_cuda_benchmark(n_iters: int) -> float:
|
def run_cuda_benchmark(n_iters: int) -> float:
|
||||||
nonlocal key, value, key_cache, value_cache, slot_mapping
|
nonlocal key, value, key_cache, value_cache, slot_mapping
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
for _ in range(n_iters):
|
for _ in range(n_iters):
|
||||||
function_under_test()
|
ops.reshape_and_cache_flash(
|
||||||
torch.cuda.synchronize()
|
key,
|
||||||
|
value,
|
||||||
|
key_cache,
|
||||||
|
value_cache,
|
||||||
|
slot_mapping,
|
||||||
|
kv_cache_dtype,
|
||||||
|
k_scale,
|
||||||
|
v_scale,
|
||||||
|
)
|
||||||
|
torch.cuda.synchronize()
|
||||||
end = time.perf_counter()
|
end = time.perf_counter()
|
||||||
return (end - start) / n_iters
|
return (end - start) / n_iters
|
||||||
|
|
||||||
@ -151,16 +116,10 @@ def main(args):
|
|||||||
kv_cache_dtype=args.kv_cache_dtype,
|
kv_cache_dtype=args.kv_cache_dtype,
|
||||||
kv_cache_layout=layout,
|
kv_cache_layout=layout,
|
||||||
num_iters=args.iters,
|
num_iters=args.iters,
|
||||||
implementation=args.implementation,
|
|
||||||
benchmark_mode=args.mode,
|
|
||||||
device="cuda",
|
device="cuda",
|
||||||
)
|
)
|
||||||
rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])
|
rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])
|
||||||
|
|
||||||
print(
|
|
||||||
f"Benchmark results for implementation {args.implementation}"
|
|
||||||
f" (measuring with {args.mode}):"
|
|
||||||
)
|
|
||||||
print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))
|
print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))
|
||||||
|
|
||||||
|
|
||||||
@ -192,21 +151,6 @@ if __name__ == "__main__":
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument("--iters", type=int, default=100)
|
parser.add_argument("--iters", type=int, default=100)
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--implementation",
|
|
||||||
type=str,
|
|
||||||
choices=["cuda", "triton"],
|
|
||||||
default="cuda",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--mode",
|
|
||||||
type=str,
|
|
||||||
choices=["cudagraph", "no_graph"],
|
|
||||||
default="cudagraph",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
@ -11,13 +11,13 @@ from datetime import datetime
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
import triton
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||||
_w8a8_block_fp8_matmul,
|
_w8a8_block_fp8_matmul,
|
||||||
)
|
)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.triton_utils import triton
|
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
mp.set_start_method("spawn", force=True)
|
mp.set_start_method("spawn", force=True)
|
||||||
|
|||||||
@ -8,16 +8,12 @@ import torch
|
|||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||||
|
get_col_major_tma_aligned_tensor,
|
||||||
per_token_group_quant_fp8,
|
per_token_group_quant_fp8,
|
||||||
w8a8_triton_block_scaled_mm,
|
w8a8_block_fp8_matmul,
|
||||||
)
|
)
|
||||||
from vllm.triton_utils import triton
|
from vllm.triton_utils import triton
|
||||||
from vllm.utils.deep_gemm import (
|
from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
|
||||||
calc_diff,
|
|
||||||
fp8_gemm_nt,
|
|
||||||
get_col_major_tma_aligned_tensor,
|
|
||||||
per_block_cast_to_fp8,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def benchmark_shape(m: int,
|
def benchmark_shape(m: int,
|
||||||
@ -63,7 +59,7 @@ def benchmark_shape(m: int,
|
|||||||
|
|
||||||
# === vLLM Triton Implementation ===
|
# === vLLM Triton Implementation ===
|
||||||
def vllm_triton_gemm():
|
def vllm_triton_gemm():
|
||||||
return w8a8_triton_block_scaled_mm(A_vllm,
|
return w8a8_block_fp8_matmul(A_vllm,
|
||||||
B_vllm,
|
B_vllm,
|
||||||
A_scale_vllm,
|
A_scale_vllm,
|
||||||
B_scale_vllm,
|
B_scale_vllm,
|
||||||
|
|||||||
@ -101,7 +101,6 @@ else()
|
|||||||
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
|
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
|
||||||
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
|
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
|
||||||
find_isa(${CPUINFO} "S390" S390_FOUND)
|
find_isa(${CPUINFO} "S390" S390_FOUND)
|
||||||
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||||
@ -178,14 +177,8 @@ elseif (S390_FOUND)
|
|||||||
"-mzvector"
|
"-mzvector"
|
||||||
"-march=native"
|
"-march=native"
|
||||||
"-mtune=native")
|
"-mtune=native")
|
||||||
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
|
|
||||||
if(RVV_FOUND)
|
|
||||||
message(FAIL_ERROR "Can't support rvv now.")
|
|
||||||
else()
|
|
||||||
list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
|
|
||||||
endif()
|
|
||||||
else()
|
else()
|
||||||
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
|
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
#
|
#
|
||||||
@ -265,8 +258,7 @@ set(VLLM_EXT_SRC
|
|||||||
"csrc/cpu/layernorm.cpp"
|
"csrc/cpu/layernorm.cpp"
|
||||||
"csrc/cpu/mla_decode.cpp"
|
"csrc/cpu/mla_decode.cpp"
|
||||||
"csrc/cpu/pos_encoding.cpp"
|
"csrc/cpu/pos_encoding.cpp"
|
||||||
"csrc/cpu/torch_bindings.cpp"
|
"csrc/cpu/torch_bindings.cpp")
|
||||||
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
|
|
||||||
|
|
||||||
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||||
set(VLLM_EXT_SRC
|
set(VLLM_EXT_SRC
|
||||||
|
|||||||
@ -18,8 +18,8 @@ if(FLASH_MLA_SRC_DIR)
|
|||||||
else()
|
else()
|
||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
flashmla
|
flashmla
|
||||||
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
|
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
|
||||||
GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
|
GIT_TAG a757314c04eedd166e329e846c820eb1bdd702de
|
||||||
GIT_PROGRESS TRUE
|
GIT_PROGRESS TRUE
|
||||||
CONFIGURE_COMMAND ""
|
CONFIGURE_COMMAND ""
|
||||||
BUILD_COMMAND ""
|
BUILD_COMMAND ""
|
||||||
@ -33,64 +33,23 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
|
|||||||
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
|
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
|
||||||
# Only build FlashMLA kernels if we are building for something compatible with
|
# Only build FlashMLA kernels if we are building for something compatible with
|
||||||
# sm90a
|
# sm90a
|
||||||
|
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||||
set(SUPPORT_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
|
|
||||||
list(APPEND SUPPORT_ARCHS 9.0a)
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
|
|
||||||
list(APPEND SUPPORT_ARCHS 10.0a)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
|
|
||||||
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
|
|
||||||
if(FLASH_MLA_ARCHS)
|
|
||||||
set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
|
|
||||||
list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
|
|
||||||
|
|
||||||
set(FlashMLA_SOURCES
|
set(FlashMLA_SOURCES
|
||||||
${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
|
${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
|
||||||
${flashmla_SOURCE_DIR}/csrc/pybind.cpp
|
${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu
|
||||||
${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
|
${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
|
||||||
${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
|
${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
|
${flashmla_SOURCE_DIR}/csrc/kernels_fp8/flash_fwd_mla_fp8_sm90.cu)
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
|
|
||||||
)
|
|
||||||
|
|
||||||
set(FlashMLA_Extension_SOURCES
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
|
|
||||||
)
|
|
||||||
|
|
||||||
set(FlashMLA_INCLUDES
|
set(FlashMLA_INCLUDES
|
||||||
${flashmla_SOURCE_DIR}/csrc
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm90
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/cutlass/include
|
${flashmla_SOURCE_DIR}/csrc/cutlass/include
|
||||||
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
|
${flashmla_SOURCE_DIR}/csrc)
|
||||||
)
|
|
||||||
|
|
||||||
set(FlashMLA_Extension_INCLUDES
|
|
||||||
${flashmla_SOURCE_DIR}/csrc
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm90
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/cutlass/include
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
|
|
||||||
)
|
|
||||||
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${FlashMLA_SOURCES}"
|
SRCS "${FlashMLA_SOURCES}"
|
||||||
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
|
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
|
||||||
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${FlashMLA_Extension_SOURCES}"
|
|
||||||
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
|
|
||||||
|
|
||||||
define_gpu_extension_target(
|
define_gpu_extension_target(
|
||||||
_flashmla_C
|
_flashmla_C
|
||||||
DESTINATION vllm
|
DESTINATION vllm
|
||||||
@ -101,32 +60,8 @@ if(FLASH_MLA_ARCHS)
|
|||||||
INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
|
INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
|
||||||
USE_SABI 3
|
USE_SABI 3
|
||||||
WITH_SOABI)
|
WITH_SOABI)
|
||||||
|
|
||||||
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
|
|
||||||
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
|
|
||||||
target_compile_options(_flashmla_C PRIVATE
|
|
||||||
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
|
|
||||||
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
|
|
||||||
|
|
||||||
define_gpu_extension_target(
|
|
||||||
_flashmla_extension_C
|
|
||||||
DESTINATION vllm
|
|
||||||
LANGUAGE ${VLLM_GPU_LANG}
|
|
||||||
SOURCES ${FlashMLA_Extension_SOURCES}
|
|
||||||
COMPILE_FLAGS ${VLLM_FLASHMLA_GPU_FLAGS}
|
|
||||||
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
|
||||||
INCLUDE_DIRECTORIES ${FlashMLA_Extension_INCLUDES}
|
|
||||||
USE_SABI 3
|
|
||||||
WITH_SOABI)
|
|
||||||
|
|
||||||
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
|
|
||||||
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
|
|
||||||
target_compile_options(_flashmla_extension_C PRIVATE
|
|
||||||
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
|
|
||||||
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
|
|
||||||
else()
|
else()
|
||||||
# Create empty targets for setup.py when not targeting sm90a systems
|
# Create an empty target for setup.py when not targeting sm90a systems
|
||||||
add_custom_target(_flashmla_C)
|
add_custom_target(_flashmla_C)
|
||||||
add_custom_target(_flashmla_extension_C)
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|||||||
@ -38,7 +38,7 @@ else()
|
|||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
vllm-flash-attn
|
vllm-flash-attn
|
||||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||||
GIT_TAG 4695e6bed5366c41e28c06cd86170166e4f43d00
|
GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
|
||||||
GIT_PROGRESS TRUE
|
GIT_PROGRESS TRUE
|
||||||
# Don't share the vllm-flash-attn build between build types
|
# Don't share the vllm-flash-attn build between build types
|
||||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||||
|
|||||||
@ -310,13 +310,13 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
|
|||||||
list(REMOVE_DUPLICATES _PTX_ARCHS)
|
list(REMOVE_DUPLICATES _PTX_ARCHS)
|
||||||
list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)
|
list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)
|
||||||
|
|
||||||
# If x.0a or x.0f is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
|
# if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
|
||||||
# remove x.0a or x.0f from SRC_CUDA_ARCHS and add x.0a or x.0f to _CUDA_ARCHS
|
# remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
|
||||||
set(_CUDA_ARCHS)
|
set(_CUDA_ARCHS)
|
||||||
foreach(_arch ${_SRC_CUDA_ARCHS})
|
foreach(_arch ${_SRC_CUDA_ARCHS})
|
||||||
if(_arch MATCHES "[af]$")
|
if(_arch MATCHES "\\a$")
|
||||||
list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
|
list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
|
||||||
string(REGEX REPLACE "[af]$" "" _base "${_arch}")
|
string(REPLACE "a" "" _base "${_arch}")
|
||||||
if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
|
if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
|
||||||
list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
|
list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
|
||||||
list(APPEND _CUDA_ARCHS "${_arch}")
|
list(APPEND _CUDA_ARCHS "${_arch}")
|
||||||
|
|||||||
@ -28,10 +28,10 @@
|
|||||||
|
|
||||||
#ifdef USE_ROCM
|
#ifdef USE_ROCM
|
||||||
#include <hip/hip_bf16.h>
|
#include <hip/hip_bf16.h>
|
||||||
#include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
|
#include "../quantization/fp8/amd/quant_utils.cuh"
|
||||||
typedef __hip_bfloat16 __nv_bfloat16;
|
typedef __hip_bfloat16 __nv_bfloat16;
|
||||||
#else
|
#else
|
||||||
#include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh"
|
#include "../quantization/fp8/nvidia/quant_utils.cuh"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
|||||||
@ -135,10 +135,10 @@ public:
|
|||||||
max_splits = min(16, max_splits);
|
max_splits = min(16, max_splits);
|
||||||
|
|
||||||
// TODO: This avoids a hang when the batch size larger than 1 and
|
// TODO: This avoids a hang when the batch size larger than 1 and
|
||||||
// there is more than 1 kv_splits.
|
// there is more than 4 kv_splits.
|
||||||
// Discuss with NVIDIA how this can be fixed.
|
// Discuss with NVIDIA how this can be fixed.
|
||||||
if (B > 1) {
|
if (B > 1) {
|
||||||
max_splits = min(1, max_splits);
|
max_splits = min(2, max_splits);
|
||||||
}
|
}
|
||||||
|
|
||||||
// printf(" max_splits = %d\n", max_splits);
|
// printf(" max_splits = %d\n", max_splits);
|
||||||
|
|||||||
@ -580,22 +580,22 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||||
auto blk_coord = tile_scheduler.get_block_coord();
|
auto blk_coord = tile_scheduler.get_block_coord();
|
||||||
auto problem_shape = params.problem_shape;
|
auto problem_shape = params.problem_shape;
|
||||||
auto local_split_kv = params.split_kv;
|
auto local_split_kv = params.split_kv;
|
||||||
if (params.mainloop.ptr_seq != nullptr) {
|
if (params.mainloop.ptr_seq != nullptr) {
|
||||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||||
if (params.ptr_split_kv != nullptr) {
|
if (params.ptr_split_kv != nullptr) {
|
||||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (local_split_kv <= get<3>(blk_coord))
|
if (local_split_kv <= get<3>(blk_coord))
|
||||||
continue;
|
continue;
|
||||||
load_page_table(
|
load_page_table(
|
||||||
blk_coord,
|
blk_coord,
|
||||||
problem_shape,
|
problem_shape,
|
||||||
params.mainloop,
|
params.mainloop,
|
||||||
shared_storage.tensors,
|
shared_storage.tensors,
|
||||||
pipeline_page_table, pipeline_pt_producer_state,
|
pipeline_page_table, pipeline_pt_producer_state,
|
||||||
local_split_kv
|
local_split_kv
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -604,15 +604,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
CUTLASS_PRAGMA_NO_UNROLL
|
CUTLASS_PRAGMA_NO_UNROLL
|
||||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||||
auto blk_coord = tile_scheduler.get_block_coord();
|
auto blk_coord = tile_scheduler.get_block_coord();
|
||||||
auto problem_shape = params.problem_shape;
|
auto problem_shape = params.problem_shape;
|
||||||
auto local_split_kv = params.split_kv;
|
auto local_split_kv = params.split_kv;
|
||||||
if (params.mainloop.ptr_seq != nullptr) {
|
if (params.mainloop.ptr_seq != nullptr) {
|
||||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||||
if (params.ptr_split_kv != nullptr) {
|
if (params.ptr_split_kv != nullptr) {
|
||||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (local_split_kv <= get<3>(blk_coord))
|
if (local_split_kv <= get<3>(blk_coord))
|
||||||
continue;
|
continue;
|
||||||
load_cpasync(
|
load_cpasync(
|
||||||
blk_coord,
|
blk_coord,
|
||||||
@ -621,7 +621,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
params.mainloop_params,
|
params.mainloop_params,
|
||||||
shared_storage.tensors,
|
shared_storage.tensors,
|
||||||
pipeline_load_qk, pipeline_load_qk_producer_state,
|
pipeline_load_qk, pipeline_load_qk_producer_state,
|
||||||
local_split_kv,
|
local_split_kv,
|
||||||
/* must be shared pipe */
|
/* must be shared pipe */
|
||||||
pipeline_page_table, pipeline_pt_consumer_state
|
pipeline_page_table, pipeline_pt_consumer_state
|
||||||
);
|
);
|
||||||
@ -633,15 +633,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
CUTLASS_PRAGMA_NO_UNROLL
|
CUTLASS_PRAGMA_NO_UNROLL
|
||||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||||
auto blk_coord = tile_scheduler.get_block_coord();
|
auto blk_coord = tile_scheduler.get_block_coord();
|
||||||
auto problem_shape = params.problem_shape;
|
auto problem_shape = params.problem_shape;
|
||||||
auto local_split_kv = params.split_kv;
|
auto local_split_kv = params.split_kv;
|
||||||
if (params.mainloop.ptr_seq != nullptr) {
|
if (params.mainloop.ptr_seq != nullptr) {
|
||||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||||
if (params.ptr_split_kv != nullptr) {
|
if (params.ptr_split_kv != nullptr) {
|
||||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (local_split_kv <= get<3>(blk_coord))
|
if (local_split_kv <= get<3>(blk_coord))
|
||||||
continue;
|
continue;
|
||||||
load_tma</* paged= */ true>(
|
load_tma</* paged= */ true>(
|
||||||
blk_coord,
|
blk_coord,
|
||||||
@ -651,7 +651,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
shared_storage.tensors,
|
shared_storage.tensors,
|
||||||
pipeline_load_qk, pipeline_load_qk_producer_state,
|
pipeline_load_qk, pipeline_load_qk_producer_state,
|
||||||
pipeline_load_qk, pipeline_load_qk_producer_state,
|
pipeline_load_qk, pipeline_load_qk_producer_state,
|
||||||
local_split_kv
|
local_split_kv
|
||||||
);
|
);
|
||||||
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
|
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
|
||||||
}
|
}
|
||||||
@ -660,15 +660,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
CUTLASS_PRAGMA_NO_UNROLL
|
CUTLASS_PRAGMA_NO_UNROLL
|
||||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||||
auto blk_coord = tile_scheduler.get_block_coord();
|
auto blk_coord = tile_scheduler.get_block_coord();
|
||||||
auto problem_shape = params.problem_shape;
|
auto problem_shape = params.problem_shape;
|
||||||
auto local_split_kv = params.split_kv;
|
auto local_split_kv = params.split_kv;
|
||||||
if (params.mainloop.ptr_seq != nullptr) {
|
if (params.mainloop.ptr_seq != nullptr) {
|
||||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||||
if (params.ptr_split_kv != nullptr) {
|
if (params.ptr_split_kv != nullptr) {
|
||||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (local_split_kv <= get<3>(blk_coord))
|
if (local_split_kv <= get<3>(blk_coord))
|
||||||
continue;
|
continue;
|
||||||
load_tma<false>(
|
load_tma<false>(
|
||||||
blk_coord,
|
blk_coord,
|
||||||
@ -678,7 +678,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
shared_storage.tensors,
|
shared_storage.tensors,
|
||||||
pipeline_load_qk, pipeline_load_qk_producer_state,
|
pipeline_load_qk, pipeline_load_qk_producer_state,
|
||||||
pipeline_load_qk, pipeline_load_qk_producer_state,
|
pipeline_load_qk, pipeline_load_qk_producer_state,
|
||||||
local_split_kv
|
local_split_kv
|
||||||
);
|
);
|
||||||
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
|
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
|
||||||
}
|
}
|
||||||
@ -694,14 +694,14 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||||
auto blk_coord = tile_scheduler.get_block_coord();
|
auto blk_coord = tile_scheduler.get_block_coord();
|
||||||
auto problem_shape = params.problem_shape;
|
auto problem_shape = params.problem_shape;
|
||||||
auto local_split_kv = params.split_kv;
|
auto local_split_kv = params.split_kv;
|
||||||
if (params.mainloop.ptr_seq != nullptr) {
|
if (params.mainloop.ptr_seq != nullptr) {
|
||||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||||
if (params.ptr_split_kv != nullptr) {
|
if (params.ptr_split_kv != nullptr) {
|
||||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (local_split_kv <= get<3>(blk_coord))
|
if (local_split_kv <= get<3>(blk_coord))
|
||||||
continue;
|
continue;
|
||||||
mma(blk_coord,
|
mma(blk_coord,
|
||||||
problem_shape,
|
problem_shape,
|
||||||
@ -711,7 +711,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
pipeline_mma_s, pipeline_mma_s_producer_state,
|
pipeline_mma_s, pipeline_mma_s_producer_state,
|
||||||
pipeline_p_mma, pipeline_p_mma_consumer_state,
|
pipeline_p_mma, pipeline_p_mma_consumer_state,
|
||||||
pipeline_mma_o, pipeline_mma_o_producer_state,
|
pipeline_mma_o, pipeline_mma_o_producer_state,
|
||||||
local_split_kv
|
local_split_kv
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -726,15 +726,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||||
auto blk_coord = tile_scheduler.get_block_coord();
|
auto blk_coord = tile_scheduler.get_block_coord();
|
||||||
auto problem_shape = params.problem_shape;
|
auto problem_shape = params.problem_shape;
|
||||||
auto split_kv = params.split_kv;
|
auto split_kv = params.split_kv;
|
||||||
auto local_split_kv = split_kv;
|
auto local_split_kv = split_kv;
|
||||||
if (params.mainloop.ptr_seq != nullptr) {
|
if (params.mainloop.ptr_seq != nullptr) {
|
||||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||||
if (params.ptr_split_kv != nullptr) {
|
if (params.ptr_split_kv != nullptr) {
|
||||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (local_split_kv <= get<3>(blk_coord))
|
if (local_split_kv <= get<3>(blk_coord))
|
||||||
continue;
|
continue;
|
||||||
compute(
|
compute(
|
||||||
blk_coord,
|
blk_coord,
|
||||||
@ -745,7 +745,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
pipeline_mma_s, pipeline_mma_s_consumer_state,
|
pipeline_mma_s, pipeline_mma_s_consumer_state,
|
||||||
pipeline_p_mma, pipeline_p_mma_producer_state,
|
pipeline_p_mma, pipeline_p_mma_producer_state,
|
||||||
pipeline_mma_o, pipeline_mma_o_consumer_state,
|
pipeline_mma_o, pipeline_mma_o_consumer_state,
|
||||||
local_split_kv
|
local_split_kv
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1900,7 +1900,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
cutlass::arch::NamedBarrier(
|
cutlass::arch::NamedBarrier(
|
||||||
(kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp,
|
(kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp,
|
||||||
kNamedBarrierEpilogue
|
kNamedBarrierEpilogue
|
||||||
).arrive_and_wait();
|
).arrive();
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -56,11 +56,3 @@ void cp_gather_cache(
|
|||||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||||
int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
|
int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
|
||||||
|
|
||||||
// Indexer K quantization and cache function
|
|
||||||
void indexer_k_quant_and_cache(
|
|
||||||
torch::Tensor& k, // [num_tokens, head_dim]
|
|
||||||
torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
|
|
||||||
torch::Tensor& slot_mapping, // [num_tokens]
|
|
||||||
int64_t quant_block_size, // quantization block size
|
|
||||||
const std::string& scale_fmt);
|
|
||||||
|
|||||||
@ -9,14 +9,15 @@
|
|||||||
#include "quantization/vectorization_utils.cuh"
|
#include "quantization/vectorization_utils.cuh"
|
||||||
|
|
||||||
#ifdef USE_ROCM
|
#ifdef USE_ROCM
|
||||||
#include "quantization/w8a8/fp8/amd/quant_utils.cuh"
|
#include "quantization/fp8/amd/quant_utils.cuh"
|
||||||
#else
|
#else
|
||||||
#include "quantization/w8a8/fp8/nvidia/quant_utils.cuh"
|
#include "quantization/fp8/nvidia/quant_utils.cuh"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cfloat>
|
#include <map>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#ifdef USE_ROCM
|
#ifdef USE_ROCM
|
||||||
#include <hip/hip_bf16.h>
|
#include <hip/hip_bf16.h>
|
||||||
@ -208,20 +209,6 @@ void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
|
|||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// Used to copy/convert one element
|
|
||||||
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
|
|
||||||
struct CopyWithScaleOp {
|
|
||||||
float scale;
|
|
||||||
|
|
||||||
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
|
|
||||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
|
||||||
dst = static_cast<OutT>(src);
|
|
||||||
} else {
|
|
||||||
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||||
__global__ void reshape_and_cache_kernel(
|
__global__ void reshape_and_cache_kernel(
|
||||||
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
||||||
@ -237,51 +224,59 @@ __global__ void reshape_and_cache_kernel(
|
|||||||
const int64_t token_idx = blockIdx.x;
|
const int64_t token_idx = blockIdx.x;
|
||||||
const int64_t slot_idx = slot_mapping[token_idx];
|
const int64_t slot_idx = slot_mapping[token_idx];
|
||||||
if (slot_idx < 0) {
|
if (slot_idx < 0) {
|
||||||
|
// Padding token that should be ignored.
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t block_idx = slot_idx / block_size;
|
const int64_t block_idx = slot_idx / block_size;
|
||||||
const int64_t block_offset = slot_idx % block_size;
|
const int64_t block_offset = slot_idx % block_size;
|
||||||
const int h_block_count = head_size / x; // head_size//x
|
|
||||||
|
|
||||||
const int h_block_idx = threadIdx.x;
|
const int n = num_heads * head_size;
|
||||||
if (h_block_idx >= num_heads * h_block_count) {
|
for (int i = threadIdx.x; i < n; i += blockDim.x) {
|
||||||
return;
|
const int64_t src_key_idx = token_idx * key_stride + i;
|
||||||
}
|
const int64_t src_value_idx = token_idx * value_stride + i;
|
||||||
|
|
||||||
const int head_idx = h_block_idx / h_block_count;
|
const int head_idx = i / head_size;
|
||||||
const int h_block = h_block_idx % h_block_count;
|
const int head_offset = i % head_size;
|
||||||
|
const int x_idx = head_offset / x;
|
||||||
|
const int x_offset = head_offset % x;
|
||||||
|
|
||||||
const scalar_t* __restrict__ key_src =
|
const int64_t tgt_key_idx =
|
||||||
key + token_idx * key_stride + head_idx * head_size + h_block * x;
|
block_idx * num_heads * (head_size / x) * block_size * x +
|
||||||
const int64_t src_value_start =
|
head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
|
||||||
token_idx * value_stride + head_idx * head_size + h_block * x;
|
block_offset * x + x_offset;
|
||||||
|
const int64_t tgt_value_idx =
|
||||||
cache_t* __restrict__ key_dst =
|
block_idx * num_heads * head_size * block_size +
|
||||||
key_cache + block_idx * num_heads * h_block_count * block_size * x +
|
head_idx * head_size * block_size + head_offset * block_size +
|
||||||
head_idx * h_block_count * block_size * x + h_block * block_size * x +
|
block_offset;
|
||||||
block_offset * x;
|
scalar_t tgt_key = key[src_key_idx];
|
||||||
const int64_t tgt_value_start =
|
scalar_t tgt_value = value[src_value_idx];
|
||||||
block_idx * num_heads * h_block_count * x * block_size +
|
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||||
head_idx * h_block_count * x * block_size + h_block * x * block_size +
|
key_cache[tgt_key_idx] = tgt_key;
|
||||||
block_offset;
|
value_cache[tgt_value_idx] = tgt_value;
|
||||||
|
} else {
|
||||||
constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
|
key_cache[tgt_key_idx] =
|
||||||
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
|
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
|
||||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
|
value_cache[tgt_value_idx] =
|
||||||
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
|
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
|
||||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
|
}
|
||||||
|
|
||||||
vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, x, 0, 1, k_op);
|
|
||||||
|
|
||||||
const scalar_t* __restrict__ value_src = value + src_value_start;
|
|
||||||
cache_t* __restrict__ value_dst = value_cache + tgt_value_start;
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < x; i++) {
|
|
||||||
v_op(value_dst[i * block_size], value_src[i]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Used by vectorization_utils to copy/convert one element
|
||||||
|
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
|
||||||
|
struct CopyWithScaleOp {
|
||||||
|
float scale;
|
||||||
|
|
||||||
|
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
|
||||||
|
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||||
|
dst = static_cast<OutT>(src);
|
||||||
|
} else {
|
||||||
|
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||||
__global__ void reshape_and_cache_flash_kernel(
|
__global__ void reshape_and_cache_flash_kernel(
|
||||||
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
||||||
@ -401,177 +396,6 @@ __global__ void concat_and_cache_mla_kernel(
|
|||||||
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
|
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
|
||||||
__global__ void concat_and_cache_ds_mla_kernel(
|
|
||||||
const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank]
|
|
||||||
const scalar_t* __restrict__ k_pe, // [num_tokens, pe_dim]
|
|
||||||
cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank
|
|
||||||
// + pe_dim)]
|
|
||||||
const int64_t* __restrict__ slot_mapping, // [num_tokens]
|
|
||||||
const int block_stride, //
|
|
||||||
const int entry_stride, //
|
|
||||||
const int kv_c_stride, //
|
|
||||||
const int k_pe_stride, //
|
|
||||||
const int kv_lora_rank, //
|
|
||||||
const int pe_dim, //
|
|
||||||
const int block_size, //
|
|
||||||
const float* scale //
|
|
||||||
) {
|
|
||||||
const int64_t token_idx = blockIdx.x;
|
|
||||||
const int64_t slot_idx = slot_mapping[token_idx];
|
|
||||||
// NOTE: slot_idx can be -1 if the token is padded
|
|
||||||
if (slot_idx < 0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const int64_t block_idx = slot_idx / block_size;
|
|
||||||
const int64_t block_offset = slot_idx % block_size;
|
|
||||||
const int64_t dst_idx_start =
|
|
||||||
block_idx * block_stride + block_offset * entry_stride;
|
|
||||||
|
|
||||||
// For the NoPE part, each tile of 128 elements is handled by half of one warp
|
|
||||||
// (16 threads). There are 4 total tiles, so 2 warps (64 threads).
|
|
||||||
// Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
|
|
||||||
// The RoPE part (last 64 elements) is handled by another 1 warp (32 threads).
|
|
||||||
// So in total, we use 3 warps (96 threads) per block.
|
|
||||||
|
|
||||||
// Cast kv_cache to 16_bit for RoPE values
|
|
||||||
scalar_t* kv_cache_16bit =
|
|
||||||
reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]);
|
|
||||||
|
|
||||||
// The last warp handles the RoPE part
|
|
||||||
if (threadIdx.x >= 64) {
|
|
||||||
// Each thread handles two elements of RoPE
|
|
||||||
const int8_t pe_idx_start = (threadIdx.x - 64) * 2;
|
|
||||||
const int64_t src_idx = token_idx * k_pe_stride + pe_idx_start;
|
|
||||||
// Vectorized load of two 16-bit values, performed as one 32-bit load
|
|
||||||
const int32_t vals = *reinterpret_cast<const int32_t*>(&k_pe[src_idx]);
|
|
||||||
// RoPE values start after the packed 8-bit NoPE values and the
|
|
||||||
// 32-bit scales
|
|
||||||
const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx_start;
|
|
||||||
// Vectorized store of two 16-bit values, performed as one 32-bit store
|
|
||||||
*reinterpret_cast<int32_t*>(&kv_cache_16bit[dst_idx]) = vals;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// The first two warps handle the NoPE part
|
|
||||||
const int8_t warp_idx = threadIdx.x >> 5;
|
|
||||||
const int8_t lane_idx = threadIdx.x & 31;
|
|
||||||
const int8_t tile_idx = warp_idx * 2 + (lane_idx >> 4);
|
|
||||||
|
|
||||||
// Each thread handles 8 elements of NoPE
|
|
||||||
// Load the NoPE elements for this thread into registers
|
|
||||||
const int64_t src_idx_start = token_idx * kv_c_stride + (threadIdx.x * 8);
|
|
||||||
// Vectorized load of eight 16-bit values, performed as an int4 load
|
|
||||||
const int4 vals_i4 = *reinterpret_cast<const int4*>(&kv_c[src_idx_start]);
|
|
||||||
const scalar_t* vals = reinterpret_cast<const scalar_t*>(&vals_i4);
|
|
||||||
|
|
||||||
// Max absolute value of this thread's elements
|
|
||||||
float max_abs = fmaxf(fmaxf(fmaxf(fabsf(vals[0]), fabsf(vals[1])),
|
|
||||||
fmaxf(fabsf(vals[2]), fabsf(vals[3]))),
|
|
||||||
fmaxf(fmaxf(fabsf(vals[4]), fabsf(vals[5])),
|
|
||||||
fmaxf(fabsf(vals[6]), fabsf(vals[7]))));
|
|
||||||
|
|
||||||
// Warp-level reduction to find the max absolute value in each half-warp
|
|
||||||
#pragma unroll
|
|
||||||
for (int offset = 8; offset > 0; offset /= 2) {
|
|
||||||
max_abs = fmaxf(max_abs, VLLM_SHFL_XOR_SYNC_WIDTH(max_abs, offset, 16));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compute the scale for the tile
|
|
||||||
float tile_scale = max_abs / 448.f;
|
|
||||||
tile_scale = fmaxf(tile_scale, FLT_MIN);
|
|
||||||
|
|
||||||
// The first lane of each half-warp writes the scale to kv_cache
|
|
||||||
if ((lane_idx == 0) || (lane_idx == 16)) {
|
|
||||||
float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]);
|
|
||||||
const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx;
|
|
||||||
kv_cache_32bit[dst_idx] = tile_scale;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Now all threads in the block scale and write their elements
|
|
||||||
// NoPE data is packed in the first kv_lora_rank/2 bytes (first 256 bytes)
|
|
||||||
const int64_t dst_idx_base = dst_idx_start + (threadIdx.x * 8);
|
|
||||||
|
|
||||||
uint8_t result[8];
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < 8; i++) {
|
|
||||||
result[i] =
|
|
||||||
fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
|
|
||||||
vals[i], tile_scale);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Store as aligned 64-bit writes
|
|
||||||
*reinterpret_cast<uint64_t*>(&kv_cache[dst_idx_base]) =
|
|
||||||
*reinterpret_cast<const uint64_t*>(result);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Quantizes the indexer key of one token to FP8 and writes both the
// quantized values and the float scales into the paged cache.
//
// Layout expected from the launch: blockIdx.x selects the token; the
// remaining grid/block dimensions are flattened into a linear thread id and
// every thread owns VEC_SIZE (4) consecutive elements of the head dimension.
// The absolute maximum is reduced with xor-shuffles over masks 16..1, i.e.
// across groups of 32 lanes, so one scale covers 32 * VEC_SIZE = 128
// elements.
// NOTE(review): the reduction width is fixed by the shuffle pattern and not
// by quant_block_size; this looks consistent only for
// quant_block_size == 128 - confirm against callers.
// NOTE(review): the 8-byte float2 load of VEC_SIZE elements assumes
// sizeof(scalar_t) == 2 and suitably aligned rows of k - confirm.
// NOTE(review): the full-mask shuffles run after an early return; this
// presumably relies on whole warps leaving together (head_dim a multiple of
// 128) - confirm.
//
// Within one cache block the quantized values of all tokens come first
// (cache_block_size * head_dim entries), followed by the scales, which are
// written through a float view of kv_cache.
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void indexer_k_quant_and_cache_kernel(
    const scalar_t* __restrict__ k,  // [num_tokens, head_dim]
    cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, cache_stride]
    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
    const int head_dim,                        // dimension of each head
    const int quant_block_size,                // quantization block size
    const int cache_block_size,                // cache block size
    const int cache_stride,  // stride for each token in kv_cache
    const bool use_ue8m0     // use ue8m0 scale format
) {
  constexpr int VEC_SIZE = 4;
  const int64_t token_idx = blockIdx.x;
  // First head-dimension element owned by this thread.
  const int64_t head_dim_idx = (blockIdx.y * blockDim.y * blockDim.x +
                                threadIdx.y * blockDim.x + threadIdx.x) *
                               VEC_SIZE;
  const int64_t slot_idx = slot_mapping[token_idx];

  // NOTE: slot_idx can be -1 if the token is padded
  if (slot_idx < 0 || (head_dim_idx >= head_dim)) {
    return;
  }

  const int64_t block_idx = slot_idx / cache_block_size;
  const int64_t block_offset = slot_idx % cache_block_size;

  // A single 8-byte load fetches this thread's VEC_SIZE input elements.
  const float2* k_packed = reinterpret_cast<const float2*>(k);
  float2 k_val = k_packed[(token_idx * head_dim + head_dim_idx) / VEC_SIZE];
  scalar_t* elems = reinterpret_cast<scalar_t*>(&k_val);

  // Per-thread absolute maximum.
  float amax = 0.0f;
  for (int i = 0; i < VEC_SIZE; i++) {
    amax = fmaxf(amax, fabsf(float(elems[i])));
  }
#ifndef USE_ROCM
  __syncwarp();
#endif

  // Butterfly reduction: afterwards each lane of a 32-lane group holds the
  // group's maximum.
  for (int mask = 16; mask > 0; mask /= 2) {
#ifdef USE_ROCM
    amax = fmaxf(amax, __shfl_xor_sync(uint64_t(-1), amax, mask));
#else
    amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
#endif
  }
#ifndef USE_ROCM
  __syncwarp();
#endif

  // The amax is floored at 1e-4 before dividing by 448 (presumably the
  // largest finite FP8 E4M3 magnitude - confirm).
  float scale = fmaxf(amax, 1e-4) / 448.0f;
  if (use_ue8m0) {
    // Round the scale up to a power of two.
    scale = exp2f(ceilf(log2f(scale)));
  }

  const int64_t block_base = block_idx * cache_block_size * cache_stride;
  const int64_t elem_in_block = block_offset * head_dim + head_dim_idx;

  // Quantized values.
  const int64_t dst_offset = block_base + elem_in_block;
  for (int i = 0; i < VEC_SIZE; i++) {
    kv_cache[dst_offset + i] =
        fp8::scaled_convert<cache_t, scalar_t, kv_dt>(elems[i], scale);
  }

  // Only the thread with threadIdx.x == 0 stores the scale.
  if (threadIdx.x == 0) {
    const int64_t dst_scale_idx = block_base + cache_block_size * head_dim +
                                  elem_in_block * 4 / quant_block_size;
    reinterpret_cast<float*>(kv_cache)[dst_scale_idx / 4] = scale;
  }
}
|
|
||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|
||||||
// KV_T is the data type of key and value tensors.
|
// KV_T is the data type of key and value tensors.
|
||||||
@ -607,15 +431,14 @@ void reshape_and_cache(
|
|||||||
|
|
||||||
int key_stride = key.stride(0);
|
int key_stride = key.stride(0);
|
||||||
int value_stride = value.stride(0);
|
int value_stride = value.stride(0);
|
||||||
int head_div_x = head_size / x;
|
|
||||||
|
|
||||||
dim3 grid(num_tokens);
|
dim3 grid(num_tokens);
|
||||||
dim3 block(std::min(num_heads * head_div_x, 512));
|
dim3 block(std::min(num_heads * head_size, 512));
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
|
||||||
DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype,
|
DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype,
|
||||||
CALL_RESHAPE_AND_CACHE);
|
CALL_RESHAPE_AND_CACHE)
|
||||||
}
|
}
|
||||||
|
|
||||||
// KV_T is the data type of key and value tensors.
|
// KV_T is the data type of key and value tensors.
|
||||||
@ -686,18 +509,6 @@ void reshape_and_cache_flash(
|
|||||||
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
|
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
|
||||||
reinterpret_cast<const float*>(scale.data_ptr()));
|
reinterpret_cast<const float*>(scale.data_ptr()));
|
||||||
|
|
||||||
// Launches vllm::concat_and_cache_ds_mla_kernel on `stream`.
// KV_T is the data type of key and value tensors.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is forwarded as the kernel's third template argument.
// The expansion site must provide: grid, block, stream, kv_c, k_pe,
// kv_cache, slot_mapping, scale, block_stride, entry_stride, kv_c_stride,
// k_pe_stride, kv_lora_rank, pe_dim and block_size.
#define CALL_CONCAT_AND_CACHE_DS_MLA(KV_T, CACHE_T, KV_DTYPE)  \
  vllm::concat_and_cache_ds_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \
      <<<grid, block, 0, stream>>>(                             \
          reinterpret_cast<KV_T*>(kv_c.data_ptr()),             \
          reinterpret_cast<KV_T*>(k_pe.data_ptr()),             \
          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),      \
          slot_mapping.data_ptr<int64_t>(),                     \
          block_stride, entry_stride,                           \
          kv_c_stride, k_pe_stride,                             \
          kv_lora_rank, pe_dim, block_size,                     \
          reinterpret_cast<const float*>(scale.data_ptr()));
|
|
||||||
|
|
||||||
void concat_and_cache_mla(
|
void concat_and_cache_mla(
|
||||||
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
|
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
|
||||||
torch::Tensor& k_pe, // [num_tokens, pe_dim]
|
torch::Tensor& k_pe, // [num_tokens, pe_dim]
|
||||||
@ -720,43 +531,20 @@ void concat_and_cache_mla(
|
|||||||
int pe_dim = k_pe.size(1);
|
int pe_dim = k_pe.size(1);
|
||||||
int block_size = kv_cache.size(1);
|
int block_size = kv_cache.size(1);
|
||||||
|
|
||||||
if (kv_cache_dtype == "fp8_ds_mla") {
|
TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
|
||||||
TORCH_CHECK(kv_lora_rank == 512, "kv_lora_rank must be 512 for fp8_ds_mla");
|
|
||||||
TORCH_CHECK(pe_dim == 64, "pe_dim must be 64 for fp8_ds_mla");
|
|
||||||
TORCH_CHECK(kv_cache.size(2) == 656 / kv_cache.itemsize(),
|
|
||||||
"kv_cache.size(2) must be 656 bytes for fp8_ds_mla");
|
|
||||||
TORCH_CHECK(kv_c.itemsize() == 2,
|
|
||||||
"kv_c.itemsize() must be 2 for fp8_ds_mla");
|
|
||||||
TORCH_CHECK(k_pe.itemsize() == 2,
|
|
||||||
"k_pe.itemsize() must be 2 for fp8_ds_mla");
|
|
||||||
} else {
|
|
||||||
TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
|
|
||||||
}
|
|
||||||
|
|
||||||
int kv_c_stride = kv_c.stride(0);
|
int kv_c_stride = kv_c.stride(0);
|
||||||
int k_pe_stride = k_pe.stride(0);
|
int k_pe_stride = k_pe.stride(0);
|
||||||
int block_stride = kv_cache.stride(0);
|
int block_stride = kv_cache.stride(0);
|
||||||
int entry_stride = kv_cache.stride(1);
|
int entry_stride = kv_cache.stride(1);
|
||||||
|
|
||||||
|
dim3 grid(num_tokens);
|
||||||
|
dim3 block(std::min(kv_lora_rank, 512));
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
|
||||||
if (kv_cache_dtype == "fp8_ds_mla") {
|
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
|
||||||
dim3 grid(num_tokens);
|
CALL_CONCAT_AND_CACHE_MLA);
|
||||||
// For the NoPE part, each tile of 128 elements is handled by half of one
|
|
||||||
// warp (16 threads). There are 4 total tiles, so 2 warps (64 threads).
|
|
||||||
// Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
|
|
||||||
// The RoPE part (last 64 elements) is handled by another 1 warp (32
|
|
||||||
// threads). So in total, we use 3 warps (96 threads) per block.
|
|
||||||
dim3 block(96);
|
|
||||||
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
|
|
||||||
CALL_CONCAT_AND_CACHE_DS_MLA);
|
|
||||||
} else {
|
|
||||||
dim3 grid(num_tokens);
|
|
||||||
dim3 block(std::min(kv_lora_rank, 512));
|
|
||||||
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
|
|
||||||
CALL_CONCAT_AND_CACHE_MLA);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
@ -1134,42 +922,3 @@ void cp_gather_cache(
|
|||||||
TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
|
TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Launches vllm::indexer_k_quant_and_cache_kernel for one (KV_T, CACHE_T,
// KV_DTYPE) combination; used as the callback of the dtype dispatcher.
// The expansion site must provide: grid, block, stream, k, kv_cache,
// slot_mapping, head_dim, quant_block_size, cache_block_size, cache_stride
// and use_ue8m0.
#define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE)   \
  vllm::indexer_k_quant_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \
      <<<grid, block, 0, stream>>>(                               \
          reinterpret_cast<KV_T*>(k.data_ptr()),                  \
          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),        \
          slot_mapping.data_ptr<int64_t>(),                       \
          head_dim, quant_block_size,                             \
          cache_block_size, cache_stride, use_ue8m0);
|
|
||||||
|
|
||||||
// Host entry point: quantizes the rows of `k` to FP8 (dispatched as
// "fp8_e4m3") and stores them, together with their scales, into `kv_cache`
// at the slots given by `slot_mapping`.
//
// Parameters:
//   k                [num_tokens, head_dim] input keys.
//   kv_cache         [num_blocks, block_size, cache_stride] destination.
//   slot_mapping     [num_tokens] destination slot per token.
//   quant_block_size number of elements sharing one scale; must be positive
//                    and divide head_dim.
//   scale_fmt        "ue8m0" selects power-of-two scales; any other value
//                    keeps the plain float scale.
//
// Launch layout: grid = (num_tokens, ceil(head_dim / (quant_block_size * 4))),
// block = (32, 4), on the current CUDA stream of k's device.
// NOTE(review): the kernel reduces each scale over 32 lanes * 4 elements, so
// this launch looks correct only for quant_block_size == 128 - confirm.
void indexer_k_quant_and_cache(
    torch::Tensor& k,             // [num_tokens, head_dim]
    torch::Tensor& kv_cache,      // [num_blocks, block_size, cache_stride]
    torch::Tensor& slot_mapping,  // [num_tokens]
    int64_t quant_block_size,     // quantization block size
    const std::string& scale_fmt) {
  int num_tokens = k.size(0);
  int head_dim = k.size(1);
  int cache_block_size = kv_cache.size(1);
  int cache_stride = kv_cache.size(2);
  bool use_ue8m0 = scale_fmt == "ue8m0";

  TORCH_CHECK(k.device() == kv_cache.device(),
              "k and kv_cache must be on the same device");
  TORCH_CHECK(k.device() == slot_mapping.device(),
              "k and slot_mapping must be on the same device");
  // Reject a non-positive block size before it is used as a divisor below.
  TORCH_CHECK(quant_block_size > 0, "quant_block_size must be positive");
  TORCH_CHECK(head_dim % quant_block_size == 0,
              "head_dim must be divisible by quant_block_size");

  // Nothing to quantize; a zero-sized grid is not a valid launch
  // configuration, so return before launching.
  if (num_tokens == 0 || head_dim == 0) {
    return;
  }

  constexpr int vec_size = 4;
  dim3 grid(num_tokens, (head_dim + quant_block_size * vec_size - 1) /
                            (quant_block_size * vec_size));
  dim3 block(32, vec_size);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(k));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
                             CALL_INDEXER_K_QUANT_AND_CACHE);
}
|
|
||||||
|
|||||||
@ -1,16 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <string>
|
|
||||||
#include <cctype>
|
|
||||||
|
|
||||||
namespace vllm {

// Reports whether the batch-invariant kernel override is requested.
// Returns true when the environment variable
// VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT is set and std::atoi parses it as a
// non-zero integer; returns false when it is unset or parses as 0
// (which includes non-numeric text).
inline bool vllm_kernel_override_batch_invariant() {
  const char* raw = std::getenv("VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT");
  if (raw == nullptr) {
    return false;
  }
  return std::atoi(raw) != 0;
}

}  // namespace vllm
|
|
||||||
@ -14,12 +14,7 @@
|
|||||||
// arm implementation
|
// arm implementation
|
||||||
#include "cpu_types_arm.hpp"
|
#include "cpu_types_arm.hpp"
|
||||||
#else
|
#else
|
||||||
#warning "unsupported vLLM cpu implementation, vLLM will compile with scalar"
|
#warning "unsupported vLLM cpu implementation"
|
||||||
#include "cpu_types_scalar.hpp"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef _OPENMP
|
|
||||||
#include <omp.h>
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
@ -1,513 +0,0 @@
|
|||||||
#include <cmath>
|
|
||||||
#include <cstdint>
|
|
||||||
#include <cstring>
|
|
||||||
#include <torch/all.h>
|
|
||||||
#include "float_convert.hpp"
|
|
||||||
|
|
||||||
namespace vec_op {
|
|
||||||
|
|
||||||
// Dispatch cases for the floating-point tensor dtypes handled here:
// Float, BFloat16 and Half.
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)            \
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)    \
  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)

// Switches on TYPE over the cases above; NAME is passed to AT_DISPATCH_SWITCH.
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

// Kernel entry/exit tracing: expands to nothing unless CPU_OP_GUARD is
// defined, in which case the kernel name is printed to std::cout.
#ifndef CPU_OP_GUARD
  #define CPU_KERNEL_GUARD_IN(NAME)
  #define CPU_KERNEL_GUARD_OUT(NAME)
#else
  #define CPU_KERNEL_GUARD_IN(NAME) \
    std::cout << #NAME << " invoked." << std::endl;
  #define CPU_KERNEL_GUARD_OUT(NAME) \
    std::cout << #NAME << " exit." << std::endl;
#endif

#define FORCE_INLINE __attribute__((always_inline)) inline

// Scalar helpers. Arguments may be evaluated more than once, so do not pass
// expressions with side effects.
#define __max(a, b) ((a) > (b) ? (a) : (b))
#define __min(a, b) ((a) < (b) ? (a) : (b))
// The argument is parenthesized in the negation so that compound
// expressions such as __abs(x - y) negate the whole expression.
#define __abs(a) ((a) < (0) ? (0 - (a)) : (a))
|
|
||||||
|
|
||||||
// Plain aggregate "registers" backing the scalar vector types below.
// The f16 variants hold raw 16-bit patterns; the f32 variants hold floats.

struct f16x8_t {
  uint16_t val[8];
};

struct f16x16_t {
  uint16_t val[16];
};

struct f16x32_t {
  uint16_t val[32];
};

struct f32x4_t {
  float val[4];
};

struct f32x8_t {
  float val[8];
};

struct f32x16_t {
  float val[16];
};
|
|
||||||
|
|
||||||
namespace {
// Calls `body` once for every value of the integer sequence, in order,
// handing each value over as a std::integral_constant.
template <typename IndexT, IndexT... Values, typename Body>
constexpr void unroll_loop_item(std::integer_sequence<IndexT, Values...>,
                                Body&& body) {
  (body(std::integral_constant<IndexT, Values>{}), ...);
}
}  // namespace

// Compile-time unrolled loop: invokes `f` for 0, 1, ..., count - 1.
// Only participates in overload resolution when F is invocable with a T.
template <typename T, T count, typename F,
          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
constexpr void unroll_loop(F&& f) {
  using Indices = std::make_integer_sequence<T, count>;
  unroll_loop_item(Indices{}, std::forward<F>(f));
}
|
|
||||||
|
|
||||||
// CRTP base shared by the vector types: exposes the derived type's element
// count through a common accessor.
template <typename T>
struct Vec {
  // Returns T::VEC_ELEM_NUM.
  static constexpr int get_elem_num() { return T::VEC_ELEM_NUM; }
};

// Declared early because the 16-bit vector types take them as constructor
// arguments before their definitions appear.
struct FP32Vec8;
struct FP32Vec16;
|
|
||||||
|
|
||||||
// Eight fp16 values kept as raw 16-bit patterns.
struct FP16Vec8 : public Vec<FP16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;
  f16x8_t reg;

  // Loads VEC_ELEM_NUM 16-bit values starting at ptr.
  explicit FP16Vec8(const void* ptr)
      : reg(*static_cast<const f16x8_t*>(ptr)) {}

  // Converts eight floats to fp16 (defined after FP32Vec8).
  explicit FP16Vec8(const FP32Vec8&);

  // Stores all VEC_ELEM_NUM values to ptr.
  void save(void* ptr) const { *static_cast<f16x8_t*>(ptr) = reg; }
};
|
|
||||||
|
|
||||||
// Sixteen fp16 values kept as raw 16-bit patterns.
struct FP16Vec16 : public Vec<FP16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  f16x16_t reg;

  // Loads VEC_ELEM_NUM 16-bit values starting at ptr.
  explicit FP16Vec16(const void* ptr)
      : reg(*reinterpret_cast<const f16x16_t*>(ptr)) {}

  // Converts sixteen floats to fp16 (defined after FP32Vec16).
  explicit FP16Vec16(const FP32Vec16&);

  // Stores all VEC_ELEM_NUM values to ptr.
  void save(void* ptr) const { *reinterpret_cast<f16x16_t*>(ptr) = reg; }

  // Stores the first min(elem_num, VEC_ELEM_NUM) values to ptr.
  // A non-positive elem_num stores nothing; previously a negative count was
  // converted to a huge byte count for memcpy.
  void save(void* ptr, const int elem_num) const {
    if (elem_num <= 0) {
      return;
    }
    const int num = __min(elem_num, VEC_ELEM_NUM);
    std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
  }
};
|
|
||||||
|
|
||||||
// Eight bf16 values kept as raw 16-bit patterns.
struct BF16Vec8 : public Vec<BF16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;
  f16x8_t reg;

  // Loads VEC_ELEM_NUM 16-bit values starting at ptr.
  explicit BF16Vec8(const void* ptr)
      : reg(*static_cast<const f16x8_t*>(ptr)) {}

  // Converts eight floats to bf16 (defined after FP32Vec8).
  explicit BF16Vec8(const FP32Vec8&);

  // Stores all VEC_ELEM_NUM values to ptr.
  void save(void* ptr) const { *static_cast<f16x8_t*>(ptr) = reg; }
};
|
|
||||||
|
|
||||||
// Sixteen bf16 values kept as raw 16-bit patterns.
struct BF16Vec16 : public Vec<BF16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  f16x16_t reg;

  // Loads VEC_ELEM_NUM 16-bit values starting at ptr.
  explicit BF16Vec16(const void* ptr)
      : reg(*reinterpret_cast<const f16x16_t*>(ptr)) {}

  // Converts sixteen floats to bf16 (defined after FP32Vec16).
  explicit BF16Vec16(const FP32Vec16&);

  // Stores all VEC_ELEM_NUM values to ptr.
  void save(void* ptr) const { *reinterpret_cast<f16x16_t*>(ptr) = reg; }

  // Stores the first min(elem_num, VEC_ELEM_NUM) values to ptr.
  // A non-positive elem_num stores nothing; previously a negative count was
  // converted to a huge byte count for memcpy.
  void save(void* ptr, const int elem_num) const {
    if (elem_num <= 0) {
      return;
    }
    const int num = __min(elem_num, VEC_ELEM_NUM);
    std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
  }
};
|
|
||||||
|
|
||||||
// Thirty-two bf16 values kept as raw 16-bit patterns.
struct BF16Vec32 : public Vec<BF16Vec32> {
  constexpr static int VEC_ELEM_NUM = 32;
  f16x32_t reg;

  // Loads VEC_ELEM_NUM 16-bit values starting at ptr.
  explicit BF16Vec32(const void* ptr)
      : reg(*static_cast<const f16x32_t*>(ptr)) {}

  // Wraps an already-populated register.
  explicit BF16Vec32(f16x32_t data) : reg(data) {}

  // Repeats the eight values of vec8_data four times across the register.
  explicit BF16Vec32(BF16Vec8& vec8_data) {
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      reg.val[lane] = vec8_data.reg.val[lane % BF16Vec8::VEC_ELEM_NUM];
    }
  }

  // Stores all VEC_ELEM_NUM values to ptr.
  void save(void* ptr) const { *static_cast<f16x32_t*>(ptr) = reg; }
};
|
|
||||||
|
|
||||||
// Four single-precision floats.
struct FP32Vec4 : public Vec<FP32Vec4> {
  constexpr static int VEC_ELEM_NUM = 4;

  f32x4_t reg;

  // Broadcasts v to every element.
  explicit FP32Vec4(float v) {
    for (float& lane : reg.val) {
      lane = v;
    }
  }

  // Zero-initializes every element.
  explicit FP32Vec4() {
    for (float& lane : reg.val) {
      lane = 0.0f;
    }
  }

  // Loads VEC_ELEM_NUM floats starting at ptr.
  explicit FP32Vec4(const float* ptr)
      : reg(*reinterpret_cast<const f32x4_t*>(ptr)) {}

  // Wraps an already-populated register.
  explicit FP32Vec4(f32x4_t data) : reg(data) {}

  // Copy constructor; explicit, so copies must use direct initialization.
  explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {}
};
|
|
||||||
|
|
||||||
// Eight single-precision floats with element-wise arithmetic.
// The copy constructor is explicit, so every method returns its result by
// direct-initializing from a register (`return FP32Vec8(out);`).
struct FP32Vec8 : public Vec<FP32Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;

  f32x8_t reg;

  // Broadcasts v to every element.
  explicit FP32Vec8(float v) {
    for (float& lane : reg.val) {
      lane = v;
    }
  }

  // Zero-initializes every element.
  explicit FP32Vec8() {
    for (float& lane : reg.val) {
      lane = 0.0f;
    }
  }

  // Loads VEC_ELEM_NUM floats starting at ptr.
  explicit FP32Vec8(const float* ptr)
      : reg(*reinterpret_cast<const f32x8_t*>(ptr)) {}

  // Wraps an already-populated register.
  explicit FP32Vec8(f32x8_t data) : reg(data) {}

  explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {}

  // Widens eight fp16 bit patterns to float.
  explicit FP32Vec8(const FP16Vec8& v) {
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      reg.val[lane] = fp16_to_float(v.reg.val[lane]);
    }
  }

  // Widens eight bf16 bit patterns to float (implicit conversion allowed).
  FP32Vec8(const BF16Vec8& v) {
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      reg.val[lane] = bf16_to_float(v.reg.val[lane]);
    }
  }

  // Sum of all elements, accumulated from element 0 upwards.
  float reduce_sum() const {
    float total = 0;
    for (float lane : reg.val) {
      total += lane;
    }
    return total;
  }

  // Element-wise expf.
  FP32Vec8 exp() const {
    f32x8_t out;
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      out.val[lane] = expf(reg.val[lane]);
    }
    return FP32Vec8(out);
  }

  // Element-wise tanhf.
  FP32Vec8 tanh() const {
    f32x8_t out;
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      out.val[lane] = tanhf(reg.val[lane]);
    }
    return FP32Vec8(out);
  }

  // Element-wise error function.
  // NOTE(review): this calls unqualified erf, unlike expf/tanhf above, so it
  // may bind to the double overload - confirm whether erff was intended.
  FP32Vec8 er() const {
    f32x8_t out;
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      out.val[lane] = erf(reg.val[lane]);
    }
    return FP32Vec8(out);
  }

  // Element-wise product.
  FP32Vec8 operator*(const FP32Vec8& b) const {
    f32x8_t out;
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      out.val[lane] = reg.val[lane] * b.reg.val[lane];
    }
    return FP32Vec8(out);
  }

  // Element-wise sum.
  FP32Vec8 operator+(const FP32Vec8& b) const {
    f32x8_t out;
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      out.val[lane] = reg.val[lane] + b.reg.val[lane];
    }
    return FP32Vec8(out);
  }

  // Element-wise difference.
  FP32Vec8 operator-(const FP32Vec8& b) const {
    f32x8_t out;
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      out.val[lane] = reg.val[lane] - b.reg.val[lane];
    }
    return FP32Vec8(out);
  }

  // Element-wise quotient.
  FP32Vec8 operator/(const FP32Vec8& b) const {
    f32x8_t out;
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      out.val[lane] = reg.val[lane] / b.reg.val[lane];
    }
    return FP32Vec8(out);
  }

  // Stores all VEC_ELEM_NUM floats to ptr.
  void save(void* ptr) const { *static_cast<f32x8_t*>(ptr) = reg; }
};
|
|
||||||
|
|
||||||
// Sixteen single-precision floats with element-wise arithmetic and
// reductions.
struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  f32x16_t reg;

  // Broadcasts v to every element.
  explicit FP32Vec16(float v) {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = v;
    }
  }

  // Zero-initializes every element.
  explicit FP32Vec16() {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = 0.0f;
    }
  }

  // Loads VEC_ELEM_NUM floats starting at ptr.
  explicit FP32Vec16(const float* ptr)
      : reg(*reinterpret_cast<const f32x16_t*>(ptr)) {}

  // Wraps an already-populated register.
  explicit FP32Vec16(f32x16_t data) : reg(data) {}

  // Repeats the four values of data four times.
  FP32Vec16(const FP32Vec4& data) {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = data.reg.val[i % FP32Vec4::VEC_ELEM_NUM];
    }
  }

  // Repeats the eight values of data twice.
  FP32Vec16(const FP32Vec8& data) {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = data.reg.val[i % FP32Vec8::VEC_ELEM_NUM];
    }
  }

  FP32Vec16(const FP32Vec16& data) : reg(data.reg) {}

  // Widens sixteen fp16 bit patterns to float.
  explicit FP32Vec16(const FP16Vec16& v) {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = fp16_to_float(v.reg.val[i]);
    }
  }

  // Widens sixteen bf16 bit patterns to float.
  explicit FP32Vec16(const BF16Vec16& v) {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = bf16_to_float(v.reg.val[i]);
    }
  }

  // Widens eight fp16 values and repeats them twice.
  explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}

  // Widens eight bf16 values and repeats them twice.
  FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}

  // Element-wise product.
  FP32Vec16 operator*(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = reg.val[i] * b.reg.val[i];
    }
    return result;
  }

  // Element-wise sum.
  FP32Vec16 operator+(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = reg.val[i] + b.reg.val[i];
    }
    return result;
  }

  // Element-wise difference.
  FP32Vec16 operator-(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = reg.val[i] - b.reg.val[i];
    }
    return result;
  }

  // Element-wise quotient.
  FP32Vec16 operator/(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = reg.val[i] / b.reg.val[i];
    }
    return result;
  }

  // Element-wise maximum.
  FP32Vec16 max(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = __max(reg.val[i], b.reg.val[i]);
    }
    return result;
  }

  // Element-wise minimum.
  FP32Vec16 min(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = __min(reg.val[i], b.reg.val[i]);
    }
    return result;
  }

  // Element-wise absolute value.
  FP32Vec16 abs() const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = __abs(reg.val[i]);
    }
    return result;
  }

  // Sum of all elements, accumulated from element 0 upwards.
  float reduce_sum() const {
    float result = 0.0f;
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result += reg.val[i];
    }
    return result;
  }

  // Largest element. Element 0 seeds the result, so the scan starts at 1.
  float reduce_max() const {
    float result = reg.val[0];
    for (int i = 1; i < VEC_ELEM_NUM; ++i) {
      result = __max(reg.val[i], result);
    }
    return result;
  }

  // Smallest element. Element 0 seeds the result, so the scan starts at 1.
  float reduce_min() const {
    float result = reg.val[0];
    for (int i = 1; i < VEC_ELEM_NUM; ++i) {
      result = __min(reg.val[i], result);
    }
    return result;
  }

  // Sum of the idx-th group of group_size consecutive elements.
  // Returns 0 when idx addresses no element of the vector: a negative idx is
  // rejected up front (it previously indexed before the start of the
  // register) and an idx past the end falls out of the loop bound.
  // Now const, matching the other reductions, so it also works on const
  // vectors.
  template <int group_size>
  float reduce_sub_sum(int idx) const {
    static_assert(VEC_ELEM_NUM % group_size == 0);
    float sum = 0.0f;
    if (idx < 0) {
      return sum;
    }
    int start = idx * group_size;
    int end = (idx + 1) * group_size;

    for (; (start < VEC_ELEM_NUM) && (start < end); ++start) {
      sum += reg.val[start];
    }

    return sum;
  }

  // Stores all VEC_ELEM_NUM floats to ptr.
  void save(void* ptr) const { *reinterpret_cast<f32x16_t*>(ptr) = reg; }
};
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
struct VecType {
|
|
||||||
using vec_type = void;
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
using vec_t = typename VecType<T>::vec_type;
|
|
||||||
|
|
||||||
template <>
|
|
||||||
struct VecType<float> {
|
|
||||||
using vec_type = FP32Vec8;
|
|
||||||
};
|
|
||||||
|
|
||||||
template <>
|
|
||||||
struct VecType<c10::Half> {
|
|
||||||
using vec_type = FP16Vec8;
|
|
||||||
};
|
|
||||||
|
|
||||||
template <>
|
|
||||||
struct VecType<c10::BFloat16> {
|
|
||||||
using vec_type = BF16Vec8;
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
void storeFP32(float v, T* ptr) {
|
|
||||||
*ptr = v;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
|
|
||||||
c10::Half __attribute__((__may_alias__)) *v_ptr =
|
|
||||||
reinterpret_cast<c10::Half *>(&v);
|
|
||||||
*ptr = *(v_ptr + 1);
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
template <>
|
|
||||||
inline void storeFP32<c10::Half>(float v, c10::Half* ptr) {
|
|
||||||
uint16_t fp16 = float_to_fp16(v);
|
|
||||||
*reinterpret_cast<uint16_t*>(ptr) = fp16;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
|
|
||||||
c10::BFloat16 __attribute__((__may_alias__))* v_ptr =
|
|
||||||
reinterpret_cast<c10::BFloat16*>(&v);
|
|
||||||
*ptr = *(v_ptr + 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
|
|
||||||
int i = 0;
|
|
||||||
for (i = 0; i < FP16Vec16::VEC_ELEM_NUM; ++i) {
|
|
||||||
reg.val[i] = float_to_fp16(v.reg.val[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) {
|
|
||||||
int i = 0;
|
|
||||||
for (i = 0; i < FP16Vec8::VEC_ELEM_NUM; ++i) {
|
|
||||||
reg.val[i] = float_to_fp16(v.reg.val[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Accumulates the element-wise product a * b into acc.
// Computed as a separate multiply followed by an add.
inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
  const FP32Vec16 product = a * b;
  acc = acc + product;
}
|
|
||||||
|
|
||||||
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
|
|
||||||
int i = 0;
|
|
||||||
for (i = 0; i < BF16Vec8::VEC_ELEM_NUM; ++i) {
|
|
||||||
reg.val[i] = float_to_bf16(v.reg.val[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
|
|
||||||
int i = 0;
|
|
||||||
for (i = 0; i < BF16Vec16::VEC_ELEM_NUM; ++i) {
|
|
||||||
reg.val[i] = float_to_bf16(v.reg.val[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Issues a read prefetch hint (rw = 0) with the highest temporal-locality
// level (3) for the cache line containing addr.
inline void prefetch(const void* addr) {
  __builtin_prefetch(addr, /*rw=*/0, /*locality=*/3);
}
|
|
||||||
|
|
||||||
}; // namespace vec_op
|
|
||||||
@ -1,106 +0,0 @@
|
|||||||
|
|
||||||
// Widens a bf16 bit pattern to float: the 16 bits become the upper half of
// the 32-bit float pattern and the lower half is zero.
static float bf16_to_float(uint16_t bf16) {
  const uint32_t widened = static_cast<uint32_t>(bf16) << 16;
  float value;
  std::memcpy(&value, &widened, sizeof(value));
  return value;
}
|
|
||||||
|
|
||||||
// Narrows a float to a bf16 bit pattern by keeping the upper 16 bits of the
// float pattern (truncation, no rounding).
// NaN inputs stay NaN: plain truncation would turn a NaN whose payload lives
// only in the low 16 mantissa bits into infinity, so the quiet bit is forced
// on for every NaN.
static uint16_t float_to_bf16(float fp32) {
  uint32_t bits;
  std::memcpy(&bits, &fp32, sizeof(fp32));
  uint16_t bf16 = static_cast<uint16_t>(bits >> 16);
  // Exponent all ones with a non-zero mantissa means NaN.
  const bool is_nan = (bits & 0x7fffffffu) > 0x7f800000u;
  if (is_nan) {
    bf16 = static_cast<uint16_t>(bf16 | 0x0040u);
  }
  return bf16;
}
|
|
||||||
|
|
||||||
/************************************************
|
|
||||||
* Copyright (c) 2015 Princeton Vision Group
|
|
||||||
* Licensed under the MIT license.
|
|
||||||
* Codes below copied from
|
|
||||||
* https://github.com/PrincetonVision/marvin/tree/master/tools/tensorIO_matlab
|
|
||||||
*************************************************/
|
|
||||||
// Narrows a float to an fp16 bit pattern with round-to-nearest-even.
// Every NaN becomes 0x7fff; magnitudes above 0x477fefff become signed
// infinity; magnitudes below 0x33000001 become signed zero; small values are
// emitted as fp16 subnormals.
static uint16_t float_to_fp16(float fp32) {
  unsigned bits;
  std::memcpy(&bits, &fp32, sizeof(fp32));
  const unsigned magnitude = bits & 0x7fffffff;

  // NaN of either sign collapses to a single positive NaN pattern.
  if (magnitude > 0x7f800000) {
    return 0x7fffU;
  }

  // Sign moved from bit 31 to bit 15.
  const unsigned sign = (bits >> 16) & 0x8000;

  // Infinity, and finite values too large for fp16.
  if (magnitude > 0x477fefff) {
    return static_cast<uint16_t>(sign | 0x7c00U);
  }
  // Zero, and values too small to round up to the smallest subnormal.
  if (magnitude < 0x33000001) {
    return static_cast<uint16_t>(sign | 0x0000);
  }

  unsigned exponent = (magnitude >> 23) & 0xff;
  unsigned mantissa = magnitude & 0x7fffff;
  unsigned shift;

  if (exponent > 0x70) {
    // Normal fp16: rebias the exponent (127 -> 15), drop 13 mantissa bits.
    shift = 13;
    exponent -= 0x70;
  } else {
    // Subnormal fp16: make the implicit leading one explicit and shift it
    // into place.
    shift = 0x7e - exponent;
    exponent = 0;
    mantissa |= 0x800000;
  }

  const unsigned unit = 1u << shift;   // weight of the kept LSB
  const unsigned half_unit = unit >> 1;
  const unsigned dropped = mantissa & (unit - 1);
  mantissa >>= shift;

  // Round to nearest, ties to even.
  const bool round_up =
      dropped > half_unit || (dropped == half_unit && (mantissa & 0x1));
  if (round_up) {
    ++mantissa;
    // A carry out of the 10-bit mantissa bumps the exponent.
    if (!(mantissa & 0x3ff)) {
      ++exponent;
      mantissa = 0;
    }
  }

  return static_cast<uint16_t>(sign | (exponent << 10) | mantissa);
}
|
|
||||||
|
|
||||||
// Widens an fp16 bit pattern to float. Infinity keeps its sign, every NaN
// becomes a positive NaN with all mantissa bits set, and fp16 subnormals are
// renormalized into ordinary floats.
static float fp16_to_float(uint16_t fp16) {
  unsigned sign = (fp16 >> 15) & 1;
  unsigned exponent = (fp16 >> 10) & 0x1f;
  // 10-bit fp16 mantissa aligned to the top of the 23-bit float mantissa.
  unsigned mantissa = (fp16 & 0x3ff) << 13;

  if (exponent == 0x1f) {
    // NaN or Inf
    if (mantissa != 0) {
      sign = 0;
      mantissa = 0x7fffff;
    }
    exponent = 0xff;
  } else if (exponent == 0) {
    // Denorm or Zero
    if (mantissa != 0) {
      exponent = 0x71;
      unsigned leading;
      // Shift left until the leading one has moved out of the mantissa
      // field, lowering the exponent once per step.
      do {
        leading = mantissa & 0x400000;
        mantissa <<= 1;
        --exponent;
      } while (!leading);
      // The leading one is now implicit.
      mantissa &= 0x7fffff;
    }
  } else {
    // Normal number: rebias the exponent (15 -> 127).
    exponent += 0x70;
  }

  const unsigned word = (sign << 31) | (exponent << 23) | mantissa;
  float value;
  std::memcpy(&value, &word, sizeof(word));
  return value;
}
|
|
||||||
@ -88,18 +88,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
" int tp_rank, int blocksparse_local_blocks,"
|
" int tp_rank, int blocksparse_local_blocks,"
|
||||||
" int blocksparse_vert_stride, int blocksparse_block_size,"
|
" int blocksparse_vert_stride, int blocksparse_block_size,"
|
||||||
" int blocksparse_head_sliding_step) -> ()");
|
" int blocksparse_head_sliding_step) -> ()");
|
||||||
|
|
||||||
ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);
|
ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);
|
||||||
|
|
||||||
ops.def(
|
|
||||||
"dynamic_4bit_int_moe("
|
|
||||||
"Tensor x, Tensor topk_ids, Tensor topk_weights,"
|
|
||||||
"Tensor w13_packed, Tensor w2_packed, int H, int I, int I2,"
|
|
||||||
"int group_size, bool apply_router_weight_on_input, int activation_kind"
|
|
||||||
") -> Tensor");
|
|
||||||
|
|
||||||
ops.impl("dynamic_4bit_int_moe", torch::kCPU, &dynamic_4bit_int_moe_cpu);
|
|
||||||
|
|
||||||
// PagedAttention V2.
|
// PagedAttention V2.
|
||||||
ops.def(
|
ops.def(
|
||||||
"paged_attention_v2("
|
"paged_attention_v2("
|
||||||
|
|||||||
@ -12,7 +12,6 @@ using CubMaxOp = cub::Max;
|
|||||||
#endif // CUB_VERSION
|
#endif // CUB_VERSION
|
||||||
#else
|
#else
|
||||||
#include <hipcub/hipcub.hpp>
|
#include <hipcub/hipcub.hpp>
|
||||||
namespace cub = hipcub;
|
using CubAddOp = cub::Sum;
|
||||||
using CubAddOp = hipcub::Sum;
|
using CubMaxOp = cub::Max;
|
||||||
using CubMaxOp = hipcub::Max;
|
|
||||||
#endif // USE_ROCM
|
#endif // USE_ROCM
|
||||||
|
|||||||
@ -1,64 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <cuda_runtime_api.h>
|
|
||||||
#include <algorithm>
|
|
||||||
|
|
||||||
// maximum blocks per SM cap
|
|
||||||
#ifndef VLLM_LAUNCH_BLOCKS_CAP
|
|
||||||
#define VLLM_LAUNCH_BLOCKS_CAP 4
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Compile-time estimate of max threads per SM for launch bounds.
|
|
||||||
// Families: 1024, 1536, 2048 threads/SM.
|
|
||||||
#ifndef VLLM_MAX_THREADS_PER_SM
|
|
||||||
#ifdef __CUDA_ARCH__
|
|
||||||
|
|
||||||
/* 1024 thr/SM: Turing (sm_75) */
|
|
||||||
#if (__CUDA_ARCH__ == 750)
|
|
||||||
#define VLLM_MAX_THREADS_PER_SM 1024
|
|
||||||
|
|
||||||
/* 1536 thr/SM: Ampere GA10x (sm_86/87), Ada (sm_89),
|
|
||||||
GB20x consumer (sm_120/121), Thor (sm_101 or sm_110) */
|
|
||||||
#elif (__CUDA_ARCH__ == 860) || (__CUDA_ARCH__ == 870) || \
|
|
||||||
(__CUDA_ARCH__ == 890) || (__CUDA_ARCH__ == 1010) || \
|
|
||||||
(__CUDA_ARCH__ == 1100) || (__CUDA_ARCH__ == 1200) || \
|
|
||||||
(__CUDA_ARCH__ == 1210)
|
|
||||||
#define VLLM_MAX_THREADS_PER_SM 1536
|
|
||||||
|
|
||||||
/* 2048 thr/SM: Volta (sm_70/72), Ampere GA100 (sm_80),
|
|
||||||
Hopper (sm_90), Blackwell (sm_100/103) */
|
|
||||||
#elif (__CUDA_ARCH__ == 700) || (__CUDA_ARCH__ == 720) || \
|
|
||||||
(__CUDA_ARCH__ == 800) || (__CUDA_ARCH__ == 900) || \
|
|
||||||
(__CUDA_ARCH__ == 1000) || (__CUDA_ARCH__ == 1030)
|
|
||||||
#define VLLM_MAX_THREADS_PER_SM 2048
|
|
||||||
|
|
||||||
/* Fallback: use 2048 for unknown future CCs */
|
|
||||||
#else
|
|
||||||
#define VLLM_MAX_THREADS_PER_SM 2048
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#else
|
|
||||||
/* Host pass (no __CUDA_ARCH__): neutral default */
|
|
||||||
#define VLLM_MAX_THREADS_PER_SM 2048
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// compute the number of blocks per SM to request in __launch_bounds__
|
|
||||||
#define VLLM_BLOCKS_DIV(VAL) (VLLM_MAX_THREADS_PER_SM / (VAL))
|
|
||||||
#define VLLM_CLAMP_BLOCKS_PER_SM(VAL) \
|
|
||||||
(((VAL) <= 0) \
|
|
||||||
? 1 \
|
|
||||||
: (((VAL) < VLLM_LAUNCH_BLOCKS_CAP) ? (VAL) : VLLM_LAUNCH_BLOCKS_CAP))
|
|
||||||
#define VLLM_BLOCKS_PER_SM(BLOCK_THREADS) \
|
|
||||||
VLLM_CLAMP_BLOCKS_PER_SM(VLLM_BLOCKS_DIV(BLOCK_THREADS))
|
|
||||||
|
|
||||||
// runtime-time helper to compute blocks/SM
|
|
||||||
static inline int vllm_runtime_blocks_per_sm(int block_threads) {
|
|
||||||
int device = -1;
|
|
||||||
cudaGetDevice(&device);
|
|
||||||
int max_threads_per_sm = VLLM_MAX_THREADS_PER_SM;
|
|
||||||
cudaDeviceGetAttribute(&max_threads_per_sm,
|
|
||||||
cudaDevAttrMaxThreadsPerMultiProcessor, device);
|
|
||||||
int blocks = (block_threads > 0) ? (max_threads_per_sm / block_threads) : 1;
|
|
||||||
return VLLM_CLAMP_BLOCKS_PER_SM(blocks);
|
|
||||||
}
|
|
||||||
@ -1,7 +1,6 @@
|
|||||||
#include "type_convert.cuh"
|
#include "type_convert.cuh"
|
||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
#include "cub_helpers.h"
|
#include "cub_helpers.h"
|
||||||
#include "core/batch_invariant.hpp"
|
|
||||||
|
|
||||||
#include <torch/cuda.h>
|
#include <torch/cuda.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
@ -414,9 +413,7 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
|||||||
wt_ptr % req_alignment_bytes == 0;
|
wt_ptr % req_alignment_bytes == 0;
|
||||||
bool offsets_are_multiple_of_vector_width =
|
bool offsets_are_multiple_of_vector_width =
|
||||||
hidden_size % vector_width == 0 && input_stride % vector_width == 0;
|
hidden_size % vector_width == 0 && input_stride % vector_width == 0;
|
||||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width) {
|
||||||
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
|
|
||||||
!batch_invariant_launch) {
|
|
||||||
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||||
} else {
|
} else {
|
||||||
LAUNCH_FUSED_ADD_RMS_NORM(0);
|
LAUNCH_FUSED_ADD_RMS_NORM(0);
|
||||||
@ -462,8 +459,7 @@ void poly_norm(torch::Tensor& out, // [..., hidden_size]
|
|||||||
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
|
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
|
||||||
auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
|
auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
|
||||||
bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
|
bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
|
||||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
if (ptrs_are_aligned && hidden_size % 8 == 0) {
|
||||||
if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
|
|
||||||
LAUNCH_FUSED_POLY_NORM(8);
|
LAUNCH_FUSED_POLY_NORM(8);
|
||||||
} else {
|
} else {
|
||||||
LAUNCH_FUSED_POLY_NORM(0);
|
LAUNCH_FUSED_POLY_NORM(0);
|
||||||
|
|||||||
@ -6,10 +6,9 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include "type_convert.cuh"
|
#include "type_convert.cuh"
|
||||||
#include "quantization/w8a8/fp8/common.cuh"
|
#include "quantization/fp8/common.cuh"
|
||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
#include "cub_helpers.h"
|
#include "cub_helpers.h"
|
||||||
#include "core/batch_invariant.hpp"
|
|
||||||
|
|
||||||
#include <torch/cuda.h>
|
#include <torch/cuda.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
@ -241,9 +240,7 @@ void fused_add_rms_norm_static_fp8_quant(
|
|||||||
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
|
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
|
||||||
bool ptrs_are_aligned =
|
bool ptrs_are_aligned =
|
||||||
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
|
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
|
||||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0) {
|
||||||
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
|
|
||||||
!batch_invariant_launch) {
|
|
||||||
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||||
} else {
|
} else {
|
||||||
LAUNCH_FUSED_ADD_RMS_NORM(0);
|
LAUNCH_FUSED_ADD_RMS_NORM(0);
|
||||||
|
|||||||
@ -1,156 +0,0 @@
|
|||||||
#include <ATen/ATen.h>
|
|
||||||
#include <ATen/Parallel.h>
|
|
||||||
#include <torch/all.h>
|
|
||||||
|
|
||||||
// _dyn_quant_matmul_4bit is only available on AArch64.
|
|
||||||
#if defined(__aarch64__)
|
|
||||||
#include <ATen/ops/_dyn_quant_matmul_4bit.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Dispatches a dynamically quantized 4-bit matmul of `a` against `packed_w`.
//
// On AArch64 this forwards to at::_ops::_dyn_quant_matmul_4bit::call with the
// given group size and in/out feature counts. On every other architecture the
// op is unavailable, so the call fails with a TORCH_CHECK error.
inline torch::Tensor mm(const torch::Tensor& a, const torch::Tensor& packed_w,
                        int64_t group_size_eff, int64_t in_features,
                        int64_t out_features) {
#if !defined(__aarch64__)
  TORCH_CHECK(false,
              "dynamic 4-bit int MoE path requires AArch64 (ARM64); "
              "_dyn_quant_matmul_4bit is unavailable on this architecture");
  // Not reached: TORCH_CHECK(false, ...) throws. Kept so every path returns.
  return {};
#else
  torch::Tensor product = at::_ops::_dyn_quant_matmul_4bit::call(
      a, packed_w, group_size_eff, in_features, out_features);
  return product;
#endif
}
|
|
||||||
|
|
||||||
// Activation selector for dynamic_4bit_int_moe_cpu. In the formulas below,
// g and u are the first and second I-wide column halves of the W13 matmul
// output.
enum ActivationKind : int64_t {
|
|
||||||
SwiGLU_Gu = 0, // act = SiLU(g) * u
|
|
||||||
SwiGLUOAI = 1, // act = (clamp(u, -7, 7) + 1) * g' * sigmoid(1.702 * g'), with g' = min(g, 7)
|
|
||||||
SiLU = 2 // handled exactly like SwiGLU_Gu: act = SiLU(g) * u
|
|
||||||
};
|
|
||||||
|
|
||||||
// CPU mixture-of-experts forward pass over dynamically quantized 4-bit expert
// weights.
//
// Parameters (as used by the body):
//   x             2-D activations with T rows; rows are gathered per expert.
//   topk_ids      [T, K] expert index for every (token, slot). Converted to
//                 int64 if needed; every value must lie in [0, E).
//   topk_weights  [T, K] router weight for every (token, slot); converted to
//                 float.
//   w13_packed    packed first-projection weights, one slice per expert on
//                 dim 0 (E = w13_packed.size(0)).
//   w2_packed     packed second-projection weights, same expert count.
//   H, I, I2      feature sizes handed to the matmul: W13 maps H -> I2 and
//                 W2 maps I -> H. I2 must equal 2 * I.
//   group_size    quantization group size; -1 selects H for W13 and I for W2.
//   apply_router_weight_on_input
//                 true: scale the gathered inputs by the router weight;
//                 false: scale the expert outputs instead.
//   activation_kind
//                 ActivationKind value. SwiGLUOAI selects the clamped variant;
//                 every other value uses SiLU(g) * u.
//
// Returns a [T, H] tensor (created with x's options) holding, for each token,
// the sum of its selected experts' outputs.
//
// Raises (via TORCH_CHECK) on rank/shape mismatches or out-of-range expert ids.
torch::Tensor dynamic_4bit_int_moe_cpu(
    torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights,
    torch::Tensor w13_packed, torch::Tensor w2_packed, int64_t H, int64_t I,
    int64_t I2, int64_t group_size, bool apply_router_weight_on_input,
    int64_t activation_kind) {
  TORCH_CHECK(x.dim() == 2, "x must be 2D");
  TORCH_CHECK(topk_ids.dim() == 2 && topk_weights.dim() == 2,
              "topk tensors must be [T, K]");
  TORCH_CHECK(
      w13_packed.size(0) == w2_packed.size(0),
      "w13_packed and w2_packed must have same number of experts in dim 0");
  TORCH_CHECK(I2 == 2 * I, "I2 must equal 2*I");

  const int64_t T = x.size(0);
  const int64_t K = topk_ids.size(1);
  const int64_t E = w13_packed.size(0);
  const int64_t N = T * K;

  // The loops below walk topk_ids / topk_weights through raw pointers for
  // exactly T * K elements, so both tensors must really hold that many.
  TORCH_CHECK(topk_ids.size(0) == T,
              "topk_ids must have one row per row of x");
  TORCH_CHECK(topk_weights.size(0) == T && topk_weights.size(1) == K,
              "topk_weights must have the same shape as topk_ids");

  auto x_c = x.contiguous();
  // Raw access below is typed int64_t; convert so other integer dtypes work.
  auto ids_c = topk_ids.to(at::kLong).contiguous();
  auto gates_c = topk_weights.to(at::kFloat).contiguous();

  // bucketing tokens -> experts: first count how many slots hit each expert.
  c10::SmallVector<int64_t, 64> counts(E, 0);
  {
    const auto* ids_ptr = ids_c.data_ptr<int64_t>();
    for (int64_t i = 0; i < N; ++i) {
      const int64_t e_id = ids_ptr[i];
      TORCH_CHECK(0 <= e_id && e_id < E, "expert id out of range");
      counts[e_id]++;
    }
  }

  // Exclusive prefix sum: offsets[e] is where expert e's bucket starts.
  c10::SmallVector<int64_t, 65> offsets(E + 1, 0);  // ( E +1 )
  for (int64_t e = 0; e < E; ++e) offsets[e + 1] = offsets[e] + counts[e];

  // Scatter token indices and gate values into their expert buckets.
  auto expert_tokens = at::empty({offsets[E]}, ids_c.options());
  auto expert_gates = at::empty({offsets[E]}, gates_c.options());
  {
    c10::SmallVector<int64_t, 64> cursor(E, 0);
    const auto* ids_ptr = ids_c.data_ptr<int64_t>();
    const auto* gts_ptr = gates_c.data_ptr<float>();
    auto* tok_ptr = expert_tokens.data_ptr<int64_t>();
    auto* gate_ptr = expert_gates.data_ptr<float>();

    for (int64_t t = 0; t < T; ++t) {
      const int64_t base = t * K;
      for (int64_t k = 0; k < K; ++k) {
        const int64_t idx = base + k;
        // Expert ids were range-checked in the counting pass above.
        const int64_t e = ids_ptr[idx];
        const int64_t p = offsets[e] + (cursor[e]++);
        tok_ptr[p] = t;
        gate_ptr[p] = gts_ptr[idx];
      }
    }
  }

  const int64_t g_eff_13 = (group_size != -1) ? group_size : H;
  const int64_t g_eff_2 = (group_size != -1) ? group_size : I;

  // Per-expert outputs filled in parallel; each iteration writes only its own
  // y_list slot.
  std::vector<torch::Tensor> y_list(E);

  at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) {
    for (int64_t e = e_begin; e < e_end; ++e) {
      const int64_t te = counts[e];
      if (te == 0) {
        // Keep a zero-row placeholder so the concatenation below stays aligned
        // with expert_tokens.
        y_list[e] = at::empty({0, H}, x_c.options());
        continue;
      }

      const int64_t start = offsets[e];

      auto sel_tokens =
          expert_tokens.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
      auto gates_e =
          expert_gates.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);

      auto x_e = x_c.index_select(/*dim=*/0, sel_tokens);

      if (apply_router_weight_on_input) {
        x_e = x_e.mul(gates_e.unsqueeze(1));
      }

      auto w13_e = w13_packed.select(/*dim=*/0, e);
      auto w2_e = w2_packed.select(/*dim=*/0, e);

      // W13
      auto y13 =
          mm(x_e, w13_e, g_eff_13, /*in_features=*/H, /*out_features=*/I2);

      // First I columns are the gate half, next I columns the up half.
      auto g_part = y13.narrow(/*dim=*/1, /*start=*/0, /*length=*/I);
      auto u_part = y13.narrow(/*dim=*/1, /*start=*/I, /*length=*/I);

      torch::Tensor act;
      if (activation_kind == ActivationKind::SwiGLUOAI) {  // SwiGLUOAI
        constexpr double kAlpha = 1.702;  // GPT-OSS default
        constexpr double kLimit = 7.0;    // GPT-OSS default
        auto gate_c = at::clamp_max(g_part, kLimit);
        auto up_c = at::clamp(u_part, -kLimit, kLimit);
        auto glu = gate_c.mul(at::sigmoid(gate_c.mul(kAlpha)));
        act = up_c.add(1.0).mul(glu);
      } else {  // SiLU , SwiGLU_GU, vLLM maps silu to SiluAndMul()
        act = at::silu(g_part).mul(u_part);
      }

      // W2
      auto y = mm(act, w2_e, g_eff_2, /*in_features=*/I, /*out_features=*/H);

      if (!apply_router_weight_on_input) {
        y = y.mul(gates_e.unsqueeze(1));
      }

      // Store per-expert result
      y_list[e] = y;
    }
  });

  // Concatenate all expert outputs to match expert_tokens order, then add each
  // row back onto its originating token.
  auto Y_all = at::cat(y_list, /*dim=*/0);
  auto out = at::zeros({T, H}, x.options());
  out =
      at::index_add(out, /*dim=*/0, /*index=*/expert_tokens, /*source=*/Y_all);

  return out;
}
|
|
||||||
@ -21,7 +21,6 @@
|
|||||||
#include <torch/all.h>
|
#include <torch/all.h>
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
#include <cuda_bf16.h>
|
#include <cuda_bf16.h>
|
||||||
#include <cuda/std/limits>
|
|
||||||
#include <cooperative_groups.h>
|
#include <cooperative_groups.h>
|
||||||
#include <cooperative_groups/reduce.h>
|
#include <cooperative_groups/reduce.h>
|
||||||
namespace cg = cooperative_groups;
|
namespace cg = cooperative_groups;
|
||||||
@ -29,6 +28,7 @@ namespace cg = cooperative_groups;
|
|||||||
namespace vllm {
|
namespace vllm {
|
||||||
namespace moe {
|
namespace moe {
|
||||||
|
|
||||||
|
constexpr float kNegInfinity = INFINITY * -1;
|
||||||
constexpr unsigned FULL_WARP_MASK = 0xffffffff;
|
constexpr unsigned FULL_WARP_MASK = 0xffffffff;
|
||||||
constexpr int32_t WARP_SIZE = 32;
|
constexpr int32_t WARP_SIZE = 32;
|
||||||
constexpr int32_t BLOCK_SIZE = 512;
|
constexpr int32_t BLOCK_SIZE = 512;
|
||||||
@ -411,30 +411,14 @@ __device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
|
|||||||
return __bfloat162float(val);
|
return __bfloat162float(val);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns negative infinity in type T, obtained by converting the fp32
// negative infinity through cuda_cast<T, float>.
template <typename T>
|
|
||||||
__device__ inline T neg_inf() {
|
|
||||||
// cuda::std::numeric_limits<T>::infinity() returns `0` for [T=bf16 or fp16]
|
|
||||||
// so we need to cast from fp32
|
|
||||||
return cuda_cast<T, float>(-cuda::std::numeric_limits<float>::infinity());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reports whether `val` is finite (neither infinite nor NaN).
template <typename T>
|
|
||||||
__device__ inline bool is_finite(const T val) {
|
|
||||||
// 120800 encodes nvcc 12.8 (major * 10000 + minor * 100). From that
// release on, cuda::std::isfinite is applied to T directly.
// NOTE(review): presumably earlier toolchains lack cuda::std::isfinite
// support for the reduced-precision types - confirm.
#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
|
|
||||||
return cuda::std::isfinite(val);
|
|
||||||
#else
|
|
||||||
// Older toolchains: convert to float first and test the fp32 value.
return isfinite(cuda_cast<float, T>(val));
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
__device__ void topk_with_k2(T* output, T const* input,
|
__device__ void topk_with_k2(T* output, T const* input,
|
||||||
cg::thread_block_tile<32> const& tile,
|
cg::thread_block_tile<32> const& tile,
|
||||||
int32_t const lane_id,
|
int32_t const lane_id,
|
||||||
int const num_experts_per_group) {
|
int const num_experts_per_group) {
|
||||||
// Get the top2 per thread
|
// Get the top2 per thread
|
||||||
T largest = neg_inf<T>();
|
T largest = -INFINITY;
|
||||||
T second_largest = neg_inf<T>();
|
T second_largest = -INFINITY;
|
||||||
|
|
||||||
if (num_experts_per_group > WARP_SIZE) {
|
if (num_experts_per_group > WARP_SIZE) {
|
||||||
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
|
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
|
||||||
@ -529,8 +513,8 @@ __global__ void group_idx_and_topk_idx_kernel(
|
|||||||
warp_id * topk;
|
warp_id * topk;
|
||||||
s_topk_idx += warp_id * topk;
|
s_topk_idx += warp_id * topk;
|
||||||
|
|
||||||
T value = neg_inf<T>();
|
T value = kNegInfinity;
|
||||||
T topk_group_value = neg_inf<T>();
|
T topk_group_value = kNegInfinity;
|
||||||
int32_t num_equalto_topkth_group;
|
int32_t num_equalto_topkth_group;
|
||||||
|
|
||||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||||
@ -541,8 +525,11 @@ __global__ void group_idx_and_topk_idx_kernel(
|
|||||||
if (case_id < num_tokens) {
|
if (case_id < num_tokens) {
|
||||||
// calculate group_idx
|
// calculate group_idx
|
||||||
int32_t target_num_min = WARP_SIZE - n_group + topk_group;
|
int32_t target_num_min = WARP_SIZE - n_group + topk_group;
|
||||||
// The check is necessary to avoid abnormal input
|
if (lane_id < n_group &&
|
||||||
if (lane_id < n_group && is_finite(group_scores[lane_id])) {
|
(isfinite(cuda_cast<float, T>(
|
||||||
|
group_scores[lane_id])))) // The check is necessary to avoid
|
||||||
|
// abnormal input
|
||||||
|
{
|
||||||
value = group_scores[lane_id];
|
value = group_scores[lane_id];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -553,11 +540,11 @@ __global__ void group_idx_and_topk_idx_kernel(
|
|||||||
__syncwarp(); // Ensure all threads have valid data before reduction
|
__syncwarp(); // Ensure all threads have valid data before reduction
|
||||||
topk_group_value = cg::reduce(tile, value, cg::greater<T>());
|
topk_group_value = cg::reduce(tile, value, cg::greater<T>());
|
||||||
if (value == topk_group_value) {
|
if (value == topk_group_value) {
|
||||||
value = neg_inf<T>();
|
value = kNegInfinity;
|
||||||
}
|
}
|
||||||
pre_count_equal_to_top_value = count_equal_to_top_value;
|
pre_count_equal_to_top_value = count_equal_to_top_value;
|
||||||
count_equal_to_top_value =
|
count_equal_to_top_value = __popc(__ballot_sync(
|
||||||
__popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf<T>())));
|
FULL_WARP_MASK, (value == cuda_cast<T, float>(kNegInfinity))));
|
||||||
}
|
}
|
||||||
num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
|
num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
|
||||||
}
|
}
|
||||||
@ -565,10 +552,11 @@ __global__ void group_idx_and_topk_idx_kernel(
|
|||||||
|
|
||||||
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
|
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
|
||||||
/* is_stable */ true>
|
/* is_stable */ true>
|
||||||
queue((int32_t)topk, neg_inf<T>());
|
queue((int32_t)topk, -INFINITY);
|
||||||
|
|
||||||
int count_equalto_topkth_group = 0;
|
int count_equalto_topkth_group = 0;
|
||||||
bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
|
bool if_proceed_next_topk =
|
||||||
|
(topk_group_value != cuda_cast<T, float>(kNegInfinity));
|
||||||
if (case_id < num_tokens && if_proceed_next_topk) {
|
if (case_id < num_tokens && if_proceed_next_topk) {
|
||||||
for (int i_group = 0; i_group < n_group; i_group++) {
|
for (int i_group = 0; i_group < n_group; i_group++) {
|
||||||
if ((group_scores[i_group] > topk_group_value) ||
|
if ((group_scores[i_group] > topk_group_value) ||
|
||||||
@ -577,10 +565,11 @@ __global__ void group_idx_and_topk_idx_kernel(
|
|||||||
int32_t offset = i_group * num_experts_per_group;
|
int32_t offset = i_group * num_experts_per_group;
|
||||||
for (int32_t i = lane_id; i < align_num_experts_per_group;
|
for (int32_t i = lane_id; i < align_num_experts_per_group;
|
||||||
i += WARP_SIZE) {
|
i += WARP_SIZE) {
|
||||||
T candidates = (i < num_experts_per_group) &&
|
T candidates =
|
||||||
is_finite(scores_with_bias[offset + i])
|
(i < num_experts_per_group) && isfinite(cuda_cast<float, T>(
|
||||||
? scores_with_bias[offset + i]
|
scores_with_bias[offset + i]))
|
||||||
: neg_inf<T>();
|
? scores_with_bias[offset + i]
|
||||||
|
: cuda_cast<T, float>(kNegInfinity);
|
||||||
queue.add(candidates, offset + i);
|
queue.add(candidates, offset + i);
|
||||||
}
|
}
|
||||||
if (group_scores[i_group] == topk_group_value) {
|
if (group_scores[i_group] == topk_group_value) {
|
||||||
@ -609,8 +598,7 @@ __global__ void group_idx_and_topk_idx_kernel(
|
|||||||
if (i < topk) {
|
if (i < topk) {
|
||||||
s_topk_value[i] = value;
|
s_topk_value[i] = value;
|
||||||
}
|
}
|
||||||
topk_sum +=
|
topk_sum += reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
|
||||||
cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -44,9 +44,6 @@ __global__ void moe_align_block_size_kernel(
|
|||||||
|
|
||||||
for (size_t i = tid; i < numel; i += stride) {
|
for (size_t i = tid; i < numel; i += stride) {
|
||||||
int expert_id = topk_ids[i];
|
int expert_id = topk_ids[i];
|
||||||
if (expert_id >= num_experts) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
int warp_idx = expert_id / experts_per_warp;
|
int warp_idx = expert_id / experts_per_warp;
|
||||||
int expert_offset = expert_id % experts_per_warp;
|
int expert_offset = expert_id % experts_per_warp;
|
||||||
atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
|
atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
|
||||||
@ -98,15 +95,12 @@ template <typename scalar_t>
|
|||||||
__global__ void count_and_sort_expert_tokens_kernel(
|
__global__ void count_and_sort_expert_tokens_kernel(
|
||||||
const scalar_t* __restrict__ topk_ids,
|
const scalar_t* __restrict__ topk_ids,
|
||||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
|
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
|
||||||
size_t numel, int32_t num_experts) {
|
size_t numel) {
|
||||||
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
const size_t stride = blockDim.x * gridDim.x;
|
const size_t stride = blockDim.x * gridDim.x;
|
||||||
|
|
||||||
for (size_t i = tid; i < numel; i += stride) {
|
for (size_t i = tid; i < numel; i += stride) {
|
||||||
int32_t expert_id = topk_ids[i];
|
int32_t expert_id = topk_ids[i];
|
||||||
if (expert_id >= num_experts) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
|
int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
|
||||||
sorted_token_ids[rank_post_pad] = i;
|
sorted_token_ids[rank_post_pad] = i;
|
||||||
}
|
}
|
||||||
@ -275,7 +269,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
|||||||
sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
|
sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
|
||||||
topk_ids.data_ptr<scalar_t>(),
|
topk_ids.data_ptr<scalar_t>(),
|
||||||
sorted_token_ids.data_ptr<int32_t>(),
|
sorted_token_ids.data_ptr<int32_t>(),
|
||||||
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel(), num_experts);
|
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@ -21,7 +21,6 @@
|
|||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
#include "../cuda_compat.h"
|
#include "../cuda_compat.h"
|
||||||
#include "../cub_helpers.h"
|
#include "../cub_helpers.h"
|
||||||
#include "../core/batch_invariant.hpp"
|
|
||||||
|
|
||||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
@ -406,8 +405,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
|
|||||||
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
|
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
|
||||||
static constexpr int VPT = Constants::VPT;
|
static constexpr int VPT = Constants::VPT;
|
||||||
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
|
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
|
||||||
const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
||||||
const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
|
||||||
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
|
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
|
||||||
|
|
||||||
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
|
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
|
||||||
|
|||||||
@ -328,12 +328,6 @@ void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
|
|||||||
const std::optional<torch::Tensor>& has_initial_state,
|
const std::optional<torch::Tensor>& has_initial_state,
|
||||||
const torch::Tensor& ssm_states, int64_t pad_slot_id);
|
const torch::Tensor& ssm_states, int64_t pad_slot_id);
|
||||||
|
|
||||||
torch::Tensor dynamic_4bit_int_moe_cpu(
|
|
||||||
torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights,
|
|
||||||
torch::Tensor w13_packed, torch::Tensor w2_packed, int64_t H, int64_t I,
|
|
||||||
int64_t I2, int64_t group_size, bool apply_router_weight_on_input,
|
|
||||||
int64_t activation_kind);
|
|
||||||
|
|
||||||
using fptr_t = int64_t;
|
using fptr_t = int64_t;
|
||||||
fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs,
|
fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs,
|
||||||
torch::Tensor& rank_data, int64_t rank,
|
torch::Tensor& rank_data, int64_t rank,
|
||||||
|
|||||||
@ -7,7 +7,7 @@
|
|||||||
#include "../cuda_compat.h"
|
#include "../cuda_compat.h"
|
||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
|
|
||||||
#include "quantization/w8a8/fp8/common.cuh"
|
#include "quantization/fp8/common.cuh"
|
||||||
|
|
||||||
#include <c10/util/Float8_e4m3fn.h>
|
#include <c10/util/Float8_e4m3fn.h>
|
||||||
|
|
||||||
@ -23,14 +23,9 @@
|
|||||||
typedef __hip_bfloat162 __nv_bfloat162;
|
typedef __hip_bfloat162 __nv_bfloat162;
|
||||||
typedef __hip_bfloat16 __nv_bfloat16;
|
typedef __hip_bfloat16 __nv_bfloat16;
|
||||||
typedef __hip_bfloat16_raw __nv_bfloat16_raw;
|
typedef __hip_bfloat16_raw __nv_bfloat16_raw;
|
||||||
#if defined(HIP_FP8_TYPE_OCP)
|
|
||||||
typedef __hip_fp8_e4m3 __nv_fp8_e4m3;
|
typedef __hip_fp8_e4m3 __nv_fp8_e4m3;
|
||||||
typedef __hip_fp8x4_e4m3 __nv_fp8x4_e4m3;
|
typedef __hip_fp8x4_e4m3 __nv_fp8x4_e4m3;
|
||||||
#else
|
|
||||||
// ROCm 6.2 fallback: only *_fnuz types exist
|
|
||||||
typedef __hip_fp8_e4m3_fnuz __nv_fp8_e4m3;
|
|
||||||
typedef __hip_fp8x4_e4m3_fnuz __nv_fp8x4_e4m3;
|
|
||||||
#endif
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "core/registration.h"
|
#include "core/registration.h"
|
||||||
@ -370,6 +365,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
|
|||||||
int32_t compute_pipeline_offset_64 = 0;
|
int32_t compute_pipeline_offset_64 = 0;
|
||||||
|
|
||||||
for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
|
for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
|
||||||
|
__nv_bfloat16 y_max_bf16 = EPS;
|
||||||
__nv_bfloat162 results_bf162[2];
|
__nv_bfloat162 results_bf162[2];
|
||||||
|
|
||||||
cp_async_wait<NUM_STAGES - 2>();
|
cp_async_wait<NUM_STAGES - 2>();
|
||||||
@ -409,7 +405,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
|
|||||||
auto _y_max2 =
|
auto _y_max2 =
|
||||||
__hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
|
__hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
|
||||||
|
|
||||||
__nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y));
|
y_max_bf16 = __hmax(_y_max2.x, _y_max2.y);
|
||||||
|
|
||||||
// An entire group is assigned to a single warp, so a simple warp reduce
|
// An entire group is assigned to a single warp, so a simple warp reduce
|
||||||
// is used.
|
// is used.
|
||||||
|
|||||||
@ -1,11 +1,15 @@
|
|||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
#include <torch/all.h>
|
#include <torch/all.h>
|
||||||
|
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
#include "../per_token_group_quant_8bit.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
#include "dispatch_utils.h"
|
#include "../../cub_helpers.h"
|
||||||
#include "quantization/vectorization_utils.cuh"
|
#include "../../dispatch_utils.h"
|
||||||
#include "cub_helpers.h"
|
#include "../vectorization_utils.cuh"
|
||||||
|
|
||||||
static inline __device__ int8_t float_to_int8_rn(float x) {
|
static inline __device__ int8_t float_to_int8_rn(float x) {
|
||||||
#ifdef USE_ROCM
|
#ifdef USE_ROCM
|
||||||
@ -21,6 +25,7 @@ static inline __device__ int8_t float_to_int8_rn(float x) {
|
|||||||
float dst = std::nearbyint(x);
|
float dst = std::nearbyint(x);
|
||||||
|
|
||||||
// saturate
|
// saturate
|
||||||
|
|
||||||
// See https://github.com/pytorch/pytorch/issues/127666
|
// See https://github.com/pytorch/pytorch/issues/127666
|
||||||
// See https://github.com/llvm/llvm-project/issues/95183
|
// See https://github.com/llvm/llvm-project/issues/95183
|
||||||
// hip-clang std::clamp __glibcxx_assert_fail host function when building on
|
// hip-clang std::clamp __glibcxx_assert_fail host function when building on
|
||||||
@ -79,6 +84,7 @@ static inline __device__ int8_t int32_to_int8(int32_t x) {
|
|||||||
static_cast<int32_t>(std::numeric_limits<int8_t>::max());
|
static_cast<int32_t>(std::numeric_limits<int8_t>::max());
|
||||||
|
|
||||||
// saturate
|
// saturate
|
||||||
|
|
||||||
// See https://github.com/pytorch/pytorch/issues/127666
|
// See https://github.com/pytorch/pytorch/issues/127666
|
||||||
// See https://github.com/llvm/llvm-project/issues/95183
|
// See https://github.com/llvm/llvm-project/issues/95183
|
||||||
// hip-clang std::clamp __glibcxx_assert_fail host function when building on
|
// hip-clang std::clamp __glibcxx_assert_fail host function when building on
|
||||||
@ -170,6 +176,7 @@ __global__ void dynamic_scaled_int8_quant_kernel(
|
|||||||
|
|
||||||
float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax;
|
float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax;
|
||||||
|
|
||||||
|
// 2. quantize
|
||||||
vectorize_with_alignment<16>(
|
vectorize_with_alignment<16>(
|
||||||
row_in, row_out, hidden_size, tid, stride,
|
row_in, row_out, hidden_size, tid, stride,
|
||||||
[=] __device__(int8_t& dst, const scalar_t& src) {
|
[=] __device__(int8_t& dst, const scalar_t& src) {
|
||||||
@ -187,6 +194,7 @@ struct MinMax {
|
|||||||
|
|
||||||
__host__ __device__ explicit MinMax(float v) : min(v), max(v) {}
|
__host__ __device__ explicit MinMax(float v) : min(v), max(v) {}
|
||||||
|
|
||||||
|
// add a value to the MinMax
|
||||||
__host__ __device__ MinMax& operator+=(float v) {
|
__host__ __device__ MinMax& operator+=(float v) {
|
||||||
min = fminf(min, v);
|
min = fminf(min, v);
|
||||||
max = fmaxf(max, v);
|
max = fmaxf(max, v);
|
||||||
@ -220,6 +228,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
|
|||||||
const scalar_t* row_in = input + token_idx * hidden_size;
|
const scalar_t* row_in = input + token_idx * hidden_size;
|
||||||
int8_t* row_out = output + token_idx * hidden_size;
|
int8_t* row_out = output + token_idx * hidden_size;
|
||||||
|
|
||||||
|
// 1. calculate min & max
|
||||||
MinMax thread_mm;
|
MinMax thread_mm;
|
||||||
vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride,
|
vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride,
|
||||||
[&] __device__(const scalar_t& src) {
|
[&] __device__(const scalar_t& src) {
|
||||||
@ -252,6 +261,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
|
|||||||
const float inv_s = 1.f / scale_sh;
|
const float inv_s = 1.f / scale_sh;
|
||||||
const azp_t azp = azp_sh;
|
const azp_t azp = azp_sh;
|
||||||
|
|
||||||
|
// 2. quantize
|
||||||
vectorize_with_alignment<16>(
|
vectorize_with_alignment<16>(
|
||||||
row_in, row_out, hidden_size, tid, stride,
|
row_in, row_out, hidden_size, tid, stride,
|
||||||
[=] __device__(int8_t& dst, const scalar_t& src) {
|
[=] __device__(int8_t& dst, const scalar_t& src) {
|
||||||
@ -322,4 +332,14 @@ void dynamic_scaled_int8_quant(
|
|||||||
hidden_size);
|
hidden_size);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
void per_token_group_quant_int8(const torch::Tensor& input,
|
||||||
|
torch::Tensor& output_q,
|
||||||
|
torch::Tensor& output_s, int64_t group_size,
|
||||||
|
double eps, double int8_min, double int8_max) {
|
||||||
|
per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
|
||||||
|
int8_min, int8_max);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
@ -231,7 +231,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
|
|||||||
} else {
|
} else {
|
||||||
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
||||||
OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
|
OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
|
||||||
Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
|
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
|
||||||
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
|
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
|
||||||
out, a, b, a_scales, b_scales);
|
out, a, b, a_scales, b_scales);
|
||||||
}
|
}
|
||||||
@ -245,7 +245,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
|
|||||||
} else {
|
} else {
|
||||||
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
||||||
OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
|
OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
|
||||||
Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
|
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
|
||||||
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
|
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
|
||||||
out, a, b, a_scales, b_scales);
|
out, a, b, a_scales, b_scales);
|
||||||
}
|
}
|
||||||
@ -259,7 +259,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
|
|||||||
} else {
|
} else {
|
||||||
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
||||||
OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
|
OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
|
||||||
Shape<_2, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized2Sm,
|
Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
|
||||||
cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
|
cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
|
||||||
out, a, b, a_scales, b_scales);
|
out, a, b, a_scales, b_scales);
|
||||||
}
|
}
|
||||||
@ -271,10 +271,10 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
|
|||||||
// TMA epilogue isn't compatible with Swap A/B
|
// TMA epilogue isn't compatible with Swap A/B
|
||||||
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
||||||
OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
|
OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
|
||||||
Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
|
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
|
||||||
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
|
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
|
||||||
out, a, b, a_scales, b_scales);
|
out, a, b, a_scales, b_scales);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
@ -25,10 +25,7 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
|
|||||||
if constexpr (!std::is_same_v<Int8Func, std::nullptr_t>) {
|
if constexpr (!std::is_same_v<Int8Func, std::nullptr_t>) {
|
||||||
int8_func(c, a, b, a_scales, b_scales, bias);
|
int8_func(c, a, b, a_scales, b_scales, bias);
|
||||||
} else {
|
} else {
|
||||||
int32_t version_num = get_sm_version_num();
|
TORCH_CHECK(false, "Int8 not supported for this architecture");
|
||||||
TORCH_CHECK(
|
|
||||||
false, "Int8 not supported on SM", version_num,
|
|
||||||
". Use FP8 quantization instead, or run on older arch (SM < 100).");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -133,4 +133,4 @@ void cutlass_scaled_mm_sm100_fp8_epilogue(torch::Tensor& out,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
@ -67,9 +67,8 @@ void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
|
|||||||
std::optional<torch::Tensor> const& bias);
|
std::optional<torch::Tensor> const& bias);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \
|
#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \
|
||||||
defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100 || \
|
defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100
|
||||||
defined(ENABLE_SCALED_MM_SM120) && ENABLE_SCALED_MM_SM120
|
|
||||||
void get_cutlass_moe_mm_data_caller(
|
void get_cutlass_moe_mm_data_caller(
|
||||||
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
|
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
|
||||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||||
@ -254,7 +253,7 @@ void cutlass_moe_mm(
|
|||||||
bool per_act_token, bool per_out_ch) {
|
bool per_act_token, bool per_out_ch) {
|
||||||
int32_t version_num = get_sm_version_num();
|
int32_t version_num = get_sm_version_num();
|
||||||
#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
|
#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
|
||||||
if (version_num >= 100 && version_num < 110) {
|
if (version_num >= 100) {
|
||||||
cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
||||||
expert_offsets, problem_sizes, a_strides, b_strides,
|
expert_offsets, problem_sizes, a_strides, b_strides,
|
||||||
c_strides, per_act_token, per_out_ch);
|
c_strides, per_act_token, per_out_ch);
|
||||||
@ -262,7 +261,7 @@ void cutlass_moe_mm(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
|
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
|
||||||
if (version_num >= 90 && version_num < 100) {
|
if (version_num >= 90) {
|
||||||
cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
||||||
expert_offsets, problem_sizes, a_strides, b_strides,
|
expert_offsets, problem_sizes, a_strides, b_strides,
|
||||||
c_strides, per_act_token, per_out_ch);
|
c_strides, per_act_token, per_out_ch);
|
||||||
@ -26,7 +26,6 @@
|
|||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
|
|
||||||
#include "cuda_utils.h"
|
#include "cuda_utils.h"
|
||||||
#include "launch_bounds_utils.h"
|
|
||||||
#include "nvfp4_utils.cuh"
|
#include "nvfp4_utils.cuh"
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
@ -64,7 +63,7 @@ __inline__ __device__ PackedVec<Type> compute_silu_mul(PackedVec<Type>& vec,
|
|||||||
|
|
||||||
// Use UE4M3 by default.
|
// Use UE4M3 by default.
|
||||||
template <class Type, bool UE8M0_SF = false>
|
template <class Type, bool UE8M0_SF = false>
|
||||||
__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
|
__global__ void __launch_bounds__(1024, 4)
|
||||||
silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||||
float const* SFScale, uint32_t* out,
|
float const* SFScale, uint32_t* out,
|
||||||
uint32_t* SFout) {
|
uint32_t* SFout) {
|
||||||
@ -132,8 +131,7 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d]
|
|||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||||
auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
|
auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
|
||||||
dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
|
dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
|
||||||
int const numBlocksPerSM =
|
int const numBlocksPerSM = 2048 / block.x;
|
||||||
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
|
|
||||||
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
|
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
|
||||||
|
|
||||||
VLLM_DISPATCH_HALF_TYPES(
|
VLLM_DISPATCH_HALF_TYPES(
|
||||||
|
|||||||
@ -14,8 +14,6 @@
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "core/registration.h"
|
|
||||||
|
|
||||||
#include <torch/all.h>
|
#include <torch/all.h>
|
||||||
#include <cutlass/arch/arch.h>
|
#include <cutlass/arch/arch.h>
|
||||||
|
|
||||||
@ -420,7 +418,3 @@ void cutlass_fp4_group_mm(
|
|||||||
"12.8 or above.");
|
"12.8 or above.");
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
|
||||||
m.impl("cutlass_fp4_group_mm", &cutlass_fp4_group_mm);
|
|
||||||
}
|
|
||||||
|
|||||||
@ -26,13 +26,12 @@
|
|||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
|
|
||||||
#include "nvfp4_utils.cuh"
|
#include "nvfp4_utils.cuh"
|
||||||
#include "launch_bounds_utils.h"
|
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// Use UE4M3 by default.
|
// Use UE4M3 by default.
|
||||||
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
|
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
|
||||||
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
__global__ void __launch_bounds__(512, 4)
|
||||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||||
float const* SFScale, uint32_t* out, uint32_t* SFout,
|
float const* SFScale, uint32_t* out, uint32_t* SFout,
|
||||||
uint32_t* input_offset_by_experts,
|
uint32_t* input_offset_by_experts,
|
||||||
@ -130,7 +129,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
|||||||
|
|
||||||
// Kernel for LARGE_M_TOPK = true (large m_topk optimized version)
|
// Kernel for LARGE_M_TOPK = true (large m_topk optimized version)
|
||||||
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
|
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
|
||||||
__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
|
__global__ void __launch_bounds__(1024, 4)
|
||||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||||
float const* SFScale, uint32_t* out, uint32_t* SFout,
|
float const* SFScale, uint32_t* out, uint32_t* SFout,
|
||||||
uint32_t* input_offset_by_experts,
|
uint32_t* input_offset_by_experts,
|
||||||
@ -234,9 +233,8 @@ void quant_impl(void* output, void* output_scale, void* input,
|
|||||||
int const workSizePerRow = k / ELTS_PER_THREAD;
|
int const workSizePerRow = k / ELTS_PER_THREAD;
|
||||||
int const totalWorkSize = m_topk * workSizePerRow;
|
int const totalWorkSize = m_topk * workSizePerRow;
|
||||||
dim3 block(std::min(workSizePerRow, 512));
|
dim3 block(std::min(workSizePerRow, 512));
|
||||||
// Get number of blocks per SM
|
// Get number of blocks per SM (assume we can fully utilize the SM).
|
||||||
int const numBlocksPerSM =
|
int const numBlocksPerSM = 2048 / block.x;
|
||||||
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
|
|
||||||
dim3 grid(std::min(static_cast<int>((totalWorkSize + block.x - 1) / block.x),
|
dim3 grid(std::min(static_cast<int>((totalWorkSize + block.x - 1) / block.x),
|
||||||
multiProcessorCount * numBlocksPerSM));
|
multiProcessorCount * numBlocksPerSM));
|
||||||
while (grid.x <= multiProcessorCount && block.x > 64) {
|
while (grid.x <= multiProcessorCount && block.x > 64) {
|
||||||
|
|||||||
@ -26,14 +26,13 @@
|
|||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
|
|
||||||
#include "cuda_utils.h"
|
#include "cuda_utils.h"
|
||||||
#include "launch_bounds_utils.h"
|
|
||||||
#include "nvfp4_utils.cuh"
|
#include "nvfp4_utils.cuh"
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// Use UE4M3 by default.
|
// Use UE4M3 by default.
|
||||||
template <class Type, bool UE8M0_SF = false>
|
template <class Type, bool UE8M0_SF = false>
|
||||||
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
__global__ void __launch_bounds__(512, 4)
|
||||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||||
float const* SFScale, uint32_t* out, uint32_t* SFout) {
|
float const* SFScale, uint32_t* out, uint32_t* SFout) {
|
||||||
using PackedVec = PackedVec<Type>;
|
using PackedVec = PackedVec<Type>;
|
||||||
@ -76,9 +75,8 @@ void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale,
|
|||||||
// Grid, Block size.
|
// Grid, Block size.
|
||||||
// Each thread converts 8 values.
|
// Each thread converts 8 values.
|
||||||
dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
|
dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
|
||||||
// Get number of blocks per SM
|
// Get number of blocks per SM (assume we can fully utilize the SM).
|
||||||
int const numBlocksPerSM =
|
int const numBlocksPerSM = 2048 / block.x;
|
||||||
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
|
|
||||||
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
|
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
|
||||||
|
|
||||||
// Launch the cvt kernel.
|
// Launch the cvt kernel.
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user