Compare commits
397 Commits
wye-refact...use-uv-pyt
| SHA1 | Author | Date |
|---|---|---|
| 728c365e4d | |||
| be8921fbba | |||
| d4e7a1152d | |||
| be22bb6f3d | |||
| 169313b9f8 | |||
| 0b018d8baf | |||
| c31246800c | |||
| 4134312b35 | |||
| da554f932e | |||
| aac622e0cd | |||
| 1726e93ef1 | |||
| ee04c0cd04 | |||
| c36f0aa300 | |||
| 5234dc7451 | |||
| 3b7c20a6b5 | |||
| f9e714813a | |||
| 2518230d3e | |||
| a332b84578 | |||
| 1405f0c7ba | |||
| 84d57342b6 | |||
| 57b46d769e | |||
| f48b6a03ba | |||
| 2a69ab4899 | |||
| 8d7da92fd7 | |||
| e952eee698 | |||
| 66bca9b8bd | |||
| 99028fda44 | |||
| 1244948885 | |||
| a73f6491c8 | |||
| 001e50c92c | |||
| 96ebcaa3ad | |||
| 5db1870bb9 | |||
| 2ce26b9b5d | |||
| a388252ac4 | |||
| 9a9f48dff7 | |||
| 67f3fb0844 | |||
| 43b752c325 | |||
| cfd302db9b | |||
| fb610ae684 | |||
| 2f652e6cdf | |||
| e6a226efba | |||
| a2e6fa7e03 | |||
| 9f1c4ecaf2 | |||
| ef283548f7 | |||
| f4db5e6de1 | |||
| 099aaee536 | |||
| 35fe398c7c | |||
| bb6d43047e | |||
| bc546f76a1 | |||
| 80608ba5af | |||
| e184c9c510 | |||
| d7e34b4210 | |||
| ef6e0e7132 | |||
| 1ad3aca682 | |||
| 8d0afa9b42 | |||
| fa7e254a7f | |||
| e23cacda35 | |||
| 2e1b8bc2b6 | |||
| e47433b3c1 | |||
| 23194d83e8 | |||
| 61aedb5ffe | |||
| d3bd171123 | |||
| 89e4050af4 | |||
| 78a47f87ce | |||
| 6a113d9aed | |||
| 2e4fe48c37 | |||
| 8eb0a1d906 | |||
| fea3e476aa | |||
| 61a3431613 | |||
| 9bedac9623 | |||
| c42ff4f4fd | |||
| d5ab28511c | |||
| e61eb5e09d | |||
| 0899ba5b42 | |||
| 145ac73317 | |||
| d0d138bc55 | |||
| 43227236ec | |||
| 8616300ae2 | |||
| edbaadd91f | |||
| 9360d34fa1 | |||
| 1b67b04656 | |||
| bd51f78e39 | |||
| 65ecb4f134 | |||
| 143844fa43 | |||
| 219cfbe7f6 | |||
| 9b44a7d926 | |||
| a3ae45a38c | |||
| 0307428d65 | |||
| 471997adf6 | |||
| b1ded114b9 | |||
| f4e4088c99 | |||
| 0efd540dbc | |||
| 6144754014 | |||
| 69311446ba | |||
| da63274d9f | |||
| c216119d64 | |||
| 5546acb463 | |||
| c0ec81836f | |||
| b65e56babe | |||
| 49996cd597 | |||
| ecb37e276a | |||
| a5354b3ed2 | |||
| f9df8b4ad7 | |||
| ec152c8748 | |||
| 7977e5027c | |||
| 3f5d902d2a | |||
| 27d7638b94 | |||
| 176173989a | |||
| 23b8ee672d | |||
| 3939152069 | |||
| cd87bfbf37 | |||
| b3613e3ace | |||
| d346ec695e | |||
| c242c98031 | |||
| f1d53d150c | |||
| 92da847cf5 | |||
| 3958b96bf5 | |||
| 8bf8f45822 | |||
| 6f5c0931c1 | |||
| 4e33a7ea85 | |||
| dc48ba0c75 | |||
| 4778b42660 | |||
| c70ac4b8ff | |||
| cf89202855 | |||
| f075693da7 | |||
| f708bd4904 | |||
| 0002b7f0d1 | |||
| 11aafd9886 | |||
| b761df963c | |||
| 33f6aaf972 | |||
| 56aafa8c0b | |||
| 8d52f2b3a7 | |||
| 984d18498a | |||
| d4d9899860 | |||
| db1e42f627 | |||
| bc9d7b5595 | |||
| fe6b19c314 | |||
| 2827b3f4a3 | |||
| 2b6b1d7809 | |||
| 633f943e30 | |||
| b03b1b97f6 | |||
| dfb9af2014 | |||
| 19f76ee68e | |||
| dd70437a4f | |||
| 99b3a504c5 | |||
| 6e30010d2f | |||
| 52621c8f5c | |||
| d48f4d6daf | |||
| e84e0735c7 | |||
| 3edf87d25f | |||
| 392edee34a | |||
| 983056e456 | |||
| 13dd93c667 | |||
| 53a30845be | |||
| 8b77328ffe | |||
| 9fe4c2bdb9 | |||
| 081b5594a2 | |||
| 57329a8c01 | |||
| 8c435c9bce | |||
| e71b8e210d | |||
| 89fa54e6f7 | |||
| 3d54bdcb73 | |||
| 6b0fcbbf43 | |||
| 0fa673af4c | |||
| 3468f17ebe | |||
| 71b25b0d48 | |||
| 0ea80c87d9 | |||
| b8d9e4a326 | |||
| 13cc7f5370 | |||
| 916bd9204d | |||
| e04a1b6b21 | |||
| 2e5df88c92 | |||
| 0754ac4c49 | |||
| 03858e6d1c | |||
| 532a6cfccb | |||
| eb32335e35 | |||
| 69a8c8e99a | |||
| 6c340da4df | |||
| 2f17117606 | |||
| 1e9a77e037 | |||
| d2af67441d | |||
| 0bcc3a160d | |||
| 70fbdb26e9 | |||
| 7f570f1caa | |||
| eaeca3cd7f | |||
| 12c1287d64 | |||
| 17b4c6685c | |||
| 3c2b2ccece | |||
| 7be9ffcd9f | |||
| 393de22d2e | |||
| 1260180c67 | |||
| af4ee63e0e | |||
| bc092ea873 | |||
| 755ed7b05b | |||
| a676e668ee | |||
| c85be1f6dd | |||
| 845adb3ec6 | |||
| 90b139cfff | |||
| 4492e3a554 | |||
| 05c19485a5 | |||
| 52d0cb8458 | |||
| 5c1e496a75 | |||
| e7f27ea648 | |||
| 1f29141258 | |||
| 6160ba4151 | |||
| fea8006062 | |||
| e6750d0b18 | |||
| 8c853050e7 | |||
| f84a472a03 | |||
| 54e42b72db | |||
| 2dda3e35d0 | |||
| d83f3f7cb3 | |||
| 302eb941f3 | |||
| 487745ff49 | |||
| 9313be5017 | |||
| 8938774c79 | |||
| e18b714b2e | |||
| b1068903fd | |||
| 164299500b | |||
| 58c360d9be | |||
| 42488dae69 | |||
| b67dece2d8 | |||
| 2338daffd3 | |||
| 2e19a848d4 | |||
| 77a7fce1bb | |||
| 6488f3481b | |||
| 27ec3c78f3 | |||
| 1cbcfb94de | |||
| fed8a9b107 | |||
| 190c45a6af | |||
| 5caaeb714c | |||
| d747c2ef18 | |||
| c30b405b8f | |||
| 77d906995c | |||
| 359d293006 | |||
| 9df8da548e | |||
| bf68fd76a9 | |||
| de94289a98 | |||
| 1983609239 | |||
| d06b5a95cb | |||
| be0bb568c9 | |||
| c8bde93367 | |||
| 88d7bdbd23 | |||
| 0d235b874a | |||
| 7ad5e50adf | |||
| dc464a3d39 | |||
| 1210e4d95b | |||
| e0b24ea030 | |||
| bde2a1a8a4 | |||
| 5e25b12236 | |||
| c85d75cf08 | |||
| abad204be6 | |||
| 7361ab379f | |||
| 95bc60e4cb | |||
| 4f2954f724 | |||
| eca7be9077 | |||
| 969b4da3a6 | |||
| 4f8c4b890a | |||
| ae002924e9 | |||
| 690f948e4a | |||
| 08275ec0a2 | |||
| c828d1bf98 | |||
| 8b8a8afc89 | |||
| 8bdd8b5c51 | |||
| a8ffc4f0f2 | |||
| d5944d5146 | |||
| 24fab45d96 | |||
| 63400259d0 | |||
| 8c1c81a3de | |||
| a3a7828010 | |||
| 5abb117901 | |||
| 867ecdd1c8 | |||
| 24e8222745 | |||
| 100b630a60 | |||
| 527821d191 | |||
| 846197f505 | |||
| 2357480b1a | |||
| f11e3c516b | |||
| 875d6def90 | |||
| cc1dc7ed6d | |||
| a903669e10 | |||
| 2c58742dff | |||
| 4c966e440e | |||
| da5e7e4329 | |||
| f05a4f0e34 | |||
| 61d1b35561 | |||
| b6a136b58c | |||
| 0d9fe260dd | |||
| 273690a50a | |||
| 231c2c63e4 | |||
| 4322c553a6 | |||
| babad6e5dd | |||
| 9383cd6f10 | |||
| ba8d2165b6 | |||
| c98be0a232 | |||
| 5774b0a1da | |||
| e8db44f883 | |||
| fafbe11af4 | |||
| 78237e43bf | |||
| eea1783989 | |||
| f225ea7dd9 | |||
| fc97733da8 | |||
| 4741239db7 | |||
| c625f9043c | |||
| 6fa78d8f23 | |||
| 9949aa2ef1 | |||
| 0b7bed9c38 | |||
| ac0048c0ae | |||
| 090197034f | |||
| f31ff87460 | |||
| d588cd2406 | |||
| 45d7d852d3 | |||
| 8bed179109 | |||
| f552d5e578 | |||
| 8db2939289 | |||
| d5e0fca264 | |||
| 8d0ee5a564 | |||
| 922979bfcc | |||
| 239ef0c1ac | |||
| 1d7f95b85c | |||
| cfbee3d0e7 | |||
| 06a41334c7 | |||
| 175811e3b5 | |||
| c10101a3eb | |||
| ac243886b0 | |||
| 3d2c56b7a9 | |||
| 64c824cd78 | |||
| 417a164af6 | |||
| b6f01bd9a7 | |||
| 4cf71cc88a | |||
| a66d131381 | |||
| 21467f9a1c | |||
| f92d952632 | |||
| 6d0b827cbd | |||
| 0eecb31663 | |||
| 793be8d057 | |||
| 7b57a433da | |||
| 5aeb925452 | |||
| 04d3752329 | |||
| bc6e542d9f | |||
| af7dfb0d1a | |||
| 1c3ffdbecc | |||
| c438b2951c | |||
| 0ff8ebb2d7 | |||
| 26e673fe93 | |||
| 65a5910ce3 | |||
| 9aea7373ff | |||
| 30d08911f7 | |||
| cf56cf78b4 | |||
| 7ed82d1974 | |||
| 12dbd834cf | |||
| 035fd2bd2c | |||
| 1cd885bd54 | |||
| 62b38dc832 | |||
| c99db8c8dd | |||
| 72dd1595b4 | |||
| 572ddf83ce | |||
| 86647d1cd0 | |||
| 52c2a8d4ad | |||
| 367a480bd3 | |||
| bef180f009 | |||
| d88918e4c2 | |||
| 3c713a9711 | |||
| bf8b26cad1 | |||
| 032d661d27 | |||
| e08a3a3fdb | |||
| 3d9a1d2de5 | |||
| be874c0201 | |||
| 9607d5eb44 | |||
| c60e6137f0 | |||
| f91480b2d4 | |||
| 6c5f82e5aa | |||
| b7f186bbb3 | |||
| 3642909617 | |||
| c308501cb6 | |||
| 535d80056b | |||
| a25ade5d47 | |||
| 8945b001db | |||
| b8a287a0a8 | |||
| c7e713616a | |||
| a36c675817 | |||
| 3da17c2cc2 | |||
| 14c1432789 | |||
| ee7a66dd9a | |||
| 431535b522 | |||
| 711e912946 | |||
| e69e0b8b5f | |||
| ddc9048394 | |||
| b1a63d1b3b | |||
| 48ecb4438b | |||
| e57fc15971 | |||
| 4bdf400218 | |||
| 7852b82b93 | |||
| a2a5f79e09 | |||
| c59a0eca42 | |||
| b716ab93a7 | |||
| 138f0d1e75 |
@@ -181,14 +181,18 @@ launch_vllm_server() {
 if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
 echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
 model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-server_command="vllm serve $model \
+server_command="python3 \
+-m vllm.entrypoints.openai.api_server \
 -tp $tp \
+--model $model \
 --port $port \
 $server_args"
 else
 echo "Key 'fp8' does not exist in common params."
-server_command="vllm serve $model \
+server_command="python3 \
+-m vllm.entrypoints.openai.api_server \
 -tp $tp \
+--model $model \
 --port $port \
 $server_args"
 fi
@@ -365,7 +365,8 @@ run_serving_tests() {
 continue
 fi

-server_command="$server_envs vllm serve \
+server_command="$server_envs python3 \
+-m vllm.entrypoints.openai.api_server \
 $server_args"

 # run the server
@@ -1,191 +0,0 @@
-#!/bin/bash
-
-# This script build the Ascend NPU docker image and run the offline inference inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -ex
-
-# Base ubuntu image with basic ascend development libraries and python installed
-VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
-CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
-TEST_RUN_CONFIG_FILE="vllm_test.cfg"
-VLLM_ASCEND_TMP_DIR=
-# Get the test run configuration file from the vllm-ascend repository
-fetch_vllm_test_cfg() {
-VLLM_ASCEND_TMP_DIR=$(mktemp -d)
-# Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
-cleanup() {
-rm -rf "${VLLM_ASCEND_TMP_DIR}"
-}
-trap cleanup EXIT
-
-GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
-if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
-echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
-exit 1
-fi
-
-# If the file already exists locally, just overwrite it
-cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
-echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
-
-# Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources
-# when the trap is abnormal has been completed, so the temporary resources are manually deleted here.
-rm -rf "${VLLM_ASCEND_TMP_DIR}"
-trap - EXIT
-}
-
-# Downloads test run configuration file from a remote URL.
-# Loads the configuration into the current script environment.
-get_config() {
-if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
-echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
-exit 1
-fi
-source "${TEST_RUN_CONFIG_FILE}"
-echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
-return 0
-}
-
-# get test running configuration.
-fetch_vllm_test_cfg
-get_config
-# Check if the function call was successful. If not, exit the script.
-if [ $? -ne 0 ]; then
-exit 1
-fi
-
-image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
-container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
-# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
-agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
-echo "agent_idx: ${agent_idx}"
-builder_name="cachebuilder${agent_idx}"
-builder_cache_dir="/mnt/docker-cache${agent_idx}"
-mkdir -p ${builder_cache_dir}
-
-# Try building the docker image
-cat <<EOF | DOCKER_BUILDKIT=1 docker build \
---add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
---builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
---cache-to type=local,dest=${builder_cache_dir},mode=max \
---progress=plain --load -t ${image_name} -f - .
-FROM ${BASE_IMAGE_NAME}
-
-# Define environments
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
-pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
-apt-get update -y && \
-apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
-rm -rf /var/cache/apt/* && \
-rm -rf /var/lib/apt/lists/*
-
-# Install for pytest to make the docker build cache layer always valid
-RUN --mount=type=cache,target=/root/.cache/pip \
-pip install pytest>=6.0 modelscope
-
-WORKDIR /workspace/vllm
-
-# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
-COPY requirements/common.txt /workspace/vllm/requirements/common.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-pip install -r requirements/common.txt
-
-COPY . .
-
-# Install vLLM
-RUN --mount=type=cache,target=/root/.cache/pip \
-VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
-python3 -m pip uninstall -y triton
-
-# Install vllm-ascend
-WORKDIR /workspace
-ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
-ARG VLLM_ASCEND_TAG=main
-RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
-git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
-
-# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
-RUN --mount=type=cache,target=/root/.cache/pip \
-pip install -r /workspace/vllm-ascend/requirements.txt
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
-source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
-source /usr/local/Ascend/nnal/atb/set_env.sh && \
-export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
-python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
-
-ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
-ENV VLLM_USE_MODELSCOPE=True
-
-WORKDIR /workspace/vllm-ascend
-
-CMD ["/bin/bash"]
-
-EOF
-
-# Setup cleanup
-remove_docker_container() {
-docker rm -f "${container_name}" || true;
-docker image rm -f "${image_name}" || true;
-docker system prune -f || true;
-}
-trap remove_docker_container EXIT
-
-# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
-# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
-# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
-# returns --device /dev/davinci0 --device /dev/davinci1
-parse_and_gen_devices() {
-local input="$1"
-local index cards_num
-if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
-index="${BASH_REMATCH[1]}"
-cards_num="${BASH_REMATCH[2]}"
-else
-echo "parse error" >&2
-return 1
-fi
-
-local devices=""
-local i=0
-while (( i < cards_num )); do
-local dev_idx=$(((index - 1)*cards_num + i ))
-devices="$devices --device /dev/davinci${dev_idx}"
-((i++))
-done
-
-# trim leading space
-devices="${devices#"${devices%%[![:space:]]*}"}"
-# Output devices: assigned to the caller variable
-printf '%s' "$devices"
-}
-
-devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
-
-# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
-# This test checks whether the OOT platform interface is functioning properly in conjunction with
-# the hardware plugin vllm-ascend.
-model_cache_dir=/mnt/modelscope${agent_idx}
-mkdir -p ${model_cache_dir}
-docker run \
-${devices} \
---device /dev/davinci_manager \
---device /dev/devmm_svm \
---device /dev/hisi_hdc \
--v /usr/local/dcmi:/usr/local/dcmi \
--v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
--v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
--v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
--v /etc/ascend_install.info:/etc/ascend_install.info \
--v ${model_cache_dir}:/root/.cache/modelscope \
---entrypoint="" \
---name "${container_name}" \
-"${image_name}" \
-bash -c '
-set -e
-pytest -v -s tests/e2e/vllm_interface/
-'
@@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
 bench_throughput_exit_code=$?

 # run server-based benchmarks and upload the result to buildkite
-vllm serve meta-llama/Llama-2-7b-chat-hf &
+python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
 server_pid=$!
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

.github/mergify.yml (vendored, 30 changed lines)
@@ -2,7 +2,6 @@ pull_request_rules:
 - name: label-documentation
 description: Automatically apply documentation label
 conditions:
-- label != stale
 - or:
 - files~=^[^/]+\.md$
 - files~=^docs/
@@ -15,7 +14,6 @@ pull_request_rules:
 - name: label-ci-build
 description: Automatically apply ci/build label
 conditions:
-- label != stale
 - or:
 - files~=^\.github/
 - files~=\.buildkite/
@@ -32,7 +30,6 @@ pull_request_rules:
 - name: label-deepseek
 description: Automatically apply deepseek label
 conditions:
-- label != stale
 - or:
 - files~=^examples/.*deepseek.*\.py
 - files~=^tests/.*deepseek.*\.py
@@ -49,7 +46,6 @@ pull_request_rules:
 - name: label-frontend
 description: Automatically apply frontend label
 conditions:
-- label != stale
 - files~=^vllm/entrypoints/
 actions:
 label:
@@ -59,7 +55,6 @@ pull_request_rules:
 - name: label-llama
 description: Automatically apply llama label
 conditions:
-- label != stale
 - or:
 - files~=^examples/.*llama.*\.py
 - files~=^tests/.*llama.*\.py
@@ -75,7 +70,6 @@ pull_request_rules:
 - name: label-multi-modality
 description: Automatically apply multi-modality label
 conditions:
-- label != stale
 - or:
 - files~=^vllm/multimodal/
 - files~=^tests/multimodal/
@@ -89,7 +83,6 @@ pull_request_rules:
 - name: label-new-model
 description: Automatically apply new-model label
 conditions:
-- label != stale
 - and:
 - files~=^vllm/model_executor/models/
 - files=vllm/model_executor/models/registry.py
@@ -101,7 +94,6 @@ pull_request_rules:
 - name: label-performance
 description: Automatically apply performance label
 conditions:
-- label != stale
 - or:
 - files~=^benchmarks/
 - files~=^vllm/benchmarks/
@@ -115,7 +107,6 @@ pull_request_rules:
 - name: label-qwen
 description: Automatically apply qwen label
 conditions:
-- label != stale
 - or:
 - files~=^examples/.*qwen.*\.py
 - files~=^tests/.*qwen.*\.py
@@ -130,7 +121,6 @@ pull_request_rules:
 - name: label-gpt-oss
 description: Automatically apply gpt-oss label
 conditions:
-- label != stale
 - or:
 - files~=^examples/.*gpt[-_]?oss.*\.py
 - files~=^tests/.*gpt[-_]?oss.*\.py
@@ -152,7 +142,6 @@ pull_request_rules:
 - name: label-rocm
 description: Automatically apply rocm label
 conditions:
-- label != stale
 - or:
 - files~=^csrc/rocm/
 - files~=^docker/Dockerfile.rocm
@@ -173,7 +162,6 @@ pull_request_rules:
 - name: label-structured-output
 description: Automatically apply structured-output label
 conditions:
-- label != stale
 - or:
 - files~=^benchmarks/structured_schemas/
 - files=benchmarks/benchmark_serving_structured_output.py
@@ -193,7 +181,6 @@ pull_request_rules:
 - name: label-speculative-decoding
 description: Automatically apply speculative-decoding label
 conditions:
-- label != stale
 - or:
 - files~=^vllm/v1/spec_decode/
 - files~=^tests/v1/spec_decode/
@@ -209,7 +196,6 @@ pull_request_rules:
 - name: label-v1
 description: Automatically apply v1 label
 conditions:
-- label != stale
 - or:
 - files~=^vllm/v1/
 - files~=^tests/v1/
@@ -222,7 +208,6 @@ pull_request_rules:
 description: Automatically apply tpu label
 # Keep this list in sync with `label-tpu-remove` conditions
 conditions:
-- label != stale
 - or:
 - files~=tpu.py
 - files~=_tpu
@@ -238,7 +223,6 @@ pull_request_rules:
 description: Automatically remove tpu label
 # Keep this list in sync with `label-tpu` conditions
 conditions:
-- label != stale
 - and:
 - -files~=tpu.py
 - -files~=_tpu
@@ -253,7 +237,6 @@ pull_request_rules:
 - name: label-tool-calling
 description: Automatically add tool-calling label
 conditions:
-- label != stale
 - or:
 - files~=^tests/tool_use/
 - files~=^tests/entrypoints/openai/tool_parsers/
@@ -272,9 +255,8 @@ pull_request_rules:

 - name: ping author on conflicts and add 'needs-rebase' label
 conditions:
-- label != stale
 - conflict
 - -closed
 actions:
 label:
 add:
@@ -288,8 +270,6 @@ pull_request_rules:

 - name: assign reviewer for tensorizer changes
 conditions:
-- label != stale
-- or:
 - files~=^vllm/model_executor/model_loader/tensorizer.py
 - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
 - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -301,7 +281,6 @@ pull_request_rules:

 - name: assign reviewer for modelopt changes
 conditions:
-- label != stale
 - or:
 - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
 - files~=^vllm/model_executor/layers/quantization/__init__\.py$
@@ -316,8 +295,8 @@ pull_request_rules:

 - name: remove 'needs-rebase' label when conflict is resolved
 conditions:
 - -conflict
 - -closed
 actions:
 label:
 remove:
@@ -326,7 +305,6 @@ pull_request_rules:
 - name: label-kv-connector
 description: Automatically apply kv-connector label
 conditions:
-- label != stale
 - or:
 - files~=^examples/online_serving/disaggregated[^/]*/.*
 - files~=^examples/offline_inference/disaggregated[^/]*/.*
@@ -269,8 +269,8 @@ set(VLLM_EXT_SRC
 "csrc/sampler.cu"
 "csrc/cuda_view.cu"
 "csrc/quantization/gptq/q_gemm.cu"
-"csrc/quantization/w8a8/int8/scaled_quant.cu"
-"csrc/quantization/w8a8/fp8/common.cu"
+"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
+"csrc/quantization/fp8/common.cu"
 "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
 "csrc/quantization/gguf/gguf_kernel.cu"
 "csrc/quantization/activation_kernels.cu"
@@ -314,13 +314,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 list(APPEND VLLM_EXT_SRC
 "csrc/quantization/awq/gemm_kernels.cu"
 "csrc/permute_cols.cu"
-"csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
+"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
 "csrc/quantization/fp4/nvfp4_quant_entry.cu"
 "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
 "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
 "csrc/cutlass_extensions/common.cpp"
-"csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
-"csrc/quantization/w8a8/int8/per_token_group_quant.cu")
+"csrc/quantization/fp8/per_token_group_quant.cu")

 set_gencode_flags_for_srcs(
 SRCS "${VLLM_EXT_SRC}"
@@ -424,11 +423,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
 set(SRCS
-"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
-"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
-"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
-"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
-"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
+"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
+"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
+"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
+"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
+"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
 set_gencode_flags_for_srcs(
 SRCS "${SRCS}"
 CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -459,9 +458,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 endif()
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
 set(SRCS
-"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
-"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
-"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
+"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
+"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
+"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
 )
 set_gencode_flags_for_srcs(
 SRCS "${SRCS}"
@@ -493,9 +492,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 endif()
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
 set(SRCS
-"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
-"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
-"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
+"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
+"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
+"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
 )
 set_gencode_flags_for_srcs(
 SRCS "${SRCS}"
@@ -526,7 +525,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 # subtract out the archs that are already built for 3x
 list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
 if (SCALED_MM_2X_ARCHS)
-set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
+set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
 set_gencode_flags_for_srcs(
 SRCS "${SRCS}"
 CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@@ -649,7 +648,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 # if it's possible to compile MoE kernels that use its output.
 cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
+set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
 set_gencode_flags_for_srcs(
 SRCS "${SRCS}"
 CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -668,12 +667,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 endif()

 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
 else()
 cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
 endif()
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
+set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
 set_gencode_flags_for_srcs(
 SRCS "${SRCS}"
 CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -698,7 +697,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
 endif()
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
-set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
+set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
 set_gencode_flags_for_srcs(
 SRCS "${SRCS}"
 CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@@ -721,7 +720,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
 endif()
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
+set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
 set_gencode_flags_for_srcs(
 SRCS "${SRCS}"
 CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-w8a8_triton_block_scaled_mm,
+w8a8_block_fp8_matmul,
 )
 from vllm.utils import FlexibleArgumentParser, cdiv

@@ -158,7 +158,7 @@ def bench_fp8(
 "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
 a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
 ),
-"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
+"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
 a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
 ),
 "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
@@ -55,7 +55,9 @@ benchmark() {
 output_len=$2


-CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+CUDA_VISIBLE_DEVICES=0 python3 \
+-m vllm.entrypoints.openai.api_server \
+--model $model \
 --port 8100 \
 --max-model-len 10000 \
 --gpu-memory-utilization 0.6 \
@@ -63,7 +65,9 @@ benchmark() {
 '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &


-CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+CUDA_VISIBLE_DEVICES=1 python3 \
+-m vllm.entrypoints.openai.api_server \
+--model $model \
 --port 8200 \
 --max-model-len 10000 \
 --gpu-memory-utilization 0.6 \
@@ -38,12 +38,16 @@ wait_for_server() {
 launch_chunked_prefill() {
 model="meta-llama/Meta-Llama-3.1-8B-Instruct"
 # disagg prefill
-CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+CUDA_VISIBLE_DEVICES=0 python3 \
+-m vllm.entrypoints.openai.api_server \
+--model $model \
 --port 8100 \
 --max-model-len 10000 \
 --enable-chunked-prefill \
 --gpu-memory-utilization 0.6 &
-CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+CUDA_VISIBLE_DEVICES=1 python3 \
+-m vllm.entrypoints.openai.api_server \
+--model $model \
 --port 8200 \
 --max-model-len 10000 \
 --enable-chunked-prefill \
@@ -58,14 +62,18 @@ launch_chunked_prefill() {
 launch_disagg_prefill() {
 model="meta-llama/Meta-Llama-3.1-8B-Instruct"
 # disagg prefill
-CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+CUDA_VISIBLE_DEVICES=0 python3 \
+-m vllm.entrypoints.openai.api_server \
+--model $model \
 --port 8100 \
 --max-model-len 10000 \
 --gpu-memory-utilization 0.6 \
 --kv-transfer-config \
 '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

-CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+CUDA_VISIBLE_DEVICES=1 python3 \
+-m vllm.entrypoints.openai.api_server \
+--model $model \
 --port 8200 \
 --max-model-len 10000 \
 --gpu-memory-utilization 0.6 \
@@ -1,174 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
-import random
-import time
-
-import torch
-from tabulate import tabulate
-
-from vllm import _custom_ops as ops
-from vllm.logger import init_logger
-from vllm.platforms import current_platform
-from vllm.utils import (
-STR_DTYPE_TO_TORCH_DTYPE,
-FlexibleArgumentParser,
-create_kv_caches_with_random,
-)
-
-logger = init_logger(__name__)
-
-
-@torch.inference_mode()
-def run_benchmark(
-num_tokens: int,
-num_heads: int,
-head_size: int,
-block_size: int,
-num_blocks: int,
-dtype: torch.dtype,
-kv_cache_dtype: str,
-num_iters: int,
-benchmark_mode: str,
-device: str = "cuda",
-) -> float:
-"""Return latency (seconds) for given num_tokens."""
-
-if kv_cache_dtype == "fp8" and head_size % 16:
-raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
-
-current_platform.seed_everything(42)
-torch.set_default_device(device)
-
-# create random key / value tensors [T, H, D].
-key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
-value = torch.randn_like(key)
-
-# prepare the slot mapping.
-# each token is assigned a unique slot in the KV-cache.
-num_slots = block_size * num_blocks
-if num_tokens > num_slots:
-raise ValueError("num_tokens cannot exceed the total number of cache slots")
-slot_mapping_lst = random.sample(range(num_slots), num_tokens)
-slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
-
-key_caches, value_caches = create_kv_caches_with_random(
-num_blocks,
-block_size,
-1, # num_layers
-num_heads,
-head_size,
-kv_cache_dtype,
-dtype,
-device=device,
-)
-key_cache, value_cache = key_caches[0], value_caches[0]
-# to free unused memory
-del key_caches, value_caches
-
-# compute per-kernel scaling factors for fp8 conversion (if used).
-k_scale = (key.amax() / 64.0).to(torch.float32)
-v_scale = (value.amax() / 64.0).to(torch.float32)
-
-function_under_test = lambda: ops.reshape_and_cache(
-key, # noqa: F821
-value, # noqa: F821
-key_cache, # noqa: F821
-value_cache, # noqa: F821
-slot_mapping, # noqa: F821
-kv_cache_dtype,
-k_scale,
-v_scale,
-)
-
-if benchmark_mode == "cudagraph":
-g = torch.cuda.CUDAGraph()
-with torch.cuda.graph(g):
-function_under_test()
-torch.cuda.synchronize()
-function_under_test = lambda: g.replay()
-
-def run_cuda_benchmark(n_iters: int) -> float:
-nonlocal key, value, key_cache, value_cache, slot_mapping
-torch.cuda.synchronize()
-start = time.perf_counter()
-for _ in range(n_iters):
-function_under_test()
-torch.cuda.synchronize()
-end = time.perf_counter()
-return (end - start) / n_iters
-
-# warm-up
-run_cuda_benchmark(3)
-
-lat = run_cuda_benchmark(num_iters)
-
-# free tensors to mitigate OOM when sweeping
-del key, value, key_cache, value_cache, slot_mapping
-torch.cuda.empty_cache()
-
-return lat
-
-
-def main(args):
-rows = []
-for exp in range(1, 17):
-n_tok = 2**exp
-lat = run_benchmark(
-num_tokens=n_tok,
-num_heads=args.num_heads,
-head_size=args.head_size,
-block_size=args.block_size,
-num_blocks=args.num_blocks,
-dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
-kv_cache_dtype=args.kv_cache_dtype,
-num_iters=args.iters,
-benchmark_mode=args.mode,
-device="cuda",
-)
-rows.append([n_tok, lat * 1e6]) # convert to microseconds
-
-print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
-print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))
-
-
-if __name__ == "__main__":
-parser = FlexibleArgumentParser()
-
-parser.add_argument("--num-heads", type=int, default=128)
-parser.add_argument(
-"--head-size",
-type=int,
-choices=[64, 80, 96, 112, 120, 128, 192, 256],
-default=128,
-)
-parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
-parser.add_argument("--num-blocks", type=int, default=128 * 128)
-
-parser.add_argument(
-"--dtype",
-type=str,
-choices=["half", "bfloat16", "float"],
-default="bfloat16",
-)
-
-parser.add_argument(
-"--kv-cache-dtype",
-type=str,
-choices=["auto", "fp8"],
-default="auto",
-)
-
-parser.add_argument("--iters", type=int, default=200)
-
-parser.add_argument(
-"--mode",
-type=str,
-choices=["cudagraph", "no_graph"],
-default="cudagraph",
-)
-
-args = parser.parse_args()
-
-main(args)
@@ -9,7 +9,7 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
 per_token_group_quant_fp8,
-w8a8_triton_block_scaled_mm,
+w8a8_block_fp8_matmul,
 )
 from vllm.triton_utils import triton
 from vllm.utils.deep_gemm import (
@@ -63,7 +63,7 @@ def benchmark_shape(m: int,

 # === vLLM Triton Implementation ===
 def vllm_triton_gemm():
-return w8a8_triton_block_scaled_mm(A_vllm,
+return w8a8_block_fp8_matmul(A_vllm,
 B_vllm,
 A_scale_vllm,
 B_scale_vllm,
@@ -38,7 +38,7 @@ else()
 FetchContent_Declare(
 vllm-flash-attn
 GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-GIT_TAG 4695e6bed5366c41e28c06cd86170166e4f43d00
+GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
 GIT_PROGRESS TRUE
 # Don't share the vllm-flash-attn build between build types
 BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@@ -28,10 +28,10 @@

 #ifdef USE_ROCM
 #include <hip/hip_bf16.h>
-#include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
+#include "../quantization/fp8/amd/quant_utils.cuh"
 typedef __hip_bfloat16 __nv_bfloat16;
 #else
-#include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh"
+#include "../quantization/fp8/nvidia/quant_utils.cuh"
 #endif

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -9,14 +9,16 @@
 #include "quantization/vectorization_utils.cuh"

 #ifdef USE_ROCM
-#include "quantization/w8a8/fp8/amd/quant_utils.cuh"
+#include "quantization/fp8/amd/quant_utils.cuh"
 #else
-#include "quantization/w8a8/fp8/nvidia/quant_utils.cuh"
+#include "quantization/fp8/nvidia/quant_utils.cuh"
 #endif

 #include <algorithm>
 #include <cassert>
-#include <cfloat>
+#include <cfloat> // FLT_MIN
+#include <map>
+#include <vector>

 #ifdef USE_ROCM
 #include <hip/hip_bf16.h>
@ -208,20 +210,6 @@ void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
|
|||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// Used to copy/convert one element
|
|
||||||
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
|
|
||||||
struct CopyWithScaleOp {
|
|
||||||
float scale;
|
|
||||||
|
|
||||||
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
|
|
||||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
|
||||||
dst = static_cast<OutT>(src);
|
|
||||||
} else {
|
|
||||||
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||||
 __global__ void reshape_and_cache_kernel(
     const scalar_t* __restrict__ key,  // [num_tokens, num_heads, head_size]
@@ -237,51 +225,59 @@ __global__ void reshape_and_cache_kernel(
   const int64_t token_idx = blockIdx.x;
   const int64_t slot_idx = slot_mapping[token_idx];
   if (slot_idx < 0) {
+    // Padding token that should be ignored.
     return;
   }

   const int64_t block_idx = slot_idx / block_size;
   const int64_t block_offset = slot_idx % block_size;
-  const int h_block_count = head_size / x;  // head_size//x
-
-  const int h_block_idx = threadIdx.x;
-  if (h_block_idx >= num_heads * h_block_count) {
-    return;
-  }
-
-  const int head_idx = h_block_idx / h_block_count;
-  const int h_block = h_block_idx % h_block_count;
-
-  const scalar_t* __restrict__ key_src =
-      key + token_idx * key_stride + head_idx * head_size + h_block * x;
-  const int64_t src_value_start =
-      token_idx * value_stride + head_idx * head_size + h_block * x;
-
-  cache_t* __restrict__ key_dst =
-      key_cache + block_idx * num_heads * h_block_count * block_size * x +
-      head_idx * h_block_count * block_size * x + h_block * block_size * x +
-      block_offset * x;
-  const int64_t tgt_value_start =
-      block_idx * num_heads * h_block_count * x * block_size +
-      head_idx * h_block_count * x * block_size + h_block * x * block_size +
-      block_offset;
-
-  constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
-  float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
-  CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
-  float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
-  CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
-
-  vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, x, 0, 1, k_op);
-
-  const scalar_t* __restrict__ value_src = value + src_value_start;
-  cache_t* __restrict__ value_dst = value_cache + tgt_value_start;
-#pragma unroll
-  for (int i = 0; i < x; i++) {
-    v_op(value_dst[i * block_size], value_src[i]);
+  const int n = num_heads * head_size;
+  for (int i = threadIdx.x; i < n; i += blockDim.x) {
+    const int64_t src_key_idx = token_idx * key_stride + i;
+    const int64_t src_value_idx = token_idx * value_stride + i;
+    const int head_idx = i / head_size;
+    const int head_offset = i % head_size;
+    const int x_idx = head_offset / x;
+    const int x_offset = head_offset % x;
+
+    const int64_t tgt_key_idx =
+        block_idx * num_heads * (head_size / x) * block_size * x +
+        head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
+        block_offset * x + x_offset;
+    const int64_t tgt_value_idx =
+        block_idx * num_heads * head_size * block_size +
+        head_idx * head_size * block_size + head_offset * block_size +
+        block_offset;
+    scalar_t tgt_key = key[src_key_idx];
+    scalar_t tgt_value = value[src_value_idx];
+    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+      key_cache[tgt_key_idx] = tgt_key;
+      value_cache[tgt_value_idx] = tgt_value;
+    } else {
+      key_cache[tgt_key_idx] =
+          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
+      value_cache[tgt_value_idx] =
+          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
+    }
   }
 }
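For readers unfamiliar with the helper used in the removed version, the call `vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, x, 0, 1, k_op)` together with the value loop amounts, element for element, to applying the functor once per scalar. The sketch below shows that scalar equivalent; it deliberately ignores the alignment and vectorized fast-path handling inside the real helper, which is not part of this diff.

```cpp
// Scalar-equivalent sketch (assumption: ignores the vectorized fast path of
// vectorize_with_alignment). Op stands in for the CopyWithScaleOp functor.
template <typename OutT, typename InT, typename Op>
void copy_elementwise(const InT* src, OutT* dst, int n, int dst_stride, Op op) {
  for (int i = 0; i < n; ++i) {
    // key path effectively uses dst_stride == 1, value path uses block_size
    op(dst[i * dst_stride], src[i]);
  }
}
```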

+// Used by vectorization_utils to copy/convert one element
+template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
+struct CopyWithScaleOp {
+  float scale;
+
+  __device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
+    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+      dst = static_cast<OutT>(src);
+    } else {
+      dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
+    }
+  }
+};
+
 template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
 __global__ void reshape_and_cache_flash_kernel(
     const scalar_t* __restrict__ key,  // [num_tokens, num_heads, head_size]
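For orientation, here is a host-side sketch of the target-index arithmetic used by the per-element kernel above, i.e. a key cache laid out as [num_blocks, num_heads, head_size/x, block_size, x] and a value cache laid out as [num_blocks, num_heads, head_size, block_size]. The concrete sizes are illustrative assumptions, not values taken from this diff.

```cpp
// Sketch only: reproduces the tgt_key_idx / tgt_value_idx arithmetic on the
// host. num_heads, head_size, x, block_size and slot_idx are made-up values.
#include <cstdint>
#include <cstdio>

int main() {
  const int num_heads = 8, head_size = 128, x = 8, block_size = 16;
  const int64_t slot_idx = 37;  // hypothetical slot for one token
  const int64_t block_idx = slot_idx / block_size;
  const int64_t block_offset = slot_idx % block_size;

  // Flattened per-token element i = head_idx * head_size + head_offset.
  const int i = 3 * head_size + 21;
  const int head_idx = i / head_size;
  const int head_offset = i % head_size;
  const int x_idx = head_offset / x;     // which x-wide chunk of the head dim
  const int x_offset = head_offset % x;  // position inside that chunk

  // Key cache layout: [num_blocks, num_heads, head_size/x, block_size, x]
  const int64_t tgt_key_idx =
      block_idx * num_heads * (head_size / x) * block_size * x +
      head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
      block_offset * x + x_offset;

  // Value cache layout: [num_blocks, num_heads, head_size, block_size]
  const int64_t tgt_value_idx =
      block_idx * num_heads * head_size * block_size +
      head_idx * head_size * block_size + head_offset * block_size +
      block_offset;

  std::printf("key idx = %lld, value idx = %lld\n",
              (long long)tgt_key_idx, (long long)tgt_value_idx);
  return 0;
}
```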
@@ -428,81 +424,84 @@ __global__ void concat_and_cache_ds_mla_kernel(
   const int64_t dst_idx_start =
       block_idx * block_stride + block_offset * entry_stride;

-  // For the NoPE part, each tile of 128 elements is handled by half of one warp
-  // (16 threads). There are 4 total tiles, so 2 warps (64 threads).
-  // Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
-  // The RoPE part (last 64 elements) is handled by another 1 warp (32 threads).
-  // So in total, we use 3 warps (96 threads) per block.
+  // Create 4 tile scales in shared memory
+  __shared__ float smem[20];
+  float* shard_abs_max = smem;
+  float* tile_scales = smem + 16;
+
+  // For the NoPE part, each tile of 128 elements is handled by 4 warps
+  // (128 threads). There are 4 total tiles, so 16 warps (512 threads).
+  // The first thread of the first warp in each tile writes the scale
+  // value for the tile. The RoPE part (last 64 elements) is handled
+  // by another 2 warps (64 threads).
+  // So in total, we use 18 warps (576 threads) per block.

   // Cast kv_cache to 16_bit for RoPE values
   scalar_t* kv_cache_16bit =
       reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]);

-  // The last warp handles the RoPE part
-  if (threadIdx.x >= 64) {
-    // Each thread handles two elements of RoPE
-    const int8_t pe_idx_start = (threadIdx.x - 64) * 2;
-    const int64_t src_idx = token_idx * k_pe_stride + pe_idx_start;
-    // Vectorized load of two 16-bit values, performed as one 32-bit load
-    const int32_t vals = *reinterpret_cast<const int32_t*>(&k_pe[src_idx]);
+  // The last 64 threads handle the RoPE part
+  if (threadIdx.x >= kv_lora_rank) {
+    const int8_t pe_idx = threadIdx.x - kv_lora_rank;
+    const int64_t src_idx = token_idx * k_pe_stride + pe_idx;
     // RoPE values start after the packed 8-bit NoPE values and the
     // 32-bit scales
-    const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx_start;
-    // Vectorized store of two 16-bit values, performed as one 32-bit store
-    *reinterpret_cast<int32_t*>(&kv_cache_16bit[dst_idx]) = vals;
+    const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx;
+    kv_cache_16bit[dst_idx] = k_pe[src_idx];
     return;
   }

-  // The first two warps handle the NoPE part
-  const int8_t warp_idx = threadIdx.x >> 5;
-  const int8_t lane_idx = threadIdx.x & 31;
-  const int8_t tile_idx = warp_idx * 2 + (lane_idx >> 4);
+  // Determine the scale for each chunk of NoPE
+  const int16_t tile_idx = threadIdx.x >> 7;
+  const int16_t warp_idx = (threadIdx.x & 127) >> 5;
+  const int16_t lane_idx = threadIdx.x & 31;

-  // Each thread handles 8 elements of NoPE
-  // Load the NoPE elements for this thread into registers
-  const int64_t src_idx_start = token_idx * kv_c_stride + (threadIdx.x * 8);
-  // Vectorized load of eight 16-bit values, performed as an int4 load
-  const int4 vals_i4 = *reinterpret_cast<const int4*>(&kv_c[src_idx_start]);
-  const scalar_t* vals = reinterpret_cast<const scalar_t*>(&vals_i4);
+  // Load the NoPE element for this thread into registers
+  const int64_t src_idx = token_idx * kv_c_stride + threadIdx.x;
+  const scalar_t src_val = kv_c[src_idx];

-  // Max absolute value of this thread's elements
-  float max_abs = fmaxf(fmaxf(fmaxf(fabsf(vals[0]), fabsf(vals[1])),
-                              fmaxf(fabsf(vals[2]), fabsf(vals[3]))),
-                        fmaxf(fmaxf(fabsf(vals[4]), fabsf(vals[5])),
-                              fmaxf(fabsf(vals[6]), fabsf(vals[7]))));
-
-  // Warp-level reduction to find the max absolute value in each half-warp
+  // Warp-level reduction to find the max absolute value in the warp
+  float max_abs = fabsf(src_val);
 #pragma unroll
-  for (int offset = 8; offset > 0; offset /= 2) {
-    max_abs = fmaxf(max_abs, VLLM_SHFL_XOR_SYNC_WIDTH(max_abs, offset, 16));
+  for (int offset = 16; offset > 0; offset /= 2) {
+#ifdef USE_ROCM
+    max_abs = fmaxf(max_abs, __shfl_down_sync(UINT64_MAX, max_abs, offset));
+#else
+    max_abs = fmaxf(max_abs, __shfl_down_sync(0xFFFFFFFF, max_abs, offset));
+#endif
   }

-  // Compute the scale for the tile
-  float tile_scale = max_abs / 448.f;
-  tile_scale = fmaxf(tile_scale, FLT_MIN);
+  // The first lane of each warp in each tile writes the max_abs of this part
+  // of the tile to shared memory
+  if (lane_idx == 0) {
+    shard_abs_max[tile_idx * 4 + warp_idx] = max_abs;
+  }
+  __syncthreads();

-  // The first lane of each half-warp writes the scale to kv_cache
-  if ((lane_idx == 0) || (lane_idx == 16)) {
+  // The first lane of the first warp in each tile computes the scale for the
+  // tile and writes it to shared memory and to kv_cache
+  if (warp_idx == 0 && lane_idx == 0) {
+    float4 shard_abs_max_vec =
+        reinterpret_cast<float4*>(shard_abs_max)[tile_idx];
+    float tile_scale = fmaxf(fmaxf(shard_abs_max_vec.x, shard_abs_max_vec.y),
+                             fmaxf(shard_abs_max_vec.z, shard_abs_max_vec.w)) /
+                       448.f;
+
+    // Avoid division by zero in `scaled_convert`
+    tile_scales[tile_idx] = fmaxf(tile_scale, FLT_MIN);
     float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]);
     const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx;
-    kv_cache_32bit[dst_idx] = tile_scale;
+    kv_cache_32bit[dst_idx] = tile_scales[tile_idx];
   }

-  // Now all threads in the block scale and write their elements
-  // NoPE data is packed in the first kv_lora_rank/2 bytes (first 256 bytes)
-  const int64_t dst_idx_base = dst_idx_start + (threadIdx.x * 8);
-
-  uint8_t result[8];
-#pragma unroll
-  for (int i = 0; i < 8; i++) {
-    result[i] =
-        fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
-            vals[i], tile_scale);
-  }
-
-  // Store as aligned 64-bit writes
-  *reinterpret_cast<uint64_t*>(&kv_cache[dst_idx_base]) =
-      *reinterpret_cast<const uint64_t*>(result);
+  __syncthreads();
+
+  // Now all threads in the block scale and write their element
+  const float scale_val = tile_scales[tile_idx];
+  const int64_t dst_idx = dst_idx_start + threadIdx.x;
+  kv_cache[dst_idx] =
+      fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
+          src_val, scale_val);
 }

 template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
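Both sides of the hunk write the same cache-entry layout: packed 8-bit NoPE values first, then four 32-bit tile scales, then 16-bit RoPE values. The sketch below works out the byte offsets implied by the index math above, assuming kv_lora_rank = 512 and a 64-element RoPE part (these are assumptions consistent with the tile comments; the kernel itself receives the sizes at run time).

```cpp
// Sketch of the fp8_ds_mla cache-entry byte layout implied by the kernel's
// index math. kv_lora_rank = 512 and rope_dim = 64 are illustrative.
#include <cstdio>

int main() {
  const int kv_lora_rank = 512;  // NoPE elements, quantized to 1 byte each
  const int rope_dim = 64;       // RoPE elements, stored as 16-bit values
  const int num_tiles = 4;       // one float scale per 128-element tile

  const int nope_bytes = kv_lora_rank;                // bytes [0, 512)
  const int scale_bytes = num_tiles * sizeof(float);  // bytes [512, 528)
  const int rope_bytes = rope_dim * 2;                // bytes [528, 656)

  // Matches the kernel: scales start at 32-bit index kv_lora_rank / 4,
  // RoPE starts at 16-bit index kv_lora_rank / 2 + 8.
  std::printf("scale offset = %d bytes\n", (kv_lora_rank / 4) * 4);
  std::printf("rope  offset = %d bytes\n", (kv_lora_rank / 2 + 8) * 2);
  std::printf("entry size   = %d bytes\n",
              nope_bytes + scale_bytes + rope_bytes);
  return 0;
}
```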
@@ -607,10 +606,9 @@ void reshape_and_cache(

   int key_stride = key.stride(0);
   int value_stride = value.stride(0);
-  int head_div_x = head_size / x;

   dim3 grid(num_tokens);
-  dim3 block(std::min(num_heads * head_div_x, 512));
+  dim3 block(std::min(num_heads * head_size, 512));
   const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

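The two block sizes differ because the removed launch assigns one thread per x-wide chunk of a head while the added launch assigns one thread per element, relying on the grid-stride loop when the element count exceeds 512. A quick arithmetic sketch, using illustrative head counts rather than values from this diff:

```cpp
// Illustrative thread-count arithmetic for the two launch configurations.
#include <algorithm>
#include <cstdio>

int main() {
  const int num_heads = 8, head_size = 128, x = 8;  // assumed sizes
  const int per_chunk = std::min(num_heads * (head_size / x), 512);  // one thread per x-chunk
  const int per_elem = std::min(num_heads * head_size, 512);         // one thread per element
  std::printf("per-chunk block: %d threads, per-element block: %d threads\n",
              per_chunk, per_elem);
  // With num_heads * head_size = 1024 > 512, the per-element kernel covers the
  // remaining elements via its grid-stride loop (i += blockDim.x).
  return 0;
}
```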
@@ -743,12 +741,13 @@ void concat_and_cache_mla(

   if (kv_cache_dtype == "fp8_ds_mla") {
     dim3 grid(num_tokens);
-    // For the NoPE part, each tile of 128 elements is handled by half of one
-    // warp (16 threads). There are 4 total tiles, so 2 warps (64 threads).
-    // Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
-    // The RoPE part (last 64 elements) is handled by another 1 warp (32
-    // threads). So in total, we use 3 warps (96 threads) per block.
-    dim3 block(96);
+    // For the NoPE part, each tile of 128 elements is handled by 4 warps
+    // (128 threads). There are 4 total tiles, so 16 warps (512 threads).
+    // The first thread of the first warp in each tile writes the scale
+    // value for the tile. The RoPE part (last 64 elements) is handled
+    // by another 2 warps (64 threads).
+    // So in total, we use 18 warps (576 threads) per block.
+    dim3 block(576);
     DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
                                CALL_CONCAT_AND_CACHE_DS_MLA);
   } else {
@@ -12,7 +12,6 @@ using CubMaxOp = cub::Max;
 #endif  // CUB_VERSION
 #else
 #include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-using CubAddOp = hipcub::Sum;
-using CubMaxOp = hipcub::Max;
+using CubAddOp = cub::Sum;
+using CubMaxOp = cub::Max;
 #endif  // USE_ROCM
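The aliases exist so that CUDA and ROCm builds can name the same reduction functors. A hedged sketch of how a CubAddOp-style alias is typically consumed in a block reduction follows; the block size and surrounding kernel are illustrative and not taken from this diff.

```cuda
// Hedged sketch: consuming a cub::Sum-style functor (aliased as CubAddOp in
// the header above) with cub::BlockReduce. Block size 256 is an assumption.
#include <cub/cub.cuh>

__global__ void block_sum(const float* in, float* out) {
  using BlockReduce = cub::BlockReduce<float, 256>;
  __shared__ typename BlockReduce::TempStorage tmp;

  float val = in[blockIdx.x * blockDim.x + threadIdx.x];
  float sum = BlockReduce(tmp).Reduce(val, cub::Sum());  // binary functor
  if (threadIdx.x == 0) out[blockIdx.x] = sum;
}
```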
@@ -6,7 +6,7 @@
 */

 #include "type_convert.cuh"
-#include "quantization/w8a8/fp8/common.cuh"
+#include "quantization/fp8/common.cuh"
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
 #include "core/batch_invariant.hpp"

@@ -7,7 +7,7 @@
 #include "../cuda_compat.h"
 #include "dispatch_utils.h"

-#include "quantization/w8a8/fp8/common.cuh"
+#include "quantization/fp8/common.cuh"

 #include <c10/util/Float8_e4m3fn.h>

@@ -1,11 +1,15 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/all.h>

+#ifndef USE_ROCM
+  #include "../per_token_group_quant_8bit.h"
+#endif
+
 #include <cmath>

-#include "dispatch_utils.h"
-#include "quantization/vectorization_utils.cuh"
-#include "cub_helpers.h"
+#include "../../cub_helpers.h"
+#include "../../dispatch_utils.h"
+#include "../vectorization_utils.cuh"

 static inline __device__ int8_t float_to_int8_rn(float x) {
 #ifdef USE_ROCM
@@ -21,6 +25,7 @@ static inline __device__ int8_t float_to_int8_rn(float x) {
   float dst = std::nearbyint(x);
+
   // saturate
   // See https://github.com/pytorch/pytorch/issues/127666
   // See https://github.com/llvm/llvm-project/issues/95183
   // hip-clang std::clamp __glibcxx_assert_fail host function when building on
@@ -79,6 +84,7 @@ static inline __device__ int8_t int32_to_int8(int32_t x) {
       static_cast<int32_t>(std::numeric_limits<int8_t>::max());

   // saturate
+
   // See https://github.com/pytorch/pytorch/issues/127666
   // See https://github.com/llvm/llvm-project/issues/95183
   // hip-clang std::clamp __glibcxx_assert_fail host function when building on
@@ -170,6 +176,7 @@ __global__ void dynamic_scaled_int8_quant_kernel(

   float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax;

+  // 2. quantize
   vectorize_with_alignment<16>(
       row_in, row_out, hidden_size, tid, stride,
       [=] __device__(int8_t& dst, const scalar_t& src) {
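The inv_s line fixes the per-token scale; each element is then quantized as a round-to-nearest of src * inv_s, saturated to the int8 range (which is what float_to_int8_rn does earlier in the file). A host-side sketch of the same arithmetic, with made-up values:

```cpp
// Host-side sketch of dynamic per-token int8 quantization:
// scale = absmax / 127, q = saturate(round(x * 127 / absmax)). Values made up.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const float row[4] = {0.02f, -1.7f, 0.9f, 3.4f};
  float absmax = 0.f;
  for (float v : row) absmax = std::max(absmax, std::fabs(v));

  const float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax;
  for (float v : row) {
    float r = std::nearbyint(v * inv_s);
    int8_t q = static_cast<int8_t>(std::min(127.f, std::max(-128.f, r)));
    std::printf("%6.2f -> %4d\n", v, int(q));
  }
  return 0;
}
```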
@@ -187,6 +194,7 @@ struct MinMax {

   __host__ __device__ explicit MinMax(float v) : min(v), max(v) {}

+  // add a value to the MinMax
   __host__ __device__ MinMax& operator+=(float v) {
     min = fminf(min, v);
     max = fmaxf(max, v);
@@ -220,6 +228,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
   const scalar_t* row_in = input + token_idx * hidden_size;
   int8_t* row_out = output + token_idx * hidden_size;

+  // 1. calculate min & max
   MinMax thread_mm;
   vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride,
                                     [&] __device__(const scalar_t& src) {
@@ -252,6 +261,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
   const float inv_s = 1.f / scale_sh;
   const azp_t azp = azp_sh;

+  // 2. quantize
   vectorize_with_alignment<16>(
       row_in, row_out, hidden_size, tid, stride,
       [=] __device__(int8_t& dst, const scalar_t& src) {
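The azp kernel tracks a per-row MinMax rather than just an absmax, so the row can be mapped asymmetrically onto the int8 range with a zero point. The sketch below shows the usual zero-point arithmetic; the exact rounding convention used upstream is not visible in this hunk, so treat the formula as an assumption for illustration only.

```cpp
// Hedged sketch of asymmetric (zero-point) int8 quantization driven by a
// per-row min/max, as a MinMax-style reduction would produce.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const float row[4] = {0.1f, 2.3f, -0.4f, 5.0f};
  float lo = row[0], hi = row[0];
  for (float v : row) { lo = std::min(lo, v); hi = std::max(hi, v); }

  const float scale = (hi - lo) / 255.f;  // map the observed range onto 256 levels
  const int32_t azp = (int32_t)std::nearbyint(-128.f - lo / scale);
  const float inv_s = 1.f / scale;
  for (float v : row) {
    int32_t q = (int32_t)std::nearbyint(v * inv_s) + azp;
    q = std::min(127, std::max(-128, q));
    std::printf("%5.2f -> %4d\n", v, q);
  }
  return 0;
}
```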
@@ -323,3 +333,13 @@ void dynamic_scaled_int8_quant(
         }
       });
 }
+
+#ifndef USE_ROCM
+void per_token_group_quant_int8(const torch::Tensor& input,
+                                torch::Tensor& output_q,
+                                torch::Tensor& output_s, int64_t group_size,
+                                double eps, double int8_min, double int8_max) {
+  per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
+                             int8_min, int8_max);
+}
+#endif
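The wrapper added above simply forwards to per_token_group_quant_8bit: each group_size-wide slice of a token's hidden dimension gets its own scale, clamped by eps. A hedged host-side sketch of that grouping follows; the real CUDA kernel behind per_token_group_quant_8bit is not shown in this hunk, and the shapes and values below are illustrative.

```cpp
// Hedged sketch of per-token-group 8-bit quantization: one scale per group.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int hidden = 8, group_size = 4;
  const float eps = 1e-10f, qmax = 127.f;  // int8 range used for illustration
  const std::vector<float> x = {0.5f, -2.f, 1.f, 4.f, 0.1f, 0.2f, -0.3f, 0.05f};

  for (int g = 0; g < hidden / group_size; ++g) {
    float absmax = 0.f;
    for (int i = 0; i < group_size; ++i)
      absmax = std::max(absmax, std::fabs(x[g * group_size + i]));
    const float scale = std::max(absmax, eps) / qmax;  // one scale per group
    std::printf("group %d: scale=%g, q =", g, scale);
    for (int i = 0; i < group_size; ++i) {
      int q = (int)std::nearbyint(x[g * group_size + i] / scale);
      std::printf(" %d", std::min(127, std::max(-128, q)));
    }
    std::printf("\n");
  }
  return 0;
}
```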
@@ -254,7 +254,7 @@ void cutlass_moe_mm(
     bool per_act_token, bool per_out_ch) {
   int32_t version_num = get_sm_version_num();
 #if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
-  if (version_num >= 100 && version_num < 110) {
+  if (version_num >= 100) {
     cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                          expert_offsets, problem_sizes, a_strides, b_strides,
                          c_strides, per_act_token, per_out_ch);
@@ -262,7 +262,7 @@ void cutlass_moe_mm(
   }
 #endif
 #if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
-  if (version_num >= 90 && version_num < 100) {
+  if (version_num >= 90) {
     cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                         expert_offsets, problem_sizes, a_strides, b_strides,
                         c_strides, per_act_token, per_out_ch);
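The change drops the upper bound on each guard, so dispatch becomes "first lower bound that matches wins". A paraphrased sketch of the resulting control flow, with stubs standing in for the real cutlass_moe_mm_sm100 / _sm90 calls:

```cpp
// Paraphrased sketch of the SM-version dispatch after the change.
#include <cstdio>

void run_sm100() { std::printf("dispatch to SM100 kernel\n"); }
void run_sm90()  { std::printf("dispatch to SM90 kernel\n"); }

void dispatch(int version_num) {
  if (version_num >= 100) { run_sm100(); return; }
  if (version_num >= 90)  { run_sm90();  return; }
  std::printf("no CUTLASS MoE kernel for SM%d\n", version_num);
}

int main() {
  dispatch(90);   // Hopper-class device
  dispatch(120);  // hypothetical future version; previously excluded by the < 110 bound
  return 0;
}
```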
@@ -14,8 +14,6 @@
  * limitations under the License.
  */

-#include "core/registration.h"
-
 #include <torch/all.h>
 #include <cutlass/arch/arch.h>

@@ -420,7 +418,3 @@ void cutlass_fp4_group_mm(
       "12.8 or above.");
 #endif
 }
-
-TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
-  m.impl("cutlass_fp4_group_mm", &cutlass_fp4_group_mm);
-}
@@ -5,7 +5,7 @@
 #include <hip/hip_bf16.h>
 #include <hip/hip_bfloat16.h>

-#include "../../../../attention/attention_dtypes.h"
+#include "../../../attention/attention_dtypes.h"

 namespace vllm {
 #ifdef USE_ROCM
@@ -1,7 +1,7 @@
 #include "common.cuh"
 #include "dispatch_utils.h"
-#include "cub_helpers.h"
-#include "quantization/vectorization_utils.cuh"
+#include "../../cub_helpers.h"
+#include "../vectorization_utils.cuh"
 #include <c10/cuda/CUDAGuard.h>
 #include <ATen/cuda/Exceptions.h>

@@ -1,6 +1,6 @@
 #pragma once

-#include "../../../../attention/attention_dtypes.h"
+#include "../../../attention/attention_dtypes.h"
 #include <assert.h>
 #include <float.h>
 #include <stdint.h>
@@ -1,6 +1,6 @@
 #include <ATen/cuda/CUDAContext.h>

-#include "quantization/w8a8/per_token_group_quant_8bit.h"
+#include "../per_token_group_quant_8bit.h"

 #include <cmath>

@@ -8,9 +8,9 @@

 #include <torch/all.h>

-#include "quantization/vectorization.cuh"
-#include "quantization/vectorization_utils.cuh"
-#include "dispatch_utils.h"
+#include "../vectorization.cuh"
+#include "../vectorization_utils.cuh"
+#include "../../dispatch_utils.h"

 __device__ __forceinline__ float GroupReduceMax(float val) {
   unsigned mask = threadIdx.x % 32 >= 16 ? 0xffff0000 : 0x0000ffff;
@@ -6,7 +6,7 @@

 #include "quantization/vectorization.cuh"
 // TODO(luka/varun):refactor common.cuh to use this file instead
-#include "quantization/w8a8/fp8/common.cuh"
+#include "quantization/fp8/common.cuh"

 namespace vllm {

@@ -1,6 +1,7 @@
 #pragma once
 #include <torch/all.h>

+// TODO(wentao): refactor the folder to 8bit, then includes fp8 and int8 folders
 // 8-bit per-token-group quantization helper used by both FP8 and INT8
 void per_token_group_quant_8bit(const torch::Tensor& input,
                                 torch::Tensor& output_q,
@@ -1,12 +0,0 @@
-#include <ATen/cuda/CUDAContext.h>
-#include <torch/all.h>
-
-#include "quantization/w8a8/per_token_group_quant_8bit.h"
-
-void per_token_group_quant_int8(const torch::Tensor& input,
-                                torch::Tensor& output_q,
-                                torch::Tensor& output_s, int64_t group_size,
-                                double eps, double int8_min, double int8_max) {
-  per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
-                             int8_min, int8_max);
-}
@@ -23,7 +23,7 @@

 #include <algorithm>
 #include "../attention/dtype_fp8.cuh"
-#include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
+#include "../quantization/fp8/amd/quant_utils.cuh"

 // ROCm 6.2 compatibility: map OCP fp8 types to FNUZ variants if OCP is absent
 #if !defined(HIP_FP8_TYPE_OCP)
@@ -11,7 +11,7 @@

 #include "../cuda_compat.h"
 #include "dispatch_utils.h"
-#include "quantization/w8a8/fp8/common.cuh"
+#include "quantization/fp8/common.cuh"

 #if defined(__HIPCC__) && \
     (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))
@@ -397,7 +397,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "   Tensor a_blockscale, Tensor b_blockscales, Tensor alphas,"
       "   Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()",
       {stride_tag});
-  // conditionally compiled so impl registration is in source file
+  ops.impl("cutlass_fp4_group_mm", torch::kCUDA, &cutlass_fp4_group_mm);

   // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
   // quantization, as well as bias
@@ -13,13 +13,8 @@ ARG PYTHON_VERSION=3.12
 # private registries that use a different repository naming conventions.
 #
 # Example:
-# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
-
-# Important: We build with an old version of Ubuntu to maintain broad
-# compatibility with other Linux OSes. The main reason for this is that the
-# glibc version is baked into the distro, and binaries built with one glibc
-# version are not backwards compatible with OSes that use an earlier version.
-ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
+# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
+ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
 # TODO: Restore to base image after FlashInfer AOT wheel fixed
 ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04

@@ -80,20 +75,15 @@ ARG TARGETPLATFORM
 ARG INSTALL_KV_CONNECTORS=false
 ENV DEBIAN_FRONTEND=noninteractive

+ARG DEADSNAKES_MIRROR_URL
+ARG DEADSNAKES_GPGKEY_URL
 ARG GET_PIP_URL

-# Install system dependencies and uv, then create Python virtual environment
+# Install minimal dependencies
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl sudo python3-pip \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh \
-    && $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
-    && rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
-    && ln -s /opt/venv/bin/python3 /usr/bin/python3 \
-    && ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \
-    && ln -s /opt/venv/bin/pip /usr/bin/pip \
-    && python3 --version && python3 -m pip --version
+    && apt-get install -y ccache software-properties-common git curl sudo

 ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
@@ -101,9 +91,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
 ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
 ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

-# Activate virtual environment and add uv to PATH
-ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
-ENV VIRTUAL_ENV="/opt/venv"
+# Install uv and Python
+COPY --from=ghcr.io/astral-sh/uv:0.8.22 /uv /uvx /bin/
+RUN uv python install ${PYTHON_VERSION} --default --verbose

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
@@ -132,7 +122,7 @@ WORKDIR /workspace
 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
+    uv pip install --system -r requirements/cuda.txt \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 # cuda arch list used by torch
@@ -162,7 +152,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 ENV UV_LINK_MODE=copy

 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
+    uv pip install --system -r requirements/build.txt \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 COPY . .
@@ -259,7 +249,7 @@ COPY requirements/lint.txt requirements/lint.txt
 COPY requirements/test.txt requirements/test.txt
 COPY requirements/dev.txt requirements/dev.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
+    uv pip install --system -r requirements/dev.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 #################### DEV IMAGE ####################

@@ -286,32 +276,12 @@ ARG GET_PIP_URL
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
     echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

-# Install Python and other dependencies
+# Install minimal dependencies
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
-    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
-    && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
-        if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
-            mkdir -p -m 0755 /etc/apt/keyrings ; \
-            curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
-            sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
-            echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
-        fi ; \
-    else \
-        for i in 1 2 3; do \
-            add-apt-repository -y ppa:deadsnakes/ppa && break || \
-            { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-        done ; \
-    fi \
-    && apt-get update -y \
-    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
-    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
-    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
-    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
-    && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
-    && python3 --version && python3 -m pip --version
+    && apt-get install -y ccache software-properties-common git curl wget sudo vim \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1

 ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
@@ -319,9 +289,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
 ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
 ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

-# Install uv for faster pip installs
-RUN --mount=type=cache,target=/root/.cache/uv \
-    python3 -m pip install uv
+# Install uv and Python
+COPY --from=ghcr.io/astral-sh/uv:0.8.22 /uv /uvx /bin/
+RUN uv python install ${PYTHON_VERSION} --default --verbose

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
@@ -555,5 +525,5 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"]

 FROM vllm-openai-base AS vllm-openai

-ENTRYPOINT ["vllm", "serve"]
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 #################### OPENAI API SERVER ####################
@@ -177,4 +177,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
     uv pip install dist/*.whl

-ENTRYPOINT ["vllm", "serve"]
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
@@ -314,4 +314,4 @@ WORKDIR /workspace/

 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

-ENTRYPOINT ["vllm", "serve"]
+ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
@@ -309,4 +309,4 @@ USER 2000
 WORKDIR /home/vllm

 # Set the default entrypoint
-ENTRYPOINT ["vllm", "serve"]
+ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
@@ -69,4 +69,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \

 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
-ENTRYPOINT ["vllm", "serve"]
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
Binary file not shown. (Before: 119 KiB, After: 127 KiB)
@@ -661,7 +661,8 @@ Benchmark the performance of multi-modal requests in vLLM.
 Start vLLM:

 ```bash
-vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
+python -m vllm.entrypoints.openai.api_server \
+    --model Qwen/Qwen2.5-VL-7B-Instruct \
     --dtype bfloat16 \
     --limit-mm-per-prompt '{"image": 1}' \
     --allowed-local-media-path /path/to/sharegpt4v/images
@@ -687,7 +688,8 @@ vllm bench serve \
 Start vLLM:

 ```bash
-vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
+python -m vllm.entrypoints.openai.api_server \
+    --model Qwen/Qwen2.5-VL-7B-Instruct \
     --dtype bfloat16 \
     --limit-mm-per-prompt '{"video": 1}' \
     --allowed-local-media-path /path/to/sharegpt4video/videos
@@ -258,21 +258,17 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)

         target_width, target_height = \
             self.info.get_image_size_with_most_features()

-        image_overrides = mm_options.get("image") if mm_options else None
-
         return {
             "image":
             self._get_dummy_images(width=target_width,
                                    height=target_height,
-                                   num_images=num_images,
-                                   overrides=image_overrides)
+                                   num_images=num_images)
         }
 ```

@@ -442,20 +438,16 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
     ) -> MultiModalDataDict:
         target_width, target_height = \
             self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)

-        image_overrides = mm_options.get("image") if mm_options else None
-
         return {
             "image":
             self._get_dummy_images(width=target_width,
                                    height=target_height,
-                                   num_images=num_images,
-                                   overrides=image_overrides)
+                                   num_images=num_images)
         }
 ```

@@ -39,7 +39,8 @@ Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example

 ```bash
 VLLM_TORCH_PROFILER_DIR=./vllm_profile \
-vllm serve meta-llama/Meta-Llama-3-70B
+python -m vllm.entrypoints.openai.api_server \
+    --model meta-llama/Meta-Llama-3-70B
 ```

 vllm bench command:
@@ -19,7 +19,8 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"
 1. Start the vLLM server with the supported chat completion model, e.g.

     ```bash
-    vllm serve mistralai/Mistral-7B-Instruct-v0.2
+    python -m vllm.entrypoints.openai.api_server \
+        --model mistralai/Mistral-7B-Instruct-v0.2
     ```

 1. Call it with AutoGen:
@@ -20,7 +20,7 @@ To get started with Open WebUI using vLLM, follow these steps:
 For example:

 ```console
-vllm serve <model> --host 0.0.0.0 --port 8000
+python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000
 ```

 3. Start the Open WebUI Docker container:
@@ -32,7 +32,6 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
 ports: 8081  # Expose to internet traffic.

 envs:
-  PYTHONUNBUFFERED: 1
   MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
   HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.

@@ -48,8 +47,9 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
 run: |
   conda activate vllm
   echo 'Starting vllm api server...'
-  vllm serve $MODEL_NAME \
+  python -u -m vllm.entrypoints.openai.api_server \
     --port 8081 \
+    --model $MODEL_NAME \
     --trust-remote-code \
     --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
     2>&1 | tee api_server.log &
@@ -131,7 +131,6 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
 ports: 8081  # Expose to internet traffic.

 envs:
-  PYTHONUNBUFFERED: 1
   MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
   HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.

@@ -147,8 +146,9 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
 run: |
   conda activate vllm
   echo 'Starting vllm api server...'
-  vllm serve $MODEL_NAME \
+  python -u -m vllm.entrypoints.openai.api_server \
     --port 8081 \
+    --model $MODEL_NAME \
     --trust-remote-code \
     --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
     2>&1 | tee api_server.log
@@ -243,7 +243,6 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
 ports: 8081  # Expose to internet traffic.

 envs:
-  PYTHONUNBUFFERED: 1
   MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
   HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.

@@ -259,8 +258,9 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
 run: |
   conda activate vllm
   echo 'Starting vllm api server...'
-  vllm serve $MODEL_NAME \
+  python -u -m vllm.entrypoints.openai.api_server \
     --port 8081 \
+    --model $MODEL_NAME \
     --trust-remote-code \
     --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
     2>&1 | tee api_server.log
@@ -69,11 +69,6 @@ Sometimes you may see the API server entrypoint used directly instead of via the
 python -m vllm.entrypoints.openai.api_server --model <model>
 ```

-!!! warning
-
-    `python -m vllm.entrypoints.openai.api_server` is deprecated
-    and may become unsupported in a future release.
-
 That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.

 More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.
@@ -8,9 +8,6 @@ This page teaches you how to pass multi-modal inputs to [multi-modal models][sup

 !!! tip
     When serving multi-modal models, consider setting `--allowed-media-domains` to restrict domain that vLLM can access to prevent it from accessing arbitrary endpoints that can potentially be vulnerable to Server-Side Request Forgery (SSRF) attacks. You can provide a list of domains for this arg. For example: `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`

-    Also, consider setting `VLLM_MEDIA_URL_ALLOW_REDIRECTS=0` to prevent HTTP redirects from being followed to bypass domain restrictions.
-
     This restriction is especially important if you run vLLM in a containerized environment where the vLLM pods may have unrestricted access to internal networks.

 ## Offline Inference
@@ -64,7 +64,8 @@ To enable sleep mode in a vLLM server you need to initialize it with the flag V
 When using the flag VLLM_SERVER_DEV_MODE=1 you enable development endpoints, and these endpoints should not be exposed to users.

 ```bash
-VLLM_SERVER_DEV_MODE=1 vllm serve Qwen/Qwen3-0.6B \
+VLLM_SERVER_DEV_MODE=1 python -m vllm.entrypoints.openai.api_server \
+    --model Qwen/Qwen3-0.6B \
     --enable-sleep-mode \
     --port 8000
 ```
@@ -48,9 +48,10 @@ The following code configures vLLM in an offline mode to use speculative decodin
 To perform the same with an online mode launch the server:

 ```bash
-vllm serve facebook/opt-6.7b \
+python -m vllm.entrypoints.openai.api_server \
     --host 0.0.0.0 \
     --port 8000 \
+    --model facebook/opt-6.7b \
     --seed 42 \
     -tp 1 \
     --gpu_memory_utilization 0.8 \
@@ -67,7 +67,8 @@ docker run -it \
 XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. For **pipeline parallel**, we support it on single node with mp as the backend. For example, a reference execution like following:

 ```bash
-vllm serve facebook/opt-13b \
+python -m vllm.entrypoints.openai.api_server \
+    --model=facebook/opt-13b \
     --dtype=bfloat16 \
     --max_model_len=1024 \
     --distributed-executor-backend=mp \
@@ -17,12 +17,12 @@ These models are what we list in [supported-text-models][supported-text-models]

 ### Transformers

-vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
+vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <1% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".

 Currently, the Transformers backend works for the following:

 - Modalities: embedding models, language models and vision-language models*
-- Architectures: encoder-only, decoder-only, mixture-of-experts
+- Architectures: encoder-only, decoder-only
 - Attention types: full attention and/or sliding attention

 _*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
@@ -31,7 +31,6 @@ If the Transformers model implementation follows all the steps in [writing a cus

 - All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
 - Any combination of the following vLLM parallelisation schemes:
-    - Data parallel
     - Pipeline parallel
     - Tensor parallel

@@ -677,7 +676,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
 | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
 | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
-| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | ✅︎ |
@@ -66,9 +66,6 @@ Restrict domains that vLLM can access for media URLs by setting
 `--allowed-media-domains` to prevent Server-Side Request Forgery (SSRF) attacks.
 (e.g. `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`)

-Also, consider setting `VLLM_MEDIA_URL_ALLOW_REDIRECTS=0` to prevent HTTP
-redirects from being followed to bypass domain restrictions.
-
 ## Security and Firewalls: Protecting Exposed vLLM Systems

 While vLLM is designed to allow unsafe network services to be isolated to
@@ -576,7 +576,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:

 # Intern-S1
 def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
-    model_name = "internlm/Intern-S1-mini"
+    model_name = "internlm/Intern-S1"

     engine_args = EngineArgs(
         model=model_name,
@@ -309,7 +309,7 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:


 def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "internlm/Intern-S1-mini"
+    model_name = "internlm/Intern-S1"

     engine_args = EngineArgs(
         model=model_name,
@@ -21,4 +21,4 @@ while IFS='=' read -r key value; do
 done < <(env | grep "^${PREFIX}")

 # Pass the collected arguments to the main entrypoint
-exec vllm serve "${ARGS[@]}"
+exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}"
@@ -1,2 +1,2 @@
 lmcache
-nixl >= 0.6.0 # Required for disaggregated prefill
+nixl >= 0.5.1 # Required for disaggregated prefill
```diff
@@ -11,8 +11,8 @@ import pytest
 import torch
 
 from tests.quantization.utils import is_quant_method_supported
+from tests.v1.attention.utils import _Backend
 from vllm import LLM, SamplingParams
-from vllm.attention.backends.registry import _Backend
 from vllm.attention.selector import global_force_attn_backend_context_manager
 from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
                          PassConfig)
```
```diff
@@ -8,11 +8,11 @@ import torch._dynamo
 
 from tests.compile.backend import LazyInitPass, TestBackend
 from tests.models.utils import check_outputs_equal
-from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
+from tests.v1.attention.utils import (BatchSpec, _Backend,
+                                      create_common_attn_metadata)
 from vllm import LLM, SamplingParams
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from vllm.attention import Attention, AttentionMetadata
-from vllm.attention.backends.registry import _Backend
 from vllm.attention.selector import global_force_attn_backend_context_manager
 from vllm.compilation.fusion import QUANT_OPS
 from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
```
```diff
@@ -756,7 +756,7 @@ class VllmRunner:
 
     def get_inputs(
         self,
-        prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
+        prompts: Union[list[str], list[torch.Tensor], list[int]],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
```
```diff
@@ -86,16 +86,3 @@ def test_max_model_len():
     # It can be less if generation finishes due to other reasons (e.g., EOS)
     # before reaching the absolute model length limit.
     assert num_total_tokens <= max_model_len
-
-
-def test_log_stats():
-    llm = LLM(
-        model=MODEL_NAME,
-        disable_log_stats=False,
-        gpu_memory_utilization=0.10,
-        enforce_eager=True, # reduce test time
-    )
-    outputs = llm.generate(PROMPTS, sampling_params=None)
-
-    # disable_log_stats is False, every output should have metrics
-    assert all(output.metrics is not None for output in outputs)
```
```diff
@@ -122,9 +122,6 @@ def mock_serving_setup():
                                                  models,
                                                  request_logger=None)
 
-    serving_completion._process_inputs = AsyncMock(return_value=(MagicMock(
-        name="engine_request"), {}))
-
     return mock_engine, serving_completion
 
 
```
```diff
@@ -698,22 +698,6 @@ async def test_function_calling_required(client: OpenAI, model_name: str):
     )
 
 
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_system_message_with_tools(client: OpenAI, model_name: str):
-    from vllm.entrypoints.harmony_utils import get_system_message
-
-    # Test with custom tools enabled - commentary channel should be available
-    sys_msg = get_system_message(with_custom_tools=True)
-    valid_channels = sys_msg.content[0].channel_config.valid_channels
-    assert "commentary" in valid_channels
-
-    # Test with custom tools disabled - commentary channel should be removed
-    sys_msg = get_system_message(with_custom_tools=False)
-    valid_channels = sys_msg.content[0].channel_config.valid_channels
-    assert "commentary" not in valid_channels
-
-
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_function_calling_full_history(client: OpenAI, model_name: str):
```
```diff
@@ -7,7 +7,7 @@ import asyncio
 from contextlib import suppress
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Optional
-from unittest.mock import AsyncMock, MagicMock
+from unittest.mock import MagicMock
 
 import pytest
 import pytest_asyncio
```
```diff
@@ -230,7 +230,6 @@ class MockHFConfig:
 @dataclass
 class MockModelConfig:
     task = "generate"
-    runner_type = "generate"
     tokenizer = MODEL_NAME
     trust_remote_code = False
     tokenizer_mode = "auto"
```
```diff
@@ -245,33 +244,11 @@ class MockModelConfig:
     encoder_config = None
     generation_config: str = "auto"
     media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
-    skip_tokenizer_init = False
 
     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
 
 
-def _build_serving_chat(engine: AsyncLLM,
-                        model_config: MockModelConfig) -> OpenAIServingChat:
-    models = OpenAIServingModels(engine_client=engine,
-                                 base_model_paths=BASE_MODEL_PATHS,
-                                 model_config=model_config)
-    serving_chat = OpenAIServingChat(engine,
-                                     model_config,
-                                     models,
-                                     response_role="assistant",
-                                     chat_template=CHAT_TEMPLATE,
-                                     chat_template_content_format="auto",
-                                     request_logger=None)
-
-    async def _fake_process_inputs(request_id, engine_prompt, sampling_params,
-                                   *, lora_request, trace_headers, priority):
-        return dict(engine_prompt), {}
-
-    serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
-    return serving_chat
-
-
 @dataclass
 class MockEngine:
 
```
```diff
@@ -305,7 +282,16 @@ async def test_serving_chat_returns_correct_model_name():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
 
-    serving_chat = _build_serving_chat(mock_engine, MockModelConfig())
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=MockModelConfig())
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     MockModelConfig(),
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)
     messages = [{"role": "user", "content": "what is 1+1?"}]
 
     async def return_model_name(*args):
```
```diff
@@ -332,7 +318,16 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
 
-    serving_chat = _build_serving_chat(mock_engine, MockModelConfig())
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=MockModelConfig())
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     MockModelConfig(),
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)
 
     req = ChatCompletionRequest(
         model=MODEL_NAME,
```
```diff
@@ -366,7 +361,16 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine.errored = False
 
     # Initialize the serving chat
-    serving_chat = _build_serving_chat(mock_engine, mock_model_config)
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)
 
     # Test Case 1: No max_tokens specified in request
     req = ChatCompletionRequest(
```
```diff
@@ -411,7 +415,16 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine.errored = False
 
     # Initialize the serving chat
-    serving_chat = _build_serving_chat(mock_engine, mock_model_config)
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)
 
     # Test case 1: No max_tokens specified, defaults to context_window
     req = ChatCompletionRequest(
```
```diff
@@ -458,7 +471,16 @@ async def test_serving_chat_could_load_correct_generation_config():
     mock_engine.errored = False
 
     # Initialize the serving chat
-    serving_chat = _build_serving_chat(mock_engine, mock_model_config)
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)
 
     req = ChatCompletionRequest(
         model=MODEL_NAME,
```
```diff
@@ -503,7 +525,17 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
 
-    serving_chat = _build_serving_chat(mock_engine, mock_model_config)
+    # Initialize the serving chat
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)
 
     # Test cache_salt
     req = ChatCompletionRequest(
```
```diff
@@ -517,12 +549,10 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
     # By default, cache_salt in the engine prompt is not set
     with suppress(Exception):
         await serving_chat.create_chat_completion(req)
-    engine_prompt = serving_chat._process_inputs.await_args_list[0].args[1]
-    assert "cache_salt" not in engine_prompt
+    assert "cache_salt" not in mock_engine.generate.call_args.args[0]
 
     # Test with certain cache_salt
     req.cache_salt = "test_salt"
     with suppress(Exception):
         await serving_chat.create_chat_completion(req)
-    engine_prompt = serving_chat._process_inputs.await_args_list[1].args[1]
-    assert engine_prompt.get("cache_salt") == "test_salt"
+    assert mock_engine.generate.call_args.args[0]["cache_salt"] == "test_salt"
```
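For reference on what these assertions exercise: `cache_salt` is simply a field that rides along with the chat request and scopes prefix-cache reuse to requests carrying the same salt. A minimal sketch follows; the model name is a placeholder.

```python
# Hedged sketch of a chat request carrying cache_salt, mirroring the test above;
# the model name is a placeholder.
from vllm.entrypoints.openai.protocol import ChatCompletionRequest

req = ChatCompletionRequest(
    model="placeholder-model",
    messages=[{"role": "user", "content": "what is 1+1?"}],
)
req.cache_salt = "test_salt"  # only requests sharing this salt can share cached prefixes
```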
```diff
@@ -1,129 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from contextlib import AsyncExitStack
-from unittest.mock import AsyncMock, MagicMock
-
-import pytest
-import pytest_asyncio
-
-from vllm.entrypoints.context import ConversationContext
-from vllm.entrypoints.openai.protocol import ResponsesRequest
-from vllm.entrypoints.openai.serving_responses import OpenAIServingResponses
-from vllm.entrypoints.tool_server import ToolServer
-
-
-class MockConversationContext(ConversationContext):
-    """Mock conversation context for testing"""
-
-    def __init__(self):
-        self.init_tool_sessions_called = False
-        self.init_tool_sessions_args = None
-        self.init_tool_sessions_kwargs = None
-
-    def append_output(self, output) -> None:
-        pass
-
-    async def call_tool(self):
-        return []
-
-    def need_builtin_tool_call(self) -> bool:
-        return False
-
-    def render_for_completion(self):
-        return []
-
-    async def init_tool_sessions(self, tool_server, exit_stack, request_id,
-                                 mcp_tools):
-        self.init_tool_sessions_called = True
-        self.init_tool_sessions_args = (tool_server, exit_stack, request_id,
-                                        mcp_tools)
-
-    async def cleanup_session(self) -> None:
-        pass
-
-
-@pytest.fixture
-def mock_serving_responses():
-    """Create a mock OpenAIServingResponses instance"""
-    serving_responses = MagicMock(spec=OpenAIServingResponses)
-    serving_responses.tool_server = MagicMock(spec=ToolServer)
-    return serving_responses
-
-
-@pytest.fixture
-def mock_context():
-    """Create a mock conversation context"""
-    return MockConversationContext()
-
-
-@pytest.fixture
-def mock_exit_stack():
-    """Create a mock async exit stack"""
-    return MagicMock(spec=AsyncExitStack)
-
-
-class TestInitializeToolSessions:
-    """Test class for _initialize_tool_sessions method"""
-
-    @pytest_asyncio.fixture
-    async def serving_responses_instance(self):
-        """Create a real OpenAIServingResponses instance for testing"""
-        # Create minimal mocks for required dependencies
-        engine_client = MagicMock()
-        engine_client.get_model_config = AsyncMock()
-
-        model_config = MagicMock()
-        model_config.hf_config.model_type = "test"
-        model_config.get_diff_sampling_param.return_value = {}
-
-        models = MagicMock()
-
-        tool_server = MagicMock(spec=ToolServer)
-
-        # Create the actual instance
-        instance = OpenAIServingResponses(
-            engine_client=engine_client,
-            model_config=model_config,
-            models=models,
-            request_logger=None,
-            chat_template=None,
-            chat_template_content_format="auto",
-            tool_server=tool_server,
-        )
-
-        return instance
-
-    @pytest.mark.asyncio
-    async def test_initialize_tool_sessions(self, serving_responses_instance,
-                                            mock_context, mock_exit_stack):
-        """Test that method works correctly with only MCP tools"""
-
-        request = ResponsesRequest(input="test input", tools=[])
-
-        # Call the method
-        await serving_responses_instance._initialize_tool_sessions(
-            request, mock_context, mock_exit_stack)
-        assert mock_context.init_tool_sessions_called is False
-
-        # Create only MCP tools
-        tools = [
-            {
-                "type": "web_search_preview"
-            },
-            {
-                "type": "code_interpreter",
-                "container": {
-                    "type": "auto"
-                }
-            },
-        ]
-
-        request = ResponsesRequest(input="test input", tools=tools)
-
-        # Call the method
-        await serving_responses_instance._initialize_tool_sessions(
-            request, mock_context, mock_exit_stack)
-
-        # Verify that init_tool_sessions was called
-        assert mock_context.init_tool_sessions_called
```
```diff
@@ -10,9 +10,8 @@ from unittest.mock import patch
 import pytest
 import torch
 
-from vllm.attention.backends.registry import _Backend
 from vllm.attention.layer import MultiHeadAttention
-from vllm.attention.selector import _cached_get_attn_backend
+from vllm.attention.selector import _Backend, _cached_get_attn_backend
 from vllm.platforms import current_platform
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
```
```diff
@@ -11,7 +11,7 @@ from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
                                        native_w8a8_block_matmul)
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    cutlass_scaled_mm, per_token_group_quant_fp8, w8a8_triton_block_scaled_mm)
+    cutlass_scaled_mm, per_token_group_quant_fp8, w8a8_block_fp8_matmul)
 from vllm.platforms import current_platform
 from vllm.utils import has_deep_gemm
 from vllm.utils.deep_gemm import (fp8_gemm_nt,
```
```diff
@@ -91,8 +91,7 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
 
     ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
                                        out_dtype)
-    out = w8a8_triton_block_scaled_mm(A_fp8, B_fp8, As, Bs, block_size,
-                                      out_dtype)
+    out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
 
     rel_diff = (torch.mean(
         torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
```
```diff
@@ -20,11 +20,9 @@ from vllm.platforms import current_platform
     (8, 513, 64), # Non-divisible (native only)
 ])
 @pytest.mark.parametrize("seed", [42])
-@pytest.mark.parametrize("use_ue8m0", [True, False])
 @torch.inference_mode()
 def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
-                                      group_size: int, seed: int,
-                                      use_ue8m0: bool) -> None:
+                                      group_size: int, seed: int) -> None:
     """Test QuantFP8 group quantization with various configurations.
 
     Tests both CUDA and native implementations, column-major scales,
```
```diff
@@ -40,8 +38,7 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
     group_shape = GroupShape(1, group_size)
     quant_op = QuantFP8(static=False,
                         group_shape=group_shape,
-                        column_major_scales=False,
-                        use_ue8m0=use_ue8m0)
+                        column_major_scales=False)
 
     # 1. Test native implementation (always available)
     x_quant_native, scales_native = quant_op.forward_native(x.clone())
```
```diff
@@ -51,15 +48,9 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
     # 2. Test column-major scales configuration
     quant_op_col = QuantFP8(static=False,
                             group_shape=group_shape,
-                            column_major_scales=True,
-                            use_ue8m0=use_ue8m0)
+                            column_major_scales=True)
     _, scales_col = quant_op_col.forward_native(x.clone())
-    assert scales_col.shape == (batch_size, expected_num_groups)
-    assert scales_col.stride(0) == 1
-    assert scales_col.stride(1) == batch_size
-
-    # Test column-major scales consistency
-    assert torch.allclose(scales_col, scales_native, rtol=1e-9, atol=1e-8)
+    assert scales_col.shape == (expected_num_groups, batch_size)
 
     # 3. Test CUDA implementation (only for divisible dimensions)
     if is_divisible:
```
```diff
@@ -77,23 +68,21 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
 
 
 @pytest.mark.parametrize("seed", [42])
-@pytest.mark.parametrize("use_ue8m0", [True, False])
 @torch.inference_mode()
-def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
+def test_quantfp8_group_multidimensional(seed: int) -> None:
     current_platform.seed_everything(seed)
 
     group_size = 64
 
     # Test with 3D input
-    batch1, batch2, hidden_dim = 4, 8, 1024
+    batch1, batch2, hidden_dim = 4, 8, 512
     x_3d = torch.randn(
         (batch1, batch2, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8
 
     group_shape = GroupShape(1, group_size)
     quant_op = QuantFP8(static=False,
                         group_shape=group_shape,
-                        column_major_scales=False,
-                        use_ue8m0=use_ue8m0)
+                        column_major_scales=False)
 
     x_quant, scales = quant_op.forward_native(x_3d.clone())
     assert x_quant.shape == x_3d.shape
```
```diff
@@ -102,10 +91,9 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
     # Test column_major_scales with multi-dim
     quant_op_col = QuantFP8(static=False,
                             group_shape=group_shape,
-                            column_major_scales=True,
-                            use_ue8m0=use_ue8m0)
+                            column_major_scales=True)
     _, scales_col = quant_op_col.forward_native(x_3d.clone())
-    assert scales_col.shape == (batch1, batch2, hidden_dim // group_size)
+    assert scales_col.shape == (batch1, hidden_dim // group_size, batch2)
 
     # Test with 4D input
     batch1, batch2, batch3, hidden_dim = 2, 3, 4, 256
```
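The `use_ue8m0` flag dropped across these hunks selects an alternative encoding for the group scales; the per-group quantization scheme the tests exercise is otherwise the same. Below is a plain-PyTorch sketch of that group-wise FP8 quantization, written as a conceptual illustration rather than vLLM's `QuantFP8` kernel, with shapes chosen to mirror the tests.

```python
# Illustrative, plain-PyTorch version of per-group FP8 quantization along the
# last dimension; a conceptual sketch, not vLLM's QuantFP8 implementation.
import torch

def groupwise_fp8_quant(x: torch.Tensor, group_size: int = 64):
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    groups = x.reshape(*x.shape[:-1], -1, group_size).float()
    scales = groups.abs().amax(dim=-1, keepdim=True) / fp8_max
    scales = scales.clamp(min=1e-12)  # avoid division by zero for all-zero groups
    q = (groups / scales).to(torch.float8_e4m3fn).reshape(x.shape)
    return q, scales.squeeze(-1)

x = torch.randn(8, 512, dtype=torch.bfloat16)
q, s = groupwise_fp8_quant(x)
print(q.shape, s.shape)  # torch.Size([8, 512]) torch.Size([8, 8])
```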
Some files were not shown because too many files have changed in this diff.