Compare commits

1 commit
fp8_ep_dp...benchmark_

| Author | SHA1 | Date |
|---|---|---|
|  | 221118dc85 |  |
@@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do

### Visualizing the results

The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait till the benchmark finish running.
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
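For reference, a minimal sketch of running the same conversion step locally after a benchmark run; the working directory, script path, and output location are assumptions based on vLLM's nightly-benchmarks layout, not values taken from this diff:

```bash
# Assumed layout: the benchmarking scripts have already written their *.json
# results; the directory and output file names may differ in your checkout.
cd .buildkite/nightly-benchmarks
python3 scripts/convert-results-json-to-markdown.py
ls results/   # the generated markdown table and its json counterpart land here
```

The `buildkite/performance-benchmark` job performs the same formatting in CI and attaches the resulting markdown and json to the job page.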
@@ -6,6 +6,11 @@

[tool.ruff]
line-length = 88
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]

[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
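As a quick illustration of how the configuration above is consumed, a sketch assuming ruff is installed locally (the canonical entry point in CI is the pre-commit hook, which this diff also touches):

```bash
# ruff reads [tool.ruff] from pyproject.toml automatically, so the exclude
# list and the per-file-ignores for vllm/third_party/** apply to both commands.
pip install ruff
ruff check .             # lint
ruff format --check .    # formatting check at line-length = 88
```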
@@ -14,7 +14,7 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@@ -31,7 +31,7 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@@ -64,7 +64,7 @@ steps:
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
plugins:
- docker-login#v3.0.0:
username: vllmbot
username: vllm
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
@@ -10,17 +10,15 @@ docker build -t hpu-test-env -f docker/Dockerfile.hpu .
# Setup cleanup
# certain versions of HPU software stack have a bug that can
# override the exit code of the script, so we need to use
# separate remove_docker_containers and remove_docker_containers_and_exit
# separate remove_docker_container and remove_docker_container_and_exit
# functions, while other platforms only need one remove_docker_container
# function.
EXITCODE=1
remove_docker_containers() { docker rm -f hpu-test || true; docker rm -f hpu-test-tp2 || true; }
remove_docker_containers_and_exit() { remove_docker_containers; exit $EXITCODE; }
trap remove_docker_containers_and_exit EXIT
remove_docker_containers
remove_docker_container() { docker rm -f hpu-test || true; }
remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
trap remove_docker_container_and_exit EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2

EXITCODE=$?
@@ -11,14 +11,13 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)

NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"

# Try building the docker image
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
@@ -48,16 +47,8 @@ trap remove_docker_container EXIT
docker run --rm -it --device=/dev/neuron0 --network bridge \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "HF_TOKEN=${HF_TOKEN}" \
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \
${image_name} \
/bin/bash -c "
python3 /workspace/vllm/examples/offline_inference/neuron.py;
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
echo 'Running test file: '$f;
python3 -m pytest \$f -v --capture=tee-sys;
done
"
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
@ -2,180 +2,102 @@
|
||||
|
||||
set -xu
|
||||
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f tpu-test || true;
|
||||
docker rm -f vllm-tpu || true;
|
||||
}
|
||||
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
# Remove the container that might not be cleaned up in the previous run.
|
||||
remove_docker_container
|
||||
|
||||
# Build the docker image.
|
||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
||||
|
||||
# Set up cleanup.
|
||||
cleanup_docker() {
|
||||
# Get Docker's root directory
|
||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||
if [ -z "$docker_root" ]; then
|
||||
echo "Failed to determine Docker root directory."
|
||||
exit 1
|
||||
fi
|
||||
echo "Docker root directory: $docker_root"
|
||||
# Check disk usage of the filesystem where Docker's root directory is located
|
||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
# Define the threshold
|
||||
threshold=70
|
||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
# Remove dangling images (those that are not tagged and not used by any container)
|
||||
docker image prune -f
|
||||
# Remove unused volumes / force the system prune for old images as well.
|
||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||
fi
|
||||
}
|
||||
cleanup_docker
|
||||
remove_docker_container() { docker rm -f tpu-test || true; }
|
||||
trap remove_docker_container EXIT
|
||||
# Remove the container that might not be cleaned up in the previous run.
|
||||
remove_docker_container
|
||||
|
||||
# For HF_TOKEN.
|
||||
source /etc/environment
|
||||
|
||||
# Run a simple end-to-end example.
|
||||
docker run --privileged --net host --shm-size=16G -it \
|
||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||
vllm-tpu /bin/bash -c '
|
||||
set -e # Exit immediately if a command exits with a non-zero status.
|
||||
set -u # Treat unset variables as an error.
|
||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
||||
&& export VLLM_XLA_CACHE_PATH= \
|
||||
&& export VLLM_USE_V1=1 \
|
||||
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
|
||||
&& echo HARDWARE \
|
||||
&& tpu-info \
|
||||
&& { \
|
||||
echo TEST_0: Running test_perf.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
|
||||
echo TEST_0_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_1: Running test_compilation.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
|
||||
echo TEST_1_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_2: Running test_basic.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
|
||||
echo TEST_2_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
|
||||
echo TEST_3_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_4: Running test_quantization_accuracy.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
|
||||
echo TEST_4_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_5: Running examples/offline_inference/tpu.py; \
|
||||
python3 /workspace/vllm/examples/offline_inference/tpu.py; \
|
||||
echo TEST_5_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_6: Running test_tpu_model_runner.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
|
||||
echo TEST_6_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_7: Running test_sampler.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
|
||||
echo TEST_7_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_8: Running test_topk_topp_sampler.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
|
||||
echo TEST_8_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_9: Running test_multimodal.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
|
||||
echo TEST_9_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_10: Running test_pallas.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
|
||||
echo TEST_10_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_11: Running test_struct_output_generate.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
|
||||
echo TEST_11_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_12: Running test_moe_pallas.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
|
||||
echo TEST_12_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
# Disable the TPU LoRA tests until the feature is activated
|
||||
# & { \
|
||||
# echo TEST_13: Running test_moe_pallas.py; \
|
||||
# python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
|
||||
# echo TEST_13_EXIT_CODE: \$?; \
|
||||
# } & \
|
||||
wait \
|
||||
&& echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
|
||||
"
|
||||
|
||||
echo "--- Starting script inside Docker container ---"
|
||||
|
||||
# Create results directory
|
||||
RESULTS_DIR=$(mktemp -d)
|
||||
# If mktemp fails, set -e will cause the script to exit.
|
||||
echo "Results will be stored in: $RESULTS_DIR"
|
||||
|
||||
# Install dependencies
|
||||
echo "--- Installing Python dependencies ---"
|
||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
|
||||
echo "--- Python dependencies installed ---"
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||
export VLLM_XLA_CACHE_PATH=
|
||||
echo "Using VLLM V1"
|
||||
|
||||
echo "--- Hardware Information ---"
|
||||
tpu-info
|
||||
echo "--- Starting Tests ---"
|
||||
set +e
|
||||
overall_script_exit_code=0
|
||||
|
||||
# --- Test Definitions ---
|
||||
# If a test fails, this function will print logs and will not cause the main script to exit.
|
||||
run_test() {
|
||||
local test_num=$1
|
||||
local test_name=$2
|
||||
local test_command=$3
|
||||
local log_file="$RESULTS_DIR/test_${test_num}.log"
|
||||
local actual_exit_code
|
||||
|
||||
echo "--- TEST_$test_num: Running $test_name ---"
|
||||
|
||||
# Execute the test command.
|
||||
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
|
||||
actual_exit_code=$?
|
||||
|
||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
|
||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
|
||||
|
||||
if [ "$actual_exit_code" -ne 0 ]; then
|
||||
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
|
||||
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
|
||||
if [ -f "$log_file" ]; then
|
||||
cat "$log_file" >&2
|
||||
else
|
||||
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
|
||||
fi
|
||||
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
|
||||
return "$actual_exit_code" # Return the failure code
|
||||
else
|
||||
echo "TEST_$test_num ($test_name) PASSED."
|
||||
return 0 # Return success
|
||||
fi
|
||||
}
|
||||
|
||||
# Helper function to call run_test and update the overall script exit code
|
||||
run_and_track_test() {
|
||||
local test_num_arg="$1"
|
||||
local test_name_arg="$2"
|
||||
local test_command_arg="$3"
|
||||
|
||||
# Run the test
|
||||
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
|
||||
local test_specific_exit_code=$?
|
||||
|
||||
# If the test failed, set the overall script exit code to 1
|
||||
if [ "$test_specific_exit_code" -ne 0 ]; then
|
||||
# No need for extra echo here, run_test already logged the failure.
|
||||
overall_script_exit_code=1
|
||||
fi
|
||||
}
|
||||
|
||||
# --- Actual Test Execution ---
|
||||
run_and_track_test 0 "test_perf.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
|
||||
run_and_track_test 1 "test_compilation.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
|
||||
run_and_track_test 2 "test_basic.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
|
||||
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
|
||||
run_and_track_test 4 "test_quantization_accuracy.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
|
||||
run_and_track_test 5 "examples/offline_inference/tpu.py" \
|
||||
"python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
||||
run_and_track_test 6 "test_tpu_model_runner.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
|
||||
run_and_track_test 7 "test_sampler.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
|
||||
run_and_track_test 8 "test_topk_topp_sampler.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
|
||||
run_and_track_test 9 "test_multimodal.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
|
||||
run_and_track_test 10 "test_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
|
||||
run_and_track_test 11 "test_struct_output_generate.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
|
||||
run_and_track_test 12 "test_moe_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
||||
run_and_track_test 13 "test_lora.py" \
|
||||
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
||||
|
||||
# After all tests have been attempted, exit with the overall status.
|
||||
if [ "$overall_script_exit_code" -ne 0 ]; then
|
||||
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
|
||||
else
|
||||
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
|
||||
fi
|
||||
exit "$overall_script_exit_code"
|
||||
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
|
||||
|
||||
# Capture the exit code of the docker run command
|
||||
DOCKER_RUN_EXIT_CODE=$?
|
||||
|
||||
# The trap will run for cleanup.
|
||||
# Exit the main script with the Docker run command's exit code.
|
||||
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
|
||||
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
|
||||
exit "$DOCKER_RUN_EXIT_CODE"
|
||||
else
|
||||
echo "Docker run command completed successfully."
|
||||
exit 0
|
||||
fi
|
||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
|
||||
@@ -33,13 +33,14 @@ steps:

- label: Documentation Build # 2min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/test_docs"
working_dir: "/vllm-workspace/test_docs/docs"
fast_check: true
no_gpu: True
commands:
- pip install -r ../requirements/docs.txt
# TODO: add `--strict` once warnings in docstrings are fixed
- mkdocs build
- pip install -r ../../requirements/docs.txt
- SPHINXOPTS=\"-W\" make html
# Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html

- label: Async Engine, Inputs, Utils, Worker Test # 24min
mirror_hardwares: [amdexperimental]
@ -58,7 +59,6 @@ steps:
|
||||
- pytest -v -s async_engine # AsyncLLMEngine
|
||||
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s multimodal
|
||||
- pytest -v -s test_utils.py # Utils
|
||||
- pytest -v -s worker # Worker
|
||||
@ -125,7 +125,7 @@ steps:
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
@ -138,7 +138,6 @@ steps:
|
||||
- vllm/core/
|
||||
- tests/distributed/test_utils
|
||||
- tests/distributed/test_pynccl
|
||||
- tests/distributed/test_events
|
||||
- tests/spec_decode/e2e/test_integration_dist_tp4
|
||||
- tests/compile/test_basic_correctness
|
||||
- examples/offline_inference/rlhf.py
|
||||
@ -157,7 +156,6 @@ steps:
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
- pytest -v -s distributed/test_events.py
|
||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
@ -199,9 +197,8 @@ steps:
|
||||
- tests/test_sequence
|
||||
- tests/test_config
|
||||
- tests/test_logger
|
||||
- tests/test_vllm_port
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
|
||||
# OOM in the CI unless we run this separately
|
||||
- pytest -v -s tokenization
|
||||
|
||||
@ -223,7 +220,6 @@ steps:
|
||||
- pytest -v -s v1/test_serial_utils.py
|
||||
- pytest -v -s v1/test_utils.py
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
- pytest -v -s v1/test_metrics_reader.py
|
||||
# TODO: accuracy does not match, whether setting
|
||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||
- pytest -v -s v1/e2e
|
||||
@ -248,7 +244,7 @@ steps:
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_embedding.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/encoder_decoder.py
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
- python3 offline_inference/basic/classify.py
|
||||
@ -275,6 +271,17 @@ steps:
|
||||
- pytest -v -s samplers
|
||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||
|
||||
- label: LogitsProcessor Test # 5min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers
|
||||
- vllm/model_executor/guided_decoding
|
||||
- tests/test_logits_processor
|
||||
- tests/model_executor/test_guided_processors
|
||||
commands:
|
||||
- pytest -v -s test_logits_processor.py
|
||||
- pytest -v -s model_executor/test_guided_processors.py
|
||||
|
||||
- label: Speculative decoding tests # 40min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
@ -305,7 +312,6 @@ steps:
|
||||
- pytest -v -s compile/test_fusion.py
|
||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
||||
- pytest -v -s compile/test_sequence_parallelism.py
|
||||
- pytest -v -s compile/test_async_tp.py
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 9min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
@ -380,23 +386,10 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/model_loader
|
||||
- tests/tensorizer_loader
|
||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s tensorizer_loader
|
||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
|
||||
- label: Model Executor Test
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor
|
||||
- tests/model_executor
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s model_executor
|
||||
|
||||
- label: Benchmarks # 9min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
@ -474,7 +467,10 @@ steps:
|
||||
- pytest -v -s models/test_registry.py
|
||||
- pytest -v -s models/test_utils.py
|
||||
- pytest -v -s models/test_vision.py
|
||||
- pytest -v -s models/test_initialization.py
|
||||
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
|
||||
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
|
||||
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
|
||||
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
|
||||
|
||||
- label: Language Models Test (Standard)
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@ -488,25 +484,16 @@ steps:
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/language -m core_model
|
||||
|
||||
- label: Language Models Test (Extended Generation) # 1hr20min
|
||||
- label: Language Models Test (Extended)
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/generation
|
||||
- tests/models/language
|
||||
commands:
|
||||
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
||||
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||
- pytest -v -s models/language/generation -m 'not core_model'
|
||||
|
||||
- label: Language Models Test (Extended Pooling) # 36min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/pooling
|
||||
commands:
|
||||
- pytest -v -s models/language/pooling -m 'not core_model'
|
||||
- pytest -v -s models/language -m 'not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Standard)
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@ -618,11 +605,9 @@ steps:
|
||||
- vllm/worker/model_runner.py
|
||||
- entrypoints/llm/test_collective_rpc.py
|
||||
- tests/v1/test_async_llm_dp.py
|
||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- vllm/v1/engine/
|
||||
commands:
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
|
||||
.github/CODEOWNERS (6 changes, vendored)
@ -13,7 +13,6 @@
|
||||
/vllm/model_executor/guided_decoding @mgoin @russellb
|
||||
/vllm/multimodal @DarkLight1337 @ywang96
|
||||
/vllm/vllm_flash_attn @LucasWilkinson
|
||||
/vllm/lora @jeejeelee
|
||||
CMakeLists.txt @tlrmchlsmth
|
||||
|
||||
# vLLM V1
|
||||
@ -41,8 +40,3 @@ CMakeLists.txt @tlrmchlsmth
|
||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
|
||||
/tests/v1/structured_output @mgoin @russellb
|
||||
/tests/weight_loading @mgoin @youkaichao
|
||||
/tests/lora @jeejeelee
|
||||
|
||||
# Docs
|
||||
/docs @hmellor
|
||||
mkdocs.yaml @hmellor
|
||||
.github/ISSUE_TEMPLATE/400-bug-report.yml (6 changes, vendored)
@ -81,14 +81,14 @@ body:
|
||||
required: true
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
|
||||
value: >
|
||||
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
|
||||
|
||||
- Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
|
||||
|
||||
- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
|
||||
|
||||
Thanks for reporting 🙏!
|
||||
Thanks for contributing 🎉!
|
||||
- type: checkboxes
|
||||
id: askllm
|
||||
attributes:
|
||||
|
||||
.github/ISSUE_TEMPLATE/450-ci-failure.yml (69 changes, vendored)
@ -1,69 +0,0 @@
|
||||
name: 🧪 CI failure report
|
||||
description: Report a failing test.
|
||||
title: "[CI Failure]: "
|
||||
labels: ["ci-failure"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
#### Include the name of the failing Buildkite step and test file in the title.
|
||||
- type: input
|
||||
attributes:
|
||||
label: Name of failing test
|
||||
description: |
|
||||
Paste in the fully-qualified name of the failing test from the logs.
|
||||
placeholder: |
|
||||
`path/to/test_file.py::test_name[params]`
|
||||
validations:
|
||||
required: true
|
||||
- type: checkboxes
|
||||
attributes:
|
||||
label: Basic information
|
||||
description: Select all items that apply to the failing test.
|
||||
options:
|
||||
- label: Flaky test
|
||||
- label: Can reproduce locally
|
||||
- label: Caused by external libraries (e.g. bug in `transformers`)
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: 🧪 Describe the failing test
|
||||
description: |
|
||||
Please provide a clear and concise description of the failing test.
|
||||
placeholder: |
|
||||
A clear and concise description of the failing test.
|
||||
|
||||
```
|
||||
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
|
||||
```
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: 📝 History of failing test
|
||||
description: |
|
||||
Since when did the test start to fail?
|
||||
You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
|
||||
|
||||
If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
|
||||
|
||||
- Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
|
||||
|
||||
- Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
|
||||
|
||||
- Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
|
||||
placeholder: |
|
||||
Approximate timeline and/or problematic PRs
|
||||
|
||||
A link to the Buildkite analytics of the failing test (if available)
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: CC List.
|
||||
description: >
|
||||
The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for reporting 🙏!
|
||||
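The removed CI-failure template above points reporters at `git bisect` for locating the PR that broke a test on main; a generic sketch of that workflow follows (the commit hash and test path are placeholders, not values from this diff):

```bash
# Placeholder SHA and test path, purely illustrative.
git bisect start
git bisect bad HEAD                      # main is currently failing
git bisect good <last-known-good-sha>    # e.g. taken from Buildkite Test Suites history
git bisect run python3 -m pytest -q path/to/test_file.py::test_name
git bisect reset
```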
.github/PULL_REQUEST_TEMPLATE.md (2 changes, vendored)
@@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE
FIX #xxxx (*link existing issues this PR will resolve*)

<!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions)
.github/mergify.yml (6 changes, vendored)
@ -58,7 +58,7 @@ pull_request_rules:
|
||||
- files~=^benchmarks/structured_schemas/
|
||||
- files=benchmarks/benchmark_serving_structured_output.py
|
||||
- files=benchmarks/run_structured_output_benchmark.sh
|
||||
- files=docs/features/structured_outputs.md
|
||||
- files=docs/source/features/structured_outputs.md
|
||||
- files=examples/offline_inference/structured_outputs.py
|
||||
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
|
||||
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
|
||||
@ -135,7 +135,9 @@ pull_request_rules:
|
||||
- files~=^tests/entrypoints/openai/tool_parsers/
|
||||
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
|
||||
- files~=^vllm/entrypoints/openai/tool_parsers/
|
||||
- files=docs/features/tool_calling.md
|
||||
- files=docs/source/features/tool_calling.md
|
||||
- files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
|
||||
- files=docs/source/getting_started/examples/chat_with_tools.md
|
||||
- files~=^examples/tool_chat_*
|
||||
- files=examples/offline_inference/chat_with_tools.py
|
||||
- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
|
||||
|
||||
.github/scripts/cleanup_pr_body.sh (2 changes, vendored)
@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
|
||||
|
||||
# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
|
||||
python3 - <<EOF
|
||||
import regex as re
|
||||
import re
|
||||
|
||||
with open("${NEW}", "r") as file:
|
||||
content = file.read()
|
||||
|
||||
.github/workflows/cleanup_pr_body.yml (7 changes, vendored)
@ -20,12 +20,7 @@ jobs:
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
python3 -m pip install --upgrade pip
|
||||
python3 -m pip install regex
|
||||
|
||||
- name: Update PR description
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
|
||||
run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
|
||||
|
||||
.gitignore (6 changes, vendored)
@ -77,6 +77,11 @@ instance/
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
docs/source/getting_started/examples/
|
||||
docs/source/api/vllm
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
@ -146,7 +151,6 @@ venv.bak/
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
docs/examples
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
|
||||
@ -17,7 +17,7 @@ repos:
|
||||
- id: ruff
|
||||
args: [--output-format, github, --fix]
|
||||
- id: ruff-format
|
||||
files: ^(.buildkite|benchmarks|examples)/.*
|
||||
files: ^(.buildkite|benchmarks)/.*
|
||||
- repo: https://github.com/codespell-project/codespell
|
||||
rev: v2.4.1
|
||||
hooks:
|
||||
@ -39,7 +39,6 @@ repos:
|
||||
rev: v0.9.29
|
||||
hooks:
|
||||
- id: pymarkdown
|
||||
exclude: '.*\.inc\.md'
|
||||
args: [fix]
|
||||
- repo: https://github.com/rhysd/actionlint
|
||||
rev: v1.7.7
|
||||
@ -58,7 +57,7 @@ repos:
|
||||
entry: tools/mypy.sh 0 "local"
|
||||
language: python
|
||||
types: [python]
|
||||
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
|
||||
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
|
||||
stages: [pre-commit] # Don't run in CI
|
||||
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||
name: Run mypy for Python 3.9
|
||||
@ -128,21 +127,6 @@ repos:
|
||||
name: Update Dockerfile dependency graph
|
||||
entry: tools/update-dockerfile-graph.sh
|
||||
language: script
|
||||
- id: enforce-import-regex-instead-of-re
|
||||
name: Enforce import regex as re
|
||||
entry: python tools/enforce_regex_import.py
|
||||
language: python
|
||||
types: [python]
|
||||
pass_filenames: false
|
||||
additional_dependencies: [regex]
|
||||
# forbid directly import triton
|
||||
- id: forbid-direct-triton-import
|
||||
name: "Forbid direct 'import triton'"
|
||||
entry: python tools/check_triton_import.py
|
||||
language: python
|
||||
types: [python]
|
||||
pass_filenames: false
|
||||
additional_dependencies: [regex]
|
||||
# Keep `suggestion` last
|
||||
- id: suggestion
|
||||
name: Suggestion
|
||||
|
||||
@@ -8,8 +8,12 @@ build:
tools:
python: "3.12"

mkdocs:
configuration: mkdocs.yaml
sphinx:
configuration: docs/source/conf.py
fail_on_warning: true

# If using Sphinx, optionally build your docs in additional formats such as PDF
formats: []

# Optionally declare the Python requirements required to build your docs
python:
@ -23,15 +23,15 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
|
||||
# Suppress potential warnings about unused manually-specified variables
|
||||
set(ignoreMe "${VLLM_PYTHON_PATH}")
|
||||
|
||||
# Prevent installation of dependencies (cutlass) by default.
|
||||
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
||||
|
||||
#
|
||||
# Supported python versions. These versions will be searched in order, the
|
||||
# first match will be selected. These should be kept in sync with setup.py.
|
||||
#
|
||||
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
|
||||
|
||||
# Supported NVIDIA architectures.
|
||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
|
||||
|
||||
# Supported AMD GPU architectures.
|
||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
|
||||
|
||||
@ -79,15 +79,6 @@ endif()
|
||||
#
|
||||
find_package(Torch REQUIRED)
|
||||
|
||||
# Supported NVIDIA architectures.
|
||||
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
|
||||
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
|
||||
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
|
||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
|
||||
else()
|
||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Forward the non-CUDA device extensions to external CMake scripts.
|
||||
#
|
||||
@ -235,8 +226,6 @@ endif()
|
||||
#
|
||||
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
|
||||
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
|
||||
"csrc/cache_kernels.cu"
|
||||
"csrc/attention/paged_attention_v1.cu"
|
||||
"csrc/attention/paged_attention_v2.cu"
|
||||
@ -292,6 +281,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
FetchContent_MakeAvailable(cutlass)
|
||||
|
||||
list(APPEND VLLM_EXT_SRC
|
||||
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
|
||||
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
|
||||
"csrc/quantization/aqlm/gemm_kernels.cu"
|
||||
"csrc/quantization/awq/gemm_kernels.cu"
|
||||
"csrc/permute_cols.cu"
|
||||
@ -788,7 +779,5 @@ endif()
|
||||
# For CUDA we also build and ship some external projects.
|
||||
if (VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
include(cmake/external_projects/flashmla.cmake)
|
||||
|
||||
# vllm-flash-attn should be last as it overwrites some CMake functions
|
||||
include(cmake/external_projects/vllm_flash_attn.cmake)
|
||||
endif ()
|
||||
|
||||
@@ -1,3 +1,3 @@
# Contributing to vLLM

You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).
You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
README.md (10 changes)
@ -1,7 +1,7 @@
|
||||
<p align="center">
|
||||
<picture>
|
||||
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png">
|
||||
<img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-light.png" width=55%>
|
||||
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
|
||||
<img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
|
||||
</picture>
|
||||
</p>
|
||||
|
||||
@ -58,7 +58,7 @@ vLLM is fast with:
|
||||
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
|
||||
- Continuous batching of incoming requests
|
||||
- Fast model execution with CUDA/HIP graph
|
||||
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8.
|
||||
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
|
||||
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
|
||||
- Speculative decoding
|
||||
- Chunked prefill
|
||||
@ -100,14 +100,14 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
|
||||
## Contributing
|
||||
|
||||
We welcome and value any contributions and collaborations.
|
||||
Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved.
|
||||
Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved.
|
||||
|
||||
## Sponsors
|
||||
|
||||
vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
|
||||
|
||||
<!-- Note: Please sort them in alphabetical order. -->
|
||||
<!-- Note: Please keep these consistent with docs/community/sponsors.md -->
|
||||
<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
|
||||
Cash Donations:
|
||||
- a16z
|
||||
- Dropbox
|
||||
|
||||
@@ -8,6 +8,4 @@ Please report security issues privately using [the vulnerability submission form

---

Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.

Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
@ -64,12 +64,6 @@ become available.
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Custom</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td>Local file: <code>data.jsonl</code></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
@ -130,38 +124,6 @@ P99 ITL (ms): 8.39
|
||||
==================================================
|
||||
```
|
||||
|
||||
### Custom Dataset
|
||||
If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
|
||||
|
||||
```
|
||||
{"prompt": "What is the capital of India?"}
|
||||
{"prompt": "What is the capital of Iran?"}
|
||||
{"prompt": "What is the capital of China?"}
|
||||
```
|
||||
|
||||
```bash
|
||||
# start server
|
||||
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
|
||||
```
|
||||
|
||||
```bash
|
||||
# run benchmarking script
|
||||
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
|
||||
--backend vllm \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--endpoint /v1/completions \
|
||||
--dataset-name custom \
|
||||
--dataset-path <path-to-your-data-jsonl> \
|
||||
--custom-skip-chat-template \
|
||||
--num-prompts 80 \
|
||||
--max-concurrency 1 \
|
||||
--temperature=0.3 \
|
||||
--top-p=0.75 \
|
||||
--result-dir "./log/"
|
||||
```
|
||||
|
||||
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
|
||||
|
||||
### VisionArena Benchmark for Vision Language Models
|
||||
|
||||
```bash
|
||||
@ -184,9 +146,10 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
||||
|
||||
``` bash
|
||||
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--speculative-config $'{"method": "ngram",
|
||||
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 2}'
|
||||
--speculative-model "[ngram]" \
|
||||
--ngram_prompt_lookup_min 2 \
|
||||
--ngram-prompt-lookup-max 5 \
|
||||
--num_speculative_tokens 5
|
||||
```
|
||||
|
||||
``` bash
|
||||
@ -241,16 +204,6 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--seed 42
|
||||
```
|
||||
|
||||
**`philschmid/mt-bench`**
|
||||
|
||||
``` bash
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--model Qwen/QwQ-32B \
|
||||
--dataset-name hf \
|
||||
--dataset-path philschmid/mt-bench \
|
||||
--num-prompts 80
|
||||
```
|
||||
|
||||
### Running With Sampling Parameters
|
||||
|
||||
When using OpenAI-compatible backends such as `vllm`, optional sampling
|
||||
@ -321,9 +274,10 @@ python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
--output-len=100 \
|
||||
--num-prompts=2048 \
|
||||
--async-engine \
|
||||
--speculative-config $'{"method": "ngram",
|
||||
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 2}'
|
||||
--speculative-model="[ngram]" \
|
||||
--ngram_prompt_lookup_min=2 \
|
||||
--ngram-prompt-lookup-max=5 \
|
||||
--num_speculative_tokens=5
|
||||
```
|
||||
|
||||
```
|
||||
|
||||
@ -194,11 +194,6 @@ async def async_request_deepspeed_mii(
|
||||
request_func_input: RequestFuncInput,
|
||||
pbar: Optional[tqdm] = None,
|
||||
) -> RequestFuncOutput:
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith(("completions", "profile")), (
|
||||
"OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||
)
|
||||
|
||||
async with aiohttp.ClientSession(
|
||||
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||
) as session:
|
||||
@ -209,8 +204,6 @@ async def async_request_deepspeed_mii(
|
||||
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
|
||||
"top_p": 1.0,
|
||||
}
|
||||
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||
|
||||
output = RequestFuncOutput()
|
||||
output.prompt_len = request_func_input.prompt_len
|
||||
|
||||
@ -222,7 +215,7 @@ async def async_request_deepspeed_mii(
|
||||
st = time.perf_counter()
|
||||
try:
|
||||
async with session.post(
|
||||
url=api_url, json=payload, headers=headers
|
||||
url=request_func_input.api_url, json=payload
|
||||
) as response:
|
||||
if response.status == 200:
|
||||
parsed_resp = await response.json()
|
||||
@ -324,7 +317,7 @@ async def async_request_openai_completions(
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
generated_text += text or ""
|
||||
if usage := data.get("usage"):
|
||||
elif usage := data.get("usage"):
|
||||
output.output_tokens = usage.get("completion_tokens")
|
||||
if first_chunk_received:
|
||||
output.success = True
|
||||
@ -611,7 +604,6 @@ ASYNC_REQUEST_FUNCS = {
|
||||
"tensorrt-llm": async_request_trt_llm,
|
||||
"scalellm": async_request_openai_completions,
|
||||
"sglang": async_request_openai_completions,
|
||||
"llama.cpp": async_request_openai_completions,
|
||||
}
|
||||
|
||||
OPENAI_COMPATIBLE_BACKENDS = [
|
||||
|
||||
@ -9,6 +9,9 @@ generation. Supported dataset types include:
|
||||
- BurstGPT
|
||||
- HuggingFace
|
||||
- VisionArena
|
||||
|
||||
TODO: Implement CustomDataset to parse a JSON file and convert its contents into
|
||||
SampleRequest instances, similar to the approach used in ShareGPT.
|
||||
"""
|
||||
|
||||
import base64
|
||||
@ -32,7 +35,6 @@ from transformers import PreTrainedTokenizerBase
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.utils import get_adapter_absolute_path
|
||||
from vllm.multimodal import MultiModalDataDict
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -255,7 +257,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
|
||||
if isinstance(image, dict) and "bytes" in image:
|
||||
image = Image.open(BytesIO(image["bytes"]))
|
||||
if isinstance(image, Image.Image):
|
||||
image = convert_image_mode(image, "RGB")
|
||||
image = image.convert("RGB")
|
||||
with io.BytesIO() as image_data:
|
||||
image.save(image_data, format="JPEG")
|
||||
image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
|
||||
@ -439,97 +441,6 @@ class ShareGPTDataset(BenchmarkDataset):
|
||||
return samples
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Custom Dataset Implementation
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class CustomDataset(BenchmarkDataset):
|
||||
"""
|
||||
Implements the Custom dataset. Loads data from a JSONL file and generates
|
||||
sample requests based on conversation turns. E.g.,
|
||||
```
|
||||
{"prompt": "What is the capital of India?"}
|
||||
{"prompt": "What is the capital of Iran?"}
|
||||
{"prompt": "What is the capital of China?"}
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs) -> None:
|
||||
super().__init__(**kwargs)
|
||||
self.load_data()
|
||||
|
||||
def load_data(self) -> None:
|
||||
if self.dataset_path is None:
|
||||
raise ValueError("dataset_path must be provided for loading data.")
|
||||
|
||||
# self.data will be a list of dictionaries
|
||||
# e.g., [{"prompt": "What is the capital of India?"}, ...]
|
||||
# This will be the standardized format which load_data()
|
||||
# has to convert into depending on the filetype of dataset_path.
|
||||
# sample() will assume this standardized format of self.data
|
||||
self.data = []
|
||||
|
||||
# Load the JSONL file
|
||||
if self.dataset_path.endswith(".jsonl"):
|
||||
jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
|
||||
|
||||
# check if the JSONL file has a 'prompt' column
|
||||
if "prompt" not in jsonl_data.columns:
|
||||
raise ValueError("JSONL file must contain a 'prompt' column.")
|
||||
|
||||
# Convert each row to a dictionary and append to self.data
|
||||
# This will convert the DataFrame to a list of dictionaries
|
||||
# where each dictionary corresponds to a row in the DataFrame.
|
||||
# This is the standardized format we want for self.data
|
||||
for _, row in jsonl_data.iterrows():
|
||||
self.data.append(row.to_dict())
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"Only JSONL format is supported for CustomDataset."
|
||||
)
|
||||
|
||||
random.seed(self.random_seed)
|
||||
random.shuffle(self.data)
|
||||
|
||||
def sample(
|
||||
self,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
num_requests: int,
|
||||
lora_path: Optional[str] = None,
|
||||
max_loras: Optional[int] = None,
|
||||
output_len: Optional[int] = None,
|
||||
enable_multimodal_chat: bool = False,
|
||||
skip_chat_template: bool = False,
|
||||
**kwargs,
|
||||
) -> list:
|
||||
sampled_requests = []
|
||||
for item in self.data:
|
||||
if len(sampled_requests) >= num_requests:
|
||||
break
|
||||
prompt = item["prompt"]
|
||||
|
||||
# apply template
|
||||
if not skip_chat_template:
|
||||
prompt = tokenizer.apply_chat_template(
|
||||
[{"role": "user", "content": prompt}],
|
||||
add_generation_prompt=True,
|
||||
tokenize=False,
|
||||
)
|
||||
|
||||
prompt_len = len(tokenizer(prompt).input_ids)
|
||||
sampled_requests.append(
|
||||
SampleRequest(
|
||||
prompt=prompt,
|
||||
prompt_len=prompt_len,
|
||||
expected_output_len=output_len,
|
||||
)
|
||||
)
|
||||
self.maybe_oversample_requests(sampled_requests, num_requests)
|
||||
|
||||
return sampled_requests
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Sonnet Dataset Implementation
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
@ -6,12 +6,13 @@ import dataclasses
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
import vllm.envs as envs
|
||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
@ -79,9 +80,17 @@ def main(args: argparse.Namespace):
|
||||
|
||||
def run_to_completion(profile_dir: Optional[str] = None):
|
||||
if profile_dir:
|
||||
llm.start_profile()
|
||||
llm_generate()
|
||||
llm.stop_profile()
|
||||
with torch.profiler.profile(
|
||||
activities=[
|
||||
torch.profiler.ProfilerActivity.CPU,
|
||||
torch.profiler.ProfilerActivity.CUDA,
|
||||
],
|
||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||
str(profile_dir)
|
||||
),
|
||||
) as p:
|
||||
llm_generate()
|
||||
print(p.key_averages().table(sort_by="self_cuda_time_total"))
|
||||
else:
|
||||
start_time = time.perf_counter()
|
||||
llm_generate()
|
||||
@ -94,7 +103,11 @@ def main(args: argparse.Namespace):
|
||||
run_to_completion(profile_dir=None)
|
||||
|
||||
if args.profile:
|
||||
profile_dir = envs.VLLM_TORCH_PROFILER_DIR
|
||||
profile_dir = args.profile_result_dir
|
||||
if not profile_dir:
|
||||
profile_dir = (
|
||||
Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"
|
||||
)
|
||||
print(f"Profiling (results will be saved to '{profile_dir}')...")
|
||||
run_to_completion(profile_dir=profile_dir)
|
||||
return
|
||||
@ -151,6 +164,15 @@ if __name__ == "__main__":
|
||||
action="store_true",
|
||||
help="profile the generation process of a single batch",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--profile-result-dir",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"path to save the pytorch profiler output. Can be visualized "
|
||||
"with ui.perfetto.dev or Tensorboard."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-json",
|
||||
type=str,
|
||||
@ -167,13 +189,5 @@ if __name__ == "__main__":
|
||||
)
|
||||
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
# V1 enables prefix caching by default which skews the latency
|
||||
# numbers. We need to disable prefix caching by default.
|
||||
parser.set_defaults(enable_prefix_caching=False)
|
||||
args = parser.parse_args()
|
||||
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
|
||||
raise OSError(
|
||||
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
|
||||
"Please set it to a valid path to use torch profiler."
|
||||
)
|
||||
main(args)
|
||||
|
||||
@ -60,7 +60,6 @@ from benchmark_dataset import (
|
||||
ASRDataset,
|
||||
BurstGPTDataset,
|
||||
ConversationDataset,
|
||||
CustomDataset,
|
||||
HuggingFaceDataset,
|
||||
InstructCoderDataset,
|
||||
MTBenchDataset,
|
||||
@ -276,7 +275,7 @@ async def benchmark(
|
||||
model_id: str,
|
||||
model_name: str,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
input_requests: list[SampleRequest],
|
||||
requests: list[SampleRequest],
|
||||
logprobs: Optional[int],
|
||||
request_rate: float,
|
||||
burstiness: float,
|
||||
@ -296,12 +295,14 @@ async def benchmark(
|
||||
raise ValueError(f"Unknown backend: {backend}")
|
||||
|
||||
print("Starting initial single prompt test run...")
|
||||
last_idx = len(requests) - 1
|
||||
test_prompt, test_prompt_len, test_output_len, test_mm_content = (
|
||||
input_requests[0].prompt,
|
||||
input_requests[0].prompt_len,
|
||||
input_requests[0].expected_output_len,
|
||||
input_requests[0].multi_modal_data,
|
||||
requests[last_idx].prompt,
|
||||
requests[last_idx].prompt_len,
|
||||
requests[last_idx].expected_output_len,
|
||||
requests[last_idx].multi_modal_data,
|
||||
)
|
||||
input_requests = requests[:last_idx]
|
||||
|
||||
assert test_mm_content is None or isinstance(test_mm_content, dict)
|
||||
test_input = RequestFuncInput(
|
||||
@ -616,6 +617,9 @@ def main(args: argparse.Namespace):
|
||||
api_url = f"http://{args.host}:{args.port}{args.endpoint}"
|
||||
base_url = f"http://{args.host}:{args.port}"
|
||||
|
||||
# Create one more request (for a test request)
|
||||
total_prompts = args.num_prompts + 1
|
||||
|
||||
tokenizer = get_tokenizer(
|
||||
tokenizer_id,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
@ -628,21 +632,12 @@ def main(args: argparse.Namespace):
|
||||
"'--dataset-path' if required."
|
||||
)
|
||||
|
||||
if args.dataset_name == "custom":
|
||||
dataset = CustomDataset(dataset_path=args.dataset_path)
|
||||
input_requests = dataset.sample(
|
||||
num_requests=args.num_prompts,
|
||||
tokenizer=tokenizer,
|
||||
output_len=args.custom_output_len,
|
||||
skip_chat_template=args.custom_skip_chat_template,
|
||||
)
|
||||
|
||||
elif args.dataset_name == "sonnet":
|
||||
if args.dataset_name == "sonnet":
|
||||
dataset = SonnetDataset(dataset_path=args.dataset_path)
|
||||
# For the "sonnet" dataset, formatting depends on the backend.
|
||||
if args.backend == "openai-chat":
|
||||
input_requests = dataset.sample(
|
||||
num_requests=args.num_prompts,
|
||||
num_requests=total_prompts,
|
||||
input_len=args.sonnet_input_len,
|
||||
output_len=args.sonnet_output_len,
|
||||
prefix_len=args.sonnet_prefix_len,
|
||||
@ -654,7 +649,7 @@ def main(args: argparse.Namespace):
|
||||
"Tokenizer/model must have chat template for sonnet dataset."
|
||||
)
|
||||
input_requests = dataset.sample(
|
||||
num_requests=args.num_prompts,
|
||||
num_requests=total_prompts,
|
||||
input_len=args.sonnet_input_len,
|
||||
output_len=args.sonnet_output_len,
|
||||
prefix_len=args.sonnet_prefix_len,
|
||||
@ -717,7 +712,7 @@ def main(args: argparse.Namespace):
|
||||
dataset_split=args.hf_split,
|
||||
random_seed=args.seed,
|
||||
).sample(
|
||||
num_requests=args.num_prompts,
|
||||
num_requests=total_prompts,
|
||||
tokenizer=tokenizer,
|
||||
output_len=args.hf_output_len,
|
||||
)
|
||||
@ -729,15 +724,15 @@ def main(args: argparse.Namespace):
|
||||
random_seed=args.seed, dataset_path=args.dataset_path
|
||||
).sample(
|
||||
tokenizer=tokenizer,
|
||||
num_requests=args.num_prompts,
|
||||
num_requests=total_prompts,
|
||||
output_len=args.sharegpt_output_len,
|
||||
),
|
||||
"burstgpt": lambda: BurstGPTDataset(
|
||||
random_seed=args.seed, dataset_path=args.dataset_path
|
||||
).sample(tokenizer=tokenizer, num_requests=args.num_prompts),
|
||||
).sample(tokenizer=tokenizer, num_requests=total_prompts),
|
||||
"random": lambda: RandomDataset(dataset_path=args.dataset_path).sample(
|
||||
tokenizer=tokenizer,
|
||||
num_requests=args.num_prompts,
|
||||
num_requests=total_prompts,
|
||||
prefix_len=args.random_prefix_len,
|
||||
input_len=args.random_input_len,
|
||||
output_len=args.random_output_len,
|
||||
@ -772,10 +767,6 @@ def main(args: argparse.Namespace):
|
||||
if "temperature" not in sampling_params:
|
||||
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
|
||||
|
||||
if args.backend == "llama.cpp":
|
||||
# Disable prompt caching in llama.cpp backend
|
||||
sampling_params["cache_prompt"] = False
|
||||
|
||||
# Avoid GC processing "static" data - reduce pause times.
|
||||
gc.collect()
|
||||
gc.freeze()
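The gc.collect()/gc.freeze() pair kept above is a standalone CPython trick: objects alive at freeze time move to a permanent generation that the collector skips, which trims GC pauses during the measured run. A small self-contained sketch of the same idea, with the workload as a stand-in:

import gc

def run_measured_section():
    # stand-in for the benchmark loop
    return sum(i * i for i in range(1_000_000))

gc.collect()   # collect garbage created during setup
gc.freeze()    # move the survivors out of the collector's reach
try:
    run_measured_section()
finally:
    gc.unfreeze()  # restore normal collection afterwards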
@ -788,7 +779,7 @@ def main(args: argparse.Namespace):
|
||||
model_id=model_id,
|
||||
model_name=model_name,
|
||||
tokenizer=tokenizer,
|
||||
input_requests=input_requests,
|
||||
requests=input_requests,
|
||||
logprobs=args.logprobs,
|
||||
request_rate=args.request_rate,
|
||||
burstiness=args.burstiness,
|
||||
@ -848,8 +839,6 @@ def main(args: argparse.Namespace):
|
||||
]:
|
||||
if field in result_json:
|
||||
del result_json[field]
|
||||
if field in benchmark_result:
|
||||
del benchmark_result[field]
|
||||
|
||||
# Save to file
|
||||
base_model_id = model_id.split("/")[-1]
|
||||
@ -862,7 +851,6 @@ def main(args: argparse.Namespace):
|
||||
if args.result_filename:
|
||||
file_name = args.result_filename
|
||||
if args.result_dir:
|
||||
os.makedirs(args.result_dir, exist_ok=True)
|
||||
file_name = os.path.join(args.result_dir, file_name)
|
||||
with open(
|
||||
file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
|
||||
@ -903,7 +891,7 @@ if __name__ == "__main__":
|
||||
"--dataset-name",
|
||||
type=str,
|
||||
default="sharegpt",
|
||||
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
|
||||
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
|
||||
help="Name of the dataset to benchmark on.",
|
||||
)
|
||||
parser.add_argument(
|
||||
@ -1073,19 +1061,6 @@ if __name__ == "__main__":
|
||||
)
|
||||
|
||||
# group for dataset specific arguments
|
||||
custom_group = parser.add_argument_group("custom dataset options")
|
||||
custom_group.add_argument(
|
||||
"--custom-output-len",
|
||||
type=int,
|
||||
default=256,
|
||||
help="Number of output tokens per request, used only for custom dataset.",
|
||||
)
|
||||
custom_group.add_argument(
|
||||
"--custom-skip-chat-template",
|
||||
action="store_true",
|
||||
help="Skip applying chat template to prompt, used only for custom dataset.",
|
||||
)
|
||||
|
||||
sonnet_group = parser.add_argument_group("sonnet dataset options")
|
||||
sonnet_group.add_argument(
|
||||
"--sonnet-input-len",
|
||||
|
||||
@ -672,7 +672,7 @@ async def benchmark(
|
||||
def evaluate(ret, args):
|
||||
def _eval_correctness_json(expected, actual):
|
||||
# extract json string from string using regex
|
||||
import regex as re
|
||||
import re
|
||||
|
||||
actual = actual.replace("\n", "").replace(" ", "").strip()
|
||||
try:
|
||||
@ -687,7 +687,7 @@ def evaluate(ret, args):
|
||||
return actual in args.choice
|
||||
|
||||
def _eval_correctness_regex(expected, actual):
|
||||
import regex as re
|
||||
import re
|
||||
|
||||
return re.match(args.regex, actual) is not None
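The import swap above only matters for the match() call sites in these helpers; the third-party regex package and the standard re module are call-compatible there. A tiny sketch with a made-up pattern standing in for args.regex:

import re

pattern = r"\d{3}-\d{4}"  # hypothetical stand-in for args.regex
print(re.match(pattern, "555-0199") is not None)        # True
print(re.match(pattern, "no digits here") is not None)  # False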
@ -1,222 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import argparse
|
||||
import copy
|
||||
import itertools
|
||||
|
||||
import torch
|
||||
import triton
|
||||
from weight_shapes import WEIGHT_SHAPES
|
||||
|
||||
from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
|
||||
from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
|
||||
|
||||
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["batch_size"],
|
||||
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
|
||||
x_log=False,
|
||||
line_arg="provider",
|
||||
line_vals=[
|
||||
"torch-bf16",
|
||||
# "fp8-tensor-w-token-a",
|
||||
"fp8-tensor-w-tensor-a",
|
||||
"fp8-channel-w-token-a",
|
||||
# "fp8-channel-w-tensor-a",
|
||||
# "fp8-tensor-w-token-a-noquant",
|
||||
"fp8-tensor-w-tensor-a-noquant",
|
||||
"fp8-channel-w-token-a-noquant",
|
||||
# "fp8-channel-w-tensor-a-noquant",
|
||||
],
|
||||
line_names=[
|
||||
"torch-bf16",
|
||||
# "fp8-tensor-w-token-a",
|
||||
"fp8-tensor-w-tensor-a",
|
||||
"fp8-channel-w-token-a",
|
||||
# "fp8-channel-w-tensor-a",
|
||||
# "fp8-tensor-w-token-a-noquant",
|
||||
"fp8-tensor-w-tensor-a-noquant",
|
||||
"fp8-channel-w-token-a-noquant",
|
||||
# "fp8-channel-w-tensor-a-noquant",
|
||||
],
|
||||
ylabel="TFLOP/s (larger is better)",
|
||||
plot_name="BF16 vs FP8 GEMMs",
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(batch_size, provider, N, K):
|
||||
M = batch_size
|
||||
device = "cuda"
|
||||
dtype = torch.bfloat16
|
||||
|
||||
# Create input tensors
|
||||
a = torch.randn((M, K), device=device, dtype=dtype)
|
||||
b = torch.randn((N, K), device=device, dtype=dtype)
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if "torch-bf16" in provider:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
|
||||
)
|
||||
|
||||
elif "fp8" in provider:
|
||||
# Weights are always quantized ahead of time
|
||||
if "noquant" in provider:
|
||||
# For no quantization, we just measure the GEMM
|
||||
if "tensor-w-token-a" in provider:
|
||||
# Dynamic per-token quant for A, per-tensor quant for B
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
|
||||
assert scale_b_fp8.numel() == 1
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
|
||||
a, use_per_token_if_dynamic=True
|
||||
)
|
||||
|
||||
def run_quant():
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
elif "tensor-w-tensor-a" in provider:
|
||||
# Static per-tensor quantization with fixed scales
|
||||
# for both A and B
|
||||
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||
scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||
assert scale_b_fp8.numel() == 1
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
|
||||
|
||||
def run_quant():
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
elif "channel-w-token-a" in provider:
|
||||
# Static per-channel quantization for weights, per-token
|
||||
# quant for A
|
||||
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
|
||||
assert scale_b_fp8.numel() == N
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
|
||||
a, use_per_token_if_dynamic=True
|
||||
)
|
||||
|
||||
def run_quant():
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
elif "channel-w-tensor-a" in provider:
|
||||
# Static per-channel quantization for weights, per-tensor
|
||||
# quant for A
|
||||
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
|
||||
assert scale_b_fp8.numel() == N
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
|
||||
|
||||
def run_quant():
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
else:
|
||||
# In these cases, we quantize the activations during the GEMM call
|
||||
if "tensor-w-token-a" in provider:
|
||||
# Dynamic per-token quant for A, per-tensor quant for B
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
|
||||
assert scale_b_fp8.numel() == 1
|
||||
|
||||
def run_quant():
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
|
||||
a, use_per_token_if_dynamic=True
|
||||
)
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
elif "tensor-w-tensor-a" in provider:
|
||||
# Static per-tensor quantization with fixed scales
|
||||
# for both A and B
|
||||
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||
scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||
assert scale_b_fp8.numel() == 1
|
||||
|
||||
def run_quant():
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
elif "channel-w-token-a" in provider:
|
||||
# Static per-channel quantization for weights, per-token
|
||||
# quant for A
|
||||
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
|
||||
assert scale_b_fp8.numel() == N
|
||||
|
||||
def run_quant():
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
|
||||
a, use_per_token_if_dynamic=True
|
||||
)
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
elif "channel-w-tensor-a" in provider:
|
||||
# Static per-channel quantization for weights, per-tensor
|
||||
# quant for A
|
||||
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
|
||||
assert scale_b_fp8.numel() == N
|
||||
|
||||
def run_quant():
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
b_fp8 = b_fp8.t()
|
||||
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: run_quant(), quantiles=quantiles
|
||||
)
|
||||
|
||||
# Calculate TFLOP/s, two flops per multiply-add
|
||||
tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3)
|
||||
return tflops(ms), tflops(max_ms), tflops(min_ms)
|
||||
|
||||
|
||||
def prepare_shapes(args):
|
||||
KN_model_names = []
|
||||
models_tps = list(itertools.product(args.models, args.tp_sizes))
|
||||
for model, tp_size in models_tps:
|
||||
assert model in WEIGHT_SHAPES
|
||||
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
|
||||
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
|
||||
KN.append(model)
|
||||
KN_model_names.append(KN)
|
||||
return KN_model_names
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--models",
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=["meta-llama/Llama-3.1-8B-Instruct"],
|
||||
choices=[*WEIGHT_SHAPES.keys()],
|
||||
help="List of models to benchmark",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tp-sizes",
|
||||
nargs="+",
|
||||
type=int,
|
||||
default=[1],
|
||||
help="List of tensor parallel sizes",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
KN_model_names = prepare_shapes(args)
|
||||
for K, N, model_name in KN_model_names:
|
||||
print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
|
||||
benchmark.run(
|
||||
print_data=True,
|
||||
show_plots=True,
|
||||
save_path=f"bench_fp8_res_n{N}_k{K}",
|
||||
N=N,
|
||||
K=K,
|
||||
)
|
||||
|
||||
print("Benchmark finished!")
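For reference, the core operation the removed script benchmarks (quantize to FP8, then run the scaled GEMM) can be reproduced in a few lines with the same two vLLM ops it imports. A hedged sketch: the shapes are arbitrary, and it assumes an FP8-capable GPU with a CUDA build of vLLM:

import torch
from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant

a = torch.randn((16, 4096), device="cuda", dtype=torch.bfloat16)    # activations (M, K)
b = torch.randn((6144, 4096), device="cuda", dtype=torch.bfloat16)  # weight (N, K)

# Dynamic per-tensor quantization for both operands.
a_fp8, scale_a = vllm_scaled_fp8_quant(a)
b_fp8, scale_b = vllm_scaled_fp8_quant(b)

# As in the script, the weight is transposed so the GEMM sees a column-major (K, N) operand.
out = vllm_scaled_mm(a_fp8, b_fp8.t(), scale_a, scale_b, torch.bfloat16)
print(out.shape)  # torch.Size([16, 6144])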
@ -84,10 +84,7 @@ def main(
|
||||
if version == "v2":
|
||||
if current_platform.is_rocm():
|
||||
global PARTITION_SIZE
|
||||
if not args.custom_paged_attn and not current_platform.is_navi():
|
||||
PARTITION_SIZE = 1024
|
||||
else:
|
||||
PARTITION_SIZE = PARTITION_SIZE_ROCM
|
||||
PARTITION_SIZE = 1024 if not args.custom_paged_attn else PARTITION_SIZE_ROCM
|
||||
num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
|
||||
tmp_output = torch.empty(
|
||||
size=(num_seqs, num_query_heads, num_partitions, head_size),
|
||||
@ -162,7 +159,6 @@ def main(
|
||||
scale,
|
||||
block_tables,
|
||||
seq_lens,
|
||||
None,
|
||||
block_size,
|
||||
max_seq_len,
|
||||
alibi_slopes,
|
||||
|
||||
@ -22,7 +22,7 @@ def benchmark_rope_kernels_multi_lora(
|
||||
seed: int,
|
||||
device: str,
|
||||
max_position: int = 8192,
|
||||
base: float = 10000,
|
||||
base: int = 10000,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
|
||||
@ -2,11 +2,11 @@
|
||||
|
||||
import math
|
||||
import pickle
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import regex as re
|
||||
import seaborn as sns
|
||||
from torch.utils.benchmark import Measurement as TMeasurement
|
||||
|
||||
|
||||
@ -48,50 +48,4 @@ WEIGHT_SHAPES = {
|
||||
([16384, 106496], 1),
|
||||
([53248, 16384], 0),
|
||||
],
|
||||
"meta-llama/Llama-3.1-8B-Instruct": [
|
||||
([4096, 6144], 1),
|
||||
([4096, 4096], 0),
|
||||
([4096, 28672], 1),
|
||||
([14336, 4096], 0),
|
||||
],
|
||||
"meta-llama/Llama-3.3-70B-Instruct": [
|
||||
([8192, 10240], 1),
|
||||
([8192, 8192], 0),
|
||||
([8192, 57344], 1),
|
||||
([28672, 8192], 0),
|
||||
],
|
||||
"mistralai/Mistral-Large-Instruct-2407": [
|
||||
([12288, 14336], 1),
|
||||
([12288, 12288], 0),
|
||||
([12288, 57344], 1),
|
||||
([28672, 12288], 0),
|
||||
],
|
||||
"Qwen/Qwen2.5-7B-Instruct": [
|
||||
([3584, 4608], 1),
|
||||
([3584, 3584], 0),
|
||||
([3584, 37888], 1),
|
||||
([18944, 3584], 0),
|
||||
],
|
||||
"Qwen/Qwen2.5-32B-Instruct": [
|
||||
([5120, 7168], 1),
|
||||
([5120, 5120], 0),
|
||||
([5120, 55296], 1),
|
||||
([27648, 5120], 0),
|
||||
],
|
||||
"Qwen/Qwen2.5-72B-Instruct": [
|
||||
([8192, 10240], 1),
|
||||
([8192, 8192], 0),
|
||||
([8192, 59136], 1),
|
||||
([29568, 8192], 0),
|
||||
],
|
||||
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
|
||||
([2048, 3072], 1),
|
||||
([2048, 4096], 1),
|
||||
([2048, 2048], 0),
|
||||
([2048, 576], 0),
|
||||
([2048, 21888], 1),
|
||||
([10944, 2048], 0),
|
||||
([2048, 2816], 1),
|
||||
([1408, 2048], 0),
|
||||
],
|
||||
}
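Read together with prepare_shapes from the benchmark script earlier in this diff, each (K, N) pair above has its TP-split dimension divided by the tensor-parallel size before the GEMM is timed. A small sketch using the first Llama-3.1-8B entry, with tp_size=2 chosen purely for illustration:

KN, tp_split_dim = [4096, 6144], 1   # entry ([4096, 6144], 1) from the table above
tp_size = 2
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
print(KN)  # [4096, 3072]: the per-rank GEMM runs with K=4096, N=3072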
@ -6,6 +6,11 @@
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 88
|
||||
exclude = [
|
||||
# External file, leaving license intact
|
||||
"examples/other/fp8/quantizer/quantize.py",
|
||||
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
|
||||
]
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
"vllm/third_party/**" = ["ALL"]
|
||||
|
||||
@ -46,38 +46,22 @@ else()
|
||||
endif()
|
||||
|
||||
|
||||
# Ensure the vllm/vllm_flash_attn directory exists before installation
|
||||
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS)
|
||||
|
||||
# Make sure vllm-flash-attn install rules are nested under vllm/
|
||||
# This is here to support installing all components under the same prefix with cmake --install.
|
||||
# setup.py installs every component separately but uses the same prefix for all.
|
||||
# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3,
|
||||
# and these statements don't hurt when installing neither component.
|
||||
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS)
|
||||
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
|
||||
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS)
|
||||
|
||||
# Fetch the vllm-flash-attn library
|
||||
FetchContent_MakeAvailable(vllm-flash-attn)
|
||||
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
|
||||
|
||||
# Restore the install prefix
|
||||
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
|
||||
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
||||
|
||||
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
|
||||
# case only one is built, in the case both are built redundant work is done)
|
||||
install(
|
||||
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||
DESTINATION vllm/vllm_flash_attn
|
||||
DESTINATION vllm_flash_attn
|
||||
COMPONENT _vllm_fa2_C
|
||||
FILES_MATCHING PATTERN "*.py"
|
||||
)
|
||||
|
||||
install(
|
||||
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||
DESTINATION vllm/vllm_flash_attn
|
||||
DESTINATION vllm_flash_attn
|
||||
COMPONENT _vllm_fa3_C
|
||||
FILES_MATCHING PATTERN "*.py"
|
||||
)
|
||||
|
||||
@ -76,7 +76,7 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
|
||||
set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
|
||||
add_custom_target(
|
||||
hipify${NAME}
|
||||
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
|
||||
COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
|
||||
DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
|
||||
BYPRODUCTS ${HIP_SRCS}
|
||||
COMMENT "Running hipify on ${NAME} extension source files.")
|
||||
|
||||
@ -143,14 +143,6 @@ void merge_attn_states_launcher(torch::Tensor& output,
|
||||
const uint pack_size = 16 / sizeof(scalar_t);
|
||||
TORCH_CHECK(head_size % pack_size == 0,
|
||||
"headsize must be multiple of pack_size:", pack_size);
|
||||
TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
|
||||
"output heads must be contiguous in memory");
|
||||
TORCH_CHECK(
|
||||
prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
|
||||
"prefix_output heads must be contiguous in memory");
|
||||
TORCH_CHECK(
|
||||
suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
|
||||
"suffix_output heads must be contiguous in memory");
|
||||
float* output_lse_ptr = nullptr;
|
||||
if (output_lse.has_value()) {
|
||||
output_lse_ptr = output_lse.value().data_ptr<float>();
|
||||
|
||||
@ -19,7 +19,6 @@ namespace vec_op {
|
||||
#define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)
|
||||
|
||||
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
|
||||
|
||||
@ -15,6 +15,15 @@
|
||||
cutlassGetStatusString(error)); \
|
||||
}
|
||||
|
||||
/**
|
||||
* Panic wrapper for unwinding CUDA runtime errors
|
||||
*/
|
||||
#define CUDA_CHECK(status) \
|
||||
{ \
|
||||
cudaError_t error = status; \
|
||||
TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \
|
||||
}
|
||||
|
||||
inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
|
||||
int max_shared_mem_per_block_opt_in = 0;
|
||||
cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
|
||||
|
||||
@ -13,10 +13,6 @@
|
||||
#include <cub/block/block_load.cuh>
|
||||
#include <cub/block/block_store.cuh>
|
||||
|
||||
#ifdef USE_ROCM
|
||||
namespace cub = hipcub;
|
||||
#endif
|
||||
|
||||
#include "static_switch.h"
|
||||
|
||||
|
||||
@ -505,9 +501,15 @@ void causal_conv1d_fwd_launch(ConvParamsBase ¶ms, cudaStream_t stream) {
|
||||
auto kernel = &causal_conv1d_fwd_kernel<Ktraits>;
|
||||
|
||||
if (kSmemSize >= 48 * 1024) {
|
||||
#ifndef USE_ROCM
|
||||
C10_CUDA_CHECK(cudaFuncSetAttribute(
|
||||
kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
|
||||
#else
|
||||
// There is a slight signature discrepancy in HIP and CUDA "FuncSetAttribute" function.
|
||||
C10_CUDA_CHECK(cudaFuncSetAttribute(
|
||||
(void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
|
||||
std::cerr << "Warning (causal_conv1d fwd launch): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. \n" << std::endl;
|
||||
#endif
|
||||
}
|
||||
kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
|
||||
|
||||
|
||||
@ -321,7 +321,7 @@ void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) {
|
||||
auto kernel = &selective_scan_fwd_kernel<Ktraits>;
|
||||
if (kSmemSize >= 48 * 1024) {
|
||||
C10_CUDA_CHECK(cudaFuncSetAttribute(
|
||||
(void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
|
||||
kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
|
||||
}
|
||||
kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
@ -28,6 +28,4 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
|
||||
torch::Tensor num_tokens_post_pad, int64_t top_k,
|
||||
int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
|
||||
int64_t BLOCK_SIZE_K, int64_t bit);
|
||||
#endif
|
||||
|
||||
bool moe_permute_unpermute_supported();
|
||||
#endif
|
||||
@ -5,9 +5,6 @@
|
||||
#include "permute_unpermute_kernels/dispatch.h"
|
||||
#include "core/registration.h"
|
||||
|
||||
// moe_permute kernels require at least CUDA 12.0
|
||||
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)
|
||||
|
||||
void moe_permute(
|
||||
const torch::Tensor& input, // [n_token, hidden]
|
||||
const torch::Tensor& topk_weights, //[n_token, topk]
|
||||
@ -130,45 +127,7 @@ void moe_unpermute(
|
||||
});
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
|
||||
torch::Tensor& topk_ids,
|
||||
const torch::Tensor& token_expert_indicies,
|
||||
const std::optional<torch::Tensor>& expert_map,
|
||||
int64_t n_expert, int64_t n_local_expert, int64_t topk,
|
||||
const std::optional<int64_t>& align_block_size,
|
||||
torch::Tensor& permuted_input,
|
||||
torch::Tensor& expert_first_token_offset,
|
||||
torch::Tensor& src_row_id2dst_row_id_map,
|
||||
torch::Tensor& m_indices) {
|
||||
TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0");
|
||||
}
|
||||
|
||||
void moe_unpermute(const torch::Tensor& input,
|
||||
const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
|
||||
const torch::Tensor& token_expert_indicies,
|
||||
const std::optional<torch::Tensor>& expert_map,
|
||||
int64_t n_expert, int64_t n_local_expert, int64_t topk,
|
||||
const std::optional<int64_t>& align_block_size,
|
||||
torch::Tensor& permuted_input,
|
||||
torch::Tensor& expert_first_token_offset,
|
||||
torch::Tensor& src_row_id2dst_row_id_map,
|
||||
torch::Tensor& m_indices) {
|
||||
TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0");
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
bool moe_permute_unpermute_supported() {
|
||||
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||
m.impl("moe_permute", &moe_permute);
|
||||
m.impl("moe_unpermute", &moe_unpermute);
|
||||
}
|
||||
}
|
||||
@ -1,9 +1,6 @@
|
||||
|
||||
#include "moe_permute_unpermute_kernel.h"
|
||||
|
||||
// moe_permute kernels require at least CUDA 12.0
|
||||
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)
|
||||
|
||||
// CubKeyValueSorter definition begin
|
||||
CubKeyValueSorter::CubKeyValueSorter()
|
||||
: num_experts_(0), num_bits_(sizeof(int) * 8) {}
|
||||
@ -134,6 +131,9 @@ __global__ void preprocessTopkIdKernel(int* topk_id_ptr, int size,
|
||||
int num_experts) {
|
||||
auto tidx = threadIdx.x;
|
||||
auto bidx = blockIdx.x;
|
||||
auto lidx = tidx & 31;
|
||||
auto widx = tidx >> 5;
|
||||
auto warp_count = (blockDim.x + 31) >> 5;
|
||||
auto offset = bidx * blockDim.x;
|
||||
auto bound = min(offset + blockDim.x, size);
|
||||
extern __shared__ int smem_expert_map[];
|
||||
@ -226,6 +226,4 @@ void getMIndices(int64_t* expert_first_token_offset,
|
||||
expert_first_token_offset, align_expert_first_token_offset, m_indices,
|
||||
num_local_expert, align_block_size);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
@ -10,7 +10,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
||||
|
||||
// Calculate the result of moe by summing up the partial results
|
||||
// from all selected experts.
|
||||
m.def("moe_sum(Tensor input, Tensor! output) -> ()");
|
||||
m.def("moe_sum(Tensor! input, Tensor output) -> ()");
|
||||
m.impl("moe_sum", torch::kCUDA, &moe_sum);
|
||||
|
||||
// Aligning the number of tokens to be processed by each expert such
|
||||
@ -77,9 +77,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
||||
"Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor "
|
||||
"expert_first_token_offset, int n_expert, int n_local_expert,int "
|
||||
"topk, Tensor! hidden_states)->()");
|
||||
|
||||
m.def("moe_permute_unpermute_supported() -> bool");
|
||||
m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
|
||||
// conditionally compiled so impl registration is in source file
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -123,7 +123,7 @@ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) {
|
||||
}
|
||||
|
||||
bool cutlass_group_gemm_supported(int64_t cuda_device_capability) {
|
||||
// CUTLASS grouped FP8 kernels need at least CUDA 12.3
|
||||
// CUTLASS groped FP8 kernels need at least CUDA 12.3
|
||||
// and SM90 (Hopper)
|
||||
|
||||
#if defined CUDA_VERSION
|
||||
|
||||
File diff suppressed because it is too large
@ -13,34 +13,14 @@
|
||||
#include "dispatch_utils.h"
|
||||
#include "quantization/fp8/common.cuh"
|
||||
|
||||
#if defined(__HIPCC__) && \
|
||||
(defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))
|
||||
#define __HIP__GFX9__
|
||||
#if defined(__HIPCC__) && (defined(__gfx90a__) || defined(__gfx942__))
|
||||
#define __HIP__MI300_MI250__
|
||||
#endif
|
||||
|
||||
#if defined(__HIPCC__) && (defined(__gfx942__) || defined(__gfx950__))
|
||||
#define __HIP__MI3XX__
|
||||
#if defined(__HIPCC__) && defined(__gfx942__)
|
||||
#define __HIP__MI300__
|
||||
#endif
|
||||
|
||||
#if defined(__gfx950__)
|
||||
#define LDS_SIZE 160 * 1024
|
||||
#else
|
||||
#define LDS_SIZE 64 * 1024
|
||||
#endif
|
||||
|
||||
int get_lds_size() {
|
||||
static bool is_cached = false;
|
||||
static int result;
|
||||
if (is_cached == false) {
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
std::string device_arch = dprops->gcnArchName;
|
||||
size_t substring = device_arch.find("gfx95");
|
||||
result = (substring == std::string::npos ? 64 * 1024 : 160 * 1024);
|
||||
is_cached = true;
|
||||
}
|
||||
return result;
|
||||
}
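The helper above keys the LDS budget off the gfx architecture string (160 KB on gfx95*, 64 KB otherwise). The same check can be mirrored from Python as a quick sanity test; a hedged sketch, since the gcnArchName attribute is only populated on ROCm builds of PyTorch:

import torch

props = torch.cuda.get_device_properties(0)
arch = getattr(props, "gcnArchName", "")  # empty on non-ROCm builds
lds_bytes = 160 * 1024 if "gfx95" in arch else 64 * 1024
print(arch or "<no gcnArchName>", lds_bytes)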
#if defined(NDEBUG)
|
||||
#undef NDEBUG
|
||||
#include <assert.h>
|
||||
@ -287,7 +267,7 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
|
||||
V0 += (s.x + s.y); \
|
||||
}
|
||||
|
||||
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support
|
||||
// This version targets cases where A[] fits LDS capacity
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
@ -295,8 +275,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
|
||||
const scalar_t* __restrict__ A, scalar_t* C,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
constexpr int max_lds_len = LDS_SIZE / 2;
|
||||
#if defined(__HIP__MI3XX__)
|
||||
#if defined(__HIP__MI300__)
|
||||
constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
|
||||
#else
|
||||
constexpr bool use_mfma = false;
|
||||
@ -316,13 +295,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
};
|
||||
|
||||
//----------------------------------------------------
|
||||
// Reserving 64/160 KB of LDS to have 1 WG / CU
|
||||
// Reserving 64 KB of LDS to have 1 WG / CU
|
||||
// Goal is to bring the activation matrix A to the LDS
|
||||
// and use it across the lifetime of the work group
|
||||
// TODO: When activation matrix is larger than 64 KB
|
||||
// then this is not going to work!
|
||||
//----------------------------------------------------
|
||||
__shared__ scalar_t s[max_lds_len];
|
||||
__shared__ scalar_t s[1024 * 32];
|
||||
|
||||
//----------------------------------------------------
|
||||
// Fetch the activation matrix to LDS
|
||||
@ -333,11 +312,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
// - Then the WG will move to another 8 K elements
|
||||
// TODO: Logic below will only work when K is multiple of 8
|
||||
//----------------------------------------------------
|
||||
for (uint32_t k = 0; k < min(K * N, max_lds_len);
|
||||
for (uint32_t k = 0; k < min(K * N, 32 * 1024);
|
||||
k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
||||
|
||||
if (k_in >= min(K * N, max_lds_len)) break;
|
||||
if (k_in >= min(K * N, 32 * 1024)) break;
|
||||
|
||||
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
||||
}
|
||||
@ -538,7 +517,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
m += CuCount * _WvPrGrp * YTILE;
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
__global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
|
||||
@ -546,9 +525,9 @@ __global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
||||
|
||||
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support
|
||||
// This version targets cases where A[] marginally exceeds LDS capacity
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
@ -556,8 +535,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
wvSplitK_hf_(const int K, const int M, const scalar_t* B,
|
||||
const scalar_t* __restrict__ A, scalar_t* C,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
constexpr int max_lds_len = LDS_SIZE / 2;
|
||||
#if defined(__HIP__MI3XX__)
|
||||
#if defined(__HIP__MI300__)
|
||||
constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
|
||||
#else
|
||||
constexpr bool use_mfma = false;
|
||||
@ -583,7 +561,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
// TODO: When activation matrix is larger than 64 KB
|
||||
// then this is not going to work!
|
||||
//----------------------------------------------------
|
||||
__shared__ scalar_t s[max_lds_len];
|
||||
__shared__ scalar_t s[1024 * 32];
|
||||
|
||||
//----------------------------------------------------
|
||||
// Computation of columns that need to be committed to memory!
|
||||
@ -620,11 +598,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
// - Then the WG will move to another 8 K elements
|
||||
// TODO: Logic below will only work when K is multiple of 8
|
||||
//----------------------------------------------------
|
||||
for (uint32_t k = 0; k < min(K * N, max_lds_len);
|
||||
for (uint32_t k = 0; k < min(K * N, 32 * 1024);
|
||||
k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
||||
|
||||
if (k_in >= min(K * N, max_lds_len)) break;
|
||||
if (k_in >= min(K * N, 32 * 1024)) break;
|
||||
|
||||
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
||||
}
|
||||
@ -708,7 +686,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
// Fetch A activation matrix in interleaved fashion from LDS or memory
|
||||
|
||||
for (int n = 0; n < N; n++) {
|
||||
if (k_ + K * n < max_lds_len)
|
||||
if (k_ + K * n < 32 * 1024)
|
||||
bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
|
||||
else
|
||||
bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
|
||||
@ -839,7 +817,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
|
||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
__global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B,
|
||||
@ -847,9 +825,9 @@ __global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
||||
|
||||
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support
|
||||
// This version targets big A[] cases, where it is much larger than LDS capacity
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
@ -857,8 +835,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
|
||||
const scalar_t* __restrict__ A, scalar_t* C,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
constexpr int max_lds_len = LDS_SIZE / 2;
|
||||
#if defined(__HIP__MI3XX__)
|
||||
#if defined(__HIP__MI300__)
|
||||
constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
|
||||
#else
|
||||
constexpr bool use_mfma = false;
|
||||
@ -878,13 +855,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
};
|
||||
|
||||
//----------------------------------------------------
|
||||
// Reserving 64/160 KB of LDS to have 1 WG / CU
|
||||
// Reserving 64 KB of LDS to have 1 WG / CU
|
||||
// Goal is to bring the activation matrix A to the LDS
|
||||
// and use it across the lifetime of the work group
|
||||
// TODO: When activation matrix is larger than 64 KB
|
||||
// then this is not going to work!
|
||||
//----------------------------------------------------
|
||||
__shared__ scalar_t s[max_lds_len];
|
||||
__shared__ scalar_t s[1024 * 32];
|
||||
|
||||
//----------------------------------------------------
|
||||
// Computation of columns that need to be committed to memory!
|
||||
@ -925,11 +902,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
//----------------------------------------------------
|
||||
#define PCML
|
||||
#ifndef PCML
|
||||
for (uint32_t k = 0; k < min(K * N, max_lds_len);
|
||||
for (uint32_t k = 0; k < min(K * N, 32 * 1024);
|
||||
k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
||||
|
||||
if (k_in >= min(K * N, max_lds_len)) break;
|
||||
if (k_in >= min(K * N, 32 * 1024)) break;
|
||||
|
||||
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
||||
}
|
||||
@ -939,7 +916,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
#define TUC (THRDS * UNRL * A_CHUNK)
|
||||
uint32_t kBase = 0;
|
||||
// find biggest k size that fits in LDS
|
||||
uint32_t kFit = (max_lds_len) / N;
|
||||
uint32_t kFit = (32 * 1024) / N;
|
||||
// kFit = (kFit%TWC==0) ? kFit : (kFit-kFit%TWC+TWC); //round up to multiple
|
||||
// of TUC
|
||||
kFit = (kFit % TUC == 0)
|
||||
@ -1187,7 +1164,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
__global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
|
||||
@ -1195,7 +1172,7 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
||||
|
||||
int mindiv(int N, int div1, int div2) {
|
||||
int nPrRnd = div1 * div2;
|
||||
@ -1245,18 +1222,17 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
|
||||
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
const int max_lds_len = get_lds_size() / 2;
|
||||
|
||||
#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
|
||||
_N) \
|
||||
{ \
|
||||
dim3 block(64, _WvPrGrp); \
|
||||
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
|
||||
if ((K_in * N_in <= 32 * 1024) && (M_in % _YTILEs == 0)) { \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
|
||||
wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
|
||||
CuCount); \
|
||||
} else if (K_in * N_in <= max_lds_len * 1.2) { \
|
||||
} else if (K_in * N_in <= 32 * 1024 * 1.2) { \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
|
||||
wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
|
||||
@ -1296,7 +1272,7 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
|
||||
return out_c;
|
||||
}
|
||||
|
||||
#if defined(__HIP__MI3XX__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__MI300__) // TODO: Add NAVI support
|
||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||
int A_CHUNK, int UNRL, int N>
|
||||
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
@ -1305,7 +1281,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
const float* __restrict__ s_A,
|
||||
const float* __restrict__ s_B, const int _WvPrGrp,
|
||||
const int CuCount) {
|
||||
constexpr int max_lds_len = LDS_SIZE;
|
||||
using scalar8 =
|
||||
__attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
|
||||
using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
||||
@ -1321,10 +1296,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
scalar8 h8;
|
||||
};
|
||||
|
||||
__shared__ fp8_t s[max_lds_len];
|
||||
__shared__ fp8_t s[1024 * 64];
|
||||
|
||||
for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
|
||||
k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
*((bigType*)(&s[k])) = *((bigType*)(&A[k]));
|
||||
}
|
||||
__syncthreads();
|
||||
@ -1461,7 +1436,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
m += CuCount * _WvPrGrp * YTILE;
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||
#else // !defined(__HIP__MI300__) TODO: Add NAVI support
|
||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||
int A_CHUNK, int UNRL, int N>
|
||||
__global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
|
||||
@ -1471,9 +1446,9 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||
#endif // defined(__HIP__MI300__) TODO: Add NAVI support
|
||||
|
||||
#if defined(__HIP__MI3XX__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__MI300__) // TODO: Add NAVI support
|
||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||
int A_CHUNK, int UNRL, int N>
|
||||
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
@ -1481,7 +1456,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
const fp8_t* __restrict__ A, scalar_t* C,
|
||||
const float* __restrict__ s_A, const float* __restrict__ s_B,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
constexpr int max_lds_len = LDS_SIZE;
|
||||
using scalar8 =
|
||||
__attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
|
||||
using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
||||
@ -1497,10 +1471,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
scalar8 h8;
|
||||
};
|
||||
|
||||
__shared__ fp8_t s[max_lds_len];
|
||||
__shared__ fp8_t s[1024 * 64];
|
||||
|
||||
for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
|
||||
k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
*((bigType*)(&s[k])) = *((bigType*)(&A[k]));
|
||||
}
|
||||
__syncthreads();
|
||||
@ -1543,7 +1517,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
uint32_t k_ = k + threadIdx.x * A_CHUNK;
|
||||
if (k_ >= K) break;
|
||||
for (int n = 0; n < N; n++) {
|
||||
if (k_ + K * n < max_lds_len)
|
||||
if (k_ + K * n < 64 * 1024)
|
||||
bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
|
||||
else
|
||||
bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
|
||||
@ -1634,7 +1608,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
m += CuCount * _WvPrGrp * YTILE;
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||
#else // !defined(__HIP__MI300__) TODO: Add NAVI support
|
||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||
int A_CHUNK, int UNRL, int N>
|
||||
__global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
|
||||
@ -1644,7 +1618,7 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
|
||||
const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||
#endif // defined(__HIP__MI300__) TODO: Add NAVI support
|
||||
|
||||
void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
|
||||
at::Tensor& scale_a, at::Tensor& scale_b,
|
||||
@ -1664,13 +1638,12 @@ void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
|
||||
dim3 grid(CuCount);
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
const int max_lds_len = get_lds_size();
|
||||
|
||||
#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
|
||||
_N) \
|
||||
{ \
|
||||
dim3 block(64, _WvPrGrp); \
|
||||
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
|
||||
if ((K_in * N_in <= 64 * 1024) && (M_in % _YTILEs == 0)) { \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
|
||||
wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \
|
||||
|
||||
@ -8,8 +8,6 @@
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
|
||||
#include "cuda_utils.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
@ -97,9 +95,9 @@ struct cutlass_sparse_3x_gemm {
|
||||
// clang-format off
|
||||
using CollectiveMainloop =
|
||||
typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp,
|
||||
ElementAB, cutlass::layout::RowMajor, AlignmentAB,
|
||||
ElementAB, cutlass::layout::ColumnMajor, AlignmentAB,
|
||||
cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp,
|
||||
ElementAB, cutlass::layout::RowMajor, AlignmentAB,
|
||||
ElementAB, cutlass::layout::ColumnMajor, AlignmentAB,
|
||||
ElementAcc, TileShape, ClusterShape,
|
||||
Stages,
|
||||
KernelSchedule>::CollectiveOp;
|
||||
|
||||
@ -482,6 +482,41 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
" Tensor page_table, float scale) -> ()");
|
||||
ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode);
|
||||
|
||||
// Mamba selective scan kernel
|
||||
ops.def(
|
||||
"selective_scan_fwd(Tensor! u, Tensor! delta,"
|
||||
"Tensor! A, Tensor! B, Tensor! C,"
|
||||
"Tensor? D_, Tensor!? z_, Tensor? delta_bias_,"
|
||||
"bool delta_softplus,"
|
||||
"Tensor? query_start_loc,"
|
||||
"Tensor? cache_indices,"
|
||||
"Tensor? has_initial_state,"
|
||||
"Tensor! ssm_states,"
|
||||
"int pad_slot_id) -> ()");
|
||||
ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
|
||||
|
||||
ops.def(
|
||||
"causal_conv1d_update(Tensor! x,"
|
||||
"Tensor! conv_state,"
|
||||
"Tensor! weight,"
|
||||
"Tensor? bias_,"
|
||||
"bool silu_activation,"
|
||||
"Tensor? cache_seqlens_,"
|
||||
"Tensor? conv_state_indices,"
|
||||
"int pad_slot_id) -> ()");
|
||||
ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update);
|
||||
|
||||
ops.def(
|
||||
"causal_conv1d_fwd(Tensor! x, Tensor! weight,"
|
||||
"Tensor? bias_,"
|
||||
"Tensor!? conv_states,"
|
||||
"Tensor? query_start_loc,"
|
||||
"Tensor? cache_indices,"
|
||||
"Tensor? has_initial_state,"
|
||||
"bool silu_activation,"
|
||||
"int pad_slot_id) -> ()");
|
||||
ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
|
||||
|
||||
// Compute NVFP4 block quantized tensor.
|
||||
ops.def(
|
||||
"scaled_fp4_quant(Tensor! output, Tensor input,"
|
||||
@ -549,41 +584,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
ops.impl("dynamic_scaled_int8_quant", torch::kCUDA,
|
||||
&dynamic_scaled_int8_quant);
|
||||
|
||||
// Mamba selective scan kernel
|
||||
ops.def(
|
||||
"selective_scan_fwd(Tensor! u, Tensor! delta,"
|
||||
"Tensor! A, Tensor! B, Tensor! C,"
|
||||
"Tensor? D_, Tensor!? z_, Tensor? delta_bias_,"
|
||||
"bool delta_softplus,"
|
||||
"Tensor? query_start_loc,"
|
||||
"Tensor? cache_indices,"
|
||||
"Tensor? has_initial_state,"
|
||||
"Tensor! ssm_states,"
|
||||
"int pad_slot_id) -> ()");
|
||||
ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
|
||||
|
||||
ops.def(
|
||||
"causal_conv1d_update(Tensor! x,"
|
||||
"Tensor! conv_state,"
|
||||
"Tensor! weight,"
|
||||
"Tensor? bias_,"
|
||||
"bool silu_activation,"
|
||||
"Tensor? cache_seqlens_,"
|
||||
"Tensor? conv_state_indices,"
|
||||
"int pad_slot_id) -> ()");
|
||||
ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update);
|
||||
|
||||
ops.def(
|
||||
"causal_conv1d_fwd(Tensor! x, Tensor! weight,"
|
||||
"Tensor? bias_,"
|
||||
"Tensor!? conv_states,"
|
||||
"Tensor? query_start_loc,"
|
||||
"Tensor? cache_indices,"
|
||||
"Tensor? has_initial_state,"
|
||||
"bool silu_activation,"
|
||||
"int pad_slot_id) -> ()");
|
||||
ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
// reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel
|
||||
ops.def(
|
||||
|
||||
@ -2,8 +2,8 @@
|
||||
# to run the OpenAI compatible server.
|
||||
|
||||
# Please update any changes made here to
|
||||
# docs/contributing/dockerfile/dockerfile.md and
|
||||
# docs/assets/contributing/dockerfile-stages-dependency.png
|
||||
# docs/source/contributing/dockerfile/dockerfile.md and
|
||||
# docs/source/assets/contributing/dockerfile-stages-dependency.png
|
||||
|
||||
ARG CUDA_VERSION=12.8.1
|
||||
#################### BASE BUILD IMAGE ####################
|
||||
@ -189,8 +189,6 @@ WORKDIR /vllm-workspace
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ARG TARGETPLATFORM
|
||||
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
|
||||
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
|
||||
|
||||
@ -257,17 +255,10 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
. /etc/environment && \
|
||||
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
|
||||
# FlashInfer alreary has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
|
||||
if [[ "$CUDA_VERSION" == 12.8* ]]; then \
|
||||
uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl; \
|
||||
else \
|
||||
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \
|
||||
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
|
||||
if [ "$CUDA_MAJOR" -lt 12 ]; then \
|
||||
export FLASHINFER_ENABLE_SM90=0; \
|
||||
fi; \
|
||||
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \
|
||||
fi \
|
||||
# uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.4/flashinfer_python-0.2.4+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
|
||||
# TESTING: install FlashInfer from source to test 2.7.0 final RC
|
||||
FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX' \
|
||||
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@e00e8cedbfcb220f328fd36aa8f529f869b01e6b" ; \
|
||||
fi
|
||||
COPY examples examples
|
||||
COPY benchmarks benchmarks
|
||||
@ -277,7 +268,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
. /etc/environment && \
|
||||
uv pip list
|
||||
|
||||
# Even when we build Flashinfer with AOT mode, there's still
|
||||
# Although we build Flashinfer with AOT mode, there's still
|
||||
# some issues w.r.t. JIT compilation. Therefore we need to
|
||||
# install build dependencies for JIT compilation.
|
||||
# TODO: Remove this once FlashInfer AOT wheel is fixed
|
||||
@ -305,11 +296,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
|
||||
if [ "$CUDA_MAJOR" -ge 12 ]; then \
|
||||
uv pip install --system -r requirements/dev.txt; \
|
||||
fi
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/dev.txt
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
@ -328,9 +316,7 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
|
||||
# will not be imported by other tests
|
||||
RUN mkdir test_docs
|
||||
RUN mv docs test_docs/
|
||||
RUN cp -r examples test_docs/
|
||||
RUN mv vllm test_docs/
|
||||
RUN mv mkdocs.yaml test_docs/
|
||||
#################### TEST IMAGE ####################
|
||||
|
||||
#################### OPENAI API SERVER ####################
|
||||
|
||||
@ -51,6 +51,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --upgrade pip && \
|
||||
uv pip install -r requirements/cpu.txt
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install intel-openmp==2024.2.1 intel_extension_for_pytorch==2.6.0
|
||||
|
||||
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
|
||||
|
||||
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# default base image
|
||||
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
|
||||
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.6.0-neuronx-py310-sdk2.23.0-ubuntu22.04"
|
||||
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04"
|
||||
|
||||
FROM $BASE_IMAGE
|
||||
|
||||
@ -22,7 +22,8 @@ WORKDIR ${APP_MOUNT}/vllm
|
||||
|
||||
RUN python3 -m pip install --upgrade pip
|
||||
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
|
||||
RUN python3 -m pip install neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
||||
RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
|
||||
RUN python3 -m pip install neuronx-cc==2.17.194.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
||||
RUN python3 -m pip install pytest
|
||||
|
||||
# uninstall transformers-neuronx package explicitly to avoid version conflict
|
||||
@ -48,8 +49,6 @@ RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
|
||||
RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps
|
||||
|
||||
RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
|
||||
|
||||
# overwrite entrypoint to run bash script
|
||||
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
|
||||
|
||||
|
||||
@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
|
||||
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
||||
ARG FA_BRANCH="1a7f4dfa"
|
||||
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
|
||||
ARG AITER_BRANCH="c1debd8"
|
||||
ARG AITER_BRANCH="5a77249"
|
||||
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
||||
|
||||
FROM ${BASE_IMAGE} AS base
|
||||
|
||||
@ -84,40 +84,16 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \
|
||||
rustup default stable && \
|
||||
rustup show
|
||||
|
||||
FROM python-install AS torch
|
||||
ARG TORCH_VERSION=2.7.0
|
||||
ENV _GLIBCXX_USE_CXX11_ABI=1
|
||||
ENV CARGO_HOME=/root/.cargo
|
||||
ENV RUSTUP_HOME=/root/.rustup
|
||||
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
|
||||
|
||||
WORKDIR /tmp
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
|
||||
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
|
||||
git clone https://github.com/pytorch/pytorch.git && \
|
||||
cd pytorch && \
|
||||
git checkout v2.7.0 && \
|
||||
git submodule sync && \
|
||||
git submodule update --init --recursive && \
|
||||
uv pip install cmake ninja && \
|
||||
uv pip install -r requirements.txt && \
|
||||
python setup.py bdist_wheel
|
||||
|
||||
|
||||
FROM python-install AS torch-vision
|
||||
# Install torchvision
|
||||
ARG TORCH_VERSION=2.7.0
|
||||
ARG TORCH_VERSION=2.7.0.dev20250304
|
||||
ARG TORCH_VISION_VERSION=v0.20.1
|
||||
WORKDIR /tmp
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
|
||||
git clone https://github.com/pytorch/vision.git && \
|
||||
cd vision && \
|
||||
git checkout $TORCH_VISION_VERSION && \
|
||||
TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
|
||||
uv pip install -v $TORCH_WHL_FILE && \
|
||||
uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \
|
||||
python setup.py bdist_wheel
|
||||
|
||||
FROM python-install AS hf-xet-builder
|
||||
@ -162,17 +138,15 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
|
||||
--mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
|
||||
--mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
|
||||
--mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
|
||||
sed -i '/^torch/d' requirements/build.txt && \
|
||||
ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
|
||||
VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
|
||||
HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \
|
||||
TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
|
||||
uv pip install -v \
|
||||
$ARROW_WHL_FILE \
|
||||
$VISION_WHL_FILE \
|
||||
$HF_XET_WHL_FILE \
|
||||
$TORCH_WHL_FILE \
|
||||
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \
|
||||
--index-strategy unsafe-best-match \
|
||||
-r requirements/build.txt \
|
||||
-r requirements/cpu.txt
|
||||
|
||||
@ -1,66 +0,0 @@
|
||||
nav:
|
||||
- Home:
|
||||
- vLLM: README.md
|
||||
- Getting Started:
|
||||
- getting_started/quickstart.md
|
||||
- getting_started/installation
|
||||
- Examples:
|
||||
- Offline Inference: examples/offline_inference
|
||||
- Online Serving: examples/online_serving
|
||||
- Others: examples/others
|
||||
- Quick Links:
|
||||
- User Guide: usage/README.md
|
||||
- Developer Guide: contributing/README.md
|
||||
- API Reference: api/README.md
|
||||
- CLI Reference: cli/README.md
|
||||
- Timeline:
|
||||
- Roadmap: https://roadmap.vllm.ai
|
||||
- Releases: https://github.com/vllm-project/vllm/releases
|
||||
- User Guide:
|
||||
- Summary: usage/README.md
|
||||
- usage/v1_guide.md
|
||||
- General:
|
||||
- usage/*
|
||||
- Inference and Serving:
|
||||
- serving/offline_inference.md
|
||||
- serving/openai_compatible_server.md
|
||||
- serving/*
|
||||
- serving/integrations
|
||||
- Deployment:
|
||||
- deployment/*
|
||||
- deployment/frameworks
|
||||
- deployment/integrations
|
||||
- Training: training
|
||||
- Configuration:
|
||||
- Summary: configuration/README.md
|
||||
- configuration/*
|
||||
- Models:
|
||||
- models/supported_models.md
|
||||
- models/generative_models.md
|
||||
- models/pooling_models.md
|
||||
- models/extensions
|
||||
- Features:
|
||||
- features/compatibility_matrix.md
|
||||
- features/*
|
||||
- features/quantization
|
||||
- Developer Guide:
|
||||
- Summary: contributing/README.md
|
||||
- General:
|
||||
- glob: contributing/*
|
||||
flatten_single_child_sections: true
|
||||
- Model Implementation: contributing/model
|
||||
- Design Documents:
|
||||
- V0: design
|
||||
- V1: design/v1
|
||||
- API Reference:
|
||||
- Summary: api/README.md
|
||||
- Contents:
|
||||
- glob: api/vllm/*
|
||||
preserve_directory_names: true
|
||||
- CLI Reference:
|
||||
- Summary: cli/README.md
|
||||
- Community:
|
||||
- community/*
|
||||
- Blog: https://blog.vllm.ai
|
||||
- Forum: https://discuss.vllm.ai
|
||||
- Slack: https://slack.vllm.ai
|
||||
25 docs/Makefile Normal file
@ -0,0 +1,25 @@
|
||||
# Minimal makefile for Sphinx documentation
|
||||
#
|
||||
|
||||
# You can set these variables from the command line, and also
|
||||
# from the environment for the first two.
|
||||
SPHINXOPTS ?=
|
||||
SPHINXBUILD ?= sphinx-build
|
||||
SOURCEDIR = source
|
||||
BUILDDIR = build
|
||||
|
||||
# Put it first so that "make" without argument is like "make help".
|
||||
help:
|
||||
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
|
||||
.PHONY: help Makefile
|
||||
|
||||
# Catch-all target: route all unknown targets to Sphinx using the new
|
||||
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
||||
%: Makefile
|
||||
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
|
||||
clean:
|
||||
@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
rm -rf "$(SOURCEDIR)/getting_started/examples"
|
||||
rm -rf "$(SOURCEDIR)/api/vllm"
|
||||
@ -1,50 +1,43 @@
|
||||
# Welcome to vLLM
|
||||
# vLLM documentation
|
||||
|
||||
<figure markdown="span">
|
||||
{ align="center" alt="vLLM" class="no-scaled-link" width="60%" }
|
||||
</figure>
|
||||
## Build the docs
|
||||
|
||||
<p style="text-align:center">
|
||||
<strong>Easy, fast, and cheap LLM serving for everyone
|
||||
</strong>
|
||||
</p>
|
||||
- Make sure you are in the `docs` directory
|
||||
|
||||
<p style="text-align:center">
|
||||
<script async defer src="https://buttons.github.io/buttons.js"></script>
|
||||
<a class="github-button" href="https://github.com/vllm-project/vllm" data-show-count="true" data-size="large" aria-label="Star">Star</a>
|
||||
<a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-show-count="true" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
|
||||
<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-show-count="true" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
|
||||
</p>
|
||||
```bash
|
||||
cd docs
|
||||
```
|
||||
|
||||
vLLM is a fast and easy-to-use library for LLM inference and serving.
|
||||
- Install the dependencies:
|
||||
|
||||
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
|
||||
```bash
|
||||
pip install -r ../requirements/docs.txt
|
||||
```
|
||||
|
||||
vLLM is fast with:
|
||||
- Clean the previous build (optional but recommended):
|
||||
|
||||
- State-of-the-art serving throughput
|
||||
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
|
||||
- Continuous batching of incoming requests
|
||||
- Fast model execution with CUDA/HIP graph
|
||||
- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8
|
||||
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
|
||||
- Speculative decoding
|
||||
- Chunked prefill
|
||||
```bash
|
||||
make clean
|
||||
```
|
||||
|
||||
vLLM is flexible and easy to use with:
|
||||
- Generate the HTML documentation:
|
||||
|
||||
- Seamless integration with popular HuggingFace models
|
||||
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
|
||||
- Tensor parallelism and pipeline parallelism support for distributed inference
|
||||
- Streaming outputs
|
||||
- OpenAI-compatible API server
|
||||
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
|
||||
- Prefix caching support
|
||||
- Multi-lora support
|
||||
```bash
|
||||
make html
|
||||
```
|
||||
|
||||
For more information, check out the following:
|
||||
## Open the docs with your browser
|
||||
|
||||
- [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention)
|
||||
- [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023)
|
||||
- [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al.
|
||||
- [vLLM Meetups][meetups]
|
||||
- Serve the documentation locally:
|
||||
|
||||
```bash
|
||||
python -m http.server -d build/html/
|
||||
```
|
||||
|
||||
This will start a local server at http://localhost:8000. You can now open your browser and view the documentation.
|
||||
|
||||
If port 8000 is already in use, you can specify a different port, for example:
|
||||
|
||||
```bash
|
||||
python -m http.server 3000 -d build/html/
|
||||
```
|
||||
|
||||
@ -1,107 +0,0 @@
|
||||
# Summary
|
||||
|
||||
[](){ #configuration }
|
||||
|
||||
## Configuration
|
||||
|
||||
API documentation for vLLM's configuration classes.
|
||||
|
||||
- [vllm.config.ModelConfig][]
|
||||
- [vllm.config.CacheConfig][]
|
||||
- [vllm.config.TokenizerPoolConfig][]
|
||||
- [vllm.config.LoadConfig][]
|
||||
- [vllm.config.ParallelConfig][]
|
||||
- [vllm.config.SchedulerConfig][]
|
||||
- [vllm.config.DeviceConfig][]
|
||||
- [vllm.config.SpeculativeConfig][]
|
||||
- [vllm.config.LoRAConfig][]
|
||||
- [vllm.config.PromptAdapterConfig][]
|
||||
- [vllm.config.MultiModalConfig][]
|
||||
- [vllm.config.PoolerConfig][]
|
||||
- [vllm.config.DecodingConfig][]
|
||||
- [vllm.config.ObservabilityConfig][]
|
||||
- [vllm.config.KVTransferConfig][]
|
||||
- [vllm.config.CompilationConfig][]
|
||||
- [vllm.config.VllmConfig][]
|
||||
|
||||
[](){ #offline-inference-api }
|
||||
|
||||
## Offline Inference
|
||||
|
||||
LLM Class.
|
||||
|
||||
- [vllm.LLM][]
|
||||
|
||||
LLM Inputs.
|
||||
|
||||
- [vllm.inputs.PromptType][]
|
||||
- [vllm.inputs.TextPrompt][]
|
||||
- [vllm.inputs.TokensPrompt][]
|
||||
|
||||
## vLLM Engines
|
||||
|
||||
Engine classes for offline and online inference.
|
||||
|
||||
- [vllm.LLMEngine][]
|
||||
- [vllm.AsyncLLMEngine][]
|
||||
|
||||
## Inference Parameters
|
||||
|
||||
Inference parameters for vLLM APIs.
|
||||
|
||||
[](){ #sampling-params }
|
||||
[](){ #pooling-params }
|
||||
|
||||
- [vllm.SamplingParams][]
|
||||
- [vllm.PoolingParams][]
|
||||
|
||||
[](){ #multi-modality }
|
||||
|
||||
## Multi-Modality
|
||||
|
||||
vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package.
|
||||
|
||||
Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models]
|
||||
via the `multi_modal_data` field in [vllm.inputs.PromptType][].
|
||||
|
||||
Looking to add your own multi-modal model? Please follow the instructions listed [here][supports-multimodal].
|
||||
|
||||
- [vllm.multimodal.MULTIMODAL_REGISTRY][]
|
||||
|
||||
### Inputs
|
||||
|
||||
User-facing inputs.
|
||||
|
||||
- [vllm.multimodal.inputs.MultiModalDataDict][]
|
||||
|
||||
Internal data structures.
|
||||
|
||||
- [vllm.multimodal.inputs.PlaceholderRange][]
|
||||
- [vllm.multimodal.inputs.NestedTensors][]
|
||||
- [vllm.multimodal.inputs.MultiModalFieldElem][]
|
||||
- [vllm.multimodal.inputs.MultiModalFieldConfig][]
|
||||
- [vllm.multimodal.inputs.MultiModalKwargsItem][]
|
||||
- [vllm.multimodal.inputs.MultiModalKwargs][]
|
||||
- [vllm.multimodal.inputs.MultiModalInputs][]
|
||||
|
||||
### Data Parsing
|
||||
|
||||
- [vllm.multimodal.parse][]
|
||||
|
||||
### Data Processing
|
||||
|
||||
- [vllm.multimodal.processing][]
|
||||
|
||||
### Memory Profiling
|
||||
|
||||
- [vllm.multimodal.profiling][]
|
||||
|
||||
### Registry
|
||||
|
||||
- [vllm.multimodal.registry][]
|
||||
|
||||
## Model Development
|
||||
|
||||
- [vllm.model_executor.models.interfaces_base][]
|
||||
- [vllm.model_executor.models.interfaces][]
|
||||
- [vllm.model_executor.models.adapters][]
|
||||
@ -1,2 +0,0 @@
|
||||
search:
|
||||
boost: 0.5
|
||||
@ -1,179 +0,0 @@
|
||||
# vLLM CLI Guide
|
||||
|
||||
The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
|
||||
|
||||
```
|
||||
vllm --help
|
||||
```
|
||||
|
||||
Available Commands:
|
||||
|
||||
```
|
||||
vllm {chat,complete,serve,bench,collect-env,run-batch}
|
||||
```
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [serve](#serve)
|
||||
- [chat](#chat)
|
||||
- [complete](#complete)
|
||||
- [bench](#bench)
|
||||
- [latency](#latency)
|
||||
- [serve](#serve-1)
|
||||
- [throughput](#throughput)
|
||||
- [collect-env](#collect-env)
|
||||
- [run-batch](#run-batch)
|
||||
- [More Help](#more-help)
|
||||
|
||||
## serve
|
||||
|
||||
Start the vLLM OpenAI Compatible API server.
|
||||
|
||||
Examples:
|
||||
|
||||
```bash
|
||||
# Start with a model
|
||||
vllm serve meta-llama/Llama-2-7b-hf
|
||||
|
||||
# Specify the port
|
||||
vllm serve meta-llama/Llama-2-7b-hf --port 8100
|
||||
|
||||
# Check with --help for more options
|
||||
# To list all groups
|
||||
vllm serve --help=listgroup
|
||||
|
||||
# To view an argument group
|
||||
vllm serve --help=ModelConfig
|
||||
|
||||
# To view a single argument
|
||||
vllm serve --help=max-num-seqs
|
||||
|
||||
# To search by keyword
|
||||
vllm serve --help=max
|
||||
```
|
||||
|
||||
## chat
|
||||
|
||||
Generate chat completions via the running API server.
|
||||
|
||||
Examples:
|
||||
|
||||
```bash
|
||||
# Directly connect to localhost API without arguments
|
||||
vllm chat
|
||||
|
||||
# Specify API url
|
||||
vllm chat --url http://{vllm-serve-host}:{vllm-serve-port}/v1
|
||||
|
||||
# Quick chat with a single prompt
|
||||
vllm chat --quick "hi"
|
||||
```
|
||||
|
||||
## complete
|
||||
|
||||
Generate text completions based on the given prompt via the running API server.
|
||||
|
||||
Examples:
|
||||
|
||||
```bash
|
||||
# Directly connect to localhost API without arguments
|
||||
vllm complete
|
||||
|
||||
# Specify API url
|
||||
vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1
|
||||
|
||||
# Quick complete with a single prompt
|
||||
vllm complete --quick "The future of AI is"
|
||||
```
|
||||
|
||||
## bench
|
||||
|
||||
Run benchmark tests for latency, online serving throughput, and offline inference throughput.
|
||||
|
||||
Available Commands:
|
||||
|
||||
```bash
|
||||
vllm bench {latency, serve, throughput}
|
||||
```
|
||||
|
||||
### latency
|
||||
|
||||
Benchmark the latency of a single batch of requests.
|
||||
|
||||
Example:
|
||||
|
||||
```bash
|
||||
vllm bench latency \
|
||||
--model meta-llama/Llama-3.2-1B-Instruct \
|
||||
--input-len 32 \
|
||||
--output-len 1 \
|
||||
--enforce-eager \
|
||||
--load-format dummy
|
||||
```
|
||||
|
||||
### serve
|
||||
|
||||
Benchmark the online serving throughput.
|
||||
|
||||
Example:
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--model meta-llama/Llama-3.2-1B-Instruct \
|
||||
--host server-host \
|
||||
--port server-port \
|
||||
--random-input-len 32 \
|
||||
--random-output-len 4 \
|
||||
--num-prompts 5
|
||||
```
|
||||
|
||||
### throughput
|
||||
|
||||
Benchmark offline inference throughput.
|
||||
|
||||
Example:
|
||||
|
||||
```bash
|
||||
vllm bench throughput \
|
||||
--model meta-llama/Llama-3.2-1B-Instruct \
|
||||
--input-len 32 \
|
||||
--output-len 1 \
|
||||
--enforce-eager \
|
||||
--load-format dummy
|
||||
```
|
||||
|
||||
## collect-env
|
||||
|
||||
Start collecting environment information.
|
||||
|
||||
```bash
|
||||
vllm collect-env
|
||||
```
|
||||
|
||||
## run-batch
|
||||
|
||||
Run batch prompts and write results to file.
|
||||
|
||||
Examples:
|
||||
|
||||
```bash
|
||||
# Running with a local file
|
||||
vllm run-batch \
|
||||
-i offline_inference/openai_batch/openai_example_batch.jsonl \
|
||||
-o results.jsonl \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct
|
||||
|
||||
# Using remote file
|
||||
vllm run-batch \
|
||||
-i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
|
||||
-o results.jsonl \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct
|
||||
```
|
||||
|
||||
## More Help
|
||||
|
||||
For detailed options of any subcommand, use:
|
||||
|
||||
```bash
|
||||
vllm <subcommand> --help
|
||||
```
|
||||
@ -1,9 +0,0 @@
|
||||
# Configuration Options
|
||||
|
||||
This section lists the most common options for running vLLM.
|
||||
|
||||
There are three main levels of configuration, from highest priority to lowest priority:
|
||||
|
||||
- [Request parameters][completions-api] and [input arguments][sampling-params]
|
||||
- [Engine arguments](./engine_args.md)
|
||||
- [Environment variables](./env_vars.md)
|
||||
@ -1,144 +0,0 @@
|
||||
# Conserving Memory
|
||||
|
||||
Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem.
|
||||
|
||||
## Tensor Parallelism (TP)
|
||||
|
||||
Tensor parallelism (`tensor_parallel_size` option) can be used to split the model across multiple GPUs.
|
||||
|
||||
The following code splits the model across 2 GPUs.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
|
||||
tensor_parallel_size=2)
|
||||
```
|
||||
|
||||
!!! warning
|
||||
To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.cuda.set_device][])
|
||||
before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
|
||||
|
||||
To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable.
|
||||
|
||||
!!! note
|
||||
With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
|
||||
|
||||
You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
|
||||
|
||||
## Quantization
|
||||
|
||||
Quantized models take less memory at the cost of lower precision.
|
||||
|
||||
Statically quantized models can be downloaded from HF Hub (some popular ones are available at [Red Hat AI](https://huggingface.co/RedHatAI))
|
||||
and used directly without extra configuration.
|
||||
|
||||
Dynamic quantization is also supported via the `quantization` option -- see [here][quantization-index] for more details.
|
||||
|
||||
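For instance, here is a minimal sketch of enabling dynamic FP8 quantization (the model name is only an example, and FP8 requires hardware that supports it):

```python
from vllm import LLM

# Dynamic FP8 quantization; needs a GPU with FP8 support.
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
          quantization="fp8")
```
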
## Context length and batch size
|
||||
|
||||
You can further reduce memory usage by limiting the context length of the model (`max_model_len` option)
|
||||
and the maximum batch size (`max_num_seqs` option).
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="adept/fuyu-8b",
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2)
|
||||
```
|
||||
|
||||
## Reduce CUDA Graphs
|
||||
|
||||
By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU.
|
||||
|
||||
!!! warning
|
||||
CUDA graph capture takes up more memory in V1 than in V0.
|
||||
|
||||
You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
from vllm.config import CompilationConfig, CompilationLevel
|
||||
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-3.1-8B-Instruct",
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
# By default, it goes up to max_num_seqs
|
||||
cudagraph_capture_sizes=[1, 2, 4, 8, 16],
|
||||
),
|
||||
)
|
||||
```
|
||||
|
||||
You can disable graph capturing completely via the `enforce_eager` flag:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
|
||||
enforce_eager=True)
|
||||
```
|
||||
|
||||
## Adjust cache size
|
||||
|
||||
If you run out of CPU RAM, try the following options:
|
||||
|
||||
- (Multi-modal models only) you can set the size of the multi-modal input cache using the `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB).
- (CPU backend only) you can set the size of the KV cache using the `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB), as sketched below.
|
||||
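A minimal sketch of lowering both caches (the values shown are only illustrative; set the variables before the engine is created):

```python
import os

# Shrink both caches to 2 GiB each (example values only).
os.environ["VLLM_MM_INPUT_CACHE_GIB"] = "2"
os.environ["VLLM_CPU_KVCACHE_SPACE"] = "2"

from vllm import LLM

llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")
```
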
|
||||
## Multi-modal input limits
|
||||
|
||||
You can allow a smaller number of multi-modal items per prompt to reduce the memory footprint of the model:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
# Accept up to 3 images and 1 video per prompt
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
limit_mm_per_prompt={"image": 3, "video": 1})
|
||||
```
|
||||
|
||||
You can go a step further and disable unused modalities completely by setting their limit to zero.
For example, if your application only accepts image input, there is no need to allocate any memory for videos.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
# Accept any number of images but no videos
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
limit_mm_per_prompt={"video": 0})
|
||||
```
|
||||
|
||||
You can even run a multi-modal model for text-only inference:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
# Don't accept images. Just text.
|
||||
llm = LLM(model="google/gemma-3-27b-it",
|
||||
limit_mm_per_prompt={"image": 0})
|
||||
```
|
||||
|
||||
## Multi-modal processor arguments
|
||||
|
||||
For certain models, you can adjust the multi-modal processor arguments to
|
||||
reduce the size of the processed multi-modal inputs, which in turn saves memory.
|
||||
|
||||
Here are some examples:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
# Available for Qwen2-VL series models
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_kwargs={
|
||||
"max_pixels": 768 * 768, # Default is 1280 * 28 * 28
|
||||
})
|
||||
|
||||
# Available for InternVL series models
|
||||
llm = LLM(model="OpenGVLab/InternVL2-2B",
|
||||
mm_processor_kwargs={
|
||||
"max_dynamic_patch": 4, # Default is 12
|
||||
})
|
||||
```
|
||||
@ -1,18 +0,0 @@
|
||||
---
|
||||
title: Engine Arguments
|
||||
---
|
||||
[](){ #engine-args }
|
||||
|
||||
Engine arguments control the behavior of the vLLM engine.
|
||||
|
||||
- For [offline inference][offline-inference], they are part of the arguments to [LLM][vllm.LLM] class.
|
||||
- For [online serving][openai-compatible-server], they are part of the arguments to `vllm serve`.
|
||||
|
||||
You can look at [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs] to see the available engine arguments.
|
||||
|
||||
However, these classes are a combination of the configuration classes defined in [vllm.config][], so we recommend reading about them there, where they are documented in the most detail.
|
||||
|
||||
For offline inference, you have direct access to these configuration classes; for online serving, you can cross-reference the configs with `vllm serve --help`, which groups its arguments by config.
|
||||
|
||||
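As a minimal sketch, the same engine arguments map directly between the offline and online entry points (the argument values below are arbitrary examples):

```python
from vllm import LLM

# Offline inference: engine arguments are keyword arguments of the LLM class.
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
          max_model_len=4096,
          gpu_memory_utilization=0.8)

# Online serving: the same arguments become CLI flags, e.g.
#   vllm serve meta-llama/Llama-3.1-8B-Instruct \
#       --max-model-len 4096 --gpu-memory-utilization 0.8
```
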
!!! note
|
||||
Additional arguments are available to the [AsyncLLMEngine][vllm.engine.async_llm_engine.AsyncLLMEngine], which is used for online serving. These can be found by running `vllm serve --help`.
|
||||
@ -1,12 +0,0 @@
|
||||
# Environment Variables
|
||||
|
||||
vLLM uses the following environment variables to configure the system:
|
||||
|
||||
!!! warning
|
||||
Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and IP address for vLLM's **internal usage**. They are not the port and IP address of the API server. If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work.
|
||||
|
||||
All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables).
|
||||
|
||||
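Before the full list below, here is a minimal sketch of overriding one of these variables for a single run (`VLLM_LOGGING_LEVEL` is used purely as an illustration):

```python
import os

# Set vLLM environment variables before the engine is imported and created.
os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"

from vllm import LLM

llm = LLM(model="facebook/opt-125m")
```
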
```python
|
||||
--8<-- "vllm/envs.py:env-vars-definition"
|
||||
```
|
||||
@ -1,23 +0,0 @@
|
||||
# Model Resolution
|
||||
|
||||
vLLM loads HuggingFace-compatible models by inspecting the `architectures` field in `config.json` of the model repository
|
||||
and finding the corresponding implementation that is registered to vLLM.
|
||||
Nevertheless, our model resolution may fail for the following reasons:
|
||||
|
||||
- The `config.json` of the model repository lacks the `architectures` field.
|
||||
- Unofficial repositories refer to a model using alternative names which are not recorded in vLLM.
|
||||
- The same architecture name is used for multiple models, creating ambiguity as to which model should be loaded.
|
||||
|
||||
To fix this, explicitly specify the model architecture by passing `config.json` overrides to the `hf_overrides` option.
|
||||
For example:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
model = LLM(
|
||||
model="cerebras/Cerebras-GPT-1.3B",
|
||||
hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2
|
||||
)
|
||||
```
|
||||
|
||||
Our [list of supported models][supported-models] shows the model architectures that are recognized by vLLM.
|
||||
@ -1,38 +0,0 @@
|
||||
---
|
||||
title: Server Arguments
|
||||
---
|
||||
[](){ #serve-args }
|
||||
|
||||
The `vllm serve` command is used to launch the OpenAI-compatible server.
|
||||
|
||||
## CLI Arguments
|
||||
|
||||
To see the available CLI arguments, run `vllm serve --help`!
|
||||
|
||||
## Configuration file
|
||||
|
||||
You can load CLI arguments via a [YAML](https://yaml.org/) config file.
|
||||
The argument names must be the long form of those outlined [above][serve-args].
|
||||
|
||||
For example:
|
||||
|
||||
```yaml
|
||||
# config.yaml
|
||||
|
||||
model: meta-llama/Llama-3.1-8B-Instruct
|
||||
host: "127.0.0.1"
|
||||
port: 6379
|
||||
uvicorn-log-level: "info"
|
||||
```
|
||||
|
||||
To use the above config file:
|
||||
|
||||
```bash
|
||||
vllm serve --config config.yaml
|
||||
```
|
||||
|
||||
!!! note
|
||||
If an argument is supplied both on the command line and in the config file, the value from the command line takes precedence.
The order of priorities is `command line > config file values > defaults`.
For example, with `vllm serve SOME_MODEL --config config.yaml`, `SOME_MODEL` takes precedence over the `model` setting in the config file.
|
||||
@ -1,23 +0,0 @@
|
||||
---
|
||||
title: Adding a New Model
|
||||
---
|
||||
[](){ #new-model }
|
||||
|
||||
This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM.
|
||||
|
||||
Contents:
|
||||
|
||||
- [Basic](basic.md)
|
||||
- [Registration](registration.md)
|
||||
- [Tests](tests.md)
|
||||
- [Multimodal](multimodal.md)
|
||||
|
||||
!!! note
|
||||
The complexity of adding a new model depends heavily on the model's architecture.
The process is fairly straightforward if the model shares a similar architecture with an existing model in vLLM.
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
|
||||
|
||||
!!! tip
|
||||
If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
|
||||
or ask on our [developer slack](https://slack.vllm.ai).
|
||||
We will be happy to help you out!
|
||||
@ -1,803 +0,0 @@
|
||||
---
|
||||
title: Multi-Modal Support
|
||||
---
|
||||
[](){ #supports-multimodal }
|
||||
|
||||
This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs][multimodal-inputs].
|
||||
|
||||
## 1. Update the base vLLM model
|
||||
|
||||
It is assumed that you have already implemented the model in vLLM according to [these steps][new-model-basic].
|
||||
Further update the model as follows:
|
||||
|
||||
- Reserve a keyword parameter in [forward][torch.nn.Module.forward] for each input tensor that corresponds to a multi-modal input, as shown in the following example:
|
||||
|
||||
```diff
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
+ pixel_values: torch.Tensor,
|
||||
) -> SamplerOutput:
|
||||
```
|
||||
|
||||
More conveniently, you can simply pass `**kwargs` to the [forward][torch.nn.Module.forward] method and retrieve the keyword parameters for multimodal inputs from it.
|
||||
|
||||
- Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.
|
||||
|
||||
```python
|
||||
class YourModelForImage2Seq(nn.Module):
|
||||
...
|
||||
|
||||
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
|
||||
|
||||
assert self.vision_encoder is not None
|
||||
image_features = self.vision_encoder(image_input)
|
||||
return self.multi_modal_projector(image_features)
|
||||
|
||||
def get_multimodal_embeddings(
|
||||
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
|
||||
|
||||
# Validate the multimodal input keyword arguments
|
||||
image_input = self._parse_and_validate_image_input(**kwargs)
|
||||
if image_input is None:
|
||||
return None
|
||||
|
||||
# Run multimodal inputs through encoder and projector
|
||||
vision_embeddings = self._process_image_input(image_input)
|
||||
return vision_embeddings
|
||||
```
|
||||
|
||||
!!! warning
|
||||
The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g., image) of the request.
|
||||
|
||||
- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
|
||||
|
||||
```python
|
||||
from .utils import merge_multimodal_embeddings
|
||||
|
||||
class YourModelForImage2Seq(nn.Module):
|
||||
...
|
||||
|
||||
def get_input_embeddings(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
|
||||
) -> torch.Tensor:
|
||||
|
||||
# `get_input_embeddings` should already be implemented for the language
|
||||
# model as one of the requirements of basic vLLM model implementation.
|
||||
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
|
||||
|
||||
if multimodal_embeddings is not None:
|
||||
inputs_embeds = merge_multimodal_embeddings(
|
||||
input_ids=input_ids,
|
||||
inputs_embeds=inputs_embeds,
|
||||
multimodal_embeddings=multimodal_embeddings,
|
||||
placeholder_token_id=self.config.image_token_index)
|
||||
|
||||
return inputs_embeds
|
||||
```
|
||||
|
||||
- Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.
|
||||
|
||||
```python
|
||||
class YourModelForImage2Seq(nn.Module):
|
||||
...
|
||||
|
||||
def get_language_model(self) -> torch.nn.Module:
|
||||
# Change `language_model` according to your implementation.
|
||||
return self.language_model
|
||||
```
|
||||
|
||||
- Once the above steps are done, update the model class with the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface.
|
||||
|
||||
```diff
|
||||
+ from vllm.model_executor.models.interfaces import SupportsMultiModal
|
||||
|
||||
- class YourModelForImage2Seq(nn.Module):
|
||||
+ class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
|
||||
```
|
||||
|
||||
!!! note
|
||||
The model class does not have to be named `*ForCausalLM`.
|
||||
Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples.
|
||||
|
||||
## 2. Specify processing information
|
||||
|
||||
Next, create a subclass of [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo]
|
||||
to provide basic information related to HF processing.
|
||||
|
||||
### Maximum number of input items
|
||||
|
||||
You need to override the abstract method [get_supported_mm_limits][vllm.multimodal.processing.BaseProcessingInfo.get_supported_mm_limits]
|
||||
to return the maximum number of input items for each modality supported by the model.
|
||||
|
||||
For example, if the model supports any number of images but only one video per prompt:
|
||||
|
||||
```python
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None, "video": 1}
|
||||
```
|
||||
|
||||
## 3. Specify dummy inputs
|
||||
|
||||
Then, inherit [BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] to construct dummy inputs for
|
||||
HF processing as well as memory profiling.
|
||||
|
||||
### For memory profiling
|
||||
|
||||
Override the abstract methods [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text] and [get_dummy_mm_data][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_mm_data] to construct dummy inputs for memory profiling. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it.
|
||||
|
||||
Assuming that the memory usage increases with the number of tokens, the dummy inputs can be constructed to maximize the number of output embeddings, which is the same as the number of placeholder feature tokens.
|
||||
|
||||
=== "Basic example: LLaVA"
|
||||
|
||||
Looking at the code of HF's `LlavaForConditionalGeneration`:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
|
||||
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
|
||||
n_image_features = image_features.shape[0] * image_features.shape[1]
|
||||
|
||||
if n_image_tokens != n_image_features:
|
||||
raise ValueError(
|
||||
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
||||
)
|
||||
special_image_mask = (
|
||||
(input_ids == self.config.image_token_index)
|
||||
.unsqueeze(-1)
|
||||
.expand_as(inputs_embeds)
|
||||
.to(inputs_embeds.device)
|
||||
)
|
||||
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
||||
```
|
||||
|
||||
The number of placeholder feature tokens per image is `image_features.shape[1]`.
|
||||
`image_features` is calculated inside the `get_image_features` method:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
|
||||
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
|
||||
|
||||
selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
|
||||
if vision_feature_select_strategy == "default":
|
||||
selected_image_feature = selected_image_feature[:, 1:]
|
||||
elif vision_feature_select_strategy == "full":
|
||||
selected_image_feature = selected_image_feature
|
||||
else:
|
||||
raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
|
||||
image_features = self.multi_modal_projector(selected_image_feature)
|
||||
return image_features
|
||||
```
|
||||
|
||||
We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower
|
||||
(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model).
|
||||
Moreover, we only need the sequence length (the second dimension of the tensor) to get `image_features.shape[1]`.
|
||||
The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention
|
||||
mechanism doesn't change the sequence length of the output hidden states.
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102
|
||||
hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
|
||||
hidden_states = self.pre_layrnorm(hidden_states)
|
||||
|
||||
encoder_outputs = self.encoder(
|
||||
inputs_embeds=hidden_states,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
```
|
||||
|
||||
To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
|
||||
target_dtype = self.patch_embedding.weight.dtype
|
||||
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
|
||||
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
|
||||
|
||||
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
|
||||
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
|
||||
if interpolate_pos_encoding:
|
||||
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
|
||||
else:
|
||||
embeddings = embeddings + self.position_embedding(self.position_ids)
|
||||
return embeddings
|
||||
```
|
||||
|
||||
We can infer that `embeddings.shape[1] == self.num_positions`, where
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196
|
||||
self.num_patches = (self.image_size // self.patch_size) ** 2
|
||||
self.num_positions = self.num_patches + 1
|
||||
```
|
||||
|
||||
Overall, the number of placeholder feature tokens for an image can be calculated as:
|
||||
|
||||
```python
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> int:
|
||||
hf_config = self.get_hf_config()
|
||||
hf_processor = self.get_hf_processor()
|
||||
|
||||
image_size = hf_config.vision_config.image_size
|
||||
patch_size = hf_config.vision_config.patch_size
|
||||
|
||||
num_image_tokens = (image_size // patch_size) ** 2 + 1
|
||||
if hf_processor.vision_feature_select_strategy == "default":
|
||||
num_image_tokens -= 1
|
||||
|
||||
return num_image_tokens
|
||||
```
|
||||
|
||||
Notice that the number of image tokens doesn't depend on the image width and height.
|
||||
We can simply use a dummy `image_size` to calculate the multimodal profiling data:
|
||||
|
||||
```python
|
||||
# NOTE: In actuality, this is usually implemented as part of the
|
||||
# model's subclass of `BaseProcessingInfo`, but we show it as is
|
||||
# here for simplicity.
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
|
||||
hf_config = self.get_hf_config()
|
||||
width = height = hf_config.image_size
|
||||
return ImageSize(width=width, height=height)
|
||||
|
||||
def get_dummy_mm_data(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
target_width, target_height = \
|
||||
self.info.get_image_size_with_most_features()
|
||||
|
||||
return {
|
||||
"image":
|
||||
self._get_dummy_images(width=target_width,
|
||||
height=target_height,
|
||||
num_images=num_images)
|
||||
}
|
||||
```
|
||||
|
||||
For the text, we simply expand the multimodal image token from the model config to match the desired number of images.
|
||||
|
||||
```python
|
||||
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
processor = self.info.get_hf_processor()
|
||||
image_token = processor.image_token
|
||||
|
||||
return image_token * num_images
|
||||
```
|
||||
|
||||
=== "No input placeholders: Fuyu"
|
||||
|
||||
Looking at the code of HF's `FuyuForCausalLM`:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322
|
||||
if image_patches is not None and past_key_values is None:
|
||||
patch_embeddings = [
|
||||
self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype))
|
||||
.squeeze(0)
|
||||
.to(inputs_embeds.device)
|
||||
for patch in image_patches
|
||||
]
|
||||
inputs_embeds = self.gather_continuous_embeddings(
|
||||
word_embeddings=inputs_embeds,
|
||||
continuous_embeddings=patch_embeddings,
|
||||
image_patch_input_indices=image_patches_indices,
|
||||
)
|
||||
```
|
||||
|
||||
The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`,
|
||||
which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`.
|
||||
|
||||
Unlike LLaVA, Fuyu does not define the number of patches inside the modeling file. Where can we get more information?
|
||||
Considering that the model input comes from the output of `FuyuProcessor`, let's **look at the preprocessing files**.
|
||||
|
||||
The image outputs are obtained by calling `FuyuImageProcessor.preprocess` and then
|
||||
`FuyuImageProcessor.preprocess_with_tokenizer_info` inside `FuyuProcessor`.
|
||||
|
||||
In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`,
|
||||
returning the dimensions after resizing (but before padding) as metadata.
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544
|
||||
image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
|
||||
batch_images = image_encoding["images"]
|
||||
image_unpadded_heights = image_encoding["image_unpadded_heights"]
|
||||
image_unpadded_widths = image_encoding["image_unpadded_widths"]
|
||||
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L
|
||||
if do_resize:
|
||||
batch_images = [
|
||||
[self.resize(image, size=size, input_data_format=input_data_format) for image in images]
|
||||
for images in batch_images
|
||||
]
|
||||
|
||||
image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
|
||||
image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
|
||||
image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]
|
||||
|
||||
if do_pad:
|
||||
batch_images = [
|
||||
[
|
||||
self.pad_image(
|
||||
image,
|
||||
size=size,
|
||||
mode=padding_mode,
|
||||
constant_values=padding_value,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
for images in batch_images
|
||||
]
|
||||
```
|
||||
|
||||
In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425
|
||||
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
|
||||
image_input=tensor_batch_images,
|
||||
image_present=image_present,
|
||||
image_unpadded_h=image_unpadded_heights,
|
||||
image_unpadded_w=image_unpadded_widths,
|
||||
image_placeholder_id=image_placeholder_id,
|
||||
image_newline_id=image_newline_id,
|
||||
variable_sized=True,
|
||||
)
|
||||
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658
|
||||
image_height, image_width = image.shape[1], image.shape[2]
|
||||
if variable_sized: # variable_sized=True
|
||||
new_h = min(
|
||||
image_height,
|
||||
math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
|
||||
)
|
||||
new_w = min(
|
||||
image_width,
|
||||
math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
|
||||
)
|
||||
image = image[:, :new_h, :new_w]
|
||||
image_height, image_width = new_h, new_w
|
||||
|
||||
num_patches = self.get_num_patches(image_height=image_height, image_width=image_width)
|
||||
tensor_of_image_ids = torch.full(
|
||||
[num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
|
||||
)
|
||||
patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
|
||||
assert num_patches == patches.shape[0]
|
||||
```
|
||||
|
||||
The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562
|
||||
patch_size = patch_size if patch_size is not None else self.patch_size
|
||||
patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]
|
||||
|
||||
if image_height % patch_height != 0:
|
||||
raise ValueError(f"{image_height=} must be divisible by {patch_height}")
|
||||
if image_width % patch_width != 0:
|
||||
raise ValueError(f"{image_width=} must be divisible by {patch_width}")
|
||||
|
||||
num_patches_per_dim_h = image_height // patch_height
|
||||
num_patches_per_dim_w = image_width // patch_width
|
||||
num_patches = num_patches_per_dim_h * num_patches_per_dim_w
|
||||
```
|
||||
|
||||
These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized
|
||||
to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.
|
||||
|
||||
```python
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
|
||||
image_processor = self.get_image_processor()
|
||||
return ImageSize(width=image_processor.size["width"],
|
||||
height=image_processor.size["height"])
|
||||
```
|
||||
|
||||
Fuyu does not expect image placeholders in the inputs to HF processor, so
|
||||
the dummy prompt text is empty regardless of the number of images.
|
||||
|
||||
```python
|
||||
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||
return ""
|
||||
```
|
||||
|
||||
For the multimodal image profiling data, the logic is very similar to LLaVA:
|
||||
|
||||
```python
|
||||
def get_dummy_mm_data(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> MultiModalDataDict:
|
||||
target_width, target_height = \
|
||||
self.info.get_image_size_with_most_features()
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
return {
|
||||
"image":
|
||||
self._get_dummy_images(width=target_width,
|
||||
height=target_height,
|
||||
num_images=num_images)
|
||||
}
|
||||
```
|
||||
|
||||
## 4. Specify processing details
|
||||
|
||||
Afterwards, create a subclass of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor]
|
||||
to fill in the missing details about HF processing.
|
||||
|
||||
!!! info
|
||||
[Multi-Modal Data Processing][mm-processing]
|
||||
|
||||
### Multi-modal fields
|
||||
|
||||
Override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] to
|
||||
return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items.
|
||||
|
||||
=== "Basic example: LLaVA"
|
||||
|
||||
The output of `CLIPImageProcessor` is a simple tensor with shape
|
||||
`(num_images, num_channels, image_height, image_width)`:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345
|
||||
images = [
|
||||
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
||||
for image in all_images
|
||||
]
|
||||
|
||||
data = {"pixel_values": images}
|
||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
```
|
||||
|
||||
So, we override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] as follows:
|
||||
|
||||
```python
|
||||
def _get_mm_fields_config(
|
||||
self,
|
||||
hf_inputs: BatchFeature,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
) -> Mapping[str, MultiModalFieldConfig]:
|
||||
return dict(
|
||||
pixel_values=MultiModalFieldConfig.batched("image"),
|
||||
)
|
||||
```
|
||||
|
||||
!!! note
|
||||
Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
|
||||
|
||||
=== "With postprocessing: Fuyu"
|
||||
|
||||
The `image_patches` output of `FuyuImageProcessor.preprocess_with_tokenizer_info` concatenates
|
||||
the patches from each image belonging to an item in the batch:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L673-L679
|
||||
image_input_ids.append(tensor_of_image_ids)
|
||||
image_patches.append(patches)
|
||||
else:
|
||||
image_input_ids.append(torch.tensor([], dtype=torch.int32, device=image_input.device))
|
||||
|
||||
batch_image_input_ids.append(image_input_ids)
|
||||
batch_image_patches.append(image_patches)
|
||||
```
|
||||
|
||||
The shape of `image_patches` outputted by `FuyuImageProcessor` is therefore
|
||||
`(1, num_images, num_patches, patch_width * patch_height * num_channels)`.
|
||||
|
||||
In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA,
|
||||
we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]:
|
||||
|
||||
```python
|
||||
def _call_hf_processor(
|
||||
self,
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
)
|
||||
|
||||
image_patches = processed_outputs.get("image_patches")
|
||||
if image_patches is not None:
|
||||
images = mm_data["images"]
|
||||
assert isinstance(images, list)
|
||||
|
||||
# Original output: (1, num_images, Pn, Px * Py * C)
|
||||
# New output: (num_images, Pn, Px * Py * C)
|
||||
assert (isinstance(image_patches, list)
|
||||
and len(image_patches) == 1)
|
||||
assert (isinstance(image_patches[0], torch.Tensor)
|
||||
and len(image_patches[0]) == len(images))
|
||||
|
||||
processed_outputs["image_patches"] = image_patches[0]
|
||||
|
||||
return processed_outputs
|
||||
```
|
||||
|
||||
!!! note
|
||||
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
|
||||
for text-only inputs to prevent unnecessary warnings from HF processor.
|
||||
|
||||
This lets us override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] as follows:
|
||||
|
||||
```python
|
||||
def _get_mm_fields_config(
|
||||
self,
|
||||
hf_inputs: BatchFeature,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
) -> Mapping[str, MultiModalFieldConfig]:
|
||||
return dict(image_patches=MultiModalFieldConfig.batched("image"))
|
||||
```
|
||||
|
||||
### Prompt updates
|
||||
|
||||
Override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] to
|
||||
return a list of [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instances.
|
||||
|
||||
Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies an update operation
|
||||
(e.g.: insertion, replacement) performed by the HF processor.
|
||||
|
||||
=== "Basic example: LLaVA"
|
||||
|
||||
Looking at HF's `LlavaProcessor`:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170
|
||||
prompt_strings = []
|
||||
for sample in text:
|
||||
sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
|
||||
prompt_strings.append(sample)
|
||||
```
|
||||
|
||||
It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
|
||||
Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows:
|
||||
|
||||
```python
|
||||
def _get_prompt_updates(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_config = self.info.get_hf_config()
|
||||
image_token_id = hf_config.image_token_index
|
||||
|
||||
def get_replacement(item_idx: int):
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
|
||||
image_size = images.get_image_size(item_idx)
|
||||
num_image_tokens = self.info.get_num_image_tokens(
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
|
||||
return [image_token_id] * num_image_tokens
|
||||
|
||||
return [
|
||||
PromptReplacement(
|
||||
modality="image",
|
||||
target=[image_token_id],
|
||||
replacement=get_replacement,
|
||||
),
|
||||
]
|
||||
```
|
||||
|
||||
=== "Handling additional tokens: Fuyu"
|
||||
|
||||
Recall the layout of feature tokens from Step 2:
|
||||
|
||||
```
|
||||
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
|
||||
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
|
||||
...
|
||||
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
|
||||
```
|
||||
|
||||
We define a helper function to return `ncols` and `nrows` directly:
|
||||
|
||||
```python
|
||||
def get_image_feature_grid_size(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> tuple[int, int]:
|
||||
image_processor = self.get_image_processor()
|
||||
target_width = image_processor.size["width"]
|
||||
target_height = image_processor.size["height"]
|
||||
patch_width = image_processor.patch_size["width"]
|
||||
patch_height = image_processor.patch_size["height"]
|
||||
|
||||
if not (image_width <= target_width and image_height <= target_height):
|
||||
height_scale_factor = target_height / image_height
|
||||
width_scale_factor = target_width / image_width
|
||||
optimal_scale_factor = min(height_scale_factor, width_scale_factor)
|
||||
|
||||
image_height = int(image_height * optimal_scale_factor)
|
||||
image_width = int(image_width * optimal_scale_factor)
|
||||
|
||||
ncols = math.ceil(image_width / patch_width)
|
||||
nrows = math.ceil(image_height / patch_height)
|
||||
return ncols, nrows
|
||||
```
|
||||
|
||||
Based on this, we can initially define our replacement tokens as:
|
||||
|
||||
```python
|
||||
def get_replacement(item_idx: int):
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
image_size = images.get_image_size(item_idx)
|
||||
|
||||
ncols, nrows = self.info.get_image_feature_grid_size(
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
|
||||
# `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|`
|
||||
# `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|`
|
||||
return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
|
||||
```
|
||||
|
||||
However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
a BOS token (`<s>`) is also added to the prompt:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
|
||||
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
|
||||
image_input=tensor_batch_images,
|
||||
image_present=image_present,
|
||||
image_unpadded_h=image_unpadded_heights,
|
||||
image_unpadded_w=image_unpadded_widths,
|
||||
image_placeholder_id=image_placeholder_id,
|
||||
image_newline_id=image_newline_id,
|
||||
variable_sized=True,
|
||||
)
|
||||
prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
|
||||
tokenizer=self.tokenizer,
|
||||
prompts=prompts,
|
||||
scale_factors=scale_factors,
|
||||
max_tokens_to_generate=self.max_tokens_to_generate,
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
add_BOS=True,
|
||||
add_beginning_of_answer_token=True,
|
||||
)
|
||||
```
|
||||
|
||||
To assign the vision embeddings to only the image tokens, instead of a string
|
||||
you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]:
|
||||
|
||||
```python
|
||||
hf_config = self.info.get_hf_config()
|
||||
bos_token_id = hf_config.bos_token_id # `<s>`
|
||||
assert isinstance(bos_token_id, int)
|
||||
|
||||
def get_replacement_fuyu(item_idx: int):
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
image_size = images.get_image_size(item_idx)
|
||||
|
||||
ncols, nrows = self.info.get_image_feature_grid_size(
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
||||
[_NEWLINE_TOKEN_ID]) * nrows
|
||||
|
||||
return PromptUpdateDetails.select_token_id(
|
||||
image_tokens + [bos_token_id],
|
||||
embed_token_id=_IMAGE_TOKEN_ID,
|
||||
)
|
||||
```
|
||||
|
||||
Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
|
||||
we can search for it to conduct the replacement at the start of the string:
|
||||
|
||||
```python
|
||||
def _get_prompt_updates(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_config = self.info.get_hf_config()
|
||||
bos_token_id = hf_config.bos_token_id
|
||||
assert isinstance(bos_token_id, int)
|
||||
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
eot_token_id = tokenizer.bos_token_id
|
||||
assert isinstance(eot_token_id, int)
|
||||
|
||||
def get_replacement_fuyu(item_idx: int):
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
image_size = images.get_image_size(item_idx)
|
||||
|
||||
ncols, nrows = self.info.get_image_feature_grid_size(
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
||||
[_NEWLINE_TOKEN_ID]) * nrows
|
||||
|
||||
return PromptUpdateDetails.select_token_id(
|
||||
image_tokens + [bos_token_id],
|
||||
embed_token_id=_IMAGE_TOKEN_ID,
|
||||
)
|
||||
|
||||
return [
|
||||
PromptReplacement(
|
||||
modality="image",
|
||||
target=[eot_token_id],
|
||||
replacement=get_replacement_fuyu,
|
||||
)
|
||||
]
|
||||
```
|
||||
|
||||
## 5. Register processor-related classes
|
||||
|
||||
After you have defined [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo] (Step 2),
|
||||
[BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] (Step 3),
|
||||
and [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] (Step 4),
|
||||
decorate the model class with [MULTIMODAL_REGISTRY.register_processor][vllm.multimodal.registry.MultiModalRegistry.register_processor]
|
||||
to register them to the multi-modal registry:
|
||||
|
||||
```diff
|
||||
from vllm.model_executor.models.interfaces import SupportsMultiModal
|
||||
+ from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
|
||||
+ info=YourProcessingInfo,
|
||||
+ dummy_inputs=YourDummyInputsBuilder)
|
||||
class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
### Inserting feature tokens without replacement
|
||||
|
||||
Some HF processors directly insert feature tokens without replacing anything in the original prompt. In that case, you can use [PromptInsertion][vllm.multimodal.processing.PromptInsertion] instead of [PromptReplacement][vllm.multimodal.processing.PromptReplacement] inside [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates].
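
Based on the same pattern as the LLaVA example above, a minimal sketch of such an override might look like the following (the `PromptIndexTargets.start()` target, the `insertion` keyword, and the helper names are assumptions for illustration; see the examples below for the exact API used by each model):

```python
def _get_prompt_updates(
    self,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]:
    hf_config = self.info.get_hf_config()
    image_token_id = hf_config.image_token_index  # assumed config attribute

    def get_insertion(item_idx: int):
        # Insert one placeholder token per image feature; a real model
        # would compute this count from the processed image size.
        num_image_tokens = self.info.get_num_image_tokens()
        return [image_token_id] * num_image_tokens

    return [
        # Insert the feature tokens at the start of the prompt instead of
        # replacing an existing placeholder token.
        PromptInsertion(
            modality="image",
            target=PromptIndexTargets.start(),
            insertion=get_insertion,
        ),
    ]
```
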
Examples:
|
||||
|
||||
- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
|
||||
- Florence2 (insert at start of prompt): <gh-file:vllm/model_executor/models/florence2.py>
|
||||
- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
|
||||
|
||||
### Handling prompt updates unrelated to multi-modal data
|
||||
|
||||
[_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] assumes that each application of prompt update corresponds to one multi-modal item. If the HF processor performs additional processing regardless of how many multi-modal items there are, you should override [_apply_hf_processor_tokens_only][vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_tokens_only] so that the processed token inputs are consistent with the result of applying the HF processor on text inputs. This is because token inputs bypass the HF processor according to [our design][mm-processing].
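
A minimal sketch of such an override, assuming the HF processor always appends a single extra token on text inputs (the `boa_token_id` lookup here is hypothetical and only for illustration):

```python
def _apply_hf_processor_tokens_only(
    self,
    prompt_tokens: list[int],
) -> list[int]:
    # The HF processor appends a "beginning of answer" token to every
    # text prompt, so mirror that behavior for token inputs as well.
    # NOTE: `boa_token_id` is a hypothetical attribute used for illustration.
    boa_token_id = self.info.get_hf_config().boa_token_id
    return prompt_tokens + [boa_token_id]
```
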
Examples:
|
||||
|
||||
- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py>
|
||||
- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py>
|
||||
- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py>
|
||||
|
||||
### Custom HF processor
|
||||
|
||||
Some models don't define a HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to [_call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor].
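
As a loose sketch (all names below are illustrative, and the exact hook you plug it into may differ between vLLM versions), such a processor only needs to be callable like an HF processor and return a `BatchFeature`:

```python
from transformers import BatchFeature


class YourCustomProcessor:
    """Stand-in for a missing HF processor class (illustrative only)."""

    def __init__(self, tokenizer, image_processor):
        self.tokenizer = tokenizer
        self.image_processor = image_processor

    def __call__(self, text=None, images=None, return_tensors="pt"):
        # Tokenize the text and preprocess the images the same way the
        # model's original preprocessing pipeline does.
        outputs = dict(self.tokenizer(text, return_tensors=return_tensors))
        if images is not None:
            outputs.update(self.image_processor(images, return_tensors=return_tensors))
        return BatchFeature(outputs)
```
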
Examples:
|
||||
|
||||
- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py>
|
||||
- InternVL: <gh-file:vllm/model_executor/models/internvl.py>
|
||||
- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py>
|
||||
@ -1,54 +0,0 @@
|
||||
---
|
||||
title: Registering a Model to vLLM
|
||||
---
|
||||
[](){ #new-model-registration }
|
||||
|
||||
vLLM relies on a model registry to determine how to run each model.
|
||||
A list of pre-registered architectures can be found [here][supported-models].
|
||||
|
||||
If your model is not on this list, you must register it to vLLM.
|
||||
This page provides detailed instructions on how to do so.
|
||||
|
||||
## Built-in models
|
||||
|
||||
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source].
|
||||
This gives you the ability to modify the codebase and test your model.
|
||||
|
||||
After you have implemented your model (see [tutorial][new-model-basic]), put it into the <gh-dir:vllm/model_executor/models> directory.
|
||||
Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
|
||||
Finally, update our [list of supported models][supported-models] to promote your model!
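
For reference, the entries in that file map an architecture name (as it appears in the model's `config.json`) to the module and class that implement it; a hypothetical addition (the exact table name and entry format may vary between versions) looks roughly like:

```python
# vllm/model_executor/models/registry.py (illustrative excerpt)
_TEXT_GENERATION_MODELS = {
    # ... existing entries ...
    # "ArchitectureName": (module under vllm/model_executor/models/, class name)
    "YourModelForCausalLM": ("your_model", "YourModelForCausalLM"),
}
```
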
!!! warning
|
||||
The list of models in each section should be maintained in alphabetical order.
|
||||
|
||||
## Out-of-tree models
|
||||
|
||||
You can load an external model [using a plugin][plugin-system] without modifying the vLLM codebase.
|
||||
|
||||
To register the model, use the following code:
|
||||
|
||||
```python
|
||||
# The entrypoint of your plugin
|
||||
def register():
|
||||
from vllm import ModelRegistry
|
||||
from your_code import YourModelForCausalLM
|
||||
|
||||
ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
|
||||
```
|
||||
|
||||
If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`:
|
||||
|
||||
```python
|
||||
# The entrypoint of your plugin
|
||||
def register():
|
||||
from vllm import ModelRegistry
|
||||
|
||||
ModelRegistry.register_model(
|
||||
"YourModelForCausalLM",
|
||||
"your_code:YourModelForCausalLM"
|
||||
)
|
||||
```
|
||||
|
||||
!!! warning
|
||||
If your model is a multimodal model, ensure the model class implements the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface.
|
||||
Read more about that [here][supports-multimodal].
|
||||
@ -1,129 +0,0 @@
|
||||
---
|
||||
title: Using Docker
|
||||
---
|
||||
[](){ #deployment-docker }
|
||||
|
||||
[](){ #deployment-docker-pre-built-image }
|
||||
|
||||
## Use vLLM's Official Docker Image
|
||||
|
||||
vLLM offers an official Docker image for deployment.
|
||||
The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags).
|
||||
|
||||
```console
|
||||
docker run --runtime nvidia --gpus all \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
|
||||
-p 8000:8000 \
|
||||
--ipc=host \
|
||||
vllm/vllm-openai:latest \
|
||||
--model mistralai/Mistral-7B-v0.1
|
||||
```
|
||||
|
||||
This image can also be used with other container engines such as [Podman](https://podman.io/).
|
||||
|
||||
```console
|
||||
podman run --gpus all \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
-p 8000:8000 \
|
||||
--ipc=host \
|
||||
vllm/vllm-openai:latest \
|
||||
--model mistralai/Mistral-7B-v0.1
|
||||
```
|
||||
|
||||
You can add any other [engine-args][engine-args] you need after the image tag (`vllm/vllm-openai:latest`).
|
||||
|
||||
!!! note
|
||||
You can either use the `ipc=host` flag or `--shm-size` flag to allow the
|
||||
container to access the host's shared memory. vLLM uses PyTorch, which uses shared
|
||||
memory to share data between processes under the hood, particularly for tensor parallel inference.
|
||||
|
||||
!!! note
|
||||
Optional dependencies are not included in order to avoid licensing issues (e.g. <gh-issue:8030>).
|
||||
|
||||
If you need to use those dependencies (having accepted the license terms),
|
||||
create a custom Dockerfile on top of the base image with an extra layer that installs them:
|
||||
|
||||
```Dockerfile
|
||||
FROM vllm/vllm-openai:v0.8.3
|
||||
|
||||
# e.g. install the `audio` optional dependencies
|
||||
# NOTE: Make sure the version of vLLM matches the base image!
|
||||
RUN uv pip install --system vllm[audio]==0.8.3
|
||||
```
|
||||
|
||||
!!! tip
|
||||
Some new models may only be available on the main branch of [HF Transformers](https://github.com/huggingface/transformers).
|
||||
|
||||
To use the development version of `transformers`, create a custom Dockerfile on top of the base image
|
||||
with an extra layer that installs their code from source:
|
||||
|
||||
```Dockerfile
|
||||
FROM vllm/vllm-openai:latest
|
||||
|
||||
RUN uv pip install --system git+https://github.com/huggingface/transformers.git
|
||||
```
|
||||
|
||||
[](){ #deployment-docker-build-image-from-source }
|
||||
|
||||
## Building vLLM's Docker Image from Source
|
||||
|
||||
You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:
|
||||
|
||||
```console
|
||||
# optionally specify: --build-arg max_jobs=8 --build-arg nvcc_threads=2
|
||||
DOCKER_BUILDKIT=1 docker build . \
|
||||
--target vllm-openai \
|
||||
--tag vllm/vllm-openai \
|
||||
--file docker/Dockerfile
|
||||
```
|
||||
|
||||
!!! note
|
||||
By default vLLM will build for all GPU types for the widest distribution. If you are only building for the
GPU type of the machine you are running on, you can add the argument `--build-arg torch_cuda_arch_list=""`
so that vLLM detects the current GPU type and builds for it.
|
||||
|
||||
If you are using Podman instead of Docker, you might need to disable SELinux labeling by
|
||||
adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184).
|
||||
|
||||
## Building for Arm64/aarch64
|
||||
|
||||
A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use
|
||||
of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
|
||||
|
||||
!!! note
|
||||
Multiple modules must be compiled, so this process can take a while. We recommend using the `--build-arg max_jobs=` and `--build-arg nvcc_threads=`
flags to speed up the build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefit.
|
||||
Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
|
||||
|
||||
```console
|
||||
# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
|
||||
python3 use_existing_torch.py
|
||||
DOCKER_BUILDKIT=1 docker build . \
|
||||
--file docker/Dockerfile \
|
||||
--target vllm-openai \
|
||||
--platform "linux/arm64" \
|
||||
-t vllm/vllm-gh200-openai:latest \
|
||||
--build-arg max_jobs=66 \
|
||||
--build-arg nvcc_threads=2 \
|
||||
--build-arg torch_cuda_arch_list="9.0+PTX" \
|
||||
--build-arg vllm_fa_cmake_gpu_arches="90-real"
|
||||
```
|
||||
|
||||
## Use the custom-built vLLM Docker image
|
||||
|
||||
To run vLLM with the custom-built Docker image:
|
||||
|
||||
```console
|
||||
docker run --runtime nvidia --gpus all \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
-p 8000:8000 \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
|
||||
vllm/vllm-openai <args...>
|
||||
```
|
||||
|
||||
The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command).
|
||||
|
||||
!!! note
|
||||
**For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` .
|
||||
@ -1,83 +0,0 @@
|
||||
---
|
||||
title: AutoGen
|
||||
---
|
||||
[](){ #deployment-autogen }
|
||||
|
||||
[AutoGen](https://github.com/microsoft/autogen) is a framework for creating multi-agent AI applications that can act autonomously or work alongside humans.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Setup vLLM environment
|
||||
|
||||
- Setup [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment
|
||||
|
||||
```console
|
||||
pip install vllm
|
||||
|
||||
# Install AgentChat and OpenAI client from Extensions
|
||||
# AutoGen requires Python 3.10 or later.
|
||||
pip install -U "autogen-agentchat" "autogen-ext[openai]"
|
||||
```
|
||||
|
||||
## Deploy
|
||||
|
||||
- Start the vLLM server with the supported chat completion model, e.g.
|
||||
|
||||
```console
|
||||
python -m vllm.entrypoints.openai.api_server \
|
||||
--model mistralai/Mistral-7B-Instruct-v0.2
|
||||
```
|
||||
|
||||
- Call it with AutoGen:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from autogen_core.models import UserMessage
|
||||
from autogen_ext.models.openai import OpenAIChatCompletionClient
|
||||
from autogen_core.models import ModelFamily
|
||||
|
||||
|
||||
async def main() -> None:
|
||||
# Create a model client
|
||||
model_client = OpenAIChatCompletionClient(
|
||||
model="mistralai/Mistral-7B-Instruct-v0.2",
|
||||
base_url="http://{your-vllm-host-ip}:{your-vllm-host-port}/v1",
|
||||
api_key="EMPTY",
|
||||
model_info={
|
||||
"vision": False,
|
||||
"function_calling": False,
|
||||
"json_output": False,
|
||||
"family": ModelFamily.MISTRAL,
|
||||
"structured_output": True,
|
||||
},
|
||||
)
|
||||
|
||||
messages = [UserMessage(content="Write a very short story about a dragon.", source="user")]
|
||||
|
||||
# Create a stream.
|
||||
stream = model_client.create_stream(messages=messages)
|
||||
|
||||
# Iterate over the stream and print the responses.
|
||||
print("Streamed responses:")
|
||||
async for response in stream:
|
||||
if isinstance(response, str):
|
||||
# A partial response is a string.
|
||||
print(response, flush=True, end="")
|
||||
else:
|
||||
# The last response is a CreateResult object with the complete message.
|
||||
print("\n\n------------\n")
|
||||
print("The complete response:", flush=True)
|
||||
print(response.content, flush=True)
|
||||
|
||||
# Close the client when done.
|
||||
await model_client.close()
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
For details, see the tutorial:
|
||||
|
||||
- [Using vLLM in AutoGen](https://microsoft.github.io/autogen/0.2/docs/topics/non-openai-models/local-vllm/)
|
||||
|
||||
- [OpenAI-compatible API examples](https://microsoft.github.io/autogen/stable/reference/python/autogen_ext.models.openai.html#autogen_ext.models.openai.OpenAIChatCompletionClient)
|
||||
@ -1,60 +0,0 @@
|
||||
---
|
||||
title: Haystack
|
||||
---
|
||||
[](){ #deployment-haystack }
|
||||
|
||||
# Haystack
|
||||
|
||||
[Haystack](https://github.com/deepset-ai/haystack) is an end-to-end LLM framework that allows you to build applications powered by LLMs, Transformer models, vector search and more. Whether you want to perform retrieval-augmented generation (RAG), document search, question answering or answer generation, Haystack can orchestrate state-of-the-art embedding models and LLMs into pipelines to build end-to-end NLP applications and solve your use case.
|
||||
|
||||
It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Setup vLLM and Haystack environment
|
||||
|
||||
```console
|
||||
pip install vllm haystack-ai
|
||||
```
|
||||
|
||||
## Deploy
|
||||
|
||||
- Start the vLLM server with the supported chat completion model, e.g.
|
||||
|
||||
```console
|
||||
vllm serve mistralai/Mistral-7B-Instruct-v0.1
|
||||
```
|
||||
|
||||
- Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server.
|
||||
|
||||
```python
|
||||
from haystack.components.generators.chat import OpenAIChatGenerator
|
||||
from haystack.dataclasses import ChatMessage
|
||||
from haystack.utils import Secret
|
||||
|
||||
generator = OpenAIChatGenerator(
|
||||
# for compatibility with the OpenAI API, a placeholder api_key is needed
|
||||
api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
|
||||
model="mistralai/Mistral-7B-Instruct-v0.1",
|
||||
api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
|
||||
generation_kwargs = {"max_tokens": 512}
|
||||
)
|
||||
|
||||
response = generator.run(
|
||||
messages=[ChatMessage.from_user("Hi. Can you help me plan my next trip to Italy?")]
|
||||
)
|
||||
|
||||
print("-"*30)
|
||||
print(response)
|
||||
print("-"*30)
|
||||
```
|
||||
|
||||
Example output:
|
||||
|
||||
```console
|
||||
------------------------------
|
||||
{'replies': [ChatMessage(_role=<ChatRole.ASSISTANT: 'assistant'>, _content=[TextContent(text=' Of course! Where in Italy would you like to go and what type of trip are you looking to plan?')], _name=None, _meta={'model': 'mistralai/Mistral-7B-Instruct-v0.1', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 23, 'prompt_tokens': 21, 'total_tokens': 44, 'completion_tokens_details': None, 'prompt_tokens_details': None}})]}
|
||||
------------------------------
|
||||
```
|
||||
|
||||
For details, see the tutorial [Using vLLM in Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/vllm.md).
|
||||
@ -1,95 +0,0 @@
|
||||
---
|
||||
title: Helm
|
||||
---
|
||||
[](){ #deployment-helm }
|
||||
|
||||
A Helm chart to deploy vLLM for Kubernetes
|
||||
|
||||
Helm is a package manager for Kubernetes. It helps you deploy vLLM on Kubernetes and automate the deployment of vLLM applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values.

This guide walks you through the process of deploying vLLM with Helm, including the necessary prerequisites, the steps for Helm installation, and documentation on the architecture and the values file.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before you begin, ensure that you have the following:
|
||||
|
||||
- A running Kubernetes cluster
|
||||
- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin)
|
||||
- Available GPU resources in your cluster
|
||||
- S3 with the model which will be deployed
|
||||
|
||||
## Installing the chart
|
||||
|
||||
To install the chart with the release name `test-vllm`:
|
||||
|
||||
```console
|
||||
helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY
|
||||
```
|
||||
|
||||
## Uninstalling the Chart
|
||||
|
||||
To uninstall the `test-vllm` deployment:
|
||||
|
||||
```console
|
||||
helm uninstall test-vllm --namespace=ns-vllm
|
||||
```
|
||||
|
||||
The command removes all the Kubernetes components associated with the
|
||||
chart **including persistent volumes** and deletes the release.
|
||||
|
||||
## Architecture
|
||||
|
||||

|
||||
|
||||
## Values
|
||||
|
||||
| Key | Type | Default | Description |
|
||||
|--------------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| autoscaling | object | {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} | Autoscaling configuration |
|
||||
| autoscaling.enabled | bool | false | Enable autoscaling |
|
||||
| autoscaling.maxReplicas | int | 100 | Maximum replicas |
|
||||
| autoscaling.minReplicas | int | 1 | Minimum replicas |
|
||||
| autoscaling.targetCPUUtilizationPercentage | int | 80 | Target CPU utilization for autoscaling |
|
||||
| configs | object | {} | Configmap |
|
||||
| containerPort | int | 8000 | Container port |
|
||||
| customObjects | list | [] | Custom Objects configuration |
|
||||
| deploymentStrategy | object | {} | Deployment strategy configuration |
|
||||
| externalConfigs | list | [] | External configuration |
|
||||
| extraContainers | list | [] | Additional containers configuration |
|
||||
| extraInit | object | {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} | Additional configuration for the init container |
|
||||
| extraInit.pvcStorage | string | "50Gi" | Storage size of the s3 |
|
||||
| extraInit.s3modelpath | string | "relative_s3_model_path/opt-125m" | Path of the model on the s3 which hosts model weights and config files |
|
||||
| extraInit.awsEc2MetadataDisabled | boolean | true | Disables the use of the Amazon EC2 instance metadata service |
|
||||
| extraPorts | list | [] | Additional ports configuration |
|
||||
| gpuModels | list | ["TYPE_GPU_USED"] | Type of gpu used |
|
||||
| image | object | {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} | Image configuration |
|
||||
| image.command | list | ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] | Container launch command |
|
||||
| image.repository | string | "vllm/vllm-openai" | Image repository |
|
||||
| image.tag | string | "latest" | Image tag |
|
||||
| livenessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} | Liveness probe configuration |
|
||||
| livenessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive |
|
||||
| livenessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the Kubelet http request on the server |
|
||||
| livenessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server |
|
||||
| livenessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening |
|
||||
| livenessProbe.initialDelaySeconds | int | 15 | Number of seconds after the container has started before liveness probe is initiated |
|
||||
| livenessProbe.periodSeconds | int | 10 | How often (in seconds) to perform the liveness probe |
|
||||
| maxUnavailablePodDisruptionBudget | string | "" | Disruption Budget Configuration |
|
||||
| readinessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} | Readiness probe configuration |
|
||||
| readinessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready |
|
||||
| readinessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the Kubelet http request on the server |
|
||||
| readinessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server |
|
||||
| readinessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening |
|
||||
| readinessProbe.initialDelaySeconds | int | 5 | Number of seconds after the container has started before readiness probe is initiated |
|
||||
| readinessProbe.periodSeconds | int | 5 | How often (in seconds) to perform the readiness probe |
|
||||
| replicaCount | int | 1 | Number of replicas |
|
||||
| resources | object | {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} | Resource configuration |
|
||||
| resources.limits."nvidia.com/gpu" | int | 1 | Number of gpus used |
|
||||
| resources.limits.cpu | int | 4 | Number of CPUs |
|
||||
| resources.limits.memory | string | "16Gi" | CPU memory configuration |
|
||||
| resources.requests."nvidia.com/gpu" | int | 1 | Number of gpus used |
|
||||
| resources.requests.cpu | int | 4 | Number of CPUs |
|
||||
| resources.requests.memory | string | "16Gi" | CPU memory configuration |
|
||||
| secrets | object | {} | Secrets configuration |
|
||||
| serviceName | string | | Service name |
|
||||
| servicePort | int | 80 | Service port |
|
||||
| labels.environment | string | test | Environment name |
|
||||
@ -1,76 +0,0 @@
|
||||
---
|
||||
title: LiteLLM
|
||||
---
|
||||
[](){ #deployment-litellm }
|
||||
|
||||
[LiteLLM](https://github.com/BerriAI/litellm) lets you call all LLM APIs using the OpenAI format (Bedrock, Hugging Face, VertexAI, TogetherAI, Azure, OpenAI, Groq, etc.).
|
||||
|
||||
LiteLLM manages:
|
||||
|
||||
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
|
||||
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
|
||||
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
|
||||
- Set Budgets & Rate limits per project, api key, model [LiteLLM Proxy Server (LLM Gateway)](https://docs.litellm.ai/docs/simple_proxy)
|
||||
|
||||
LiteLLM also supports all models served by vLLM.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Setup vLLM and litellm environment
|
||||
|
||||
```console
|
||||
pip install vllm litellm
|
||||
```
|
||||
|
||||
## Deploy
|
||||
|
||||
### Chat completion
|
||||
|
||||
- Start the vLLM server with the supported chat completion model, e.g.
|
||||
|
||||
```console
|
||||
vllm serve qwen/Qwen1.5-0.5B-Chat
|
||||
```
|
||||
|
||||
- Call it with litellm:
|
||||
|
||||
```python
|
||||
import litellm
|
||||
|
||||
messages = [{ "content": "Hello, how are you?","role": "user"}]
|
||||
|
||||
# hosted_vllm is prefix key word and necessary
|
||||
response = litellm.completion(
|
||||
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
|
||||
messages=messages,
|
||||
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
|
||||
temperature=0.2,
|
||||
max_tokens=80)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
### Embeddings
|
||||
|
||||
- Start the vLLM server with the supported embedding model, e.g.
|
||||
|
||||
```console
|
||||
vllm serve BAAI/bge-base-en-v1.5
|
||||
```
|
||||
|
||||
- Call it with litellm:
|
||||
|
||||
```python
|
||||
from litellm import embedding
|
||||
import os
|
||||
|
||||
os.environ["HOSTED_VLLM_API_BASE"] = "http://{your-vllm-server-host}:{your-vllm-server-port}/v1"
|
||||
|
||||
# hosted_vllm is prefix key word and necessary
|
||||
# pass the vllm model name
|
||||
embedding = embedding(model="hosted_vllm/BAAI/bge-base-en-v1.5", input=["Hello world"])
|
||||
|
||||
print(embedding)
|
||||
```
|
||||
|
||||
For details, see the tutorial [Using vLLM in LiteLLM](https://docs.litellm.ai/docs/providers/vllm).
|
||||
@ -1,498 +0,0 @@
|
||||
---
|
||||
title: vLLM Paged Attention
|
||||
---
|
||||
[](){ #design-paged-attention }
|
||||
|
||||
Currently, vLLM utilizes its own implementation of a multi-head query
|
||||
attention kernel (`csrc/attention/attention_kernels.cu`).
|
||||
This kernel is designed to be compatible with
|
||||
vLLM's paged KV caches, where the key and value cache are stored in
|
||||
separate blocks (note that this block concept differs from the GPU
thread block; in the rest of this document, I will refer to the vLLM paged
attention block as "block" and to the GPU thread block as
"thread block").
|
||||
|
||||
To achieve high performance, this kernel relies on a specially
|
||||
designed memory layout and access method, specifically when threads
|
||||
read data from global memory to shared memory. The purpose of this
|
||||
document is to provide a high-level explanation of the kernel
|
||||
implementation step by step, aiding those who wish to learn about the
|
||||
vLLM multi-head query attention kernel. After going through this
document, users will likely have a better understanding and find it easier
to follow the actual implementation.
|
||||
|
||||
Please note that this document may not cover all details, such as how
|
||||
to calculate the correct index for the corresponding data or the dot
|
||||
multiplication implementation. However, after reading this document
|
||||
and becoming familiar with the high-level logic flow, it should be
|
||||
easier for you to read the actual code and understand the details.
|
||||
|
||||
## Inputs
|
||||
|
||||
The kernel function takes a list of arguments for the current thread
|
||||
to perform its assigned work. The three most important arguments are
|
||||
the input pointers `q`, `k_cache`, and `v_cache`, which point
|
||||
to query, key, and value data on global memory that need to be read
|
||||
and processed. The output pointer `out` points to global memory
|
||||
where the result should be written. These four pointers actually
|
||||
refer to multi-dimensional arrays, but each thread only accesses the
|
||||
portion of data assigned to it. I have omitted all other runtime
|
||||
parameters here for simplicity.
|
||||
|
||||
```cpp
|
||||
template<typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS, int PARTITION_SIZE = 0>
|
||||
__device__ void paged_attention_kernel(
|
||||
... // Other side args.
|
||||
const scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size]
|
||||
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
|
||||
const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
|
||||
const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
|
||||
... // Other side args.
|
||||
)
|
||||
```
|
||||
|
||||
There are also a list of template arguments above the function
|
||||
signature that are determined during compilation time. `scalar_t`
|
||||
represents the data type of the query, key, and value data elements,
|
||||
such as FP16. `HEAD_SIZE` indicates the number of elements in each
|
||||
head. `BLOCK_SIZE` refers to the number of tokens in each block.
|
||||
`NUM_THREADS` denotes the number of threads in each thread block.
|
||||
`PARTITION_SIZE` represents the number of tensor parallel GPUs (For
|
||||
simplicity, we assume this is 0 and tensor parallel is disabled).
|
||||
|
||||
With these arguments, we need to perform a sequence of preparations.
|
||||
This includes calculating the current head index, block index, and
|
||||
other necessary variables. However, for now, we can ignore these
|
||||
preparations and proceed directly to the actual calculations. It will
|
||||
be easier to understand them once we grasp the entire flow.
|
||||
|
||||
## Concepts
|
||||
|
||||
Just before we dive into the calculation flow, I want to describe a
|
||||
few concepts that are needed for later sections. However, you may
|
||||
skip this section and return later if you encounter any confusing
|
||||
terminologies.
|
||||
|
||||
- **Sequence**: A sequence represents a client request. For example,
|
||||
the data pointed to by `q` has a shape of
|
||||
`[num_seqs, num_heads, head_size]`. This means that `q` points to
a total of `num_seqs` query sequences. Since this
|
||||
kernel is a single query attention kernel, each sequence only has one
|
||||
query token. Hence, the `num_seqs` equals the total number of tokens
|
||||
that are processed in the batch.
|
||||
- **Context**: The context consists of the generated tokens from the
|
||||
sequence. For instance, `["What", "is", "your"]` are the context
|
||||
tokens, and the input query token is `"name"`. The model might
|
||||
generate the token `"?"`.
|
||||
- **Vec**: The vec is a list of elements that are fetched and
|
||||
calculated together. For query and key data, the vec size
|
||||
(`VEC_SIZE`) is determined so that each thread group can fetch and
|
||||
calculate 16 bytes of data at a time. For value data, the vec size
|
||||
(`V_VEC_SIZE`) is determined so that each thread can fetch and
|
||||
calculate 16 bytes of data at a time. For example, if the
|
||||
`scalar_t` is FP16 (2 bytes) and `THREAD_GROUP_SIZE` is 2, the
|
||||
`VEC_SIZE` will be 4, while the `V_VEC_SIZE` will be 8 (see the worked example after this list).
|
||||
- **Thread group**: The thread group is a small group of
|
||||
threads(`THREAD_GROUP_SIZE`) that fetches and calculates one
|
||||
query token and one key token at a time. Each thread handles only a
|
||||
portion of the token data. The total number of elements processed by
|
||||
one thread group is referred as `x`. For example, if the thread
|
||||
group contains 2 threads and the head size is 8, then thread 0
|
||||
handles the query and key elements at index 0, 2, 4, 6, while thread
|
||||
1 handles the elements at index 1, 3, 5, 7.
|
||||
- **Block**: The key and value cache data in vLLM are split into
|
||||
blocks. Each block stores data for a fixed number(`BLOCK_SIZE`)
|
||||
of tokens at one head. Each block may contain only a portion of the
|
||||
whole context tokens. For example, if the block size is 16 and the
|
||||
head size is 128, then for one head, one block can store 16 * 128 =
|
||||
2048 elements.
|
||||
- **Warp**: A warp is a group of 32 threads(`WARP_SIZE`) that
|
||||
execute simultaneously on a streaming multiprocessor (SM). In this
|
||||
kernel, each warp processes the calculation between one query token
|
||||
and key tokens of one entire block at a time (it may process multiple
|
||||
blocks in multiple iterations). For example, if there are 4 warps and
|
||||
6 blocks for one context, the assignment would be like warp 0 handles
|
||||
the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2
|
||||
handles the 2nd block and warp 3 handles the 3rd block.
|
||||
- **Thread block**: A thread block is a group of
|
||||
threads(`NUM_THREADS`) that can access the same shared memory.
|
||||
Each thread block contains multiple warps(`NUM_WARPS`), and in
|
||||
this kernel, each thread block processes the calculation between one
|
||||
query token and key tokens of a whole context.
|
||||
- **Grid**: A grid is a collection of thread blocks and defines the
|
||||
shape of the collection. In this kernel, the shape is
|
||||
`(num_heads, num_seqs, max_num_partitions)`. Therefore, each thread
|
||||
block only handles the calculation for one head, one sequence, and
|
||||
one partition.
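
To make the vec sizes above concrete, here is a small worked example (plain Python, just restating the arithmetic from the "Vec" item; the 16-byte fetch width is taken from the description above, not from the kernel source):

```python
FETCH_BYTES = 16          # each fetch reads 16 bytes of data

dtype_size = 2            # FP16 -> 2 bytes per element
THREAD_GROUP_SIZE = 2

# Query/key: the whole thread group fetches 16 bytes together.
VEC_SIZE = FETCH_BYTES // (THREAD_GROUP_SIZE * dtype_size)
# Value: each thread fetches 16 bytes on its own.
V_VEC_SIZE = FETCH_BYTES // dtype_size

print(VEC_SIZE)    # 4
print(V_VEC_SIZE)  # 8
```
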
## Query
|
||||
|
||||
This section will introduce how query data is stored in memory and
|
||||
fetched by each thread. As mentioned above, each thread group fetches
|
||||
the data of one query token, while each thread itself only handles a part of
that query token's data. Within each warp, every thread group will fetch
|
||||
the same query token data, but will multiply it with different key
|
||||
token data.
|
||||
|
||||
```cpp
|
||||
const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
|
||||
```
|
||||
|
||||
<figure markdown="span">
|
||||
{ align="center" alt="query" width="70%" }
|
||||
</figure>
|
||||
|
||||
Each thread defines its own `q_ptr` which points to the assigned
|
||||
query token data on global memory. For example, if `VEC_SIZE` is 4
|
||||
and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains
|
||||
a total of 128 elements, divided into 128 / 4 = 32 vecs.
|
||||
|
||||
<figure markdown="span">
|
||||
{ align="center" alt="q_vecs" width="70%" }
|
||||
</figure>
|
||||
|
||||
```cpp
|
||||
__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
|
||||
```
|
||||
|
||||
Next, we need to read the global memory data pointed to by `q_ptr`
|
||||
into shared memory as `q_vecs`. It is important to note that each
|
||||
vec is assigned to a different row. For example, if the
|
||||
`THREAD_GROUP_SIZE` is 2, thread 0 will handle the 0th row vecs,
|
||||
while thread 1 handles the 1st row vecs. By reading the query data in
|
||||
this way, neighboring threads like thread 0 and thread 1 can read
|
||||
neighboring memory, achieving memory coalescing and improving
|
||||
performance.
|
||||
|
||||
## Key
|
||||
|
||||
Similar to the "Query" section, this section introduces memory layout
|
||||
and assignment for keys. While each thread group only handles one
query token per kernel run, it may handle multiple key tokens across
|
||||
multiple iterations. Meanwhile, each warp will process multiple blocks
|
||||
of key tokens in multiple iterations, ensuring that all context
|
||||
tokens are processed by the entire thread group after the kernel run.
|
||||
In this context, "handle" refers to performing the dot multiplication
|
||||
between query data and key data.
|
||||
|
||||
```cpp
|
||||
const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride
|
||||
+ kv_head_idx * kv_head_stride
|
||||
+ physical_block_offset * x;
|
||||
```
|
||||
|
||||
Unlike `q_ptr`, the `k_ptr` in each thread points to a different
key token at different iterations. As shown above, `k_ptr`
points to key token data in `k_cache` at the assigned block,
assigned head, and assigned token.
|
||||
|
||||
<figure markdown="span">
|
||||
{ align="center" alt="key" width="70%" }
|
||||
</figure>
|
||||
|
||||
The diagram above illustrates the memory layout for key data. It
|
||||
assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is
|
||||
8, `THREAD_GROUP_SIZE` is 2, and there are a total of 4 warps. Each
|
||||
rectangle represents all the elements for one key token at one head,
|
||||
which will be processed by one thread group. The left half shows the
|
||||
total 16 blocks of key token data for warp 0, while the right half
|
||||
represents the remaining key token data for other warps or
|
||||
iterations. Inside each rectangle, there are a total of 32 vecs (128
|
||||
elements for one token) that will be processed by 2 threads (one
|
||||
thread group) separately.
|
||||
|
||||
<figure markdown="span">
|
||||
{ align="center" alt="k_vecs" width="70%" }
|
||||
</figure>
|
||||
|
||||
```cpp
|
||||
K_vec k_vecs[NUM_VECS_PER_THREAD]
|
||||
```
|
||||
|
||||
Next, we need to read the key token data from `k_ptr` and store
|
||||
them on register memory as `k_vecs`. We use register memory for
|
||||
`k_vecs` because it will only be accessed by one thread once,
|
||||
whereas `q_vecs` will be accessed by multiple threads multiple
|
||||
times. Each `k_vecs` will contain multiple vectors for later
|
||||
calculation. Each vec will be set at each inner iteration. The
|
||||
assignment of vecs allows neighboring threads in a warp to read
|
||||
neighboring memory together, which again promotes the memory
|
||||
coalescing. For instance, thread 0 will read vec 0, while thread 1
|
||||
will read vec 1. In the next inner loop, thread 0 will read vec 2,
|
||||
while thread 1 will read vec 3, and so on.
|
||||
|
||||
You may still be a little confused about the overall flow. Don't
|
||||
worry, please keep reading the next "QK" section. It will illustrate
|
||||
the query and key calculation flow in a clearer and higher-level
|
||||
manner.
|
||||
|
||||
## QK
|
||||
|
||||
As shown in the pseudo code below, before the entire for loop block, we
|
||||
fetch the query data for one token and store it in `q_vecs`. Then,
|
||||
in the outer for loop, we iterate through different `k_ptrs` that
|
||||
point to different tokens and prepare the `k_vecs` in the inner for
|
||||
loop. Finally, we perform the dot multiplication between the
|
||||
`q_vecs` and each `k_vecs`.
|
||||
|
||||
```cpp
|
||||
q_vecs = ...
|
||||
for ... {
|
||||
k_ptr = ...
|
||||
for ... {
|
||||
k_vecs[i] = ...
|
||||
}
|
||||
...
|
||||
float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
|
||||
}
|
||||
```
|
||||
|
||||
As mentioned before, each thread only fetches part of the
query and key token data at a time. However, a cross-thread-group
reduction happens inside `Qk_dot<>::dot`. So the `qk`
returned here is not just the dot product of partial query and key
token data, but actually the full result over the entire query and
key token data.
|
||||
|
||||
For example, if the value of `HEAD_SIZE` is 128 and
|
||||
`THREAD_GROUP_SIZE` is 2, each thread's `k_vecs` will contain
|
||||
a total of 64 elements. However, the returned `qk` is actually the
|
||||
result of dot multiplication between 128 query elements and 128 key
|
||||
elements. If you want to learn more about the details of the dot
|
||||
multiplication and reduction, you may refer to the implementation of
|
||||
`Qk_dot<>::dot`. However, for the sake of simplicity, I will not
|
||||
cover it in this document.
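
To see why this reduction yields the full result, here is a toy restatement in plain Python (lists instead of the kernel's actual data layout and vec types):

```python
import random

HEAD_SIZE = 128
THREAD_GROUP_SIZE = 2

q = [random.random() for _ in range(HEAD_SIZE)]
k = [random.random() for _ in range(HEAD_SIZE)]

# Each "thread" multiplies only its strided share of the elements
# (thread 0 takes indices 0, 2, 4, ...; thread 1 takes 1, 3, 5, ...).
partial = [
    sum(q[i] * k[i] for i in range(t, HEAD_SIZE, THREAD_GROUP_SIZE))
    for t in range(THREAD_GROUP_SIZE)
]

# The cross-thread-group reduction adds the partial results together,
# giving the dot product over the entire query and key token data.
qk = sum(partial)
assert abs(qk - sum(qi * ki for qi, ki in zip(q, k))) < 1e-6
```
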
## Softmax
|
||||
|
||||
Next, we need to calculate the normalized softmax for all `qk`s,
|
||||
as shown below, where each $x$ represents a `qk`. To do this,
|
||||
we must obtain the reduced value of `qk_max`($m(x)$) and
|
||||
the `exp_sum`($\ell(x)$) of all `qk`s. The reduction
|
||||
should be performed across the entire thread block, encompassing
|
||||
results between the query token and all context key tokens.
|
||||
|
||||
$$
|
||||
\begin{gather*}
|
||||
m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\
|
||||
\quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)}
|
||||
\end{gather*}
|
||||
$$
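
As a small illustration of these formulas (a plain Python restatement, not the kernel code), the max-shifted softmax over a list of `qk` values can be written as:

```python
import math

def normalized_softmax(qks: list[float]) -> list[float]:
    # m(x): the maximum qk, used to keep the exponentials numerically stable.
    qk_max = max(qks)
    # f(x): exponentials shifted by the maximum.
    exp_vals = [math.exp(qk - qk_max) for qk in qks]
    # l(x): the sum of the shifted exponentials.
    exp_sum = sum(exp_vals)
    # softmax(x) = f(x) / l(x)
    return [v / exp_sum for v in exp_vals]

print(normalized_softmax([1.0, 2.0, 3.0]))  # ~[0.09, 0.245, 0.665]
```
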
### `qk_max` and `logits`
|
||||
|
||||
Right after we get the `qk` result, we can set the temporary
`logits` entry to `qk` (in the end, `logits` should
store the normalized softmax result). We can also compare and collect
the `qk_max` over all `qk`s that are calculated by the current
thread group.
|
||||
|
||||
```cpp
|
||||
if (thread_group_offset == 0) {
|
||||
const bool mask = token_idx >= context_len;
|
||||
logits[token_idx - start_token_idx] = mask ? 0.f : qk;
|
||||
qk_max = mask ? qk_max : fmaxf(qk_max, qk);
|
||||
}
|
||||
```
|
||||
|
||||
Please note that the `logits` here is on shared memory, so each
|
||||
thread group will set the fields for its own assigned context tokens.
|
||||
Overall, the size of `logits` should be the number of context tokens.
|
||||
|
||||
```cpp
|
||||
for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
|
||||
qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
|
||||
}
|
||||
|
||||
if (lane == 0) {
|
||||
red_smem[warp_idx] = qk_max;
|
||||
}
|
||||
```
|
||||
|
||||
Then we need to get the reduced `qk_max` across each warp. The main
|
||||
idea is to make the threads in a warp communicate with each other and
get the final max `qk`.
|
||||
|
||||
```cpp
|
||||
for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
|
||||
qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
|
||||
}
|
||||
qk_max = VLLM_SHFL_SYNC(qk_max, 0);
|
||||
```
|
||||
|
||||
Finally, we can get the reduced `qk_max` from the whole thread block by
comparing the `qk_max` from all warps in this thread block. Then we
|
||||
need to broadcast the final result to each thread.
|
||||
|
||||
### `exp_sum`
|
||||
|
||||
Similar to `qk_max`, we need to get the reduced sum value from the
|
||||
entire thread block too.
|
||||
|
||||
```cpp
|
||||
for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
|
||||
float val = __expf(logits[i] - qk_max);
|
||||
logits[i] = val;
|
||||
exp_sum += val;
|
||||
}
|
||||
...
|
||||
exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], exp_sum);
|
||||
```
|
||||
|
||||
First, sum all the exp values from each thread group and, meanwhile,
convert each entry of `logits` from `qk` to `exp(qk - qk_max)`.
Please note that the `qk_max` here is already the max `qk` across the
whole thread block. Then we can do the reduction for `exp_sum`
across the whole thread block, just like for `qk_max`.
|
||||
|
||||
```cpp
|
||||
const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
|
||||
for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
|
||||
logits[i] *= inv_sum;
|
||||
}
|
||||
```
|
||||
|
||||
Finally, with the reduced `qk_max` and `exp_sum`, we can obtain
|
||||
the final normalized softmax result as `logits`. This `logits`
|
||||
variable will be used for dot multiplication with the value data in
|
||||
later steps. Now, it should store the normalized softmax result of
|
||||
`qk` for all assigned context tokens.
|
||||
|
||||
## Value
|
||||
|
||||
<figure markdown="span">
|
||||
{ align="center" alt="value" width="70%" }
|
||||
</figure>
|
||||
|
||||
<figure markdown="span">
|
||||
{ align="center" alt="logits_vec" width="50%" }
|
||||
</figure>
|
||||
|
||||
<figure markdown="span">
|
||||
{ align="center" alt="v_vec" width="70%" }
|
||||
</figure>
|
||||
|
||||
Now we need to retrieve the value data and perform dot multiplication
|
||||
with `logits`. Unlike query and key, there is no thread group
|
||||
concept for value data. As shown in the diagram, unlike the key token
|
||||
memory layout, elements from the same column correspond to the same
|
||||
value token. For one block of value data, there are `HEAD_SIZE` of
|
||||
rows and `BLOCK_SIZE` of columns that are split into multiple
|
||||
`v_vecs`.
|
||||
|
||||
Each thread always fetches `V_VEC_SIZE` elements from the same
|
||||
`V_VEC_SIZE` of tokens at a time. As a result, a single thread
|
||||
retrieves multiple `v_vec`s from different rows and the same
|
||||
columns through multiple inner iterations. For each `v_vec`, it
|
||||
needs to be dot multiplied with the corresponding `logits_vec`,
|
||||
which is also `V_VEC_SIZE` elements from `logits`. Overall, with
|
||||
multiple inner iterations, each warp will process one block of value
|
||||
tokens. And with multiple outer iterations, all of the context's value
tokens are processed.
|
||||
|
||||
```cpp
|
||||
float accs[NUM_ROWS_PER_THREAD];
|
||||
for ... { // Iteration over different blocks.
|
||||
logits_vec = ...
|
||||
for ... { // Iteration over different rows.
|
||||
v_vec = ...
|
||||
...
|
||||
accs[i] += dot(logits_vec, v_vec);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
As shown in the above pseudo code, in the outer loop, similar to
|
||||
`k_ptr`, `logits_vec` iterates over different blocks and reads
|
||||
`V_VEC_SIZE` elements from `logits`. In the inner loop, each
|
||||
thread reads `V_VEC_SIZE` elements from the same tokens as a
|
||||
`v_vec` and performs dot multiplication. It is important to note
|
||||
that in each inner iteration, the thread fetches different head
|
||||
position elements for the same tokens. The dot result is then
|
||||
accumulated in `accs`. Therefore, each entry of `accs` is mapped
|
||||
to a head position assigned to the current thread.
|
||||
|
||||
For example, if `BLOCK_SIZE` is 16 and `V_VEC_SIZE` is 8, each
|
||||
thread fetches 8 value elements for 8 tokens at a time. Each element
|
||||
is from different tokens at the same head position. If `HEAD_SIZE`
|
||||
is 128 and `WARP_SIZE` is 32, for each inner loop, a warp needs to
|
||||
fetch `WARP_SIZE * V_VEC_SIZE = 256` elements. This means there are
|
||||
a total of 128 * 16 / 256 = 8 inner iterations for a warp to handle
|
||||
a whole block of value tokens. And each `accs` in each thread
|
||||
contains 8 elements that accumulated at 8 different head positions.
|
||||
For thread 0, the `accs` variable will have 8 elements, which
are the 0th, 32nd, ..., 224th elements of a value head, accumulated
from all 8 assigned tokens.
|
||||
|
||||
## LV
|
||||
|
||||
Now, we need to perform reduction for `accs` within each warp. This
|
||||
process allows each thread to accumulate the `accs` for the
|
||||
assigned head positions of all tokens in one block.
|
||||
|
||||
```cpp
|
||||
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
|
||||
float acc = accs[i];
|
||||
for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
|
||||
acc += VLLM_SHFL_XOR_SYNC(acc, mask);
|
||||
}
|
||||
accs[i] = acc;
|
||||
}
|
||||
```
|
||||
|
||||
Next, we perform reduction for `accs` across all warps, allowing
|
||||
each thread to have the accumulation of `accs` for the assigned
|
||||
head positions of all context tokens. Please note that each `accs`
|
||||
in every thread only stores the accumulation for a portion of
|
||||
elements of the entire head for all context tokens. However, overall,
|
||||
all results for output have been calculated but are just stored in
|
||||
different thread register memory.
|
||||
|
||||
```cpp
|
||||
float* out_smem = reinterpret_cast<float*>(shared_mem);
|
||||
for (int i = NUM_WARPS; i > 1; i /= 2) {
|
||||
// Upper warps write to shared memory.
|
||||
...
|
||||
float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
|
||||
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
|
||||
...
|
||||
dst[row_idx] = accs[i];
|
||||
}
|
||||
|
||||
// Lower warps update the output.
|
||||
const float* src = &out_smem[warp_idx * HEAD_SIZE];
|
||||
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
|
||||
...
|
||||
accs[i] += src[row_idx];
|
||||
}
|
||||
|
||||
// Write out the accs.
|
||||
}
|
||||
```
|
||||
|
||||
## Output
|
||||
|
||||
Now we can write all of the calculated results from local register memory
to the final output in global memory.
|
||||
|
||||
```cpp
|
||||
scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
|
||||
+ head_idx * max_num_partitions * HEAD_SIZE
|
||||
+ partition_idx * HEAD_SIZE;
|
||||
```
|
||||
|
||||
First, we need to define the `out_ptr` variable, which points to
|
||||
the start address of the assigned sequence and assigned head.
|
||||
|
||||
```cpp
|
||||
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
|
||||
const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
|
||||
if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
|
||||
from_float(*(out_ptr + row_idx), accs[i]);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Finally, we need to iterate over different assigned head positions
|
||||
and write out the corresponding accumulated result based on the
|
||||
`out_ptr`.
|
||||
@ -1,28 +0,0 @@
|
||||
---
|
||||
title: Automatic Prefix Caching
|
||||
---
|
||||
[](){ #automatic-prefix-caching }
|
||||
|
||||
## Introduction
|
||||
|
||||
Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.
|
||||
|
||||
!!! note
|
||||
Technical details on how vLLM implements APC can be found [here][design-automatic-prefix-caching].
|
||||
|
||||
## Enabling APC in vLLM
|
||||
|
||||
Set `enable_prefix_caching=True` in the vLLM engine to enable APC. Here is an example:
|
||||
|
||||
<gh-file:examples/offline_inference/automatic_prefix_caching.py>
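
A minimal inline sketch of the same idea (assuming the offline `LLM` entrypoint and a small example model; see the linked example for the full version):

```python
from vllm import LLM, SamplingParams

# Enable Automatic Prefix Caching so that requests sharing a prefix
# (e.g. the same long document) can reuse its KV cache.
llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)

long_document = "..."  # a long shared prefix, e.g. a software manual
sampling_params = SamplingParams(temperature=0.0, max_tokens=64)

# Later requests reuse the KV cache computed for the shared prefix.
for question in ["Summarize the document.", "List the key steps."]:
    outputs = llm.generate([long_document + "\n\n" + question], sampling_params)
    print(outputs[0].outputs[0].text)
```
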
## Example workloads
|
||||
|
||||
We describe two example workloads, where APC can provide huge performance benefit:
|
||||
|
||||
- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency.
|
||||
- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency.
|
||||
|
||||
## Limits
|
||||
|
||||
APC in general does not reduce the performance of vLLM. With that being said, APC only reduces the time of processing the queries (the prefilling phase) and does not reduce the time of generating new tokens (the decoding phase). So APC does not bring performance gain when vLLM spends most of the time generating answers to the queries (e.g. when the length of the answer is long), or new queries do not share the same prefix with any of existing queries (so that the computation cannot be reused).
|
||||
@ -1,81 +0,0 @@
|
||||
---
|
||||
title: Compatibility Matrix
|
||||
---
|
||||
[](){ #compatibility-matrix }
|
||||
|
||||
The tables below show mutually exclusive features and the support on some hardware.
|
||||
|
||||
The symbols used have the following meanings:
|
||||
|
||||
- ✅ = Full compatibility
|
||||
- 🟠 = Partial compatibility
|
||||
- ❌ = No compatibility
|
||||
- ❔ = Unknown or TBD
|
||||
|
||||
!!! note
|
||||
Check the ❌ or 🟠 entries with links to see the tracking issue for an unsupported feature/hardware combination.
|
||||
|
||||
## Feature x Feature
|
||||
|
||||
<style>
|
||||
td:not(:first-child) {
|
||||
text-align: center !important;
|
||||
}
|
||||
td {
|
||||
padding: 0.5rem !important;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
th {
|
||||
padding: 0.5rem !important;
|
||||
min-width: 0 !important;
|
||||
}
|
||||
|
||||
th:not(:first-child) {
|
||||
writing-mode: vertical-lr;
|
||||
transform: rotate(180deg)
|
||||
}
|
||||
</style>
|
||||
|
||||
| Feature | [CP][chunked-prefill] | [APC][automatic-prefix-caching] | [LoRA][lora-adapter] | <abbr title="Prompt Adapter">prmpt adptr</abbr> | [SD][spec-decode] | CUDA graph | <abbr title="Pooling Models">pooling</abbr> | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search |
|
||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
||||
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
|
||||
| [APC][automatic-prefix-caching] | ✅ | ✅ | | | | | | | | | | | | | |
|
||||
| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | | | | | | | | | | | | |
|
||||
| <abbr title="Prompt Adapter">prmpt adptr</abbr> | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | |
|
||||
| [SD][spec-decode] | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | |
|
||||
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | |
|
||||
| <abbr title="Pooling Models">pooling</abbr> | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | |
|
||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](gh-issue:7366) | ❌ | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | |
|
||||
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | |
|
||||
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | |
|
||||
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | |
|
||||
| multi-step | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | |
|
||||
| <abbr title="Multimodal Inputs">mm</abbr> | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | |
|
||||
| best-of | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | |
|
||||
| beam-search | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ |
|
||||
|
||||
[](){ #feature-x-hardware }
|
||||
|
||||
## Feature x Hardware
|
||||
|
||||
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD |
|
||||
|-----------------------------------------------------------|--------------------|----------|----------|-------|----------|--------------------|-------|
|
||||
| [CP][chunked-prefill] | [❌](gh-issue:2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| [APC][automatic-prefix-caching] | [❌](gh-issue:3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| <abbr title="Prompt Adapter">prmpt adptr</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8475) | ✅ |
|
||||
| [SD][spec-decode] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||
| <abbr title="Pooling Models">pooling</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ |
|
||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| <abbr title="Multimodal Inputs">mm</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8477) | ✅ |
|
||||
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
|
||||
!!! note
|
||||
Please refer to [Feature support through NxD Inference backend][feature-support-through-nxd-inference-backend] for features supported on AWS Neuron hardware.
|
||||
@ -1,43 +0,0 @@
|
||||
# Prompt Embedding Inputs
|
||||
|
||||
This page teaches you how to pass prompt embedding inputs to vLLM.
|
||||
|
||||
## What are prompt embeddings?
|
||||
|
||||
The traditional flow of text data for a Large Language Model goes from text to token ids (via a tokenizer) then from token ids to prompt embeddings. For a traditional decoder-only model (such as meta-llama/Llama-3.1-8B-Instruct), this step of converting token ids to prompt embeddings happens via a look-up from a learned embedding matrix, but the model is not limited to processing only the embeddings corresponding to its token vocabulary.
|
||||
|
||||
!!! note
|
||||
Prompt embeddings are currently only supported in the v0 engine.
|
||||
|
||||
## Offline Inference
|
||||
|
||||
To input prompt embeddings, follow this schema in [vllm.inputs.EmbedsPrompt][]:
|
||||
|
||||
- `prompt_embeds`: A torch tensor representing a sequence of prompt/token embeddings. This has the shape (sequence_length, hidden_size), where sequence_length is the number of token embeddings and hidden_size is the hidden size (embedding size) of the model.
|
||||
|
||||
### Hugging Face Transformers Inputs
|
||||
|
||||
You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:
|
||||
|
||||
<gh-file:examples/offline_inference/prompt_embed_inference.py>
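The linked script is the authoritative example. As a rough sketch (the model name is illustrative, and the `enable_prompt_embeds` engine argument is assumed here to mirror the `--enable-prompt-embeds` flag shown in the serving example below), the flow looks like:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM

model_name = "meta-llama/Llama-3.2-1B-Instruct"  # illustrative
tokenizer = AutoTokenizer.from_pretrained(model_name)
hf_model = AutoModelForCausalLM.from_pretrained(model_name)

# Look up the token embeddings with the Hugging Face embedding layer.
token_ids = tokenizer("Hello, my name is", return_tensors="pt").input_ids
with torch.no_grad():
    prompt_embeds = hf_model.get_input_embeddings()(token_ids).squeeze(0)
# prompt_embeds now has shape (sequence_length, hidden_size).

llm = LLM(model=model_name, enable_prompt_embeds=True)
outputs = llm.generate({"prompt_embeds": prompt_embeds})
print(outputs[0].outputs[0].text)
```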
|
||||
|
||||
## Online Serving
|
||||
|
||||
Our OpenAI-compatible server accepts prompt embeddings inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embeddings inputs are added via a new `'prompt_embeds'` key in the JSON package.
|
||||
|
||||
When a mixture of `'prompt_embeds'` and `'prompt'` inputs is provided in a single request, the prompt embeds are always returned first.
|
||||
|
||||
Prompt embeddings are passed in as base64 encoded torch tensors.
|
||||
|
||||
### Transformers Inputs via OpenAI Client
|
||||
|
||||
First, launch the OpenAI-compatible server:
|
||||
|
||||
```bash
|
||||
vllm serve meta-llama/Llama-3.2-1B-Instruct --task generate \
|
||||
--max-model-len 4096 --enable-prompt-embeds
|
||||
```
|
||||
|
||||
Then, you can use the OpenAI client as follows:
|
||||
|
||||
<gh-file:examples/online_serving/prompt_embed_inference_with_openai_client.py>
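The linked script is the reference implementation. A condensed sketch (assuming the tensor is serialized with `torch.save` and base64-encoded, and that an empty `prompt` string is accepted alongside `prompt_embeds`) might look like:

```python
import base64
import io

import torch
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# A (sequence_length, hidden_size) tensor; in practice, produce it as in the
# offline example above. Random values here are only a placeholder.
prompt_embeds = torch.randn(8, 2048)

buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
encoded_embeds = base64.b64encode(buffer.getvalue()).decode("utf-8")

completion = client.completions.create(
    model="meta-llama/Llama-3.2-1B-Instruct",
    prompt="",  # the embeddings stand in for the text prompt
    max_tokens=32,
    extra_body={"prompt_embeds": encoded_embeds},
)
print(completion.choices[0].text)
```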
|
||||
@ -1,22 +0,0 @@
|
||||
---
|
||||
title: Quantization
|
||||
---
|
||||
[](){ #quantization-index }
|
||||
|
||||
Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices.
|
||||
|
||||
Contents:

- [Supported Hardware](supported_hardware.md)
- [AutoAWQ](auto_awq.md)
- [BitsAndBytes](bnb.md)
- [BitBLAS](bitblas.md)
- [GGUF](gguf.md)
- [GPTQModel](gptqmodel.md)
- [INT4](int4.md)
- [INT8](int8.md)
- [FP8](fp8.md)
- [ModelOpt](modelopt.md)
- [Quark](quark.md)
- [Quantized KV Cache](quantized_kvcache.md)
- [TorchAO](torchao.md)
@ -1,28 +0,0 @@
|
||||
---
|
||||
title: Supported Hardware
|
||||
---
|
||||
[](){ #quantization-supported-hardware }
|
||||
|
||||
The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
|
||||
|
||||
| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | x86 CPU | AWS Neuron | Google TPU |
|
||||
|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-----------|------------------|--------------|
|
||||
| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | ❌ |
|
||||
| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | ❌ |
|
||||
| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ |
|
||||
| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ❌ |
|
||||
| BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ |
|
||||
|
||||
- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
|
||||
- ✅︎ indicates that the quantization method is supported on the specified hardware.
|
||||
- ❌ indicates that the quantization method is not supported on the specified hardware.
|
||||
|
||||
!!! note
|
||||
This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
|
||||
|
||||
For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
|
||||
@ -1,5 +0,0 @@
|
||||
nav:
|
||||
- README.md
|
||||
- gpu.md
|
||||
- cpu.md
|
||||
- ai_accelerator.md
|
||||
@ -1,20 +0,0 @@
|
||||
---
|
||||
title: Installation
|
||||
---
|
||||
[](){ #installation-index }
|
||||
|
||||
vLLM supports the following hardware platforms:
|
||||
|
||||
- [GPU](gpu.md)
|
||||
- [NVIDIA CUDA](gpu.md#nvidia-cuda)
|
||||
- [AMD ROCm](gpu.md#amd-rocm)
|
||||
- [Intel XPU](gpu.md#intel-xpu)
|
||||
- [CPU](cpu.md)
|
||||
- [Intel/AMD x86](cpu.md#intelamd-x86)
|
||||
- [ARM AArch64](cpu.md#arm-aarch64)
|
||||
- [Apple silicon](cpu.md#apple-silicon)
|
||||
- [IBM Z (S390X)](cpu.md#ibm-z-s390x)
|
||||
- [Other AI accelerators](ai_accelerator.md)
|
||||
- [Google TPU](ai_accelerator.md#google-tpu)
|
||||
- [Intel Gaudi](ai_accelerator.md#intel-gaudi)
|
||||
- [AWS Neuron](ai_accelerator.md#aws-neuron)
|
||||
@ -1,117 +0,0 @@
|
||||
# Other AI accelerators
|
||||
|
||||
vLLM is a Python library that supports the following AI accelerators. Select your AI accelerator type to see vendor-specific instructions:
|
||||
|
||||
=== "Google TPU"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:installation"
|
||||
|
||||
=== "Intel Gaudi"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:installation"
|
||||
|
||||
=== "AWS Neuron"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:installation"
|
||||
|
||||
## Requirements
|
||||
|
||||
=== "Google TPU"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:requirements"
|
||||
|
||||
=== "Intel Gaudi"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:requirements"
|
||||
|
||||
=== "AWS Neuron"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:requirements"
|
||||
|
||||
## Configure a new environment
|
||||
|
||||
=== "Google TPU"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:configure-a-new-environment"
|
||||
|
||||
=== "Intel Gaudi"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:configure-a-new-environment"
|
||||
|
||||
=== "AWS Neuron"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:configure-a-new-environment"
|
||||
|
||||
## Set up using Python
|
||||
|
||||
### Pre-built wheels
|
||||
|
||||
=== "Google TPU"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:pre-built-wheels"
|
||||
|
||||
=== "Intel Gaudi"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:pre-built-wheels"
|
||||
|
||||
=== "AWS Neuron"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:pre-built-wheels"
|
||||
|
||||
### Build wheel from source
|
||||
|
||||
=== "Google TPU"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:build-wheel-from-source"
|
||||
|
||||
=== "Intel Gaudi"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:build-wheel-from-source"
|
||||
|
||||
=== "AWS Neuron"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:build-wheel-from-source"
|
||||
|
||||
## Set up using Docker
|
||||
|
||||
### Pre-built images
|
||||
|
||||
=== "Google TPU"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:pre-built-images"
|
||||
|
||||
=== "Intel Gaudi"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:pre-built-images"
|
||||
|
||||
=== "AWS Neuron"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:pre-built-images"
|
||||
|
||||
### Build image from source
|
||||
|
||||
=== "Google TPU"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:build-image-from-source"
|
||||
|
||||
=== "Intel Gaudi"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:build-image-from-source"
|
||||
|
||||
=== "AWS Neuron"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:build-image-from-source"
|
||||
|
||||
## Extra information
|
||||
|
||||
=== "Google TPU"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:extra-information"
|
||||
|
||||
=== "Intel Gaudi"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:extra-information"
|
||||
|
||||
=== "AWS Neuron"
|
||||
|
||||
--8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:extra-information"
|
||||
@ -1,154 +0,0 @@
|
||||
# --8<-- [start:installation]
|
||||
|
||||
[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) is the software development kit (SDK) used to run deep learning and
|
||||
generative AI workloads on AWS Inferentia and AWS Trainium powered Amazon EC2 instances and UltraServers (Inf1, Inf2, Trn1, Trn2,
|
||||
and Trn2 UltraServer). Both Trainium and Inferentia are powered by fully-independent heterogeneous compute-units called NeuronCores.
|
||||
This tab describes how to set up your environment to run vLLM on Neuron.
|
||||
|
||||
!!! warning
|
||||
There are no pre-built wheels or images for this device, so you must build vLLM from source.
|
||||
|
||||
# --8<-- [end:installation]
|
||||
# --8<-- [start:requirements]
|
||||
|
||||
- OS: Linux
|
||||
- Python: 3.9 or newer
|
||||
- PyTorch 2.5/2.6
|
||||
- Accelerator: NeuronCore-v2 (in trn1/inf2 chips) or NeuronCore-v3 (in trn2 chips)
|
||||
- AWS Neuron SDK 2.23
|
||||
|
||||
## Configure a new environment
|
||||
|
||||
### Launch a Trn1/Trn2/Inf2 instance and verify Neuron dependencies
|
||||
|
||||
The easiest way to launch a Trainium or Inferentia instance with pre-installed Neuron dependencies is to follow this
|
||||
[quick start guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/multiframework/multi-framework-ubuntu22-neuron-dlami.html#setup-ubuntu22-multi-framework-dlami) using the Neuron Deep Learning AMI (Amazon machine image).
|
||||
|
||||
- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance
|
||||
- Once inside your instance, activate the pre-installed virtual environment for inference by running
|
||||
```console
|
||||
source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
|
||||
```
|
||||
|
||||
Refer to the [NxD Inference Setup Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/nxdi-setup.html)
|
||||
for alternative setup instructions including using Docker and manually installing dependencies.
|
||||
|
||||
!!! note
|
||||
NxD Inference is the default recommended backend to run inference on Neuron. If you are looking to use the legacy [transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx)
|
||||
library, refer to [Transformers NeuronX Setup](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/setup/index.html).
|
||||
|
||||
# --8<-- [end:requirements]
|
||||
# --8<-- [start:set-up-using-python]
|
||||
|
||||
# --8<-- [end:set-up-using-python]
|
||||
# --8<-- [start:pre-built-wheels]
|
||||
|
||||
Currently, there are no pre-built Neuron wheels.
|
||||
|
||||
# --8<-- [end:pre-built-wheels]
|
||||
# --8<-- [start:build-wheel-from-source]
|
||||
|
||||
#### Install vLLM from source
|
||||
|
||||
Install vllm as follows:
|
||||
|
||||
```console
|
||||
git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
pip install -U -r requirements/neuron.txt
|
||||
VLLM_TARGET_DEVICE="neuron" pip install -e .
|
||||
```
|
||||
|
||||
AWS Neuron maintains a [Github fork of vLLM](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2) at
|
||||
[https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2), which contains several features in addition to what's
|
||||
available on vLLM V0. Please utilize the AWS Fork for the following features:
|
||||
|
||||
- Llama-3.2 multi-modal support
|
||||
- Multi-node distributed inference
|
||||
|
||||
Refer to [vLLM User Guide for NxD Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/vllm-user-guide.html)
|
||||
for more details and usage examples.
|
||||
|
||||
To install the AWS Neuron fork, run the following:
|
||||
|
||||
```console
|
||||
git clone -b neuron-2.23-vllm-v0.7.2 https://github.com/aws-neuron/upstreaming-to-vllm.git
|
||||
cd upstreaming-to-vllm
|
||||
pip install -r requirements/neuron.txt
|
||||
VLLM_TARGET_DEVICE="neuron" pip install -e .
|
||||
```
|
||||
|
||||
Note that the AWS Neuron fork is only intended to support Neuron hardware; compatibility with other hardware is not tested.
|
||||
|
||||
# --8<-- [end:build-wheel-from-source]
|
||||
# --8<-- [start:set-up-using-docker]
|
||||
|
||||
# --8<-- [end:set-up-using-docker]
|
||||
# --8<-- [start:pre-built-images]
|
||||
|
||||
Currently, there are no pre-built Neuron images.
|
||||
|
||||
# --8<-- [end:pre-built-images]
|
||||
# --8<-- [start:build-image-from-source]
|
||||
|
||||
See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image.
|
||||
|
||||
Make sure to use <gh-file:docker/Dockerfile.neuron> in place of the default Dockerfile.
|
||||
|
||||
# --8<-- [end:build-image-from-source]
|
||||
# --8<-- [start:extra-information]
|
||||
|
||||
[](){ #feature-support-through-nxd-inference-backend }
|
||||
### Feature support through NxD Inference backend
|
||||
|
||||
The current vLLM and Neuron integration relies on either the `neuronx-distributed-inference` (preferred) or `transformers-neuronx` backend
|
||||
to perform most of the heavy lifting which includes PyTorch model initialization, compilation, and runtime execution. Therefore, most
|
||||
[features supported on Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html) are also available via the vLLM integration.
|
||||
|
||||
To configure NxD Inference features through the vLLM entrypoint, use the `override_neuron_config` setting. Provide the configs you want to override
as a dictionary (or a JSON object when starting vLLM from the CLI). For example, to disable auto bucketing, include:

```python
override_neuron_config={
    "enable_bucketing": False,
}
```

or, when launching vLLM from the CLI, pass:

```console
--override-neuron-config "{\"enable_bucketing\":false}"
```
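In Python, the same override can be passed when constructing the engine. A sketch (model path and parallelism settings are purely illustrative):

```python
from vllm import LLM

llm = LLM(
    model="/path/to/your/model",   # illustrative path
    tensor_parallel_size=8,        # illustrative; match your NeuronCore count
    override_neuron_config={"enable_bucketing": False},
)
```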
|
||||
|
||||
Alternatively, users can directly call the NxDI library to trace and compile their model, then load the pre-compiled artifacts
(via the `NEURON_COMPILED_ARTIFACTS` environment variable) in vLLM to run inference workloads.
|
||||
|
||||
### Known limitations
|
||||
|
||||
- EAGLE speculative decoding: NxD Inference requires the EAGLE draft checkpoint to include the LM head weights from the target model. Refer to this
|
||||
[guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html#eagle-checkpoint-compatibility)
|
||||
for how to convert pretrained EAGLE model checkpoints to be compatible for NxDI.
|
||||
- Quantization: the native quantization flow in vLLM is not well supported on NxD Inference. It is recommended to follow this
|
||||
[Neuron quantization guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/custom-quantization.html)
|
||||
to quantize and compile your model using NxD Inference, and then load the compiled artifacts into vLLM.
|
||||
- Multi-LoRA serving: NxD Inference only supports loading of LoRA adapters at server startup. Dynamic loading of LoRA adapters at
|
||||
runtime is not currently supported. Refer to [multi-lora example](https://github.com/aws-neuron/upstreaming-to-vllm/blob/neuron-2.23-vllm-v0.7.2/examples/offline_inference/neuron_multi_lora.py)
|
||||
- Multi-modal support: multi-modal support is only available through the AWS Neuron fork. This feature has not been upstreamed
|
||||
to vLLM main because NxD Inference currently relies on certain adaptations to the core vLLM logic to support this feature.
|
||||
- Multi-node support: distributed inference across multiple Trainium/Inferentia instances is only supported on the AWS Neuron fork. Refer
|
||||
to this [multi-node example](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2/examples/neuron/multi_node)
|
||||
to run. Note that tensor parallelism (distributed inference across NeuronCores) is available in vLLM main.
|
||||
- Known edge case bug in speculative decoding: An edge case failure may occur in speculative decoding when sequence length approaches
|
||||
max model length (e.g. when requesting max tokens up to the max model length and ignoring eos). In this scenario, vLLM may attempt
|
||||
to allocate an additional block to ensure there is enough memory for number of lookahead slots, but since we do not have good support
|
||||
for paged attention, there isn't another Neuron block for vLLM to allocate. A workaround fix (to terminate 1 iteration early) is
|
||||
implemented in the AWS Neuron fork but is not upstreamed to vLLM main as it modifies core vLLM logic.
|
||||
|
||||
|
||||
### Environment variables
|
||||
- `NEURON_COMPILED_ARTIFACTS`: set this environment variable to point to your pre-compiled model artifacts directory to avoid
|
||||
compilation time upon server initialization. If this variable is not set, the Neuron module will perform compilation and save the
|
||||
artifacts under `neuron-compiled-artifacts/{unique_hash}/` sub-directory in the model path. If this environment variable is set,
|
||||
but the directory does not exist, or the contents are invalid, Neuron will also fallback to a new compilation and store the artifacts
|
||||
under this specified path (see the sketch after this list).
|
||||
- `NEURON_CONTEXT_LENGTH_BUCKETS`: Bucket sizes for context encoding. (Only applicable to `transformers-neuronx` backend).
|
||||
- `NEURON_TOKEN_GEN_BUCKETS`: Bucket sizes for token generation. (Only applicable to `transformers-neuronx` backend).
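For example, a minimal sketch of reusing pre-compiled artifacts (both paths are illustrative):

```python
import os

# Point vLLM at existing compiled artifacts so the Neuron module skips
# compilation at engine start-up.
os.environ["NEURON_COMPILED_ARTIFACTS"] = "/path/to/neuron-compiled-artifacts"

from vllm import LLM

llm = LLM(model="/path/to/model")
```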
|
||||
|
||||
# --8<-- [end:extra-information]
|
||||
@ -1,41 +0,0 @@
|
||||
# --8<-- [start:installation]
|
||||
|
||||
vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform.
|
||||
|
||||
The ARM CPU backend currently supports Float32, FP16, and BFloat16 data types.
|
||||
|
||||
!!! warning
|
||||
There are no pre-built wheels or images for this device, so you must build vLLM from source.
|
||||
|
||||
# --8<-- [end:installation]
|
||||
# --8<-- [start:requirements]
|
||||
|
||||
- OS: Linux
|
||||
- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended)
|
||||
- Instruction Set Architecture (ISA): NEON support is required
|
||||
|
||||
# --8<-- [end:requirements]
|
||||
# --8<-- [start:set-up-using-python]
|
||||
|
||||
# --8<-- [end:set-up-using-python]
|
||||
# --8<-- [start:pre-built-wheels]
|
||||
|
||||
# --8<-- [end:pre-built-wheels]
|
||||
# --8<-- [start:build-wheel-from-source]
|
||||
|
||||
--8<-- "docs/getting_started/installation/cpu/cpu/build.inc.md"
|
||||
|
||||
Testing has been conducted on AWS Graviton3 instances for compatibility.
|
||||
|
||||
# --8<-- [end:build-wheel-from-source]
|
||||
# --8<-- [start:set-up-using-docker]
|
||||
|
||||
# --8<-- [end:set-up-using-docker]
|
||||
# --8<-- [start:pre-built-images]
|
||||
|
||||
# --8<-- [end:pre-built-images]
|
||||
# --8<-- [start:build-image-from-source]
|
||||
|
||||
# --8<-- [end:build-image-from-source]
|
||||
# --8<-- [start:extra-information]
|
||||
# --8<-- [end:extra-information]
|
||||
@ -1,46 +0,0 @@
|
||||
# --8<-- [start:installation]
|
||||
|
||||
vLLM initially supports basic model inference and serving on the x86 CPU platform, with FP32, FP16, and BF16 data types.
|
||||
|
||||
!!! warning
|
||||
There are no pre-built wheels or images for this device, so you must build vLLM from source.
|
||||
|
||||
# --8<-- [end:installation]
|
||||
# --8<-- [start:requirements]
|
||||
|
||||
- OS: Linux
|
||||
- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended)
|
||||
- Instruction Set Architecture (ISA): AVX512 (optional, recommended)
|
||||
|
||||
!!! tip
|
||||
[Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date feature optimizations for an extra performance boost on Intel hardware.
|
||||
|
||||
# --8<-- [end:requirements]
|
||||
# --8<-- [start:set-up-using-python]
|
||||
|
||||
# --8<-- [end:set-up-using-python]
|
||||
# --8<-- [start:pre-built-wheels]
|
||||
|
||||
# --8<-- [end:pre-built-wheels]
|
||||
# --8<-- [start:build-wheel-from-source]
|
||||
|
||||
--8<-- "docs/getting_started/installation/cpu/cpu/build.inc.md"
|
||||
|
||||
!!! note
|
||||
- AVX512_BF16 is an ISA extension that provides native BF16 data type conversion and vector product instructions, bringing some performance improvement over plain AVX512. The CPU backend build script checks the host CPU flags to determine whether to enable AVX512_BF16.
- To force-enable AVX512_BF16 for cross-compilation, set the environment variable `VLLM_CPU_AVX512BF16=1` before building.
|
||||
|
||||
# --8<-- [end:build-wheel-from-source]
|
||||
# --8<-- [start:set-up-using-docker]
|
||||
|
||||
# --8<-- [end:set-up-using-docker]
|
||||
# --8<-- [start:pre-built-images]
|
||||
|
||||
See [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo)
|
||||
|
||||
# --8<-- [end:pre-built-images]
|
||||
# --8<-- [start:build-image-from-source]
|
||||
|
||||
# --8<-- [end:build-image-from-source]
|
||||
# --8<-- [start:extra-information]
|
||||
# --8<-- [end:extra-information]
|
||||
@ -1,124 +0,0 @@
|
||||
# GPU
|
||||
|
||||
vLLM is a Python library that supports the following GPU variants. Select your GPU type to see vendor-specific instructions:
|
||||
|
||||
=== "NVIDIA CUDA"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/cuda.inc.md:installation"
|
||||
|
||||
=== "AMD ROCm"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/rocm.inc.md:installation"
|
||||
|
||||
=== "Intel XPU"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/xpu.inc.md:installation"
|
||||
|
||||
## Requirements
|
||||
|
||||
- OS: Linux
|
||||
- Python: 3.9 -- 3.12
|
||||
|
||||
=== "NVIDIA CUDA"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/cuda.inc.md:requirements"
|
||||
|
||||
=== "AMD ROCm"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/rocm.inc.md:requirements"
|
||||
|
||||
=== "Intel XPU"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/xpu.inc.md:requirements"
|
||||
|
||||
## Set up using Python
|
||||
|
||||
### Create a new Python environment
|
||||
|
||||
--8<-- "docs/getting_started/installation/python_env_setup.inc.md"
|
||||
|
||||
=== "NVIDIA CUDA"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/cuda.inc.md:create-a-new-python-environment"
|
||||
|
||||
=== "AMD ROCm"
|
||||
|
||||
There is no extra information on creating a new Python environment for this device.
|
||||
|
||||
=== "Intel XPU"
|
||||
|
||||
There is no extra information on creating a new Python environment for this device.
|
||||
|
||||
### Pre-built wheels
|
||||
|
||||
=== "NVIDIA CUDA"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/cuda.inc.md:pre-built-wheels"
|
||||
|
||||
=== "AMD ROCm"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/rocm.inc.md:pre-built-wheels"
|
||||
|
||||
=== "Intel XPU"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-wheels"
|
||||
|
||||
[](){ #build-from-source }
|
||||
|
||||
### Build wheel from source
|
||||
|
||||
=== "NVIDIA CUDA"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-wheel-from-source"
|
||||
|
||||
=== "AMD ROCm"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-wheel-from-source"
|
||||
|
||||
=== "Intel XPU"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-wheel-from-source"
|
||||
|
||||
## Set up using Docker
|
||||
|
||||
### Pre-built images
|
||||
|
||||
=== "NVIDIA CUDA"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/cuda.inc.md:pre-built-images"
|
||||
|
||||
=== "AMD ROCm"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/rocm.inc.md:pre-built-images"
|
||||
|
||||
=== "Intel XPU"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-images"
|
||||
|
||||
### Build image from source
|
||||
|
||||
=== "NVIDIA CUDA"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-image-from-source"
|
||||
|
||||
=== "AMD ROCm"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-image-from-source"
|
||||
|
||||
=== "Intel XPU"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-image-from-source"
|
||||
|
||||
## Supported features
|
||||
|
||||
=== "NVIDIA CUDA"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/cuda.inc.md:supported-features"
|
||||
|
||||
=== "AMD ROCm"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/rocm.inc.md:supported-features"
|
||||
|
||||
=== "Intel XPU"
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu/xpu.inc.md:supported-features"
|
||||
@ -1,6 +0,0 @@
|
||||
It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
|
||||
|
||||
```console
|
||||
uv venv --python 3.12 --seed
|
||||
source .venv/bin/activate
|
||||
```
|
||||
35
docs/make.bat
Normal file
@ -0,0 +1,35 @@
|
||||
@ECHO OFF
|
||||
|
||||
pushd %~dp0
|
||||
|
||||
REM Command file for Sphinx documentation
|
||||
|
||||
if "%SPHINXBUILD%" == "" (
|
||||
set SPHINXBUILD=sphinx-build
|
||||
)
|
||||
set SOURCEDIR=source
|
||||
set BUILDDIR=build
|
||||
|
||||
%SPHINXBUILD% >NUL 2>NUL
|
||||
if errorlevel 9009 (
|
||||
echo.
|
||||
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
||||
echo.installed, then set the SPHINXBUILD environment variable to point
|
||||
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
||||
echo.may add the Sphinx directory to PATH.
|
||||
echo.
|
||||
echo.If you don't have Sphinx installed, grab it from
|
||||
echo.https://www.sphinx-doc.org/
|
||||
exit /b 1
|
||||
)
|
||||
|
||||
if "%1" == "" goto help
|
||||
|
||||
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||
goto end
|
||||
|
||||
:help
|
||||
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||
|
||||
:end
|
||||
popd
|
||||
@ -1,162 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import itertools
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
import regex as re
|
||||
|
||||
ROOT_DIR = Path(__file__).parent.parent.parent.parent
|
||||
ROOT_DIR_RELATIVE = '../../../../..'
|
||||
EXAMPLE_DIR = ROOT_DIR / "examples"
|
||||
EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples"
|
||||
print(ROOT_DIR.resolve())
|
||||
print(EXAMPLE_DIR.resolve())
|
||||
print(EXAMPLE_DOC_DIR.resolve())
|
||||
|
||||
|
||||
def fix_case(text: str) -> str:
|
||||
subs = {
|
||||
"api": "API",
|
||||
"cli": "CLI",
|
||||
"cpu": "CPU",
|
||||
"llm": "LLM",
|
||||
"mae": "MAE",
|
||||
"tpu": "TPU",
|
||||
"aqlm": "AQLM",
|
||||
"gguf": "GGUF",
|
||||
"lora": "LoRA",
|
||||
"rlhf": "RLHF",
|
||||
"vllm": "vLLM",
|
||||
"openai": "OpenAI",
|
||||
"lmcache": "LMCache",
|
||||
"multilora": "MultiLoRA",
|
||||
"mlpspeculator": "MLPSpeculator",
|
||||
r"fp\d+": lambda x: x.group(0).upper(), # e.g. fp16, fp32
|
||||
r"int\d+": lambda x: x.group(0).upper(), # e.g. int8, int16
|
||||
}
|
||||
for pattern, repl in subs.items():
|
||||
text = re.sub(rf'\b{pattern}\b', repl, text, flags=re.IGNORECASE)
|
||||
return text
|
||||
|
||||
|
||||
@dataclass
|
||||
class Example:
|
||||
"""
|
||||
Example class for generating documentation content from a given path.
|
||||
|
||||
Attributes:
|
||||
path (Path): The path to the main directory or file.
|
||||
category (str): The category of the document.
|
||||
main_file (Path): The main file in the directory.
|
||||
other_files (list[Path]): list of other files in the directory.
|
||||
title (str): The title of the document.
|
||||
|
||||
Methods:
|
||||
__post_init__(): Initializes the main_file, other_files, and title attributes.
|
||||
determine_main_file() -> Path: Determines the main file in the given path.
|
||||
determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file.
|
||||
determine_title() -> str: Determines the title of the document.
|
||||
generate() -> str: Generates the documentation content.
|
||||
""" # noqa: E501
|
||||
path: Path
|
||||
category: str = None
|
||||
main_file: Path = field(init=False)
|
||||
other_files: list[Path] = field(init=False)
|
||||
title: str = field(init=False)
|
||||
|
||||
def __post_init__(self):
|
||||
self.main_file = self.determine_main_file()
|
||||
self.other_files = self.determine_other_files()
|
||||
self.title = self.determine_title()
|
||||
|
||||
def determine_main_file(self) -> Path:
|
||||
"""
|
||||
Determines the main file in the given path.
|
||||
If the path is a file, it returns the path itself. Otherwise, it searches
|
||||
for Markdown files (*.md) in the directory and returns the first one found.
|
||||
Returns:
|
||||
Path: The main file path, either the original path if it's a file or the first
|
||||
Markdown file found in the directory.
|
||||
Raises:
|
||||
IndexError: If no Markdown files are found in the directory.
|
||||
""" # noqa: E501
|
||||
return self.path if self.path.is_file() else list(
|
||||
self.path.glob("*.md")).pop()
|
||||
|
||||
def determine_other_files(self) -> list[Path]:
|
||||
"""
|
||||
Determine other files in the directory excluding the main file.
|
||||
|
||||
This method checks if the given path is a file. If it is, it returns an empty list.
|
||||
Otherwise, it recursively searches through the directory and returns a list of all
|
||||
files that are not the main file.
|
||||
|
||||
Returns:
|
||||
list[Path]: A list of Path objects representing the other files in the directory.
|
||||
""" # noqa: E501
|
||||
if self.path.is_file():
|
||||
return []
|
||||
is_other_file = lambda file: file.is_file() and file != self.main_file
|
||||
return [file for file in self.path.rglob("*") if is_other_file(file)]
|
||||
|
||||
def determine_title(self) -> str:
|
||||
return fix_case(self.path.stem.replace("_", " ").title())
|
||||
|
||||
def generate(self) -> str:
|
||||
content = f"---\ntitle: {self.title}\n---\n\n"
|
||||
content += f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n"
|
||||
|
||||
# Use long code fence to avoid issues with
|
||||
# included files containing code fences too
|
||||
code_fence = "``````"
|
||||
is_code = self.main_file.suffix != ".md"
|
||||
if is_code:
|
||||
content += f"{code_fence}{self.main_file.suffix[1:]}\n"
|
||||
content += f'--8<-- "{self.main_file}"\n'
|
||||
if is_code:
|
||||
content += f"{code_fence}\n"
|
||||
content += "\n"
|
||||
|
||||
if not self.other_files:
|
||||
return content
|
||||
|
||||
content += "## Example materials\n\n"
|
||||
for file in sorted(self.other_files):
|
||||
content += f'??? abstract "{file.relative_to(self.path)}"\n'
|
||||
if file.suffix != ".md":
|
||||
content += f" {code_fence}{file.suffix[1:]}\n"
|
||||
content += f' --8<-- "{file}"\n'
|
||||
if file.suffix != ".md":
|
||||
content += f" {code_fence}\n"
|
||||
|
||||
return content
|
||||
|
||||
|
||||
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
# Create the EXAMPLE_DOC_DIR if it doesn't exist
|
||||
if not EXAMPLE_DOC_DIR.exists():
|
||||
EXAMPLE_DOC_DIR.mkdir(parents=True)
|
||||
|
||||
categories = sorted(p for p in EXAMPLE_DIR.iterdir() if p.is_dir())
|
||||
|
||||
examples = []
|
||||
glob_patterns = ["*.py", "*.md", "*.sh"]
|
||||
# Find categorised examples
|
||||
for category in categories:
|
||||
globs = [category.glob(pattern) for pattern in glob_patterns]
|
||||
for path in itertools.chain(*globs):
|
||||
examples.append(Example(path, category.stem))
|
||||
# Find examples in subdirectories
|
||||
for path in category.glob("*/*.md"):
|
||||
examples.append(Example(path.parent, category.stem))
|
||||
|
||||
# Generate the example documentation
|
||||
for example in sorted(examples, key=lambda e: e.path.stem):
|
||||
example_name = f"{example.path.stem}.md"
|
||||
doc_path = EXAMPLE_DOC_DIR / example.category / example_name
|
||||
print(doc_path)
|
||||
if not doc_path.parent.exists():
|
||||
doc_path.parent.mkdir(parents=True)
|
||||
with open(doc_path, "w+") as f:
|
||||
f.write(example.generate())
|
||||
@ -1,16 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import os
|
||||
from typing import Literal
|
||||
|
||||
|
||||
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
# see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
|
||||
if os.getenv('READTHEDOCS_VERSION_TYPE') == "tag":
|
||||
# remove the warning banner if the version is a tagged release
|
||||
docs_dir = os.path.dirname(__file__)
|
||||
announcement_path = os.path.join(docs_dir,
|
||||
"mkdocs/overrides/main.html")
|
||||
# The file might be removed already if the build is triggered multiple
|
||||
# times (readthedocs build both HTML and PDF versions separately)
|
||||
if os.path.exists(announcement_path):
|
||||
os.remove(announcement_path)
|
||||
@ -1,53 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import regex as re
|
||||
from mkdocs.config.defaults import MkDocsConfig
|
||||
from mkdocs.structure.files import Files
|
||||
from mkdocs.structure.pages import Page
|
||||
|
||||
|
||||
def on_page_markdown(markdown: str, *, page: Page, config: MkDocsConfig,
|
||||
files: Files):
|
||||
gh_icon = ":octicons-mark-github-16:"
|
||||
gh_url = "https://github.com"
|
||||
repo_url = f"{gh_url}/vllm-project/vllm"
|
||||
org_url = f"{gh_url}/orgs/vllm-project"
|
||||
urls = {
|
||||
"issue": f"{repo_url}/issues",
|
||||
"pr": f"{repo_url}/pull",
|
||||
"project": f"{org_url}/projects",
|
||||
"dir": f"{repo_url}/tree/main",
|
||||
"file": f"{repo_url}/blob/main",
|
||||
}
|
||||
titles = {
|
||||
"issue": "Issue #",
|
||||
"pr": "Pull Request #",
|
||||
"project": "Project #",
|
||||
"dir": "",
|
||||
"file": "",
|
||||
}
|
||||
|
||||
scheme = r"gh-(?P<type>.+?):(?P<path>.+?)(#(?P<fragment>.+?))?"
|
||||
inline_link = re.compile(r"\[(?P<title>[^\[]+?)\]\(" + scheme + r"\)")
|
||||
auto_link = re.compile(f"<{scheme}>")
|
||||
|
||||
def replace_inline_link(match: re.Match) -> str:
|
||||
url = f'{urls[match.group("type")]}/{match.group("path")}'
|
||||
if fragment := match.group("fragment"):
|
||||
url += f"#{fragment}"
|
||||
|
||||
return f'[{gh_icon} {match.group("title")}]({url})'
|
||||
|
||||
def replace_auto_link(match: re.Match) -> str:
|
||||
type = match.group("type")
|
||||
path = match.group("path")
|
||||
title = f"{titles[type]}{path}"
|
||||
url = f"{urls[type]}/{path}"
|
||||
if fragment := match.group("fragment"):
|
||||
url += f"#{fragment}"
|
||||
|
||||
return f"[{gh_icon} {title}]({url})"
|
||||
|
||||
markdown = inline_link.sub(replace_inline_link, markdown)
|
||||
markdown = auto_link.sub(replace_auto_link, markdown)
|
||||
|
||||
return markdown
|
||||
@ -1,5 +0,0 @@
|
||||
{% extends "base.html" %}
|
||||
|
||||
{% block announce %}
|
||||
<p>You are viewing the latest developer preview docs. <a href="https://docs.vllm.ai/en/stable/">Click here</a> to view docs for the latest stable release.</p>
|
||||
{% endblock %}
|
||||
@ -1,36 +0,0 @@
|
||||
/* Warning for latest docs */
|
||||
.md-banner {
|
||||
background-color: var(--md-warning-bg-color);
|
||||
color: var(--md-warning-fg-color);
|
||||
}
|
||||
|
||||
/* https://christianoliff.com/blog/styling-external-links-with-an-icon-in-css/ */
|
||||
a:not(:has(svg)):not(.md-icon):not(.autorefs-external) {
|
||||
align-items: center;
|
||||
|
||||
&[href^="//"]::after,
|
||||
&[href^="http://"]::after,
|
||||
&[href^="https://"]::after {
|
||||
content: "";
|
||||
width: 12px;
|
||||
height: 12px;
|
||||
margin-left: 4px;
|
||||
background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='16' height='16' stroke='gray' viewBox='0 0 16 16'%3E%3Cpath fill-rule='evenodd' d='M8.636 3.5a.5.5 0 0 0-.5-.5H1.5A1.5 1.5 0 0 0 0 4.5v10A1.5 1.5 0 0 0 1.5 16h10a1.5 1.5 0 0 0 1.5-1.5V7.864a.5.5 0 0 0-1 0V14.5a.5.5 0 0 1-.5.5h-10a.5.5 0 0 1-.5-.5v-10a.5.5 0 0 1 .5-.5h6.636a.5.5 0 0 0 .5-.5z'/%3E%3Cpath fill-rule='evenodd' d='M16 .5a.5.5 0 0 0-.5-.5h-5a.5.5 0 0 0 0 1h3.793L6.146 9.146a.5.5 0 1 0 .708.708L15 1.707V5.5a.5.5 0 0 0 1 0v-5z'/%3E%3C/svg%3E");
|
||||
background-position: center;
|
||||
background-repeat: no-repeat;
|
||||
background-size: contain;
|
||||
display: inline-block;
|
||||
}
|
||||
}
|
||||
|
||||
/* Light mode: darker section titles */
|
||||
body[data-md-color-scheme="default"] .md-nav__item--section > label.md-nav__link .md-ellipsis {
|
||||
color: rgba(0, 0, 0, 0.7) !important;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
/* Dark mode: lighter gray section titles */
|
||||
body[data-md-color-scheme="slate"] .md-nav__item--section > label.md-nav__link .md-ellipsis {
|
||||
color: rgba(255, 255, 255, 0.75) !important;
|
||||
font-weight: 700;
|
||||
}
|
||||
@ -1,690 +0,0 @@
|
||||
---
|
||||
title: Supported Models
|
||||
---
|
||||
[](){ #supported-models }
|
||||
|
||||
vLLM supports [generative](./generative_models.md) and [pooling](./pooling_models.md) models across various tasks.
|
||||
If a model supports more than one task, you can set the task via the `--task` argument.
|
||||
|
||||
For each task, we list the model architectures that have been implemented in vLLM.
|
||||
Alongside each architecture, we include some popular models that use it.
|
||||
|
||||
## Model Implementation
|
||||
|
||||
### vLLM
|
||||
|
||||
If vLLM natively supports a model, its implementation can be found in <gh-file:vllm/model_executor/models>.
|
||||
|
||||
These models are what we list in [supported-text-models][supported-text-models] and [supported-mm-models][supported-mm-models].
|
||||
|
||||
[](){ #transformers-backend }
|
||||
|
||||
### Transformers
|
||||
|
||||
vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models are supported, and vision language model support is planned!
|
||||
|
||||
To check if the modeling backend is Transformers, you can simply do this:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
llm = LLM(model=..., task="generate") # Name or path of your model
|
||||
llm.apply_model(lambda model: print(type(model)))
|
||||
```
|
||||
|
||||
If it is `TransformersForCausalLM` then it means it's based on Transformers!
|
||||
|
||||
!!! tip
|
||||
You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline-inference][offline-inference] or `--model-impl transformers` for the [openai-compatible-server][openai-compatible-server].
|
||||
|
||||
!!! note
|
||||
vLLM may not fully optimise the Transformers implementation, so you may see degraded performance when comparing a native model to a Transformers model in vLLM.
|
||||
|
||||
#### Custom models
|
||||
|
||||
If a model is supported natively by neither vLLM nor Transformers, it can still be used in vLLM!
|
||||
|
||||
For a model to be compatible with the Transformers backend for vLLM it must:
|
||||
|
||||
- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)):
|
||||
* The model directory must have the correct structure (e.g. `config.json` is present).
|
||||
* `config.json` must contain `auto_map.AutoModel`.
|
||||
- be a Transformers backend for vLLM compatible model (see [writing-custom-models][writing-custom-models]):
|
||||
* Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`).
|
||||
|
||||
If the compatible model is:
|
||||
|
||||
- on the Hugging Face Model Hub, simply set `trust_remote_code=True` for [offline-inference][offline-inference] or `--trust-remote-code` for the [openai-compatible-server][openai-compatible-server].
|
||||
- in a local directory, simply pass directory path to `model=<MODEL_DIR>` for [offline-inference][offline-inference] or `vllm serve <MODEL_DIR>` for the [openai-compatible-server][openai-compatible-server].
|
||||
|
||||
This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
|
||||
|
||||
[](){ #writing-custom-models }
|
||||
|
||||
#### Writing custom models
|
||||
|
||||
This section details the modifications needed to make a Transformers compatible custom model work with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created; see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models).)
|
||||
|
||||
To make your model compatible with the Transformers backend, it needs:
|
||||
|
||||
1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`.
|
||||
2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
|
||||
3. `MyModel` must contain `_supports_attention_backend = True`.
|
||||
|
||||
```python title="modeling_my_model.py"
|
||||
|
||||
from transformers import PreTrainedModel
# ALL_ATTENTION_FUNCTIONS maps attention implementation names to callables
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
from torch import nn
|
||||
|
||||
class MyAttention(nn.Module):
|
||||
|
||||
def forward(self, hidden_states, **kwargs):
|
||||
...
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
attn_output, attn_weights = attention_interface(
|
||||
self,
|
||||
query_states,
|
||||
key_states,
|
||||
value_states,
|
||||
**kwargs,
|
||||
)
|
||||
...
|
||||
|
||||
class MyModel(PreTrainedModel):
|
||||
_supports_attention_backend = True
|
||||
```
|
||||
|
||||
Here is what happens in the background when this model is loaded:
|
||||
|
||||
1. The config is loaded.
|
||||
2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
|
||||
3. `MyModel` is loaded into `TransformersForCausalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
|
||||
|
||||
That's it!
|
||||
|
||||
For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class:
|
||||
|
||||
```python title="configuration_my_model.py"
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
class MyConfig(PretrainedConfig):
|
||||
base_model_tp_plan = {
|
||||
"layers.*.self_attn.k_proj": "colwise",
|
||||
"layers.*.self_attn.v_proj": "colwise",
|
||||
"layers.*.self_attn.o_proj": "rowwise",
|
||||
"layers.*.mlp.gate_proj": "colwise",
|
||||
"layers.*.mlp.up_proj": "colwise",
|
||||
"layers.*.mlp.down_proj": "rowwise",
|
||||
}
|
||||
base_model_pp_plan = {
|
||||
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
|
||||
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
|
||||
"norm": (["hidden_states"], ["hidden_states"]),
|
||||
}
|
||||
```
|
||||
|
||||
- `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported).
|
||||
- `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s:
|
||||
* You only need to do this for layers which are not present on all pipeline stages
|
||||
* vLLM assumes that there will be only one `nn.ModuleList`, which is distributed across the pipeline stages
|
||||
* The `list` in the first element of the `tuple` contains the names of the input arguments
|
||||
* The `list` in the last element of the `tuple` contains the names of the variables the layer outputs to in your modeling code
|
||||
|
||||
## Loading a Model
|
||||
|
||||
### Hugging Face Hub
|
||||
|
||||
By default, vLLM loads models from [Hugging Face (HF) Hub](https://huggingface.co/models). To change the download path for models, you can set the `HF_HOME` environment variable; for more details, refer to [their official documentation](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhome).
|
||||
|
||||
To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository.
|
||||
If the `"architectures"` field contains a model architecture listed below, then it should be natively supported.
|
||||
|
||||
Models do not _need_ to be natively supported to be used in vLLM.
|
||||
The [Transformers backend][transformers-backend] enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
|
||||
|
||||
!!! tip
|
||||
The easiest way to check if your model is really supported at runtime is to run the program below:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
# For generative models (task=generate) only
|
||||
llm = LLM(model=..., task="generate") # Name or path of your model
|
||||
output = llm.generate("Hello, my name is")
|
||||
print(output)
|
||||
|
||||
# For pooling models (task={embed,classify,reward,score}) only
|
||||
llm = LLM(model=..., task="embed") # Name or path of your model
|
||||
output = llm.encode("Hello, my name is")
|
||||
print(output)
|
||||
```
|
||||
|
||||
If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported.
|
||||
|
||||
Otherwise, please refer to [Adding a New Model][new-model] for instructions on how to implement your model in vLLM.
|
||||
Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
|
||||
|
||||
#### Download a model
|
||||
|
||||
If you prefer, you can use the Hugging Face CLI to [download a model](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-download) or specific files from a model repository:
|
||||
|
||||
```console
|
||||
# Download a model
|
||||
huggingface-cli download HuggingFaceH4/zephyr-7b-beta
|
||||
|
||||
# Specify a custom cache directory
|
||||
huggingface-cli download HuggingFaceH4/zephyr-7b-beta --cache-dir ./path/to/cache
|
||||
|
||||
# Download a specific file from a model repo
|
||||
huggingface-cli download HuggingFaceH4/zephyr-7b-beta eval_results.json
|
||||
```
|
||||
|
||||
#### List the downloaded models
|
||||
|
||||
Use the Hugging Face CLI to [manage models](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#scan-your-cache) stored in local cache:
|
||||
|
||||
```console
|
||||
# List cached models
|
||||
huggingface-cli scan-cache
|
||||
|
||||
# Show detailed (verbose) output
|
||||
huggingface-cli scan-cache -v
|
||||
|
||||
# Specify a custom cache directory
|
||||
huggingface-cli scan-cache --dir ~/.cache/huggingface/hub
|
||||
```
|
||||
|
||||
#### Delete a cached model
|
||||
|
||||
Use the Hugging Face CLI to interactively [delete downloaded model](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#clean-your-cache) from the cache:
|
||||
|
||||
```console
|
||||
# The `delete-cache` command requires extra dependencies to work with the TUI.
|
||||
# Please run `pip install huggingface_hub[cli]` to install them.
|
||||
|
||||
# Launch the interactive TUI to select models to delete
|
||||
$ huggingface-cli delete-cache
|
||||
? Select revisions to delete: 1 revisions selected counting for 438.9M.
|
||||
○ None of the following (if selected, nothing will be deleted).
|
||||
Model BAAI/bge-base-en-v1.5 (438.9M, used 1 week ago)
|
||||
❯ ◉ a5beb1e3: main # modified 1 week ago
|
||||
|
||||
Model BAAI/bge-large-en-v1.5 (1.3G, used 1 week ago)
|
||||
○ d4aa6901: main # modified 1 week ago
|
||||
|
||||
Model BAAI/bge-reranker-base (1.1G, used 4 weeks ago)
|
||||
○ 2cfc18c9: main # modified 4 weeks ago
|
||||
|
||||
Press <space> to select, <enter> to validate and <ctrl+c> to quit without modification.
|
||||
|
||||
# Need to confirm after selected
|
||||
? Select revisions to delete: 1 revision(s) selected.
|
||||
? 1 revisions selected counting for 438.9M. Confirm deletion ? Yes
|
||||
Start deletion.
|
||||
Done. Deleted 1 repo(s) and 0 revision(s) for a total of 438.9M.
|
||||
```
|
||||
|
||||
#### Using a proxy
|
||||
|
||||
Here are some tips for loading/downloading models from Hugging Face using a proxy:
|
||||
|
||||
- Set the proxy globally for your session (or set it in the profile file):
|
||||
|
||||
```shell
|
||||
export http_proxy=http://your.proxy.server:port
|
||||
export https_proxy=http://your.proxy.server:port
|
||||
```
|
||||
|
||||
- Set the proxy for just the current command:
|
||||
|
||||
```shell
|
||||
https_proxy=http://your.proxy.server:port huggingface-cli download <model_name>
|
||||
|
||||
# or use vllm cmd directly
|
||||
https_proxy=http://your.proxy.server:port vllm serve <model_name> --disable-log-requests
|
||||
```
|
||||
|
||||
- Set the proxy in Python interpreter:
|
||||
|
||||
```python
|
||||
import os
|
||||
|
||||
os.environ['http_proxy'] = 'http://your.proxy.server:port'
|
||||
os.environ['https_proxy'] = 'http://your.proxy.server:port'
|
||||
```
|
||||
|
||||
### ModelScope
|
||||
|
||||
To use models from [ModelScope](https://www.modelscope.cn) instead of Hugging Face Hub, set an environment variable:
|
||||
|
||||
```shell
|
||||
export VLLM_USE_MODELSCOPE=True
|
||||
```
|
||||
|
||||
And use with `trust_remote_code=True`.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model=..., revision=..., task=..., trust_remote_code=True)
|
||||
|
||||
# For generative models (task=generate) only
|
||||
output = llm.generate("Hello, my name is")
|
||||
print(output)
|
||||
|
||||
# For pooling models (task={embed,classify,reward,score}) only
|
||||
output = llm.encode("Hello, my name is")
|
||||
print(output)
|
||||
```
|
||||
|
||||
[](){ #feature-status-legend }

## Feature Status Legend

- ✅︎ indicates that the feature is supported for the model.

- 🚧 indicates that the feature is planned but not yet supported for the model.

- ⚠️ indicates that the feature is available but may have known issues or limitations.

[](){ #supported-text-models }

## List of Text-only Language Models

### Generative Models

See [this page][generative-models] for more information on how to use generative models.

#### Text Generation

Specified using `--task generate`.

| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] |
|---|---|---|---|---|
| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ |
| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ |
| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ |
| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ |
| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ |
| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | |
| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ |
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ |
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ |
| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ |
| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | | ✅︎ |
| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | | ✅︎ |
| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3`, etc. | | ✅︎ |
| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ |
| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ |
| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ |
| `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ |
| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ |
| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ |
| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ |
| `GlmForCausalLM` | GLM-4 | `THUDM/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ |
| `Glm4ForCausalLM` | GLM-4-0414 | `THUDM/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ |
| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ |
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ |
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ |
| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ |
| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ |
| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ |
| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ |
| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ |
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ |
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ |
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ |
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ |
| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ |
| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ |
| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ |
| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ |
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ |
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ |
| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ |
| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | | ✅︎ |
| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | | ✅︎ |
| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ |
| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | ✅︎ |
| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ |
| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ |
| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ |
| `Phi3SmallForCausalLM` | Phi-3-Small | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. | | ✅︎ |
| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ |
| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ |
| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | |
| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ |
| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ |
| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | | ✅︎ |
| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ |
| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | | ✅︎ |
| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | |
| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ |
| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ |
| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ |
| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ |
| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ |
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | |
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | |

!!! note
    Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.

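As a quick end-to-end check, the sketch below runs one of the generative models above for offline inference; the small `facebook/opt-125m` checkpoint is used purely for illustration, and any model from the table can be substituted.

```python
from vllm import LLM, SamplingParams

# Load a text-generation model from the table above (a small one is chosen for illustration).
llm = LLM(model="facebook/opt-125m", task="generate")

# Greedy decoding keeps the output deterministic for a quick sanity check.
outputs = llm.generate(["Hello, my name is"], SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)
```
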
### Pooling Models

See [this page](./pooling_models.md) for more information on how to use pooling models.

!!! warning
    Since some model architectures support both generative and pooling tasks,
    you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode.

#### Text Embedding

Specified using `--task embed`.

| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] |
|---|---|---|---|---|
| `BertModel` | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | |
| `Gemma2Model` | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | |
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
| `GteModel` | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | |
| `GteNewModel` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | |
| `ModernBertModel` | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | |
| `NomicBertModel` | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | |
| `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ |
| `Qwen2Model`, `Qwen2ForCausalLM` | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ |
| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | |

!!! note
    `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
    You need to manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.

!!! note
    For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
    See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882).

!!! note
    `jinaai/jina-embeddings-v3` supports multiple tasks through LoRA, but vLLM currently only supports the text-matching task by merging the LoRA weights.

!!! note
    The second-generation GTE model (mGTE-TRM) is named `NewModel`. Because the name `NewModel` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewModel"]}'` to specify the use of the `GteNewModel` architecture.

If your model is not in the above list, we will try to automatically convert the model using
[as_embedding_model][vllm.model_executor.models.adapters.as_embedding_model]. By default, the embeddings
of the whole prompt are extracted from the normalized hidden state corresponding to the last token.

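For example, a minimal offline embedding sketch, assuming `intfloat/e5-mistral-7b-instruct` from the table above:

```python
from vllm import LLM

# Load the model in pooling mode, matching `--task embed`.
llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")

# The pooled embedding of the whole prompt is returned for each input.
outputs = llm.encode(["Hello, my name is"])
print(outputs[0])
```
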
#### Reward Modeling

Specified using `--task reward`.

| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] |
|---|---|---|---|---|
| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ |
| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ |
| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ |

If your model is not in the above list, we will try to automatically convert the model using
[as_reward_model][vllm.model_executor.models.adapters.as_reward_model]. By default, we return the hidden states of each token directly.

!!! warning
    For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
    e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.

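A hedged sketch of offline reward-model inference, assuming `internlm/internlm2-1_8b-reward` from the table above (its tokenizer may require `trust_remote_code=True`):

```python
from vllm import LLM

# Load the model in pooling mode, matching `--task reward`.
llm = LLM(model="internlm/internlm2-1_8b-reward", task="reward", trust_remote_code=True)

# By default, the per-token hidden states are returned for each prompt.
outputs = llm.encode(["vLLM is a high-throughput and memory-efficient inference engine."])
print(outputs[0])
```
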
#### Classification

Specified using `--task classify`.

| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] |
|---|---|---|---|---|
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ |

If your model is not in the above list, we will try to automatically convert the model using
[as_classification_model][vllm.model_executor.models.adapters.as_classification_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.

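For instance, a minimal classification sketch, assuming `ai21labs/Jamba-tiny-reward-dev` from the table above as a stand-in model; if your vLLM version lacks `LLM.classify`, the generic `LLM.encode` pooling entry point can be used instead.

```python
from vllm import LLM

# Load the model in pooling mode, matching `--task classify`.
llm = LLM(model="ai21labs/Jamba-tiny-reward-dev", task="classify")

# Class probabilities are derived from the last token's hidden state (see above).
outputs = llm.classify(["vLLM is a high-throughput inference engine."])
print(outputs[0])
```
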
#### Sentence Pair Scoring

Specified using `--task score`.

| Architecture | Models | Example HF Models |
|---|---|---|
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. |
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. |
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. |

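For example, a minimal cross-encoder scoring sketch, assuming `BAAI/bge-reranker-v2-m3` from the table above; the exact output layout is an assumption to verify against your vLLM version.

```python
from vllm import LLM

# Load the cross-encoder in pooling mode, matching `--task score`.
llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")

# Score how relevant each passage is to the query.
outputs = llm.score(
    "What is the capital of France?",
    ["Paris is the capital of France.", "vLLM is an inference engine."],
)
print([output.outputs.score for output in outputs])
```
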
[](){ #supported-mm-models }

## List of Multimodal Language Models

The following modalities are supported depending on the model:

- **T**ext
- **I**mage
- **V**ideo
- **A**udio

Any combination of modalities joined by `+` is supported.

- e.g.: `T + I` means that the model supports text-only, image-only, and text-with-image inputs.

On the other hand, modalities separated by `/` are mutually exclusive.

- e.g.: `T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.

See [this page][multimodal-inputs] on how to pass multi-modal inputs to the model.

!!! warning
    **To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference)
    or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt:

    Offline inference:

    ```python
    from vllm import LLM

    llm = LLM(
        model="Qwen/Qwen2-VL-7B-Instruct",
        limit_mm_per_prompt={"image": 4},
    )
    ```

    Online serving:

    ```bash
    vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt '{"image":4}'
    ```

    **This is no longer required if you are using vLLM V1.**

!!! note
    vLLM currently only supports adding LoRA to the language backbone of multimodal models.

### Generative Models

See [this page][generative-models] for more information on how to use generative models.

#### Text Generation

Specified using `--task generate`.

| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|---|---|---|---|---|---|---|
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | | ✅︎ |
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ |
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ |
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b` etc. | | ✅︎ | ✅︎ |
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. | | ✅︎ | ✅︎ |
| `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large` etc. | | | |
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b` etc. | | ✅︎ | ✅︎ |
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc. | ✅︎ | ✅︎ | ✅︎ |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎\* |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | | ✅︎ |
| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | | ✅︎ | ✅︎ |
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
| `LlavaForConditionalGeneration` | LLaVA-1.5 | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | | ✅︎ | ✅︎ |
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ |
| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ |
| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ |
| `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | | ✅︎ |
| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | |
| `Mistral3ForConditionalGeneration` | Mistral3 | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | |
| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ |
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ |
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `PixtralForConditionalGeneration` | Pixtral | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ |
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎\* |
| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |

<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
    For example, to use the DeepSeek-VL2 series of models:
    `--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'`
<sup>E</sup> Pre-computed embeddings can be inputted for this modality.
<sup>+</sup> Multiple items can be inputted per text prompt for this modality.

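For example, a minimal sketch of offline image + text inference, assuming `Qwen/Qwen2-VL-7B-Instruct` from the table above; the chat-formatted prompt with Qwen2-VL's image placeholder tokens is written out by hand here, and `example.jpg` is a placeholder path.

```python
from PIL import Image

from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2-VL-7B-Instruct", limit_mm_per_prompt={"image": 1})

# The image placeholder is model-specific; Qwen2-VL expects it inside vision tags.
prompt = ("<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
          "Describe this image.<|im_end|>\n<|im_start|>assistant\n")

image = Image.open("example.jpg")  # placeholder path for a local image
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```
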
!!! warning
    Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs.
    However, there are differences in how they handle text + image inputs:

    V0 correctly implements the model's attention pattern:
    - Uses bidirectional attention between the image tokens corresponding to the same image
    - Uses causal attention for other tokens
    - Implemented via (naive) PyTorch SDPA with masking tensors
    - Note: May use significant memory for long prompts with images

    V1 currently uses a simplified attention pattern:
    - Uses causal attention for all tokens, including image tokens
    - Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}`
    - Will be updated in the future to support the correct behavior

    This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.

!!! note
    Only `InternVLChatModel` with a Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B`, etc.) currently supports video inputs.

!!! note
    `h2oai/h2ovl-mississippi-2b` will be available in V1 once we support head size 80.

!!! note
    To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.

!!! warning
    The output quality of `allenai/Molmo-7B-D-0924` (especially in object localization tasks) has deteriorated in recent updates.

    For the best results, we recommend using the following dependency versions (tested on A10 and L40):

    ```text
    # Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40)
    torch==2.5.1
    torchvision==0.20.1
    transformers==4.48.1
    tokenizers==0.21.0
    tiktoken==0.7.0
    vllm==0.7.0

    # Optional but recommended for improved performance and stability
    triton==3.1.0
    xformers==0.0.28.post3
    uvloop==0.21.0
    protobuf==5.29.3
    openai==1.60.2
    opencv-python-headless==4.11.0.86
    pillow==10.4.0

    # Installed FlashAttention (for float16 only)
    flash-attn>=2.5.6 # Not used in float32, but should be documented
    ```

    **Note:** Make sure you understand the security implications of using outdated packages.

!!! note
    The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
    For more details, please see: <gh-pr:4087#issuecomment-2250397630>

!!! warning
    Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.

!!! note
    To use Qwen2.5-Omni, you have to install the Hugging Face Transformers library from source via
    `pip install git+https://github.com/huggingface/transformers.git`.

    Reading audio from video pre-processing (enabled via `--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently supported in V0 only, because overlapping modalities are not yet supported in V1.

### Pooling Models

See [this page](./pooling_models.md) for more information on how to use pooling models.

!!! warning
    Since some model architectures support both generative and pooling tasks,
    you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode.

#### Text Embedding

Specified using `--task embed`.

Any text generation model can be converted into an embedding model by passing `--task embed`.

!!! note
    To get the best results, you should use pooling models that are specifically trained as such.

The following table lists those that are tested in vLLM.

| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] |
|---|---|---|---|---|---|
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | |
| `Phi3VForCausalLM` | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ |

#### Transcription

Specified using `--task transcription`.

Speech2Text models trained specifically for Automatic Speech Recognition.

| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] |
|---|---|---|---|---|

---

## Model Support Policy

At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support:

1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated!

2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results.

    !!! tip
        When comparing the output of `model.generate` from Hugging Face Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.

3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback.

4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use.

5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement.

Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem.

Note that, as an inference engine, vLLM does not introduce new models. Therefore, all models supported by vLLM are third-party models in this regard.

We have the following levels of testing for models:

1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test.
2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:examples) for the models that have passed this test.
4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.

docs/seed_parameter_behavior.md (new file)
@ -0,0 +1,51 @@
# Seed Parameter Behavior in vLLM

## Overview

The `seed` parameter in vLLM is used to control the random states for various random number generators. This parameter can affect the behavior of random operations in user code, especially when working with models in vLLM.

## Default Behavior

By default, the `seed` parameter is set to `None`. When the `seed` parameter is `None`, the global random states for `random`, `np.random`, and `torch.manual_seed` are not set. This means that the random operations will behave as expected, without any fixed random states.

## Specifying a Seed

If a specific seed value is provided, the global random states for `random`, `np.random`, and `torch.manual_seed` will be set accordingly. This can be useful for reproducibility, as it ensures that the random operations produce the same results across multiple runs.

## Example Usage

### Without Specifying a Seed

```python
import random
from vllm import LLM

# Initialize a vLLM model without specifying a seed
model = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")

# Try generating random numbers
print(random.randint(0, 100))  # Outputs different numbers across runs
```

### Specifying a Seed

```python
import random
from vllm import LLM

# Initialize a vLLM model with a specific seed
model = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", seed=42)

# Try generating random numbers
print(random.randint(0, 100))  # Outputs the same number across runs
```

## Important Notes

- If the `seed` parameter is not specified, the behavior of global random states remains unaffected.
- If a specific seed value is provided, the global random states for `random`, `np.random`, and `torch.manual_seed` will be set to that value.
- This behavior can be useful for reproducibility but may lead to non-intuitive behavior if the user is not explicitly aware of it.

## Conclusion

Understanding the behavior of the `seed` parameter in vLLM is crucial for ensuring the expected behavior of random operations in your code. By default, the `seed` parameter is set to `None`, which means that the global random states are not affected. However, specifying a seed value can help achieve reproducibility in your experiments.

@ -1,29 +0,0 @@
---
title: Offline Inference
---
[](){ #offline-inference }

You can run vLLM in your own code on a list of prompts.

The offline API is based on the [LLM][vllm.LLM] class.
To initialize the vLLM engine, create a new instance of `LLM` and specify the model to run.

For example, the following code downloads the [`facebook/opt-125m`](https://huggingface.co/facebook/opt-125m) model from HuggingFace
and runs it in vLLM using the default configuration.

```python
from vllm import LLM

llm = LLM(model="facebook/opt-125m")
```

After initializing the `LLM` instance, you can perform model inference using various APIs.
The available APIs depend on the type of model that is being run:

- [Generative models][generative-models] output logprobs which are sampled from to obtain the final output text.
- [Pooling models][pooling-models] output their hidden states directly.

Please refer to the above pages for more details about each API.

!!! info
    [API Reference][offline-inference-api]

docs/source/_static/custom.css (new file)
@ -0,0 +1,8 @@
.vertical-table-header th.head:not(.stub) {
  writing-mode: sideways-lr;
  white-space: nowrap;
  max-width: 0;
  p {
    margin: 0;
  }
}
@ -17,3 +17,22 @@ document.addEventListener("DOMContentLoaded", function () {
  script.async = true;
  document.head.appendChild(script);
});

// Update URL search params when tab is clicked
document.addEventListener("DOMContentLoaded", function () {
  const tabs = document.querySelectorAll(".sd-tab-label");

  function updateURL(tab) {
    const syncGroup = tab.getAttribute("data-sync-group");
    const syncId = tab.getAttribute("data-sync-id");
    if (syncGroup && syncId) {
      const url = new URL(window.location);
      url.searchParams.set(syncGroup, syncId);
      window.history.replaceState(null, "", url);
    }
  }

  tabs.forEach(tab => {
    tab.addEventListener("click", () => updateURL(tab));
  });
});
docs/source/_templates/sections/header.html (new file)
@ -0,0 +1,39 @@
<style>
.notification-bar {
  width: 100vw;
  display: flex;
  justify-content: center;
  align-items: center;
  font-size: 16px;
  padding: 0 6px 0 6px;
}
.notification-bar p {
  margin: 0;
}
.notification-bar a {
  font-weight: bold;
  text-decoration: none;
}

/* Light mode styles (default) */
.notification-bar {
  background-color: #fff3cd;
  color: #856404;
}
.notification-bar a {
  color: #d97706;
}

/* Dark mode styles */
html[data-theme=dark] .notification-bar {
  background-color: #333;
  color: #ddd;
}
html[data-theme=dark] .notification-bar a {
  color: #ffa500; /* Brighter color for visibility */
}
</style>

<div class="notification-bar">
  <p>You are viewing the latest developer preview docs. <a href="https://docs.vllm.ai/en/stable/">Click here</a> to view docs for the latest stable release.</p>
</div>