Compare commits
135 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 1236aebf0e | |||
| ca2f6b9c30 | |||
| 20133cfee2 | |||
| ebb1ec9318 | |||
| 5b168b6d7a | |||
| 9760fd8f6a | |||
| b9f61e1387 | |||
| d6fd3a33b8 | |||
| 432ec9926e | |||
| 2b102d51ad | |||
| aa54a7bf7b | |||
| 2ad6194a02 | |||
| c594cbf565 | |||
| a35ca765a5 | |||
| 6aa8f9a4e7 | |||
| 1bc86a3da1 | |||
| bbfa0c61d1 | |||
| 20079c6e36 | |||
| 9a1b9b99d7 | |||
| 8bf507d766 | |||
| 306d60401d | |||
| f2c3f66d59 | |||
| 0f5e0d567e | |||
| c55d804672 | |||
| 749f5bdd38 | |||
| 2a50ef5760 | |||
| b8b904795d | |||
| ba5111f237 | |||
| 1e123529d7 | |||
| dff80b0e42 | |||
| 7782464a17 | |||
| 0f71e24034 | |||
| 1dab4d5718 | |||
| 7f21e8052b | |||
| 5a8641638a | |||
| f49239cb45 | |||
| 2dbe8c0774 | |||
| 84ec470fca | |||
| b29ca5c4d5 | |||
| ec6833c5e9 | |||
| e1fadf1197 | |||
| 43ff405b90 | |||
| fba02e3bd1 | |||
| 4577fc9abb | |||
| 5f1d0c8118 | |||
| c3bb9f2331 | |||
| 8f8900cee9 | |||
| 6acb7a6285 | |||
| 4f4a6b844a | |||
| 4d0a1541be | |||
| 77b6e74fe2 | |||
| 5acf828d99 | |||
| 3987e2ae96 | |||
| 77164dad5e | |||
| 95c40f9b09 | |||
| 3de3eadf5b | |||
| 3132290a14 | |||
| 1aa2f81b43 | |||
| d54af615d5 | |||
| a0efd3106c | |||
| e69879996f | |||
| a1cc9f33a3 | |||
| a521ef06e5 | |||
| 922165cba3 | |||
| 12ea698498 | |||
| 64eaf5fe05 | |||
| d1d61f3351 | |||
| 32ce3cf7c9 | |||
| d58f9c7f7a | |||
| c29034037d | |||
| 1b7cfd5a36 | |||
| da4b69d0b4 | |||
| c9479b2920 | |||
| 6f2909405e | |||
| b169d5f7b6 | |||
| f8977c233f | |||
| f274581f44 | |||
| 0b1447f890 | |||
| 24d0ef8970 | |||
| 7fcfd954ff | |||
| e740d07f07 | |||
| a652e71dd0 | |||
| 34d6c447c4 | |||
| 972eddf7c9 | |||
| fd7bb88d72 | |||
| 3c49dbdd03 | |||
| 1661a9c28f | |||
| 8e882ffdc0 | |||
| 26b4fa45be | |||
| 515b413ebf | |||
| caca0b718a | |||
| d86e3f0172 | |||
| 3ca8322b74 | |||
| 03b41b6cad | |||
| cad6447664 | |||
| c169b05541 | |||
| 468d16654a | |||
| 909f234faa | |||
| f8510587c2 | |||
| 9cfebf51ba | |||
| 77f95b99a6 | |||
| bbe888d033 | |||
| 25ed6738d4 | |||
| e568e401da | |||
| 269d901734 | |||
| 7951d78738 | |||
| 6dbe5b5c93 | |||
| 643622ba46 | |||
| a09c7ca9f2 | |||
| 0e98964e94 | |||
| c68b5c63eb | |||
| fced756923 | |||
| 321331b8ae | |||
| 6e4cea1cc5 | |||
| 435fa95444 | |||
| 4c2b38ce9e | |||
| d781930f90 | |||
| ce75efeecb | |||
| aa42561e40 | |||
| de65fc8e1e | |||
| 0c492b7824 | |||
| 0f0926b43f | |||
| 7f2c1a87e9 | |||
| b78f844a67 | |||
| 5e13c07d00 | |||
| 774c5fde30 | |||
| 9a21e331ff | |||
| 3e9ce609bd | |||
| 794ae1f551 | |||
| d73a9457a5 | |||
| a3896c7f02 | |||
| 51e98e4ffd | |||
| e56f44d9ec | |||
| e0cbad4e30 | |||
| b48d5cca16 |
@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do
|
||||
|
||||
### Visualizing the results
|
||||
|
||||
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
|
||||
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
|
||||
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
|
||||
If you do not see the table, please wait until the benchmark finishes running.
|
||||
The JSON version of the table (together with the JSON version of the benchmark) will also be attached to the markdown file.
|
||||
|
||||
@ -2,102 +2,180 @@
|
||||
|
||||
set -xu
|
||||
|
||||
|
||||
remove_docker_container() {
  # Force-remove the containers this script may have left behind.
  # A missing container is not an error, hence the "|| true" on each removal.
  local container
  for container in tpu-test vllm-tpu; do
    docker rm -f "$container" || true
  done
}
|
||||
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
# Remove the container that might not be cleaned up in the previous run.
|
||||
remove_docker_container
|
||||
|
||||
# Build the docker image.
|
||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
||||
|
||||
# Set up cleanup.
|
||||
remove_docker_container() { docker rm -f tpu-test || true; }
|
||||
trap remove_docker_container EXIT
|
||||
# Remove the container that might not be cleaned up in the previous run.
|
||||
remove_docker_container
|
||||
cleanup_docker() {
  # Free Docker disk space when the filesystem that holds Docker's data root
  # is more than $threshold percent full; otherwise leave everything alone.
  # Exits the whole script with status 1 if Docker's root directory cannot be
  # determined. Sets the globals docker_root, disk_usage and threshold.

  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"

  # Check disk usage of the filesystem where Docker's root directory is located.
  # -P forces the POSIX layout (one line per filesystem), so a long device name
  # cannot wrap the record and shift the capacity out of column 5.
  disk_usage=$(df -P "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')

  # Define the threshold
  threshold=70

  # Guard the integer comparison below: an empty or non-numeric value would
  # make "[ ... -gt ... ]" fail and fall through to the misleading
  # "below threshold" branch.
  case "$disk_usage" in
    '' | *[!0-9]*)
      echo "Could not determine disk usage for $docker_root (got '$disk_usage'). Skipping Docker cleanup."
      return 0
      ;;
  esac

  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}
|
||||
cleanup_docker
|
||||
|
||||
# For HF_TOKEN.
|
||||
source /etc/environment
|
||||
# Run a simple end-to-end example.
|
||||
|
||||
docker run --privileged --net host --shm-size=16G -it \
|
||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
||||
&& export VLLM_XLA_CACHE_PATH= \
|
||||
&& export VLLM_USE_V1=1 \
|
||||
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
|
||||
&& echo HARDWARE \
|
||||
&& tpu-info \
|
||||
&& { \
|
||||
echo TEST_0: Running test_perf.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
|
||||
echo TEST_0_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_1: Running test_compilation.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
|
||||
echo TEST_1_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_2: Running test_basic.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
|
||||
echo TEST_2_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
|
||||
echo TEST_3_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_4: Running test_quantization_accuracy.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
|
||||
echo TEST_4_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_5: Running examples/offline_inference/tpu.py; \
|
||||
python3 /workspace/vllm/examples/offline_inference/tpu.py; \
|
||||
echo TEST_5_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_6: Running test_tpu_model_runner.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
|
||||
echo TEST_6_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_7: Running test_sampler.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
|
||||
echo TEST_7_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_8: Running test_topk_topp_sampler.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
|
||||
echo TEST_8_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_9: Running test_multimodal.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
|
||||
echo TEST_9_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_10: Running test_pallas.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
|
||||
echo TEST_10_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_11: Running test_struct_output_generate.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
|
||||
echo TEST_11_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_12: Running test_moe_pallas.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
|
||||
echo TEST_12_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
# Disable the TPU LoRA tests until the feature is activated
|
||||
# & { \
|
||||
# echo TEST_13: Running test_moe_pallas.py; \
|
||||
# python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
|
||||
# echo TEST_13_EXIT_CODE: \$?; \
|
||||
# } & \
|
||||
wait \
|
||||
&& echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
|
||||
"
|
||||
vllm-tpu /bin/bash -c '
|
||||
set -e # Exit immediately if a command exits with a non-zero status.
|
||||
set -u # Treat unset variables as an error.
|
||||
|
||||
echo "--- Starting script inside Docker container ---"
|
||||
|
||||
# Create results directory
|
||||
RESULTS_DIR=$(mktemp -d)
|
||||
# If mktemp fails, set -e will cause the script to exit.
|
||||
echo "Results will be stored in: $RESULTS_DIR"
|
||||
|
||||
# Install dependencies
|
||||
echo "--- Installing Python dependencies ---"
|
||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
|
||||
echo "--- Python dependencies installed ---"
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||
export VLLM_XLA_CACHE_PATH=
|
||||
echo "Using VLLM V1"
|
||||
|
||||
echo "--- Hardware Information ---"
|
||||
tpu-info
|
||||
echo "--- Starting Tests ---"
|
||||
set +e
|
||||
overall_script_exit_code=0
|
||||
|
||||
# --- Test Definitions ---
|
||||
# If a test fails, this function will print logs and will not cause the main script to exit.
|
||||
run_test() {
|
||||
local test_num=$1
|
||||
local test_name=$2
|
||||
local test_command=$3
|
||||
local log_file="$RESULTS_DIR/test_${test_num}.log"
|
||||
local actual_exit_code
|
||||
|
||||
echo "--- TEST_$test_num: Running $test_name ---"
|
||||
|
||||
# Execute the test command.
|
||||
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
|
||||
actual_exit_code=$?
|
||||
|
||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
|
||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
|
||||
|
||||
if [ "$actual_exit_code" -ne 0 ]; then
|
||||
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
|
||||
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
|
||||
if [ -f "$log_file" ]; then
|
||||
cat "$log_file" >&2
|
||||
else
|
||||
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
|
||||
fi
|
||||
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
|
||||
return "$actual_exit_code" # Return the failure code
|
||||
else
|
||||
echo "TEST_$test_num ($test_name) PASSED."
|
||||
return 0 # Return success
|
||||
fi
|
||||
}
|
||||
|
||||
# Helper function to call run_test and update the overall script exit code
|
||||
run_and_track_test() {
|
||||
local test_num_arg="$1"
|
||||
local test_name_arg="$2"
|
||||
local test_command_arg="$3"
|
||||
|
||||
# Run the test
|
||||
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
|
||||
local test_specific_exit_code=$?
|
||||
|
||||
# If the test failed, set the overall script exit code to 1
|
||||
if [ "$test_specific_exit_code" -ne 0 ]; then
|
||||
# No need for extra echo here, run_test already logged the failure.
|
||||
overall_script_exit_code=1
|
||||
fi
|
||||
}
|
||||
|
||||
# --- Actual Test Execution ---
|
||||
run_and_track_test 0 "test_perf.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
|
||||
run_and_track_test 1 "test_compilation.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
|
||||
run_and_track_test 2 "test_basic.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
|
||||
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
|
||||
run_and_track_test 4 "test_quantization_accuracy.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
|
||||
run_and_track_test 5 "examples/offline_inference/tpu.py" \
|
||||
"python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
||||
run_and_track_test 6 "test_tpu_model_runner.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
|
||||
run_and_track_test 7 "test_sampler.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
|
||||
run_and_track_test 8 "test_topk_topp_sampler.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
|
||||
run_and_track_test 9 "test_multimodal.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
|
||||
run_and_track_test 10 "test_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
|
||||
run_and_track_test 11 "test_struct_output_generate.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
|
||||
run_and_track_test 12 "test_moe_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
||||
run_and_track_test 13 "test_lora.py" \
|
||||
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
||||
|
||||
# After all tests have been attempted, exit with the overall status.
|
||||
if [ "$overall_script_exit_code" -ne 0 ]; then
|
||||
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
|
||||
else
|
||||
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
|
||||
fi
|
||||
exit "$overall_script_exit_code"
|
||||
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
|
||||
|
||||
# Capture the exit code of the docker run command
|
||||
DOCKER_RUN_EXIT_CODE=$?
|
||||
|
||||
# The trap will run for cleanup.
|
||||
# Exit the main script with the Docker run command's exit code.
|
||||
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
|
||||
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
|
||||
exit "$DOCKER_RUN_EXIT_CODE"
|
||||
else
|
||||
echo "Docker run command completed successfully."
|
||||
exit 0
|
||||
fi
|
||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
|
||||
@ -199,8 +199,9 @@ steps:
|
||||
- tests/test_sequence
|
||||
- tests/test_config
|
||||
- tests/test_logger
|
||||
- tests/test_vllm_port
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||
# OOM in the CI unless we run this separately
|
||||
- pytest -v -s tokenization
|
||||
|
||||
@ -274,17 +275,6 @@ steps:
|
||||
- pytest -v -s samplers
|
||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||
|
||||
- label: LogitsProcessor Test # 5min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers
|
||||
- vllm/model_executor/guided_decoding
|
||||
- tests/test_logits_processor
|
||||
- tests/model_executor/test_guided_processors
|
||||
commands:
|
||||
- pytest -v -s test_logits_processor.py
|
||||
- pytest -v -s model_executor/test_guided_processors.py
|
||||
|
||||
- label: Speculative decoding tests # 40min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
@ -397,6 +387,17 @@ steps:
|
||||
- pytest -v -s tensorizer_loader
|
||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
|
||||
- label: Model Executor Test
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor
|
||||
- tests/model_executor
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s model_executor
|
||||
|
||||
- label: Benchmarks # 9min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
working_dir: "/vllm-workspace/.buildkite"
|
||||
@ -617,9 +618,11 @@ steps:
|
||||
- vllm/worker/model_runner.py
|
||||
- entrypoints/llm/test_collective_rpc.py
|
||||
- tests/v1/test_async_llm_dp.py
|
||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- vllm/v1/engine/
|
||||
commands:
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
|
||||
@ -58,7 +58,7 @@ repos:
|
||||
entry: tools/mypy.sh 0 "local"
|
||||
language: python
|
||||
types: [python]
|
||||
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
|
||||
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
|
||||
stages: [pre-commit] # Don't run in CI
|
||||
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||
name: Run mypy for Python 3.9
|
||||
|
||||
@ -23,6 +23,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
|
||||
# Suppress potential warnings about unused manually-specified variables
|
||||
set(ignoreMe "${VLLM_PYTHON_PATH}")
|
||||
|
||||
# Prevent installation of dependencies (cutlass) by default.
|
||||
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
||||
|
||||
#
|
||||
# Supported python versions. These versions will be searched in order, the
|
||||
# first match will be selected. These should be kept in sync with setup.py.
|
||||
@ -785,5 +788,7 @@ endif()
|
||||
# For CUDA we also build and ship some external projects.
|
||||
if (VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
include(cmake/external_projects/flashmla.cmake)
|
||||
|
||||
# vllm-flash-attn should be last as it overwrites some CMake functions
|
||||
include(cmake/external_projects/vllm_flash_attn.cmake)
|
||||
endif ()
|
||||
|
||||
@ -8,4 +8,6 @@ Please report security issues privately using [the vulnerability submission form
|
||||
|
||||
---
|
||||
|
||||
Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
|
||||
|
||||
Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
|
||||
|
||||
@ -64,6 +64,12 @@ become available.
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Custom</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td>Local file: <code>data.jsonl</code></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
@ -124,6 +130,38 @@ P99 ITL (ms): 8.39
|
||||
==================================================
|
||||
```
|
||||
|
||||
### Custom Dataset
|
||||
If the dataset you want to benchmark is not yet supported in vLLM, you can still benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format, and each entry needs to have a "prompt" field, e.g., `data.jsonl`:
|
||||
|
||||
```
|
||||
{"prompt": "What is the capital of India?"}
|
||||
{"prompt": "What is the capital of Iran?"}
|
||||
{"prompt": "What is the capital of China?"}
|
||||
```
|
||||
|
||||
```bash
|
||||
# start server
|
||||
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
|
||||
```
|
||||
|
||||
```bash
|
||||
# run benchmarking script
|
||||
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
|
||||
--backend vllm \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--endpoint /v1/completions \
|
||||
--dataset-name custom \
|
||||
--dataset-path <path-to-your-data-jsonl> \
|
||||
--custom-skip-chat-template \
|
||||
--num-prompts 80 \
|
||||
--max-concurrency 1 \
|
||||
--temperature=0.3 \
|
||||
--top-p=0.75 \
|
||||
--result-dir "./log/"
|
||||
```
|
||||
|
||||
If the prompts in your data already have the chat template applied, pass `--custom-skip-chat-template` to skip applying it again.
|
||||
|
||||
### VisionArena Benchmark for Vision Language Models
|
||||
|
||||
```bash
|
||||
@ -146,9 +184,9 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
||||
|
||||
``` bash
|
||||
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--ngram_prompt_lookup_min 2 \
|
||||
--ngram-prompt-lookup-max 5 \
|
||||
--speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
|
||||
--speculative-config $'{"method": "ngram",
|
||||
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 2}'
|
||||
```
|
||||
|
||||
``` bash
|
||||
@ -203,6 +241,16 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--seed 42
|
||||
```
|
||||
|
||||
**`philschmid/mt-bench`**
|
||||
|
||||
``` bash
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--model Qwen/QwQ-32B \
|
||||
--dataset-name hf \
|
||||
--dataset-path philschmid/mt-bench \
|
||||
--num-prompts 80
|
||||
```
|
||||
|
||||
### Running With Sampling Parameters
|
||||
|
||||
When using OpenAI-compatible backends such as `vllm`, optional sampling
|
||||
@ -273,9 +321,9 @@ python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
--output-len=100 \
|
||||
--num-prompts=2048 \
|
||||
--async-engine \
|
||||
--ngram_prompt_lookup_min=2 \
|
||||
--ngram-prompt-lookup-max=5 \
|
||||
--speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
|
||||
--speculative-config $'{"method": "ngram",
|
||||
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 2}'
|
||||
```
|
||||
|
||||
```
|
||||
|
||||
@ -324,7 +324,7 @@ async def async_request_openai_completions(
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
generated_text += text or ""
|
||||
elif usage := data.get("usage"):
|
||||
if usage := data.get("usage"):
|
||||
output.output_tokens = usage.get("completion_tokens")
|
||||
if first_chunk_received:
|
||||
output.success = True
|
||||
@ -611,6 +611,7 @@ ASYNC_REQUEST_FUNCS = {
|
||||
"tensorrt-llm": async_request_trt_llm,
|
||||
"scalellm": async_request_openai_completions,
|
||||
"sglang": async_request_openai_completions,
|
||||
"llama.cpp": async_request_openai_completions,
|
||||
}
|
||||
|
||||
OPENAI_COMPATIBLE_BACKENDS = [
|
||||
|
||||
@ -9,9 +9,6 @@ generation. Supported dataset types include:
|
||||
- BurstGPT
|
||||
- HuggingFace
|
||||
- VisionArena
|
||||
|
||||
TODO: Implement CustomDataset to parse a JSON file and convert its contents into
|
||||
SampleRequest instances, similar to the approach used in ShareGPT.
|
||||
"""
|
||||
|
||||
import base64
|
||||
@ -442,6 +439,97 @@ class ShareGPTDataset(BenchmarkDataset):
|
||||
return samples
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Custom Dataset Implementation
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class CustomDataset(BenchmarkDataset):
    """
    Implements the Custom dataset. Loads prompts from a JSONL file and turns
    each entry into one sample request. Every line of the file must be a JSON
    object with a "prompt" field, e.g.,
    ```
    {"prompt": "What is the capital of India?"}
    {"prompt": "What is the capital of Iran?"}
    {"prompt": "What is the capital of China?"}
    ```
    """

    def __init__(self, **kwargs) -> None:
        """Forward all keyword arguments to the base class, then load the file."""
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        """Read ``self.dataset_path`` into ``self.data`` and shuffle it.

        Raises:
            ValueError: if ``dataset_path`` is None, the file has no
                "prompt" column, or any entry lacks a "prompt" value.
            NotImplementedError: if the path does not end in ".jsonl".
        """
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        # self.data will be a list of dictionaries
        # e.g., [{"prompt": "What is the capital of India?"}, ...]
        # This is the standardized format which load_data() has to produce
        # regardless of the filetype of dataset_path; sample() relies on it.
        self.data = []

        if not self.dataset_path.endswith(".jsonl"):
            raise NotImplementedError(
                "Only JSONL format is supported for CustomDataset."
            )

        # Load the JSONL file (one JSON object per line).
        jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)

        # check if the JSONL file has a 'prompt' column
        if "prompt" not in jsonl_data.columns:
            raise ValueError("JSONL file must contain a 'prompt' column.")

        # A line without "prompt" shows up as a missing value in the column.
        # Fail here instead of handing a non-string prompt to the tokenizer.
        if jsonl_data["prompt"].isna().any():
            raise ValueError(
                "Every entry in the JSONL file must have a non-null 'prompt' field."
            )

        # One dictionary per row, without a Python-level per-row loop.
        self.data = jsonl_data.to_dict(orient="records")

        # Seeds the module-level RNG so the shuffle order is reproducible
        # for a given random_seed.
        random.seed(self.random_seed)
        random.shuffle(self.data)

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        skip_chat_template: bool = False,
        **kwargs,
    ) -> list:
        """Build up to ``num_requests`` sample requests from the loaded prompts.

        Args:
            tokenizer: used to apply the chat template (unless skipped) and
                to count prompt tokens.
            num_requests: number of requests wanted.
            lora_path, max_loras, enable_multimodal_chat: accepted but not
                used by this implementation.
            output_len: stored as ``expected_output_len`` on every request.
            skip_chat_template: when True, prompts are used verbatim.
            **kwargs: ignored.

        Returns:
            The list of ``SampleRequest`` objects.
        """
        sampled_requests = []
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            prompt = item["prompt"]

            # apply template
            if not skip_chat_template:
                prompt = tokenizer.apply_chat_template(
                    [{"role": "user", "content": prompt}],
                    add_generation_prompt=True,
                    tokenize=False,
                )

            prompt_len = len(tokenizer(prompt).input_ids)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                )
            )

        # Presumably tops the list up to num_requests when the file holds
        # fewer prompts than requested; the helper is defined on the base
        # class, so verify there.
        self.maybe_oversample_requests(sampled_requests, num_requests)

        return sampled_requests
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Sonnet Dataset Implementation
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
@ -6,13 +6,12 @@ import dataclasses
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
import vllm.envs as envs
|
||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
@ -80,17 +79,9 @@ def main(args: argparse.Namespace):
|
||||
|
||||
def run_to_completion(profile_dir: Optional[str] = None):
|
||||
if profile_dir:
|
||||
with torch.profiler.profile(
|
||||
activities=[
|
||||
torch.profiler.ProfilerActivity.CPU,
|
||||
torch.profiler.ProfilerActivity.CUDA,
|
||||
],
|
||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||
str(profile_dir)
|
||||
),
|
||||
) as p:
|
||||
llm_generate()
|
||||
print(p.key_averages().table(sort_by="self_cuda_time_total"))
|
||||
llm.start_profile()
|
||||
llm_generate()
|
||||
llm.stop_profile()
|
||||
else:
|
||||
start_time = time.perf_counter()
|
||||
llm_generate()
|
||||
@ -103,11 +94,7 @@ def main(args: argparse.Namespace):
|
||||
run_to_completion(profile_dir=None)
|
||||
|
||||
if args.profile:
|
||||
profile_dir = args.profile_result_dir
|
||||
if not profile_dir:
|
||||
profile_dir = (
|
||||
Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"
|
||||
)
|
||||
profile_dir = envs.VLLM_TORCH_PROFILER_DIR
|
||||
print(f"Profiling (results will be saved to '{profile_dir}')...")
|
||||
run_to_completion(profile_dir=profile_dir)
|
||||
return
|
||||
@ -164,15 +151,6 @@ if __name__ == "__main__":
|
||||
action="store_true",
|
||||
help="profile the generation process of a single batch",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--profile-result-dir",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"path to save the pytorch profiler output. Can be visualized "
|
||||
"with ui.perfetto.dev or Tensorboard."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-json",
|
||||
type=str,
|
||||
@ -193,4 +171,9 @@ if __name__ == "__main__":
|
||||
# numbers. We need to disable prefix caching by default.
|
||||
parser.set_defaults(enable_prefix_caching=False)
|
||||
args = parser.parse_args()
|
||||
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
|
||||
raise OSError(
|
||||
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
|
||||
"Please set it to a valid path to use torch profiler."
|
||||
)
|
||||
main(args)
|
||||
|
||||
@ -60,6 +60,7 @@ from benchmark_dataset import (
|
||||
ASRDataset,
|
||||
BurstGPTDataset,
|
||||
ConversationDataset,
|
||||
CustomDataset,
|
||||
HuggingFaceDataset,
|
||||
InstructCoderDataset,
|
||||
MTBenchDataset,
|
||||
@ -627,7 +628,16 @@ def main(args: argparse.Namespace):
|
||||
"'--dataset-path' if required."
|
||||
)
|
||||
|
||||
if args.dataset_name == "sonnet":
|
||||
if args.dataset_name == "custom":
|
||||
dataset = CustomDataset(dataset_path=args.dataset_path)
|
||||
input_requests = dataset.sample(
|
||||
num_requests=args.num_prompts,
|
||||
tokenizer=tokenizer,
|
||||
output_len=args.custom_output_len,
|
||||
skip_chat_template=args.custom_skip_chat_template,
|
||||
)
|
||||
|
||||
elif args.dataset_name == "sonnet":
|
||||
dataset = SonnetDataset(dataset_path=args.dataset_path)
|
||||
# For the "sonnet" dataset, formatting depends on the backend.
|
||||
if args.backend == "openai-chat":
|
||||
@ -762,6 +772,10 @@ def main(args: argparse.Namespace):
|
||||
if "temperature" not in sampling_params:
|
||||
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
|
||||
|
||||
if args.backend == "llama.cpp":
|
||||
# Disable prompt caching in llama.cpp backend
|
||||
sampling_params["cache_prompt"] = False
|
||||
|
||||
# Avoid GC processing "static" data - reduce pause times.
|
||||
gc.collect()
|
||||
gc.freeze()
|
||||
@ -834,6 +848,8 @@ def main(args: argparse.Namespace):
|
||||
]:
|
||||
if field in result_json:
|
||||
del result_json[field]
|
||||
if field in benchmark_result:
|
||||
del benchmark_result[field]
|
||||
|
||||
# Save to file
|
||||
base_model_id = model_id.split("/")[-1]
|
||||
@ -846,6 +862,7 @@ def main(args: argparse.Namespace):
|
||||
if args.result_filename:
|
||||
file_name = args.result_filename
|
||||
if args.result_dir:
|
||||
os.makedirs(args.result_dir, exist_ok=True)
|
||||
file_name = os.path.join(args.result_dir, file_name)
|
||||
with open(
|
||||
file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
|
||||
@ -886,7 +903,7 @@ if __name__ == "__main__":
|
||||
"--dataset-name",
|
||||
type=str,
|
||||
default="sharegpt",
|
||||
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
|
||||
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
|
||||
help="Name of the dataset to benchmark on.",
|
||||
)
|
||||
parser.add_argument(
|
||||
@ -1056,6 +1073,19 @@ if __name__ == "__main__":
|
||||
)
|
||||
|
||||
# group for dataset specific arguments
|
||||
custom_group = parser.add_argument_group("custom dataset options")
|
||||
custom_group.add_argument(
|
||||
"--custom-output-len",
|
||||
type=int,
|
||||
default=256,
|
||||
help="Number of output tokens per request, used only for custom dataset.",
|
||||
)
|
||||
custom_group.add_argument(
|
||||
"--custom-skip-chat-template",
|
||||
action="store_true",
|
||||
help="Skip applying chat template to prompt, used only for custom dataset.",
|
||||
)
|
||||
|
||||
sonnet_group = parser.add_argument_group("sonnet dataset options")
|
||||
sonnet_group.add_argument(
|
||||
"--sonnet-input-len",
|
||||
|
||||
222
benchmarks/kernels/bench_fp8_gemm.py
Normal file
222
benchmarks/kernels/bench_fp8_gemm.py
Normal file
@ -0,0 +1,222 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import argparse
|
||||
import copy
|
||||
import itertools
|
||||
|
||||
import torch
|
||||
import triton
|
||||
from weight_shapes import WEIGHT_SHAPES
|
||||
|
||||
from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
|
||||
from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
|
||||
|
||||
|
||||
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
        x_log=False,
        line_arg="provider",
        line_vals=[
            "torch-bf16",
            # "fp8-tensor-w-token-a",
            "fp8-tensor-w-tensor-a",
            "fp8-channel-w-token-a",
            # "fp8-channel-w-tensor-a",
            # "fp8-tensor-w-token-a-noquant",
            "fp8-tensor-w-tensor-a-noquant",
            "fp8-channel-w-token-a-noquant",
            # "fp8-channel-w-tensor-a-noquant",
        ],
        line_names=[
            "torch-bf16",
            # "fp8-tensor-w-token-a",
            "fp8-tensor-w-tensor-a",
            "fp8-channel-w-token-a",
            # "fp8-channel-w-tensor-a",
            # "fp8-tensor-w-token-a-noquant",
            "fp8-tensor-w-tensor-a-noquant",
            "fp8-channel-w-token-a-noquant",
            # "fp8-channel-w-tensor-a-noquant",
        ],
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs FP8 GEMMs",
        args={},
    )
)
def benchmark(batch_size, provider, N, K):
    """Benchmark one (batch_size, provider) point of an M x K by K x N GEMM.

    Args:
        batch_size: Number of activation rows (M).
        provider: Name of the implementation to time. "torch-bf16" times
            ``torch.nn.functional.linear`` on BF16 tensors. Names containing
            "fp8" select the weight granularity ("tensor-w" / "channel-w")
            and the activation granularity ("tensor-a" / "token-a"). A
            "noquant" suffix quantizes the activations once up front so that
            only the GEMM itself is timed.
        N: Output features (rows of the weight matrix).
        K: Reduction dimension.

    Returns:
        A tuple of TFLOP/s values computed from the median, the 80th
        percentile and the 20th percentile of the measured times, in that
        order (i.e. typical, lower and upper throughput).

    Raises:
        ValueError: If ``provider`` does not name a known configuration.
    """
    M = batch_size
    device = "cuda"
    dtype = torch.bfloat16

    # Create input tensors
    a = torch.randn((M, K), device=device, dtype=dtype)
    b = torch.randn((N, K), device=device, dtype=dtype)

    quantiles = [0.5, 0.2, 0.8]

    if "torch-bf16" in provider:
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
        )

    elif "fp8" in provider:
        per_channel_w = "channel-w" in provider
        per_token_a = "token-a" in provider
        if not (per_channel_w or "tensor-w" in provider) or not (
            per_token_a or "tensor-a" in provider
        ):
            raise ValueError(f"Unknown FP8 provider: {provider!r}")

        def unit_scale():
            # Fixed scale of 1.0 used for every static quantization below.
            return torch.tensor([1.0], device=device, dtype=torch.float32)

        # Weights are always quantized ahead of time.
        if per_channel_w:
            # Static per-channel quantization for weights. Quantize with a
            # single unit scale, then broadcast it to one entry per output
            # channel. The scale must be 1.0, not N: torch.tensor((N,))
            # would build a one-element tensor holding the value N, which
            # scales almost every weight down to zero in FP8.
            b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, unit_scale())
            scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
            assert scale_b_fp8.numel() == N
        elif per_token_a:
            # Per-tensor quant for B with a dynamically computed scale.
            b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
            assert scale_b_fp8.numel() == 1
        else:
            # Static per-tensor quantization with a fixed scale.
            b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, unit_scale())
            assert scale_b_fp8.numel() == 1

        # Activation quantization: dynamic per-token or static per-tensor.
        if per_token_a:

            def quantize_a():
                return vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)

        else:
            scale_a = unit_scale()

            def quantize_a():
                return vllm_scaled_fp8_quant(a, scale_a)

        # The scaled-mm call below is given the transposed weight.
        b_fp8 = b_fp8.t()

        if "noquant" in provider:
            # Activations are quantized once here, so only the GEMM is timed.
            a_fp8, scale_a_fp8 = quantize_a()

            def run_quant():
                return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)

        else:
            # Activation quantization is part of the timed region.
            def run_quant():
                a_fp8, scale_a_fp8 = quantize_a()
                return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)

        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            run_quant, quantiles=quantiles
        )

    else:
        raise ValueError(f"Unknown provider: {provider!r}")

    def tflops(elapsed_ms):
        # Two flops per multiply-add; times are in milliseconds.
        return (2 * M * N * K) * 1e-12 / (elapsed_ms * 1e-3)

    # Slower time means lower throughput, so max_ms yields the lower bound.
    return tflops(ms), tflops(max_ms), tflops(min_ms)
|
||||
|
||||
|
||||
def prepare_shapes(args):
    """Build the list of GEMM shapes to benchmark.

    For every combination of ``args.models`` and ``args.tp_sizes``, each
    weight shape registered for the model in ``WEIGHT_SHAPES`` has its
    tensor-parallel split dimension divided (floor division) by the TP size.

    Returns:
        A list of ``[K, N, model_name]`` entries, ordered by model, then TP
        size, then the order of shapes in ``WEIGHT_SHAPES``.
    """
    shapes = []
    for model in args.models:
        for tp_size in args.tp_sizes:
            assert model in WEIGHT_SHAPES
            # Work on a deep copy so the shared shape table is never mutated.
            for dims, split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
                dims[split_dim] //= tp_size
                shapes.append(dims + [model])
    return shapes
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command line: which models and tensor-parallel sizes to sweep.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=["meta-llama/Llama-3.1-8B-Instruct"],
        choices=list(WEIGHT_SHAPES),
        help="List of models to benchmark",
    )
    cli.add_argument(
        "--tp-sizes",
        nargs="+",
        type=int,
        default=[1],
        help="List of tensor parallel sizes",
    )
    args = cli.parse_args()

    # Run one benchmark sweep (over batch sizes and providers) per shape.
    for K, N, model_name in prepare_shapes(args):
        print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
        benchmark.run(
            print_data=True,
            show_plots=True,
            save_path=f"bench_fp8_res_n{N}_k{K}",
            N=N,
            K=K,
        )

    print("Benchmark finished!")
|
||||
@ -22,7 +22,7 @@ def benchmark_rope_kernels_multi_lora(
|
||||
seed: int,
|
||||
device: str,
|
||||
max_position: int = 8192,
|
||||
base: int = 10000,
|
||||
base: float = 10000,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
|
||||
@ -48,4 +48,50 @@ WEIGHT_SHAPES = {
|
||||
([16384, 106496], 1),
|
||||
([53248, 16384], 0),
|
||||
],
|
||||
"meta-llama/Llama-3.1-8B-Instruct": [
|
||||
([4096, 6144], 1),
|
||||
([4096, 4096], 0),
|
||||
([4096, 28672], 1),
|
||||
([14336, 4096], 0),
|
||||
],
|
||||
"meta-llama/Llama-3.3-70B-Instruct": [
|
||||
([8192, 10240], 1),
|
||||
([8192, 8192], 0),
|
||||
([8192, 57344], 1),
|
||||
([28672, 8192], 0),
|
||||
],
|
||||
"mistralai/Mistral-Large-Instruct-2407": [
|
||||
([12288, 14336], 1),
|
||||
([12288, 12288], 0),
|
||||
([12288, 57344], 1),
|
||||
([28672, 12288], 0),
|
||||
],
|
||||
"Qwen/Qwen2.5-7B-Instruct": [
|
||||
([3584, 4608], 1),
|
||||
([3584, 3584], 0),
|
||||
([3584, 37888], 1),
|
||||
([18944, 3584], 0),
|
||||
],
|
||||
"Qwen/Qwen2.5-32B-Instruct": [
|
||||
([5120, 7168], 1),
|
||||
([5120, 5120], 0),
|
||||
([5120, 55296], 1),
|
||||
([27648, 5120], 0),
|
||||
],
|
||||
"Qwen/Qwen2.5-72B-Instruct": [
|
||||
([8192, 10240], 1),
|
||||
([8192, 8192], 0),
|
||||
([8192, 59136], 1),
|
||||
([29568, 8192], 0),
|
||||
],
|
||||
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
|
||||
([2048, 3072], 1),
|
||||
([2048, 4096], 1),
|
||||
([2048, 2048], 0),
|
||||
([2048, 576], 0),
|
||||
([2048, 21888], 1),
|
||||
([10944, 2048], 0),
|
||||
([2048, 2816], 1),
|
||||
([1408, 2048], 0),
|
||||
],
|
||||
}
|
||||
|
||||
@ -46,22 +46,38 @@ else()
|
||||
endif()
|
||||
|
||||
|
||||
# Ensure the vllm/vllm_flash_attn directory exists before installation
|
||||
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS)
|
||||
|
||||
# Make sure vllm-flash-attn install rules are nested under vllm/
|
||||
# This is here to support installing all components under the same prefix with cmake --install.
|
||||
# setup.py installs every component separately but uses the same prefix for all.
|
||||
# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3,
|
||||
# and these statements don't hurt when installing neither component.
|
||||
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS)
|
||||
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
|
||||
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS)
|
||||
|
||||
# Fetch the vllm-flash-attn library
|
||||
FetchContent_MakeAvailable(vllm-flash-attn)
|
||||
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
|
||||
|
||||
# Restore the install prefix
|
||||
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
|
||||
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
||||
|
||||
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
|
||||
# case only one is built, in the case both are built redundant work is done)
|
||||
install(
|
||||
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||
DESTINATION vllm_flash_attn
|
||||
DESTINATION vllm/vllm_flash_attn
|
||||
COMPONENT _vllm_fa2_C
|
||||
FILES_MATCHING PATTERN "*.py"
|
||||
)
|
||||
|
||||
install(
|
||||
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||
DESTINATION vllm_flash_attn
|
||||
DESTINATION vllm/vllm_flash_attn
|
||||
COMPONENT _vllm_fa3_C
|
||||
FILES_MATCHING PATTERN "*.py"
|
||||
)
|
||||
|
||||
@ -76,7 +76,7 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
|
||||
set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
|
||||
add_custom_target(
|
||||
hipify${NAME}
|
||||
COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
|
||||
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
|
||||
DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
|
||||
BYPRODUCTS ${HIP_SRCS}
|
||||
COMMENT "Running hipify on ${NAME} extension source files.")
|
||||
|
||||
@ -13,14 +13,34 @@
|
||||
#include "dispatch_utils.h"
|
||||
#include "quantization/fp8/common.cuh"
|
||||
|
||||
#if defined(__HIPCC__) && (defined(__gfx90a__) || defined(__gfx942__))
|
||||
#define __HIP__MI300_MI250__
|
||||
#if defined(__HIPCC__) && \
|
||||
(defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))
|
||||
#define __HIP__GFX9__
|
||||
#endif
|
||||
|
||||
#if defined(__HIPCC__) && defined(__gfx942__)
|
||||
#define __HIP__MI300__
|
||||
#if defined(__HIPCC__) && (defined(__gfx942__) || defined(__gfx950__))
|
||||
#define __HIP__MI3XX__
|
||||
#endif
|
||||
|
||||
#if defined(__gfx950__)
|
||||
#define LDS_SIZE 160 * 1024
|
||||
#else
|
||||
#define LDS_SIZE 64 * 1024
|
||||
#endif
|
||||
|
||||
int get_lds_size() {
|
||||
static bool is_cached = false;
|
||||
static int result;
|
||||
if (is_cached == false) {
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
std::string device_arch = dprops->gcnArchName;
|
||||
size_t substring = device_arch.find("gfx95");
|
||||
result = (substring == std::string::npos ? 64 * 1024 : 160 * 1024);
|
||||
is_cached = true;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
#if defined(NDEBUG)
|
||||
#undef NDEBUG
|
||||
#include <assert.h>
|
||||
@ -267,7 +287,7 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
|
||||
V0 += (s.x + s.y); \
|
||||
}
|
||||
|
||||
#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
|
||||
// This version targets cases where A[] fits LDS capacity
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
@ -275,7 +295,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
|
||||
const scalar_t* __restrict__ A, scalar_t* C,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
#if defined(__HIP__MI300__)
|
||||
constexpr int max_lds_len = LDS_SIZE / 2;
|
||||
#if defined(__HIP__MI3XX__)
|
||||
constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
|
||||
#else
|
||||
constexpr bool use_mfma = false;
|
||||
@ -295,13 +316,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
};
|
||||
|
||||
//----------------------------------------------------
|
||||
// Reserving 64 KB of LDS to have 1 WG / CU
|
||||
// Reserving 64/160 KB of LDS to have 1 WG / CU
|
||||
// Goal is to bring the activation matrix A to the LDS
|
||||
// and use it across the lifetime of the work group
|
||||
// TODO: When activation matrix is larger than 64 KB
|
||||
// then this is not going to work!
|
||||
//----------------------------------------------------
|
||||
__shared__ scalar_t s[1024 * 32];
|
||||
__shared__ scalar_t s[max_lds_len];
|
||||
|
||||
//----------------------------------------------------
|
||||
// Fetch the activation matrix to LDS
|
||||
@ -312,11 +333,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
// - Then the WG will move to another 8 K elements
|
||||
// TODO: Logic below will only work when K is multiple of 8
|
||||
//----------------------------------------------------
|
||||
for (uint32_t k = 0; k < min(K * N, 32 * 1024);
|
||||
for (uint32_t k = 0; k < min(K * N, max_lds_len);
|
||||
k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
||||
|
||||
if (k_in >= min(K * N, 32 * 1024)) break;
|
||||
if (k_in >= min(K * N, max_lds_len)) break;
|
||||
|
||||
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
||||
}
|
||||
@ -517,7 +538,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
m += CuCount * _WvPrGrp * YTILE;
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
__global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
|
||||
@ -525,9 +546,9 @@ __global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
|
||||
#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
|
||||
// This version targets cases where A[] marginally exceeds LDS capacity
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
@ -535,7 +556,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
wvSplitK_hf_(const int K, const int M, const scalar_t* B,
|
||||
const scalar_t* __restrict__ A, scalar_t* C,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
#if defined(__HIP__MI300__)
|
||||
constexpr int max_lds_len = LDS_SIZE / 2;
|
||||
#if defined(__HIP__MI3XX__)
|
||||
constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
|
||||
#else
|
||||
constexpr bool use_mfma = false;
|
||||
@ -561,7 +583,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
// TODO: When activation matrix is larger than 64 KB
|
||||
// then this is not going to work!
|
||||
//----------------------------------------------------
|
||||
__shared__ scalar_t s[1024 * 32];
|
||||
__shared__ scalar_t s[max_lds_len];
|
||||
|
||||
//----------------------------------------------------
|
||||
// Computation of columns that need to be committed to memory!
|
||||
@ -598,11 +620,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
// - Then the WG will move to another 8 K elements
|
||||
// TODO: Logic below will only work when K is multiple of 8
|
||||
//----------------------------------------------------
|
||||
for (uint32_t k = 0; k < min(K * N, 32 * 1024);
|
||||
for (uint32_t k = 0; k < min(K * N, max_lds_len);
|
||||
k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
||||
|
||||
if (k_in >= min(K * N, 32 * 1024)) break;
|
||||
if (k_in >= min(K * N, max_lds_len)) break;
|
||||
|
||||
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
||||
}
|
||||
@ -686,7 +708,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
// Fetch A activation matrix in interleaved fashion from LDS or memory
|
||||
|
||||
for (int n = 0; n < N; n++) {
|
||||
if (k_ + K * n < 32 * 1024)
|
||||
if (k_ + K * n < max_lds_len)
|
||||
bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
|
||||
else
|
||||
bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
|
||||
@ -817,7 +839,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
|
||||
#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
__global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B,
|
||||
@ -825,9 +847,9 @@ __global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
|
||||
#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
|
||||
// This version targets big A[] cases, where it is much larger than LDS capacity
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
@ -835,7 +857,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
|
||||
const scalar_t* __restrict__ A, scalar_t* C,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
#if defined(__HIP__MI300__)
|
||||
constexpr int max_lds_len = LDS_SIZE / 2;
|
||||
#if defined(__HIP__MI3XX__)
|
||||
constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
|
||||
#else
|
||||
constexpr bool use_mfma = false;
|
||||
@ -855,13 +878,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
};
|
||||
|
||||
//----------------------------------------------------
|
||||
// Reserving 64 KB of LDS to have 1 WG / CU
|
||||
// Reserving 64/160 KB of LDS to have 1 WG / CU
|
||||
// Goal is to bring the activation matrix A to the LDS
|
||||
// and use it across the lifetime of the work group
|
||||
// TODO: When activation matrix is larger than 64 KB
|
||||
// then this is not going to work!
|
||||
//----------------------------------------------------
|
||||
__shared__ scalar_t s[1024 * 32];
|
||||
__shared__ scalar_t s[max_lds_len];
|
||||
|
||||
//----------------------------------------------------
|
||||
// Computation of columns that need to be committed to memory!
|
||||
@ -902,11 +925,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
//----------------------------------------------------
|
||||
#define PCML
|
||||
#ifndef PCML
|
||||
for (uint32_t k = 0; k < min(K * N, 32 * 1024);
|
||||
for (uint32_t k = 0; k < min(K * N, max_lds_len);
|
||||
k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
||||
|
||||
if (k_in >= min(K * N, 32 * 1024)) break;
|
||||
if (k_in >= min(K * N, max_lds_len)) break;
|
||||
|
||||
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
||||
}
|
||||
@ -916,7 +939,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
#define TUC (THRDS * UNRL * A_CHUNK)
|
||||
uint32_t kBase = 0;
|
||||
// find biggest k size that fits in LDS
|
||||
uint32_t kFit = (32 * 1024) / N;
|
||||
uint32_t kFit = (max_lds_len) / N;
|
||||
// kFit = (kFit%TWC==0) ? kFit : (kFit-kFit%TWC+TWC); //round up to multiple
|
||||
// of TUC
|
||||
kFit = (kFit % TUC == 0)
|
||||
@ -1164,7 +1187,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
__global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
|
||||
@ -1172,7 +1195,7 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
|
||||
int mindiv(int N, int div1, int div2) {
|
||||
int nPrRnd = div1 * div2;
|
||||
@ -1222,17 +1245,18 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
|
||||
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
const int max_lds_len = get_lds_size() / 2;
|
||||
|
||||
#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
|
||||
_N) \
|
||||
{ \
|
||||
dim3 block(64, _WvPrGrp); \
|
||||
if ((K_in * N_in <= 32 * 1024) && (M_in % _YTILEs == 0)) { \
|
||||
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
|
||||
wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
|
||||
CuCount); \
|
||||
} else if (K_in * N_in <= 32 * 1024 * 1.2) { \
|
||||
} else if (K_in * N_in <= max_lds_len * 1.2) { \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
|
||||
wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
|
||||
@ -1272,7 +1296,7 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
|
||||
return out_c;
|
||||
}
|
||||
|
||||
#if defined(__HIP__MI300__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__MI3XX__) // TODO: Add NAVI support
|
||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||
int A_CHUNK, int UNRL, int N>
|
||||
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
@ -1281,6 +1305,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
const float* __restrict__ s_A,
|
||||
const float* __restrict__ s_B, const int _WvPrGrp,
|
||||
const int CuCount) {
|
||||
constexpr int max_lds_len = LDS_SIZE;
|
||||
using scalar8 =
|
||||
__attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
|
||||
using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
||||
@ -1296,10 +1321,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
scalar8 h8;
|
||||
};
|
||||
|
||||
__shared__ fp8_t s[1024 * 64];
|
||||
__shared__ fp8_t s[max_lds_len];
|
||||
|
||||
for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
|
||||
k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
*((bigType*)(&s[k])) = *((bigType*)(&A[k]));
|
||||
}
|
||||
__syncthreads();
|
||||
@ -1436,7 +1461,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
m += CuCount * _WvPrGrp * YTILE;
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__MI300__) TODO: Add NAVI support
|
||||
#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||
int A_CHUNK, int UNRL, int N>
|
||||
__global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
|
||||
@ -1446,9 +1471,9 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__MI300__) TODO: Add NAVI support
|
||||
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||
|
||||
#if defined(__HIP__MI300__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__MI3XX__) // TODO: Add NAVI support
|
||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||
int A_CHUNK, int UNRL, int N>
|
||||
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
@ -1456,6 +1481,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
const fp8_t* __restrict__ A, scalar_t* C,
|
||||
const float* __restrict__ s_A, const float* __restrict__ s_B,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
constexpr int max_lds_len = LDS_SIZE;
|
||||
using scalar8 =
|
||||
__attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
|
||||
using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
||||
@ -1471,10 +1497,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
scalar8 h8;
|
||||
};
|
||||
|
||||
__shared__ fp8_t s[1024 * 64];
|
||||
__shared__ fp8_t s[max_lds_len];
|
||||
|
||||
for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
|
||||
k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
*((bigType*)(&s[k])) = *((bigType*)(&A[k]));
|
||||
}
|
||||
__syncthreads();
|
||||
@ -1517,7 +1543,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
uint32_t k_ = k + threadIdx.x * A_CHUNK;
|
||||
if (k_ >= K) break;
|
||||
for (int n = 0; n < N; n++) {
|
||||
if (k_ + K * n < 64 * 1024)
|
||||
if (k_ + K * n < max_lds_len)
|
||||
bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
|
||||
else
|
||||
bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
|
||||
@ -1608,7 +1634,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
m += CuCount * _WvPrGrp * YTILE;
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__MI300__) TODO: Add NAVI support
|
||||
#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||
int A_CHUNK, int UNRL, int N>
|
||||
__global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
|
||||
@ -1618,7 +1644,7 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
|
||||
const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__MI300__) TODO: Add NAVI support
|
||||
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||
|
||||
void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
|
||||
at::Tensor& scale_a, at::Tensor& scale_b,
|
||||
@ -1638,12 +1664,13 @@ void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
|
||||
dim3 grid(CuCount);
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
const int max_lds_len = get_lds_size();
|
||||
|
||||
#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
|
||||
_N) \
|
||||
{ \
|
||||
dim3 block(64, _WvPrGrp); \
|
||||
if ((K_in * N_in <= 64 * 1024) && (M_in % _YTILEs == 0)) { \
|
||||
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
|
||||
wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \
|
||||
|
||||
@ -12,6 +12,7 @@ nav:
|
||||
- User Guide: usage/README.md
|
||||
- Developer Guide: contributing/README.md
|
||||
- API Reference: api/README.md
|
||||
- CLI Reference: cli/README.md
|
||||
- Timeline:
|
||||
- Roadmap: https://roadmap.vllm.ai
|
||||
- Releases: https://github.com/vllm-project/vllm/releases
|
||||
@ -56,6 +57,8 @@ nav:
|
||||
- Contents:
|
||||
- glob: api/vllm/*
|
||||
preserve_directory_names: true
|
||||
- CLI Reference:
|
||||
- Summary: cli/README.md
|
||||
- Community:
|
||||
- community/*
|
||||
- Blog: https://blog.vllm.ai
|
||||
|
||||
@ -12,8 +12,8 @@
|
||||
<p style="text-align:center">
|
||||
<script async defer src="https://buttons.github.io/buttons.js"></script>
|
||||
<a class="github-button" href="https://github.com/vllm-project/vllm" data-show-count="true" data-size="large" aria-label="Star">Star</a>
|
||||
<a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
|
||||
<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
|
||||
<a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-show-count="true" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
|
||||
<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-show-count="true" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
|
||||
</p>
|
||||
|
||||
vLLM is a fast and easy-to-use library for LLM inference and serving.
|
||||
|
||||
179
docs/cli/README.md
Normal file
179
docs/cli/README.md
Normal file
@ -0,0 +1,179 @@
|
||||
# vLLM CLI Guide
|
||||
|
||||
The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
|
||||
|
||||
```
|
||||
vllm --help
|
||||
```
|
||||
|
||||
Available Commands:
|
||||
|
||||
```
|
||||
vllm {chat,complete,serve,bench,collect-env,run-batch}
|
||||
```
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [serve](#serve)
|
||||
- [chat](#chat)
|
||||
- [complete](#complete)
|
||||
- [bench](#bench)
|
||||
- [latency](#latency)
|
||||
- [serve](#serve-1)
|
||||
- [throughput](#throughput)
|
||||
- [collect-env](#collect-env)
|
||||
- [run-batch](#run-batch)
|
||||
- [More Help](#more-help)
|
||||
|
||||
## serve
|
||||
|
||||
Start the vLLM OpenAI Compatible API server.
|
||||
|
||||
Examples:
|
||||
|
||||
```bash
|
||||
# Start with a model
|
||||
vllm serve meta-llama/Llama-2-7b-hf
|
||||
|
||||
# Specify the port
|
||||
vllm serve meta-llama/Llama-2-7b-hf --port 8100
|
||||
|
||||
# Check with --help for more options
|
||||
# To list all groups
|
||||
vllm serve --help=listgroup
|
||||
|
||||
# To view an argument group
|
||||
vllm serve --help=ModelConfig
|
||||
|
||||
# To view a single argument
|
||||
vllm serve --help=max-num-seqs
|
||||
|
||||
# To search by keyword
|
||||
vllm serve --help=max
|
||||
```
|
||||
|
||||
## chat
|
||||
|
||||
Generate chat completions via the running API server.
|
||||
|
||||
Examples:
|
||||
|
||||
```bash
|
||||
# Directly connect to localhost API without arguments
|
||||
vllm chat
|
||||
|
||||
# Specify API url
|
||||
vllm chat --url http://{vllm-serve-host}:{vllm-serve-port}/v1
|
||||
|
||||
# Quick chat with a single prompt
|
||||
vllm chat --quick "hi"
|
||||
```
|
||||
|
||||
## complete
|
||||
|
||||
Generate text completions based on the given prompt via the running API server.
|
||||
|
||||
Examples:
|
||||
|
||||
```bash
|
||||
# Directly connect to localhost API without arguments
|
||||
vllm complete
|
||||
|
||||
# Specify API url
|
||||
vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1
|
||||
|
||||
# Quick complete with a single prompt
|
||||
vllm complete --quick "The future of AI is"
|
||||
```
|
||||
|
||||
## bench
|
||||
|
||||
Run benchmark tests for latency, online serving throughput, and offline inference throughput.
|
||||
|
||||
Available Commands:
|
||||
|
||||
```bash
|
||||
vllm bench {latency, serve, throughput}
|
||||
```
|
||||
|
||||
### latency
|
||||
|
||||
Benchmark the latency of a single batch of requests.
|
||||
|
||||
Example:
|
||||
|
||||
```bash
|
||||
vllm bench latency \
|
||||
--model meta-llama/Llama-3.2-1B-Instruct \
|
||||
--input-len 32 \
|
||||
--output-len 1 \
|
||||
--enforce-eager \
|
||||
--load-format dummy
|
||||
```
|
||||
|
||||
### serve
|
||||
|
||||
Benchmark the online serving throughput.
|
||||
|
||||
Example:
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--model meta-llama/Llama-3.2-1B-Instruct \
|
||||
--host server-host \
|
||||
--port server-port \
|
||||
--random-input-len 32 \
|
||||
--random-output-len 4 \
|
||||
--num-prompts 5
|
||||
```
|
||||
|
||||
### throughput
|
||||
|
||||
Benchmark offline inference throughput.
|
||||
|
||||
Example:
|
||||
|
||||
```bash
|
||||
vllm bench throughput \
|
||||
--model meta-llama/Llama-3.2-1B-Instruct \
|
||||
--input-len 32 \
|
||||
--output-len 1 \
|
||||
--enforce-eager \
|
||||
--load-format dummy
|
||||
```
|
||||
|
||||
## collect-env
|
||||
|
||||
Start collecting environment information.
|
||||
|
||||
```bash
|
||||
vllm collect-env
|
||||
```
|
||||
|
||||
## run-batch
|
||||
|
||||
Run batch prompts and write results to file.
|
||||
|
||||
Examples:
|
||||
|
||||
```bash
|
||||
# Running with a local file
|
||||
vllm run-batch \
|
||||
-i offline_inference/openai_batch/openai_example_batch.jsonl \
|
||||
-o results.jsonl \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct
|
||||
|
||||
# Using remote file
|
||||
vllm run-batch \
|
||||
-i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
|
||||
-o results.jsonl \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct
|
||||
```
|
||||
|
||||
## More Help
|
||||
|
||||
For detailed options of any subcommand, use:
|
||||
|
||||
```bash
|
||||
vllm <subcommand> --help
|
||||
```
|
||||
@ -29,20 +29,68 @@ See <gh-file:LICENSE>.
|
||||
Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
|
||||
Check out the [building from source][build-from-source] documentation for details.
|
||||
|
||||
### Building the docs
|
||||
### Building the docs with MkDocs
|
||||
|
||||
Install the dependencies:
|
||||
#### Introduction to MkDocs
|
||||
|
||||
[MkDocs](https://github.com/mkdocs/mkdocs) is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file.
|
||||
|
||||
#### Install MkDocs and Plugins
|
||||
|
||||
Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blob/main/mkdocs.yaml) used in the vLLM documentation, as well as required dependencies:
|
||||
|
||||
```bash
|
||||
pip install -r requirements/docs.txt
|
||||
```
|
||||
|
||||
Start the autoreloading MkDocs server:
|
||||
!!! note
|
||||
Ensure that your Python version is compatible with the plugins (e.g., `mkdocs-awesome-nav` requires Python 3.10+)
|
||||
|
||||
#### Verify Installation
|
||||
|
||||
Confirm that MkDocs is correctly installed:
|
||||
|
||||
```bash
|
||||
mkdocs --version
|
||||
```
|
||||
|
||||
Example output:
|
||||
|
||||
```console
|
||||
mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.10/site-packages/mkdocs (Python 3.10)
|
||||
```
|
||||
|
||||
#### Clone the `vLLM` repository
|
||||
|
||||
```bash
|
||||
git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
```
|
||||
|
||||
#### Start the Development Server
|
||||
|
||||
MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it. Make sure you're in the same directory as the `mkdocs.yaml` configuration file, and then start the server by running the `mkdocs serve` command:
|
||||
|
||||
```bash
|
||||
mkdocs serve
|
||||
```
|
||||
|
||||
Example output:
|
||||
|
||||
```console
|
||||
INFO - Documentation built in 106.83 seconds
|
||||
INFO - [22:02:02] Watching paths for changes: 'docs', 'mkdocs.yaml'
|
||||
INFO - [22:02:02] Serving on http://127.0.0.1:8000/
|
||||
```
|
||||
|
||||
#### View in Your Browser
|
||||
|
||||
Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your browser to see a live preview.
|
||||
|
||||
#### Learn More
|
||||
|
||||
For additional features and advanced configurations, refer to the official [MkDocs Documentation](https://www.mkdocs.org/).
|
||||
|
||||
## Testing
|
||||
|
||||
```bash
|
||||
@ -60,6 +108,9 @@ pre-commit run mypy-3.9 --hook-stage manual --all-files
|
||||
|
||||
# Unit tests
|
||||
pytest tests/
|
||||
|
||||
# Run tests for a single test file with detailed output
|
||||
pytest -s -v tests/test_logger.py
|
||||
```
|
||||
|
||||
!!! tip
|
||||
|
||||
@ -48,8 +48,7 @@ for output in outputs:
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
```
|
||||
|
||||
More API details can be found in the [Offline Inference]
|
||||
(#offline-inference-api) section of the API docs.
|
||||
More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
|
||||
|
||||
The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
|
||||
|
||||
|
||||
@ -22,13 +22,13 @@ This document describes how vLLM deals with these challenges.
|
||||
|
||||
[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
|
||||
|
||||
- `spawn` - spawn a new Python process. This will be the default as of Python
|
||||
3.14. In macOS, this is already the default.
|
||||
- `spawn` - spawn a new Python process. The default on Windows and macOS.
|
||||
|
||||
- `fork` - Use `os.fork()` to fork the Python interpreter. This is the default
|
||||
in Python versions prior to 3.14.
|
||||
- `fork` - Use `os.fork()` to fork the Python interpreter. The default on
|
||||
Linux for Python versions prior to 3.14.
|
||||
|
||||
- `forkserver` - Spawn a server process that will fork a new process on request.
|
||||
The default on Linux for Python version 3.14 and newer.
|
||||
|
||||
### Tradeoffs
|
||||
|
||||
|
||||
@ -10,6 +10,7 @@ The symbols used have the following meanings:
|
||||
- ✅ = Full compatibility
|
||||
- 🟠 = Partial compatibility
|
||||
- ❌ = No compatibility
|
||||
- ❔ = Unknown or TBD
|
||||
|
||||
!!! note
|
||||
Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/hardware combination.
|
||||
@ -36,23 +37,23 @@ th:not(:first-child) {
|
||||
}
|
||||
</style>
|
||||
|
||||
| Feature | [CP][chunked-prefill] | [APC][automatic-prefix-caching] | [LoRA][lora-adapter] | <abbr title="Prompt Adapter">prmpt adptr</abbr> | [SD][spec-decode] | CUDA graph | <abbr title="Pooling Models">pooling</abbr> | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search |
|
||||
|-----------------------------------------------------------|-------------------------|-----------------------------------|------------------------|---------------------------------------------------|---------------------|--------------|-----------------------------------------------|-------------------------------------------------------|--------------------------------------|---------------------------------------------------|-------------------------------------------------------------|--------------------|---------------------------------------------|-----------|---------------|
|
||||
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
|
||||
| [APC][automatic-prefix-caching] | ✅ | ✅ | | | | | | | | | | | | | |
|
||||
| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | | | | | | | | | | | | |
|
||||
| <abbr title="Prompt Adapter">prmpt adptr</abbr> | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | |
|
||||
| [SD][spec-decode] | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | |
|
||||
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | |
|
||||
| <abbr title="Pooling Models">pooling</abbr> | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | |
|
||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](gh-issue:7366) | ❌ | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | |
|
||||
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | |
|
||||
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | |
|
||||
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | |
|
||||
| multi-step | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | |
|
||||
| <abbr title="Multimodal Inputs">mm</abbr> | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | |
|
||||
| best-of | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | |
|
||||
| beam-search | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ |
|
||||
| Feature | [CP][chunked-prefill] | [APC][automatic-prefix-caching] | [LoRA][lora-adapter] | <abbr title="Prompt Adapter">prmpt adptr</abbr> | [SD][spec-decode] | CUDA graph | <abbr title="Pooling Models">pooling</abbr> | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search |
|
||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
||||
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
|
||||
| [APC][automatic-prefix-caching] | ✅ | ✅ | | | | | | | | | | | | | |
|
||||
| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | | | | | | | | | | | | |
|
||||
| <abbr title="Prompt Adapter">prmpt adptr</abbr> | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | |
|
||||
| [SD][spec-decode] | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | |
|
||||
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | |
|
||||
| <abbr title="Pooling Models">pooling</abbr> | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | |
|
||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](gh-issue:7366) | ❌ | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | |
|
||||
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | |
|
||||
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | |
|
||||
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | |
|
||||
| multi-step | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | |
|
||||
| <abbr title="Multimodal Inputs">mm</abbr> | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | |
|
||||
| best-of | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | |
|
||||
| beam-search | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ |
|
||||
|
||||
[](){ #feature-x-hardware }
|
||||
|
||||
@ -75,3 +76,6 @@ th:not(:first-child) {
|
||||
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8477) | ✅ |
|
||||
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
|
||||
!!! note
|
||||
Please refer to [Feature support through NxD Inference backend][feature-support-through-nxd-inference-backend] for features supported on AWS Neuron hardware
|
||||
|
||||
@ -165,6 +165,7 @@ it will first look in the local directory for a directory `foobar`, and attempt
|
||||
that adapter will then be available for normal use on the server.
|
||||
|
||||
Alternatively, follow these example steps to implement your own plugin:
|
||||
|
||||
1. Implement the LoRAResolver interface.
|
||||
|
||||
Example of a simple S3 LoRAResolver implementation:
|
||||
@ -198,9 +199,9 @@ Alternatively, follow these example steps to implement your own plugin:
|
||||
return lora_request
|
||||
```
|
||||
|
||||
2. Register LoRAResolver plugin.
|
||||
2. Register `LoRAResolver` plugin.
|
||||
|
||||
```python
|
||||
```python
|
||||
from vllm.lora.resolver import LoRAResolverRegistry
|
||||
|
||||
s3_resolver = S3LoRAResolver()
|
||||
|
||||
@ -5,13 +5,13 @@ title: Supported Hardware
|
||||
|
||||
The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
|
||||
|
||||
| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | x86 CPU | AWS Inferentia | Google TPU |
|
||||
| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | x86 CPU | AWS Neuron | Google TPU |
|
||||
|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-----------|------------------|--------------|
|
||||
| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | ❌ |
|
||||
| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | ❌ |
|
||||
| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ❌ | ✅︎ |
|
||||
| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ |
|
||||
| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ |
|
||||
| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ❌ |
|
||||
| BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
|
||||
@ -1,8 +1,9 @@
|
||||
# --8<-- [start:installation]
|
||||
|
||||
vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching.
|
||||
Paged Attention and Chunked Prefill are currently in development and will be available soon.
|
||||
Data types currently supported in Neuron SDK are FP16 and BF16.
|
||||
[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) is the software development kit (SDK) used to run deep learning and
|
||||
generative AI workloads on AWS Inferentia and AWS Trainium powered Amazon EC2 instances and UltraServers (Inf1, Inf2, Trn1, Trn2,
|
||||
and Trn2 UltraServer). Both Trainium and Inferentia are powered by fully-independent heterogeneous compute-units called NeuronCores.
|
||||
This tab describes how to set up your environment to run vLLM on Neuron.
|
||||
|
||||
!!! warning
|
||||
There are no pre-built wheels or images for this device, so you must build vLLM from source.
|
||||
@ -11,59 +12,31 @@ Data types currently supported in Neuron SDK are FP16 and BF16.
|
||||
# --8<-- [start:requirements]
|
||||
|
||||
- OS: Linux
|
||||
- Python: 3.9 -- 3.11
|
||||
- Accelerator: NeuronCore_v2 (in trn1/inf2 instances)
|
||||
- Pytorch 2.0.1/2.1.1
|
||||
- AWS Neuron SDK 2.16/2.17 (Verified on python 3.8)
|
||||
- Python: 3.9 or newer
|
||||
- Pytorch 2.5/2.6
|
||||
- Accelerator: NeuronCore-v2 (in trn1/inf2 chips) or NeuronCore-v3 (in trn2 chips)
|
||||
- AWS Neuron SDK 2.23
|
||||
|
||||
## Configure a new environment
|
||||
|
||||
### Launch Trn1/Inf2 instances
|
||||
### Launch a Trn1/Trn2/Inf2 instance and verify Neuron dependencies
|
||||
|
||||
Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html).
|
||||
The easiest way to launch a Trainium or Inferentia instance with pre-installed Neuron dependencies is to follow this
|
||||
[quick start guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/multiframework/multi-framework-ubuntu22-neuron-dlami.html#setup-ubuntu22-multi-framework-dlami) using the Neuron Deep Learning AMI (Amazon machine image).
|
||||
|
||||
- Please follow the instructions at [launch an Amazon EC2 Instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance) to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type.
|
||||
- To get more information about instances sizes and pricing see: [Trn1 web page](https://aws.amazon.com/ec2/instance-types/trn1/), [Inf2 web page](https://aws.amazon.com/ec2/instance-types/inf2/)
|
||||
- Select Ubuntu Server 22.04 TLS AMI
|
||||
- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB.
|
||||
- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance
|
||||
|
||||
### Install drivers and tools
|
||||
|
||||
The installation of drivers and tools wouldn't be necessary, if [Deep Learning AMI Neuron](https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html) is installed. In case the drivers and tools are not installed on the operating system, follow the steps below:
|
||||
|
||||
- Once inside your instance, activate the pre-installed virtual environment for inference by running
|
||||
```console
|
||||
# Configure Linux for Neuron repository updates
|
||||
. /etc/os-release
|
||||
sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
|
||||
deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
|
||||
EOF
|
||||
wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB \
|
||||
| sudo apt-key add -
|
||||
|
||||
# Update OS packages
|
||||
sudo apt-get update -y
|
||||
|
||||
# Install OS headers
|
||||
sudo apt-get install linux-headers-$(uname -r) -y
|
||||
|
||||
# Install git
|
||||
sudo apt-get install git -y
|
||||
|
||||
# install Neuron Driver
|
||||
sudo apt-get install aws-neuronx-dkms=2.* -y
|
||||
|
||||
# Install Neuron Runtime
|
||||
sudo apt-get install aws-neuronx-collectives=2.* -y
|
||||
sudo apt-get install aws-neuronx-runtime-lib=2.* -y
|
||||
|
||||
# Install Neuron Tools
|
||||
sudo apt-get install aws-neuronx-tools=2.* -y
|
||||
|
||||
# Add PATH
|
||||
export PATH=/opt/aws/neuron/bin:$PATH
|
||||
source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
|
||||
```
|
||||
|
||||
Refer to the [NxD Inference Setup Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/nxdi-setup.html)
|
||||
for alternative setup instructions including using Docker and manually installing dependencies.
|
||||
|
||||
!!! note
|
||||
NxD Inference is the default recommended backend to run inference on Neuron. If you are looking to use the legacy [transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx)
|
||||
library, refer to [Transformers NeuronX Setup](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/setup/index.html).
|
||||
|
||||
# --8<-- [end:requirements]
|
||||
# --8<-- [start:set-up-using-python]
|
||||
|
||||
@ -75,60 +48,37 @@ Currently, there are no pre-built Neuron wheels.
|
||||
# --8<-- [end:pre-built-wheels]
|
||||
# --8<-- [start:build-wheel-from-source]
|
||||
|
||||
!!! note
|
||||
The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
|
||||
|
||||
Following instructions are applicable to Neuron SDK 2.16 and beyond.
|
||||
|
||||
#### Install transformers-neuronx and its dependencies
|
||||
|
||||
[transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) will be the backend to support inference on trn1/inf2 instances.
|
||||
Follow the steps below to install transformer-neuronx package and its dependencies.
|
||||
|
||||
```console
|
||||
# Install Python venv
|
||||
sudo apt-get install -y python3.10-venv g++
|
||||
|
||||
# Create Python venv
|
||||
python3.10 -m venv aws_neuron_venv_pytorch
|
||||
|
||||
# Activate Python venv
|
||||
source aws_neuron_venv_pytorch/bin/activate
|
||||
|
||||
# Install Jupyter notebook kernel
|
||||
pip install ipykernel
|
||||
python3.10 -m ipykernel install \
|
||||
--user \
|
||||
--name aws_neuron_venv_pytorch \
|
||||
--display-name "Python (torch-neuronx)"
|
||||
pip install jupyter notebook
|
||||
pip install environment_kernels
|
||||
|
||||
# Set pip repository pointing to the Neuron repository
|
||||
python -m pip config set \
|
||||
global.extra-index-url \
|
||||
https://pip.repos.neuron.amazonaws.com
|
||||
|
||||
# Install wget, awscli
|
||||
python -m pip install wget
|
||||
python -m pip install awscli
|
||||
|
||||
# Update Neuron Compiler and Framework
|
||||
python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
|
||||
```
|
||||
|
||||
#### Install vLLM from source
|
||||
|
||||
Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows:
|
||||
Install vllm as follows:
|
||||
|
||||
```console
|
||||
git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
pip install -U -r requirements/neuron.txt
|
||||
VLLM_TARGET_DEVICE="neuron" pip install .
|
||||
VLLM_TARGET_DEVICE="neuron" pip install -e .
|
||||
```
|
||||
|
||||
If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed.
|
||||
AWS Neuron maintains a [Github fork of vLLM](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2) at
|
||||
[https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2), which contains several features in addition to what's
|
||||
available on vLLM V0. Please utilize the AWS Fork for the following features:
|
||||
|
||||
- Llama-3.2 multi-modal support
|
||||
- Multi-node distributed inference
|
||||
|
||||
Refer to [vLLM User Guide for NxD Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/vllm-user-guide.html)
|
||||
for more details and usage examples.
|
||||
|
||||
To install the AWS Neuron fork, run the following:
|
||||
|
||||
```console
|
||||
git clone -b neuron-2.23-vllm-v0.7.2 https://github.com/aws-neuron/upstreaming-to-vllm.git
|
||||
cd upstreaming-to-vllm
|
||||
pip install -r requirements/neuron.txt
|
||||
VLLM_TARGET_DEVICE="neuron" pip install -e .
|
||||
```
|
||||
|
||||
Note that the AWS Neuron fork is only intended to support Neuron hardware; compatibility with other hardware is not tested.
|
||||
|
||||
# --8<-- [end:build-wheel-from-source]
|
||||
# --8<-- [start:set-up-using-docker]
|
||||
@ -148,5 +98,57 @@ Make sure to use <gh-file:docker/Dockerfile.neuron> in place of the default Dock
|
||||
# --8<-- [end:build-image-from-source]
|
||||
# --8<-- [start:extra-information]
|
||||
|
||||
There is no extra information for this device.
|
||||
[](){ #feature-support-through-nxd-inference-backend }
|
||||
### Feature support through NxD Inference backend
|
||||
|
||||
The current vLLM and Neuron integration relies on either the `neuronx-distributed-inference` (preferred) or `transformers-neuronx` backend
|
||||
to perform most of the heavy lifting which includes PyTorch model initialization, compilation, and runtime execution. Therefore, most
|
||||
[features supported on Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html) are also available via the vLLM integration.
|
||||
|
||||
To configure NxD Inference features through the vLLM entrypoint, use the `override_neuron_config` setting. Provide the configs you want to override
|
||||
as a dictionary (or JSON object when starting vLLM from the CLI). For example, to disable auto bucketing, include
|
||||
```console
|
||||
override_neuron_config={
|
||||
"enable_bucketing":False,
|
||||
}
|
||||
```
|
||||
or when launching vLLM from the CLI, pass
|
||||
```console
|
||||
--override-neuron-config "{\"enable_bucketing\":false}"
|
||||
```
|
||||
|
||||
Alternatively, you can directly call the NxDI library to trace and compile your model, then load the pre-compiled artifacts
|
||||
(via `NEURON_COMPILED_ARTIFACTS` environment variable) in vLLM to run inference workloads.
|
||||
|
||||
### Known limitations
|
||||
|
||||
- EAGLE speculative decoding: NxD Inference requires the EAGLE draft checkpoint to include the LM head weights from the target model. Refer to this
|
||||
[guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html#eagle-checkpoint-compatibility)
|
||||
for how to convert pretrained EAGLE model checkpoints to be compatible for NxDI.
|
||||
- Quantization: the native quantization flow in vLLM is not well supported on NxD Inference. It is recommended to follow this
|
||||
[Neuron quantization guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/custom-quantization.html)
|
||||
to quantize and compile your model using NxD Inference, and then load the compiled artifacts into vLLM.
|
||||
- Multi-LoRA serving: NxD Inference only supports loading of LoRA adapters at server startup. Dynamic loading of LoRA adapters at
|
||||
runtime is not currently supported. Refer to the [multi-lora example](https://github.com/aws-neuron/upstreaming-to-vllm/blob/neuron-2.23-vllm-v0.7.2/examples/offline_inference/neuron_multi_lora.py).
|
||||
- Multi-modal support: multi-modal support is only available through the AWS Neuron fork. This feature has not been upstreamed
|
||||
to vLLM main because NxD Inference currently relies on certain adaptations to the core vLLM logic to support this feature.
|
||||
- Multi-node support: distributed inference across multiple Trainium/Inferentia instances is only supported on the AWS Neuron fork. Refer
|
||||
to this [multi-node example](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2/examples/neuron/multi_node)
|
||||
to run. Note that tensor parallelism (distributed inference across NeuronCores) is available in vLLM main.
|
||||
- Known edge case bug in speculative decoding: An edge case failure may occur in speculative decoding when sequence length approaches
|
||||
max model length (e.g. when requesting max tokens up to the max model length and ignoring eos). In this scenario, vLLM may attempt
|
||||
to allocate an additional block to ensure there is enough memory for number of lookahead slots, but since we do not have good support
|
||||
for paged attention, there isn't another Neuron block for vLLM to allocate. A workaround fix (to terminate 1 iteration early) is
|
||||
implemented in the AWS Neuron fork but is not upstreamed to vLLM main as it modifies core vLLM logic.
|
||||
|
||||
|
||||
### Environment variables
|
||||
- `NEURON_COMPILED_ARTIFACTS`: set this environment variable to point to your pre-compiled model artifacts directory to avoid
|
||||
compilation time upon server initialization. If this variable is not set, the Neuron module will perform compilation and save the
|
||||
artifacts under `neuron-compiled-artifacts/{unique_hash}/` sub-directory in the model path. If this environment variable is set,
|
||||
but the directory does not exist, or the contents are invalid, Neuron will also fall back to a new compilation and store the artifacts
|
||||
under this specified path.
|
||||
- `NEURON_CONTEXT_LENGTH_BUCKETS`: Bucket sizes for context encoding. (Only applicable to `transformers-neuronx` backend).
|
||||
- `NEURON_TOKEN_GEN_BUCKETS`: Bucket sizes for token generation. (Only applicable to `transformers-neuronx` backend).
|
||||
|
||||
# --8<-- [end:extra-information]
|
||||
|
||||
@ -302,31 +302,31 @@ Specified using `--task generate`.
|
||||
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] |
|
||||
|---------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|
|
||||
| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ |
|
||||
| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | ✅︎ | |
|
||||
| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ |
|
||||
| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ |
|
||||
| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | | |
|
||||
| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | ✅︎ | |
|
||||
| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ |
|
||||
| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ |
|
||||
| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | |
|
||||
| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ |
|
||||
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ |
|
||||
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | ✅︎ | |
|
||||
| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | |
|
||||
| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. | ✅︎ | |
|
||||
| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. | ✅︎ | |
|
||||
| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. | ✅︎ | |
|
||||
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ |
|
||||
| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ |
|
||||
| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. | | ✅︎ |
|
||||
| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. | | ✅︎ |
|
||||
| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. | | ✅︎ |
|
||||
| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | ✅︎ | |
|
||||
| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ |
|
||||
| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ |
|
||||
| `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ |
|
||||
| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ |
|
||||
| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ |
|
||||
| `GlmForCausalLM` | GLM-4 | `THUDM/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ |
|
||||
| `Glm4ForCausalLM` | GLM-4-0414 | `THUDM/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ |
|
||||
| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | ✅︎ | |
|
||||
| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ |
|
||||
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ |
|
||||
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | ✅︎ | |
|
||||
| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | ✅︎ | |
|
||||
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ |
|
||||
| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ |
|
||||
| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ |
|
||||
| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ |
|
||||
| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ |
|
||||
@ -336,39 +336,39 @@ Specified using `--task generate`.
|
||||
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | ✅︎ | |
|
||||
| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ |
|
||||
| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ |
|
||||
| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ |
|
||||
| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | ✅︎ | |
|
||||
| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ |
|
||||
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
|
||||
| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
|
||||
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ |
|
||||
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | ✅︎ | |
|
||||
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ |
|
||||
| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ |
|
||||
| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | |
|
||||
| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | |
|
||||
| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | |
|
||||
| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | ✅︎ | |
|
||||
| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | | ✅︎ |
|
||||
| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | | ✅︎ |
|
||||
| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ |
|
||||
| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | ✅︎ |
|
||||
| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ |
|
||||
| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ |
|
||||
| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `Phi3SmallForCausalLM` | Phi-3-Small | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. | ✅︎ | |
|
||||
| `Phi3SmallForCausalLM` | Phi-3-Small | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. | | ✅︎ |
|
||||
| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | ✅︎ | |
|
||||
| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ |
|
||||
| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | |
|
||||
| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | |
|
||||
| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | | ✅︎ |
|
||||
| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | |
|
||||
| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | ✅︎ | |
|
||||
| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | ✅︎ | |
|
||||
| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | | ✅︎ |
|
||||
| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | |
|
||||
| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ |
|
||||
| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ |
|
||||
| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ |
|
||||
| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | ✅︎ | |
|
||||
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | |
|
||||
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | |
|
||||
|
||||
!!! note
|
||||
@ -401,7 +401,7 @@ Specified using `--task embed`.
|
||||
|
||||
!!! note
|
||||
`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
|
||||
You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
|
||||
You need to manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
|
||||
|
||||
!!! note
|
||||
For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
|
||||
@ -512,44 +512,44 @@ Specified using `--task generate`.
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|
||||
|----------------------------------------------|--------------------------------------------------------------------------|-----------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------|
|
||||
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | ✅︎ | ✅︎ | |
|
||||
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | ✅︎ | ✅︎ | |
|
||||
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | ✅︎ | ✅︎ | |
|
||||
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b` etc. | ✅︎ | ✅︎ | |
|
||||
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. | ✅︎ | ✅︎ | |
|
||||
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | | ✅︎ |
|
||||
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b` etc. | | ✅︎ | ✅︎ |
|
||||
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. | | ✅︎ | ✅︎ |
|
||||
| `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large` etc. | | | |
|
||||
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b` etc. | ✅︎ | ✅︎ | |
|
||||
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b` etc. | | ✅︎ | ✅︎ |
|
||||
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
|
||||
| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | ✅︎ | ✅︎\* | |
|
||||
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | ✅︎ | |
|
||||
| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | |
|
||||
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | ✅︎ | | |
|
||||
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎ | ✅︎ | |
|
||||
| `LlavaForConditionalGeneration` | LLaVA-1.5 | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | ✅︎ | ✅︎ | |
|
||||
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | ✅︎ | ✅︎ | |
|
||||
| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | ✅︎ | ✅︎ | |
|
||||
| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | ✅︎ | ✅︎ | |
|
||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎\* |
|
||||
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | | ✅︎ |
|
||||
| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | | ✅︎ | ✅︎ |
|
||||
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
|
||||
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaForConditionalGeneration` | LLaVA-1.5 | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ |
|
||||
| `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | ✅︎ | ✅︎ | |
|
||||
| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | | ✅︎ |
|
||||
| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | |
|
||||
| `Mistral3ForConditionalGeneration` | Mistral3 | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | |
|
||||
| `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | |
|
||||
| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | ✅︎ | ✅︎ | |
|
||||
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | ✅︎ | | |
|
||||
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | ✅︎ | ⚠️ | |
|
||||
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | ✅︎ | ✅︎ | |
|
||||
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | |
|
||||
| `PixtralForConditionalGeneration` | Pixtral | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. | ✅︎ | ✅︎ | |
|
||||
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ |
|
||||
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
|
||||
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `PixtralForConditionalGeneration` | Pixtral | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | ✅︎ | ✅︎ | |
|
||||
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
|
||||
| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎\* | |
|
||||
| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | ✅︎ | ✅︎ | |
|
||||
| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | ✅︎ | |
|
||||
| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎\* |
|
||||
| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
|
||||
| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
|
||||
|
||||
<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
|
||||
• For example, to use DeepSeek-VL2 series models:
|
||||
@ -647,7 +647,7 @@ The following table lists those that are tested in vLLM.
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] |
|
||||
|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|
|
||||
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based | T / I | `royokong/e5-v` | ✅︎ | |
|
||||
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | |
|
||||
| `Phi3VForCausalLM` | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ |
|
||||
|
||||
#### Transcription
|
||||
|
||||
@ -35,19 +35,6 @@ The following metrics are exposed:
|
||||
--8<-- "vllm/engine/metrics.py:metrics-definitions"
|
||||
```
|
||||
|
||||
The following metrics are deprecated and due to be removed in a future version:
|
||||
|
||||
- `vllm:num_requests_swapped`, `vllm:cpu_cache_usage_perc`, and
|
||||
`vllm:cpu_prefix_cache_hit_rate` because KV cache offloading is not
|
||||
used in V1.
|
||||
- `vllm:gpu_prefix_cache_hit_rate` is replaced by queries+hits
|
||||
counters in V1.
|
||||
- `vllm:time_in_queue_requests` because it duplicates
|
||||
`vllm:request_queue_time_seconds`.
|
||||
- `vllm:model_forward_time_milliseconds` and
|
||||
`vllm:model_execute_time_milliseconds` because
|
||||
prefill/decode/inference time metrics should be used instead.
|
||||
|
||||
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
|
||||
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
|
||||
and are then removed in version `X.Y+2`.
|
||||
|
||||
@ -12,14 +12,14 @@ All communications between nodes in a multi-node vLLM deployment are **insecure
|
||||
|
||||
The following options control inter-node communications in vLLM:
|
||||
|
||||
1. **Environment Variables:**
|
||||
#### 1. **Environment Variables:**
|
||||
- `VLLM_HOST_IP`: Sets the IP address for vLLM processes to communicate on
|
||||
|
||||
2. **KV Cache Transfer Configuration:**
|
||||
#### 2. **KV Cache Transfer Configuration:**
|
||||
- `--kv-ip`: The IP address for KV cache transfer communications (default: 127.0.0.1)
|
||||
- `--kv-port`: The port for KV cache transfer communications (default: 14579)
|
||||
|
||||
3. **Data Parallel Configuration:**
|
||||
#### 3. **Data Parallel Configuration:**
|
||||
- `data_parallel_master_ip`: IP of the data parallel master (default: 127.0.0.1)
|
||||
- `data_parallel_master_port`: Port of the data parallel master (default: 29500)
|
||||
|
||||
@ -39,16 +39,16 @@ Key points from the PyTorch security guide:
|
||||
|
||||
### Security Recommendations
|
||||
|
||||
1. **Network Isolation:**
|
||||
#### 1. **Network Isolation:**
|
||||
- Deploy vLLM nodes on a dedicated, isolated network
|
||||
- Use network segmentation to prevent unauthorized access
|
||||
- Implement appropriate firewall rules
|
||||
|
||||
2. **Configuration Best Practices:**
|
||||
#### 2. **Configuration Best Practices:**
|
||||
- Always set `VLLM_HOST_IP` to a specific IP address rather than using defaults
|
||||
- Configure firewalls to only allow necessary ports between nodes
|
||||
|
||||
3. **Access Control:**
|
||||
#### 3. **Access Control:**
|
||||
- Restrict physical and network access to the deployment environment
|
||||
- Implement proper authentication and authorization for management interfaces
|
||||
- Follow the principle of least privilege for all system components
|
||||
|
||||
46
examples/offline_inference/context_extension.py
Normal file
46
examples/offline_inference/context_extension.py
Normal file
@ -0,0 +1,46 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
rope_theta = 1000000
|
||||
original_max_position_embeddings = 32768
|
||||
factor = 4.0
|
||||
|
||||
# Use yarn to extend context
|
||||
hf_overrides = {
|
||||
"rope_theta": rope_theta,
|
||||
"rope_scaling": {
|
||||
"rope_type": "yarn",
|
||||
"factor": factor,
|
||||
"original_max_position_embeddings": original_max_position_embeddings,
|
||||
},
|
||||
"max_model_len": int(original_max_position_embeddings * factor),
|
||||
}
|
||||
|
||||
llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=128,
|
||||
)
|
||||
|
||||
conversation = [
|
||||
{"role": "system", "content": "You are a helpful assistant"},
|
||||
{"role": "user", "content": "Hello"},
|
||||
{"role": "assistant", "content": "Hello! How can I assist you today?"},
|
||||
]
|
||||
outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
|
||||
|
||||
|
||||
def print_outputs(outputs):
|
||||
print("\nGenerated Outputs:\n" + "-" * 80)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}\n")
|
||||
print(f"Generated text: {generated_text!r}")
|
||||
print("-" * 80)
|
||||
|
||||
|
||||
print_outputs(outputs)
|
||||
@ -97,10 +97,14 @@ def main(
|
||||
# with DP, each rank should process different prompts.
|
||||
# usually all the DP ranks process a full dataset,
|
||||
# and each rank processes a different part of the dataset.
|
||||
promts_per_rank = len(prompts) // dp_size
|
||||
start = global_dp_rank * promts_per_rank
|
||||
end = start + promts_per_rank
|
||||
prompts = prompts[start:end]
|
||||
floor = len(prompts) // dp_size
|
||||
remainder = len(prompts) % dp_size
|
||||
|
||||
# Distribute prompts into even groups.
|
||||
def start(rank):
|
||||
return rank * floor + min(rank, remainder)
|
||||
|
||||
prompts = prompts[start(global_dp_rank) : start(global_dp_rank + 1)]
|
||||
if len(prompts) == 0:
|
||||
# if any rank has no prompts to process,
|
||||
# we need to set a placeholder prompt
|
||||
|
||||
105
examples/offline_inference/neuron_multimodal.py
Normal file
105
examples/offline_inference/neuron_multimodal.py
Normal file
@ -0,0 +1,105 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import requests
|
||||
import torch
|
||||
from neuronx_distributed_inference.models.mllama.utils import add_instruct
|
||||
from PIL import Image
|
||||
|
||||
from vllm import LLM, SamplingParams, TextPrompt
|
||||
|
||||
|
||||
def get_image(image_url):
|
||||
image = Image.open(requests.get(image_url, stream=True).raw)
|
||||
return image
|
||||
|
||||
|
||||
# Model Inputs
|
||||
PROMPTS = [
|
||||
"What is in this image? Tell me a story",
|
||||
"What is the recipe of mayonnaise in two sentences?",
|
||||
"Describe this image",
|
||||
"What is the capital of Italy famous for?",
|
||||
]
|
||||
IMAGES = [
|
||||
get_image(
|
||||
"https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
|
||||
),
|
||||
None,
|
||||
get_image(
|
||||
"https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
|
||||
),
|
||||
None,
|
||||
]
|
||||
SAMPLING_PARAMS = [
|
||||
dict(top_k=1, temperature=1.0, top_p=1.0, max_tokens=16)
|
||||
for _ in range(len(PROMPTS))
|
||||
]
|
||||
|
||||
|
||||
def get_VLLM_mllama_model_inputs(prompt, single_image, sampling_params):
|
||||
# Prepare all inputs for mllama generation, including:
|
||||
# 1. put text prompt into instruct chat template
|
||||
# 2. compose single text and single image prompt into Vllm's prompt class
|
||||
# 3. prepare sampling parameters
|
||||
input_image = single_image
|
||||
has_image = torch.tensor([1])
|
||||
if isinstance(single_image, torch.Tensor) and single_image.numel() == 0:
|
||||
has_image = torch.tensor([0])
|
||||
|
||||
instruct_prompt = add_instruct(prompt, has_image)
|
||||
inputs = TextPrompt(prompt=instruct_prompt)
|
||||
|
||||
if input_image is not None:
|
||||
inputs["multi_modal_data"] = {"image": input_image}
|
||||
|
||||
sampling_params = SamplingParams(**sampling_params)
|
||||
return inputs, sampling_params
|
||||
|
||||
|
||||
def print_outputs(outputs):
    """Print the prompt and first generated text of every request output."""
    for request_output in outputs:
        shown_prompt, first_text = (request_output.prompt,
                                    request_output.outputs[0].text)
        print(f"Prompt: {shown_prompt!r}, Generated text: {first_text!r}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The three input lists are consumed in lockstep by zip() below, so
    # they must all have the same length.
    assert (
        len(PROMPTS) == len(IMAGES) == len(SAMPLING_PARAMS)
    ), f"""Text, image prompts and sampling parameters should have the
same batch size; but got {len(PROMPTS)}, {len(IMAGES)},
and {len(SAMPLING_PARAMS)}"""

    # Create an LLM.
    llm = LLM(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        max_num_seqs=1,
        max_model_len=4096,
        block_size=4096,
        device="neuron",
        tensor_parallel_size=32,
        # Backend-specific settings passed through as a plain dict.
        # NOTE(review): presumably forwarded to the Neuron model config;
        # confirm the accepted keys against the Neuron backend docs.
        override_neuron_config={
            "sequence_parallel_enabled": False,
            "skip_warmup": True,
            "save_sharded_checkpoint": True,
            "on_device_sampling_config": {
                "global_topk": 1,
                "dynamic": False,
                "deterministic": False,
            },
        },
    )

    # Each request is first run on its own, then collected so that all of
    # them can be submitted together afterwards.
    batched_inputs = []
    batched_sample_params = []
    for pmpt, img, params in zip(PROMPTS, IMAGES, SAMPLING_PARAMS):
        inputs, sampling_params = get_VLLM_mllama_model_inputs(pmpt, img, params)
        # test batch-size = 1
        outputs = llm.generate(inputs, sampling_params)
        print_outputs(outputs)
        batched_inputs.append(inputs)
        batched_sample_params.append(sampling_params)

    # test batch-size = 4
    # All collected requests go into a single generate() call.
    outputs = llm.generate(batched_inputs, batched_sample_params)
    print_outputs(outputs)
|
||||
@ -48,7 +48,19 @@ The batch running tool is designed to be used from the command line.
|
||||
You can run the batch with the following command, which will write its results to a file called `results.jsonl`
|
||||
|
||||
```console
|
||||
python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
|
||||
python -m vllm.entrypoints.openai.run_batch \
|
||||
-i offline_inference/openai_batch/openai_example_batch.jsonl \
|
||||
-o results.jsonl \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct
|
||||
```
|
||||
|
||||
Alternatively, use the `vllm` command-line interface:
|
||||
|
||||
```console
|
||||
vllm run-batch \
|
||||
-i offline_inference/openai_batch/openai_example_batch.jsonl \
|
||||
-o results.jsonl \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct
|
||||
```
|
||||
|
||||
### Step 3: Check your results
|
||||
@ -68,7 +80,19 @@ The batch runner supports remote input and output urls that are accessible via h
|
||||
For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl`, you can run
|
||||
|
||||
```console
|
||||
python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
|
||||
python -m vllm.entrypoints.openai.run_batch \
|
||||
-i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
|
||||
-o results.jsonl \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct
|
||||
```
|
||||
|
||||
Alternatively, use the `vllm` command-line interface:
|
||||
|
||||
```console
|
||||
vllm run-batch \
|
||||
-i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
|
||||
-o results.jsonl \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct
|
||||
```
|
||||
|
||||
## Example 3: Integrating with AWS S3
|
||||
@ -164,6 +188,15 @@ python -m vllm.entrypoints.openai.run_batch \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct
|
||||
```
|
||||
|
||||
Alternatively, use the `vllm` command-line interface:
|
||||
|
||||
```console
|
||||
vllm run-batch \
|
||||
-i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
|
||||
-o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct
|
||||
```
|
||||
|
||||
### Step 4: View your results
|
||||
|
||||
Your results are now on S3. You can view them in your terminal by running
|
||||
|
||||
@ -577,23 +577,6 @@
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": false,
|
||||
"legendFormat": "Num Swapped",
|
||||
"range": true,
|
||||
"refId": "B",
|
||||
"useBackend": false
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
@ -874,19 +857,6 @@
|
||||
"legendFormat": "GPU Cache Usage",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "CPU Cache Usage",
|
||||
"range": true,
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Cache Utilization",
|
||||
|
||||
@ -8,7 +8,6 @@ requires = [
|
||||
"setuptools-scm>=8.0",
|
||||
"torch == 2.7.0",
|
||||
"wheel",
|
||||
"regex",
|
||||
"jinja2",
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
@ -110,6 +109,7 @@ ignore = [
|
||||
]
|
||||
|
||||
[tool.mypy]
|
||||
plugins = ['pydantic.mypy']
|
||||
ignore_missing_imports = true
|
||||
check_untyped_defs = true
|
||||
follow_imports = "silent"
|
||||
@ -171,7 +171,8 @@ plugins.md033.enabled = false # inline-html
|
||||
plugins.md046.enabled = false # code-block-style
|
||||
plugins.md024.allow_different_nesting = true # no-duplicate-headers
|
||||
|
||||
[tool.ty]
|
||||
[tool.ty.src]
|
||||
root = "./vllm"
|
||||
respect-ignore-files = true
|
||||
|
||||
[tool.ty.environment]
|
||||
|
||||
@ -14,7 +14,7 @@ protobuf # Required by LlamaTokenizer.
|
||||
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
|
||||
aiohttp
|
||||
openai >= 1.52.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
|
||||
pydantic >= 2.9
|
||||
pydantic >= 2.10
|
||||
prometheus_client >= 0.18.0
|
||||
pillow # Required for image processing
|
||||
prometheus-fastapi-instrumentator >= 7.0.0
|
||||
|
||||
@ -51,3 +51,4 @@ numpy
|
||||
runai-model-streamer==0.11.0
|
||||
runai-model-streamer-s3==0.11.0
|
||||
fastsafetensors>=0.1.10
|
||||
pydantic>=2.10 # 2.9 leads to error on python 3.10
|
||||
@ -480,12 +480,13 @@ pycparser==2.22
|
||||
# via cffi
|
||||
pycryptodomex==3.22.0
|
||||
# via blobfile
|
||||
pydantic==2.9.2
|
||||
pydantic==2.11.5
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# datamodel-code-generator
|
||||
# mistral-common
|
||||
# mteb
|
||||
pydantic-core==2.23.4
|
||||
pydantic-core==2.33.2
|
||||
# via pydantic
|
||||
pygments==2.18.0
|
||||
# via rich
|
||||
@ -784,6 +785,9 @@ typing-extensions==4.12.2
|
||||
# pydantic-core
|
||||
# torch
|
||||
# typer
|
||||
# typing-inspection
|
||||
typing-inspection==0.4.1
|
||||
# via pydantic
|
||||
tzdata==2024.2
|
||||
# via pandas
|
||||
uri-template==1.3.0
|
||||
|
||||
@ -18,9 +18,9 @@ setuptools==78.1.0
|
||||
--find-links https://storage.googleapis.com/libtpu-releases/index.html
|
||||
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
|
||||
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
||||
torch==2.8.0.dev20250518
|
||||
torchvision==0.22.0.dev20250518
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
torch==2.8.0.dev20250529
|
||||
torchvision==0.22.0.dev20250529
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250529-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250529-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250529-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
|
||||
|
||||
7
setup.py
7
setup.py
@ -5,12 +5,12 @@ import importlib.util
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from shutil import which
|
||||
|
||||
import regex as re
|
||||
import torch
|
||||
from packaging.version import Version, parse
|
||||
from setuptools import Extension, setup
|
||||
@ -251,11 +251,8 @@ class cmake_build_ext(build_ext):
|
||||
|
||||
# CMake appends the extension prefix to the install path,
|
||||
# and outdir already contains that prefix, so we need to remove it.
|
||||
# We assume only the final component of extension prefix is added by
|
||||
# CMake, this is currently true for current extensions but may not
|
||||
# always be the case.
|
||||
prefix = outdir
|
||||
if '.' in ext.name:
|
||||
for _ in range(ext.name.count('.')):
|
||||
prefix = prefix.parent
|
||||
|
||||
# prefix here should actually be the same for all components
|
||||
|
||||
@ -60,7 +60,6 @@ def _fix_prompt_embed_outputs(
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [5])
|
||||
@pytest.mark.parametrize("enforce_eager", [False])
|
||||
@pytest.mark.parametrize("enable_prompt_embeds", [True, False])
|
||||
@ -69,7 +68,6 @@ def test_models(
|
||||
hf_runner,
|
||||
model: str,
|
||||
backend: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
enforce_eager: bool,
|
||||
enable_prompt_embeds: bool,
|
||||
@ -97,7 +95,7 @@ def test_models(
|
||||
str(i) for i in range(1024)) + " are:"
|
||||
example_prompts = [prompt]
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
with hf_runner(model) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
if enable_prompt_embeds:
|
||||
with torch.no_grad():
|
||||
@ -106,7 +104,6 @@ def test_models(
|
||||
|
||||
with VllmRunner(model,
|
||||
max_model_len=8192,
|
||||
dtype=dtype,
|
||||
enforce_eager=enforce_eager,
|
||||
enable_prompt_embeds=enable_prompt_embeds,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
|
||||
@ -74,11 +74,12 @@ class SillyModel(nn.Module):
|
||||
return x
|
||||
|
||||
|
||||
def test_simple_piecewise_compile():
|
||||
def _test_simple_piecewise_compile(*, use_inductor):
|
||||
|
||||
vllm_config = VllmConfig(compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
use_cudagraph=True,
|
||||
use_inductor=use_inductor,
|
||||
splitting_ops=["silly.attention"],
|
||||
cudagraph_copy_inputs=True,
|
||||
cudagraph_capture_sizes=[1, 2],
|
||||
@ -108,3 +109,11 @@ def test_simple_piecewise_compile():
|
||||
output = model(input)
|
||||
assert global_counter == 2
|
||||
assert torch.allclose(output.cpu(), torch.tensor([3., 1.]))
|
||||
|
||||
|
||||
def test_simple_piecewise_compile_inductor():
    """Run the shared piecewise-compile test body with ``use_inductor=True``."""
    _test_simple_piecewise_compile(use_inductor=True)
|
||||
|
||||
|
||||
def test_simple_piecewise_compile_no_inductor():
    """Run the shared piecewise-compile test body with ``use_inductor=False``."""
    _test_simple_piecewise_compile(use_inductor=False)
|
||||
|
||||
@ -261,12 +261,14 @@ def tractable_computation(input_ids: torch.Tensor,
|
||||
@torch.inference_mode
|
||||
def run_model(llama_config,
|
||||
use_compile: bool,
|
||||
use_inductor: bool,
|
||||
split_attn: bool = False) -> torch.Tensor:
|
||||
|
||||
if use_compile:
|
||||
compilation_config = CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
use_cudagraph=True,
|
||||
use_inductor=use_inductor,
|
||||
cudagraph_capture_sizes=[1, 2],
|
||||
)
|
||||
if split_attn:
|
||||
@ -304,7 +306,7 @@ def run_model(llama_config,
|
||||
return output.cpu()
|
||||
|
||||
|
||||
def test_toy_llama():
|
||||
def _test_toy_llama(*, use_inductor):
|
||||
# compare output with and without piecewise compilation
|
||||
|
||||
llama_config = LlamaConfig(hidden_size=128,
|
||||
@ -326,8 +328,14 @@ def test_toy_llama():
|
||||
num_backend_compilations=0,
|
||||
num_cudagraph_caputured=0,
|
||||
):
|
||||
outputs.append(run_model(llama_config, use_compile=False))
|
||||
run_model(tractable_config, use_compile=False)
|
||||
outputs.append(
|
||||
run_model(llama_config, use_inductor=False, use_compile=False))
|
||||
run_model(tractable_config, use_inductor=False, use_compile=False)
|
||||
|
||||
if use_inductor:
|
||||
kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0}
|
||||
else:
|
||||
kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
|
||||
|
||||
with compilation_counter.expect(
|
||||
num_graphs_seen=1, # one graph for the model
|
||||
@ -336,9 +344,13 @@ def test_toy_llama():
|
||||
num_backend_compilations=1, # num_piecewise_capturable_graphs_seen
|
||||
num_cudagraph_caputured=
|
||||
2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||
**kwargs,
|
||||
):
|
||||
outputs.append(run_model(llama_config, use_compile=True))
|
||||
run_model(tractable_config, use_compile=True)
|
||||
outputs.append(
|
||||
run_model(llama_config,
|
||||
use_inductor=use_inductor,
|
||||
use_compile=True))
|
||||
run_model(tractable_config, use_inductor=use_inductor, use_compile=True)
|
||||
|
||||
with compilation_counter.expect(
|
||||
num_graphs_seen=1, # one graph for the model
|
||||
@ -353,13 +365,27 @@ def test_toy_llama():
|
||||
), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||
):
|
||||
outputs.append(
|
||||
run_model(llama_config, use_compile=True, split_attn=True))
|
||||
run_model(tractable_config, use_compile=True, split_attn=True)
|
||||
run_model(llama_config,
|
||||
use_inductor=use_inductor,
|
||||
use_compile=True,
|
||||
split_attn=True))
|
||||
run_model(tractable_config,
|
||||
use_inductor=use_inductor,
|
||||
use_compile=True,
|
||||
split_attn=True)
|
||||
|
||||
for i in range(1, len(outputs)):
|
||||
assert torch.allclose(outputs[0], outputs[i])
|
||||
|
||||
|
||||
def test_toy_llama_inductor():
    """Run the toy-llama compile comparison with ``use_inductor=True``."""
    _test_toy_llama(use_inductor=True)
|
||||
|
||||
|
||||
def test_toy_no_inductor():
    """Run the toy-llama compile comparison with ``use_inductor=False``."""
    # NOTE(review): the sibling test is named test_toy_llama_inductor;
    # consider test_toy_llama_no_inductor here for a consistent pair.
    _test_toy_llama(use_inductor=False)
|
||||
|
||||
|
||||
@torch.inference_mode
|
||||
def benchmark():
|
||||
from triton.testing import do_bench
|
||||
|
||||
@ -311,6 +311,7 @@ class HfRunner:
|
||||
dtype: str = "auto",
|
||||
*,
|
||||
model_kwargs: Optional[dict[str, Any]] = None,
|
||||
trust_remote_code: bool = True,
|
||||
is_sentence_transformer: bool = False,
|
||||
is_cross_encoder: bool = False,
|
||||
skip_tokenizer_init: bool = False,
|
||||
@ -320,10 +321,15 @@ class HfRunner:
|
||||
|
||||
self.config = AutoConfig.from_pretrained(
|
||||
model_name,
|
||||
trust_remote_code=True,
|
||||
trust_remote_code=trust_remote_code,
|
||||
)
|
||||
self.device = self.get_default_device()
|
||||
self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)
|
||||
self.dtype = torch_dtype = _get_and_verify_dtype(
|
||||
self.model_name,
|
||||
self.config,
|
||||
dtype=dtype,
|
||||
is_pooling_model=is_sentence_transformer or is_cross_encoder,
|
||||
)
|
||||
|
||||
model_kwargs = model_kwargs if model_kwargs is not None else {}
|
||||
model_kwargs.setdefault("torch_dtype", torch_dtype)
|
||||
@ -336,7 +342,7 @@ class HfRunner:
|
||||
model_name,
|
||||
device=self.device,
|
||||
model_kwargs=model_kwargs,
|
||||
trust_remote_code=True,
|
||||
trust_remote_code=trust_remote_code,
|
||||
)
|
||||
elif is_cross_encoder:
|
||||
# Lazy init required for AMD CI
|
||||
@ -346,12 +352,12 @@ class HfRunner:
|
||||
model_name,
|
||||
device=self.device,
|
||||
automodel_args=model_kwargs,
|
||||
trust_remote_code=True,
|
||||
trust_remote_code=trust_remote_code,
|
||||
)
|
||||
else:
|
||||
model = auto_cls.from_pretrained(
|
||||
model_name,
|
||||
trust_remote_code=True,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**model_kwargs,
|
||||
)
|
||||
|
||||
@ -372,7 +378,7 @@ class HfRunner:
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype=torch_dtype,
|
||||
trust_remote_code=True,
|
||||
trust_remote_code=trust_remote_code,
|
||||
)
|
||||
|
||||
# don't put this import at the top level
|
||||
@ -381,7 +387,7 @@ class HfRunner:
|
||||
self.processor = AutoProcessor.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype=torch_dtype,
|
||||
trust_remote_code=True,
|
||||
trust_remote_code=trust_remote_code,
|
||||
)
|
||||
if skip_tokenizer_init:
|
||||
self.tokenizer = self.processor.tokenizer
|
||||
|
||||
@ -227,6 +227,7 @@ MULTIMODAL_MODELS = {
|
||||
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
|
||||
"openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
|
||||
"allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
|
||||
"AIDC-AI/Ovis2-1B": PPTestSettings.fast(),
|
||||
"microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(),
|
||||
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"),
|
||||
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
|
||||
|
||||
@ -1,24 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
from ...utils import error_on_warning
|
||||
|
||||
MODEL_NAME = "facebook/opt-125m"
|
||||
|
||||
|
||||
def test_pos_args_deprecated():
|
||||
with error_on_warning(DeprecationWarning):
|
||||
LLM(model=MODEL_NAME, tokenizer=MODEL_NAME)
|
||||
|
||||
with error_on_warning(DeprecationWarning):
|
||||
LLM(MODEL_NAME, tokenizer=MODEL_NAME)
|
||||
|
||||
with pytest.warns(DeprecationWarning, match="'tokenizer'"):
|
||||
LLM(MODEL_NAME, MODEL_NAME)
|
||||
|
||||
with pytest.warns(DeprecationWarning,
|
||||
match="'tokenizer', 'tokenizer_mode'"):
|
||||
LLM(MODEL_NAME, MODEL_NAME, "auto")
|
||||
@ -4,6 +4,7 @@ import os
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
|
||||
MTEB_EMBED_TOL,
|
||||
OpenAIClientMtebEncoder,
|
||||
run_mteb_embed_task,
|
||||
run_mteb_embed_task_st)
|
||||
@ -38,4 +39,4 @@ def test_mteb(server):
|
||||
print("SentenceTransformer main score: ", st_main_score)
|
||||
print("Difference: ", st_main_score - vllm_main_score)
|
||||
|
||||
assert st_main_score == pytest.approx(vllm_main_score, rel=1e-4)
|
||||
assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)
|
||||
|
||||
@ -11,7 +11,8 @@ import requests
|
||||
from vllm.entrypoints.openai.protocol import EmbeddingResponse
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
from ...models.utils import run_embedding_correctness_test
|
||||
from ...models.language.pooling.embed_utils import (
|
||||
run_embedding_correctness_test)
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
|
||||
@ -11,7 +11,9 @@ import pytest
|
||||
from vllm.entrypoints.openai.protocol import EmbeddingResponse
|
||||
|
||||
from ...conftest import HfRunner
|
||||
from ...models.utils import EmbedModelInfo, run_embedding_correctness_test
|
||||
from ...models.language.pooling.embed_utils import (
|
||||
run_embedding_correctness_test)
|
||||
from ...models.utils import EmbedModelInfo
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODELS = [
|
||||
|
||||
@ -313,3 +313,37 @@ async def test_loading_invalid_adapters_does_not_break_others(
|
||||
prompt=["Hello there", "Foo bar bazz buzz"],
|
||||
max_tokens=5,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_beam_search_with_lora_adapters(
    client: openai.AsyncOpenAI,
    tmp_path,
    zephyr_lora_files,
):
    """Validate that async beam search can be used with lora."""

    async def load_and_run_adapter(adapter_name: str):
        # Register the adapter under its own name, then query it several
        # times with beam search enabled.
        await client.post("load_lora_adapter",
                          cast_to=str,
                          body={
                              "lora_name": adapter_name,
                              "lora_path": str(zephyr_lora_files)
                          })
        for _ in range(3):
            await client.completions.create(
                model=adapter_name,
                prompt=["Hello there", "Foo bar bazz buzz"],
                max_tokens=5,
                extra_body=dict(use_beam_search=True),
            )

    lora_tasks = [
        asyncio.create_task(load_and_run_adapter(f"adapter_{i}"))
        for i in range(3)
    ]

    done, pending = await asyncio.wait(lora_tasks)
    assert not pending, f"{len(pending)} adapter task(s) did not finish"

    # asyncio.wait() returns Task objects and never re-raises, so each task
    # must be asked for its exception explicitly. An isinstance check of the
    # task against Exception can never fail and would hide every error.
    for task in done:
        error = task.exception()
        assert error is None, f"Got exception {error}"
|
||||
|
||||
@ -171,10 +171,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
|
||||
|
||||
EXPECTED_METRICS = [
|
||||
"vllm:num_requests_running",
|
||||
"vllm:num_requests_swapped", # deprecated
|
||||
"vllm:num_requests_waiting",
|
||||
"vllm:gpu_cache_usage_perc",
|
||||
"vllm:cpu_cache_usage_perc", # deprecated
|
||||
"vllm:time_to_first_token_seconds_sum",
|
||||
"vllm:time_to_first_token_seconds_bucket",
|
||||
"vllm:time_to_first_token_seconds_count",
|
||||
@ -274,10 +272,7 @@ EXPECTED_METRICS_V1 = [
|
||||
"vllm:request_decode_time_seconds_count",
|
||||
]
|
||||
|
||||
HIDDEN_DEPRECATED_METRICS = [
|
||||
"vllm:num_requests_swapped",
|
||||
"vllm:cpu_cache_usage_perc",
|
||||
]
|
||||
HIDDEN_DEPRECATED_METRICS: list[str] = []
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@ -2,9 +2,10 @@
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.openai.protocol import BatchRequestOutput
|
||||
|
||||
# ruff: noqa: E501
|
||||
@ -24,9 +25,13 @@ INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "
|
||||
{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}}
|
||||
{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}"""
|
||||
|
||||
INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
|
||||
INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
|
||||
{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
|
||||
|
||||
INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
|
||||
{"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
|
||||
{"custom_id": "request-2", "method": "POST", "url": "/v2/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
|
||||
|
||||
|
||||
def test_empty_file():
|
||||
with tempfile.NamedTemporaryFile(
|
||||
@ -35,9 +40,8 @@ def test_empty_file():
|
||||
input_file.write("")
|
||||
input_file.flush()
|
||||
proc = subprocess.Popen([
|
||||
sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
|
||||
input_file.name, "-o", output_file.name, "--model",
|
||||
"intfloat/multilingual-e5-small"
|
||||
"vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
|
||||
"--model", "intfloat/multilingual-e5-small"
|
||||
], )
|
||||
proc.communicate()
|
||||
proc.wait()
|
||||
@ -54,9 +58,8 @@ def test_completions():
|
||||
input_file.write(INPUT_BATCH)
|
||||
input_file.flush()
|
||||
proc = subprocess.Popen([
|
||||
sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
|
||||
input_file.name, "-o", output_file.name, "--model",
|
||||
"NousResearch/Meta-Llama-3-8B-Instruct"
|
||||
"vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
|
||||
"--model", "NousResearch/Meta-Llama-3-8B-Instruct"
|
||||
], )
|
||||
proc.communicate()
|
||||
proc.wait()
|
||||
@ -79,9 +82,8 @@ def test_completions_invalid_input():
|
||||
input_file.write(INVALID_INPUT_BATCH)
|
||||
input_file.flush()
|
||||
proc = subprocess.Popen([
|
||||
sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
|
||||
input_file.name, "-o", output_file.name, "--model",
|
||||
"NousResearch/Meta-Llama-3-8B-Instruct"
|
||||
"vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
|
||||
"--model", "NousResearch/Meta-Llama-3-8B-Instruct"
|
||||
], )
|
||||
proc.communicate()
|
||||
proc.wait()
|
||||
@ -95,9 +97,8 @@ def test_embeddings():
|
||||
input_file.write(INPUT_EMBEDDING_BATCH)
|
||||
input_file.flush()
|
||||
proc = subprocess.Popen([
|
||||
sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
|
||||
input_file.name, "-o", output_file.name, "--model",
|
||||
"intfloat/multilingual-e5-small"
|
||||
"vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
|
||||
"--model", "intfloat/multilingual-e5-small"
|
||||
], )
|
||||
proc.communicate()
|
||||
proc.wait()
|
||||
@ -110,16 +111,17 @@ def test_embeddings():
|
||||
BatchRequestOutput.model_validate_json(line)
|
||||
|
||||
|
||||
def test_score():
|
||||
@pytest.mark.parametrize("input_batch",
|
||||
[INPUT_SCORE_BATCH, INPUT_RERANK_BATCH])
|
||||
def test_score(input_batch):
|
||||
with tempfile.NamedTemporaryFile(
|
||||
"w") as input_file, tempfile.NamedTemporaryFile(
|
||||
"r") as output_file:
|
||||
input_file.write(INPUT_SCORE_BATCH)
|
||||
input_file.write(input_batch)
|
||||
input_file.flush()
|
||||
proc = subprocess.Popen([
|
||||
sys.executable,
|
||||
"-m",
|
||||
"vllm.entrypoints.openai.run_batch",
|
||||
"vllm",
|
||||
"run-batch",
|
||||
"-i",
|
||||
input_file.name,
|
||||
"-o",
|
||||
|
||||
@ -76,11 +76,11 @@ async def test_tokenize_completions(
|
||||
})
|
||||
response.raise_for_status()
|
||||
|
||||
assert response.json() == {
|
||||
"tokens": tokens,
|
||||
"count": len(tokens),
|
||||
"max_model_len": 8192
|
||||
}
|
||||
result = response.json()
|
||||
assert result["tokens"] == tokens
|
||||
assert result["count"] == len(tokens)
|
||||
assert result["max_model_len"] == 8192
|
||||
assert result["token_strs"] is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -138,11 +138,11 @@ async def test_tokenize_chat(
|
||||
})
|
||||
response.raise_for_status()
|
||||
|
||||
assert response.json() == {
|
||||
"tokens": tokens,
|
||||
"count": len(tokens),
|
||||
"max_model_len": 8192
|
||||
}
|
||||
result = response.json()
|
||||
assert result["tokens"] == tokens
|
||||
assert result["count"] == len(tokens)
|
||||
assert result["max_model_len"] == 8192
|
||||
assert result["token_strs"] is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -215,11 +215,46 @@ async def test_tokenize_chat_with_tools(
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
assert response.json() == {
|
||||
"tokens": tokens,
|
||||
"count": len(tokens),
|
||||
"max_model_len": 8192,
|
||||
}
|
||||
result = response.json()
|
||||
assert result["tokens"] == tokens
|
||||
assert result["count"] == len(tokens)
|
||||
assert result["max_model_len"] == 8192
|
||||
assert result["token_strs"] is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name, tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_tokenize_with_return_token_strs(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """Check that the tokenize endpoint returns token strings on request."""
    reference_tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                                        tokenizer_mode="fast")

    text = "This is a token_strs test prompt! vllm1"
    payload = {
        "prompt": text,
        "model": model_name,
        "return_token_strs": True
    }
    response = requests.post(server.url_for("tokenize"), json=payload)
    response.raise_for_status()

    # Expected values come from running the same tokenizer locally.
    expected_ids = reference_tokenizer.encode(text, add_special_tokens=True)
    expected_strs = reference_tokenizer.convert_ids_to_tokens(expected_ids)

    body = response.json()
    assert body["tokens"] == expected_ids
    assert body["count"] == len(expected_ids)
    assert body["max_model_len"] == 8192
    assert body["token_strs"] == expected_strs
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
@ -191,3 +191,27 @@ def test_streaming_tool_call_with_large_steps():
|
||||
assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL
|
||||
assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL
|
||||
assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming", [False])
def test_regex_timeout_handling(streaming: bool):
    """A regex timeout must fall back to plain text instead of failing."""
    tokenizer_stub = MagicMock()
    parser_cls = ToolParserManager.get_tool_parser("llama4_pythonic")
    tool_parser: ToolParser = parser_cls(tokenizer_stub)

    fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2

    # Stand-in pattern whose match() always raises TimeoutError.
    timing_out_regex = MagicMock()
    timing_out_regex.match.side_effect = TimeoutError("Regex timeout")

    with patch.object(tool_parser, 'TOOL_CALL_REGEX', timing_out_regex):
        content, tool_calls = run_tool_extraction(
            tool_parser,
            fake_problematic_input,
            streaming=streaming,
        )

        # The input comes back untouched as content, with no tool calls,
        # and the pattern was tried exactly once.
        assert content == fake_problematic_input
        assert len(tool_calls) == 0
        timing_out_regex.match.assert_called_once()
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
@ -159,3 +159,27 @@ def test_streaming_tool_call_with_large_steps():
|
||||
assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL
|
||||
assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL
|
||||
assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming", [False])
def test_regex_timeout_handling(streaming: bool):
    """Check that a ``TimeoutError`` from the tool-call regex is tolerated.

    ``TOOL_CALL_REGEX`` on the parser instance is patched with a mock that
    raises on ``match``. The extraction result must then be the original
    text with an empty tool-call list, and ``match`` must be tried once.
    """
    tokenizer_stub = MagicMock()
    make_parser = ToolParserManager.get_tool_parser("llama4_pythonic")
    tool_parser: ToolParser = make_parser(tokenizer_stub)

    fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2

    # Pattern replacement whose match() always raises TimeoutError.
    failing_regex = MagicMock()
    failing_regex.match.side_effect = TimeoutError("Regex timeout")

    with patch.object(tool_parser, 'TOOL_CALL_REGEX', failing_regex):
        extraction = run_tool_extraction(tool_parser,
                                         fake_problematic_input,
                                         streaming=streaming)
        content, tool_calls = extraction

        # The input falls through as regular text when the regex times out.
        assert content == fake_problematic_input
        assert len(tool_calls) == 0
        failing_regex.match.assert_called_once()
|
||||
|
||||
268
tests/entrypoints/test_api_server_process_manager.py
Normal file
268
tests/entrypoints/test_api_server_process_manager.py
Normal file
@ -0,0 +1,268 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import multiprocessing
|
||||
import socket
|
||||
import threading
|
||||
import time
|
||||
from typing import Optional
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.v1.utils import (APIServerProcessManager,
|
||||
wait_for_completion_or_failure)
|
||||
|
||||
# Global variables to control worker behavior
|
||||
WORKER_RUNTIME_SECONDS = 0.5
|
||||
|
||||
|
||||
# Mock implementation of run_api_server_worker
|
||||
def mock_run_api_server_worker(listen_address, sock, args, client_config=None):
    """Stand-in worker: announce start, sleep for the configured time, exit.

    ``listen_address``, ``sock`` and ``args`` are accepted but not used;
    only ``client_config`` is echoed in the start-up message. The sleep
    duration is read from the module-level ``WORKER_RUNTIME_SECONDS``.
    """
    start_message = f"Mock worker started with client_config: {client_config}"
    print(start_message)
    time.sleep(WORKER_RUNTIME_SECONDS)
    print("Mock worker completed successfully")
|
||||
|
||||
|
||||
@pytest.fixture
def api_server_args():
    """Fixture to provide arguments for APIServerProcessManager.

    Yields a dict of keyword arguments (three servers, with matching lists
    of input/output addresses and a stats update address). The socket
    created for the ``"sock"`` entry is closed on fixture teardown so it
    does not leak a file descriptor per test.
    """
    sock = socket.socket()
    try:
        yield {
            "target_server_fn": mock_run_api_server_worker,
            "listen_address": "localhost:8000",
            "sock": sock,
            "args": "test_args",  # Simple string to avoid pickling issues
            "num_servers": 3,
            "input_addresses": [
                "tcp://127.0.0.1:5001",
                "tcp://127.0.0.1:5002",
                "tcp://127.0.0.1:5003",
            ],
            "output_addresses": [
                "tcp://127.0.0.1:6001",
                "tcp://127.0.0.1:6002",
                "tcp://127.0.0.1:6003",
            ],
            "stats_update_address": "tcp://127.0.0.1:7000",
        }
    finally:
        # socket.close() is idempotent, so this is safe even if the code
        # under test already closed the socket.
        sock.close()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("with_stats_update", [True, False])
def test_api_server_process_manager_init(api_server_args, with_stats_update):
    """Check that the manager starts three live workers and stops them.

    Runs once with and once without the ``stats_update_address`` argument.
    """
    # Keep the worker runtime short so the test finishes quickly.
    global WORKER_RUNTIME_SECONDS
    WORKER_RUNTIME_SECONDS = 0.5

    # Work on a copy so the dict supplied by the fixture is not mutated.
    manager_kwargs = dict(api_server_args)
    if not with_stats_update:
        manager_kwargs.pop("stats_update_address")

    manager = APIServerProcessManager(**manager_kwargs)

    try:
        # One process per requested server.
        assert len(manager.processes) == 3

        # Every worker is running right after construction...
        for worker in manager.processes:
            assert worker.is_alive()

        print("Waiting for processes to run...")
        time.sleep(WORKER_RUNTIME_SECONDS / 2)

        # ...and still running halfway through its runtime.
        for worker in manager.processes:
            assert worker.is_alive()

    finally:
        # Cleanup runs regardless of the outcome above.
        print("Cleaning up processes...")
        manager.close()

        # Allow a moment for the processes to exit.
        time.sleep(0.2)

        # After close() none of the workers may be left running.
        for worker in manager.processes:
            assert not worker.is_alive()
|
||||
|
||||
|
||||
@patch("vllm.entrypoints.cli.serve.run_api_server_worker",
       mock_run_api_server_worker)
def test_wait_for_completion_or_failure(api_server_args):
    """Check failure handling of wait_for_completion_or_failure.

    One worker is terminated while the wait call runs in a background
    thread. The call is expected to raise an error mentioning
    "died with exit code", and every worker must be stopped afterwards.
    """
    global WORKER_RUNTIME_SECONDS
    WORKER_RUNTIME_SECONDS = 1.0

    manager = APIServerProcessManager(**api_server_args)

    try:
        assert len(manager.processes) == 3

        # Slot the background thread uses to hand back any raised exception.
        captured: dict[str, Optional[Exception]] = {"exception": None}

        def wait_and_record():
            try:
                wait_for_completion_or_failure(api_server_manager=manager)
            except Exception as exc:
                captured["exception"] = exc

        # Run the blocking wait call off the main thread.
        waiter = threading.Thread(target=wait_and_record, daemon=True)
        waiter.start()

        # Give the workers a moment; none should have exited yet.
        time.sleep(0.2)
        for worker in manager.processes:
            assert worker.is_alive()

        # Kill one worker to simulate a failure.
        print("Simulating process failure...")
        manager.processes[0].terminate()

        # The wait call should notice the failure, stop the remaining
        # workers and return within the timeout.
        waiter.join(timeout=1.0)
        assert not waiter.is_alive()

        # An exception with the expected message must have been raised.
        failure = captured["exception"]
        assert failure is not None
        assert "died with exit code" in str(failure)

        # No worker may survive the failure handling.
        for i, proc in enumerate(manager.processes):
            assert not proc.is_alive(), f"Process {i} should not be alive"

    finally:
        manager.close()
        time.sleep(0.2)
|
||||
|
||||
|
||||
@pytest.mark.timeout(30)
def test_normal_completion(api_server_args):
    """Test that wait_for_completion_or_failure works in normal completion.

    The workers are given a short runtime and polled until all of them have
    exited on their own; wait_for_completion_or_failure is then expected to
    return without raising.
    """
    global WORKER_RUNTIME_SECONDS
    WORKER_RUNTIME_SECONDS = 0.1

    # Create the manager
    manager = APIServerProcessManager(**api_server_args)

    try:
        # Poll until every worker has finished. The list is rebuilt on each
        # pass instead of removing entries while iterating over it, which
        # would skip the element following each removed one.
        remaining_processes = manager.processes.copy()
        while remaining_processes:
            remaining_processes = [
                proc for proc in remaining_processes if proc.is_alive()
            ]
            time.sleep(0.1)

        # Verify all processes have exited
        for i, proc in enumerate(manager.processes):
            assert not proc.is_alive(), (
                f"Process {i} still alive after expected completion")

        # Now call wait_for_completion_or_failure; since all processes
        # have already exited, it should return immediately with no error.
        wait_for_completion_or_failure(api_server_manager=manager)

    finally:
        # Clean up just in case
        manager.close()
        time.sleep(0.2)
|
||||
|
||||
|
||||
@pytest.mark.timeout(30)
def test_external_process_monitoring(api_server_args):
    """Test that wait_for_completion_or_failure handles additional processes.

    An extra process, wrapped in a coordinator-like object exposing ``proc``
    and ``close()``, is passed alongside the API server manager. Terminating
    that process must make the wait call raise an error naming it, and all
    API server processes must be stopped as a result.
    """
    global WORKER_RUNTIME_SECONDS
    WORKER_RUNTIME_SECONDS = 100

    # Create and start the external process
    # (simulates local_engine_manager or coordinator).
    # The mock worker takes three required positional arguments; without
    # them the child raises TypeError on startup and exits by itself rather
    # than running until this test terminates it. The socket is not passed
    # on (the mock ignores it) to keep the arguments trivially picklable.
    spawn_context = multiprocessing.get_context("spawn")
    external_proc = spawn_context.Process(
        target=mock_run_api_server_worker,
        args=(api_server_args["listen_address"], None,
              api_server_args["args"]),
        name="MockExternalProcess")
    external_proc.start()

    # Create the class to simulate a coordinator
    class MockCoordinator:

        def __init__(self, proc):
            self.proc = proc

        def close(self):
            # Stop the wrapped process if it is still running.
            if self.proc.is_alive():
                self.proc.terminate()
                self.proc.join(timeout=0.5)

    # Create a mock coordinator with the external process
    mock_coordinator = MockCoordinator(external_proc)

    # Create the API server manager
    manager = APIServerProcessManager(**api_server_args)

    try:
        # Verify manager initialization
        assert len(manager.processes) == 3

        # Create a result capture for the thread
        result: dict[str, Optional[Exception]] = {"exception": None}

        def run_with_exception_capture():
            try:
                wait_for_completion_or_failure(api_server_manager=manager,
                                               coordinator=mock_coordinator)
            except Exception as e:
                result["exception"] = e

        # Start a thread to run wait_for_completion_or_failure
        wait_thread = threading.Thread(target=run_with_exception_capture,
                                       daemon=True)
        wait_thread.start()

        # Terminate the external process to trigger a failure
        time.sleep(0.2)
        external_proc.terminate()

        # Wait for the thread to detect the failure
        wait_thread.join(timeout=1.0)

        # The wait thread should have completed
        assert not wait_thread.is_alive(
        ), "wait_for_completion_or_failure thread still running"

        # Verify that an exception was raised with appropriate error message
        assert result["exception"] is not None, "No exception was raised"
        error_message = str(result["exception"])
        assert "died with exit code" in error_message, \
            f"Unexpected error message: {error_message}"
        assert "MockExternalProcess" in error_message, \
            f"Error doesn't mention external process: {error_message}"

        # Verify that all API server processes were terminated as a result
        for i, proc in enumerate(manager.processes):
            assert not proc.is_alive(
            ), f"API server process {i} was not terminated"

    finally:
        # Clean up
        manager.close()
        mock_coordinator.close()
        time.sleep(0.2)
|
||||
@ -13,7 +13,9 @@ HEAD_SIZES = [128, 256]
|
||||
BLOCK_SIZES = [16, 32]
|
||||
|
||||
DTYPES = [torch.float16, torch.bfloat16]
|
||||
QDTYPES = [None, torch.float8_e4m3fn]
|
||||
QDTYPES = [None, torch.float8_e4m3fn] if not current_platform.is_rocm() else [
|
||||
None, torch.float8_e4m3fnuz
|
||||
]
|
||||
# one value large enough to test overflow in index calculation.
|
||||
# one value small enough to test the schema op check
|
||||
NUM_BLOCKS = [32768, 2048]
|
||||
|
||||
@ -70,7 +70,7 @@ def test_rotary_embedding(
|
||||
device: str,
|
||||
use_key: bool,
|
||||
max_position: int = 8192,
|
||||
base: int = 10000,
|
||||
base: float = 10000,
|
||||
) -> None:
|
||||
if rotary_dim is None:
|
||||
rotary_dim = head_size
|
||||
@ -135,7 +135,7 @@ def test_batched_rotary_embedding(
|
||||
device: str,
|
||||
use_key: bool,
|
||||
max_position: int = 8192,
|
||||
base: int = 10000,
|
||||
base: float = 10000,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
@ -203,7 +203,7 @@ def test_batched_rotary_embedding_multi_lora(
|
||||
device: str,
|
||||
use_key: bool,
|
||||
max_position: int = 8192,
|
||||
base: int = 10000,
|
||||
base: float = 10000,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
|
||||
@ -1,18 +1,38 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import triton.language as tl
|
||||
|
||||
import vllm._custom_ops as ops
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.activation import SiluAndMul
|
||||
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
|
||||
BatchedPrepareAndFinalize, BatchedTritonExperts,
|
||||
invoke_moe_batched_triton_kernel)
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
|
||||
from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
||||
FusedMoEModularKernel)
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
per_token_group_quant_fp8)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import round_up
|
||||
|
||||
NUM_EXPERTS = [8, 64]
|
||||
TOP_KS = [1, 2, 6]
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
|
||||
@dataclass
|
||||
class BatchedMMConfig:
|
||||
dtype: torch.dtype
|
||||
in_dtype: torch.dtype
|
||||
out_dtype: torch.dtype
|
||||
num_experts: int
|
||||
max_tokens_per_expert: int
|
||||
K: int
|
||||
@ -28,17 +48,26 @@ class BatchedMMTensors:
|
||||
|
||||
@staticmethod
|
||||
def make_tensors(config: BatchedMMConfig):
|
||||
if config.in_dtype == torch.torch.float8_e4m3fn:
|
||||
config_in_dtype = torch.bfloat16
|
||||
else:
|
||||
config_in_dtype = config.in_dtype
|
||||
|
||||
A = torch.randn(
|
||||
(config.num_experts, config.max_tokens_per_expert, config.K),
|
||||
device="cuda",
|
||||
dtype=config.dtype) / 10
|
||||
dtype=config_in_dtype) / 10
|
||||
B = torch.randn((config.num_experts, config.N, config.K),
|
||||
device="cuda",
|
||||
dtype=config.dtype)
|
||||
dtype=config_in_dtype)
|
||||
C = torch.zeros(
|
||||
(config.num_experts, config.max_tokens_per_expert, config.N),
|
||||
device="cuda",
|
||||
dtype=config.dtype)
|
||||
dtype=config.out_dtype)
|
||||
|
||||
A = A.to(config.in_dtype)
|
||||
B = B.to(config.in_dtype)
|
||||
|
||||
num_expert_tokens = torch.randint(low=0,
|
||||
high=config.max_tokens_per_expert,
|
||||
size=(config.num_experts, ),
|
||||
@ -47,16 +76,96 @@ class BatchedMMTensors:
|
||||
return BatchedMMTensors(A, B, C, num_expert_tokens)
|
||||
|
||||
|
||||
def ref_impl(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
|
||||
num_expert_tokens: torch.Tensor) -> torch.Tensor:
|
||||
def native_w8a8_block_matmul(A: torch.Tensor,
                             B: torch.Tensor,
                             As: torch.Tensor,
                             Bs: torch.Tensor,
                             block_size,
                             output_dtype=torch.bfloat16):
    """Reference block-scaled matmul ``A @ B.T`` written in plain torch.

    Both operands are converted to float32 first, so the routine does not
    depend on the quantized storage type of ``A`` and ``B``.

    Args:
        A: Left operand of shape ``(..., K)``.
        B: Right operand of shape ``(N, K)``.
        As: Scales for ``A``, shape ``A.shape[:-1] + (ceil(K / block_k),)``.
        Bs: Scales for ``B``, shape ``(ceil(N / block_n), ceil(K / block_k))``.
        block_size: Two-element sequence ``(block_n, block_k)``.
        output_dtype: dtype of the returned tensor.

    Returns:
        Tensor of shape ``A.shape[:-1] + (N,)`` in ``output_dtype``.
    """
    lhs = A.to(torch.float32)
    rhs = B.to(torch.float32).contiguous()
    assert lhs.shape[-1] == rhs.shape[-1]
    assert rhs.ndim == 2 and rhs.is_contiguous() and Bs.ndim == 2
    assert len(block_size) == 2
    block_n, block_k = block_size[0], block_size[1]
    assert (lhs.shape[-1] + block_k - 1) // block_k == As.shape[-1], (
        f"{(lhs.shape[-1] + block_k - 1) // block_k} == {As.shape[-1]}")
    assert lhs.shape[:-1] == As.shape[:-1], f"{lhs.shape} == {As.shape}"

    # Collapse all leading dimensions of A (and its scales) into rows.
    rows = lhs.numel() // lhs.shape[-1]
    N, K = rhs.shape
    result_shape = lhs.shape[:-1] + (N, )
    lhs = lhs.reshape(rows, lhs.shape[-1])
    lhs_scales = As.reshape(rows, As.shape[-1])

    n_tiles = (N + block_n - 1) // block_n
    k_tiles = (K + block_k - 1) // block_k
    assert n_tiles == Bs.shape[0]
    assert k_tiles == Bs.shape[1]

    acc = torch.zeros((rows, N), dtype=torch.float32, device=lhs.device)

    # Each output column tile accumulates its K tiles in ascending order,
    # every partial product weighted by the matching A and B scales.
    for j in range(n_tiles):
        n_lo, n_hi = j * block_n, min((j + 1) * block_n, N)
        for i in range(k_tiles):
            k_lo, k_hi = i * block_k, min((i + 1) * block_k, K)
            scale = lhs_scales[:, i:i + 1] * Bs[j][i]
            partial = torch.matmul(lhs[:, k_lo:k_hi],
                                   rhs[n_lo:n_hi, k_lo:k_hi].t())
            acc[:, n_lo:n_hi] += partial * scale

    return acc.reshape(result_shape).to(output_dtype)
|
||||
|
||||
|
||||
def ref_impl(
|
||||
A: torch.Tensor,
|
||||
B: torch.Tensor,
|
||||
C: torch.Tensor,
|
||||
num_expert_tokens: torch.Tensor,
|
||||
A_scale: Optional[torch.Tensor],
|
||||
B_scale: Optional[torch.Tensor],
|
||||
block_shape: Optional[list[int]],
|
||||
) -> torch.Tensor:
|
||||
num_expert_tokens_cpu = num_expert_tokens.clone()
|
||||
num_expert_tokens_cpu = num_expert_tokens_cpu.to(device="cpu")
|
||||
num_experts = num_expert_tokens.size(0)
|
||||
|
||||
for e in range(num_experts):
|
||||
num_tokens = num_expert_tokens_cpu[e]
|
||||
C[e, :num_tokens, :] = A[e, :num_tokens, :] @ B[e].transpose(0, 1)
|
||||
if A.dtype == torch.torch.float8_e4m3fn:
|
||||
if False:
|
||||
tmp = native_w8a8_block_matmul(A[e, :, :],
|
||||
B[e].transpose(0, 1), A_scale,
|
||||
B_scale, block_shape)
|
||||
else:
|
||||
tmp = ops.cutlass_scaled_mm(A[e, :, :], B[e].transpose(0, 1),
|
||||
A_scale, B_scale, torch.bfloat16)
|
||||
C[e, :num_tokens, :] = tmp[:num_tokens, :]
|
||||
else:
|
||||
C[e, :num_tokens, :] = A[e, :num_tokens, :] @ B[e].transpose(0, 1)
|
||||
|
||||
return C
|
||||
|
||||
@ -66,22 +175,45 @@ def ref_impl(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
|
||||
[32, 64, 128, 192, 224, 256, 512])
|
||||
@pytest.mark.parametrize("K", [128, 256, 1024])
|
||||
@pytest.mark.parametrize("N", [128, 256, 512, 1024])
|
||||
@pytest.mark.parametrize("dtype",
|
||||
[torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize(
|
||||
"dtype",
|
||||
[torch.torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16])
|
||||
def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
||||
N: int, dtype: torch.dtype):
|
||||
|
||||
config = BatchedMMConfig(dtype, num_experts, max_tokens_per_expert, K, N)
|
||||
if dtype == torch.torch.float8_e4m3fn:
|
||||
in_dtype = dtype
|
||||
out_dtype = torch.bfloat16
|
||||
else:
|
||||
in_dtype = dtype
|
||||
out_dtype = dtype
|
||||
|
||||
config = BatchedMMConfig(in_dtype, out_dtype, num_experts,
|
||||
max_tokens_per_expert, K, N)
|
||||
tensors = BatchedMMTensors.make_tensors(config)
|
||||
|
||||
test_output = tensors.C
|
||||
ref_output = test_output.clone()
|
||||
ref_output2 = test_output.clone()
|
||||
|
||||
compute_tl_dtype = {
|
||||
torch.float16: tl.float16,
|
||||
torch.bfloat16: tl.bfloat16,
|
||||
torch.float32: tl.float32
|
||||
}[test_output.dtype]
|
||||
|
||||
use_fp8_w8a8 = dtype == torch.torch.float8_e4m3fn
|
||||
block_shape = [16, 16, 32] # 16 for k if not fp8
|
||||
|
||||
if use_fp8_w8a8:
|
||||
A_scale = torch.ones(1, dtype=torch.float32, device=tensors.A.device)
|
||||
B_scale = torch.ones(1, dtype=torch.float32, device=tensors.B.device)
|
||||
quant_block_shape = [1, 1]
|
||||
else:
|
||||
A_scale = None
|
||||
B_scale = None
|
||||
quant_block_shape = None
|
||||
|
||||
invoke_moe_batched_triton_kernel(
|
||||
tensors.A,
|
||||
tensors.B,
|
||||
@ -89,21 +221,30 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
||||
tensors.num_expert_tokens,
|
||||
compute_tl_dtype,
|
||||
# Quantization data
|
||||
None,
|
||||
None,
|
||||
A_scale,
|
||||
B_scale,
|
||||
None,
|
||||
# Quantization schemes
|
||||
False,
|
||||
use_fp8_w8a8,
|
||||
False,
|
||||
False,
|
||||
config={
|
||||
"BLOCK_SIZE_M": 16,
|
||||
"BLOCK_SIZE_N": 16,
|
||||
"BLOCK_SIZE_K": 16
|
||||
})
|
||||
"BLOCK_SIZE_M": block_shape[0],
|
||||
"BLOCK_SIZE_N": block_shape[1],
|
||||
"BLOCK_SIZE_K": block_shape[2],
|
||||
},
|
||||
block_shape=quant_block_shape,
|
||||
)
|
||||
|
||||
ref_output = ref_impl(tensors.A, tensors.B, ref_output,
|
||||
tensors.num_expert_tokens)
|
||||
ref_output = ref_output.to(dtype=out_dtype)
|
||||
ref_output = ref_impl(tensors.A.to(dtype=out_dtype),
|
||||
tensors.B.to(dtype=out_dtype), ref_output,
|
||||
tensors.num_expert_tokens, A_scale, B_scale,
|
||||
block_shape[-2:])
|
||||
|
||||
ref_output2 = ref_impl(tensors.A, tensors.B, ref_output2,
|
||||
tensors.num_expert_tokens, A_scale, B_scale,
|
||||
block_shape[-2:])
|
||||
|
||||
rtol, atol = {
|
||||
torch.float16: (6e-2, 6e-2),
|
||||
@ -111,4 +252,154 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
||||
torch.float32: (1e-2, 1e-2),
|
||||
}[test_output.dtype]
|
||||
|
||||
torch.testing.assert_close(test_output, ref_output, atol=atol, rtol=rtol)
|
||||
torch.testing.assert_close(ref_output, ref_output2, atol=atol, rtol=rtol)
|
||||
torch.testing.assert_close(test_output, ref_output2, atol=atol, rtol=rtol)
|
||||
|
||||
|
||||
def batched_moe(
    a: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weight: torch.Tensor,
    topk_ids: torch.Tensor,
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    qtype: Optional[torch.dtype] = None,
    block_shape: Optional[list[int]] = None,
    per_act_token: bool = False,
) -> torch.Tensor:
    """Run the batched Triton experts through a modular MoE kernel.

    A ``FusedMoEModularKernel`` is assembled from a single-rank
    ``BatchedPrepareAndFinalize`` and ``BatchedTritonExperts`` and applied
    to the inputs. The token capacity is ``a.shape[0]`` rounded up to a
    multiple of 64, and fp8 w8a8 mode is enabled exactly when ``qtype`` is
    ``torch.float8_e4m3fn``.
    """
    max_num_tokens = round_up(a.shape[0], 64)

    prepare_finalize = BatchedPrepareAndFinalize(max_num_tokens,
                                                 world_size=1,
                                                 dp_size=1,
                                                 rank=0,
                                                 qtype=qtype,
                                                 block_shape=block_shape,
                                                 per_act_token=per_act_token)
    is_fp8 = qtype == torch.float8_e4m3fn
    experts = BatchedTritonExperts(max_num_tokens=max_num_tokens,
                                   dp_size=1,
                                   world_size=1,
                                   use_fp8_w8a8=is_fp8,
                                   block_shape=block_shape)
    fused_experts = FusedMoEModularKernel(prepare_finalize, experts)

    return fused_experts(a,
                         w1,
                         w2,
                         topk_weight,
                         topk_ids,
                         w1_scale=w1_scale,
                         w2_scale=w2_scale)
|
||||
|
||||
|
||||
# Note: same as torch_moe but with fused_topk factored out.
|
||||
def torch_moe2(
    a: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weight: torch.Tensor,
    topk_ids: torch.Tensor,
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    use_fp8_w8a8: bool = False,
    block_shape: Optional[list[int]] = None,
) -> torch.Tensor:
    """Reference MoE forward pass that takes precomputed top-k routing.

    Each token row is repeated once per selected expert. Every expert then
    applies ``w1``, ``SiluAndMul`` and ``w2`` to the rows routed to it, and
    the per-expert results are combined with ``topk_weight``. With
    ``use_fp8_w8a8`` the activations are group-quantized with group size
    ``block_shape[1]`` and the matmuls use ``native_w8a8_block_matmul``.
    The intermediate output buffer is always bfloat16.
    """
    M, K = a.shape
    topk = topk_ids.shape[1]

    # One copy of each token row per selected expert.
    a = a.view(M, -1, K).repeat(1, topk, 1).reshape(-1, K)

    a_scale = None
    if use_fp8_w8a8:
        a, a_scale = per_token_group_quant_fp8(a, block_shape[1])

    out_dim = w2.shape[1]
    out = torch.zeros(M * topk,
                      out_dim,
                      dtype=torch.bfloat16,
                      device=a.device)

    for expert in range(w1.shape[0]):
        mask = (topk_ids == expert).view(-1)
        if not mask.sum():
            # No token was routed to this expert.
            continue

        if use_fp8_w8a8:
            hidden = native_w8a8_block_matmul(a[mask], w1[expert],
                                              a_scale[mask],
                                              w1_scale[expert], block_shape,
                                              torch.bfloat16)
            activated = SiluAndMul()(hidden)
            # Re-quantize the activation before the second projection.
            activated, b_scale = per_token_group_quant_fp8(
                activated, block_shape[1])
            out[mask] = native_w8a8_block_matmul(activated, w2[expert],
                                                 b_scale, w2_scale[expert],
                                                 block_shape, torch.bfloat16)
        else:
            hidden = a[mask] @ w1[expert].transpose(0, 1)
            activated = SiluAndMul()(hidden)
            out[mask] = activated @ w2[expert].transpose(0, 1)

    weights = topk_weight.view(M, -1, 1).to(out.dtype)
    return (out.view(M, -1, out_dim) * weights).sum(dim=1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m", [32, 45, 64])
@pytest.mark.parametrize("n", [128, 512, 1024, 2048])
@pytest.mark.parametrize("k", [128, 512, 1024, 2048])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
def test_fused_moe_batched_experts(
    m: int,
    n: int,
    k: int,
    e: int,
    topk: int,
    dtype: torch.dtype,
):
    """Compare ``batched_moe`` against the ``torch_moe2`` reference.

    Inputs are random bfloat16 tensors on CUDA. For the fp8 case the
    weights are cast to ``dtype`` and random per-block weight scales are
    generated for a fixed 128x128 block shape. Outputs must agree within
    an absolute tolerance of 2e-2.
    """
    current_platform.seed_everything(7)
    block_shape = [128, 128]

    a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=torch.bfloat16) / 10
    w2 = torch.randn((e, k, n), device="cuda", dtype=torch.bfloat16) / 10
    score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)

    # Use the public dtype name rather than the accidental `torch.torch`
    # attribute path.
    use_fp8_w8a8 = dtype == torch.float8_e4m3fn
    qtype = dtype if use_fp8_w8a8 else None

    if use_fp8_w8a8:
        block_n, block_k = block_shape[0], block_shape[1]
        # Number of scale tiles along each weight dimension (ceil division).
        n_tiles_w1 = (2 * n + block_n - 1) // block_n
        n_tiles_w2 = (k + block_n - 1) // block_n
        k_tiles_w1 = (k + block_k - 1) // block_k
        k_tiles_w2 = (n + block_k - 1) // block_k

        finfo = torch.finfo(dtype)
        fp8_min = finfo.min
        fp8_max = finfo.max

        # Clamp into the representable fp8 range before casting.
        w1 = w1.clamp(min=fp8_min, max=fp8_max).to(dtype)
        w2 = w2.clamp(min=fp8_min, max=fp8_max).to(dtype)

        factor_for_scale = 1e-2
        w1_s = torch.rand(
            (e, n_tiles_w1, k_tiles_w1), dtype=torch.float32,
            device="cuda") * factor_for_scale
        w2_s = torch.rand(
            (e, n_tiles_w2, k_tiles_w2), dtype=torch.float32,
            device="cuda") * factor_for_scale
    else:
        w1_s = None
        w2_s = None

    with set_current_vllm_config(vllm_config):
        topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
        batched_output = batched_moe(a, w1, w2, topk_weight, topk_ids, w1_s,
                                     w2_s, qtype, block_shape)
        baseline_output = torch_moe2(a, w1, w2, topk_weight, topk_ids, w1_s,
                                     w2_s, use_fp8_w8a8, block_shape)

    torch.testing.assert_close(baseline_output,
                               batched_output,
                               atol=2e-2,
                               rtol=0)
|
||||
|
||||
@ -33,7 +33,10 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (fused_topk,
|
||||
get_default_config)
|
||||
from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
||||
FusedMoEModularKernel)
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
per_token_group_quant_fp8)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import round_up
|
||||
|
||||
PPLX_PREPARE_COMBOS = [(4, 128, 128), (32, 1024, 512), (64, 1024, 512),
|
||||
(222, 2048, 1024)]
|
||||
@ -74,6 +77,11 @@ class ProcessGroupInfo:
|
||||
device: torch.device
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
def use_pplx_backend(monkeypatch):
    """Set the ``VLLM_ALL2ALL_BACKEND`` environment variable to ``pplx``.

    Autouse and function-scoped, so it is applied to each test in this
    fixture's scope without being requested; ``monkeypatch`` undoes the
    change after the test.
    """
    monkeypatch.setenv("VLLM_ALL2ALL_BACKEND", "pplx")
|
||||
|
||||
|
||||
def _worker_parallel_launch(
|
||||
local_rank: int,
|
||||
world_size: int,
|
||||
@ -275,6 +283,70 @@ def batched_moe(
|
||||
return fused_experts(a, w1, w2, topk_weight, topk_ids, num_experts)
|
||||
|
||||
|
||||
def native_w8a8_block_matmul(A: torch.Tensor,
                             B: torch.Tensor,
                             As: torch.Tensor,
                             Bs: torch.Tensor,
                             block_size,
                             output_dtype=torch.bfloat16):
    """Compute a block-wise scaled ``A @ B.T`` using only native torch ops.

    The inputs are cast to float32 before any arithmetic, so the function
    works regardless of the quantized dtype the operands are stored in.

    Args:
        A: Activations, shape ``(..., K)``.
        B: Weights, shape ``(N, K)``.
        As: Activation scales with one entry per row of ``A`` and per
            ``block_k``-wide slice of ``K``.
        Bs: Weight scales, one entry per ``(block_n, block_k)`` tile of ``B``.
        block_size: ``(block_n, block_k)``.
        output_dtype: dtype the result is converted to.

    Returns:
        The product with shape ``A.shape[:-1] + (N,)`` in ``output_dtype``.
    """
    a_f32 = A.to(torch.float32)
    b_f32 = B.to(torch.float32).contiguous()
    assert a_f32.shape[-1] == b_f32.shape[-1]
    assert b_f32.ndim == 2 and b_f32.is_contiguous() and Bs.ndim == 2
    assert len(block_size) == 2
    block_n, block_k = block_size[0], block_size[1]
    assert (a_f32.shape[-1] + block_k - 1) // block_k == As.shape[-1], (
        f"{(a_f32.shape[-1] + block_k - 1) // block_k} == {As.shape[-1]}")
    assert a_f32.shape[:-1] == As.shape[:-1], f"{a_f32.shape} == {As.shape}"

    # Flatten the leading dimensions so the product is a plain 2-D matmul.
    M = a_f32.numel() // a_f32.shape[-1]
    N, K = b_f32.shape
    final_shape = a_f32.shape[:-1] + (N, )
    a_f32 = a_f32.reshape(M, a_f32.shape[-1])
    a_scales = As.reshape(M, As.shape[-1])

    n_tiles = (N + block_n - 1) // block_n
    k_tiles = (K + block_k - 1) // block_k
    assert n_tiles == Bs.shape[0]
    assert k_tiles == Bs.shape[1]

    # Tile boundaries along N and K; the last tile may be narrower.
    n_bounds = [(j * block_n, min((j + 1) * block_n, N))
                for j in range(n_tiles)]
    k_bounds = [(i * block_k, min((i + 1) * block_k, K))
                for i in range(k_tiles)]

    total = torch.zeros((M, N), dtype=torch.float32, device=a_f32.device)

    for j, (n0, n1) in enumerate(n_bounds):
        for i, (k0, k1) in enumerate(k_bounds):
            # Scale for this tile: per-row A scale times the tile's B scale.
            tile_scale = a_scales[:, i:i + 1] * Bs[j][i]
            tile_product = torch.matmul(a_f32[:, k0:k1],
                                        b_f32[n0:n1, k0:k1].t())
            total[:, n0:n1] += tile_product * tile_scale

    return total.reshape(final_shape).to(output_dtype)
|
||||
|
||||
|
||||
# Note: same as torch_moe but with fused_topk factored out.
|
||||
def torch_moe2(
|
||||
a: torch.Tensor,
|
||||
@ -282,17 +354,44 @@ def torch_moe2(
|
||||
w2: torch.Tensor,
|
||||
topk_weight: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
w1_scale: Optional[torch.Tensor] = None,
|
||||
w2_scale: Optional[torch.Tensor] = None,
|
||||
use_fp8_w8a8: bool = False,
|
||||
block_shape: Optional[list[int]] = None,
|
||||
) -> torch.Tensor:
|
||||
M, K = a.shape
|
||||
topk = topk_ids.shape[1]
|
||||
|
||||
a = a.view(M, -1, K).repeat(1, topk, 1).reshape(-1, K)
|
||||
out = torch.zeros(M * topk, w2.shape[1], dtype=a.dtype, device=a.device)
|
||||
|
||||
if use_fp8_w8a8:
|
||||
a, a_scale = per_token_group_quant_fp8(a, block_shape[1])
|
||||
else:
|
||||
a_scale = None
|
||||
|
||||
out = torch.zeros(M * topk,
|
||||
w2.shape[1],
|
||||
dtype=torch.bfloat16,
|
||||
device=a.device)
|
||||
num_experts = w1.shape[0]
|
||||
for i in range(num_experts):
|
||||
mask = (topk_ids == i).view(-1)
|
||||
if mask.sum():
|
||||
out[mask] = SiluAndMul()(
|
||||
a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1)
|
||||
if not use_fp8_w8a8:
|
||||
tmp1 = a[mask] @ w1[i].transpose(0, 1)
|
||||
tmp2 = SiluAndMul()(tmp1)
|
||||
out[mask] = tmp2 @ w2[i].transpose(0, 1)
|
||||
else:
|
||||
tmp1 = native_w8a8_block_matmul(a[mask], w1[i], a_scale[mask],
|
||||
w1_scale[i], block_shape,
|
||||
torch.bfloat16)
|
||||
|
||||
tmp2 = SiluAndMul()(tmp1)
|
||||
tmp2, b_scale = per_token_group_quant_fp8(tmp2, block_shape[1])
|
||||
|
||||
out[mask] = native_w8a8_block_matmul(tmp2, w2[i], b_scale,
|
||||
w2_scale[i], block_shape,
|
||||
torch.bfloat16)
|
||||
|
||||
return (out.view(M, -1, w2.shape[1]) *
|
||||
topk_weight.view(M, -1, 1).to(out.dtype)).sum(dim=1)
|
||||
@ -497,6 +596,10 @@ def pplx_moe(
|
||||
w2: torch.Tensor,
|
||||
topk_weight: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
w1_scale: Optional[torch.Tensor] = None,
|
||||
w2_scale: Optional[torch.Tensor] = None,
|
||||
qtype: Optional[torch.dtype] = None,
|
||||
block_shape: Optional[list[int]] = None,
|
||||
use_compile: bool = True,
|
||||
use_cudagraphs: bool = True,
|
||||
) -> torch.Tensor:
|
||||
@ -506,9 +609,17 @@ def pplx_moe(
|
||||
device = torch.device("cuda", rank)
|
||||
hidden_dim = a.shape[1]
|
||||
num_experts = w1.shape[0]
|
||||
block_size = 128
|
||||
block_size = block_shape[1] if block_shape is not None else 128
|
||||
topk = topk_ids.shape[1]
|
||||
max_num_tokens = rank_chunk(a.shape[0], 0, world_size)
|
||||
max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 64)
|
||||
|
||||
if qtype is not None:
|
||||
a_dtype = qtype
|
||||
# This is probably not right
|
||||
scale_bytes = round_up(((hidden_dim + block_size - 1) // block_size) * torch.float32.itemsize, 16)
|
||||
else:
|
||||
a_dtype = a.dtype
|
||||
scale_bytes = 0
|
||||
|
||||
ata = AllToAll.internode(
|
||||
max_num_tokens=max_num_tokens,
|
||||
@ -518,10 +629,8 @@ def pplx_moe(
|
||||
world_size=world_size,
|
||||
dp_size=dp_size,
|
||||
hidden_dim=hidden_dim,
|
||||
hidden_dim_bytes=hidden_dim * a.dtype.itemsize,
|
||||
hidden_dim_scale_bytes=(0 if a.dtype.itemsize != 1 else
|
||||
((hidden_dim + block_size - 1) // block_size *
|
||||
torch.float32.itemsize)),
|
||||
hidden_dim_bytes=hidden_dim * a_dtype.itemsize,
|
||||
hidden_dim_scale_bytes=scale_bytes,
|
||||
)
|
||||
|
||||
topk_ids = topk_ids.to(dtype=torch.uint32)
|
||||
@ -532,11 +641,15 @@ def pplx_moe(
|
||||
world_size,
|
||||
rank,
|
||||
dp_size,
|
||||
quant_dtype=qtype,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
experts = BatchedTritonExperts(max_num_tokens=a.shape[0],
|
||||
experts = BatchedTritonExperts(max_num_tokens=max_num_tokens,
|
||||
world_size=world_size,
|
||||
dp_size=dp_size)
|
||||
dp_size=dp_size,
|
||||
use_fp8_w8a8=qtype==torch.float8_e4m3fn,
|
||||
block_shape=block_shape)
|
||||
|
||||
fused_experts = FusedMoEModularKernel(
|
||||
prepare_finalize,
|
||||
@ -552,6 +665,13 @@ def pplx_moe(
|
||||
w1_chunk = chunk_by_rank(w1, rank, world_size).to(device)
|
||||
w2_chunk = chunk_by_rank(w2, rank, world_size).to(device)
|
||||
|
||||
if w1_scale is not None:
|
||||
w1_scale_chunk = chunk_by_rank(w1_scale, rank, world_size).to(device)
|
||||
w2_scale_chunk = chunk_by_rank(w2_scale, rank, world_size).to(device)
|
||||
else:
|
||||
w1_scale_chunk = None
|
||||
w2_scale_chunk = None
|
||||
|
||||
if use_compile:
|
||||
_fused_experts = torch.compile(fused_experts,
|
||||
backend='inductor',
|
||||
@ -564,6 +684,8 @@ def pplx_moe(
|
||||
w2_chunk,
|
||||
chunk_topk_weight,
|
||||
chunk_topk_ids,
|
||||
w1_scale=w1_scale_chunk,
|
||||
w2_scale=w2_scale_chunk,
|
||||
global_num_experts=num_experts)
|
||||
|
||||
if use_cudagraphs:
|
||||
@ -576,6 +698,8 @@ def pplx_moe(
|
||||
w2_chunk,
|
||||
chunk_topk_weight,
|
||||
chunk_topk_ids,
|
||||
w1_scale=w1_scale_chunk,
|
||||
w2_scale=w2_scale_chunk,
|
||||
global_num_experts=num_experts)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
@ -638,6 +762,10 @@ def _pplx_moe(
|
||||
w2: torch.Tensor,
|
||||
score: torch.Tensor,
|
||||
topk: int,
|
||||
w1_s: Optional[torch.Tensor] = None,
|
||||
w2_s: Optional[torch.Tensor] = None,
|
||||
qtype: Optional[torch.dtype] = None,
|
||||
block_shape: Optional[list[int]] = None,
|
||||
):
|
||||
uid = nvshmem_get_unique_id(
|
||||
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
|
||||
@ -649,11 +777,20 @@ def _pplx_moe(
|
||||
|
||||
moe_config = get_default_config(m, e, n, k, topk, a.dtype, False)
|
||||
|
||||
use_fp8_w8a8 = qtype == torch.float8_e4m3fn
|
||||
|
||||
device = torch.device("cuda", pgi.rank)
|
||||
a = a.to(device)
|
||||
w1 = w1.to(device)
|
||||
w2 = w2.to(device)
|
||||
w1_s = w1_s.to(device) if w1_s is not None else None
|
||||
w2_s = w2_s.to(device) if w2_s is not None else None
|
||||
|
||||
with set_current_vllm_config(vllm_config), override_config(moe_config):
|
||||
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
||||
torch_output = torch_moe2(a, w1, w2, topk_weight, topk_ids)
|
||||
torch_output = torch_moe2(a, w1, w2, topk_weight, topk_ids, w1_s, w2_s, use_fp8_w8a8, block_shape)
|
||||
pplx_output = pplx_moe(pgi.rank, pgi.world_size, dp_size, a, w1, w2,
|
||||
topk_weight, topk_ids)
|
||||
topk_weight, topk_ids, w1_s, w2_s, qtype, block_shape)
|
||||
# TODO (bnell): fix + re-enable
|
||||
#batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight,
|
||||
# topk_ids)
|
||||
@ -670,7 +807,7 @@ def _pplx_moe(
|
||||
@pytest.mark.parametrize("mnk", PPLX_MOE_COMBOS)
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
|
||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
||||
@requires_pplx
|
||||
def test_pplx_moe(
|
||||
@ -683,9 +820,40 @@ def test_pplx_moe(
|
||||
current_platform.seed_everything(7)
|
||||
m, n, k = mnk
|
||||
world_size, dp_size = world_dp_size
|
||||
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
|
||||
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
|
||||
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
|
||||
score = torch.randn((m, e), device="cuda", dtype=dtype)
|
||||
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
|
||||
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=torch.bfloat16) / 10
|
||||
w2 = torch.randn((e, k, n), device="cuda", dtype=torch.bfloat16) / 10
|
||||
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
|
||||
|
||||
parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk)
|
||||
use_fp8_w8a8 = dtype == torch.float8_e4m3fn
|
||||
|
||||
if use_fp8_w8a8:
|
||||
block_shape = [128, 128]
|
||||
quant_type = torch.float8_e4m3fn
|
||||
block_n, block_k = block_shape[0], block_shape[1]
|
||||
n_tiles_w1 = (2 * n + block_n - 1) // block_n
|
||||
n_tiles_w2 = (k + block_n - 1) // block_n
|
||||
k_tiles_w1 = (k + block_k - 1) // block_k
|
||||
k_tiles_w2 = (n + block_k - 1) // block_k
|
||||
|
||||
finfo = torch.finfo(dtype)
|
||||
fp8_min = finfo.min
|
||||
fp8_max = finfo.max
|
||||
|
||||
w1 = w1.clamp(min=fp8_min, max=fp8_max).to(dtype)
|
||||
w2 = w2.clamp(min=fp8_min, max=fp8_max).to(dtype)
|
||||
|
||||
factor_for_scale = 1e-2
|
||||
w1_s = torch.rand(
|
||||
(e, n_tiles_w1, k_tiles_w1), dtype=torch.float32,
|
||||
device="cuda") * factor_for_scale
|
||||
w2_s = torch.rand(
|
||||
(e, n_tiles_w2, k_tiles_w2), dtype=torch.float32,
|
||||
device="cuda") * factor_for_scale
|
||||
else:
|
||||
block_shape = None
|
||||
quant_type = None
|
||||
w1_s = None
|
||||
w2_s = None
|
||||
|
||||
parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk, w1_s, w2_s, quant_type, block_shape)
|
||||
|
||||
@ -35,6 +35,15 @@ def test_rocm_aiter_biased_grouped_topk_custom_op_registration():
|
||||
assert callable(torch.ops.vllm.rocm_aiter_biased_grouped_topk)
|
||||
|
||||
|
||||
def test_rocm_aiter_grouped_topk_custom_op_registration():
|
||||
"""Test that the custom op is correctly registered."""
|
||||
# Check if the op exists in torch.ops.vllm
|
||||
assert hasattr(torch.ops.vllm, 'rocm_aiter_grouped_topk')
|
||||
|
||||
# Check if the op is callable
|
||||
assert callable(torch.ops.vllm.rocm_aiter_grouped_topk)
|
||||
|
||||
|
||||
def test_rocm_aiter_biased_grouped_topk_torch_compile_compatibility():
|
||||
"""Test that the op can be used with torch.compile."""
|
||||
# Create test tensors
|
||||
@ -120,3 +129,87 @@ def test_rocm_aiter_biased_grouped_topk_torch_compile_compatibility():
|
||||
rtol=1e-2,
|
||||
atol=1e-2)
|
||||
assert torch.allclose(topk_ids_original, topk_ids_compiled)
|
||||
|
||||
|
||||
def test_rocm_aiter_grouped_topk_torch_compile_compatibility():
|
||||
"""Test that the op can be used with torch.compile."""
|
||||
# Create test tensors
|
||||
token = 64
|
||||
expert = 256
|
||||
num_expert_group = 8
|
||||
topk = 8
|
||||
topk_group = 4
|
||||
renormalize = True
|
||||
scoring_func = "softmax"
|
||||
scale_factor = 1.0
|
||||
|
||||
gating_output = torch.randn((token, expert),
|
||||
dtype=torch.bfloat16,
|
||||
device="cuda")
|
||||
|
||||
device = gating_output.device
|
||||
topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device)
|
||||
topk_weights = torch.empty((token, topk),
|
||||
dtype=torch.float32,
|
||||
device=device)
|
||||
|
||||
# Define a function that uses the op
|
||||
def grouped_topk_fn(gating_output, topk_weights, topk_ids, scoring_func):
|
||||
return torch.ops.vllm.rocm_aiter_grouped_topk(
|
||||
gating_output, topk_weights, topk_ids, num_expert_group,
|
||||
topk_group, renormalize, scoring_func, scale_factor)
|
||||
|
||||
# Verify the op's fake implementation
|
||||
torch.library.opcheck(torch.ops.vllm.rocm_aiter_grouped_topk,
|
||||
(gating_output, topk_weights, topk_ids),
|
||||
kwargs={
|
||||
"num_expert_group": num_expert_group,
|
||||
"topk_group": topk_group,
|
||||
"need_renorm": renormalize,
|
||||
"scoring_func": scoring_func,
|
||||
"routed_scaling_factor": scale_factor
|
||||
},
|
||||
test_utils=("test_faketensor"))
|
||||
|
||||
# Compile the function with appropriate settings
|
||||
compiled_fn = torch.compile(grouped_topk_fn,
|
||||
fullgraph=True,
|
||||
backend="inductor",
|
||||
mode="reduce-overhead",
|
||||
dynamic=False)
|
||||
|
||||
topk_weights_original = torch.empty((token, topk),
|
||||
dtype=torch.float32,
|
||||
device=device)
|
||||
topk_ids_original = torch.empty((token, topk),
|
||||
dtype=torch.int32,
|
||||
device=device)
|
||||
|
||||
topk_weights_compiled = torch.empty((token, topk),
|
||||
dtype=torch.float32,
|
||||
device=device)
|
||||
topk_ids_compiled = torch.empty((token, topk),
|
||||
dtype=torch.int32,
|
||||
device=device)
|
||||
|
||||
# Run both compiled (V1 graph mode) and uncompiled versions (V1 eager mode)
|
||||
grouped_topk_fn(gating_output, topk_weights_original, topk_ids_original,
|
||||
scoring_func)
|
||||
compiled_fn(gating_output, topk_weights_compiled, topk_ids_compiled,
|
||||
scoring_func)
|
||||
|
||||
# Sort the results for comparison since the order might not be deterministic
|
||||
topk_ids_original, indices_original = torch.sort(topk_ids_original)
|
||||
topk_weights_original = torch.gather(topk_weights_original, 1,
|
||||
indices_original)
|
||||
|
||||
topk_ids_compiled, indices_compiled = torch.sort(topk_ids_compiled)
|
||||
topk_weights_compiled = torch.gather(topk_weights_compiled, 1,
|
||||
indices_compiled)
|
||||
|
||||
# Verify results match
|
||||
assert torch.allclose(topk_weights_original,
|
||||
topk_weights_compiled,
|
||||
rtol=1e-2,
|
||||
atol=1e-2)
|
||||
assert torch.allclose(topk_ids_original, topk_ids_compiled)
|
||||
|
||||
@ -8,7 +8,7 @@ from vllm.platforms import current_platform
|
||||
|
||||
# Using the default value (240.0) from pytorch will cause accuracy
|
||||
# issue on dynamic quantization models. Here use 224.0 for rocm.
|
||||
ROCM_FP8_MAX = 224.0
|
||||
ROCM_FP8FNUZ_MAX = 224.0
|
||||
FP8_DTYPE = current_platform.fp8_dtype()
|
||||
|
||||
|
||||
@ -26,9 +26,11 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
|
||||
|
||||
qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \
|
||||
else torch.finfo(quant_dtype)
|
||||
qtype_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \
|
||||
qtype_traits_max = ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \
|
||||
and current_platform.is_fp8_fnuz() \
|
||||
else qtype_traits.max
|
||||
qtype_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \
|
||||
qtype_traits_min = -ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \
|
||||
and current_platform.is_fp8_fnuz() \
|
||||
else qtype_traits.min
|
||||
qtype_max = as_float32_tensor(qtype_traits_max)
|
||||
s_1 = as_float32_tensor(1.0)
|
||||
@ -70,9 +72,11 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
|
||||
-> tuple[torch.tensor, torch.tensor]:
|
||||
|
||||
fp8_traits = torch.finfo(FP8_DTYPE)
|
||||
fp8_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \
|
||||
fp8_traits_max = ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \
|
||||
and current_platform.is_fp8_fnuz() \
|
||||
else fp8_traits.max
|
||||
fp8_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \
|
||||
fp8_traits_min = -ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \
|
||||
and current_platform.is_fp8_fnuz() \
|
||||
else fp8_traits.min
|
||||
fp8_max = as_float32_tensor(fp8_traits_max)
|
||||
one = as_float32_tensor(1.0)
|
||||
|
||||
@ -24,16 +24,16 @@ if current_platform.is_rocm():
|
||||
MODELS = [
|
||||
ModelWithQuantization(
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
|
||||
quantization="GPTQ"),
|
||||
quantization="gptq"),
|
||||
]
|
||||
else:
|
||||
MODELS = [
|
||||
ModelWithQuantization(
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
|
||||
quantization="AWQ"),
|
||||
quantization="awq"),
|
||||
ModelWithQuantization(
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
|
||||
quantization="GPTQ"),
|
||||
quantization="gptq"),
|
||||
]
|
||||
|
||||
|
||||
@ -100,7 +100,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):
|
||||
"#ff8050",
|
||||
"#ff8080",
|
||||
]
|
||||
elif model.quantization == "AWQ":
|
||||
elif model.quantization == "awq":
|
||||
expected_no_lora_output = [
|
||||
"I'm sorry, I don't understand",
|
||||
"I'm sorry, I don't understand",
|
||||
@ -109,7 +109,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):
|
||||
"#f07700: A v",
|
||||
"#f00000: A v",
|
||||
]
|
||||
elif model.quantization == "GPTQ":
|
||||
elif model.quantization == "gptq":
|
||||
expected_no_lora_output = [
|
||||
"I'm sorry, I don't have",
|
||||
"I'm sorry, I don't have",
|
||||
@ -122,7 +122,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):
|
||||
def expect_match(output, expected_output):
|
||||
# HACK: GPTQ lora outputs are just incredibly unstable.
|
||||
# Assert that the outputs changed.
|
||||
if (model.quantization == "GPTQ"
|
||||
if (model.quantization == "gptq"
|
||||
and expected_output is expected_lora_output):
|
||||
assert output != expected_no_lora_output
|
||||
for i, o in enumerate(output):
|
||||
@ -172,7 +172,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
|
||||
model):
|
||||
if num_gpus_available < 2:
|
||||
pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
|
||||
if model.quantization == "GPTQ":
|
||||
if model.quantization == "gptq":
|
||||
pytest.skip("GPTQ lora outputs are just incredibly unstable")
|
||||
llm_tp1 = vllm.LLM(
|
||||
model=model.model_path,
|
||||
|
||||
@ -10,6 +10,7 @@ import vllm
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sampling_params import BeamSearchParams
|
||||
|
||||
|
||||
@pytest.fixture(autouse=not current_platform.is_cpu())
|
||||
@ -69,7 +70,7 @@ class Qwen2VLTester:
|
||||
expected_outputs: list[str],
|
||||
lora_id: Optional[int] = None,
|
||||
temperature: float = 0,
|
||||
max_tokens: int = 5) -> list[str]:
|
||||
max_tokens: int = 5):
|
||||
|
||||
sampling_params = vllm.SamplingParams(
|
||||
temperature=temperature,
|
||||
@ -97,7 +98,35 @@ class Qwen2VLTester:
|
||||
generated), f"Generated text {generated} doesn't "
|
||||
f"match expected pattern {expected}"
|
||||
|
||||
return generated_texts
|
||||
def run_beam_search_test(self,
|
||||
images: list[ImageAsset],
|
||||
expected_outputs: list[list[str]],
|
||||
lora_id: Optional[int] = None,
|
||||
temperature: float = 0,
|
||||
beam_width: int = 2,
|
||||
max_tokens: int = 5):
|
||||
|
||||
beam_search_params = BeamSearchParams(beam_width=beam_width,
|
||||
max_tokens=max_tokens,
|
||||
temperature=temperature)
|
||||
|
||||
inputs = [{
|
||||
"prompt": self.PROMPT_TEMPLATE,
|
||||
"multi_modal_data": {
|
||||
"image": asset.pil_image
|
||||
},
|
||||
} for asset in images]
|
||||
|
||||
lora_request = LoRARequest(str(lora_id), lora_id,
|
||||
self.config.lora_path)
|
||||
outputs = self.llm.beam_search(inputs,
|
||||
beam_search_params,
|
||||
lora_request=lora_request)
|
||||
|
||||
for output_obj, expected_outs in zip(outputs, expected_outputs):
|
||||
output_texts = [seq.text for seq in output_obj.sequences]
|
||||
assert output_texts == expected_outs, \
|
||||
f"Generated texts {output_texts} do not match expected {expected_outs}" # noqa: E501
|
||||
|
||||
|
||||
TEST_IMAGES = [
|
||||
@ -110,6 +139,14 @@ EXPECTED_OUTPUTS = [
|
||||
"A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.", # noqa: E501
|
||||
]
|
||||
|
||||
# NOTE - beam search .text contains the whole text
|
||||
EXPECTED_BEAM_SEARCH_OUTPUTS = [
|
||||
[
|
||||
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic skyscraper stands", # noqa: E501
|
||||
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic tower stands tall", # noqa: E501
|
||||
],
|
||||
]
|
||||
|
||||
QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
|
||||
QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
|
||||
@ -130,6 +167,27 @@ def test_qwen2vl_lora(qwen2vl_lora_files):
|
||||
lora_id=lora_id)
|
||||
|
||||
|
||||
@pytest.mark.xfail(
|
||||
current_platform.is_rocm(),
|
||||
reason="Qwen2-VL dependency xformers incompatible with ROCm")
|
||||
def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
|
||||
"""Test Qwen 2.0 VL model with LoRA through beam search."""
|
||||
config = TestConfig(model_path=QWEN2VL_MODEL_PATH,
|
||||
lora_path=qwen2vl_lora_files)
|
||||
tester = Qwen2VLTester(config)
|
||||
|
||||
# Test with different LoRA IDs
|
||||
for lora_id in [1, 2]:
|
||||
# NOTE currently, we only test cherry blossom since stop sign
|
||||
# output is slightly different for v1; - the root cause is likely
|
||||
# independent of the intent of this test, which is to ensure beam
|
||||
# search passes through lora through correctly.
|
||||
tester.run_beam_search_test(
|
||||
[ImageAsset("cherry_blossom")],
|
||||
expected_outputs=EXPECTED_BEAM_SEARCH_OUTPUTS,
|
||||
lora_id=lora_id)
|
||||
|
||||
|
||||
@pytest.mark.xfail(
|
||||
current_platform.is_rocm(),
|
||||
reason="Qwen2.5-VL dependency xformers incompatible with ROCm",
|
||||
|
||||
@ -4,7 +4,7 @@ import os
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.model_executor.layers.pooler import CLSPool, PoolingType
|
||||
from vllm.model_executor.layers.pooler import CLSPool, MeanPool, PoolingType
|
||||
from vllm.model_executor.models.bert import BertEmbeddingModel
|
||||
from vllm.model_executor.models.roberta import RobertaEmbeddingModel
|
||||
from vllm.platforms import current_platform
|
||||
@ -14,7 +14,7 @@ MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5")
|
||||
REVISION = os.environ.get("REVISION", "main")
|
||||
|
||||
MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME",
|
||||
"intfloat/multilingual-e5-small")
|
||||
"intfloat/multilingual-e5-base")
|
||||
REVISION_ROBERTA = os.environ.get("REVISION", "main")
|
||||
|
||||
|
||||
@ -40,17 +40,15 @@ def test_model_loading_with_params(vllm_runner):
|
||||
|
||||
# asserts on the pooling config files
|
||||
assert model_config.pooler_config.pooling_type == PoolingType.CLS.name
|
||||
assert model_config.pooler_config.pooling_norm
|
||||
assert model_config.pooler_config.normalize
|
||||
|
||||
# asserts on the tokenizer loaded
|
||||
assert model_tokenizer.tokenizer_id == "BAAI/bge-base-en-v1.5"
|
||||
assert model_tokenizer.tokenizer_config["do_lower_case"]
|
||||
assert model_tokenizer.tokenizer.model_max_length == 512
|
||||
|
||||
def check_model(model):
|
||||
assert isinstance(model, BertEmbeddingModel)
|
||||
assert model._pooler.pooling_type == PoolingType.CLS
|
||||
assert model._pooler.normalize
|
||||
assert isinstance(model._pooler, CLSPool)
|
||||
|
||||
vllm_model.apply_model(check_model)
|
||||
|
||||
@ -80,16 +78,15 @@ def test_roberta_model_loading_with_params(vllm_runner):
|
||||
|
||||
# asserts on the pooling config files
|
||||
assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name
|
||||
assert model_config.pooler_config.pooling_norm
|
||||
assert model_config.pooler_config.normalize
|
||||
|
||||
# asserts on the tokenizer loaded
|
||||
assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-small"
|
||||
assert not model_tokenizer.tokenizer_config["do_lower_case"]
|
||||
assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-base"
|
||||
assert model_tokenizer.tokenizer.model_max_length == 512
|
||||
|
||||
def check_model(model):
|
||||
assert isinstance(model, RobertaEmbeddingModel)
|
||||
assert model._pooler.pooling_type == PoolingType.MEAN
|
||||
assert model._pooler.normalize
|
||||
assert isinstance(model._pooler, MeanPool)
|
||||
|
||||
vllm_model.apply_model(check_model)
|
||||
|
||||
|
||||
72
tests/models/language/pooling/embed_utils.py
Normal file
72
tests/models/language/pooling/embed_utils.py
Normal file
@ -0,0 +1,72 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from collections.abc import Sequence
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.conftest import HfRunner
|
||||
from tests.models.utils import (EmbedModelInfo, check_embeddings_close,
|
||||
matryoshka_fy)
|
||||
|
||||
|
||||
def run_embedding_correctness_test(
|
||||
hf_model: "HfRunner",
|
||||
inputs: list[str],
|
||||
vllm_outputs: Sequence[list[float]],
|
||||
dimensions: Optional[int] = None,
|
||||
):
|
||||
hf_outputs = hf_model.encode(inputs)
|
||||
if dimensions:
|
||||
hf_outputs = matryoshka_fy(hf_outputs, dimensions)
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=1e-2,
|
||||
)
|
||||
|
||||
|
||||
def correctness_test_embed_models(hf_runner,
|
||||
vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts,
|
||||
vllm_extra_kwargs=None,
|
||||
hf_model_callback=None):
|
||||
if not model_info.enable_test:
|
||||
# A model family has many models with the same architecture,
|
||||
# and we don't need to test each one.
|
||||
pytest.skip("Skipping test.")
|
||||
|
||||
# The example_prompts has ending "\n", for example:
|
||||
# "Write a short story about a robot that dreams for the first time.\n"
|
||||
# sentence_transformers will strip the input texts, see:
|
||||
# https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
|
||||
# This makes the input_ids different between hf_model and vllm_model.
|
||||
# So we need to strip the input texts to avoid test failing.
|
||||
example_prompts = [str(s).strip() for s in example_prompts]
|
||||
|
||||
vllm_extra_kwargs = vllm_extra_kwargs or {}
|
||||
vllm_extra_kwargs["dtype"] = model_info.dtype
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
task="embed",
|
||||
max_model_len=None,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
vllm_outputs = vllm_model.encode(example_prompts)
|
||||
vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
|
||||
model_dtype = getattr(
|
||||
vllm_model.model.llm_engine.model_config.hf_config, "torch_dtype",
|
||||
vllm_dtype)
|
||||
|
||||
with hf_runner(
|
||||
model_info.name,
|
||||
dtype=model_dtype,
|
||||
is_sentence_transformer=True,
|
||||
) as hf_model:
|
||||
|
||||
if hf_model_callback is not None:
|
||||
hf_model_callback(hf_model)
|
||||
|
||||
run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
|
||||
@ -80,18 +80,19 @@ def run_mteb_embed_task_st(model_name, tasks):
|
||||
def mteb_test_embed_models(hf_runner,
|
||||
vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
vllm_extra_kwargs=None):
|
||||
vllm_extra_kwargs=None,
|
||||
hf_model_callback=None):
|
||||
if not model_info.enable_test:
|
||||
# A model family has many models with the same architecture,
|
||||
# and we don't need to test each one.
|
||||
pytest.skip("Skipping test.")
|
||||
|
||||
vllm_extra_kwargs = vllm_extra_kwargs or {}
|
||||
vllm_extra_kwargs["dtype"] = model_info.dtype
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
task="embed",
|
||||
max_model_len=None,
|
||||
dtype=model_info.dtype,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
|
||||
if model_info.architecture:
|
||||
@ -101,17 +102,18 @@ def mteb_test_embed_models(hf_runner,
|
||||
vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
|
||||
MTEB_EMBED_TASKS)
|
||||
vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
|
||||
model_dtype = getattr(
|
||||
vllm_model.model.llm_engine.model_config.hf_config, "torch_dtype",
|
||||
vllm_dtype)
|
||||
|
||||
with set_default_torch_dtype(model_dtype) and hf_runner(
|
||||
with set_default_torch_dtype(vllm_dtype) and hf_runner(
|
||||
model_info.name, is_sentence_transformer=True,
|
||||
dtype=model_dtype) as hf_model:
|
||||
dtype=vllm_dtype) as hf_model:
|
||||
|
||||
if hf_model_callback is not None:
|
||||
hf_model_callback(hf_model)
|
||||
|
||||
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
|
||||
|
||||
print("VLLM:", vllm_dtype, vllm_main_score)
|
||||
print("SentenceTransformer:", model_dtype, st_main_score)
|
||||
print("VLLM:", vllm_main_score)
|
||||
print("SentenceTransformers:", st_main_score)
|
||||
print("Difference:", st_main_score - vllm_main_score)
|
||||
|
||||
assert st_main_score == pytest.approx(vllm_main_score, rel=MTEB_EMBED_TOL)
|
||||
assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)
|
||||
|
||||
71
tests/models/language/pooling/test_baai.py
Normal file
71
tests/models/language/pooling/test_baai.py
Normal file
@ -0,0 +1,71 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import pytest
|
||||
|
||||
from .embed_utils import EmbedModelInfo, correctness_test_embed_models
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
|
||||
MODELS = [
|
||||
########## BertModel
|
||||
EmbedModelInfo("BAAI/bge-base-en",
|
||||
architecture="BertModel",
|
||||
enable_test=True),
|
||||
EmbedModelInfo("BAAI/bge-base-zh",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
EmbedModelInfo("BAAI/bge-small-en",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
EmbedModelInfo("BAAI/bge-small-zh",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
EmbedModelInfo("BAAI/bge-large-en",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
EmbedModelInfo("BAAI/bge-large-zh",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
EmbedModelInfo("BAAI/bge-large-zh-noinstruct",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
EmbedModelInfo("BAAI/bge-base-en-v1.5",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
EmbedModelInfo("BAAI/bge-base-zh-v1.5",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
EmbedModelInfo("BAAI/bge-small-en-v1.5",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
EmbedModelInfo("BAAI/bge-small-zh-v1.5",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
EmbedModelInfo("BAAI/bge-large-en-v1.5",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
EmbedModelInfo("BAAI/bge-large-zh-v1.5",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
########## XLMRobertaModel
|
||||
EmbedModelInfo("BAAI/bge-m3",
|
||||
architecture="XLMRobertaModel",
|
||||
enable_test=True),
|
||||
########## Qwen2Model
|
||||
EmbedModelInfo("BAAI/bge-code-v1",
|
||||
architecture="Qwen2Model",
|
||||
dtype="float32",
|
||||
enable_test=True),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_correctness(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
|
||||
example_prompts)
|
||||
@ -43,6 +43,6 @@ def test_models(
|
||||
|
||||
# the tolerance value of 1e-2 is selected based on the
|
||||
# half datatype tests in
|
||||
# tests/models/embedding/language/test_embedding.py
|
||||
# tests/models/language/pooling/test_embedding.py
|
||||
assert torch.allclose(hf_output, vllm_output,
|
||||
1e-3 if dtype == "float" else 1e-2)
|
||||
|
||||
@ -10,29 +10,31 @@ from ...utils import check_embeddings_close
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
# [Encoder-only]
|
||||
pytest.param("BAAI/bge-base-en-v1.5",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
|
||||
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
|
||||
pytest.param("intfloat/multilingual-e5-small"),
|
||||
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
|
||||
# Be careful of the order of models, decoder-only models should be
|
||||
# placed before encoder-only models, otherwise `Qwen2.5-0.5B-Instruct`
|
||||
# case won't pass because gte-Qwen2-1.5B-instruct will cache custom
|
||||
# model code with bidirectional attention.
|
||||
# [Decoder-only]
|
||||
pytest.param("BAAI/bge-multilingual-gemma2",
|
||||
marks=[pytest.mark.core_model]),
|
||||
pytest.param("intfloat/e5-mistral-7b-instruct",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
|
||||
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"),
|
||||
# [Encoder-only]
|
||||
pytest.param("BAAI/bge-base-en-v1.5",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
|
||||
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
|
||||
pytest.param("intfloat/multilingual-e5-small"),
|
||||
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
|
||||
# [Cross-Encoder]
|
||||
pytest.param("sentence-transformers/stsb-roberta-base-v2"),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model,
|
||||
dtype: str,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
|
||||
@ -44,7 +46,7 @@ def test_models(
|
||||
vllm_extra_kwargs = {}
|
||||
if model == "ssmits/Qwen2-7B-Instruct-embed-base":
|
||||
vllm_extra_kwargs["override_pooler_config"] = \
|
||||
PoolerConfig(pooling_type="MEAN")
|
||||
PoolerConfig(pooling_type="MEAN", normalize=False)
|
||||
|
||||
# The example_prompts has ending "\n", for example:
|
||||
# "Write a short story about a robot that dreams for the first time.\n"
|
||||
@ -54,13 +56,11 @@ def test_models(
|
||||
# So we need to strip the input texts to avoid test failing.
|
||||
example_prompts = [str(s).strip() for s in example_prompts]
|
||||
|
||||
with hf_runner(model, dtype=dtype,
|
||||
is_sentence_transformer=True) as hf_model:
|
||||
with hf_runner(model, is_sentence_transformer=True) as hf_model:
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
|
||||
with vllm_runner(model,
|
||||
task="embed",
|
||||
dtype=dtype,
|
||||
max_model_len=None,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
vllm_outputs = vllm_model.encode(example_prompts)
|
||||
|
||||
@ -3,7 +3,8 @@ from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from ...utils import EmbedModelInfo, run_embedding_correctness_test
|
||||
from .embed_utils import EmbedModelInfo, correctness_test_embed_models
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
|
||||
MODELS = [
|
||||
########## BertModel
|
||||
@ -44,6 +45,7 @@ MODELS = [
|
||||
########### Qwen2ForCausalLM
|
||||
EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
||||
architecture="Qwen2ForCausalLM",
|
||||
dtype="float32",
|
||||
enable_test=True),
|
||||
########## ModernBertModel
|
||||
EmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
|
||||
@ -53,9 +55,8 @@ MODELS = [
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
|
||||
vllm_extra_kwargs: dict[str, Any] = {}
|
||||
if model_info.architecture == "GteNewModel":
|
||||
@ -66,28 +67,13 @@ def test_models_mteb(hf_runner, vllm_runner,
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo,
|
||||
example_prompts) -> None:
|
||||
if not model_info.enable_test:
|
||||
pytest.skip("Skipping test.")
|
||||
|
||||
# ST will strip the input texts, see test_embedding.py
|
||||
example_prompts = [str(s).strip() for s in example_prompts]
|
||||
def test_embed_models_correctness(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts) -> None:
|
||||
|
||||
vllm_extra_kwargs: dict[str, Any] = {}
|
||||
if model_info.architecture == "GteNewModel":
|
||||
vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
task="embed",
|
||||
dtype=model_info.dtype,
|
||||
max_model_len=None,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
vllm_outputs = vllm_model.encode(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model_info.name,
|
||||
dtype=model_info.dtype,
|
||||
is_sentence_transformer=True,
|
||||
) as hf_model:
|
||||
run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
|
||||
example_prompts, vllm_extra_kwargs)
|
||||
|
||||
@ -1,9 +1,13 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import PoolingParams
|
||||
|
||||
from ...utils import check_embeddings_close, matryoshka_fy
|
||||
from .embed_utils import (EmbedModelInfo, check_embeddings_close,
|
||||
correctness_test_embed_models, matryoshka_fy)
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
|
||||
SCORING_MODELS = [
|
||||
"jinaai/jina-reranker-v2-base-multilingual", # Roberta
|
||||
@ -25,16 +29,10 @@ TEXTS_2 = [
|
||||
]
|
||||
|
||||
EMBEDDING_MODELS = [
|
||||
"jinaai/jina-embeddings-v3",
|
||||
]
|
||||
|
||||
EMBEDDING_PROMPTS = [
|
||||
"Follow the white rabbit.", # English
|
||||
"Sigue al conejo blanco.", # Spanish
|
||||
"Suis le lapin blanc.", # French
|
||||
"跟着白兔走。", # Chinese
|
||||
"اتبع الأرنب الأبيض.", # Arabic
|
||||
"Folge dem weißen Kaninchen.", # German
|
||||
EmbedModelInfo("jinaai/jina-embeddings-v3",
|
||||
architecture="XLMRobertaModel",
|
||||
is_matryoshka=True,
|
||||
dtype="float32")
|
||||
]
|
||||
|
||||
|
||||
@ -80,73 +78,66 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):
|
||||
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
|
||||
def emb_model_name(request):
|
||||
yield request.param
|
||||
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
|
||||
def hf_model_callback(model):
|
||||
model.encode = partial(model.encode, task="text-matching")
|
||||
|
||||
mteb_test_embed_models(hf_runner,
|
||||
vllm_runner,
|
||||
model_info,
|
||||
hf_model_callback=hf_model_callback)
|
||||
|
||||
|
||||
def test_is_matryoshka(vllm_runner, emb_model_name):
|
||||
with vllm_runner(emb_model_name, task="embed",
|
||||
max_model_len=None) as vllm_model:
|
||||
assert vllm_model.model.llm_engine.model_config.is_matryoshka
|
||||
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
|
||||
def test_embed_models_correctness(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts) -> None:
|
||||
|
||||
def hf_model_callback(model):
|
||||
model.encode = partial(model.encode, task="text-matching")
|
||||
|
||||
correctness_test_embed_models(hf_runner,
|
||||
vllm_runner,
|
||||
model_info,
|
||||
example_prompts,
|
||||
hf_model_callback=hf_model_callback)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", EMBEDDING_MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_embeddings(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model,
|
||||
dtype: str,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
|
||||
example_prompts = EMBEDDING_PROMPTS
|
||||
|
||||
with hf_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
is_sentence_transformer=True,
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.encode(example_prompts, task="text-matching")
|
||||
|
||||
with vllm_runner(model, task="embed", dtype=dtype,
|
||||
max_model_len=None) as vllm_model:
|
||||
vllm_outputs = vllm_model.encode(example_prompts)
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=1e-2,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", EMBEDDING_MODELS)
|
||||
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("dimensions", [16, 32])
|
||||
def test_matryoshka(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model,
|
||||
model_info,
|
||||
dtype: str,
|
||||
dimensions: int,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
if not model_info.is_matryoshka:
|
||||
pytest.skip("Model is not matryoshka")
|
||||
|
||||
example_prompts = EMBEDDING_PROMPTS
|
||||
# ST will strip the input texts, see test_embedding.py
|
||||
example_prompts = [str(s).strip() for s in example_prompts]
|
||||
|
||||
with hf_runner(
|
||||
model,
|
||||
model_info.name,
|
||||
dtype=dtype,
|
||||
is_sentence_transformer=True,
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.encode(example_prompts, task="text-matching")
|
||||
hf_outputs = matryoshka_fy(hf_outputs, dimensions)
|
||||
|
||||
with vllm_runner(model, task="embed", dtype=dtype,
|
||||
with vllm_runner(model_info.name,
|
||||
task="embed",
|
||||
dtype=dtype,
|
||||
max_model_len=None) as vllm_model:
|
||||
assert vllm_model.model.llm_engine.model_config.is_matryoshka
|
||||
|
||||
matryoshka_dimensions = (
|
||||
vllm_model.model.llm_engine.model_config.matryoshka_dimensions)
|
||||
assert matryoshka_dimensions is not None
|
||||
|
||||
@ -2,7 +2,8 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from ...utils import EmbedModelInfo, run_embedding_correctness_test
|
||||
from .embed_utils import EmbedModelInfo, correctness_test_embed_models
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
|
||||
MODELS = [
|
||||
EmbedModelInfo("nomic-ai/nomic-embed-text-v1",
|
||||
@ -13,6 +14,9 @@ MODELS = [
|
||||
architecture="NomicBertModel",
|
||||
dtype="float32",
|
||||
enable_test=False),
|
||||
EmbedModelInfo("nomic-ai/CodeRankEmbed",
|
||||
architecture="NomicBertModel",
|
||||
enable_test=False),
|
||||
EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe",
|
||||
architecture="NomicBertModel",
|
||||
dtype="float32",
|
||||
@ -21,30 +25,14 @@ MODELS = [
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo,
|
||||
example_prompts) -> None:
|
||||
if not model_info.enable_test:
|
||||
pytest.skip("Skipping test.")
|
||||
|
||||
# ST will strip the input texts, see test_embedding.py
|
||||
example_prompts = [str(s).strip() for s in example_prompts]
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
task="embed",
|
||||
dtype=model_info.dtype,
|
||||
max_model_len=None) as vllm_model:
|
||||
vllm_outputs = vllm_model.encode(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model_info.name,
|
||||
dtype=model_info.dtype,
|
||||
is_sentence_transformer=True,
|
||||
) as hf_model:
|
||||
run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
|
||||
def test_embed_models_correctness(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
|
||||
example_prompts)
|
||||
|
||||
130
tests/models/language/pooling/test_nomic_max_model_len.py
Normal file
130
tests/models/language/pooling/test_nomic_max_model_len.py
Normal file
@ -0,0 +1,130 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# ruff: noqa: SIM117
|
||||
import pytest
|
||||
|
||||
from ...utils import EmbedModelInfo
|
||||
|
||||
MODELS = [
|
||||
EmbedModelInfo("nomic-ai/nomic-embed-text-v1"),
|
||||
#EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
|
||||
#EmbedModelInfo("nomic-ai/CodeRankEmbed"),
|
||||
EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"),
|
||||
#EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"),
|
||||
]
|
||||
|
||||
rope_theta = 1000
|
||||
factor = 4.0
|
||||
original_max_position_embeddings = 2048
|
||||
max_model_len = int(original_max_position_embeddings * factor)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_default(model_info, vllm_runner):
|
||||
with vllm_runner(model_info.name, task="embed",
|
||||
max_model_len=None) as vllm_model:
|
||||
model_config = vllm_model.model.llm_engine.model_config
|
||||
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
|
||||
# For nomic-embed-text-v2-moe the length is set to 512
|
||||
# by sentence_bert_config.json.
|
||||
assert model_config.max_model_len == 512
|
||||
else:
|
||||
assert (
|
||||
model_config.max_model_len == original_max_position_embeddings)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_set_max_model_len_legal(model_info, vllm_runner):
|
||||
# set max_model_len <= 512
|
||||
with vllm_runner(model_info.name, task="embed",
|
||||
max_model_len=256) as vllm_model:
|
||||
model_config = vllm_model.model.llm_engine.model_config
|
||||
assert model_config.max_model_len == 256
|
||||
|
||||
# set 512 < max_model_len <= 2048
|
||||
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
|
||||
# For nomic-embed-text-v2-moe the length is set to 512
|
||||
# by sentence_bert_config.json.
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name, task="embed",
|
||||
max_model_len=1024):
|
||||
pass
|
||||
else:
|
||||
with vllm_runner(model_info.name, task="embed",
|
||||
max_model_len=1024) as vllm_model:
|
||||
model_config = vllm_model.model.llm_engine.model_config
|
||||
assert model_config.max_model_len == 1024
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_set_max_model_len_illegal(model_info, vllm_runner):
|
||||
# set max_model_len > 2048
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name, task="embed", max_model_len=4096):
|
||||
pass
|
||||
|
||||
# set max_model_len > 2048 by hf_overrides
|
||||
hf_overrides = {"max_model_len": 4096}
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name,
|
||||
task="embed",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides):
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_use_rope_scaling_legal(model_info, vllm_runner):
|
||||
hf_overrides = {
|
||||
"rope_theta": rope_theta,
|
||||
"rope_scaling": {
|
||||
"rope_type": "yarn",
|
||||
"factor": factor,
|
||||
"original_max_position_embeddings":
|
||||
original_max_position_embeddings
|
||||
},
|
||||
"max_model_len": max_model_len
|
||||
}
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
task="embed",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides):
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_use_rope_scaling_illegal(model_info, vllm_runner):
|
||||
hf_overrides = {
|
||||
"rope_theta": rope_theta,
|
||||
"rope_scaling": {
|
||||
"rope_type": "yarn",
|
||||
"factor": factor,
|
||||
"original_max_position_embeddings":
|
||||
original_max_position_embeddings
|
||||
}
|
||||
}
|
||||
# illegal max_model_len
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name,
|
||||
task="embed",
|
||||
max_model_len=max_model_len + 1,
|
||||
hf_overrides=hf_overrides):
|
||||
pass
|
||||
|
||||
hf_overrides = {
|
||||
"rope_theta": rope_theta,
|
||||
"rope_scaling": {
|
||||
"rope_type": "yarn",
|
||||
"factor": factor,
|
||||
"original_max_position_embeddings":
|
||||
original_max_position_embeddings
|
||||
},
|
||||
"max_model_len": max_model_len + 1
|
||||
}
|
||||
# illegal max_model_len by hf_overrides
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name,
|
||||
task="embed",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides):
|
||||
pass
|
||||
@ -2,7 +2,8 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from ...utils import EmbedModelInfo, run_embedding_correctness_test
|
||||
from .embed_utils import EmbedModelInfo, correctness_test_embed_models
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
|
||||
MODELS = [
|
||||
EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
|
||||
@ -41,37 +42,14 @@ MODELS = [
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_models_mteb(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
) -> None:
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_models_correctness(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts,
|
||||
) -> None:
|
||||
if not model_info.enable_test:
|
||||
pytest.skip("Skipping test.")
|
||||
|
||||
# ST will strip the input texts, see test_embedding.py
|
||||
example_prompts = [str(s).strip() for s in example_prompts]
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
task="embed",
|
||||
dtype=model_info.dtype,
|
||||
max_model_len=None) as vllm_model:
|
||||
vllm_outputs = vllm_model.encode(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model_info.name,
|
||||
dtype=model_info.dtype,
|
||||
is_sentence_transformer=True,
|
||||
) as hf_model:
|
||||
run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
|
||||
def test_embed_models_correctness(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
|
||||
example_prompts)
|
||||
|
||||
@ -100,6 +100,7 @@ def run_test(
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype="half",
|
||||
max_model_len=448,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
|
||||
@ -40,7 +40,7 @@ def _test_processing_correctness(
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
dtype="auto",
|
||||
revision=None,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
)
|
||||
|
||||
@ -283,7 +283,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
|
||||
"MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
|
||||
"ModernBertModel": _HfExamplesInfo("Alibaba-NLP/gte-modernbert-base",
|
||||
trust_remote_code=True),
|
||||
"NomicBertModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-long", # noqa: E501
|
||||
"NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe",
|
||||
trust_remote_code=True),
|
||||
"Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
|
||||
"Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"),
|
||||
@ -434,6 +434,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True,
|
||||
speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
|
||||
tokenizer="meta-llama/Llama-3.1-8B-Instruct"),
|
||||
"EagleMiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-1B-sft-bf16",
|
||||
trust_remote_code=True,
|
||||
is_available_online=False,
|
||||
speculative_model="openbmb/MiniCPM-2B-sft-bf16",
|
||||
tokenizer="openbmb/MiniCPM-2B-sft-bf16"),
|
||||
"MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
|
||||
trust_remote_code=True,
|
||||
speculative_model="XiaomiMiMo/MiMo-7B-RL")
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
import warnings
|
||||
from collections.abc import Sequence
|
||||
from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
|
||||
from typing import Any, NamedTuple, Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
@ -13,9 +13,6 @@ from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
|
||||
|
||||
from .registry import HF_EXAMPLE_MODELS
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..conftest import HfRunner
|
||||
|
||||
TokensText = tuple[list[int], str]
|
||||
|
||||
|
||||
@ -317,6 +314,7 @@ def check_embeddings_close(
|
||||
dim=0)
|
||||
|
||||
fail_msg = (f"Test{prompt_idx}:"
|
||||
f"\nCosine similarity: \t{sim:.4f}"
|
||||
f"\n{name_0}:\t{embeddings_0[:16]!r}"
|
||||
f"\n{name_1}:\t{embeddings_1[:16]!r}")
|
||||
|
||||
@ -337,22 +335,3 @@ class EmbedModelInfo(NamedTuple):
|
||||
architecture: str = ""
|
||||
dtype: str = "auto"
|
||||
enable_test: bool = True
|
||||
|
||||
|
||||
def run_embedding_correctness_test(
|
||||
hf_model: "HfRunner",
|
||||
inputs: list[str],
|
||||
vllm_outputs: Sequence[list[float]],
|
||||
dimensions: Optional[int] = None,
|
||||
):
|
||||
hf_outputs = hf_model.encode(inputs)
|
||||
if dimensions:
|
||||
hf_outputs = matryoshka_fy(hf_outputs, dimensions)
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=1e-2,
|
||||
)
|
||||
|
||||
11
tests/neuron/1_core/test_neuron_quant.py
Normal file
11
tests/neuron/1_core/test_neuron_quant.py
Normal file
@ -0,0 +1,11 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from vllm.model_executor.layers.quantization.neuron_quant import (
|
||||
NeuronQuantConfig)
|
||||
|
||||
|
||||
def test_get_supported_act_dtypes():
|
||||
neuron_quant_config = NeuronQuantConfig()
|
||||
supported_act_dtypes = neuron_quant_config.get_supported_act_dtypes()
|
||||
target_list = ["any_dtype1", "any_dtype2"]
|
||||
for dtype in target_list:
|
||||
assert dtype in supported_act_dtypes
|
||||
98
tests/neuron/2_core/test_multi_lora.py
Normal file
98
tests/neuron/2_core/test_multi_lora.py
Normal file
@ -0,0 +1,98 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
|
||||
def test_llama_single_lora():
|
||||
sql_lora_files = snapshot_download(
|
||||
repo_id="yard1/llama-2-7b-sql-lora-test")
|
||||
llm = LLM(model="meta-llama/Llama-2-7b-hf",
|
||||
tensor_parallel_size=2,
|
||||
max_num_seqs=4,
|
||||
max_model_len=512,
|
||||
use_v2_block_manager=True,
|
||||
override_neuron_config={
|
||||
"sequence_parallel_enabled": False,
|
||||
"skip_warmup": True,
|
||||
"lora_modules": [{
|
||||
"name": "lora_id_1",
|
||||
"path": sql_lora_files
|
||||
}]
|
||||
},
|
||||
enable_lora=True,
|
||||
max_loras=1,
|
||||
max_lora_rank=256,
|
||||
device="neuron")
|
||||
"""For multi-lora requests using NxDI as the backend, only the lora_name
|
||||
needs to be specified. The lora_id and lora_path are supplied at the LLM
|
||||
class/server initialization, after which the paths are handled by NxDI"""
|
||||
lora_req_1 = LoRARequest("lora_id_1", 0, " ")
|
||||
prompts = [
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
]
|
||||
outputs = llm.generate(prompts,
|
||||
SamplingParams(top_k=1),
|
||||
lora_request=[lora_req_1, lora_req_1])
|
||||
|
||||
expected_outputs = [
|
||||
" the head of state and head of government of the United States. "
|
||||
"The president direct",
|
||||
" a city of contrasts. The city is home to the Eiffel Tower"
|
||||
]
|
||||
|
||||
for expected_output, output in zip(expected_outputs, outputs):
|
||||
generated_text = output.outputs[0].text
|
||||
assert (expected_output == generated_text)
|
||||
|
||||
|
||||
def test_llama_multiple_lora():
|
||||
sql_lora_files = snapshot_download(
|
||||
repo_id="yard1/llama-2-7b-sql-lora-test")
|
||||
llm = LLM(model="meta-llama/Llama-2-7b-hf",
|
||||
tensor_parallel_size=2,
|
||||
max_num_seqs=4,
|
||||
max_model_len=512,
|
||||
use_v2_block_manager=True,
|
||||
override_neuron_config={
|
||||
"sequence_parallel_enabled":
|
||||
False,
|
||||
"skip_warmup":
|
||||
True,
|
||||
"lora_modules": [{
|
||||
"name": "lora_id_1",
|
||||
"path": sql_lora_files
|
||||
}, {
|
||||
"name": "lora_id_2",
|
||||
"path": sql_lora_files
|
||||
}]
|
||||
},
|
||||
enable_lora=True,
|
||||
max_loras=2,
|
||||
max_lora_rank=256,
|
||||
device="neuron")
|
||||
"""For multi-lora requests using NxDI as the backend, only the lora_name
|
||||
needs to be specified. The lora_id and lora_path are supplied at the LLM
|
||||
class/server initialization, after which the paths are handled by NxDI"""
|
||||
lora_req_1 = LoRARequest("lora_id_1", 0, " ")
|
||||
lora_req_2 = LoRARequest("lora_id_2", 1, " ")
|
||||
prompts = [
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
]
|
||||
outputs = llm.generate(prompts,
|
||||
SamplingParams(top_k=1),
|
||||
lora_request=[lora_req_1, lora_req_2])
|
||||
|
||||
expected_outputs = [
|
||||
" the head of state and head of government of the United States. "
|
||||
"The president direct",
|
||||
" a city of contrasts. The city is home to the Eiffel Tower"
|
||||
]
|
||||
|
||||
for expected_output, output in zip(expected_outputs, outputs):
|
||||
generated_text = output.outputs[0].text
|
||||
assert (expected_output == generated_text)
|
||||
@ -103,7 +103,7 @@ class TestTwoTokenBadWord:
|
||||
add_special_tokens=False)[0]
|
||||
|
||||
def test_two_token_bad_word(self, vllm_runner):
|
||||
with vllm_runner(self.MODEL) as llm:
|
||||
with vllm_runner(self.MODEL, dtype="half") as llm:
|
||||
output_token_ids = self._generate(llm)
|
||||
assert output_token_ids[:2] == [
|
||||
self.target_token_id1, self.target_token_id2
|
||||
|
||||
@ -4,7 +4,6 @@ import gc
|
||||
import os
|
||||
import pathlib
|
||||
import subprocess
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -16,7 +15,6 @@ from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
|
||||
TensorSerializer,
|
||||
is_vllm_tensorized,
|
||||
load_with_tensorizer,
|
||||
open_stream,
|
||||
tensorize_vllm_model)
|
||||
# yapf: enable
|
||||
@ -61,21 +59,6 @@ def write_keyfile(keyfile_path: str):
|
||||
f.write(encryption_params.key)
|
||||
|
||||
|
||||
@patch('vllm.model_executor.model_loader.tensorizer.TensorizerAgent')
|
||||
def test_load_with_tensorizer(mock_agent, tensorizer_config):
|
||||
mock_linear_method = MagicMock()
|
||||
mock_agent_instance = mock_agent.return_value
|
||||
mock_agent_instance.deserialize.return_value = MagicMock()
|
||||
|
||||
result = load_with_tensorizer(tensorizer_config,
|
||||
quant_method=mock_linear_method)
|
||||
|
||||
mock_agent.assert_called_once_with(tensorizer_config,
|
||||
quant_method=mock_linear_method)
|
||||
mock_agent_instance.deserialize.assert_called_once()
|
||||
assert result == mock_agent_instance.deserialize.return_value
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
|
||||
def test_can_deserialize_s3(vllm_runner):
|
||||
model_ref = "EleutherAI/pythia-1.4b"
|
||||
|
||||
@ -17,7 +17,8 @@ from vllm_test_utils.monitor import monitor
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache,
|
||||
MemorySnapshot, PlaceholderModule, StoreBoolean,
|
||||
bind_kv_cache, deprecate_kwargs, get_open_port,
|
||||
bind_kv_cache, common_broadcastable_dtype,
|
||||
deprecate_kwargs, get_open_port, is_lossless_cast,
|
||||
make_zmq_path, make_zmq_socket, memory_profiling,
|
||||
merge_async_iterators, sha256, split_zmq_path,
|
||||
supports_kw, swap_dict_values)
|
||||
@ -567,12 +568,65 @@ def test_lru_cache():
|
||||
assert 6 in cache
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("src_dtype", "tgt_dtype", "expected_result"),
|
||||
[
|
||||
# Different precision_levels
|
||||
(torch.bool, torch.int8, True),
|
||||
(torch.bool, torch.float16, True),
|
||||
(torch.bool, torch.complex32, True),
|
||||
(torch.int64, torch.bool, False),
|
||||
(torch.int64, torch.float16, True),
|
||||
(torch.int64, torch.complex32, True),
|
||||
(torch.float64, torch.bool, False),
|
||||
(torch.float64, torch.int8, False),
|
||||
(torch.float64, torch.complex32, True),
|
||||
(torch.complex128, torch.bool, False),
|
||||
(torch.complex128, torch.int8, False),
|
||||
(torch.complex128, torch.float16, False),
|
||||
# precision_level=0
|
||||
(torch.bool, torch.bool, True),
|
||||
# precision_level=1
|
||||
(torch.int8, torch.int16, True),
|
||||
(torch.int16, torch.int8, False),
|
||||
(torch.uint8, torch.int8, False),
|
||||
(torch.int8, torch.uint8, False),
|
||||
# precision_level=2
|
||||
(torch.float16, torch.float32, True),
|
||||
(torch.float32, torch.float16, False),
|
||||
(torch.bfloat16, torch.float32, True),
|
||||
(torch.float32, torch.bfloat16, False),
|
||||
# precision_level=3
|
||||
(torch.complex32, torch.complex64, True),
|
||||
(torch.complex64, torch.complex32, False),
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
|
||||
assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("dtypes", "expected_result"),
|
||||
[
|
||||
([torch.bool], torch.bool),
|
||||
([torch.bool, torch.int8], torch.int8),
|
||||
([torch.bool, torch.int8, torch.float16], torch.float16),
|
||||
([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_common_broadcastable_dtype(dtypes, expected_result):
|
||||
assert common_broadcastable_dtype(dtypes) == expected_result
|
||||
|
||||
|
||||
def test_placeholder_module_error_handling():
|
||||
placeholder = PlaceholderModule("placeholder_1234")
|
||||
|
||||
def build_ctx():
|
||||
return pytest.raises(ModuleNotFoundError,
|
||||
match="No module named")
|
||||
return pytest.raises(ModuleNotFoundError, match="No module named")
|
||||
|
||||
with build_ctx():
|
||||
int(placeholder)
|
||||
@ -608,6 +662,7 @@ def test_placeholder_module_error_handling():
|
||||
_ = placeholder_attr.module
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
"obj,key1,key2",
|
||||
[
|
||||
@ -618,6 +673,7 @@ def test_placeholder_module_error_handling():
|
||||
# Tests for both keys do not exist
|
||||
({1: "a", 2: "b"}, 3, 4),
|
||||
])
|
||||
# yapf: enable
|
||||
def test_swap_dict_values(obj, key1, key2):
|
||||
original_obj = obj.copy()
|
||||
swap_dict_values(obj, key1, key2)
|
||||
@ -631,19 +687,19 @@ def test_swap_dict_values(obj, key1, key2):
|
||||
assert key1 not in obj
|
||||
|
||||
|
||||
def test_model_specification(parser_with_config,
|
||||
cli_config_file,
|
||||
def test_model_specification(parser_with_config, cli_config_file,
|
||||
cli_config_file_with_model):
|
||||
# Test model in CLI takes precedence over config
|
||||
args = parser_with_config.parse_args([
|
||||
'serve', 'cli-model', '--config', cli_config_file_with_model
|
||||
])
|
||||
args = parser_with_config.parse_args(
|
||||
['serve', 'cli-model', '--config', cli_config_file_with_model])
|
||||
assert args.model_tag == 'cli-model'
|
||||
assert args.served_model_name == 'mymodel'
|
||||
|
||||
# Test model from config file works
|
||||
args = parser_with_config.parse_args([
|
||||
'serve', '--config', cli_config_file_with_model,
|
||||
'serve',
|
||||
'--config',
|
||||
cli_config_file_with_model,
|
||||
])
|
||||
assert args.model == 'config-model'
|
||||
assert args.served_model_name == 'mymodel'
|
||||
@ -654,17 +710,19 @@ def test_model_specification(parser_with_config,
|
||||
|
||||
# Test using --model option raises error
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match=(
|
||||
"With `vllm serve`, you should provide the model as a positional "
|
||||
"argument or in a config file instead of via the `--model` option."
|
||||
),
|
||||
ValueError,
|
||||
match=
|
||||
("With `vllm serve`, you should provide the model as a positional "
|
||||
"argument or in a config file instead of via the `--model` option."),
|
||||
):
|
||||
parser_with_config.parse_args(['serve', '--model', 'my-model'])
|
||||
|
||||
# Test other config values are preserved
|
||||
args = parser_with_config.parse_args([
|
||||
'serve', 'cli-model', '--config', cli_config_file_with_model,
|
||||
'serve',
|
||||
'cli-model',
|
||||
'--config',
|
||||
cli_config_file_with_model,
|
||||
])
|
||||
assert args.tensor_parallel_size == 2
|
||||
assert args.trust_remote_code is True
|
||||
@ -673,7 +731,7 @@ def test_model_specification(parser_with_config,
|
||||
|
||||
|
||||
@pytest.mark.parametrize("input", [(), ("abc", ), (None, ),
|
||||
(None, bool, [1, 2, 3])])
|
||||
(None, bool, [1, 2, 3])])
|
||||
@pytest.mark.parametrize("output", [0, 1, 2])
|
||||
def test_sha256(input: tuple, output: int):
|
||||
hash = sha256(input)
|
||||
@ -682,7 +740,8 @@ def test_sha256(input: tuple, output: int):
|
||||
assert hash != 0
|
||||
|
||||
bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
|
||||
assert hash == int.from_bytes(hashlib.sha256(bytes).digest(), byteorder="big")
|
||||
assert hash == int.from_bytes(hashlib.sha256(bytes).digest(),
|
||||
byteorder="big")
|
||||
|
||||
# hashing again, returns the same value
|
||||
assert hash == sha256(input)
|
||||
@ -698,8 +757,7 @@ def test_sha256(input: tuple, output: int):
|
||||
("tcp://127.0.0.1:5555", ("tcp", "127.0.0.1", "5555")),
|
||||
("tcp://[::1]:5555", ("tcp", "::1", "5555")), # IPv6 address
|
||||
("inproc://some_identifier", ("inproc", "some_identifier", "")),
|
||||
]
|
||||
)
|
||||
])
|
||||
def test_split_zmq_path(path, expected):
|
||||
assert split_zmq_path(path) == expected
|
||||
|
||||
@ -711,8 +769,7 @@ def test_split_zmq_path(path, expected):
|
||||
"tcp://127.0.0.1", # Missing port
|
||||
"tcp://[::1]", # Missing port for IPv6
|
||||
"tcp://:5555", # Missing host
|
||||
]
|
||||
)
|
||||
])
|
||||
def test_split_zmq_path_invalid(invalid_path):
|
||||
with pytest.raises(ValueError):
|
||||
split_zmq_path(invalid_path)
|
||||
@ -734,7 +791,8 @@ def test_make_zmq_socket_ipv6():
|
||||
zsock: zmq.Socket = make_zmq_socket(ctx, ipv6_path, socket_type)
|
||||
|
||||
# Verify that the IPV6 option is set
|
||||
assert zsock.getsockopt(zmq.IPV6) == 1, "IPV6 option should be enabled for IPv6 addresses"
|
||||
assert zsock.getsockopt(
|
||||
zmq.IPV6) == 1, "IPV6 option should be enabled for IPv6 addresses"
|
||||
|
||||
# Clean up
|
||||
zsock.close()
|
||||
|
||||
@ -1,73 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
# Required to register the custom ops
|
||||
import vllm.lora.ops.xla_ops.pallas # noqa # pylint: disable=unused-import
|
||||
|
||||
N_TOKENS = [16, 1024, 4096]
|
||||
HIDDEN_SIZES = [1024, 2048, 4096]
|
||||
|
||||
DTYPES = [torch.bfloat16]
|
||||
NUM_LORA = [1, 4, 16]
|
||||
RANKS = [32, 256, 512]
|
||||
|
||||
|
||||
def generate_test_data(T, D, L, N, seed, dtype=torch.float32):
|
||||
"""
|
||||
Inputs: (All integers)
|
||||
T: Total number of tokens
|
||||
D: Input dim
|
||||
L: LoRA Dim
|
||||
N: N LoRAs
|
||||
|
||||
Outputs:
|
||||
inputs: torch.Tensor - shape (T, D)
|
||||
loras: torch.Tensor - shape (N, 1, L, D)
|
||||
idxs: torch.Tensor - shape (T, ) - all values must be in [0, N)
|
||||
|
||||
ref_output: torch.Tensor - shape (T, L) - inputs @ loras[idxs].T
|
||||
"""
|
||||
torch.manual_seed(seed)
|
||||
|
||||
inputs = torch.randn((T, D), device="xla", dtype=dtype)
|
||||
loras = torch.randn((N, 1, L, D), device="xla", dtype=dtype)
|
||||
idxs = torch.randint(0, N, (T, ), dtype=torch.int32, device="xla")
|
||||
|
||||
ref_output = ref_bgmv(inputs, loras, idxs)
|
||||
return inputs, loras, idxs, ref_output
|
||||
|
||||
|
||||
def ref_bgmv(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.Tensor):
|
||||
selected_loras = loras[idxs]
|
||||
if len(selected_loras.shape) == 4:
|
||||
selected_loras = selected_loras.squeeze(axis=1)
|
||||
|
||||
batch_size, output_size, input_size = selected_loras.shape
|
||||
return (selected_loras @ inputs.reshape(
|
||||
(batch_size, input_size, 1))).reshape((batch_size, output_size))
|
||||
|
||||
|
||||
# Parameterize tests with various shapes and dtypes
|
||||
@pytest.mark.parametrize("T", N_TOKENS)
|
||||
@pytest.mark.parametrize("D", HIDDEN_SIZES)
|
||||
@pytest.mark.parametrize("L", RANKS)
|
||||
@pytest.mark.parametrize("N", NUM_LORA)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
|
||||
@pytest.mark.parametrize("seed", [0])
|
||||
def test_bgmv_correctness(T, D, L, N, dtype, op_type, seed):
|
||||
if op_type == "expand":
|
||||
D, L = L, D
|
||||
|
||||
inputs, loras, idxs, ref_output = generate_test_data(
|
||||
T, D, L, N, seed, dtype)
|
||||
|
||||
# Run bgmv
|
||||
output = torch.ops.xla.bgmv(inputs, loras, idxs)
|
||||
|
||||
# Make sure we have no NaNs
|
||||
assert not torch.any(torch.isnan(output))
|
||||
|
||||
# Compare with reference output
|
||||
assert torch.allclose(output, ref_output, rtol=1e-2, atol=1e-2)
|
||||
@ -26,7 +26,7 @@ TOP_KS = [2, 6]
|
||||
# The Pallas GMM kernel requires num_tokens * topk to be a multiple of 16
|
||||
@pytest.mark.parametrize("m", [8, 16, 64, 2048])
|
||||
@pytest.mark.parametrize("n", [128, 1024, 2048])
|
||||
@pytest.mark.parametrize("k", [128, 511, 1024])
|
||||
@pytest.mark.parametrize("k", [128, 512, 1024])
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("ep_size", EP_SIZE)
|
||||
|
||||
@ -173,7 +173,7 @@ def test_traces_with_detailed_steps(
|
||||
llm = LLM(
|
||||
model=model,
|
||||
otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
|
||||
collect_detailed_traces="all",
|
||||
collect_detailed_traces=["all"],
|
||||
)
|
||||
prompts = ["This is a short prompt"]
|
||||
outputs = llm.generate(prompts, sampling_params=sampling_params)
|
||||
|
||||
@ -28,7 +28,7 @@ from tests.models.utils import TextTextLogprobs
|
||||
from vllm.distributed import (ensure_model_parallel_initialized,
|
||||
init_distributed_environment)
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser
|
||||
from vllm.entrypoints.cli.serve import ServeSubcommand
|
||||
from vllm.model_executor.model_loader import get_model_loader
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
@ -99,7 +99,8 @@ class RemoteOpenAIServer:
|
||||
|
||||
parser = FlexibleArgumentParser(
|
||||
description="vLLM's remote OpenAI server.")
|
||||
parser = make_arg_parser(parser)
|
||||
subparsers = parser.add_subparsers(required=False, dest="subparser")
|
||||
parser = ServeSubcommand().subparser_init(subparsers)
|
||||
args = parser.parse_args(["--model", model, *vllm_serve_args])
|
||||
self.host = str(args.host or 'localhost')
|
||||
self.port = int(args.port)
|
||||
|
||||
@ -45,7 +45,6 @@ def make_request(request_id,
|
||||
multi_modal_placeholders=mm_positions,
|
||||
sampling_params=SamplingParams(max_tokens=17),
|
||||
eos_token_id=100,
|
||||
arrival_time=0,
|
||||
lora_request=None,
|
||||
cache_salt=cache_salt,
|
||||
)
|
||||
|
||||
@ -38,7 +38,6 @@ def make_request(request_id,
|
||||
sampling_params=SamplingParams(max_tokens=17,
|
||||
prompt_logprobs=prompt_logprobs),
|
||||
eos_token_id=100,
|
||||
arrival_time=0,
|
||||
lora_request=None,
|
||||
cache_salt=cache_salt,
|
||||
)
|
||||
|
||||
@ -138,7 +138,6 @@ def create_requests(num_requests: int,
|
||||
multi_modal_placeholders=mm_position,
|
||||
multi_modal_hashes=None,
|
||||
eos_token_id=EOS_TOKEN_ID,
|
||||
arrival_time=0,
|
||||
)
|
||||
requests.append(request)
|
||||
return requests
|
||||
@ -744,7 +743,8 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
|
||||
assert running_req.num_tokens_with_spec == 2 + len(spec_tokens[i])
|
||||
|
||||
# No draft or accepted tokens counted yet
|
||||
assert engine_core_outputs.scheduler_stats.spec_decoding_stats is None
|
||||
assert not engine_core_outputs or (
|
||||
engine_core_outputs[0].scheduler_stats.spec_decoding_stats is None)
|
||||
|
||||
# Schedule the speculated tokens for validation
|
||||
output = scheduler.schedule()
|
||||
@ -772,7 +772,8 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
|
||||
engine_core_outputs = scheduler.update_from_output(output,
|
||||
model_runner_output)
|
||||
|
||||
scheduler_stats = engine_core_outputs.scheduler_stats
|
||||
scheduler_stats = engine_core_outputs[0].scheduler_stats \
|
||||
if engine_core_outputs else None
|
||||
if expected[0] == 0:
|
||||
assert scheduler_stats.spec_decoding_stats is None
|
||||
else:
|
||||
@ -843,7 +844,7 @@ def _step_until_done(
|
||||
# We should be in the decode phase now.
|
||||
assert num_scheduled_tokens == 1
|
||||
assert len(output.kv_connector_metadata.requests) == 0
|
||||
ecos = scheduler.update_from_output(output, model_runner_output)
|
||||
ecos = scheduler.update_from_output(output, model_runner_output)[0]
|
||||
all_done = True
|
||||
for eco in ecos.outputs:
|
||||
if eco.finish_reason is None:
|
||||
|
||||
@ -88,7 +88,7 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch):
|
||||
assert len(engine_core.scheduler.running) == 4
|
||||
|
||||
# Loop through until they are all done.
|
||||
while len(engine_core.step().outputs) > 0:
|
||||
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
|
||||
pass
|
||||
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
@ -163,11 +163,11 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch):
|
||||
req0.request_id = req1.request_id = "test"
|
||||
engine_core.add_request(req0)
|
||||
|
||||
while len(engine_core.step().outputs) > 0:
|
||||
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
|
||||
pass
|
||||
|
||||
engine_core.add_request(req1)
|
||||
while len(engine_core.step().outputs) > 0:
|
||||
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
|
||||
pass
|
||||
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
@ -207,7 +207,7 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
|
||||
assert len(engine_core.scheduler.waiting) == 1
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
# Loop through until they are all done.
|
||||
while len(engine_core.step().outputs) > 0:
|
||||
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
|
||||
pass
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
@ -296,7 +296,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
||||
engine_core.add_request(req1)
|
||||
|
||||
# Schedule Batch 1: (10, req0)
|
||||
assert engine_core.step_with_batch_queue() is None
|
||||
assert engine_core.step_with_batch_queue()[0] is None
|
||||
assert engine_core.batch_queue.qsize() == 1
|
||||
scheduler_output = engine_core.batch_queue.queue[-1][1]
|
||||
assert scheduler_output.num_scheduled_tokens[0] == 10
|
||||
@ -305,7 +305,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
||||
req0.request_id].num_computed_tokens == 10
|
||||
|
||||
# Schedule Batch 2: (2, req0), (8, req1)
|
||||
assert engine_core.step_with_batch_queue() is None
|
||||
assert engine_core.step_with_batch_queue()[0] is None
|
||||
assert engine_core.batch_queue.qsize() == 2
|
||||
scheduler_output = engine_core.batch_queue.queue[-1][1]
|
||||
assert scheduler_output.num_scheduled_tokens[0] == 2
|
||||
@ -327,7 +327,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
||||
assert scheduler_output.num_scheduled_tokens[1] == 4
|
||||
|
||||
# Batch queue is full. Finish Batch 2. Get first token of req0.
|
||||
output = engine_core.step_with_batch_queue()
|
||||
output = engine_core.step_with_batch_queue()[0].get(0)
|
||||
assert output is not None
|
||||
assert len(output.outputs) == 1
|
||||
assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
|
||||
@ -339,7 +339,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
||||
assert scheduler_output.num_scheduled_tokens[0] == 1
|
||||
|
||||
# Batch queue is full. Finish Batch 3. Get first token of req1.
|
||||
output = engine_core.step_with_batch_queue()
|
||||
output = engine_core.step_with_batch_queue()[0].get(0)
|
||||
assert output is not None
|
||||
assert len(output.outputs) == 1
|
||||
assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
|
||||
@ -358,11 +358,11 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
||||
engine_core.scheduler.requests[1].num_tokens + 1,
|
||||
]
|
||||
while engine_core.scheduler.get_num_unfinished_requests() == 2:
|
||||
output = engine_core.step_with_batch_queue()
|
||||
output = engine_core.step_with_batch_queue()[0]
|
||||
if step % 2 == 0:
|
||||
# Even steps consumes an output.
|
||||
assert output is not None
|
||||
assert len(output.outputs) == 1
|
||||
assert len(output[0].outputs) == 1
|
||||
if req_id in engine_core.scheduler.requests:
|
||||
assert engine_core.scheduler.requests[
|
||||
req_id].num_tokens == expected_num_tokens[req_id]
|
||||
|
||||
171
tests/v1/entrypoints/openai/test_multi_api_servers.py
Normal file
171
tests/v1/entrypoints/openai/test_multi_api_servers.py
Normal file
@ -0,0 +1,171 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "ibm-research/PowerMoE-3b"
|
||||
|
||||
DP_SIZE = os.getenv("DP_SIZE", "1")
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def default_server_args():
|
||||
return [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--max-num-seqs",
|
||||
"128",
|
||||
"--enforce-eager",
|
||||
"--api-server-count",
|
||||
"4",
|
||||
"--data_parallel_size",
|
||||
DP_SIZE,
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server(default_server_args):
|
||||
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
[MODEL_NAME],
|
||||
)
|
||||
async def test_single_completion(client: openai.AsyncOpenAI,
|
||||
model_name: str) -> None:
|
||||
|
||||
async def make_request():
|
||||
completion = await client.completions.create(
|
||||
model=model_name,
|
||||
prompt="Hello, my name is",
|
||||
max_tokens=10,
|
||||
temperature=1.0)
|
||||
|
||||
assert completion.id is not None
|
||||
assert completion.choices is not None and len(completion.choices) == 1
|
||||
|
||||
choice = completion.choices[0]
|
||||
# The exact number of tokens can vary slightly with temperature=1.0,
|
||||
# so we check for a reasonable minimum length.
|
||||
assert len(choice.text) >= 1
|
||||
# Finish reason might not always be 'length' if the model finishes early
|
||||
# or due to other reasons, especially with high temperature.
|
||||
# So, we'll accept 'length' or 'stop'.
|
||||
assert choice.finish_reason in ("length", "stop")
|
||||
|
||||
# Token counts can also vary, so we check they are positive.
|
||||
assert completion.usage.completion_tokens > 0
|
||||
assert completion.usage.prompt_tokens > 0
|
||||
assert completion.usage.total_tokens > 0
|
||||
return completion
|
||||
|
||||
# Test single request
|
||||
result = await make_request()
|
||||
assert result is not None
|
||||
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# Send two bursts of requests
|
||||
num_requests = 100
|
||||
tasks = [make_request() for _ in range(num_requests)]
|
||||
results = await asyncio.gather(*tasks)
|
||||
assert len(results) == num_requests
|
||||
assert all(completion is not None for completion in results)
|
||||
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
tasks = [make_request() for _ in range(num_requests)]
|
||||
results = await asyncio.gather(*tasks)
|
||||
assert len(results) == num_requests
|
||||
assert all(completion is not None for completion in results)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
[MODEL_NAME],
|
||||
)
|
||||
async def test_completion_streaming(client: openai.AsyncOpenAI,
|
||||
model_name: str) -> None:
|
||||
prompt = "What is an LLM?"
|
||||
|
||||
async def make_streaming_request():
|
||||
# Perform a non-streaming request to get the expected full output
|
||||
single_completion = await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=prompt,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
)
|
||||
single_output = single_completion.choices[0].text
|
||||
|
||||
# Perform the streaming request
|
||||
stream = await client.completions.create(model=model_name,
|
||||
prompt=prompt,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
stream=True)
|
||||
chunks: list[str] = []
|
||||
finish_reason_count = 0
|
||||
last_chunk = None
|
||||
async for chunk in stream:
|
||||
chunks.append(chunk.choices[0].text)
|
||||
if chunk.choices[0].finish_reason is not None:
|
||||
finish_reason_count += 1
|
||||
last_chunk = chunk # Keep track of the last chunk
|
||||
|
||||
# finish reason should only return in the last block for OpenAI API
|
||||
assert finish_reason_count == 1, (
|
||||
"Finish reason should appear exactly once.")
|
||||
assert last_chunk is not None, (
|
||||
"Stream should have yielded at least one chunk.")
|
||||
assert last_chunk.choices[
|
||||
0].finish_reason == "length", "Finish reason should be 'length'."
|
||||
# Check that the combined text matches the non-streamed version.
|
||||
assert "".join(
|
||||
chunks
|
||||
) == single_output, "Streamed output should match non-streamed output."
|
||||
return True # Indicate success for this request
|
||||
|
||||
# Test single request
|
||||
result = await make_streaming_request()
|
||||
assert result is not None
|
||||
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# Send two bursts of requests
|
||||
num_requests = 100
|
||||
tasks = [make_streaming_request() for _ in range(num_requests)]
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
assert len(
|
||||
results
|
||||
) == num_requests, f"Expected {num_requests} results, got {len(results)}"
|
||||
assert all(results), "Not all streaming requests completed successfully."
|
||||
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
tasks = [make_streaming_request() for _ in range(num_requests)]
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
assert len(
|
||||
results
|
||||
) == num_requests, f"Expected {num_requests} results, got {len(results)}"
|
||||
assert all(results), "Not all streaming requests completed successfully."
|
||||
@ -43,7 +43,7 @@ def test_basic_lifecycle():
|
||||
# Ensure the request is finished after 1 tokens.
|
||||
assert request.is_finished()
|
||||
assert request.status == RequestStatus.FINISHED_LENGTH_CAPPED
|
||||
output = engine_core_outputs.outputs[0]
|
||||
output = engine_core_outputs[0].outputs[0]
|
||||
assert output.finish_reason == FinishReason.LENGTH
|
||||
assert output.kv_transfer_params is not None
|
||||
|
||||
@ -165,7 +165,7 @@ def test_prefix_cache_lifecycle():
|
||||
scheduler_output = scheduler.schedule()
|
||||
model_runner_output = create_model_runner_output(reqs=[request_remote])
|
||||
eco = scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
kv_transfer_params = eco.outputs[0].kv_transfer_params
|
||||
kv_transfer_params = eco[0].outputs[0].kv_transfer_params
|
||||
|
||||
# Ensure we send all block ids, even if there is a cache hit.
|
||||
assert (len(
|
||||
|
||||
@ -61,7 +61,7 @@ def test_basic_lifecycle():
|
||||
# (1c): update_from_output()
|
||||
engine_core_outputs = scheduler.update_from_output(scheduler_output,
|
||||
model_runner_output)
|
||||
assert len(engine_core_outputs.outputs) == 0
|
||||
assert not engine_core_outputs or not engine_core_outputs[0].outputs
|
||||
|
||||
# STEP (2):
|
||||
# (2a): schedule(): nothing happens!
|
||||
@ -112,7 +112,7 @@ def test_basic_lifecycle():
|
||||
model_runner_output)
|
||||
scheduler.schedule()
|
||||
|
||||
outputs = engine_core_outputs.outputs
|
||||
outputs = engine_core_outputs[0].outputs
|
||||
assert len(outputs) == 1
|
||||
output = outputs[0]
|
||||
assert output.finish_reason == FinishReason.STOP
|
||||
@ -335,7 +335,7 @@ def test_full_block_prompt():
|
||||
model_runner_output)
|
||||
scheduler.schedule()
|
||||
|
||||
outputs = engine_core_outputs.outputs
|
||||
outputs = engine_core_outputs[0].outputs
|
||||
assert len(outputs) == 1
|
||||
output = outputs[0]
|
||||
assert output.finish_reason == FinishReason.STOP
|
||||
|
||||
@ -153,7 +153,6 @@ def create_request(
|
||||
multi_modal_placeholders=None,
|
||||
multi_modal_hashes=None,
|
||||
eos_token_id=EOS_TOKEN_ID,
|
||||
arrival_time=0,
|
||||
)
|
||||
req.kv_transfer_params = kv_transfer_params
|
||||
return req
|
||||
|
||||
@ -81,7 +81,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
|
||||
mm_hashes=[],
|
||||
mm_positions=[],
|
||||
sampling_params=SamplingParams(),
|
||||
block_ids=[0],
|
||||
block_ids=[[0]], # block_ids should be list[list[int]]
|
||||
num_computed_tokens=0,
|
||||
lora_request=None,
|
||||
))
|
||||
@ -112,14 +112,35 @@ def _is_req_added(model_runner, req_id: str) -> bool:
|
||||
|
||||
|
||||
def _is_req_state_block_table_match(model_runner, req_id: str) -> bool:
|
||||
"""Check if the request state block IDs match the block table.
|
||||
|
||||
This function handles both legacy BlockTable and new MultiGroupBlockTable
|
||||
structures for backward compatibility.
|
||||
"""
|
||||
|
||||
req_index = model_runner.input_batch.req_id_to_index[req_id]
|
||||
block_table = model_runner.input_batch.block_table
|
||||
multi_group_block_table = model_runner.input_batch.block_table
|
||||
req_state = model_runner.requests[req_id]
|
||||
if block_table.num_blocks_per_row[req_index] != len(req_state.block_ids):
|
||||
|
||||
# Access the first block table from MultiGroupBlockTable
|
||||
# This is safe since we currently only use single KV cache groups
|
||||
block_table = multi_group_block_table[0]
|
||||
|
||||
# req_state.block_ids is now list[list[int]] for MultiGroupBlockTable
|
||||
# Extract the first group's block IDs
|
||||
if isinstance(req_state.block_ids[0], list):
|
||||
# New format: list[list[int]] - extract first group
|
||||
req_block_ids = req_state.block_ids[0]
|
||||
else:
|
||||
# Legacy format: list[int] - use directly
|
||||
req_block_ids = req_state.block_ids
|
||||
|
||||
if block_table.num_blocks_per_row[req_index] != len(req_block_ids):
|
||||
return False
|
||||
|
||||
num_blocks = block_table.num_blocks_per_row[req_index]
|
||||
return (block_table.block_table_np[req_index, :num_blocks] ==
|
||||
req_state.block_ids).all()
|
||||
block_table_values = block_table.block_table_np[req_index, :num_blocks]
|
||||
return (block_table_values == req_block_ids).all()
|
||||
|
||||
|
||||
def test_update_states_new_request(model_runner):
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user