diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index 0412c5f379..e29eb78a9f 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -2,8 +2,11 @@ import os import sys import zipfile -# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB -VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250)) +# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB +# Note that we have 400 MiB quota, please use it wisely. +# See https://github.com/pypi/support/issues/3792 . +# Please also sync the value with the one in Dockerfile. +VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300)) def print_top_10_largest_files(zip_file): diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index 686f70dbec..69b6b146b3 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -43,7 +43,7 @@ main() { - # The figures should be genereated by a separate process outside the CI/CD pipeline + # The figures should be generated by a separate process outside the CI/CD pipeline # # generate figures # python3 -m pip install tabulate pandas matplotlib diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index 3f38cf5137..32bd34c431 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -301,6 +301,104 @@ run_serving_tests() { kill_gpu_processes } +run_genai_perf_tests() { + # run genai-perf tests + + # $1: a json file specifying genai-perf test cases + local genai_perf_test_file + genai_perf_test_file=$1 + + # Iterate over genai-perf tests + jq -c '.[]' "$genai_perf_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. 
+ test_name=$(echo "$params" | jq -r '.test_name') + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # prepend the current serving engine to the test name + test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} + + # get common parameters + common_params=$(echo "$params" | jq -r '.common_parameters') + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + reuse_server=$(echo "$common_params" | jq -r '.reuse_server') + + # get client and server arguments + server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + if [[ $gpu_count -lt $tp ]]; then + echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + if [[ $reuse_server == "true" ]]; then + echo "Reuse previous server for test case $test_name" + else + kill_gpu_processes + bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ + "$server_params" "$common_params" + fi + + if wait_for_server; then + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." + else + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." 
+ break + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps=$num_prompts + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + backend=$CURRENT_LLM_SERVING_ENGINE + + if [[ "$backend" == *"vllm"* ]]; then + backend="vllm" + fi + #TODO: add output dir. + client_command="genai-perf profile \ + -m $model \ + --service-kind openai \ + --backend vllm \ + --endpoint-type chat \ + --streaming \ + --url localhost:$port \ + --request-rate $qps \ + --num-prompts $num_prompts \ + " + + echo "Client command: $client_command" + + eval "$client_command" + + #TODO: process/record outputs + done + done + + kill_gpu_processes + +} prepare_dataset() { @@ -328,12 +426,17 @@ main() { pip install -U transformers + pip install -r requirements-dev.txt + which genai-perf + # check storage df -h ensure_installed wget ensure_installed curl ensure_installed jq + # genai-perf dependency + ensure_installed libb64-0d prepare_dataset @@ -345,6 +448,10 @@ main() { # run the test run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" + # run genai-perf tests + run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json" + mv artifacts/ $RESULTS_FOLDER/ + # upload benchmark results to buildkite python3 -m pip install tabulate pandas python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json new file mode 100644 index 0000000000..edbe9f2df0 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json @@ -0,0 +1,23 @@ +[ + { + "test_name": "llama8B_tp1_genai_perf", + "qps_list": [4,8,16,32], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "tp": 1, + "port": 8000, + "num_prompts": 500, + "reuse_server": false + }, + "vllm_server_parameters": { + "disable_log_stats": "", + 
"disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "genai_perf_input_parameters": { + } + } +] \ No newline at end of file diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 4f1729d46d..e19ace782f 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -9,36 +9,33 @@ CORE_RANGE=${CORE_RANGE:-48-95} NUMA_NODE=${NUMA_NODE:-1} # Try building the docker image -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu . -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu . # Setup cleanup -remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; } +remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; } trap remove_docker_container EXIT remove_docker_container # Run the image, setting --shm-size=4g for tensor parallel. 
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test + --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2 + --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 function cpu_tests() { set -e export NUMA_NODE=$2 # offline inference - docker exec cpu-test-avx2-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " set -e - python3 examples/offline_inference.py" + python3 examples/offline_inference/basic.py" # Run basic model test - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e - pip install pytest pytest-asyncio \ - decord einops librosa peft Pillow sentence-transformers soundfile \ - transformers_stream_generator matplotlib datamodel_code_generator - pip install torchvision --index-url https://download.pytorch.org/whl/cpu + pip install -r vllm/requirements-test.txt pytest -v -s tests/models/decoder_only/language -m cpu_model pytest -v -s tests/models/embedding/language -m cpu_model pytest -v -s tests/models/encoder_decoder/language -m cpu_model @@ -46,26 +43,26 @@ function cpu_tests() { pytest -v -s 
tests/models/decoder_only/vision_language -m cpu_model" # Run compressed-tensor test - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" # Run AWQ test - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/quantization/test_ipex_quant.py" # Run chunked-prefill and prefix-cache test - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e pytest -s -v -k cpu_model \ tests/basic_correctness/test_chunked_prefill.py" - # online inference - docker exec cpu-test-"$NUMA_NODE" bash -c " + # online serving + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e export VLLM_CPU_KVCACHE_SPACE=10 export VLLM_CPU_OMP_THREADS_BIND=$1 @@ -78,8 +75,14 @@ function cpu_tests() { --num-prompts 20 \ --endpoint /v1/completions \ --tokenizer facebook/opt-125m" + + # Run multi-lora tests + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + set -e + pytest -s -v \ + tests/lora/test_qwen2vl.py" } -# All of CPU tests are expected to be finished less than 25 mins. +# All of CPU tests are expected to be finished less than 40 mins. 
export -f cpu_tests -timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index 4fc6d089cc..3e4e409466 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -24,5 +24,5 @@ remove_docker_container # Run the image and test offline inference docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' - python3 examples/offline_inference.py + python3 examples/offline_inference/basic.py ' diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh index fa4f74fca7..1edcb1d266 100644 --- a/.buildkite/run-hpu-test.sh +++ b/.buildkite/run-hpu-test.sh @@ -8,9 +8,17 @@ set -ex docker build -t hpu-test-env -f Dockerfile.hpu . # Setup cleanup +# certain versions of HPU software stack have a bug that can +# override the exit code of the script, so we need to use +# separate remove_docker_container and remove_docker_container_and_exit +# functions, while other platforms only need one remove_docker_container +# function. +EXITCODE=1 remove_docker_container() { docker rm -f hpu-test || true; } -trap remove_docker_container EXIT +remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; } +trap remove_docker_container_and_exit EXIT remove_docker_container # Run the image and launch offline inference -docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py \ No newline at end of file +docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py +EXITCODE=$? 
diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 9259391aae..0590dad4f3 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -3,6 +3,18 @@ # This script build the Neuron docker image and run the API server inside the container. # It serves a sanity check for compilation and basic model usage. set -e +set -v + +image_name="neuron/vllm-ci" +container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" + +HF_CACHE="$(realpath ~)/huggingface" +mkdir -p "${HF_CACHE}" +HF_MOUNT="/root/.cache/huggingface" + +NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache" +mkdir -p "${NEURON_COMPILE_CACHE_URL}" +NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache" # Try building the docker image aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com @@ -13,41 +25,33 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then last_build=$(cat /tmp/neuron-docker-build-timestamp) current_time=$(date +%s) if [ $((current_time - last_build)) -gt 86400 ]; then - docker system prune -f + # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune -f + # Remove huggingface model artifacts and compiler cache + rm -rf "${HF_MOUNT:?}/*" + rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*" echo "$current_time" > /tmp/neuron-docker-build-timestamp fi else date "+%s" > /tmp/neuron-docker-build-timestamp fi -docker build -t neuron -f Dockerfile.neuron . +docker build -t "${image_name}" -f Dockerfile.neuron . 
# Setup cleanup -remove_docker_container() { docker rm -f neuron || true; } +remove_docker_container() { + docker image rm -f "${image_name}" || true; +} trap remove_docker_container EXIT -remove_docker_container # Run the image -docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \ - --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 & - -# Wait for the server to start -wait_for_server_to_start() { - timeout=300 - counter=0 - - while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do - sleep 1 - counter=$((counter + 1)) - if [ $counter -ge $timeout ]; then - echo "Timeout after $timeout seconds" - break - fi - done -} -wait_for_server_to_start - -# Test a simple prompt -curl -X POST -H "Content-Type: application/json" \ - localhost:8000/generate \ - -d '{"prompt": "San Francisco is a"}' +docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ + -v "${HF_CACHE}:${HF_MOUNT}" \ + -e "HF_HOME=${HF_MOUNT}" \ + -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \ + -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ + --name "${container_name}" \ + ${image_name} \ + /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py" diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh index 6b12f424fd..6159b21ff8 100755 --- a/.buildkite/run-openvino-test.sh +++ b/.buildkite/run-openvino-test.sh @@ -13,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py +docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 
/workspace/examples/offline_inference/basic.py diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh index 770dad6ffa..650af0fac4 100644 --- a/.buildkite/run-tpu-test.sh +++ b/.buildkite/run-tpu-test.sh @@ -14,4 +14,13 @@ remove_docker_container # For HF_TOKEN. source /etc/environment # Run a simple end-to-end example. -docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" +docker run --privileged --net host --shm-size=16G -it \ + -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ + vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ + && python3 -m pip install pytest \ + && python3 -m pip install lm_eval[api]==0.4.4 \ + && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \ + && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ + && python3 /workspace/vllm/tests/tpu/test_compilation.py \ + && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ + && python3 /workspace/vllm/examples/offline_inference/tpu.py" diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index e0a12afbe7..4d344e58db 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -14,6 +14,6 @@ remove_docker_container # Run the image and test offline inference/tensor parallel docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' - python3 examples/offline_inference.py - python3 examples/offline_inference_cli.py -tp 2 + python3 
examples/offline_inference/basic.py + python3 examples/offline_inference/cli.py -tp 2 ' diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 529daf54fa..daec467601 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -38,7 +38,7 @@ steps: - pip install -r requirements-docs.txt - SPHINXOPTS=\"-W\" make html # Check API reference (if it fails, you may have missing mock imports) - - grep \"sig sig-object py\" build/html/dev/sampling_params.html + - grep \"sig sig-object py\" build/html/api/inference_params.html - label: Async Engine, Inputs, Utils, Worker Test # 24min fast_check: true @@ -76,7 +76,9 @@ steps: - tests/basic_correctness/test_basic_correctness - tests/basic_correctness/test_cpu_offload - tests/basic_correctness/test_preemption + - tests/basic_correctness/test_cumem.py commands: + - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py @@ -106,7 +108,7 @@ steps: source_file_dependencies: - vllm/ commands: - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process @@ -125,11 +127,15 @@ steps: - tests/distributed - 
tests/spec_decode/e2e/test_integration_dist_tp4 - tests/compile + - examples/offline_inference/rlhf.py commands: - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py + # TODO: create a dedicated test section for multi-GPU example tests + # when we have multiple distributed example tests + - python3 ../examples/offline_inference/rlhf.py - label: Metrics, Tracing Test # 10min num_gpus: 2 @@ -187,19 +193,19 @@ steps: - examples/ commands: - pip install tensorizer # for tensorizer test - - python3 offline_inference.py - - python3 cpu_offload.py - - python3 offline_inference_chat.py - - python3 offline_inference_with_prefix.py - - python3 llm_engine_example.py - - python3 offline_inference_vision_language.py - - python3 offline_inference_vision_language_multi_image.py - - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference_encoder_decoder.py - - python3 offline_inference_classification.py - - python3 offline_inference_embedding.py - - python3 offline_inference_scoring.py - - python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2 + - python3 offline_inference/basic.py + - python3 offline_inference/cpu_offload.py + - python3 offline_inference/chat.py + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/vision_language.py + - python3 offline_inference/vision_language_multi_image.py + - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors 
/tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/encoder_decoder.py + - python3 offline_inference/classification.py + - python3 offline_inference/embedding.py + - python3 offline_inference/scoring.py + - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min mirror_hardwares: [amd] @@ -214,6 +220,7 @@ steps: - vllm/model_executor/layers - vllm/sampling_metadata.py - tests/samplers + - tests/conftest.py commands: - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers @@ -229,20 +236,22 @@ steps: - pytest -v -s test_logits_processor.py - pytest -v -s model_executor/test_guided_processors.py -- label: Speculative decoding tests # 30min +- label: Speculative decoding tests # 40min source_file_dependencies: - vllm/spec_decode - tests/spec_decode + - vllm/model_executor/models/eagle.py commands: - pytest -v -s spec_decode/e2e/test_multistep_correctness.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py + - pytest -v -s spec_decode/e2e/test_eagle_correctness.py - label: LoRA Test %N # 15min each mirror_hardwares: [amd] source_file_dependencies: - vllm/lora - tests/lora - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py + command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py parallelism: 4 - label: "PyTorch Fullgraph Smoke Test" # 9min @@ -367,6 +376,7 @@ steps: - tests/models/encoder_decoder/vision_language commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal - pytest -v -s 
models/decoder_only/audio_language -m 'core_model or quant_model' - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' - pytest -v -s models/embedding/vision_language -m core_model @@ -457,7 +467,10 @@ steps: - vllm/worker/worker_base.py - vllm/worker/worker.py - vllm/worker/model_runner.py + - entrypoints/llm/test_collective_rpc.py commands: + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' @@ -466,7 +479,9 @@ steps: - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' - - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py + # this test fails consistently. + # TODO: investigate and fix + # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py @@ -504,7 +519,9 @@ steps: - vllm/engine - tests/multi_step commands: - - pytest -v -s multi_step/test_correctness_async_llm.py + # this test is quite flaky + # TODO: investigate and fix. + # - pytest -v -s multi_step/test_correctness_async_llm.py - pytest -v -s multi_step/test_correctness_llm.py - label: Pipeline Parallelism Test # 45min @@ -535,6 +552,7 @@ steps: # requires multi-GPU testing for validation. 
- pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_minicpmv_tp.py - label: Weight Loading Multiple GPU Test # 33min diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 3cb91fc0f8..bc324d8b98 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,32 +2,35 @@ # for more info about CODEOWNERS file # This lists cover the "core" components of vLLM that require careful review -/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth +/vllm/model_executor/guided_decoding @mgoin +/vllm/multimodal @DarkLight1337 @ywang96 CMakeLists.txt @tlrmchlsmth # vLLM V1 -/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac 
@alexm-neuralmagic +/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat # Test ownership -/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo +/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo /tests/test_inputs.py @DarkLight1337 @ywang96 -/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo +/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo /tests/models @DarkLight1337 @ywang96 /tests/multimodal @DarkLight1337 @ywang96 /tests/prefix_caching @comaniac @KuntaiDu /tests/spec_decode @njhill @LiuXiaoxuanPKU /tests/kernels @tlrmchlsmth @WoosukKwon -/tests/quantization @mgoin @robertgshaw2-neuralmagic +/tests/quantization @mgoin @robertgshaw2-redhat /.buildkite/lm-eval-harness @mgoin @simon-mo /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao -/tests/multi_step @alexm-neuralmagic @comaniac +/tests/multi_step @alexm-redhat @comaniac /tests/weight_loading @mgoin @youkaichao /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac diff --git a/.github/ISSUE_TEMPLATE/600-new-model.yml b/.github/ISSUE_TEMPLATE/600-new-model.yml index 794617a0cf..713e76c1a5 100644 --- a/.github/ISSUE_TEMPLATE/600-new-model.yml +++ b/.github/ISSUE_TEMPLATE/600-new-model.yml @@ -9,7 +9,7 @@ body: value: > #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). - #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model. + #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model. - type: textarea attributes: label: The model to consider. 
diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml deleted file mode 100644 index 0226cf0ca0..0000000000 --- a/.github/workflows/actionlint.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: Lint GitHub Actions workflows -on: - push: - branches: - - "main" - paths: - - '.github/workflows/*.ya?ml' - - '.github/workflows/actionlint.*' - - '.github/workflows/matchers/actionlint.json' - pull_request: - branches: - - "main" - paths: - - '.github/workflows/*.ya?ml' - - '.github/workflows/actionlint.*' - - '.github/workflows/matchers/actionlint.json' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - actionlint: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Run actionlint" - run: | - echo "::add-matcher::.github/workflows/matchers/actionlint.json" - tools/actionlint.sh -color diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml deleted file mode 100644 index 68149d2dc0..0000000000 --- a/.github/workflows/clang-format.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: clang-format - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - '**/*.h' - - '**/*.cpp' - - '**/*.cu' - - '**/*.cuh' - - '.github/workflows/clang-format.yml' - pull_request: - branches: - - main - paths: - - '**/*.h' - - '**/*.cpp' - - '**/*.cu' - - '**/*.cuh' - - '.github/workflows/clang-format.yml' - -jobs: - clang-format: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.11"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - 
python -m pip install --upgrade pip - pip install clang-format==18.1.5 - - name: Running clang-format - run: | - EXCLUDES=( - 'csrc/moe/topk_softmax_kernels.cu' - 'csrc/quantization/gguf/ggml-common.h' - 'csrc/quantization/gguf/dequantize.cuh' - 'csrc/quantization/gguf/vecdotq.cuh' - 'csrc/quantization/gguf/mmq.cuh' - 'csrc/quantization/gguf/mmvq.cuh' - ) - find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ - | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \ - | xargs clang-format --dry-run --Werror diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml deleted file mode 100644 index 68887adaae..0000000000 --- a/.github/workflows/codespell.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: codespell - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - "**/*.py" - - "**/*.md" - - "**/*.rst" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/codespell.yml - pull_request: - branches: - - main - paths: - - "**/*.py" - - "**/*.md" - - "**/*.rst" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/codespell.yml - -jobs: - codespell: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Spelling check with codespell - run: | - codespell --toml pyproject.toml diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index ab6f6e5d20..556b60d2fc 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -27,7 +27,7 @@ jobs: 
version: v3.10.1 - name: Run chart-testing (lint) - run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm + run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm - name: Setup minio run: | @@ -64,7 +64,8 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" + sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" & + helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set 
image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test run: | diff --git a/.github/workflows/matchers/ruff.json b/.github/workflows/matchers/ruff.json deleted file mode 100644 index f6d4479ee1..0000000000 --- a/.github/workflows/matchers/ruff.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "problemMatcher": [ - { - "owner": "ruff", - "pattern": [ - { - "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$", - "file": 1, - "line": 2, - "column": 3, - "code": 4, - "message": 5 - } - ] - } - ] - } diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml deleted file mode 100644 index 73eeacf1fa..0000000000 --- a/.github/workflows/mypy.yaml +++ /dev/null @@ -1,51 +0,0 @@ -name: mypy - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - '**/*.py' - - '.github/workflows/mypy.yaml' - - 'tools/mypy.sh' - - 'pyproject.toml' - pull_request: - branches: - - main - # This workflow is only relevant when one of the following files changes. - # However, we have github configured to expect and require this workflow - # to run and pass before github with auto-merge a pull request. Until github - # allows more flexible auto-merge policy, we can just run this on every PR. - # It doesn't take that long to run, anyway. 
- #paths: - # - '**/*.py' - # - '.github/workflows/mypy.yaml' - # - 'tools/mypy.sh' - # - 'pyproject.toml' - -jobs: - mypy: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install mypy==1.11.1 - pip install types-setuptools - pip install types-PyYAML - pip install types-requests - pip install types-setuptools - - name: Mypy - run: | - echo "::add-matcher::.github/workflows/matchers/mypy.json" - tools/mypy.sh 1 ${{ matrix.python-version }} diff --git a/.github/workflows/png-lint.yml b/.github/workflows/png-lint.yml deleted file mode 100644 index 4932af943a..0000000000 --- a/.github/workflows/png-lint.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Lint PNG exports from excalidraw -on: - push: - branches: - - "main" - paths: - - '*.excalidraw.png' - - '.github/workflows/png-lint.yml' - pull_request: - branches: - - "main" - paths: - - '*.excalidraw.png' - - '.github/workflows/png-lint.yml' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - actionlint: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Run png-lint.sh to check excalidraw exported images" - run: | - tools/png-lint.sh diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000000..06564969dc --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,19 @@ +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + 
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: "3.12" + - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" + - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 + with: + extra_args: --all-files --hook-stage manual diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml deleted file mode 100644 index 7266cc378c..0000000000 --- a/.github/workflows/ruff.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: ruff - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - "**/*.py" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/matchers/ruff.json - - .github/workflows/ruff.yml - pull_request: - branches: - - main - # This workflow is only relevant when one of the following files changes. - # However, we have github configured to expect and require this workflow - # to run and pass before github with auto-merge a pull request. Until github - # allows more flexible auto-merge policy, we can just run this on every PR. - # It doesn't take that long to run, anyway. 
- #paths: - # - "**/*.py" - # - pyproject.toml - # - requirements-lint.txt - # - .github/workflows/matchers/ruff.json - # - .github/workflows/ruff.yml - -jobs: - ruff: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Analysing the code with ruff - run: | - echo "::add-matcher::.github/workflows/matchers/ruff.json" - ruff check --output-format github . - - name: Run isort - run: | - isort . --check-only diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml deleted file mode 100644 index 4b1587e373..0000000000 --- a/.github/workflows/shellcheck.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Lint shell scripts -on: - push: - branches: - - "main" - paths: - - '**/*.sh' - - '.github/workflows/shellcheck.yml' - pull_request: - branches: - - "main" - paths: - - '**/*.sh' - - '.github/workflows/shellcheck.yml' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - shellcheck: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Check shell scripts" - run: | - tools/shellcheck.sh diff --git a/.github/workflows/sphinx-lint.yml b/.github/workflows/sphinx-lint.yml deleted file mode 100644 index e0bb24276a..0000000000 --- a/.github/workflows/sphinx-lint.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: Lint documentation - -on: - push: - branches: - - main - paths: - - "docs/**" - pull_request: - branches: - - main - paths: - - "docs/**" - -jobs: - sphinx-lint: - runs-on: 
ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Linting docs - run: tools/sphinx-lint.sh diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml deleted file mode 100644 index ff441f9443..0000000000 --- a/.github/workflows/yapf.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: yapf - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - "**/*.py" - - .github/workflows/yapf.yml - pull_request: - branches: - - main - paths: - - "**/*.py" - - .github/workflows/yapf.yml - -jobs: - yapf: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install yapf==0.32.0 - pip install toml==0.10.2 - - name: Running yapf - run: | - yapf --diff --recursive . 
diff --git a/.gitignore b/.gitignore index bb7e4d5b24..89dab8f13b 100644 --- a/.gitignore +++ b/.gitignore @@ -79,10 +79,7 @@ instance/ # Sphinx documentation docs/_build/ -docs/source/getting_started/examples/*.rst -!**/*.template.rst -docs/source/getting_started/examples/*.md -!**/*.template.md +docs/source/getting_started/examples/ # PyBuilder .pybuilder/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..432bf5ed18 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,92 @@ +default_stages: + - pre-commit # Run locally + - manual # Run in CI +repos: +- repo: https://github.com/google/yapf + rev: v0.32.0 + hooks: + - id: yapf + args: [--in-place, --verbose] + additional_dependencies: [toml] # TODO: Remove when yapf is upgraded +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.5 + hooks: + - id: ruff + args: [--output-format, github] +- repo: https://github.com/codespell-project/codespell + rev: v2.3.0 + hooks: + - id: codespell + exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*' +- repo: https://github.com/PyCQA/isort + rev: 5.13.2 + hooks: + - id: isort +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v18.1.5 + hooks: + - id: clang-format + exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))' + types_or: [c++, cuda] + args: [--style=file, --verbose] +- repo: https://github.com/jackdewinter/pymarkdown + rev: v0.9.27 + hooks: + - id: pymarkdown + files: docs/.* +- repo: https://github.com/rhysd/actionlint + rev: v1.7.6 + hooks: + - id: actionlint +- repo: local + hooks: + - id: mypy-local + name: Run mypy for local Python installation + entry: tools/mypy.sh 0 "local" + language: python + types: [python] + additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests] + stages: [pre-commit] # Don't run in CI + - id: mypy-3.9 # TODO: Use 
https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.9 + entry: tools/mypy.sh 1 "3.9" + language: python + types: [python] + additional_dependencies: *mypy_deps + stages: [manual] # Only run in CI + - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.10 + entry: tools/mypy.sh 1 "3.10" + language: python + types: [python] + additional_dependencies: *mypy_deps + stages: [manual] # Only run in CI + - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.11 + entry: tools/mypy.sh 1 "3.11" + language: python + types: [python] + additional_dependencies: *mypy_deps + stages: [manual] # Only run in CI + - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.12 + entry: tools/mypy.sh 1 "3.12" + language: python + types: [python] + additional_dependencies: *mypy_deps + stages: [manual] # Only run in CI + - id: shellcheck + name: Lint shell scripts + entry: tools/shellcheck.sh + language: script + types: [shell] + - id: png-lint + name: Lint PNG exports from excalidraw + entry: tools/png-lint.sh + language: script + types: [png] + - id: suggestion + name: Suggestion + entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."' + language: system + verbose: true diff --git a/CMakeLists.txt b/CMakeLists.txt index f4b9c3ec9c..5039ac2448 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,9 +24,6 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) # Suppress potential warnings about unused manually-specified variables set(ignoreMe "${VLLM_PYTHON_PATH}") -# Prevent installation of dependencies (cutlass) by default. -install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) - # # Supported python versions. 
These versions will be searched in order, the # first match will be selected. These should be kept in sync with setup.py. @@ -181,6 +178,31 @@ message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") # Define other extension targets # +# +# cumem_allocator extension +# + +set(VLLM_CUMEM_EXT_SRC + "csrc/cumem_allocator.cpp") + +set_gencode_flags_for_srcs( + SRCS "${VLLM_CUMEM_EXT_SRC}" + CUDA_ARCHS "${CUDA_ARCHS}") + +if(VLLM_GPU_LANG STREQUAL "CUDA") + message(STATUS "Enabling cumem allocator extension.") + # link against cuda driver library + list(APPEND CUMEM_LIBS cuda) + define_gpu_extension_target( + cumem_allocator + DESTINATION vllm + LANGUAGE CXX + SOURCES ${VLLM_CUMEM_EXT_SRC} + LIBRARIES ${CUMEM_LIBS} + USE_SABI 3.8 + WITH_SOABI) +endif() + # # _C extension # @@ -510,7 +532,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP") endif() # vllm-flash-attn currently only supported on CUDA -if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda") +if (NOT VLLM_GPU_LANG STREQUAL "CUDA") return() endif () @@ -533,7 +555,7 @@ endif() # They should be identical but if they aren't, this is a massive footgun. # # The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place. -# To only install vllm-flash-attn, use --component vllm_flash_attn_c. +# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3). # If no component is specified, vllm-flash-attn is still installed. # If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading. 
@@ -545,43 +567,41 @@ if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR}) endif() if(VLLM_FLASH_ATTN_SRC_DIR) - FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR}) + FetchContent_Declare( + vllm-flash-attn SOURCE_DIR + ${VLLM_FLASH_ATTN_SRC_DIR} + BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn + ) else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c + GIT_TAG 90eacc1af2a7c3de62ea249e929ed5faccf38954 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn ) endif() -# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization. -set(VLLM_PARENT_BUILD ON) - -# Ensure the vllm/vllm_flash_attn directory exists before installation -install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c) - -# Make sure vllm-flash-attn install rules are nested under vllm/ -install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c) -install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c) -install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c) # Fetch the vllm-flash-attn library FetchContent_MakeAvailable(vllm-flash-attn) message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}") -# Restore the install prefix -install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c) -install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c) - -# Copy over the vllm-flash-attn python files +# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in +# case only one is built, in the case both are built redundant work is done) install( - DIRECTORY 
${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ - DESTINATION vllm/vllm_flash_attn - COMPONENT vllm_flash_attn_c - FILES_MATCHING PATTERN "*.py" + DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ + DESTINATION vllm_flash_attn + COMPONENT _vllm_fa2_C + FILES_MATCHING PATTERN "*.py" +) + +install( + DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ + DESTINATION vllm_flash_attn + COMPONENT _vllm_fa3_C + FILES_MATCHING PATTERN "*.py" ) # Nothing after vllm-flash-attn, see comment about macros above diff --git a/Dockerfile b/Dockerfile index 088314eb38..cb9cf0da5b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,8 +2,8 @@ # to run the OpenAI compatible server. # Please update any changes made here to -# docs/source/dev/dockerfile/dockerfile.md and -# docs/source/assets/dev/dockerfile-stages-dependency.png +# docs/source/contributing/dockerfile/dockerfile.md and +# docs/source/assets/contributing/dockerfile-stages-dependency.png ARG CUDA_VERSION=12.4.1 #################### BASE BUILD IMAGE #################### @@ -52,7 +52,7 @@ WORKDIR /workspace # after this step RUN --mount=type=cache,target=/root/.cache/pip \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ + python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \ fi COPY requirements-common.txt requirements-common.txt @@ -126,8 +126,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py -# Default max size of the wheel is 250MB -ARG VLLM_MAX_SIZE_MB=250 +# sync the default value with .buildkite/check-wheel-size.py +ARG VLLM_MAX_SIZE_MB=300 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB ARG RUN_WHEEL_CHECK=true RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then 
\ @@ -250,7 +250,7 @@ ENV VLLM_USAGE_SOURCE production-docker-image # define sagemaker first, so it is not default from `docker build` FROM vllm-openai-base AS vllm-sagemaker -COPY examples/sagemaker-entrypoint.sh . +COPY examples/online_serving/sagemaker-entrypoint.sh . RUN chmod +x sagemaker-entrypoint.sh ENTRYPOINT ["./sagemaker-entrypoint.sh"] diff --git a/Dockerfile.cpu b/Dockerfile.cpu index f163edc27c..ebe226cf6d 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -26,10 +26,10 @@ RUN pip install intel_extension_for_pytorch==2.5.0 WORKDIR /workspace -COPY requirements-build.txt requirements-build.txt ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ pip install --upgrade pip && \ pip install -r requirements-build.txt @@ -37,9 +37,9 @@ FROM cpu-test-1 AS build WORKDIR /workspace/vllm -COPY requirements-common.txt requirements-common.txt -COPY requirements-cpu.txt requirements-cpu.txt RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ + --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ pip install -v -r requirements-cpu.txt COPY . . 
diff --git a/Dockerfile.hpu b/Dockerfile.hpu index 87e0c1a6a9..66cf68c32f 100644 --- a/Dockerfile.hpu +++ b/Dockerfile.hpu @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest COPY ./ /workspace/vllm diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 269139fe90..e9cb82889d 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -15,8 +15,8 @@ RUN apt-get update && \ ffmpeg libsm6 libxext6 libgl1 ### Mount Point ### -# When launching the container, mount the code directory to /app -ARG APP_MOUNT=/app +# When launching the container, mount the code directory to /workspace +ARG APP_MOUNT=/workspace VOLUME [ ${APP_MOUNT} ] WORKDIR ${APP_MOUNT}/vllm @@ -25,6 +25,7 @@ RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas RUN python3 -m pip install sentencepiece transformers==4.45.2 -U RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U +RUN python3 -m pip install pytest COPY . . 
ARG GIT_REPO_CHECK=0 @@ -42,4 +43,7 @@ RUN --mount=type=bind,source=.git,target=.git \ # install development dependencies (for testing) RUN python3 -m pip install -e tests/vllm_test_utils +# overwrite entrypoint to run bash script +RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py + CMD ["/bin/bash"] diff --git a/Dockerfile.openvino b/Dockerfile.openvino index 8bd188ffde..32bcbfa9cc 100644 --- a/Dockerfile.openvino +++ b/Dockerfile.openvino @@ -14,6 +14,7 @@ ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi +RUN python3 -m pip install -U pip # install build requirements RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt # build vLLM with OpenVINO backend diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index 9712485779..d3cd1c7b31 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -4,7 +4,7 @@ USER root ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/" -RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 +RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev # Some packages in requirements-cpu are installed here # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba @@ -18,9 +18,8 @@ ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi -# These packages will be in rocketce eventually RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ + RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url 
https://repo.fury.io/mgiessing \ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ torch==2.3.1 \ -r requirements-cpu.txt \ diff --git a/Dockerfile.rocm b/Dockerfile.rocm index e733994f8c..14c522afd7 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,174 +1,119 @@ -# Default ROCm 6.2 base image -ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0" +# default base image +ARG REMOTE_VLLM="0" +ARG USE_CYTHON="0" +ARG BUILD_RPD="1" +ARG COMMON_WORKDIR=/app +ARG BASE_IMAGE=rocm/vllm-dev:base -# Default ROCm ARCHes to build vLLM for. -ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100" +FROM ${BASE_IMAGE} AS base -# Whether to install CK-based flash-attention -# If 0, will not install flash-attention -ARG BUILD_FA="1" -ARG FA_GFX_ARCHS="gfx90a;gfx942" -ARG FA_BRANCH="3cea2fb" - -# Whether to build triton on rocm -ARG BUILD_TRITON="1" -ARG TRITON_BRANCH="e192dba" - -### Base image build stage -FROM $BASE_IMAGE AS base - -# Import arg(s) defined before this build stage -ARG PYTORCH_ROCM_ARCH +ARG ARG_PYTORCH_ROCM_ARCH +ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} # Install some basic utilities -RUN apt-get update && apt-get install python3 python3-pip -y -RUN apt-get update && apt-get install -y \ - curl \ - ca-certificates \ - sudo \ - git \ - bzip2 \ - libx11-6 \ - build-essential \ - wget \ - unzip \ - tmux \ - ccache \ - && rm -rf /var/lib/apt/lists/* - -# When launching the container, mount the code directory to /vllm-workspace -ARG APP_MOUNT=/vllm-workspace -WORKDIR ${APP_MOUNT} - -RUN python3 -m pip install --upgrade pip -# Remove sccache so it doesn't interfere with ccache -# TODO: implement sccache support across components +RUN apt-get update -q -y && apt-get install -q -y \ + sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev +# Remove sccache +RUN python3 -m pip install --upgrade pip && pip install setuptools_scm RUN apt-get purge -y sccache; python3 -m pip uninstall -y 
sccache; rm -f "$(which sccache)" - -# Install torch == 2.6.0 on ROCm -RUN --mount=type=cache,target=/root/.cache/pip \ - case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ - *"rocm-6.2"*) \ - python3 -m pip uninstall -y torch torchvision \ - && python3 -m pip install --pre \ - torch==2.6.0.dev20241113+rocm6.2 \ - 'setuptools-scm>=8' \ - torchvision==0.20.0.dev20241113+rocm6.2 \ - --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \ - *) ;; esac - -ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer -ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: -ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: - -ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} -ENV CCACHE_DIR=/root/.cache/ccache +ARG COMMON_WORKDIR +WORKDIR ${COMMON_WORKDIR} -### AMD-SMI build stage -FROM base AS build_amdsmi -# Build amdsmi wheel always -RUN cd /opt/rocm/share/amd_smi \ - && python3 -m pip wheel . 
--wheel-dir=/install +# ----------------------- +# vLLM fetch stages +FROM base AS fetch_vllm_0 +ONBUILD COPY ./ vllm/ +FROM base AS fetch_vllm_1 +ARG VLLM_REPO="https://github.com/vllm-project/vllm.git" +ARG VLLM_BRANCH="main" +ONBUILD RUN git clone ${VLLM_REPO} \ + && cd vllm \ + && git checkout ${VLLM_BRANCH} +FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm +# ----------------------- +# vLLM build stages +FROM fetch_vllm AS build_vllm +ARG USE_CYTHON +# Build vLLM +RUN cd vllm \ + && python3 -m pip install -r requirements-rocm.txt \ + && python3 setup.py clean --all \ + && if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \ + && python3 setup.py bdist_wheel --dist-dir=dist +FROM scratch AS export_vllm +ARG COMMON_WORKDIR +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl / +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt / +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite -### Flash-Attention wheel build stage -FROM base AS build_fa -ARG BUILD_FA -ARG FA_GFX_ARCHS -ARG FA_BRANCH -# Build ROCm flash-attention wheel if `BUILD_FA = 1` -RUN --mount=type=cache,target=${CCACHE_DIR} \ - if [ "$BUILD_FA" = "1" ]; then \ - mkdir -p libs \ - && cd libs \ - && git clone https://github.com/ROCm/flash-attention.git \ - && cd flash-attention \ - && git checkout "${FA_BRANCH}" \ - && git submodule update --init \ - && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \ - # Create an empty directory otherwise as later build stages expect one - else mkdir -p /install; \ - fi +# ----------------------- +# Test vLLM image +FROM base AS test +RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* -### Triton wheel build stage -FROM base AS build_triton -ARG BUILD_TRITON 
-ARG TRITON_BRANCH -# Build triton wheel if `BUILD_TRITON = 1` -RUN --mount=type=cache,target=${CCACHE_DIR} \ - if [ "$BUILD_TRITON" = "1" ]; then \ - mkdir -p libs \ - && cd libs \ - && python3 -m pip install ninja cmake wheel pybind11 \ - && git clone https://github.com/OpenAI/triton.git \ - && cd triton \ - && git checkout "${TRITON_BRANCH}" \ - && cd python \ - && python3 setup.py bdist_wheel --dist-dir=/install; \ - # Create an empty directory otherwise as later build stages expect one - else mkdir -p /install; \ - fi +# Install vLLM +RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ + cd /install \ + && pip install -U -r requirements-rocm.txt \ + && pip uninstall -y vllm \ + && pip install *.whl - -### Final vLLM build stage -FROM base AS final -# Import the vLLM development directory from the build context -COPY . . -ARG GIT_REPO_CHECK=0 -RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi - -RUN python3 -m pip install --upgrade pip - -# Package upgrades for useful functionality or to avoid dependency issues -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard - - -# Workaround for ray >= 2.10.0 -ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 -# Silences the HF Tokenizers warning -ENV TOKENIZERS_PARALLELISM=false - -RUN --mount=type=cache,target=${CCACHE_DIR} \ - --mount=type=bind,source=.git,target=.git \ - --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install -Ur requirements-rocm.txt \ - && python3 setup.py clean --all \ - && python3 setup.py develop - -# Copy amdsmi wheel into final image -RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \ - mkdir -p libs \ - && cp /install/*.whl libs \ - # Preemptively uninstall to avoid same-version no-installs - && python3 -m pip uninstall -y amdsmi; - -# Copy triton wheel(s) into final image if they were built -RUN 
--mount=type=bind,from=build_triton,src=/install,target=/install \ - mkdir -p libs \ - && if ls /install/*.whl; then \ - cp /install/*.whl libs \ - # Preemptively uninstall to avoid same-version no-installs - && python3 -m pip uninstall -y triton; fi - -# Copy flash-attn wheel(s) into final image if they were built -RUN --mount=type=bind,from=build_fa,src=/install,target=/install \ - mkdir -p libs \ - && if ls /install/*.whl; then \ - cp /install/*.whl libs \ - # Preemptively uninstall to avoid same-version no-installs - && python3 -m pip uninstall -y flash-attn; fi - -# Install wheels that were built to the final image -RUN --mount=type=cache,target=/root/.cache/pip \ - if ls libs/*.whl; then \ - python3 -m pip install libs/*.whl; fi +WORKDIR /vllm-workspace +ARG COMMON_WORKDIR +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace # install development dependencies (for testing) -RUN python3 -m pip install -e tests/vllm_test_utils +RUN cd /vllm-workspace \ + && rm -rf vllm \ + && python3 -m pip install -e tests/vllm_test_utils \ + && python3 -m pip install lm-eval[api]==0.4.4 \ + && python3 -m pip install pytest-shard + +# ----------------------- +# Final vLLM image +FROM base AS final + +RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* +# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. 
+# Manually remove it so that later steps of numpy upgrade can continue +RUN case "$(which python3)" in \ + *"/opt/conda/envs/py_3.9"*) \ + rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ + *) ;; esac + +RUN python3 -m pip install --upgrade huggingface-hub[cli] +ARG BUILD_RPD +RUN if [ ${BUILD_RPD} -eq "1" ]; then \ + git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git \ + && cd rocmProfileData/rpd_tracer \ + && pip install -r requirements.txt && cd ../ \ + && make && make install \ + && cd hipMarker && python3 setup.py install ; fi + +# Install vLLM +RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ + cd /install \ + && pip install -U -r requirements-rocm.txt \ + && pip uninstall -y vllm \ + && pip install *.whl + +ARG COMMON_WORKDIR + +# Copy over the benchmark scripts as well +COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks +COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples + +ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 +ENV TOKENIZERS_PARALLELISM=false + +# Performance environment variable. 
+ENV HIP_FORCE_DEV_KERNARG=1 CMD ["/bin/bash"] + diff --git a/Dockerfile.rocm_base b/Dockerfile.rocm_base new file mode 100644 index 0000000000..5bbe98b0c2 --- /dev/null +++ b/Dockerfile.rocm_base @@ -0,0 +1,158 @@ +ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete +ARG HIPBLASLT_BRANCH="4d40e36" +ARG HIPBLAS_COMMON_BRANCH="7c1566b" +ARG LEGACY_HIPBLASLT_OPTION= +ARG RCCL_BRANCH="648a58d" +ARG RCCL_REPO="https://github.com/ROCm/rccl" +ARG TRITON_BRANCH="e5be006" +ARG TRITON_REPO="https://github.com/triton-lang/triton.git" +ARG PYTORCH_BRANCH="8d4926e" +ARG PYTORCH_VISION_BRANCH="v0.19.1" +ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" +ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" +ARG FA_BRANCH="b7d29fb" +ARG FA_REPO="https://github.com/ROCm/flash-attention.git" + +FROM ${BASE_IMAGE} AS base + +ENV PATH=/opt/rocm/llvm/bin:$PATH +ENV ROCM_PATH=/opt/rocm +ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: +ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942 +ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} + +ARG PYTHON_VERSION=3.12 + +RUN mkdir -p /app +WORKDIR /app +ENV DEBIAN_FRONTEND=noninteractive + +# Install Python and other dependencies +RUN apt-get update -y \ + && apt-get install -y software-properties-common git curl sudo vim less \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + python${PYTHON_VERSION}-lib2to3 python-is-python3 \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version + +RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython + +FROM base AS build_hipblaslt +ARG 
HIPBLASLT_BRANCH +ARG HIPBLAS_COMMON_BRANCH +# Set to "--legacy_hipblas_direct" for ROCm<=6.2 +ARG LEGACY_HIPBLASLT_OPTION +RUN git clone https://github.com/ROCm/hipBLAS-common.git +RUN cd hipBLAS-common \ + && git checkout ${HIPBLAS_COMMON_BRANCH} \ + && mkdir build \ + && cd build \ + && cmake .. \ + && make package \ + && dpkg -i ./*.deb +RUN git clone https://github.com/ROCm/hipBLASLt +RUN cd hipBLASLt \ + && git checkout ${HIPBLASLT_BRANCH} \ + && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \ + && cd build/release \ + && make package +RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install + +FROM base AS build_rccl +ARG RCCL_BRANCH +ARG RCCL_REPO +RUN git clone ${RCCL_REPO} +RUN cd rccl \ + && git checkout ${RCCL_BRANCH} \ + && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH} +RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install + +FROM base AS build_triton +ARG TRITON_BRANCH +ARG TRITON_REPO +RUN git clone ${TRITON_REPO} +RUN cd triton \ + && git checkout ${TRITON_BRANCH} \ + && cd python \ + && python3 setup.py bdist_wheel --dist-dir=dist +RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install + +FROM base AS build_amdsmi +RUN cd /opt/rocm/share/amd_smi \ + && pip wheel . 
--wheel-dir=dist +RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install + +FROM base AS build_pytorch +ARG PYTORCH_BRANCH +ARG PYTORCH_VISION_BRANCH +ARG PYTORCH_REPO +ARG PYTORCH_VISION_REPO +ARG FA_BRANCH +ARG FA_REPO +RUN git clone ${PYTORCH_REPO} pytorch +RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \ + pip install -r requirements.txt && git submodule update --init --recursive \ + && python3 tools/amd_build/build_amd.py \ + && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \ + && pip install dist/*.whl +RUN git clone ${PYTORCH_VISION_REPO} vision +RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \ + && python3 setup.py bdist_wheel --dist-dir=dist \ + && pip install dist/*.whl +RUN git clone ${FA_REPO} +RUN cd flash-attention \ + && git checkout ${FA_BRANCH} \ + && git submodule update --init \ + && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist +RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ + && cp /app/vision/dist/*.whl /app/install \ + && cp /app/flash-attention/dist/*.whl /app/install + +FROM base AS final +RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ + dpkg -i /install/*deb \ + && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \ + && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status +RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \ + dpkg -i /install/*deb \ + && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \ + && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status +RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ + pip install /install/*.whl +RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ + pip install /install/*.whl +RUN 
--mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ + pip install /install/*.whl + +ARG BASE_IMAGE +ARG HIPBLASLT_BRANCH +ARG LEGACY_HIPBLASLT_OPTION +ARG RCCL_BRANCH +ARG RCCL_REPO +ARG TRITON_BRANCH +ARG TRITON_REPO +ARG PYTORCH_BRANCH +ARG PYTORCH_VISION_BRANCH +ARG PYTORCH_REPO +ARG PYTORCH_VISION_REPO +ARG FA_BRANCH +ARG FA_REPO +RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ + && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \ + && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \ + && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \ + && echo "RCCL_BRANCH: ${RCCL_BRANCH}" >> /app/versions.txt \ + && echo "RCCL_REPO: ${RCCL_REPO}" >> /app/versions.txt \ + && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \ + && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \ + && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \ + && echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \ + && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ + && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ + && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ + && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt diff --git a/Dockerfile.tpu b/Dockerfile.tpu index b617932a85..ee0d94d98e 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20241017" +ARG NIGHTLY_DATE="20250122" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE diff --git a/README.md b/README.md index f83c9d759b..4ed905bf7a 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Easy, fast, and cheap LLM serving for everyone --- *Latest News* 🔥 +- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! 
Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing). - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! @@ -34,10 +35,12 @@ Easy, fast, and cheap LLM serving for everyone ## About vLLM is a fast and easy-to-use library for LLM inference and serving. +Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry. + vLLM is fast with: - State-of-the-art serving throughput -- Efficient management of attention key and value memory with **PagedAttention** +- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Continuous batching of incoming requests - Fast model execution with CUDA/HIP graph - Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8. 
@@ -68,16 +71,16 @@ Find the full list of supported models [here](https://docs.vllm.ai/en/latest/mod ## Getting Started -Install vLLM with `pip` or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source): +Install vLLM with `pip` or [from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source): ```bash pip install vllm ``` -Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more. -- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html) -- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html) -- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html) +Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more. +- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) +- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html) +- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html) ## Contributing @@ -90,28 +93,33 @@ vLLM is a community project. Our compute resources for development and testing a - +Cash Donations: - a16z +- Dropbox +- Sequoia Capital +- Skywork AI +- ZhenFund + +Compute Resources: - AMD - Anyscale - AWS - Crusoe Cloud - Databricks - DeepInfra -- Dropbox - Google Cloud - Lambda Lab - Nebius +- Novita AI - NVIDIA - Replicate - Roblox - RunPod -- Sequoia Capital -- Skywork AI - Trainy - UC Berkeley - UC San Diego -- ZhenFund + +Slack Sponsor: Anyscale We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. 
diff --git a/SECURITY.md b/SECURITY.md index ad3f1f16ab..47196a1f12 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -4,7 +4,7 @@ If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. -Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). +Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). --- diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index b67849038c..0612e8778a 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -22,6 +22,7 @@ class RequestFuncInput: prompt_len: int output_len: int model: str + model_name: Optional[str] = None best_of: int = 1 logprobs: Optional[int] = None extra_body: Optional[dict] = None @@ -34,6 +35,7 @@ class RequestFuncOutput: generated_text: str = "" success: bool = False latency: float = 0.0 + output_tokens: int = 0 ttft: float = 0.0 # Time to first token itl: List[float] = field( default_factory=list) # List of inter-token latencies @@ -49,7 +51,8 @@ async def async_request_tgi( api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: params = { "best_of": request_func_input.best_of, "max_new_tokens": request_func_input.output_len, @@ -78,7 +81,7 @@ async def async_request_tgi( continue chunk_bytes = chunk_bytes.decode("utf-8") - #NOTE: Sometimes TGI returns a ping response without + # NOTE: Sometimes TGI returns a 
ping response without # any data, we should skip it. if chunk_bytes.startswith(":"): continue @@ -121,7 +124,8 @@ async def async_request_trt_llm( api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: assert request_func_input.best_of == 1 payload = { "accumulate_tokens": True, @@ -155,7 +159,7 @@ async def async_request_trt_llm( timestamp = time.perf_counter() # First token if ttft == 0.0: - ttft = time.perf_counter() - st + ttft = timestamp - st output.ttft = ttft # Decoding phase @@ -185,7 +189,8 @@ async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, ) -> RequestFuncOutput: - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: assert request_func_input.best_of == 1 payload = { @@ -233,17 +238,23 @@ async def async_request_openai_completions( ("completions", "profile") ), "OpenAI Completions API URL must end with 'completions' or 'profile'." 
- async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: payload = { - "model": request_func_input.model, + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, "prompt": request_func_input.prompt, "temperature": 0.0, "best_of": request_func_input.best_of, "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, - "ignore_eos": request_func_input.ignore_eos, + "stream_options": { + "include_usage": True, + }, } + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos if request_func_input.extra_body: payload.update(request_func_input.extra_body) headers = { @@ -254,7 +265,6 @@ async def async_request_openai_completions( output.prompt_len = request_func_input.prompt_len generated_text = "" - ttft = 0.0 st = time.perf_counter() most_recent_timestamp = st try: @@ -269,15 +279,16 @@ async def async_request_openai_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk == "[DONE]": - latency = time.perf_counter() - st - else: + if chunk != "[DONE]": data = json.loads(chunk) # NOTE: Some completion API might have a last # usage summary response without a token so we # want to check a token was generated - if data["choices"][0]["text"]: + if choices := data.get("choices"): + # Note that text could be empty here + # e.g. 
for special tokens + text = choices[0].get("text") timestamp = time.perf_counter() # First token if not first_chunk_received: @@ -291,7 +302,10 @@ async def async_request_openai_completions( most_recent_timestamp) most_recent_timestamp = timestamp - generated_text += data["choices"][0]["text"] + generated_text += text or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") if first_chunk_received: output.success = True else: @@ -300,7 +314,7 @@ async def async_request_openai_completions( "Never received a valid chunk to calculate TTFT." "This response will be marked as failed!") output.generated_text = generated_text - output.latency = latency + output.latency = most_recent_timestamp - st else: output.error = response.reason or "" output.success = False @@ -323,12 +337,14 @@ async def async_request_openai_chat_completions( "chat/completions" ), "OpenAI Chat Completions API URL must end with 'chat/completions'." - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) payload = { - "model": request_func_input.model, + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, "messages": [ { "role": "user", @@ -338,8 +354,12 @@ async def async_request_openai_chat_completions( "temperature": 0.0, "max_completion_tokens": request_func_input.output_len, "stream": True, - "ignore_eos": request_func_input.ignore_eos, + "stream_options": { + "include_usage": True, + }, } + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos if request_func_input.extra_body: payload.update(request_func_input.extra_body) headers = { @@ -365,17 +385,15 @@ async def 
async_request_openai_chat_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk == "[DONE]": - latency = time.perf_counter() - st - else: + if chunk != "[DONE]": timestamp = time.perf_counter() data = json.loads(chunk) - delta = data["choices"][0]["delta"] - if delta.get("content", None): + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") # First token if ttft == 0.0: - ttft = time.perf_counter() - st + ttft = timestamp - st output.ttft = ttft # Decoding phase @@ -383,13 +401,16 @@ async def async_request_openai_chat_completions( output.itl.append(timestamp - most_recent_timestamp) - generated_text += delta["content"] + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") most_recent_timestamp = timestamp output.generated_text = generated_text output.success = True - output.latency = latency + output.latency = most_recent_timestamp - st else: output.error = response.reason or "" output.success = False @@ -417,14 +438,35 @@ def get_model(pretrained_model_name_or_path: str) -> str: def get_tokenizer( - pretrained_model_name_or_path: str, trust_remote_code: bool + pretrained_model_name_or_path: str, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + **kwargs, ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: if pretrained_model_name_or_path is not None and not os.path.exists( pretrained_model_name_or_path): pretrained_model_name_or_path = get_model( pretrained_model_name_or_path) - return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, - trust_remote_code=trust_remote_code) + if tokenizer_mode == "slow": + if kwargs.get("use_fast", False): + raise ValueError( + "Cannot use the fast tokenizer in slow tokenizer mode.") + kwargs["use_fast"] = False + if tokenizer_mode == "mistral": + try: + from vllm.transformers_utils.tokenizer import MistralTokenizer + except ImportError as e: + raise 
ImportError("MistralTokenizer requires vllm package.\n" + "Please install it with `pip install vllm` " + "to use mistral tokenizer mode.") from e + return MistralTokenizer.from_pretrained( + str(pretrained_model_name_or_path)) + else: + return AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + **kwargs, + ) ASYNC_REQUEST_FUNCS = { diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 0a14aedd5f..77c4f6aa92 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -13,6 +13,7 @@ from tqdm import tqdm from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType +from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser @@ -40,6 +41,20 @@ def main(args: argparse.Namespace): "prompt_token_ids": batch } for batch in dummy_prompt_token_ids.tolist()] + def llm_generate(): + if not args.use_beam_search: + llm.generate(dummy_prompts, + sampling_params=sampling_params, + use_tqdm=False) + else: + llm.beam_search( + dummy_prompts, + BeamSearchParams( + beam_width=args.n, + max_tokens=args.output_len, + ignore_eos=True, + )) + def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: with torch.profiler.profile( @@ -49,15 +64,11 @@ def main(args: argparse.Namespace): ], on_trace_ready=torch.profiler.tensorboard_trace_handler( str(profile_dir))) as p: - llm.generate(dummy_prompts, - sampling_params=sampling_params, - use_tqdm=False) - print(p.key_averages()) + llm_generate() + print(p.key_averages().table(sort_by="self_cuda_time_total")) else: start_time = time.perf_counter() - llm.generate(dummy_prompts, - sampling_params=sampling_params, - use_tqdm=False) + llm_generate() end_time = time.perf_counter() latency = end_time - start_time return latency diff --git a/benchmarks/benchmark_long_document_qa_throughput.py 
b/benchmarks/benchmark_long_document_qa_throughput.py index 13477ef535..0b8fba3815 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -2,8 +2,7 @@ Offline benchmark to test the long document QA throughput. Example usage: - # This command run the vllm with 50GB CPU memory for offloading - # The workload samples 8 different prompts with a default input + # This workload samples 8 different prompts with a default input # length of 20000 tokens, then replicates each prompt 2 times # in random order. python benchmark_long_document_qa_throughput.py \ diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 5e9381f712..3ab421a89c 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -10,7 +10,8 @@ Fixed example usage: --model meta-llama/Llama-2-7b-chat-hf \ --enable-prefix-caching \ --num-prompts 1 \ - --repeat-count 100 + --repeat-count 100 \ + --input-length-range 128:256 ShareGPT example usage: # This command samples 20 prompts with input lengths diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 4eb0e1f8ac..63d2c3f7c7 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -25,6 +25,7 @@ On the client side, run: import argparse import asyncio import base64 +import gc import io import json import os @@ -199,7 +200,7 @@ def sample_sonnet_requests( return sampled_requests -def sample_mmmu_pro_vision_requests( +def sample_vision_arena_requests( dataset, num_requests: int, tokenizer: PreTrainedTokenizerBase, @@ -211,13 +212,7 @@ def sample_mmmu_pro_vision_requests( if len(sampled_requests) == num_requests: break - # MMMU-Pro vision direct prompt - # Ref: https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5 - prompt = ( - "Answer with the option letter from the given choices directly. 
" - "The last line of your response should be of the following " - "format: 'Answer: $LETTER' (without quotes) where LETTER is one of " - "options.") + prompt = data["turns"][0][0]['content'] prompt_token_ids = tokenizer(prompt).input_ids if fixed_output_len is None: @@ -229,10 +224,10 @@ def sample_mmmu_pro_vision_requests( output_len = fixed_output_len assert isinstance( - data["image"], + data["images"][0], Image), ("Input image format must be `PIL.Image.Image`, " f"given {type(data['image'])}.") - image: Image = data["image"] + image: Image = data["images"][0] image = image.convert("RGB") image_data = io.BytesIO() image.save(image_data, format='JPEG') @@ -251,7 +246,7 @@ def sample_mmmu_pro_vision_requests( def sample_hf_requests( dataset_path: str, - dataset_subset: str, + dataset_subset: Optional[str], dataset_split: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, @@ -259,19 +254,17 @@ def sample_hf_requests( fixed_output_len: Optional[int] = None, ) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: - # Special case for MMMU-Pro vision dataset - if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision': - assert dataset_split == "test" + # Special case for vision_arena dataset + if dataset_path == 'lmarena-ai/vision-arena-bench-v0.1' \ + and dataset_subset is None: + assert dataset_split == "train" dataset = load_dataset(dataset_path, name=dataset_subset, split=dataset_split, streaming=True) - assert "image" in dataset.features, ( - "MMMU/MMMU_Pro vision dataset must have 'image' column.") - filter_func = lambda x: isinstance(x["image"], Image) - dataset = dataset.shuffle(seed=random_seed).filter(filter_func) - return sample_mmmu_pro_vision_requests(dataset, num_requests, - tokenizer, fixed_output_len) + dataset = dataset.shuffle(seed=random_seed) + return sample_vision_arena_requests(dataset, num_requests, tokenizer, + fixed_output_len) dataset = load_dataset(dataset_path, name=dataset_subset, @@ -423,7 +416,7 @@ def 
calculate_metrics( tokenizer: PreTrainedTokenizerBase, selected_percentile_metrics: List[str], selected_percentiles: List[float], - gootput_config_dict: Dict[str, float], + goodput_config_dict: Dict[str, float], ) -> Tuple[BenchmarkMetrics, List[int]]: actual_output_lens: List[int] = [] total_input = 0 @@ -436,19 +429,23 @@ def calculate_metrics( e2els: List[float] = [] for i in range(len(outputs)): if outputs[i].success: - # We use the tokenizer to count the number of output tokens for all - # serving backends instead of looking at len(outputs[i].itl) since - # multiple output tokens may be bundled together - # Note : this may inflate the output token count slightly - output_len = len( - tokenizer(outputs[i].generated_text, - add_special_tokens=False).input_ids) + output_len = outputs[i].output_tokens + + if output_len is None: + # We use the tokenizer to count the number of output tokens + # for some serving backends instead of looking at + # len(outputs[i].itl) since multiple output tokens may be + # bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer(outputs[i].generated_text, + add_special_tokens=False).input_ids) actual_output_lens.append(output_len) total_input += input_requests[i][1] tpot = 0 if output_len > 1: - tpot = (outputs[i].latency - outputs[i].ttft) / (output_len - - 1) + latency_minus_ttft = outputs[i].latency - outputs[i].ttft + tpot = latency_minus_ttft / (output_len - 1) tpots.append(tpot) # Note: if output_len <= 1, we regard tpot as 0 for goodput all_tpots.append(tpot) @@ -459,21 +456,21 @@ def calculate_metrics( else: actual_output_lens.append(0) - if gootput_config_dict: + if goodput_config_dict: valid_metrics = [] slo_values = [] - if "ttft" in gootput_config_dict: + if "ttft" in goodput_config_dict: valid_metrics.append(ttfts) - slo_values.append(gootput_config_dict["ttft"] / + slo_values.append(goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION) - if "tpot" in 
gootput_config_dict: + if "tpot" in goodput_config_dict: valid_metrics.append(all_tpots) - slo_values.append(gootput_config_dict["tpot"] / + slo_values.append(goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION) - if "e2el" in gootput_config_dict: + if "e2el" in goodput_config_dict: valid_metrics.append(e2els) - slo_values.append(gootput_config_dict["e2el"] / + slo_values.append(goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION) for req_metric in zip(*valid_metrics): @@ -525,6 +522,7 @@ async def benchmark( api_url: str, base_url: str, model_id: str, + model_name: str, tokenizer: PreTrainedTokenizerBase, input_requests: List[Tuple[str, int, int]], logprobs: Optional[int], @@ -536,7 +534,7 @@ async def benchmark( selected_percentile_metrics: List[str], selected_percentiles: List[str], ignore_eos: bool, - gootput_config_dict: Dict[str, float], + goodput_config_dict: Dict[str, float], max_concurrency: Optional[int], ): if backend in ASYNC_REQUEST_FUNCS: @@ -553,6 +551,7 @@ async def benchmark( "Multi-modal content is only supported on 'openai-chat' backend.") test_input = RequestFuncInput( model=model_id, + model_name=model_name, prompt=test_prompt, api_url=api_url, prompt_len=test_prompt_len, @@ -573,6 +572,7 @@ async def benchmark( if profile: print("Starting profiler...") profile_input = RequestFuncInput(model=model_id, + model_name=model_name, prompt=test_prompt, api_url=base_url + "/start_profile", prompt_len=test_prompt_len, @@ -616,6 +616,7 @@ async def benchmark( async for request in get_request(input_requests, request_rate, burstiness): prompt, prompt_len, output_len, mm_content = request request_func_input = RequestFuncInput(model=model_id, + model_name=model_name, prompt=prompt, api_url=api_url, prompt_len=prompt_len, @@ -657,7 +658,7 @@ async def benchmark( tokenizer=tokenizer, selected_percentile_metrics=selected_percentile_metrics, selected_percentiles=selected_percentiles, - gootput_config_dict=gootput_config_dict, + 
goodput_config_dict=goodput_config_dict, ) print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) @@ -669,7 +670,7 @@ async def benchmark( metrics.total_output)) print("{:<40} {:<10.2f}".format("Request throughput (req/s):", metrics.request_throughput)) - if gootput_config_dict: + if goodput_config_dict: print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput)) print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", @@ -684,7 +685,7 @@ async def benchmark( "total_output_tokens": metrics.total_output, "request_throughput": metrics.request_throughput, "request_goodput:": - metrics.request_goodput if gootput_config_dict else None, + metrics.request_goodput if goodput_config_dict else None, "output_throughput": metrics.output_throughput, "total_token_throughput": metrics.total_token_throughput, "input_lens": [output.prompt_len for output in outputs], @@ -740,11 +741,11 @@ async def benchmark( def check_goodput_args(args): # Check and parse goodput arguments - gootput_config_dict = {} + goodput_config_dict = {} VALID_NAMES = ["ttft", "tpot", "e2el"] if args.goodput: - gootput_config_dict = parse_goodput(args.goodput) - for slo_name, slo_val in gootput_config_dict.items(): + goodput_config_dict = parse_goodput(args.goodput) + for slo_name, slo_val in goodput_config_dict.items(): if slo_name not in VALID_NAMES: raise ValueError( f"Invalid metric name found, {slo_name}: {slo_val}. " @@ -755,22 +756,22 @@ def check_goodput_args(args): f"Invalid value found, {slo_name}: {slo_val}. 
" "The service level objective value should be " "non-negative.") - return gootput_config_dict + return goodput_config_dict def parse_goodput(slo_pairs): - gootput_config_dict = {} + goodput_config_dict = {} try: for slo_pair in slo_pairs: slo_name, slo_val = slo_pair.split(":") - gootput_config_dict[slo_name] = float(slo_val) + goodput_config_dict[slo_name] = float(slo_val) except ValueError as err: raise argparse.ArgumentTypeError( "Invalid format found for service level objectives. " "Specify service level objectives for goodput as \"KEY:VALUE\" " "pairs, where the key is a metric name, and the value is a " "number in milliseconds.") from err - return gootput_config_dict + return goodput_config_dict def main(args: argparse.Namespace): @@ -780,6 +781,7 @@ def main(args: argparse.Namespace): backend = args.backend model_id = args.model + model_name = args.served_model_name tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model tokenizer_mode = args.tokenizer_mode @@ -869,7 +871,11 @@ def main(args: argparse.Namespace): else: raise ValueError(f"Unknown dataset: {args.dataset_name}") - gootput_config_dict = check_goodput_args(args) + goodput_config_dict = check_goodput_args(args) + + # Avoid GC processing "static" data - reduce pause times. + gc.collect() + gc.freeze() benchmark_result = asyncio.run( benchmark( @@ -877,6 +883,7 @@ def main(args: argparse.Namespace): api_url=api_url, base_url=base_url, model_id=model_id, + model_name=model_name, tokenizer=tokenizer, input_requests=input_requests, logprobs=args.logprobs, @@ -890,7 +897,7 @@ def main(args: argparse.Namespace): float(p) for p in args.metric_percentiles.split(",") ], ignore_eos=args.ignore_eos, - gootput_config_dict=gootput_config_dict, + goodput_config_dict=goodput_config_dict, max_concurrency=args.max_concurrency, )) @@ -1222,5 +1229,12 @@ if __name__ == "__main__": 'always use the slow tokenizer. 
\n* ' '"mistral" will always use the `mistral_common` tokenizer.') + parser.add_argument("--served-model-name", + type=str, + default=None, + help="The model name used in the API. " + "If not specified, the model name will be the " + "same as the ``--model`` argument. ") + args = parser.parse_args() main(args) diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py new file mode 100644 index 0000000000..e1f613e1da --- /dev/null +++ b/benchmarks/kernels/benchmark_lora.py @@ -0,0 +1,1147 @@ +import argparse +import copy +import json +import pickle +import time +from dataclasses import dataclass +from enum import Enum, auto +from itertools import product +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Tuple + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from utils import ArgPool, Bench, CudaGraphBenchParams +from weight_shapes import WEIGHT_SHAPES + +from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand +from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice +from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink +from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink +from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) +DEFAULT_TP_SIZES = [1] +DEFAULT_BATCH_SIZES = [ + 1, 16, 32, 64, 128, 192, 256, 320, 384, 448, 512, 640, 768, 896, 1024, + 2048, 3072, 4096, 5120, 6144, 7168, 8192 +] +DEFAULT_HIDDEN_SIZES = [1024, 2048, 4096, 8192, 16384] +DEFAULT_LORA_RANKS = [16] +DEFAULT_NUM_LORAS = [1, 2, 3, 4] +DEFAULT_SORT_BY_LORA_IDS = [False, True] +DEFAULT_SEQ_LENGTHS = [1] +DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False] + + +# Utilities +def dtype_to_str(dtype: torch.dtype): + if dtype == torch.float16: + return "f16" 
+ if dtype == torch.bfloat16: + return "bf16" + if dtype == torch.float32: + return "f32" + raise ValueError(f"Unsupported dtype {dtype}") + + +def make_rand_lora_weight_tensor(k: int, + n: int, + num_loras: int, + dtype: torch.dtype, + device: str = "cuda") -> torch.Tensor: + + # LoRA weights column major + return torch.rand((num_loras, n, k), dtype=dtype).to(device) + + +def make_rand_tensors( + a_shape: Tuple[int], + b_shape: Tuple[int], + c_shape: Tuple[int], + a_dtype: torch.dtype, + b_dtype: torch.dtype, + c_dtype: torch.dtype, + num_slices: int, + device: str = "cuda", +) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: + """ + Make LoRA input/output matrices. + """ + A = torch.rand(a_shape, dtype=a_dtype).to(device) + + # LoRA weights column major + Bs = [ + torch.rand(b_shape, dtype=b_dtype).to(device) + for _ in range(num_slices) + ] + + C = torch.zeros(c_shape, dtype=c_dtype).to(device) + return A, Bs, C + + +def make_prompt_lora_mapping(num_prompts: int, num_active_loras: int, + sort_by_lora_id: bool, + device: str) -> torch.Tensor: + """ + All prompts are mapped to a Lora ID in range [0, num_active_loras). + where 0 refers to first lora, 1 refers to second lora and so on. + """ + assert num_active_loras > 0 + + if not sort_by_lora_id: + return torch.randint(0, + num_active_loras, (num_prompts, ), + dtype=torch.long) + + # Divide LoRAs equally and in order. 
+ part_size = num_prompts // num_active_loras + part_size = max(part_size, 1) + + lora_id = 0 + prompt_lora_mapping = [] + while len(prompt_lora_mapping) < num_prompts: + prompt_lora_mapping.extend([lora_id] * part_size) + lora_id = lora_id + 1 if lora_id + 1 < num_active_loras else lora_id + return torch.tensor(prompt_lora_mapping[:num_prompts], + dtype=torch.long, + device=device) + + +def make_token_lora_mapping(num_tokens: int, num_prompts: int, + prompt_lora_mapping: torch.Tensor, + seq_len_tensor: torch.Tensor, device: str): + """ + Make token_lora_mapping from prompt_lora_mapping and seq_lens_tensor + """ + assert prompt_lora_mapping.shape[0] == num_prompts + + # token to lora index mapping + token_lora_mapping = [0] * num_tokens + current_offset = 0 + for b_id in range(num_prompts): + lora_index = prompt_lora_mapping[b_id].item() + s = current_offset + e = s + seq_len_tensor[b_id].item() + token_lora_mapping[s:e] = [lora_index] * (e - s) + current_offset += seq_len_tensor[b_id].item() + + return torch.tensor(token_lora_mapping, dtype=torch.long, device=device) + + +def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor, + lora_weights: List[torch.Tensor], + seq_lens_cpu: torch.Tensor, + prompt_lora_mapping_cpu: torch.Tensor, scaling: float, + add_inputs: Optional[bool]): + """ + Torch group gemm reference implementation to test correctness of + benchmarking operations. 
+ """ + batches = seq_lens_cpu.size(0) + out_list = [] + current_offset = 0 + for lora_index, b_length in zip(range(batches), seq_lens_cpu): + x = input[current_offset:b_length + current_offset, :] + current_offset += b_length + w = lora_weights[prompt_lora_mapping_cpu[lora_index]] + result = torch.nn.functional.linear(x, w) + result *= scaling + out_list.append(result) + torch.cat(out_list, dim=0) + + cat_result = torch.cat(out_list, dim=0) + + if add_inputs: + ref_out += cat_result + else: + ref_out.copy_(cat_result) + + +class OpType(Enum): + """ + LoRA Ops to benchmark and its properties. + """ + SGMV_SHRINK = auto() + BGMV_SHRINK = auto() + SGMV_EXPAND = auto() + BGMV_EXPAND = auto() + BGMV_EXPAND_SLICE = auto() + + @staticmethod + def from_str(s: str) -> "OpType": + if s.lower() == 'sgmv_shrink': + return OpType.SGMV_SHRINK + if s.lower() == 'sgmv_expand': + return OpType.SGMV_EXPAND + if s.lower() == 'bgmv_shrink': + return OpType.BGMV_SHRINK + if s.lower() == 'bgmv_expand': + return OpType.BGMV_EXPAND + if s.lower() == "bgmv_expand_slice": + return OpType.BGMV_EXPAND_SLICE + raise ValueError(f"Unrecognized str {s} to convert to OpType") + + def is_shrink_fn(self) -> bool: + return self in [OpType.SGMV_SHRINK, OpType.BGMV_SHRINK] + + def is_expand_fn(self) -> bool: + return self in [OpType.SGMV_EXPAND, OpType.BGMV_EXPAND] + + def is_prefill_op(self) -> bool: + return self in [OpType.SGMV_SHRINK, OpType.SGMV_EXPAND] + + def is_decode_op(self) -> bool: + return self in [ + OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE + ] + + def is_expand_slice_fn(self) -> bool: + return self in [OpType.BGMV_EXPAND_SLICE] + + def num_slices(self) -> List[int]: + if self in [OpType.SGMV_EXPAND, OpType.SGMV_SHRINK]: + # SGMV kernels supports slices + return [1, 2, 3] + if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]: + return [1] + if self in [OpType.BGMV_EXPAND_SLICE]: + return [2, 3] + raise ValueError(f"Unrecognized OpType {self}") + + def mkn(self, 
batch_size: int, seq_length: int, hidden_size: int, + lora_rank: int) -> Tuple[int, int, int]: + num_tokens = batch_size * seq_length + if self.is_shrink_fn(): + m = num_tokens + k = hidden_size + n = lora_rank + else: + assert self.is_expand_fn() or self.is_expand_slice_fn() + m = num_tokens + k = lora_rank + n = hidden_size + return m, k, n + + def matmul_dtypes( + self, op_dtype: torch.dtype + ) -> Tuple[torch.dtype, torch.dtype, torch.dtype]: + """ + return a type, b type and c type for A x B = C + """ + if self.is_shrink_fn(): + return op_dtype, op_dtype, torch.float32 + else: + assert self.is_expand_fn() or self.is_expand_slice_fn() + return torch.float32, op_dtype, op_dtype + + def matmul_shapes( + self, batch_size: int, seq_length: int, hidden_size: int, + lora_rank: int, num_loras: int, + num_slices: int) -> Tuple[Tuple[int], Tuple[int], Tuple[int]]: + """ + Given num_slices, return the shapes of the A, B, and C matrices + in A x B = C, for the op_type + """ + m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank) + + b_shape = (num_loras, n, k) # col-major + if self == OpType.SGMV_SHRINK: + # SGMV shrink supports num_slices inherently in the kernel + return ((m, k), b_shape, (num_slices, m, n)) + if self == OpType.SGMV_EXPAND: + # SGMV expand supports num_slices inherently in the kernel + return ((num_slices, m, k), b_shape, (m, n * num_slices)) + if self == OpType.BGMV_SHRINK: + return ((m, k), b_shape, (m, n)) + if self == OpType.BGMV_EXPAND: + return ((m, k), b_shape, (m, n)) + if self == OpType.BGMV_EXPAND_SLICE: + return ((num_slices, m, k), b_shape, (m, n * num_slices)) + + raise ValueError(f"Unrecognized op_type {self}") + + def bench_fn(self) -> Callable: + + def emulate_bgmv_expand_slice(kwargs_list: List[Dict[str, Any]]): + for x in kwargs_list: + bgmv_expand_slice(**x) + + if self == OpType.SGMV_SHRINK: + return sgmv_shrink + if self == OpType.SGMV_EXPAND: + return sgmv_expand + if self == OpType.BGMV_SHRINK: + return bgmv_shrink + 
if self == OpType.BGMV_EXPAND: + return bgmv_expand + if self == OpType.BGMV_EXPAND_SLICE: + return emulate_bgmv_expand_slice + raise ValueError(f"Unrecognized optype {self}") + + def run_ref_group_gemm(self, output: torch.Tensor, input: torch.Tensor, + lora_weights: List[torch.Tensor], + **kwargs) -> Callable: + """Each benchmark operation expected the input, lora_weights and outputs + in a slightly different format. Refer to self.matmul_shapes(). + run_ref_group_gemm accounts for those differences in executing a + reference group gemm for correctness testing. + """ + w_dtype = lora_weights[0].dtype + num_slices = len(lora_weights) + if self == OpType.SGMV_SHRINK: + for slice_idx in range(num_slices): + ref_group_gemm(ref_out=output[slice_idx, :], + input=input, + lora_weights=lora_weights[slice_idx], + **kwargs) + if self == OpType.SGMV_EXPAND: + hidden_size = lora_weights[0].shape[1] + for slice_idx in range(num_slices): + slice_offset = slice_idx * hidden_size + ref_group_gemm( + ref_out=output[:, slice_offset:slice_offset + hidden_size], + input=input[slice_idx].clone().to(dtype=w_dtype), + lora_weights=lora_weights[slice_idx], + **kwargs) + if self == OpType.BGMV_SHRINK: + assert num_slices == 1 + ref_group_gemm(ref_out=output, + input=input, + lora_weights=lora_weights[0], + **kwargs) + if self == OpType.BGMV_EXPAND: + assert num_slices == 1 + ref_group_gemm(ref_out=output, + input=input.clone().to(dtype=w_dtype), + lora_weights=lora_weights[0], + **kwargs) + if self == OpType.BGMV_EXPAND_SLICE: + hidden_size = lora_weights[0].shape[1] + for slice_idx in range(num_slices): + slice_offset = slice_idx * hidden_size + ref_group_gemm( + ref_out=output[:, slice_offset:slice_offset + hidden_size], + input=input[slice_idx].clone().to(dtype=w_dtype), + lora_weights=lora_weights[slice_idx], + **kwargs) + raise ValueError(f"Unrecognized optype {self}") + + +@dataclass +class BenchmarkContext: + """ + LoRA benchmark context + """ + batch_size: int + hidden_size: int + 
num_loras: int + num_active_loras: int + lora_rank: int + sort_by_lora_id: bool + dtype: torch.dtype + seq_length: Optional[int] = None + num_slices: Optional[int] = None # num_slices for slice based ops + + def with_seq_length(self, seq_length: int) -> "BenchmarkContext": + ctx = copy.copy(self) + ctx.seq_length = seq_length + return ctx + + def with_num_slices(self, num_slices: int) -> "BenchmarkContext": + ctx = copy.copy(self) + ctx.num_slices = num_slices + return ctx + + def bench_label(self) -> str: + return f"lora-{self.dtype}" + + def bench_sublabel(self, op_type: OpType) -> str: + m, k, n = op_type.mkn(self.batch_size, self.seq_length, + self.hidden_size, self.lora_rank) + desc = { + 'bs': self.batch_size, + 'sl': self.seq_length, + 'm': m, + 'k': k, + 'n': n, + 'num_loras': self.num_loras, + 'sort_by_lora': self.sort_by_lora_id, + 'num_slices': self.num_slices, + } + return json.dumps(desc) + + +@dataclass +class BenchmarkTensors: + """ + Input/Output tensors used for benchmarks + """ + # matmul tensors + input: torch.Tensor + lora_weights_lst: List[torch.Tensor] + output: torch.Tensor + # metadata tensors + seq_lens: torch.Tensor + seq_start_loc: torch.Tensor + prompt_lora_mapping: torch.Tensor + token_lora_mapping: torch.Tensor + + def io_types(self) -> str: + return (f"{dtype_to_str(self.input.dtype)}x" + f"{dtype_to_str(self.lora_weights_lst[0].dtype)}=>" + f"{dtype_to_str(self.output.dtype)}") + + @staticmethod + def make(ctx: BenchmarkContext, + op_type: OpType, + device: str = "cuda") -> "BenchmarkTensors": + + # Make input / output matmul tensors. + a_shape, b_shape, c_shape = op_type.matmul_shapes( + ctx.batch_size, ctx.seq_length, ctx.hidden_size, ctx.lora_rank, + ctx.num_loras, ctx.num_slices) + a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype) + input_tensor, lora_weights, output_tensor = \ + make_rand_tensors(a_shape, b_shape, c_shape, a_type, b_type, c_type, + num_slices = ctx.num_slices) + + # Make metadata tensors. 
+ # Keep the metadata tensors in the CPU for further processing if needed. + # The tensors get moved to the GPU before benchmarking. + assert ctx.num_active_loras <= ctx.num_loras + total_tokens = ctx.batch_size * ctx.seq_length + + # Prepare seq lens tensor + seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1, + (ctx.batch_size, )) + # Prepare seq_start_loc tensor + seq_start_loc_tensor = torch.cumsum(torch.tensor( + [0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0) + assert total_tokens == seq_len_tensor.sum() + # Prepare prompt lora indices tensor + prompt_lora_indices_tensor = make_prompt_lora_mapping( + ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu") + # Prepare token lora indices tensor + token_lora_indices_tensor = make_token_lora_mapping( + total_tokens, ctx.batch_size, prompt_lora_indices_tensor, + seq_len_tensor, "cpu") + + return BenchmarkTensors(input_tensor, lora_weights, output_tensor, + seq_len_tensor, seq_start_loc_tensor, + prompt_lora_indices_tensor, + token_lora_indices_tensor) + + def sanity_check(self) -> None: + """ + Fails asserts when non-conformality is detected. 
+ """ + num_tokens = self.input.shape[-2] + # check metadata tensors + assert torch.sum(self.seq_lens) == num_tokens + num_seqs = self.seq_lens.shape[0] + assert self.seq_start_loc.shape[0] == num_seqs + assert self.prompt_lora_mapping.shape[0] == num_seqs + assert self.token_lora_mapping.shape[0] == num_tokens + + def to_device(self, device: str): + """ + Transfer tensors to device if the tensors aren't already on the device + """ + + def to_device(tensor: torch.Tensor): + if tensor.device != device: + tensor = tensor.to(device=device) + return tensor + + self.input = to_device(self.input) + self.output = to_device(self.output) + self.seq_lens = to_device(self.seq_lens) + self.seq_start_loc = to_device(self.seq_start_loc) + self.prompt_lora_mapping = to_device(self.prompt_lora_mapping) + self.token_lora_mapping = to_device(self.token_lora_mapping) + for i in range(len(self.lora_weights_lst)): + self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i]) + + def metadata(self) -> Tuple[int, int, int]: + """ + Return num_seqs, num_tokens and max_seq_len + """ + num_seqs = self.seq_lens.shape[0] + num_tokens = self.token_lora_mapping.shape[0] + max_seq_len = torch.max(self.seq_lens).item() + num_slices = len(self.lora_weights_lst) + return num_seqs, num_tokens, max_seq_len, num_slices + + def convert_to_sgmv_benchmark_tensors(self): + """ + For sgmv punica kernels, when consecutive sequences have the + same LoRA ID, we just merge them together. 
+ This happens in punica.py::compute_metadata + """ + + # Collapse seq_lens and seq_start_loc + _, seq_lens = torch.unique_consecutive(self.token_lora_mapping, + return_counts=True) + cum_result = torch.cumsum(seq_lens, dim=0) + seq_start_loc = torch.zeros_like(seq_lens) + seq_start_loc[1:].copy_(cum_result[:-1]) + + # Collapse prompt mapping + prompt_lora_mapping = torch.unique_consecutive( + self.prompt_lora_mapping) + + assert torch.sum(seq_lens) == torch.sum(self.seq_lens), \ + f"dont match - new {torch.sum(seq_lens)} vs {torch.sum(self.seq_lens)}" + + self.prompt_lora_mapping = prompt_lora_mapping.to( + dtype=self.prompt_lora_mapping.dtype) + self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype) + self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype) + + def as_sgmv_shrink_kwargs(self) -> Dict[str, Any]: + self.convert_to_sgmv_benchmark_tensors() + self.sanity_check() + self.to_device(self.input.device) + + num_seqs, num_tokens, max_seq_len, num_slices = self.metadata() + + # Sanity check matrix shapes. 
+ i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape [num_tokens, hidden_size] + assert len(i_shape) == 2 + assert i_shape[0] == num_tokens + hidden_size = i_shape[1] + # Expected lora weight shape [num_loras, lora_rank, hidden_size] + assert len(lw_shape) == 3 + assert lw_shape[2] == hidden_size + lora_rank = lw_shape[1] + # Expected output shape [num_slices, num_tokens, lora_rank] + assert len(o_shape) == 3 + assert o_shape == (num_slices, num_tokens, lora_rank) + + return { + 'inputs': self.input, + 'lora_a_weights': self.lora_weights_lst, + 'output_tensor': self.output, + 'b_seq_start_loc': self.seq_start_loc, + 'seq_len_tensor': self.seq_lens, + 'lora_indices_tensor': self.prompt_lora_mapping, + 'batches': num_seqs, + 'max_seq_length': max_seq_len, + 'token_nums': num_tokens, + 'scaling': 1.0, + } + + def as_sgmv_expand_kwargs(self, add_inputs: bool) -> Dict[str, Any]: + + self.convert_to_sgmv_benchmark_tensors() + self.sanity_check() + self.to_device(self.input.device) + + num_seqs, num_tokens, max_seq_len, num_slices = self.metadata() + + # Sanity check matrix shapes. 
+ i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape : [num_slices, num_tokens, lora_rank] + assert len(i_shape) == 3 + assert i_shape[0] == num_slices + assert i_shape[1] == num_tokens + lora_rank = i_shape[2] + # Expected lora weight shape : [num_lora, hidden_size, lora_rank] + assert len(lw_shape) == 3 + assert lw_shape[2] == lora_rank + hidden_size = lw_shape[1] + # Expected output shape : [num_tokens, hidden_size * num_slices] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, hidden_size * num_slices) + + return { + 'inputs': self.input, + 'lora_b_weights': self.lora_weights_lst, + 'output_tensor': self.output, + 'b_seq_start_loc': self.seq_start_loc, + 'seq_len_tensor': self.seq_lens, + 'lora_indices_tensor': self.prompt_lora_mapping, + 'batches': num_seqs, + 'max_seq_length': max_seq_len, + 'token_nums': num_tokens, + 'offset_start': 0, + 'add_inputs': add_inputs, + } + + def as_bgmv_shrink_kwargs(self) -> Dict[str, Any]: + assert len(self.lora_weights_lst) == 1 + self.to_device(self.input.device) + + _, num_tokens, _, _ = self.metadata() + # Sanity check shapes + i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape [num_tokens, hidden_size] + assert len(i_shape) == 2 + assert i_shape[0] == num_tokens + hidden_size = i_shape[1] + # Expected lora weight shape [num_loras, lora_rank, hidden_size] + assert len(lw_shape) == 3 + assert lw_shape[2] == hidden_size + lora_rank = lw_shape[1] + # Expected output shape [num_tokens, lora_rank] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, lora_rank) + + return { + 'inputs': self.input, + 'lora_a_weights': self.lora_weights_lst[0], + 'output_tensor': self.output, + 'lora_indices_tensor': self.token_lora_mapping, + 'scaling': 1.0 + } + + def as_bgmv_expand_kwargs(self, add_inputs: bool): + assert len(self.lora_weights_lst) == 1 + 
self.to_device(self.input.device) + + _, num_tokens, _, _ = self.metadata() + # Sanity check shapes + i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape [num_tokens, lora_rank] + assert len(i_shape) == 2 + assert i_shape[0] == num_tokens + lora_rank = i_shape[1] + # Expected lora weight shape [num_loras, hidden_size, lora_rank] + assert len(lw_shape) == 3 + assert lw_shape[2] == lora_rank + hidden_size = lw_shape[1] + # Expected output shape [num_tokens, hidden_size] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, hidden_size) + + return { + 'inputs': self.input, + 'lora_b_weights': self.lora_weights_lst[0], + 'output_tensor': self.output, + 'lora_indices_tensor': self.token_lora_mapping, + 'add_inputs': add_inputs + } + + def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> Dict[str, Any]: + + _, num_tokens, _, num_slices = self.metadata() + # Sanity check shapes + i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape [num_slices, num_tokens, lora_rank] + assert len(i_shape) == 3 + assert i_shape[0] == num_slices + assert i_shape[1] == num_tokens + lora_rank = i_shape[2] + # Expected lora weight shape [num_loras, hidden_size, lora_rank] + assert len(lw_shape) == 3 + assert lw_shape[2] == lora_rank + hidden_size = lw_shape[1] + # Expected output shape [num_tokens, hidden_size * num_slices] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, hidden_size * num_slices) + + self.to_device(self.input.device) + + kwargs_list = [] + for i in range(num_slices): + kwargs_list.append({ + 'inputs': self.input[i], + 'lora_b_weights': self.lora_weights_lst[i], + 'output_tensor': self.output, + 'lora_indices_tensor': self.token_lora_mapping, + 'slice_offset': i * hidden_size, + 'slice_size': hidden_size, + 'add_inputs': add_inputs, + }) + return {'kwargs_list': kwargs_list} + + def bench_fn_kwargs(self, + 
op_type: OpType, + add_inputs: Optional[bool] = None) -> Dict[str, Any]: + if op_type.is_shrink_fn(): + assert add_inputs is None + else: + assert add_inputs is not None + + if op_type == OpType.SGMV_SHRINK: + return self.as_sgmv_shrink_kwargs() + if op_type == OpType.SGMV_EXPAND: + return self.as_sgmv_expand_kwargs(add_inputs) + if op_type == OpType.BGMV_SHRINK: + return self.as_bgmv_shrink_kwargs() + if op_type == OpType.BGMV_EXPAND: + return self.as_bgmv_expand_kwargs(add_inputs) + if op_type == OpType.BGMV_EXPAND_SLICE: + return self.as_bgmv_expand_slice_kwargs(add_inputs) + raise ValueError(f"Unrecognized optype {self}") + + def test_correctness(self, op_type: OpType, + expand_fn_add_inputs: Optional[bool]) -> bool: + """ + Test correctness of op_type implementation against a grouped gemm + reference implementation. + """ + seq_lens_cpu = self.seq_lens.to(device="cpu") + prompt_lora_mapping_cpu = self.prompt_lora_mapping.to(device="cpu") + ref_output = self.output.clone() + + self.output.zero_() + op_type.bench_fn()( + **self.bench_fn_kwargs(op_type, expand_fn_add_inputs)) + + op_type.run_ref_group_gemm( + ref_output, + self.input, + self.lora_weights_lst, + seq_lens_cpu=seq_lens_cpu, + prompt_lora_mapping_cpu=prompt_lora_mapping_cpu, + scaling=1.0, + add_inputs=expand_fn_add_inputs) + + rtol, atol = { + torch.float16: (6e-2, 6e-2), + torch.bfloat16: (6e-2, 6e-2), + torch.float32: (1e-2, 1e-2), + }[self.output.dtype] + + return torch.allclose(ref_output, self.output, rtol=rtol, atol=atol) + + +def bench_optype(ctx: BenchmarkContext, + arg_pool_size: int, + op_type: OpType, + cuda_graph_nops: Optional[int] = None, + expand_fn_add_inputs: Optional[bool] = None, + test_correctness: bool = False) -> TMeasurement: + + assert arg_pool_size >= 1 + if op_type.is_shrink_fn(): + assert expand_fn_add_inputs is None + else: + assert expand_fn_add_inputs is not None + + # BenchmarkContext -> BenchmarkTensors + bench_tensors : List[BenchmarkTensors] = \ + 
[BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)] + for bt in bench_tensors: + bt.sanity_check() + + # Test correctness of our implementation. + if test_correctness: + assert all([ + bt.test_correctness(op_type, expand_fn_add_inputs) + for bt in bench_tensors + ]) + + # BenchmarkTensors -> Dict (kwargs) + kwargs_list = [ + bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs) + for bt in bench_tensors + ] + + # Clear LoRA optimization hash-maps. + _LORA_A_PTR_DICT.clear() + _LORA_B_PTR_DICT.clear() + # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup + for kwargs in kwargs_list: + op_type.bench_fn()(**kwargs) + torch.cuda.synchronize() + + # Merge into a single kwargs and qualify arguments as ArgPool + kwargs = {k: ArgPool([]) for k in kwargs_list[0]} + for _kwargs in kwargs_list: + for k, v in _kwargs.items(): + kwargs[k].values.append(v) + + describe_args = (f"add_inputs={expand_fn_add_inputs}" + if expand_fn_add_inputs is not None else "") + description = ( + f"{op_type.name}({describe_args}) ({bench_tensors[0].io_types()})") + + cuda_graph_params = None + if cuda_graph_nops: + cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops) + timer = None + with Bench(cuda_graph_params, + ctx.bench_label(), ctx.bench_sublabel(op_type), description, + op_type.bench_fn(), **kwargs) as bench: + timer = bench.run() + return timer + + +def bench_torch_mm(ctx: BenchmarkContext, + arg_pool_size: int, + op_type: OpType, + cuda_graph_nops: Optional[int] = None) -> TMeasurement: + """ + Benchmark basic torch.mm as a roofline. + + When all the input tokens have the same LoRA ID, the LoRA kernels are just + a matmul. This torch.mm benchmark serves as a roofline for that case. + + input op_type is used in determining the m, k, n dimensions for the matmul. 
+ """ + + batch_size, hidden_size, lora_rank, seq_length, dtype = (ctx.batch_size, + ctx.hidden_size, + ctx.lora_rank, + ctx.seq_length, + ctx.dtype) + + m, k, n = op_type.mkn(batch_size, seq_length, hidden_size, lora_rank) + # For a fairer comparison. + n = n * ctx.num_slices + + # Get matmul input and output tensors for A x B = C + As, Bs, Cs = [], [], [] + for _ in range(arg_pool_size): + As.append(torch.rand((m, k), dtype=dtype).to("cuda")) + Bs.append(torch.rand((n, k), dtype=dtype).to("cuda").t()) + Cs.append(torch.rand((m, n), dtype=dtype).to("cuda")) + + # Make torch.mm kwargs + mm_kwargs = {'input': ArgPool(As), 'mat2': ArgPool(Bs), 'out': ArgPool(Cs)} + + description = ( + f"single-lora roofline using torch.mm ({dtype_to_str(dtype)}" + f"x{dtype_to_str(dtype)}" + f"=>{dtype_to_str(dtype)})") + cuda_graph_params = None + if cuda_graph_nops: + cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops) + with Bench(cuda_graph_params, ctx.bench_label(), + ctx.bench_sublabel(op_type), description, torch.mm, + **mm_kwargs) as bench: + return bench.run() + + +# runner +def use_cuda_graph_recommendation() -> str: + return """ + Triton kernels have a significant launch overhead with + launched directly via python. This overhead is more noticeable + for small the problem sizes. For these cases, it is recommended + to use the script with `--cuda-graph-nops N` to benchmark N + consecutive invocations of the benchmarking operations from + inside a CUDA Graph. Note that the returned measurement is for N + invocations of the operation. + """ + + +def print_timers(timers: List[TMeasurement], + args: Optional[argparse.Namespace] = None): + compare = TBenchmark.Compare(timers) + compare.print() + + if args and args.cuda_graph_nops: + print( + f"Note : The timings reported above is for {args.cuda_graph_nops} " + "consecutive invocations of the benchmarking functions. 
" + f"Please divide by {args.cuda_graph_nops} for single invocation " + "timings.") + + print("Note on Comparison with torch.mm : The torch.mm numbers are " + "benchmark numbers of a simple matmul emulating the single lora " + "case. It is provided as a roofline for comparing our LoRA Kernel " + "implementations. It is expected that the LoRA kernels will be " + "slower than torch.mm in cases where num_loras is big. But for " + "small num_loras the goal should be to match the torch.mm numbers.") + + +def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]): + + if args.cuda_graph_nops is not None: + assert args.cuda_graph_nops > 0 + print(f"Benchmarking {args.cuda_graph_nops} invocations inside a CUDA " + "Graph") + else: + print(f"CUDA Graphs not enabled.\n{use_cuda_graph_recommendation()}") + + timers = [] + for bench_ctx in bench_ctxs: + for seq_len in args.seq_lengths: + bench_ops: List[OpType] = [] + if seq_len == 1: + # bench all decode ops + bench_ops = [op for op in args.op_types if op.is_decode_op()] + else: + # bench all prefill ops + bench_ops = [op for op in args.op_types if op.is_prefill_op()] + + seq_len_timers = [] + for bench_op in bench_ops: + for num_slices in bench_op.num_slices(): + _ctx = bench_ctx.with_seq_length(seq_len).with_num_slices( + num_slices) + # Benchmark torch.mm as a roofline + seq_len_timers.append( + bench_torch_mm(_ctx, args.arg_pool_size, bench_op, + args.cuda_graph_nops)) + + # Benchmark bench_op + expand_fn_add_inputs = [ + None + ] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs + for add_input_arg in expand_fn_add_inputs: + seq_len_timers.append( + bench_optype(_ctx, args.arg_pool_size, bench_op, + args.cuda_graph_nops, add_input_arg, + args.test_correctness)) + + print_timers(seq_len_timers) + timers.extend(seq_len_timers) + + # Result stdout dump + print("== All Results ====") + print_timers(timers, args) + + if args.output_directory: + # Result file dump + od = Path(args.output_directory) + if not 
od.exists(): + od.mkdir() + + timestamp = int(time.time()) + pkl_file = od / f"lora_bench-{timestamp}.pkl" + print(f"Writing benchmarks to {pkl_file}") + with open(pkl_file, "wb") as f: + pickle.dump(timers, f) + + +def as_benchmark_contexts(hidden_sizes: List[int], lora_ranks: List[int], + args: argparse.Namespace) -> List[BenchmarkContext]: + + ctxs: List[BenchmarkContext] = [] + for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product( # noqa + args.batch_sizes, list(hidden_sizes), lora_ranks, args.num_loras, + args.sort_by_lora_id): + ctxs.append( + BenchmarkContext( + batch_size=batch_size, + hidden_size=hidden_size, + lora_rank=lora_rank, + num_loras=num_loras, + num_active_loras=args.num_active_loras + if args.num_active_loras else num_loras, + # To be filled based on the OpType to benchmark + seq_length=None, + sort_by_lora_id=sort_by_lora_id, + dtype=args.dtype, + # To be filled based on the OpType to benchmark + num_slices=None)) + + return ctxs + + +def run_list_bench(args: argparse.Namespace): + print(args) + + print("List bench :\n" + f" Hidden Sizes {args.hidden_sizes}" + f" LoRA Ranks {args.lora_ranks}") + + # Get all benchmarking contexts + bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args) + + run(args, bench_contexts) + + +def run_range_bench(args: argparse.Namespace): + print(args) + + hidden_sizes = list( + range(args.hidden_sizes_start, args.hidden_sizes_end + 1, + args.hidden_sizes_increment)) + lora_ranks = list( + range(args.lora_ranks_start, args.lora_ranks_end + 1, + args.lora_ranks_increment)) + + print("Range bench :\n" + f" Hidden Sizes {hidden_sizes}" + f" LoRA Ranks {lora_ranks}") + + # Get all benchmarking contexts + bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args) + + run(args, bench_contexts) + + +def run_model_bench(args: argparse.Namespace): + 
print(args) + + def hidden_sizes_from_model(model: str, tp_size: int) -> set[int]: + hidden_sizes = set() + for KN, tp_split_dim in WEIGHT_SHAPES[model]: + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + hidden_sizes.add(KN[1]) + return hidden_sizes + + # Get all hidden sizes + hidden_sizes: set[int] = set() + for model_name, tp_size in product(args.models, args.tp_sizes): + hidden_sizes = hidden_sizes.union( + hidden_sizes_from_model(model_name, tp_size)) + + print("Model bench :\n" + f" Hidden Sizes {hidden_sizes}" + f" LoRA Ranks {args.lora_ranks}") + + # Get all benchmarking contexts + bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args) + + run(args, bench_contexts) + + +if __name__ == '__main__': + + def to_torch_dtype(dt): + if dt == "torch.float16": + return torch.float16 + if dt == "torch.bfloat16": + return torch.bfloat16 + raise ValueError("unsupported dtype") + + def get_bool(s: str) -> bool: + return s.lower() in ['true', '1'] + + def add_common_command_args(p: argparse.ArgumentParser): + p.add_argument( + "--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['torch.float16', 'torch.bfloat16']") + + p.add_argument( + "--arg-pool-size", + type=int, + default=32, + help="Run profiles with a pool of input/output/meta tensors instead" + "of simply reusing the same tensors for all runs. A bigger arg-pool" + "mitigates hardware caching effects during benchmarking.") + + p.add_argument( + "--cuda-graph-nops", + type=int, + help=("when set profiling is done using cudagraph, " + "with the given number of operations in a graph." 
+ "Note that the measurement returned is the time " + "taken for N consecutive executions of the benchmarking " + "functions, where N is the value of this argument.")) + p.add_argument("--num-loras", + nargs="+", + type=int, + default=DEFAULT_NUM_LORAS) + p.add_argument("--num-active-loras", + type=int, + default=None, + help="Active LoRAs. When None, all LoRAs are active") + p.add_argument("--sort-by-lora-id", + nargs="+", + type=get_bool, + default=DEFAULT_SORT_BY_LORA_IDS) + p.add_argument("--op-types", + nargs="+", + type=OpType.from_str, + default=list(OpType)) + p.add_argument('--seq-lengths', + nargs="+", + type=int, + default=DEFAULT_SEQ_LENGTHS) + p.add_argument("--batch-sizes", + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + p.add_argument("--expand-fn-add-inputs", + nargs="+", + type=get_bool, + default=DEFAULT_EXPAND_FN_ADD_INPUTS) + p.add_argument( + '-o', + '--output-directory', + type=str, + help=("Output directory to store a the list of benchmarking" + "TMeasurement objects as a pickle file")) + + p.add_argument( + "--test-correctness", + action='store_true', + help=("When enabled, the benchmarking functions are tested" + "for correctness before the actual benchmarking")) + + parser = FlexibleArgumentParser( + description=f""" +Benchmark LoRA kernels: + {use_cuda_graph_recommendation()} + + list_bench example: + python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 + + model_bench example: + python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 
--cuda-graph-nops 32 + + range_bench example: + python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 + """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter) + + subparsers = parser.add_subparsers(dest="cmd", required=True) + + list_parser = subparsers.add_parser("list_bench") + list_parser.add_argument("--hidden-sizes", + nargs="+", + type=int, + default=DEFAULT_HIDDEN_SIZES) + list_parser.add_argument("--lora-ranks", + nargs="+", + type=int, + default=DEFAULT_LORA_RANKS) + add_common_command_args(list_parser) + list_parser.set_defaults(func=run_list_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument("--hidden-sizes-start", type=int, required=True) + range_parser.add_argument("--hidden-sizes-end", type=int, required=True) + range_parser.add_argument("--hidden-sizes-increment", + type=int, + required=True) + range_parser.add_argument("--lora-ranks-start", type=int, required=True) + range_parser.add_argument("--lora-ranks-end", type=int, required=True) + range_parser.add_argument("--lora-ranks-increment", + type=int, + required=True) + add_common_command_args(range_parser) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument("--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys()) + model_parser.add_argument("--tp-sizes", + nargs="+", + type=int, + default=DEFAULT_TP_SIZES) + model_parser.add_argument("--lora-ranks", + nargs="+", + type=int, + default=DEFAULT_LORA_RANKS) + add_common_command_args(model_parser) + 
model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 8f538c21f7..1fa0da75c7 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,6 +1,7 @@ import argparse import time from datetime import datetime +from itertools import product from typing import Any, Dict, List, Tuple, TypedDict import ray @@ -13,6 +14,9 @@ from vllm.model_executor.layers.fused_moe.fused_moe import * from vllm.platforms import current_platform from vllm.utils import FlexibleArgumentParser +FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm( +) else torch.float8_e4m3fn + class BenchmarkConfig(TypedDict): BLOCK_SIZE_M: int @@ -80,8 +84,8 @@ def benchmark_config( a1_scale = torch.randn(1, dtype=torch.float32) a2_scale = torch.randn(1, dtype=torch.float32) - w1 = w1.to(torch.float8_e4m3fn) - w2 = w2.to(torch.float8_e4m3fn) + w1 = w1.to(FP8_DTYPE) + w2 = w2.to(FP8_DTYPE) input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32) @@ -141,28 +145,172 @@ def benchmark_config( return avg -def get_configs_compute_bound() -> List[Dict[str, int]]: - # Reduced search space for faster tuning. - # TODO(woosuk): Increase the search space and use a performance model to - # prune the search space. 
+def get_rocm_tuning_space(use_fp16): + block_mn_range = [16, 32, 64, 128, 256] + block_k_range = [16, 32, 64, 128, 256] + if not use_fp16: + block_k_range.remove(16) # BLOCK_K=16 not supported for fp8 + num_warps_range = [1, 2, 4, 8] + group_m_range = [1, 4, 8, 16, 32] + num_stage_range = [2] + waves_per_eu_range = [0] + matrix_instr_nonkdim_range = [16, 32] if use_fp16 else [] + kpack_range = [1, 2] if use_fp16 else [] + + param_ranges = { + "BLOCK_SIZE_M": block_mn_range, + "BLOCK_SIZE_N": block_mn_range, + "BLOCK_SIZE_K": block_k_range, + "GROUP_SIZE_M": group_m_range, + "num_warps": num_warps_range, + "num_stages": num_stage_range, + "waves_per_eu": waves_per_eu_range, + } + if use_fp16: + param_ranges["matrix_instr_nonkdim"] = matrix_instr_nonkdim_range + param_ranges["kpack"] = kpack_range + + return param_ranges + + +def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]: configs: List[BenchmarkConfig] = [] - for num_stages in [2, 3, 4, 5]: - for block_m in [16, 32, 64, 128, 256]: - for block_k in [64, 128, 256]: - for block_n in [32, 64, 128, 256]: - for num_warps in [4, 8]: - for group_size in [1, 16, 32, 64]: - configs.append({ - "BLOCK_SIZE_M": block_m, - "BLOCK_SIZE_N": block_n, - "BLOCK_SIZE_K": block_k, - "GROUP_SIZE_M": group_size, - "num_warps": num_warps, - "num_stages": num_stages, - }) + + if current_platform.is_rocm(): + param_ranges = get_rocm_tuning_space(use_fp16) + else: + # Reduced search space for faster tuning. + # TODO(woosuk): Increase the search space and use a performance model to + # prune the search space. 
+ block_m_range = [16, 32, 64, 128, 256] + block_n_range = [32, 64, 128, 256] + block_k_range = [64, 128, 256] + num_warps_range = [4, 8] + group_m_range = [1, 16, 32, 64] + num_stage_range = [2, 3, 4, 5] + + param_ranges = { + "BLOCK_SIZE_M": block_m_range, + "BLOCK_SIZE_N": block_n_range, + "BLOCK_SIZE_K": block_k_range, + "GROUP_SIZE_M": group_m_range, + "num_warps": num_warps_range, + "num_stages": num_stage_range, + } + + keys, values = zip(*param_ranges.items()) + for config_values in product(*values): + config = dict(zip(keys, config_values)) + configs.append(config) return configs +def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size, + search_space, is_fp16): + N1, K1 = shard_intermediate_size, hidden_size + N2, K2 = hidden_size, shard_intermediate_size // 2 + pruned_space_1 = prune_rocm_configs(num_tokens * 2, N1, K1, search_space, + is_fp16) + pruned_space_2 = prune_rocm_configs(num_tokens * 2, N2, K2, search_space, + is_fp16) + search_space = merge_unique_dicts(pruned_space_1, pruned_space_2) + return search_space + + +# The following code is inspired by ROCm/Triton GEMM tuning script: +# https://github.com/ROCm/triton/blob/triton-mlir/scripts/amd/gemm/tune_gemm.py#L89 +def prune_rocm_configs(M, N, K, configs, is_fp16=True): + pruned_configs = [] + elemBytes_a = 2 if is_fp16 else 1 + elemBytes_b = 2 if is_fp16 else 1 + + mfma = 16 if M < 32 or N < 32 else 32 + + # TODO (zhanglx): figure out the boundary between large and small gemms + large_gemm = False + if M >= 2048 and N >= 2048: + large_gemm = True + + for config in configs: + BLOCK_SIZE_M = config.get("BLOCK_SIZE_M") + BLOCK_SIZE_N = config.get("BLOCK_SIZE_N") + BLOCK_SIZE_K = config.get("BLOCK_SIZE_K") + num_warps = config.get("num_warps") + + if is_fp16: + matrix_instr_nonkdim = config.get("matrix_instr_nonkdim") + if matrix_instr_nonkdim > mfma: + continue + if mfma == 4 and BLOCK_SIZE_K < 64: + continue + # some layouts could not work properly in case + # number elements 
per thread is less 1 + if BLOCK_SIZE_M * BLOCK_SIZE_N < 64: + continue + SPLIT_K = config.get("SPLIT_K", 1) + GROUP_M = config.get("GROUP_SIZE_M") + if is_fp16: + if (matrix_instr_nonkdim > BLOCK_SIZE_M + or matrix_instr_nonkdim > BLOCK_SIZE_N): + continue + if (matrix_instr_nonkdim >= M + and matrix_instr_nonkdim != BLOCK_SIZE_M): + continue + if (matrix_instr_nonkdim >= N + and matrix_instr_nonkdim != BLOCK_SIZE_N): + continue + # Skip BLOCK_SIZE that is too large compare to M/N + # unless BLOCK_SIZE is already small enough + if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16: + continue + if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16: + continue + # skip large split_k when not necessary + if SPLIT_K != 1 and not need_split_k(M, N, K): + continue + # skip split_k that leads to EVEN_K = false + leap = SPLIT_K * BLOCK_SIZE_K + modv = K % leap + if modv != 0: + continue + # skip large GROUP_M + if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1: + continue + # out of shared memory resource + # TODO (zhanglx): This does not consider the LDS usage in the epilogue + LDS = (BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a + + BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b) + if LDS > 65536: + continue + # Skip small block sizes and num_warps for large gemm + # For fp16 and f8, we want to only use BLOCK_SIZE >= 64 + if large_gemm: + if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64: + continue + if BLOCK_SIZE_K < 64: + continue + if num_warps < 4: + continue + + pruned_configs.append(config) + + return pruned_configs + + +def need_split_k(SIZE_M, SIZE_N, SIZE_K): + return (SIZE_M < 64 or SIZE_N < 64) and SIZE_K > 1024 + + +def merge_unique_dicts(list1, list2): + result = [] + combined_list = list1.copy() + combined_list.extend(list2) + for dictionary in combined_list: + if dictionary not in result: + result.append(dictionary) + return result + + @ray.remote(num_gpus=1) class BenchmarkWorker: @@ -170,6 +318,10 @@ class BenchmarkWorker: torch.set_default_device("cuda") 
current_platform.seed_everything(seed) self.seed = seed + # Get the device ID to allocate tensors and kernels + # on the respective GPU. This is required for Ray to work + # correctly with multi-GPU tuning on the ROCm platform. + self.device_id = int(ray.get_gpu_ids()[0]) def benchmark( self, @@ -217,25 +369,33 @@ class BenchmarkWorker: ) -> Dict[str, int]: best_config = None best_time = float("inf") - for config in tqdm(search_space): - try: - kernel_time = benchmark_config(config, - num_tokens, - num_experts, - shard_intermediate_size, - hidden_size, - topk, - dtype, - use_fp8_w8a8, - use_int8_w8a16, - num_iters=10) - except triton.runtime.autotuner.OutOfResources: - # Some configurations may be invalid and fail to compile. - continue + if current_platform.is_rocm(): + is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) + search_space = prune_rocm_search_space(num_tokens, + shard_intermediate_size, + hidden_size, search_space, + is_fp16) - if kernel_time < best_time: - best_time = kernel_time - best_config = config + with torch.cuda.device(self.device_id): + for config in tqdm(search_space): + try: + kernel_time = benchmark_config(config, + num_tokens, + num_experts, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=20) + except triton.runtime.autotuner.OutOfResources: + # Some configurations may be invalid and fail to compile. 
+ continue + + if kernel_time < best_time: + best_time = kernel_time + best_config = config now = datetime.now() print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}") assert best_config is not None @@ -244,12 +404,27 @@ class BenchmarkWorker: def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: return { - "BLOCK_SIZE_M": config["BLOCK_SIZE_M"], - "BLOCK_SIZE_N": config["BLOCK_SIZE_N"], - "BLOCK_SIZE_K": config["BLOCK_SIZE_K"], - "GROUP_SIZE_M": config["GROUP_SIZE_M"], - "num_warps": config["num_warps"], - "num_stages": config["num_stages"], + "BLOCK_SIZE_M": + config["BLOCK_SIZE_M"], + "BLOCK_SIZE_N": + config["BLOCK_SIZE_N"], + "BLOCK_SIZE_K": + config["BLOCK_SIZE_K"], + "GROUP_SIZE_M": + config["GROUP_SIZE_M"], + "num_warps": + config["num_warps"], + "num_stages": + config["num_stages"], + **({ + "waves_per_eu": config["waves_per_eu"] + } if "waves_per_eu" in config else {}), + **({ + "matrix_instr_nonkdim": config["matrix_instr_nonkdim"] + } if "matrix_instr_nonkdim" in config else {}), + **({ + "kpack": config["kpack"] + } if "kpack" in config else {}), } @@ -294,7 +469,7 @@ def main(args: argparse.Namespace): shard_intermediate_size = 2 * intermediate_size // args.tp_size hidden_size = config.hidden_size - dtype = config.torch_dtype + dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" @@ -322,7 +497,8 @@ def main(args: argparse.Namespace): return ray.get(outputs) if args.tune: - search_space = get_configs_compute_bound() + is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) + search_space = get_configs_compute_bound(is_fp16) print(f"Start tuning over {len(search_space)} configurations...") start = time.time() diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 14eef00b85..219013a381 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ 
b/benchmarks/kernels/benchmark_paged_attention.py @@ -98,7 +98,9 @@ def main( start_time = time.perf_counter() # Using default kv_scale - k_scale = v_scale = 1.0 + k_scale = v_scale = torch.tensor(1.0, + dtype=torch.float32, + device=device) for _ in range(num_iters): if version == "v1": diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py new file mode 100644 index 0000000000..fee877b6f7 --- /dev/null +++ b/benchmarks/kernels/utils.py @@ -0,0 +1,210 @@ +import dataclasses +from typing import Any, Callable, Iterable, Optional + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement + + +@dataclasses.dataclass +class CudaGraphBenchParams: + num_ops_in_cuda_graph: int + + +@dataclasses.dataclass +class ArgPool: + """ + When some argument of the benchmarking function is annotated with this type, + the benchmarking class (BenchMM) will collapse the argument to a pick a + single value from the given list of values, during function invocation. + For every invocation during a benchmarking run, it will choose a + different value from the list. 
+ """ + values: Iterable[Any] + + def __getitem__(self, index): + return self.values[index] + + +class Bench: + + class ArgsIterator: + + def __init__(self, args_list, kwargs_list): + assert len(args_list) == len(kwargs_list) + self.args_list = args_list + self.kwargs_list = kwargs_list + self.n = len(self.args_list) + self.idx = 0 + + def __next__(self): + while True: + yield (self.args_list[self.idx], self.kwargs_list[self.idx]) + self.idx += 1 + self.idx = self.idx % self.n + + def reset(self): + self.idx = 0 + + @property + def n_args(self): + return self.n + + def __init__(self, cuda_graph_params: Optional[CudaGraphBenchParams], + label: str, sub_label: str, description: str, fn: Callable, + *args, **kwargs): + + self.cuda_graph_params = cuda_graph_params + self.use_cuda_graph = self.cuda_graph_params is not None + self.label = label + self.sub_label = sub_label + self.description = description + self.fn = fn + + # Process args + self._args = args + self._kwargs = kwargs + self.args_list, self.kwargs_list = self.collapse_argpool( + *args, **kwargs) + self.args_iterator = self.ArgsIterator(self.args_list, + self.kwargs_list) + + # Cudagraph runner + self.g = None + if self.use_cuda_graph: + self.g = self.get_cuda_graph_runner() + + # benchmark run params + self.min_run_time = 1 + + def collapse_argpool(self, *args, **kwargs): + argpool_args = [arg for arg in args if isinstance(arg, ArgPool)] + [ + arg for arg in kwargs.values() if isinstance(arg, ArgPool) + ] + if len(argpool_args) == 0: + return [args], [kwargs] + + # Make sure all argpools are of the same size + argpool_size = len(argpool_args[0].values) + assert all([argpool_size == len(arg.values) for arg in argpool_args]) + + # create copies of the args + args_list = [] + kwargs_list = [] + for _ in range(argpool_size): + args_list.append(args) + kwargs_list.append(kwargs.copy()) + + for i in range(argpool_size): + # collapse args; Just pick the ith value + args_list[i] = tuple([ + arg[i] if 
isinstance(arg, ArgPool) else arg + for arg in args_list[i] + ]) + + # collapse kwargs + kwargs_i = kwargs_list[i] + arg_pool_keys = [ + k for k, v in kwargs_i.items() if isinstance(v, ArgPool) + ] + for k in arg_pool_keys: + # again just pick the ith value + kwargs_i[k] = kwargs_i[k][i] + kwargs_list[i] = kwargs_i + + return args_list, kwargs_list + + def get_cuda_graph_runner(self): + assert self.use_cuda_graph + assert self.args_iterator is not None + + num_graph_ops = self.cuda_graph_params.num_ops_in_cuda_graph + + # warmup + args_it = self.args_iterator.__next__() + for _ in range(2): + args, kwargs = next(args_it) + self.fn(*args, **kwargs) + + self.args_iterator.reset() + args_it = self.args_iterator.__next__() + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + for _ in range(num_graph_ops): + args, kwargs = next(args_it) + self.fn(*args, **kwargs) + return g + + def run_cudagrah(self) -> TMeasurement: + assert self.use_cuda_graph + globals = {'g': self.g} + + return TBenchmark.Timer( + stmt="g.replay()", + globals=globals, + label=( + f"{self.label}" + f" | cugraph {self.cuda_graph_params.num_ops_in_cuda_graph} ops" + ), + sub_label=self.sub_label, + description=self.description, + ).blocked_autorange(min_run_time=self.min_run_time) + + def run_eager(self) -> TMeasurement: + setup = None + stmt = None + globals = None + + has_arg_pool = self.args_iterator.n_args > 1 + if has_arg_pool: + setup = ''' + args_iterator.reset() + args_it = args_iterator.__next__() + ''' + stmt = ''' + args, kwargs = next(args_it) + fn(*args, **kwargs) + ''' + globals = {'fn': self.fn, 'args_iterator': self.args_iterator} + else: + # no arg pool. 
Just use the args and kwargs directly + self.args_iterator.reset() + args_it = self.args_iterator.__next__() + args, kwargs = next(args_it) + + setup = "" + stmt = ''' + fn(*args, **kwargs) + ''' + globals = {'fn': self.fn, 'args': args, 'kwargs': kwargs} + + return TBenchmark.Timer( + stmt=stmt, + setup=setup, + globals=globals, + label=self.label, + sub_label=self.sub_label, + description=self.description, + ).blocked_autorange(min_run_time=self.min_run_time) + + def run(self) -> TMeasurement: + timer = None + if self.use_cuda_graph: # noqa SIM108 + timer = self.run_cudagrah() + else: + timer = self.run_eager() + if not timer.meets_confidence() or timer.has_warnings: + print("Doesn't meet confidence - re-running bench ...") + return self.run() + return timer + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + if exc_type: + print(f"exc type {exc_type}") + print(f"exc value {exc_value}") + print(f"exc traceback {traceback}") diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 68f7ca1af0..714abca2a5 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -4,6 +4,11 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + set(MACOSX_FOUND TRUE) +endif() + + # # Define environment variables for special configurations # @@ -13,6 +18,9 @@ endif() include_directories("${CMAKE_SOURCE_DIR}/csrc") + +set (ENABLE_NUMA TRUE) + # # Check the compile flags # @@ -22,18 +30,28 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") "-mf16c" ) endif() -list(APPEND CXX_COMPILE_FLAGS - "-fopenmp" - "-DVLLM_CPU_EXTENSION") -execute_process(COMMAND cat /proc/cpuinfo - RESULT_VARIABLE CPUINFO_RET - OUTPUT_VARIABLE CPUINFO) - -if (NOT CPUINFO_RET EQUAL 0) - message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo") +if(MACOSX_FOUND) + list(APPEND CXX_COMPILE_FLAGS + "-Xpreprocessor" + "-fopenmp" + 
"-DVLLM_CPU_EXTENSION") +else() + list(APPEND CXX_COMPILE_FLAGS + "-fopenmp" + "-DVLLM_CPU_EXTENSION") endif() +if (NOT MACOSX_FOUND) + execute_process(COMMAND cat /proc/cpuinfo + RESULT_VARIABLE CPUINFO_RET + OUTPUT_VARIABLE CPUINFO) + if (NOT CPUINFO_RET EQUAL 0) + message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo") + endif() +endif() + + function (find_isa CPUINFO TARGET OUT) string(FIND ${CPUINFO} ${TARGET} ISA_FOUND) if(NOT ISA_FOUND EQUAL -1) @@ -54,12 +72,17 @@ endfunction() is_avx512_disabled(AVX512_DISABLED) -find_isa(${CPUINFO} "avx2" AVX2_FOUND) -find_isa(${CPUINFO} "avx512f" AVX512_FOUND) -find_isa(${CPUINFO} "POWER10" POWER10_FOUND) -find_isa(${CPUINFO} "POWER9" POWER9_FOUND) -find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support -find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support +if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + set(APPLE_SILICON_FOUND TRUE) +else() + find_isa(${CPUINFO} "avx2" AVX2_FOUND) + find_isa(${CPUINFO} "avx512f" AVX512_FOUND) + find_isa(${CPUINFO} "POWER10" POWER10_FOUND) + find_isa(${CPUINFO} "POWER9" POWER9_FOUND) + find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support + find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support +endif() + if (AVX512_FOUND AND NOT AVX512_DISABLED) list(APPEND CXX_COMPILE_FLAGS @@ -103,6 +126,9 @@ elseif (ASIMD_FOUND) set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16") endif() list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS}) +elseif(APPLE_SILICON_FOUND) + message(STATUS "Apple Silicon Detected") + set(ENABLE_NUMA OFF) else() message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.") endif() @@ -139,7 +165,12 @@ endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") -list(APPEND LIBS numa) +if(ENABLE_NUMA) + list(APPEND LIBS numa) +else() + message(STATUS "NUMA is disabled") + add_compile_definitions(-DVLLM_NUMA_DISABLED) +endif() # # _C extension 
diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 40430dae10..15b09395a8 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -58,8 +58,8 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) # set(SRCS ${ORIG_SRCS}) set(CXX_SRCS ${ORIG_SRCS}) - list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$") - list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$") + list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)|(hip)$") + list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)|(hip)$") # # Generate ROCm/HIP source file names from CUDA file names. diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 839dc36ba4..88275dbdd8 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -9,8 +9,16 @@ namespace vllm { +template +__device__ __forceinline__ scalar_t compute(const scalar_t& x, + const scalar_t& y) { + return act_first ? ACT_FN(x) * y : x * ACT_FN(y); +} // Activation and gating kernel template. -template + +template __global__ void act_and_mul_kernel( scalar_t* __restrict__ out, // [..., d] const scalar_t* __restrict__ input, // [..., 2, d] @@ -19,7 +27,7 @@ __global__ void act_and_mul_kernel( for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); - out[token_idx * d + idx] = ACT_FN(x) * y; + out[token_idx * d + idx] = compute(x, y); } } @@ -55,7 +63,9 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) { } // namespace vllm // Launch activation and gating kernel. -#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \ +// Use ACT_FIRST (bool) indicating whether to apply the activation function +// first. 
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, ACT_FIRST) \ int d = input.size(-1) / 2; \ int64_t num_tokens = input.numel() / input.size(-1); \ dim3 grid(num_tokens); \ @@ -64,7 +74,7 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) { const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ VLLM_DISPATCH_FLOATING_TYPES( \ input.scalar_type(), "act_and_mul_kernel", [&] { \ - vllm::act_and_mul_kernel> \ + vllm::act_and_mul_kernel, ACT_FIRST> \ <<>>(out.data_ptr(), \ input.data_ptr(), d); \ }); @@ -72,19 +82,27 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) { void silu_and_mul(torch::Tensor& out, // [..., d] torch::Tensor& input) // [..., 2 * d] { - LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, true); +} + +void mul_and_silu(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] +{ + // The difference between mul_and_silu and silu_and_mul is that mul_and_silu + // applies the silu to the latter half of the input. 
+ LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, false); } void gelu_and_mul(torch::Tensor& out, // [..., d] torch::Tensor& input) // [..., 2 * d] { - LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, true); } void gelu_tanh_and_mul(torch::Tensor& out, // [..., d] torch::Tensor& input) // [..., 2 * d] { - LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel, true); } namespace vllm { diff --git a/csrc/attention/attention_kernels.cuh b/csrc/attention/attention_kernels.cuh index 563e1438f0..eb216dc8ba 100644 --- a/csrc/attention/attention_kernels.cuh +++ b/csrc/attention/attention_kernels.cuh @@ -105,7 +105,7 @@ __device__ void paged_attention_kernel( const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, const int kv_head_stride, - const float k_scale, const float v_scale, const int tp_rank, + const float* k_scale, const float* v_scale, const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { const int seq_idx = blockIdx.y; @@ -285,7 +285,7 @@ __device__ void paged_attention_kernel( Quant_vec k_vec_quant = *reinterpret_cast( k_ptr + offset1 * BLOCK_SIZE * x + offset2); k_vecs[j] = fp8::scaled_convert( - k_vec_quant, k_scale); + k_vec_quant, *k_scale); } } @@ -415,7 +415,7 @@ __device__ void paged_attention_kernel( *reinterpret_cast(v_ptr + offset); // Vector conversion from V_quant_vec to V_vec. 
v_vec = fp8::scaled_convert(v_quant_vec, - v_scale); + *v_scale); } if (block_idx == num_seq_blocks - 1) { // NOTE(woosuk): When v_vec contains the tokens that are out of the @@ -513,7 +513,7 @@ __global__ void paged_attention_v1_kernel( const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, const int kv_head_stride, - const float k_scale, const float v_scale, const int tp_rank, + const float* k_scale, const float* v_scale, const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { paged_attention_kernel& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { + const std::optional& alibi_slopes, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -80,6 +80,8 @@ void paged_attention_v1_launcher( CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); int* block_tables_ptr = block_tables.data_ptr(); int* seq_lens_ptr = seq_lens.data_ptr(); + const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); + const float* v_scale_ptr = reinterpret_cast(v_scale.data_ptr()); constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; int padded_max_seq_len = @@ -176,9 +178,10 @@ void paged_attention_v1( torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& seq_lens, // [num_seqs] int64_t block_size, int64_t max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const 
int64_t tp_rank, const int64_t blocksparse_local_blocks, + const std::optional& alibi_slopes, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { const bool is_block_sparse = (blocksparse_vert_stride > 1); diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu index c457bdb890..9935359e02 100644 --- a/csrc/attention/paged_attention_v2.cu +++ b/csrc/attention/paged_attention_v2.cu @@ -37,7 +37,7 @@ exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \ value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ - kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \ + kv_block_stride, kv_head_stride, k_scale_ptr, v_scale_ptr, tp_rank, \ blocksparse_local_blocks, blocksparse_vert_stride, \ blocksparse_block_size, blocksparse_head_sliding_step); \ vllm::paged_attention_v2_reduce_kernel& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { + const std::optional& alibi_slopes, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -84,6 +84,8 @@ void paged_attention_v2_launcher( CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); int* block_tables_ptr = block_tables.data_ptr(); int* seq_lens_ptr = seq_lens.data_ptr(); + const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); + const 
float* v_scale_ptr = reinterpret_cast(v_scale.data_ptr()); constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); @@ -187,9 +189,10 @@ void paged_attention_v2( torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& seq_lens, // [num_seqs] int64_t block_size, int64_t max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, + const std::optional& alibi_slopes, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { const bool is_block_sparse = (blocksparse_vert_stride > 1); diff --git a/csrc/cache.h b/csrc/cache.h index 1623c2ff8a..dcfea6a823 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -18,15 +18,15 @@ void copy_blocks(std::vector const& key_caches, void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype, const double k_scale, - const double v_scale); + const std::string& kv_cache_dtype, + torch::Tensor& k_scale, torch::Tensor& v_scale); void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, const std::string& kv_cache_dtype, - const double k_scale, const double v_scale); + torch::Tensor& k_scale, torch::Tensor& v_scale); void reshape_and_cache_flash_full_cuda( torch::Tensor& tokenshape, diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 45675a1b09..a63b6f6b3c 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -159,8 +159,8 @@ __global__ void reshape_and_cache_kernel( // 
block_size] const int64_t* __restrict__ slot_mapping, // [num_tokens] const int key_stride, const int value_stride, const int num_heads, - const int head_size, const int block_size, const int x, const float k_scale, - const float v_scale) { + const int head_size, const int block_size, const int x, + const float* k_scale, const float* v_scale) { const int64_t token_idx = blockIdx.x; const int64_t slot_idx = slot_mapping[token_idx]; if (slot_idx < 0) { @@ -196,9 +196,9 @@ __global__ void reshape_and_cache_kernel( value_cache[tgt_value_idx] = tgt_value; } else { key_cache[tgt_key_idx] = - fp8::scaled_convert(tgt_key, k_scale); + fp8::scaled_convert(tgt_key, *k_scale); value_cache[tgt_value_idx] = - fp8::scaled_convert(tgt_value, v_scale); + fp8::scaled_convert(tgt_value, *v_scale); } } } @@ -214,7 +214,7 @@ __global__ void reshape_and_cache_flash_kernel( const int64_t* __restrict__ slot_mapping, // [num_tokens] const int block_stride, const int key_stride, const int value_stride, const int num_heads, const int head_size, const int block_size, - const float k_scale, const float v_scale) { + const float* k_scale, const float* v_scale) { const int64_t token_idx = blockIdx.x; const int64_t slot_idx = slot_mapping[token_idx]; // NOTE: slot_idx can be -1 if the token is padded @@ -239,9 +239,9 @@ __global__ void reshape_and_cache_flash_kernel( value_cache[tgt_key_value_idx] = tgt_value; } else { key_cache[tgt_key_value_idx] = - fp8::scaled_convert(tgt_key, k_scale); + fp8::scaled_convert(tgt_key, *k_scale); value_cache[tgt_key_value_idx] = - fp8::scaled_convert(tgt_value, v_scale); + fp8::scaled_convert(tgt_value, *v_scale); } } } @@ -304,7 +304,9 @@ __global__ void reshape_and_cache_flash_full_cuda_kernel( reinterpret_cast(key_cache.data_ptr()), \ reinterpret_cast(value_cache.data_ptr()), \ slot_mapping.data_ptr(), key_stride, value_stride, \ - num_heads, head_size, block_size, x, k_scale, v_scale); + num_heads, head_size, block_size, x, \ + 
reinterpret_cast(k_scale.data_ptr()), \ + reinterpret_cast(v_scale.data_ptr())); void reshape_and_cache( torch::Tensor& key, // [num_tokens, num_heads, head_size] @@ -314,8 +316,8 @@ void reshape_and_cache( torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size] torch::Tensor& slot_mapping, // [num_tokens] - const std::string& kv_cache_dtype, const double k_scale, - const double v_scale) { + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale) { int num_tokens = key.size(0); int num_heads = key.size(1); int head_size = key.size(2); @@ -345,7 +347,9 @@ void reshape_and_cache( reinterpret_cast(key_cache.data_ptr()), \ reinterpret_cast(value_cache.data_ptr()), \ slot_mapping.data_ptr(), block_stride, key_stride, \ - value_stride, num_heads, head_size, block_size, k_scale, v_scale); + value_stride, num_heads, head_size, block_size, \ + reinterpret_cast(k_scale.data_ptr()), \ + reinterpret_cast(v_scale.data_ptr())); void reshape_and_cache_flash( torch::Tensor& key, // [num_tokens, num_heads, head_size] @@ -354,8 +358,8 @@ void reshape_and_cache_flash( torch::Tensor& value_cache, // [num_blocks, block_size, num_heads, head_size] torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens] - const std::string& kv_cache_dtype, const double k_scale, - const double v_scale) { + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale) { // NOTE(woosuk): In vLLM V1, key.size(0) can be different from // slot_mapping.size(0) because of padding for CUDA graphs. 
// In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp index 408e736d5b..c2ae554c9f 100644 --- a/csrc/core/scalar_type.hpp +++ b/csrc/core/scalar_type.hpp @@ -32,7 +32,7 @@ class ScalarType { signed_(signed_), bias(bias), finite_values_only(finite_values_only), - nan_repr(nan_repr){}; + nan_repr(nan_repr) {}; static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) { return ScalarType(0, size_bits - 1, true, bias); diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index e21832ba75..b9764056e8 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -386,7 +386,7 @@ void paged_attention_v1_impl_launcher( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes) { + const std::optional& alibi_slopes) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -459,12 +459,12 @@ void paged_attention_v1( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, + int64_t max_seq_len, const std::optional& alibi_slopes, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { - TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); TORCH_CHECK(blocksparse_vert_stride <= 1, "CPU backend does not 
support blocksparse attention yet."); VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl", @@ -702,7 +702,7 @@ void paged_attention_v2_impl_launcher( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, - int max_seq_len, const c10::optional& alibi_slopes) { + int max_seq_len, const std::optional& alibi_slopes) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -781,12 +781,12 @@ void paged_attention_v2( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, + int64_t max_seq_len, const std::optional& alibi_slopes, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { - TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); TORCH_CHECK(blocksparse_vert_stride <= 1, "CPU backend does not support blocksparse attention yet."); VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl", diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp index 31d454328b..e3809acad7 100644 --- a/csrc/cpu/cache.cpp +++ b/csrc/cpu/cache.cpp @@ -107,10 +107,8 @@ void copy_blocks(std::vector const& key_caches, void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype, double 
k_scale, - double v_scale) { - TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); - + const std::string& kv_cache_dtype, + torch::Tensor& k_scale, torch::Tensor& v_scale) { int num_tokens = key.size(0); int num_heads = key.size(1); int head_size = key.size(2); diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index 28db047974..a718151061 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -2,13 +2,13 @@ #define CPU_TYPES_HPP #if defined(__x86_64__) - //x86 implementation + // x86 implementation #include "cpu_types_x86.hpp" #elif defined(__POWER9_VECTOR__) - //ppc implementation + // ppc implementation #include "cpu_types_vsx.hpp" #elif defined(__aarch64__) - //arm implementation + // arm implementation #include "cpu_types_arm.hpp" #else #warning "unsupported vLLM cpu implementation" diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp index 73e0f8cb2e..990e99f2fc 100644 --- a/csrc/cpu/cpu_types_arm.hpp +++ b/csrc/cpu/cpu_types_arm.hpp @@ -1,48 +1,50 @@ #include -#include +#include #include namespace vec_op { #ifdef ARM_BF16_SUPPORT - #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) #else - #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) #endif -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - std::cout << #NAME << " invoked." << std::endl; -#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." << std::endl; #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { - template - constexpr void unroll_loop_item(std::integer_sequence, F &&f) { - (f(std::integral_constant{}), ...); - }; -}; +template +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { + (f(std::integral_constant{}), ...); +}; +}; // namespace template >> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); } -template struct Vec { +template +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }; }; @@ -54,53 +56,106 @@ struct FP16Vec8 : public Vec { float16x8_t reg; - explicit FP16Vec8(const void *ptr) - : reg(vld1q_f16(static_cast(ptr))) {}; + explicit FP16Vec8(const void* ptr) + : reg(vld1q_f16(static_cast(ptr))) {}; - explicit FP16Vec8(const FP32Vec8 &); + explicit FP16Vec8(const FP32Vec8&); - void save(void *ptr) const { - vst1q_f16(static_cast<__fp16 *>(ptr), reg); - } + void save(void* ptr) const { vst1q_f16(static_cast<__fp16*>(ptr), reg); } }; struct FP16Vec16 : public Vec { - constexpr static int VEC_ELEM_NUM = 16; - - float16x8x2_t reg; - - explicit FP16Vec16(const void *ptr) { - reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); - reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); - } - - explicit FP16Vec16(const FP32Vec16& vec); - - void save(void *ptr) const { - 
vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); - vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); - } - - void save(void *ptr, const int elem_num) const { - int full_blocks = elem_num / 8; - int remainder = elem_num % 8; - - if (full_blocks > 0) { - vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); - if (full_blocks > 1) { - vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); - } - } - - if (remainder > 0) { - float16x8_t temp = reg.val[full_blocks]; - for (int i = 0; i < remainder; ++i) { - reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i); - } - } - } -}; + constexpr static int VEC_ELEM_NUM = 16; + float16x8x2_t reg; + + explicit FP16Vec16(const void* ptr) { + reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); + reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); + } + + explicit FP16Vec16(const FP32Vec16& vec); + + void save(void* ptr) const { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } + + void save(void* ptr, const int elem_num) const { + int full_blocks = elem_num / 8; + int remainder = elem_num % 8; + + if (full_blocks > 0) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + if (full_blocks > 1) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } + } + + // Note: below is the unrolled version of the following code: + // + // for (int i = 0; i < remainder; ++i) { + // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = + // vgetq_lane_f16(temp, i); + // } + // + // For macOS build (Clang), the arm/neon intrinsics function + // `vgetq_lane_f16` needs the parameter `i` to be constant at compile + // time. 
+ + if (remainder > 0) { + float16x8_t temp = reg.val[full_blocks]; + __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); + switch (remainder) { + case 1: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + break; + case 2: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + break; + case 3: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + break; + case 4: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + break; + case 5: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + break; + case 6: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + break; + case 7: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); + break; + + default: + break; + } + } + } +}; #ifdef ARM_BF16_SUPPORT struct 
BF16Vec8 : public Vec { @@ -108,16 +163,17 @@ struct BF16Vec8 : public Vec { bfloat16x8_t reg; - explicit BF16Vec8(const void *ptr) - : reg(*reinterpret_cast(ptr)) {}; + explicit BF16Vec8(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; explicit BF16Vec8(bfloat16x8_t data) : reg(data) {}; - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - explicit BF16Vec8(float32x4x2_t v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; + explicit BF16Vec8(float32x4x2_t v) + : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } }; struct BF16Vec16 : public Vec { @@ -125,19 +181,18 @@ struct BF16Vec16 : public Vec { bfloat16x8x2_t reg; - explicit BF16Vec16(const void *ptr) - : reg(*reinterpret_cast(ptr)) {}; + explicit BF16Vec16(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {}; - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - explicit BF16Vec16(float32x4x4_t v) : reg({ - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3]) - }){}; + explicit BF16Vec16(float32x4x4_t v) + : reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3])}) {}; - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; }; }; struct BF16Vec32 : public Vec { @@ -145,19 +200,15 @@ struct BF16Vec32 : public Vec { bfloat16x8x4_t reg; - explicit BF16Vec32(const void *ptr) - : reg(*reinterpret_cast(ptr)) {}; + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {}; - explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ - vec8_data.reg, - vec8_data.reg, - 
vec8_data.reg, - vec8_data.reg - }) {}; + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {}; - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; }; }; #endif @@ -175,11 +226,11 @@ struct FP32Vec4 : public Vec { explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {}; - explicit FP32Vec4(const float *ptr) : reg(vld1q_f32(ptr)) {}; + explicit FP32Vec4(const float* ptr) : reg(vld1q_f32(ptr)) {}; explicit FP32Vec4(float32x4_t data) : reg(data) {}; - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}; + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {}; }; struct FP32Vec8 : public Vec { @@ -195,32 +246,37 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {}; - explicit FP32Vec8(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; + explicit FP32Vec8(const float* ptr) + : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; explicit FP32Vec8(float32x4x2_t data) : reg(data) {}; - explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}; + explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {}; - explicit FP32Vec8(const FP16Vec8 &v) { - reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); - reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); - }; + explicit FP32Vec8(const FP16Vec8& v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); + }; - explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; + explicit FP32Vec8(float16x8_t v) + : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; - #ifdef ARM_BF16_SUPPORT +#ifdef ARM_BF16_SUPPORT - explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; + explicit FP32Vec8(bfloat16x8_t v) + : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; - explicit FP32Vec8(const 
BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; + explicit FP32Vec8(const BF16Vec8& v) + : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; - #endif +#endif float reduce_sum() const { AliasReg ar; ar.reg = reg; float answer = 0; - unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; }); + unroll_loop( + [&answer, &ar](int i) { answer += ar.values[i]; }); return answer; } @@ -267,10 +323,14 @@ struct FP32Vec8 : public Vec { AliasReg ar; ar.reg = reg; - float32x2_t er_vec0 = {static_cast(erf(ar.values[0])), static_cast(erf(ar.values[1]))}; - float32x2_t er_vec1 = {static_cast(erf(ar.values[2])), static_cast(erf(ar.values[3]))}; - float32x2_t er_vec2 = {static_cast(erf(ar.values[4])), static_cast(erf(ar.values[5]))}; - float32x2_t er_vec3 = {static_cast(erf(ar.values[6])), static_cast(erf(ar.values[7]))}; + float32x2_t er_vec0 = {static_cast(erf(ar.values[0])), + static_cast(erf(ar.values[1]))}; + float32x2_t er_vec1 = {static_cast(erf(ar.values[2])), + static_cast(erf(ar.values[3]))}; + float32x2_t er_vec2 = {static_cast(erf(ar.values[4])), + static_cast(erf(ar.values[5]))}; + float32x2_t er_vec3 = {static_cast(erf(ar.values[6])), + static_cast(erf(ar.values[7]))}; float32x4_t result0 = vcombine_f32(er_vec0, er_vec1); float32x4_t result1 = vcombine_f32(er_vec2, er_vec3); @@ -280,25 +340,29 @@ struct FP32Vec8 : public Vec { result.val[1] = result1; return FP32Vec8(result); - } - - FP32Vec8 operator*(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), vmulq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator+(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), vaddq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator-(const FP32Vec8 &b) const { - return 
FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), vsubq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator/(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), vdivq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1])})); } - void save(float *ptr) const { + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1])})); + } + + void save(float* ptr) const { vst1q_f32(ptr, reg.val[0]); vst1q_f32(ptr + 4, reg.val[1]); } @@ -313,103 +377,100 @@ struct FP32Vec16 : public Vec { float32x4x4_t reg; - explicit FP32Vec16(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} + explicit FP32Vec16(float v) + : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} - explicit FP32Vec16() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {} + explicit FP32Vec16() + : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), + vmovq_n_f32(0.0)}) {} - explicit FP32Vec16(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), vld1q_f32(ptr + 12)}) {} + explicit FP32Vec16(const float* ptr) + : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), + vld1q_f32(ptr + 12)}) {} explicit FP32Vec16(float32x4x4_t data) : reg(data) {} - explicit FP32Vec16(const FP32Vec8 &data) { - reg.val[0] = data.reg.val[0]; - reg.val[1] = data.reg.val[1]; - reg.val[2] = data.reg.val[0]; - reg.val[3] = data.reg.val[1]; + explicit FP32Vec16(const FP32Vec8& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = 
data.reg.val[0]; + reg.val[3] = data.reg.val[1]; } - explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} + explicit FP32Vec16(const FP32Vec16& data) : reg(data.reg) {} - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v.reg)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v.reg)) {} - #ifdef ARM_BF16_SUPPORT - explicit FP32Vec16(bfloat16x8x2_t v) : reg({ - vcvtq_low_f32_bf16(v.val[0]), - vcvtq_high_f32_bf16(v.val[0]), - vcvtq_low_f32_bf16(v.val[1]), - vcvtq_high_f32_bf16(v.val[1]) - }) {}; - #endif +#ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(bfloat16x8x2_t v) + : reg({vcvtq_low_f32_bf16(v.val[0]), vcvtq_high_f32_bf16(v.val[0]), + vcvtq_low_f32_bf16(v.val[1]), vcvtq_high_f32_bf16(v.val[1])}) {}; +#endif - explicit FP32Vec16(const FP32Vec4 &data) { + explicit FP32Vec16(const FP32Vec4& data) { reg.val[0] = data.reg; reg.val[1] = data.reg; reg.val[2] = data.reg; reg.val[3] = data.reg; }; - #ifdef ARM_BF16_SUPPORT - explicit FP32Vec16(const BF16Vec16 &v) : reg({ - vcvtq_low_f32_bf16(v.reg.val[0]), - vcvtq_high_f32_bf16(v.reg.val[0]), - vcvtq_low_f32_bf16(v.reg.val[1]), - vcvtq_high_f32_bf16(v.reg.val[1]) - }) {}; +#ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(const BF16Vec16& v) + : reg({vcvtq_low_f32_bf16(v.reg.val[0]), + vcvtq_high_f32_bf16(v.reg.val[0]), + vcvtq_low_f32_bf16(v.reg.val[1]), + vcvtq_high_f32_bf16(v.reg.val[1])}) {}; - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}; - #endif + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}; +#endif - explicit FP32Vec16(const FP16Vec16 &v) { - reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0])); - reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0])); - reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1])); - reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1])); + explicit FP32Vec16(const FP16Vec16& v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0])); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0])); + reg.val[2] = 
vcvt_f32_f16(vget_low_f16(v.reg.val[1])); + reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1])); }; - FP32Vec16 operator+(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vaddq_f32(reg.val[0], b.reg.val[0]), - vaddq_f32(reg.val[1], b.reg.val[1]), - vaddq_f32(reg.val[2], b.reg.val[2]), - vaddq_f32(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1]), + vaddq_f32(reg.val[2], b.reg.val[2]), + vaddq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vmulq_f32(reg.val[0], b.reg.val[0]), - vmulq_f32(reg.val[1], b.reg.val[1]), - vmulq_f32(reg.val[2], b.reg.val[2]), - vmulq_f32(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1]), + vmulq_f32(reg.val[2], b.reg.val[2]), + vmulq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vsubq_f32(reg.val[0], b.reg.val[0]), - vsubq_f32(reg.val[1], b.reg.val[1]), - vsubq_f32(reg.val[2], b.reg.val[2]), - vsubq_f32(reg.val[3], b.reg.val[3]) - })); + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1]), + vsubq_f32(reg.val[2], b.reg.val[2]), + vsubq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vdivq_f32(reg.val[0], b.reg.val[0]), - vdivq_f32(reg.val[1], b.reg.val[1]), - vdivq_f32(reg.val[2], b.reg.val[2]), - vdivq_f32(reg.val[3], b.reg.val[3]) - })); + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1]), + 
vdivq_f32(reg.val[2], b.reg.val[2]), + vdivq_f32(reg.val[3], b.reg.val[3])})); }; float reduce_sum() const { AliasReg ar; ar.reg = reg; float answer = 0; - unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; }); + unroll_loop( + [&answer, &ar](int i) { answer += ar.values[i]; }); return answer; }; - template float reduce_sub_sum(int idx) { + template + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); AliasReg ar; @@ -422,7 +483,7 @@ struct FP32Vec16 : public Vec { return answer; }; - void save(float *ptr) const { + void save(float* ptr) const { vst1q_f32(ptr, reg.val[0]); vst1q_f32(ptr + 4, reg.val[1]); vst1q_f32(ptr + 8, reg.val[2]); @@ -430,43 +491,59 @@ struct FP32Vec16 : public Vec { }; }; -template struct VecType { using vec_type = void; }; +template +struct VecType { + using vec_type = void; +}; -template using vec_t = typename VecType::vec_type; +template +using vec_t = typename VecType::vec_type; -template <> struct VecType { using vec_type = FP32Vec8; }; +template <> +struct VecType { + using vec_type = FP32Vec8; +}; -template <> struct VecType { using vec_type = FP16Vec8; }; +template <> +struct VecType { + using vec_type = FP16Vec8; +}; #ifdef ARM_BF16_SUPPORT -template <> struct VecType { using vec_type = BF16Vec8; }; +template <> +struct VecType { + using vec_type = BF16Vec8; +}; #endif -template void storeFP32(float v, T *ptr) { *ptr = v; } - -template <> inline void storeFP32(float v, c10::Half *ptr) { - *reinterpret_cast<__fp16 *>(ptr) = v; +template +void storeFP32(float v, T* ptr) { + *ptr = v; } -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) { - float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]); - float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]); - float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]); - float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]); +template <> +inline void storeFP32(float v, c10::Half* ptr) { + *reinterpret_cast<__fp16*>(ptr) = v; +} - reg.val[0] = vcombine_f16(low_0, high_0); - reg.val[1] = 
vcombine_f16(low_1, high_1); +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) { + float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]); + float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]); + float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]); + float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]); + + reg.val[0] = vcombine_f16(low_0, high_0); + reg.val[1] = vcombine_f16(low_1, high_1); }; -inline FP16Vec8 :: FP16Vec8(const FP32Vec8 &v) { - float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]); - float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]); +inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) { + float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]); + float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]); - reg = vcombine_f16(lower_half, upper_half); + reg = vcombine_f16(lower_half, upper_half); }; -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { - +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], b.reg.val[0]); acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]); acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]); @@ -474,8 +551,7 @@ inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { }; #ifdef ARM_BF16_SUPPORT -inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { - +inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) { float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0])); float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0])); float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1])); @@ -494,22 +570,22 @@ inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { #endif #ifdef ARM_BF16_SUPPORT -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {}; +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) + : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) { + }; -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({ - 
vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), v.reg.val[3]) - }){}; +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) + : reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), + v.reg.val[3])}) {}; #endif -inline void prefetch(const void *addr) { - __builtin_prefetch(addr, 0, 1); -}; +inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 1); }; #ifdef ARM_BF16_SUPPORT template <> -inline void storeFP32(float v, c10::BFloat16 *ptr) { - *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v); +inline void storeFP32(float v, c10::BFloat16* ptr) { + *reinterpret_cast<__bf16*>(ptr) = vcvth_bf16_f32(v); }; #endif -}; \ No newline at end of file +}; // namespace vec_op \ No newline at end of file diff --git a/csrc/cpu/cpu_types_vsx.hpp b/csrc/cpu/cpu_types_vsx.hpp index b50bdadc57..a8e1be37eb 100644 --- a/csrc/cpu/cpu_types_vsx.hpp +++ b/csrc/cpu/cpu_types_vsx.hpp @@ -9,38 +9,40 @@ namespace vec_op { // FIXME: FP16 is not fully supported in Torch-CPU -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - std::cout << #NAME << " invoked." << std::endl; -#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." 
<< std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." << std::endl; #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { template -constexpr void unroll_loop_item(std::integer_sequence, F &&f) { +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { (f(std::integral_constant{}), ...); } -}; // namespace +}; // namespace template >> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); } -template struct Vec { +template +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } }; @@ -68,12 +70,14 @@ struct BF16Vec8 : public Vec { __vector signed short reg; - explicit BF16Vec8(const void *ptr) - : reg((__vector signed short)vec_xl(0, (__vector signed short *)ptr)) {} + explicit BF16Vec8(const void* ptr) + : reg((__vector signed short)vec_xl(0, (__vector signed short*)ptr)) {} - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__vector signed short *>(ptr) = reg; } + void save(void* ptr) const { + *reinterpret_cast<__vector signed short*>(ptr) = reg; + } }; struct BF16Vec16 : public Vec { @@ -81,18 +85,18 @@ struct BF16Vec16 : public Vec { ss16x8x2_t reg; - explicit BF16Vec16(const void *ptr) { + explicit BF16Vec16(const void* ptr) { // Load 256 bits in two parts - reg.val[0] = (__vector signed short)vec_xl(0, (signed short *)ptr); - reg.val[1] = (__vector signed short)vec_xl(16, (signed short *)ptr); + reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr); } - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - void save(void *ptr) const { + void save(void* ptr) const { // Save 256 bits in two parts - vec_xst(reg.val[0], 0, (signed short *)ptr); - vec_xst(reg.val[1], 16, (signed short *)ptr); + vec_xst(reg.val[0], 
0, (signed short*)ptr); + vec_xst(reg.val[1], 16, (signed short*)ptr); } }; @@ -102,19 +106,15 @@ struct BF16Vec32 : public Vec { constexpr static int VEC_ELEM_NUM = 32; ss16x8x4_t reg; - explicit BF16Vec32(const void *ptr) - : reg(*reinterpret_cast(ptr)) {} + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast(ptr)) {} explicit BF16Vec32(ss16x8x4_t data) : reg(data) {} - explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ - vec8_data.reg, - vec8_data.reg, - vec8_data.reg, - vec8_data.reg - }) {} + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {} - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } }; struct FP32Vec4 : public Vec { @@ -130,11 +130,11 @@ struct FP32Vec4 : public Vec { explicit FP32Vec4() : reg(vec_splats(0.0f)) {} - explicit FP32Vec4(const float *ptr) : reg(vec_xl(0, ptr)) {} + explicit FP32Vec4(const float* ptr) : reg(vec_xl(0, ptr)) {} explicit FP32Vec4(__vector float data) : reg(data) {} - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} }; struct FP32Vec8 : public Vec { @@ -156,19 +156,19 @@ struct FP32Vec8 : public Vec { reg.val[1] = vec_splats(0.0f); } - explicit FP32Vec8(const float *ptr) { + explicit FP32Vec8(const float* ptr) { reg.val[0] = vec_xl(0, ptr); reg.val[1] = vec_xl(16, ptr); } explicit FP32Vec8(f32x4x2_t data) : reg(data) {} - explicit FP32Vec8(const FP32Vec8 &data) { + explicit FP32Vec8(const FP32Vec8& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = data.reg.val[1]; } - explicit FP32Vec8(const BF16Vec8 &v) { + explicit FP32Vec8(const BF16Vec8& v) { reg.val[0] = (__vector float)vec_mergeh(zero, v.reg); reg.val[1] = (__vector float)vec_mergel(zero, v.reg); } @@ -177,7 +177,8 @@ struct FP32Vec8 : public Vec { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop([&result, &ar](int i) { result += 
ar.values[i]; }); + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } @@ -230,23 +231,27 @@ struct FP32Vec8 : public Vec { return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); } - FP32Vec8 operator*(const FP32Vec8 &b) const { - return FP32Vec8({vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8( + {vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator+(const FP32Vec8 &b) const { - return FP32Vec8({vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8( + {vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator-(const FP32Vec8 &b) const { - return FP32Vec8({vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8( + {vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator/(const FP32Vec8 &b) const { - return FP32Vec8({vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8( + {vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); } - void save(float *ptr) const { + void save(float* ptr) const { vec_xst(reg.val[0], 0, ptr); vec_xst(reg.val[1], 16, ptr); } @@ -275,7 +280,7 @@ struct FP32Vec16 : public Vec { reg.val[3] = vec_splats(0.0f); } - explicit FP32Vec16(const float *ptr) { + explicit FP32Vec16(const float* ptr) { reg.val[0] = vec_xl(0, ptr); reg.val[1] = vec_xl(16, ptr); reg.val[2] = vec_xl(32, ptr); @@ -284,78 +289,76 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(f32x4x4_t data) : reg(data) {} - explicit FP32Vec16(const FP32Vec16 &data) { + explicit FP32Vec16(const FP32Vec16& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = data.reg.val[1]; reg.val[2] = 
data.reg.val[2]; reg.val[3] = data.reg.val[3]; } - explicit FP32Vec16(const FP32Vec4 &data) { + explicit FP32Vec16(const FP32Vec4& data) { reg.val[0] = data.reg; reg.val[1] = data.reg; reg.val[2] = data.reg; reg.val[3] = data.reg; } - explicit FP32Vec16(const FP32Vec8 &data) { + explicit FP32Vec16(const FP32Vec8& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = data.reg.val[1]; reg.val[2] = data.reg.val[0]; reg.val[3] = data.reg.val[1]; } - explicit FP32Vec16(const BF16Vec16 &v) { + explicit FP32Vec16(const BF16Vec16& v) { reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]); reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]); reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]); } - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_mul(reg.val[0], b.reg.val[0]), - vec_mul(reg.val[1], b.reg.val[1]), - vec_mul(reg.val[2], b.reg.val[2]), - vec_mul(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]), + vec_mul(reg.val[1], b.reg.val[1]), + vec_mul(reg.val[2], b.reg.val[2]), + vec_mul(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator+(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_add(reg.val[0], b.reg.val[0]), - vec_add(reg.val[1], b.reg.val[1]), - vec_add(reg.val[2], b.reg.val[2]), - vec_add(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_add(reg.val[0], b.reg.val[0]), + vec_add(reg.val[1], b.reg.val[1]), + vec_add(reg.val[2], b.reg.val[2]), + vec_add(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_sub(reg.val[0], b.reg.val[0]), - vec_sub(reg.val[1], b.reg.val[1]), - 
vec_sub(reg.val[2], b.reg.val[2]), - vec_sub(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_sub(reg.val[0], b.reg.val[0]), + vec_sub(reg.val[1], b.reg.val[1]), + vec_sub(reg.val[2], b.reg.val[2]), + vec_sub(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_div(reg.val[0], b.reg.val[0]), - vec_div(reg.val[1], b.reg.val[1]), - vec_div(reg.val[2], b.reg.val[2]), - vec_div(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_div(reg.val[0], b.reg.val[0]), + vec_div(reg.val[1], b.reg.val[1]), + vec_div(reg.val[2], b.reg.val[2]), + vec_div(reg.val[3], b.reg.val[3])})); } float reduce_sum() const { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } - template float reduce_sub_sum(int idx) { + template + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); AliasReg ar; @@ -368,7 +371,7 @@ struct FP32Vec16 : public Vec { return result; } - void save(float *ptr) const { + void save(float* ptr) const { vec_xst(reg.val[0], 0, ptr); vec_xst(reg.val[1], 16, ptr); vec_xst(reg.val[2], 32, ptr); @@ -376,43 +379,62 @@ struct FP32Vec16 : public Vec { } }; -template struct VecType { using vec_type = void; }; +template +struct VecType { + using vec_type = void; +}; -template using vec_t = typename VecType::vec_type; +template +using vec_t = typename VecType::vec_type; -template <> struct VecType { using vec_type = FP32Vec8; }; +template <> +struct VecType { + using vec_type = FP32Vec8; +}; -template <> struct VecType { using vec_type = BF16Vec8; }; +template <> +struct VecType { + using vec_type = BF16Vec8; +}; -template void storeFP32(float v, T *ptr) { *ptr = v; } +template +void storeFP32(float v, T* ptr) { + *ptr = v; 
+} -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc = acc + a * b; } -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = - reinterpret_cast(&v); +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast(&v); *ptr = *(v_ptr + 1); } #ifndef __VEC_CLASS_FP_NAN -#define __VEC_CLASS_FP_NAN (1 << 6) + #define __VEC_CLASS_FP_NAN (1 << 6) #endif -const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; +const static __vector unsigned char omask = {0, 1, 4, 5, 8, 9, 12, 13, + 16, 17, 20, 21, 24, 25, 28, 29}; #ifndef _ARCH_PWR10 -const static __vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff }; -const static __vector unsigned int nan = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 }; -const static __vector unsigned int sh16 = { 16, 16, 16, 16 }; -const static __vector unsigned int one = { 1, 1, 1, 1 }; +const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff, + 0x00007fff}; +const static __vector unsigned int nan = {0x7fc00000, 0x7fc00000, 0x7fc00000, + 0x7fc00000}; +const static __vector unsigned int sh16 = {16, 16, 16, 16}; +const static __vector unsigned int one = {1, 1, 1, 1}; #endif -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) { #ifdef _ARCH_PWR10 __vector signed short ret[2]; - ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); - ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[1]); reg = vec_perm(ret[0], 
ret[1], omask); #elif defined(_ARCH_PWR9) __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); @@ -425,8 +447,10 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { __vector unsigned int rnd1 = vec_add(lsb1, bias); inp0 = vec_add(inp0, rnd0); inp1 = vec_add(inp1, rnd1); - __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); - __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + __vector __bool int sel0 = + vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = + vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); inp0 = vec_sel(inp0, nan, sel0); inp1 = vec_sel(inp1, nan, sel1); inp0 = vec_sr(inp0, sh16); @@ -435,13 +459,17 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { #endif } -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { #ifdef _ARCH_PWR10 __vector signed short ret[4]; - ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); - ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); - ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]); - ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]); + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[1]); + ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[2]); + ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[3]); reg.val[0] = vec_perm(ret[0], ret[1], omask); reg.val[1] = vec_perm(ret[2], ret[3], omask); #elif defined(_ARCH_PWR9) @@ -465,10 +493,14 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { inp1 = vec_add(inp1, rnd1); inp2 = vec_add(inp2, rnd2); inp3 = 
vec_add(inp3, rnd3); - __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); - __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); - __vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); - __vector __bool int sel3 = vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); + __vector __bool int sel0 = + vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = + vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + __vector __bool int sel2 = + vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); + __vector __bool int sel3 = + vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); inp0 = vec_sel(inp0, nan, sel0); inp1 = vec_sel(inp1, nan, sel1); inp2 = vec_sel(inp2, nan, sel2); @@ -482,10 +514,10 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { #endif } -inline void prefetch(const void *addr) { +inline void prefetch(const void* addr) { __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory"); } -}; // namespace vec_op +}; // namespace vec_op #endif diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 4bb4eb0f49..a4ef2be2a5 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -11,39 +11,40 @@ static_assert(false, "AVX2 must be supported for the current implementation."); namespace vec_op { -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - RECORD_FUNCTION(#NAME, c10::ArrayRef({})); -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) \ + RECORD_FUNCTION(#NAME, c10::ArrayRef({})); + #define CPU_KERNEL_GUARD_OUT(NAME) #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { template -constexpr void unroll_loop_item(std::integer_sequence, F &&f) { +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { (f(std::integral_constant{}), ...); } -}; // namespace +}; // namespace template >> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); } -template struct Vec { +template +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } }; @@ -55,12 +56,12 @@ struct FP16Vec8 : public Vec { __m128i reg; - explicit FP16Vec8(const void *ptr) - : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} + explicit FP16Vec8(const void* ptr) + : reg((__m128i)_mm_loadu_si128((__m128i*)ptr)) {} - explicit FP16Vec8(const FP32Vec8 &); + explicit FP16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m128i*>(ptr) = reg; } }; struct FP16Vec16 : public Vec { @@ -68,12 +69,12 @@ struct FP16Vec16 : public Vec { __m256i reg; - explicit FP16Vec16(const void *ptr) - : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} + explicit FP16Vec16(const void* ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - explicit FP16Vec16(const FP32Vec16 &); + explicit FP16Vec16(const FP32Vec16&); - void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } + void save(void* ptr) const { 
*reinterpret_cast<__m256i*>(ptr) = reg; } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -87,12 +88,12 @@ struct BF16Vec8 : public Vec { __m128i reg; - explicit BF16Vec8(const void *ptr) - : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} + explicit BF16Vec8(const void* ptr) + : reg((__m128i)_mm_loadu_si128((__m128i*)ptr)) {} - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m128i*>(ptr) = reg; } }; struct BF16Vec16 : public Vec { @@ -100,12 +101,12 @@ struct BF16Vec16 : public Vec { __m256i reg; - explicit BF16Vec16(const void *ptr) - : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} + explicit BF16Vec16(const void* ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -120,11 +121,11 @@ struct BF16Vec32 : public Vec { __m512i reg; - explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} + explicit BF16Vec32(const void* ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} explicit BF16Vec32(__m512i data) : reg(data) {} - explicit BF16Vec32(BF16Vec8 &vec8_data) + explicit BF16Vec32(BF16Vec8& vec8_data) : reg((__m512i)_mm512_inserti32x4( _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( (__m128i)vec8_data.reg), @@ -132,7 +133,7 @@ struct BF16Vec32 : public Vec { (__m128i)vec8_data.reg, 2), (__m128i)vec8_data.reg, 3)) {} - void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m512i*>(ptr) = reg; } }; #else struct BF16Vec32 : public Vec { @@ -141,24 +142,24 @@ 
struct BF16Vec32 : public Vec { __m256i reg_low; __m256i reg_high; - explicit BF16Vec32(const void *ptr) - : reg_low(_mm256_loadu_si256((__m256i const *)ptr)), - reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {} + explicit BF16Vec32(const void* ptr) + : reg_low(_mm256_loadu_si256((__m256i const*)ptr)), + reg_high(_mm256_loadu_si256((__m256i const*)ptr + 1)) {} - explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low), - reg_high(high) {} + explicit BF16Vec32(__m256i low, __m256i high) + : reg_low(low), reg_high(high) {} - explicit BF16Vec32(BF16Vec8 &vec8_data) + explicit BF16Vec32(BF16Vec8& vec8_data) : reg_low((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)), + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)), reg_high((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)) {} + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)) {} - void save(void *ptr) const { - *reinterpret_cast<__m256i *>(ptr) = reg_low; - *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high; + void save(void* ptr) const { + *reinterpret_cast<__m256i*>(ptr) = reg_low; + *reinterpret_cast<__m256i*>((__m256i*)ptr + 1) = reg_high; } }; #endif @@ -176,11 +177,11 @@ struct FP32Vec4 : public Vec { explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {} - explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {} + explicit FP32Vec4(const float* ptr) : reg(_mm_loadu_ps(ptr)) {} explicit FP32Vec4(__m128 data) : reg(data) {} - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} }; struct FP32Vec8 : public Vec { @@ -196,15 +197,15 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {} - explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {} + explicit FP32Vec8(const float* ptr) : reg(_mm256_loadu_ps(ptr)) {} 
explicit FP32Vec8(__m256 data) : reg(data) {} - explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} + explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {} - explicit FP32Vec8(const FP16Vec8 &v) : reg(_mm256_cvtph_ps(v.reg)) {} + explicit FP32Vec8(const FP16Vec8& v) : reg(_mm256_cvtph_ps(v.reg)) {} - explicit FP32Vec8(const BF16Vec8 &v) + explicit FP32Vec8(const BF16Vec8& v) : reg(_mm256_castsi256_ps( _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {} @@ -212,7 +213,8 @@ struct FP32Vec8 : public Vec { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } @@ -244,27 +246,27 @@ struct FP32Vec8 : public Vec { erf(ar.values[1]), erf(ar.values[0]))); } - FP32Vec8 operator*(const FP32Vec8 &b) const { + FP32Vec8 operator*(const FP32Vec8& b) const { return FP32Vec8(_mm256_mul_ps(reg, b.reg)); } - FP32Vec8 operator+(const FP32Vec8 &b) const { + FP32Vec8 operator+(const FP32Vec8& b) const { return FP32Vec8(_mm256_add_ps(reg, b.reg)); } - FP32Vec8 operator-(const FP32Vec8 &b) const { + FP32Vec8 operator-(const FP32Vec8& b) const { return FP32Vec8(_mm256_sub_ps(reg, b.reg)); } - FP32Vec8 operator/(const FP32Vec8 &b) const { + FP32Vec8 operator/(const FP32Vec8& b) const { return FP32Vec8(_mm256_div_ps(reg, b.reg)); } - void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } + void save(float* ptr) const { _mm256_storeu_ps(ptr, reg); } }; #ifdef __AVX512F__ -struct INT32Vec16: public Vec { +struct INT32Vec16 : public Vec { constexpr static int VEC_ELEM_NUM = 16; union AliasReg { __m512i reg; @@ -272,12 +274,11 @@ struct INT32Vec16: public Vec { }; __m512i reg; - - explicit INT32Vec16(const void* data_ptr) : reg(_mm512_loadu_epi32(data_ptr)) {} - void save(int32_t* ptr) const { - _mm512_storeu_epi32(ptr, reg); - } + explicit INT32Vec16(const void* data_ptr) + : reg(_mm512_loadu_epi32(data_ptr)) {} + + void 
save(int32_t* ptr) const { _mm512_storeu_epi32(ptr, reg); } void save(int32_t* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -301,11 +302,11 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} - explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {} + explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {} explicit FP32Vec16(__m512 data) : reg(data) {} - explicit FP32Vec16(const FP32Vec4 &data) + explicit FP32Vec16(const FP32Vec4& data) : reg((__m512)_mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg), @@ -313,36 +314,37 @@ struct FP32Vec16 : public Vec { (__m128i)data.reg, 2), (__m128i)data.reg, 3)) {} - explicit FP32Vec16(const FP32Vec8 &data) + explicit FP32Vec16(const FP32Vec8& data) : reg((__m512)_mm512_inserti32x8( _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {} - explicit FP32Vec16(const BF16Vec16 &v) + explicit FP32Vec16(const BF16Vec16& v) : reg(_mm512_castsi512_ps( _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} - explicit FP32Vec16(const FP16Vec16 &v) : reg(_mm512_cvtph_ps(v.reg)) {} + explicit FP32Vec16(const FP16Vec16& v) : reg(_mm512_cvtph_ps(v.reg)) {} - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const INT32Vec16 &v) - : reg(_mm512_cvt_roundepi32_ps(v.reg, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)) {} + explicit FP32Vec16(const INT32Vec16& v) + : reg(_mm512_cvt_roundepi32_ps( + v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { + FP32Vec16 operator*(const FP32Vec16& b) const { return FP32Vec16(_mm512_mul_ps(reg, b.reg)); } - FP32Vec16 operator+(const FP32Vec16 &b) const { + FP32Vec16 
operator+(const FP32Vec16& b) const { return FP32Vec16(_mm512_add_ps(reg, b.reg)); } - FP32Vec16 operator-(const FP32Vec16 &b) const { + FP32Vec16 operator-(const FP32Vec16& b) const { return FP32Vec16(_mm512_sub_ps(reg, b.reg)); } - FP32Vec16 operator/(const FP32Vec16 &b) const { + FP32Vec16 operator/(const FP32Vec16& b) const { return FP32Vec16(_mm512_div_ps(reg, b.reg)); } @@ -370,9 +372,7 @@ struct FP32Vec16 : public Vec { return FP32Vec16(_mm512_mask_min_ps(reg, mask, reg, b.reg)); } - FP32Vec16 abs() const { - return FP32Vec16(_mm512_abs_ps(reg)); - } + FP32Vec16 abs() const { return FP32Vec16(_mm512_abs_ps(reg)); } float reduce_sum() const { return _mm512_reduce_add_ps(reg); } @@ -380,14 +380,15 @@ struct FP32Vec16 : public Vec { float reduce_min() const { return _mm512_reduce_min_ps(reg); } - template float reduce_sub_sum(int idx) { + template + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); return _mm512_mask_reduce_add_ps(mask, reg); } - void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } + void save(float* ptr) const { _mm512_storeu_ps(ptr, reg); } void save(float* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -407,32 +408,30 @@ struct FP32Vec16 : public Vec { __m256 reg_low; __m256 reg_high; - explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)), - reg_high(_mm256_set1_ps(v)) {} + explicit FP32Vec16(float v) + : reg_low(_mm256_set1_ps(v)), reg_high(_mm256_set1_ps(v)) {} - explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)), - reg_high(_mm256_set1_ps(0.0)) {} + explicit FP32Vec16() + : reg_low(_mm256_set1_ps(0.0)), reg_high(_mm256_set1_ps(0.0)) {} - explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)), - reg_high(_mm256_loadu_ps(ptr + 8)) {} + explicit FP32Vec16(const float* ptr) + : reg_low(_mm256_loadu_ps(ptr)), reg_high(_mm256_loadu_ps(ptr + 
8)) {} explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {} - explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low), - reg_high(data.reg_high) {} + explicit FP32Vec16(const FP32Vec16& data) + : reg_low(data.reg_low), reg_high(data.reg_high) {} - explicit FP32Vec16(const FP32Vec4 &data) + explicit FP32Vec16(const FP32Vec4& data) : reg_low((__m256)_mm256_inserti128_si256( - _mm256_castsi128_si256((__m128i)data.reg), - (__m128i)data.reg, 1)), + _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)), reg_high((__m256)_mm256_inserti128_si256( - _mm256_castsi128_si256((__m128i)data.reg), - (__m128i)data.reg, 1)) {} + _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)) {} - explicit FP32Vec16(const FP32Vec8 &data) + explicit FP32Vec16(const FP32Vec8& data) : reg_low(data.reg), reg_high(data.reg) {} - explicit FP32Vec16(const FP16Vec16 &v) { + explicit FP32Vec16(const FP16Vec16& v) { __m128i low = _mm256_extractf128_si256(v.reg, 0); __m128i high = _mm256_extractf128_si256(v.reg, 1); @@ -440,9 +439,9 @@ struct FP32Vec16 : public Vec { reg_high = _mm256_cvtph_ps(high); } - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const BF16Vec16 &v) { + explicit FP32Vec16(const BF16Vec16& v) { __m128i low = _mm256_extractf128_si256(v.reg, 0); __m128i high = _mm256_extractf128_si256(v.reg, 1); @@ -456,24 +455,24 @@ struct FP32Vec16 : public Vec { reg_high = _mm256_castsi256_ps(v_high_shifted); } - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { + FP32Vec16 operator*(const FP32Vec16& b) const { return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low), _mm256_mul_ps(reg_high, b.reg_high)); } - FP32Vec16 operator+(const FP32Vec16 &b) const { + FP32Vec16 operator+(const FP32Vec16& b) const { 
return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low), _mm256_add_ps(reg_high, b.reg_high)); } - FP32Vec16 operator-(const FP32Vec16 &b) const { + FP32Vec16 operator-(const FP32Vec16& b) const { return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low), _mm256_sub_ps(reg_high, b.reg_high)); } - FP32Vec16 operator/(const FP32Vec16 &b) const { + FP32Vec16 operator/(const FP32Vec16& b) const { return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low), _mm256_div_ps(reg_high, b.reg_high)); } @@ -484,7 +483,8 @@ struct FP32Vec16 : public Vec { return low.reduce_sum() + high.reduce_sum(); } - template float reduce_sub_sum(int idx) { + template + float reduce_sub_sum(int idx) { float sum = 0.0; static_assert(VEC_ELEM_NUM % group_size == 0); constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); @@ -507,7 +507,7 @@ struct FP32Vec16 : public Vec { return sum; } - void save(float *ptr) const { + void save(float* ptr) const { _mm256_storeu_ps(ptr, reg_low); _mm256_storeu_ps(ptr + 8, reg_high); } @@ -515,7 +515,7 @@ struct FP32Vec16 : public Vec { #endif #ifdef __AVX512F__ -struct INT8Vec16: public Vec { +struct INT8Vec16 : public Vec { constexpr static int VEC_ELEM_NUM = 16; union AliasReg { __m128i reg; @@ -523,14 +523,12 @@ struct INT8Vec16: public Vec { }; __m128i reg; - - explicit INT8Vec16(const FP32Vec16& vec) : reg( - _mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32(vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) - ) {} - void save(int8_t* ptr) const { - _mm_storeu_epi8(ptr, reg); - } + explicit INT8Vec16(const FP32Vec16& vec) + : reg(_mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32( + vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))) {} + + void save(int8_t* ptr) const { _mm_storeu_epi8(ptr, reg); } void save(int8_t* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -540,71 +538,92 @@ struct INT8Vec16: public Vec { }; #endif -template struct VecType { using vec_type = void; }; +template +struct VecType { + using vec_type = void; +}; -template 
using vec_t = typename VecType::vec_type; +template +using vec_t = typename VecType::vec_type; -template <> struct VecType { using vec_type = FP32Vec8; }; +template <> +struct VecType { + using vec_type = FP32Vec8; +}; -template <> struct VecType { using vec_type = FP16Vec8; }; +template <> +struct VecType { + using vec_type = FP16Vec8; +}; -template <> struct VecType { using vec_type = BF16Vec8; }; +template <> +struct VecType { + using vec_type = BF16Vec8; +}; -template void storeFP32(float v, T *ptr) { *ptr = v; } +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc = acc + a * b; } -template <> inline void storeFP32(float v, c10::Half *ptr) { - *reinterpret_cast(ptr) = +template <> +inline void storeFP32(float v, c10::Half* ptr) { + *reinterpret_cast(ptr) = _cvtss_sh(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } -inline FP16Vec8::FP16Vec8(const FP32Vec8 &v) +inline FP16Vec8::FP16Vec8(const FP32Vec8& v) : reg(_mm256_cvtps_ph(v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} #ifdef __AVX512F__ -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) : reg(_mm512_cvtps_ph(v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} #else -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) - : reg(_mm256_insertf128_si256(_mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) + : reg(_mm256_insertf128_si256( + _mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), + FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} #endif #ifdef __AVX512BF16__ -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + *reinterpret_cast<__bfloat16*>(ptr) = _mm_cvtness_sbh(v); 
} -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {} -inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { +inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) { acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg); } #else -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = - reinterpret_cast(&v); +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast(&v); *ptr = *(v_ptr + 1); } -#ifdef __AVX512F__ -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + #ifdef __AVX512F__ +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg(_mm256_cvtepi32_epi16( _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg(_mm512_cvtepi32_epi16( _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} -#else -namespace{ + #else +namespace { __m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { __m256i ai = _mm256_castps_si256(a); ai = _mm256_srli_epi32(ai, 16); @@ -612,21 +631,21 @@ __m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { ai = _mm256_permute4x64_epi64(ai, 0b00111001); return _mm256_extracti128_si256(ai, 0); } -} +} // namespace -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low)); BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high)); reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1); } -#endif // __AVX512F__ -#endif // __AVX512BF16__ + 
#endif // __AVX512F__ +#endif // __AVX512BF16__ -inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); } +inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); } -}; // namespace vec_op +}; // namespace vec_op #endif diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp index d9aed657a3..33b1637832 100644 --- a/csrc/cpu/quant.cpp +++ b/csrc/cpu/quant.cpp @@ -359,7 +359,7 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major const torch::Tensor& b, // [IC, OC], column-major const torch::Tensor& a_scales, // [1] or [M] const torch::Tensor& b_scales, // [1] or [OC] - const c10::optional& bias // [OC] + const std::optional& bias // [OC] ) { CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) // Checks for conformality @@ -442,8 +442,8 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major const torch::Tensor& a_scales, // [1] or [M] const torch::Tensor& b_scales, // [1] or [OC] const torch::Tensor& azp_adj, // [OC] - const c10::optional& azp, // [1] or [M] - const c10::optional& bias // [OC] + const std::optional& azp, // [1] or [M] + const std::optional& bias // [OC] ) { CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp) // Checks for conformality @@ -561,7 +561,7 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] const torch::Tensor& input, // [..., hidden_size] const torch::Tensor& scale, - c10::optional const& azp) { + std::optional const& azp) { CPU_KERNEL_GUARD_IN(static_scaled_int8_quant) TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); @@ -590,7 +590,7 @@ void dynamic_scaled_int8_quant( torch::Tensor& out, // [..., hidden_size] const torch::Tensor& input, // [..., hidden_size] torch::Tensor& scale, // [..., 1] - c10::optional const& azp) { + std::optional const& azp) { CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); diff --git 
a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 03beefbc6d..5d1c5f4c83 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -9,14 +9,14 @@ std::string init_cpu_threads_env(const std::string& cpu_ids); void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& a_scales, const torch::Tensor& b_scales, - const c10::optional& bias); + const std::optional& bias); void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& a_scales, const torch::Tensor& b_scales, const torch::Tensor& azp_adj, - const c10::optional& azp, - const c10::optional& bias); + const std::optional& azp, + const std::optional& bias); TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -30,7 +30,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? alibi_slopes," - " str kv_cache_dtype, float k_scale, float v_scale," + " str kv_cache_dtype, Tensor k_scale, Tensor v_scale," " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); @@ -44,7 +44,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? alibi_slopes," - " str kv_cache_dtype, float k_scale, float v_scale," + " str kv_cache_dtype, Tensor k_scale, Tensor v_scale," " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); @@ -148,7 +148,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor! key_cache, Tensor! 
value_cache," " Tensor slot_mapping," " str kv_cache_dtype," - " float k_scale, float v_scale) -> ()"); + " Tensor k_scale, Tensor v_scale) -> ()"); cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache); } diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp index 1138a55df2..42a1c1d924 100644 --- a/csrc/cpu/utils.cpp +++ b/csrc/cpu/utils.cpp @@ -1,10 +1,22 @@ -#include -#include -#include -#include +#ifndef VLLM_NUMA_DISABLED + #include + #include + #include + #include +#endif #include "cpu_types.hpp" +#ifdef VLLM_NUMA_DISABLED +std::string init_cpu_threads_env(const std::string& cpu_ids) { + return std::string( + "Warning: NUMA is not enabled in this build. `init_cpu_threads_env` has " + "no effect to setup thread affinity."); +} + +#endif + +#ifndef VLLM_NUMA_DISABLED std::string init_cpu_threads_env(const std::string& cpu_ids) { bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str()); TORCH_CHECK(omp_cpu_mask->size > 0); @@ -57,7 +69,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { omp_lock_t writelock; omp_init_lock(&writelock); -#pragma omp parallel for schedule(static, 1) + #pragma omp parallel for schedule(static, 1) for (size_t i = 0; i < omp_cpu_ids.size(); ++i) { cpu_set_t mask; CPU_ZERO(&mask); @@ -88,3 +100,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { return ss.str(); } +#endif \ No newline at end of file diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp new file mode 100644 index 0000000000..e8555d853b --- /dev/null +++ b/csrc/cumem_allocator.cpp @@ -0,0 +1,310 @@ +// A CUDAPluggableAllocator based on cumem* APIs. 
+// Important: allocation size, CUdeviceptr and CUmemGenericAllocationHandle* +// need to be unsigned long long +#include + +extern "C" { + +#define PY_SSIZE_T_CLEAN +#include + +#include +#include +#include + +#define CUDA_CHECK(condition) \ + do { \ + CUresult error = condition; \ + if (error != 0) { \ + char* error_string; \ + cuGetErrorString(error, (const char**)&error_string); \ + std::cerr << "CUDA Error: " << error_string << " at " << __FILE__ << ":" \ + << __LINE__ << std::endl; \ + } \ + } while (0) + +// Global references to Python callables +// NOTE: this is borrowed reference, so we don't need to DECREF them. +// This brings the limitation that the allocator needs to be singleton. +static PyObject* g_python_malloc_callback = nullptr; +static PyObject* g_python_free_callback = nullptr; + +// --------------------------------------------------------------------------- +// Helper functions: + +void ensure_context(unsigned long long device) { + CUcontext pctx; + CUDA_CHECK(cuCtxGetCurrent(&pctx)); + if (!pctx) { + // Ensure device context. 
+ CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device)); + CUDA_CHECK(cuCtxSetCurrent(pctx)); + } +} + +void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem, + CUmemGenericAllocationHandle* p_memHandle) { + ensure_context(device); + // Define memory allocation properties + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE; + + // Allocate memory using cuMemCreate + CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0)); + CUDA_CHECK(cuMemMap(d_mem, size, 0, *p_memHandle, 0)); + + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = device; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + + CUDA_CHECK(cuMemSetAccess(d_mem, size, &accessDesc, 1)); + // std::cout << "create_and_map: device=" << device << ", size=" << size << ", + // d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl; +} + +void unmap_and_release(unsigned long long device, ssize_t size, + CUdeviceptr d_mem, + CUmemGenericAllocationHandle* p_memHandle) { + // std::cout << "unmap_and_release: device=" << device << ", size=" << size << + // ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl; + ensure_context(device); + CUDA_CHECK(cuMemUnmap(d_mem, size)); + CUDA_CHECK(cuMemRelease(*p_memHandle)); +} + +PyObject* create_tuple_from_c_integers(unsigned long long a, + unsigned long long b, + unsigned long long c, + unsigned long long d) { + // Create a new tuple of size 4 + PyObject* tuple = PyTuple_New(4); + if (!tuple) { + return NULL; // Return NULL on failure + } + + // Convert integers to Python objects and set them in the tuple + PyTuple_SetItem( + tuple, 0, + PyLong_FromUnsignedLongLong(a)); // Steals reference to the PyLong + PyTuple_SetItem(tuple, 1, PyLong_FromUnsignedLongLong(b)); + PyTuple_SetItem(tuple, 
2, PyLong_FromUnsignedLongLong(c)); + PyTuple_SetItem(tuple, 3, PyLong_FromUnsignedLongLong(d)); + + // Note: PyTuple_SetItem "steals" a reference to each object, + // so we do not need to Py_DECREF the PyLong objects explicitly. + + return tuple; // Return the created tuple +} + +// --------------------------------------------------------------------------- +// Our exported C functions that call Python: + +// use CUstream instead of cudaStream_t, to avoid including cuda_runtime_api.h +void* my_malloc(ssize_t size, int device, CUstream stream) { + ensure_context(device); + + // first allocation, align the size, and reserve an address, and also allocate + // a CUmemGenericAllocationHandle + + // Define memory allocation properties + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE; + + // Check if the allocation is supported + size_t granularity; + CUDA_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, + CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + + size_t alignedSize = ((size + granularity - 1) / granularity) * granularity; + + CUdeviceptr d_mem; + CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0)); + + // allocate the CUmemGenericAllocationHandle + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)malloc( + sizeof(CUmemGenericAllocationHandle)); + + if (!g_python_malloc_callback) { + std::cerr << "ERROR: g_python_malloc_callback not set.\n"; + return nullptr; + } + + // Acquire GIL (not in stable ABI officially, but often works) + PyGILState_STATE gstate = PyGILState_Ensure(); + + PyObject* arg_tuple = create_tuple_from_c_integers( + (unsigned long long)device, (unsigned long long)alignedSize, + (unsigned long long)d_mem, (unsigned long long)p_memHandle); + + // Call g_python_malloc_callback + PyObject* py_result = + 
PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL); + Py_DECREF(arg_tuple); + + if (!py_result) { + PyErr_Print(); + PyGILState_Release(gstate); + return nullptr; + } + + PyGILState_Release(gstate); + + // do the final mapping + create_and_map(device, alignedSize, d_mem, p_memHandle); + + return (void*)d_mem; +} + +// use CUstream instead of cudaStream_t, to avoid including cuda_runtime_api.h +void my_free(void* ptr, ssize_t size, int device, CUstream stream) { + // get memory handle from the pointer + if (!g_python_free_callback) { + std::cerr << "ERROR: g_python_free_callback not set.\n"; + return; + } + + // Acquire GIL (not in stable ABI officially, but often works) + PyGILState_STATE gstate = PyGILState_Ensure(); + + PyObject* py_ptr = + PyLong_FromUnsignedLongLong(reinterpret_cast(ptr)); + + PyObject* py_result = + PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL); + + if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size, + &recv_d_mem, &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + return; + } + + PyGILState_Release(gstate); + + // recv_size == size + // recv_device == device + + // Free memory + + CUdeviceptr d_mem = (CUdeviceptr)recv_d_mem; + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)recv_p_memHandle; + unmap_and_release(device, size, d_mem, p_memHandle); + + // free address and the handle + CUDA_CHECK(cuMemAddressFree(d_mem, size)); + free(p_memHandle); +} + +// --------------------------------------------------------------------------- +// Python extension boilerplate: + +// Python-exposed function: init_module(python_malloc, python_free) +static 
PyObject* py_init_module(PyObject* self, PyObject* args) { + PyObject* malloc_callback = nullptr; + PyObject* free_callback = nullptr; + + if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) { + return nullptr; + } + + if (!PyCallable_Check(malloc_callback) || !PyCallable_Check(free_callback)) { + PyErr_SetString(PyExc_TypeError, "Both arguments must be callables"); + return nullptr; + } + + // Save the Python callables + // This module does not handle GC of these objects, so they must be kept alive + // outside of this module. + g_python_malloc_callback = malloc_callback; + g_python_free_callback = free_callback; + + Py_RETURN_NONE; +} + +static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) { + if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return nullptr; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem, + &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + return nullptr; + } + + CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem; + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)recv_p_memHandle; + + unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle); + + Py_RETURN_NONE; +} + +static PyObject* python_create_and_map(PyObject* self, PyObject* args) { + if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return nullptr; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem, + &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + return nullptr; + } + + 
CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem; + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)recv_p_memHandle; + + create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle); + + Py_RETURN_NONE; +} + +static PyMethodDef module_methods[] = { + {"init_module", (PyCFunction)py_init_module, METH_VARARGS, + "Initialize module with python_malloc and python_free callables."}, + {"python_create_and_map", (PyCFunction)python_create_and_map, METH_VARARGS, + "Create and map memory on the device."}, + {"python_unmap_and_release", (PyCFunction)python_unmap_and_release, + METH_VARARGS, "Unmap and release memory on the device."}, + {NULL, NULL, 0, NULL} // sentinel +}; + +static struct PyModuleDef cumem_allocator_module = { + PyModuleDef_HEAD_INIT, "cumem_allocator", + "cumem-based allocator for CUDAPluggableAllocator", -1, module_methods}; + +PyMODINIT_FUNC PyInit_cumem_allocator(void) { + // Initialize the module + PyObject* module = PyModule_Create(&cumem_allocator_module); + if (!module) { + return NULL; + } + return module; +} +} // extern "C" diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp index 85e359aa57..07c9e46c27 100644 --- a/csrc/cutlass_extensions/common.hpp +++ b/csrc/cutlass_extensions/common.hpp @@ -27,8 +27,7 @@ inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { int max_shared_mem_per_block_opt_in = 0; cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, - cudaDevAttrMaxSharedMemoryPerBlockOptin, - device); + cudaDevAttrMaxSharedMemoryPerBlockOptin, device); return max_shared_mem_per_block_opt_in; } diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp index 26f7423fd7..ef413e6dd7 100644 --- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp @@ -68,7 +68,7 @@ struct ScaledEpilogueBase { // This 
overload handles the case where there might not be a tensor, in which // case a nullptr is passed and a constant (0) is used. template - static auto args_from_tensor(c10::optional const& tensor) { + static auto args_from_tensor(std::optional const& tensor) { static_assert(std::is_same_v>); using Arguments = typename Descriptor::Arguments; auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; @@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp static ArgumentType prepare_args(torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& bias) { + std::optional const& bias) { auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); @@ -301,7 +301,7 @@ struct ScaledEpilogueBiasAzpToken torch::Tensor const& b_scales, torch::Tensor const& azp_adj, torch::Tensor const& azp, - c10::optional const& bias) { + std::optional const& bias) { auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp index c723adf126..c590c66a66 100644 --- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp @@ -67,7 +67,7 @@ struct ScaledEpilogueBase { // This overload handles the case where there might not be a tensor, in which // case a nullptr is passed and a constant (0) is used. template - static auto args_from_tensor(c10::optional const& tensor) { + static auto args_from_tensor(std::optional const& tensor) { using Arguments = typename Descriptor::Arguments; auto* data_ptr = tensor ? 
static_cast(tensor->data_ptr()) : nullptr; static_assert(std::is_same_v> || @@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp static ArgumentType prepare_args(torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& bias) { + std::optional const& bias) { auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); @@ -299,7 +299,7 @@ struct ScaledEpilogueBiasAzpToken torch::Tensor const& b_scales, torch::Tensor const& azp_adj, torch::Tensor const& azp, - c10::optional const& bias) { + std::optional const& bias) { auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); diff --git a/csrc/cutlass_extensions/torch_utils.hpp b/csrc/cutlass_extensions/torch_utils.hpp index 2c78572521..a1ff933cce 100644 --- a/csrc/cutlass_extensions/torch_utils.hpp +++ b/csrc/cutlass_extensions/torch_utils.hpp @@ -97,7 +97,7 @@ static inline auto make_cute_layout(torch::Tensor const& tensor, template static inline auto maybe_make_cute_layout( - c10::optional const& tensor, + std::optional const& tensor, std::string_view name = "tensor") { using Layout = decltype(make_cute_layout(*tensor)); diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu index dd1e6de2e0..f0e5533bca 100644 --- a/csrc/mamba/causal_conv1d/causal_conv1d.cu +++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu @@ -53,12 +53,12 @@ void set_conv_params_fwd(ConvParamsBase ¶ms, const at::Tensor x, const at::Tensor weight, const at::Tensor out, - const c10::optional& bias, + const std::optional& bias, bool silu_activation, int64_t pad_slot_id, - const c10::optional& query_start_loc = std::nullopt, - const c10::optional& cache_indices = std::nullopt, - const c10::optional& has_initial_state = std::nullopt) { 
+ const std::optional& query_start_loc = std::nullopt, + const std::optional& cache_indices = std::nullopt, + const std::optional& has_initial_state = std::nullopt) { // Reset the parameters memset(¶ms, 0, sizeof(params)); @@ -93,11 +93,11 @@ void set_conv_params_fwd(ConvParamsBase ¶ms, void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight, - const c10::optional &bias_, - const c10::optional &conv_states, - const c10::optional &query_start_loc, - const c10::optional &cache_indices, - const c10::optional &has_initial_state, + const std::optional &bias_, + const std::optional &conv_states, + const std::optional &query_start_loc, + const std::optional &cache_indices, + const std::optional &has_initial_state, bool silu_activation, // used to identify padding entries if cache_indices provided // in case of padding, the kernel will return early @@ -194,10 +194,10 @@ void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight, void causal_conv1d_update(const at::Tensor &x, const at::Tensor &conv_state, const at::Tensor &weight, - const c10::optional &bias_, + const std::optional &bias_, bool silu_activation, - const c10::optional &cache_seqlens_, - const c10::optional &conv_state_indices_, + const std::optional &cache_seqlens_, + const std::optional &conv_state_indices_, // used to identify padding entries if cache_indices provided // in case of padding, the kernel will return early int64_t pad_slot_id) { diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu index 7162469633..bd0a34119c 100644 --- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu +++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu @@ -402,14 +402,14 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, const torch::Tensor out, const torch::Tensor z, const torch::Tensor out_z, - const c10::optional& D, - const c10::optional& delta_bias, + const std::optional& D, + const std::optional& delta_bias, const torch::Tensor ssm_states, bool has_z, bool 
delta_softplus, - const c10::optional& query_start_loc, - const c10::optional& cache_indices, - const c10::optional& has_initial_state, + const std::optional& query_start_loc, + const std::optional& cache_indices, + const std::optional& has_initial_state, bool varlen, int64_t pad_slot_id) { @@ -504,13 +504,13 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, const torch::Tensor &A, const torch::Tensor &B, const torch::Tensor &C, - const c10::optional &D_, - const c10::optional &z_, - const c10::optional &delta_bias_, + const std::optional &D_, + const std::optional &z_, + const std::optional &delta_bias_, bool delta_softplus, - const c10::optional &query_start_loc, - const c10::optional &cache_indices, - const c10::optional &has_initial_state, + const std::optional &query_start_loc, + const std::optional &cache_indices, + const std::optional &has_initial_state, const torch::Tensor &ssm_states, // used to identify padding entries if cache_indices provided // in case of padding, the kernel will return early diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index 24341d63fb..d609ce1697 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -21,7 +21,7 @@ __device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, } } // namespace -template +template __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids, int32_t* expert_ids, @@ -32,12 +32,8 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, const size_t start_idx = threadIdx.x * tokens_per_thread; extern __shared__ int32_t shared_mem[]; - - int32_t* tokens_cnts = - shared_mem; // 2d tensor with shape (blockDim.x + 1, num_experts) - int32_t* cumsum = - shared_mem + - (blockDim.x + 1) * num_experts; // 1d tensor with shape (num_experts + 1) + int32_t* cumsum = shared_mem; // 1d tensor with 
shape (num_experts + 1) + token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + blockDim.x + 1); for (int i = 0; i < num_experts; ++i) { tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; @@ -74,7 +70,7 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, block_size) * block_size; } - *total_tokens_post_pad = cumsum[num_experts]; + *total_tokens_post_pad = static_cast(cumsum[num_experts]); } __syncthreads(); @@ -224,26 +220,46 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, torch::Tensor num_tokens_post_pad) { const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - // If we have very large number of experts, we can no longer use shared - // memory. - // TODO(simon): the right solution should be calculating the exact right - // amount of shared memory and use that. The num_experts >= 256 is just a - // temporary solution to unblock Deepseek V3. - if (num_experts >= 256) { + int device_max_shared_mem; + auto dev = topk_ids.get_device(); + cudaDeviceGetAttribute(&device_max_shared_mem, + cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); + + const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); + const int32_t shared_mem_i32 = + ((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t); + const int32_t shared_mem_i16 = + ((num_thread + 1) * num_experts) * sizeof(uint16_t) + + (num_experts + 1) * sizeof(int32_t); + + bool use_global_memory = false; + bool use_i16 = false; // Use uint16_t for shared memory token counts + if (shared_mem_i32 < device_max_shared_mem) { + // Do nothing in this case. 
We're all set to use int32_t token counts + } else if (shared_mem_i16 < device_max_shared_mem && + topk_ids.numel() <= 65535) { + // when nelements of topk_ids is smaller than 65535 (max value of uint16), + // element value of token_cnts would also smaller than 65535, + // so we can use uint16 as dtype of token_cnts + use_i16 = true; + } else { + use_global_memory = true; + } + + if (use_global_memory) { VLLM_DISPATCH_INTEGRAL_TYPES( topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] { // calc needed amount of shared mem for `tokens_cnts` and `cumsum` // tensors const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); - const int32_t mem_tokens_cnts = - ((num_experts + 1) * num_experts) * sizeof(int32_t); - const int32_t mem_cumsum = (num_experts + 1) * sizeof(int32_t); - // allocate global memory - int32_t* tokens_cnts; - int32_t* cumsum; - cudaMalloc(&tokens_cnts, mem_tokens_cnts); - cudaMalloc(&cumsum, mem_cumsum); + auto options_int = torch::TensorOptions() + .dtype(torch::kInt) + .device(topk_ids.device()); + torch::Tensor token_cnts_buffer = + torch::empty({(num_experts + 1) * num_experts}, options_int); + torch::Tensor cumsum_buffer = + torch::empty({num_experts + 1}, options_int); auto kernel = vllm::moe::moe_align_block_size_global_mem_kernel; @@ -252,25 +268,32 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, sorted_token_ids.data_ptr(), experts_ids.data_ptr(), num_tokens_post_pad.data_ptr(), num_experts, block_size, - topk_ids.numel(), tokens_cnts, cumsum); - cudaFree(tokens_cnts); - cudaFree(cumsum); + topk_ids.numel(), token_cnts_buffer.data_ptr(), + cumsum_buffer.data_ptr()); + }); + } else if (use_i16) { + VLLM_DISPATCH_INTEGRAL_TYPES( + topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { + // set dynamic shared mem + auto kernel = + vllm::moe::moe_align_block_size_kernel; + AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( + (void*)kernel, shared_mem_i16)); + kernel<<<1, 
num_thread, shared_mem_i16, stream>>>( + topk_ids.data_ptr(), + sorted_token_ids.data_ptr(), + experts_ids.data_ptr(), + num_tokens_post_pad.data_ptr(), num_experts, block_size, + topk_ids.numel()); }); } else { VLLM_DISPATCH_INTEGRAL_TYPES( topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { - // calc needed amount of shared mem for `tokens_cnts` and `cumsum` - // tensors - const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); - const int32_t shared_mem = - ((num_thread + 1) * num_experts + (num_experts + 1)) * - sizeof(int32_t); - - // set dynamic shared mem - auto kernel = vllm::moe::moe_align_block_size_kernel; + auto kernel = + vllm::moe::moe_align_block_size_kernel; AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( - (void*)kernel, shared_mem)); - kernel<<<1, num_thread, shared_mem, stream>>>( + (void*)kernel, shared_mem_i32)); + kernel<<<1, num_thread, shared_mem_i32, stream>>>( topk_ids.data_ptr(), sorted_token_ids.data_ptr(), experts_ids.data_ptr(), diff --git a/csrc/ops.h b/csrc/ops.h index 347c502845..3468989640 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -33,9 +33,10 @@ void paged_attention_v1( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, + int64_t max_seq_len, const std::optional& alibi_slopes, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step); @@ -44,9 +45,10 @@ void paged_attention_v2( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& 
key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, + int64_t max_seq_len, const std::optional& alibi_slopes, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step); @@ -86,6 +88,8 @@ void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query, void silu_and_mul(torch::Tensor& out, torch::Tensor& input); +void mul_and_silu(torch::Tensor& out, torch::Tensor& input); + void gelu_and_mul(torch::Tensor& out, torch::Tensor& input); void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input); @@ -153,15 +157,15 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability); @@ -169,7 +173,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& e, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); bool 
cutlass_sparse_compress_entry(torch::Tensor& a_compressed, torch::Tensor& e, torch::Tensor const& a); @@ -177,11 +181,11 @@ bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed, void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, torch::Tensor const& scale, - c10::optional const& azp); + std::optional const& azp); void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scales, - c10::optional const& azp); + std::optional const& azp); torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight, torch::Tensor b_gptq_qzeros, @@ -198,34 +202,34 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input, void dynamic_per_token_scaled_fp8_quant( torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale, - c10::optional const& scale_ub); + std::optional const& scale_ub); void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta, const torch::Tensor& A, const torch::Tensor& B, const torch::Tensor& C, - const c10::optional& D_, - const c10::optional& z_, - const c10::optional& delta_bias_, + const std::optional& D_, + const std::optional& z_, + const std::optional& delta_bias_, bool delta_softplus, - const c10::optional& query_start_loc, - const c10::optional& cache_indices, - const c10::optional& has_initial_state, + const std::optional& query_start_loc, + const std::optional& cache_indices, + const std::optional& has_initial_state, const torch::Tensor& ssm_states, int64_t pad_slot_id); void causal_conv1d_update(const at::Tensor& x, const at::Tensor& conv_state, const at::Tensor& weight, - const c10::optional& bias_, + const std::optional& bias_, bool silu_activation, - const c10::optional& cache_seqlens_, - const c10::optional& conv_state_indices_, + const std::optional& cache_seqlens_, + const std::optional& conv_state_indices_, int64_t pad_slot_id); void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight, - const c10::optional& 
bias_, - const c10::optional& conv_states, - const c10::optional& query_start_loc, - const c10::optional& cache_indices, - const c10::optional& has_initial_state, + const std::optional& bias_, + const std::optional& conv_states, + const std::optional& query_start_loc, + const std::optional& cache_indices, + const std::optional& has_initial_state, bool silu_activation, int64_t pad_slot_id); #ifndef USE_ROCM diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu index bd184ee226..c3902f4c2a 100644 --- a/csrc/prepare_inputs/advance_step.cu +++ b/csrc/prepare_inputs/advance_step.cu @@ -95,6 +95,16 @@ __global__ void advance_step_flashinfer_kernel( long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr, int const* block_tables_ptr, int64_t const block_tables_stride, int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) { + int const n_pad = num_seqs - num_queries; + if (n_pad && blockIdx.x == 0) { + // Handle cuda graph padding + int const offset = num_queries; + for (int i = threadIdx.x; i < n_pad; i += blockDim.x) { + input_tokens_ptr[offset + i] = 0; + input_positions_ptr[offset + i] = 0; + slot_mapping_ptr[offset + i] = -1; + } + } int num_query_blocks = div_ceil(num_queries, num_threads); if (blockIdx.x < num_query_blocks) { diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu index e9987535bd..e797858271 100644 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -226,7 +226,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel( void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] torch::Tensor const& input, // [..., hidden_size] torch::Tensor const& scale, - c10::optional const& azp) { + std::optional const& azp) { TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); TORCH_CHECK(scale.numel() == 1); @@ -257,7 
+257,7 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] void dynamic_scaled_int8_quant( torch::Tensor& out, // [..., hidden_size] torch::Tensor const& input, // [..., hidden_size] - torch::Tensor& scales, c10::optional const& azp) { + torch::Tensor& scales, std::optional const& azp) { TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); TORCH_CHECK(scales.is_contiguous()); diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index dbb72e8bbd..865fef5aee 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -39,7 +39,7 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { @@ -58,8 +58,8 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); @@ -94,7 +94,7 @@ void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { @@ -113,8 +113,8 @@ void cutlass_scaled_mm_azp_sm80(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - 
c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); @@ -165,7 +165,7 @@ void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { @@ -184,8 +184,8 @@ void cutlass_scaled_mm_azp_sm89(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu index 123f4359c0..e18d7d79e5 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu @@ -51,7 +51,7 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { @@ -70,8 +70,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); diff --git 
a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 4f7b6588ef..3f2b52624f 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -9,26 +9,26 @@ void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); #if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); #endif void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a, @@ -36,24 +36,24 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); void cutlass_scaled_mm_azp_sm80(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); void cutlass_scaled_mm_azp_sm89(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& 
a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); #if defined CUDA_VERSION && CUDA_VERSION >= 12000 void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a, @@ -61,8 +61,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); #endif bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) { @@ -84,7 +84,7 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) { void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { // Checks for conformality TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && @@ -148,8 +148,8 @@ void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { // Checks for conformality TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 2df4d18190..a9b5ddf4cb 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -63,7 +63,7 @@ torch::Tensor mm_dispatch_{{type_sig}}(MMArgs args) { static inline std::optional maybe_scalartype( - c10::optional const& t) { + std::optional const& t) { if (!t) { return 
std::nullopt; } else { diff --git a/csrc/quantization/machete/machete_mm_kernel.cuh b/csrc/quantization/machete/machete_mm_kernel.cuh index d4d19ae5de..e4af067915 100644 --- a/csrc/quantization/machete/machete_mm_kernel.cuh +++ b/csrc/quantization/machete/machete_mm_kernel.cuh @@ -183,11 +183,11 @@ struct MacheteKernelTemplate { torch::Tensor const& A, // MxK matrix torch::Tensor const& B, // KxN prepacked matrix torch::Tensor& D, // MxN matrix - c10::optional const& maybe_g_scales, // scale_KxN matrix - c10::optional const& maybe_g_zeros, // scale_KxN matrix - c10::optional maybe_group_size, - c10::optional const& maybe_ch_scales, // len N vector - c10::optional const& maybe_tok_scales) // len M vector + std::optional const& maybe_g_scales, // scale_KxN matrix + std::optional const& maybe_g_zeros, // scale_KxN matrix + std::optional maybe_group_size, + std::optional const& maybe_ch_scales, // len N vector + std::optional const& maybe_tok_scales) // len M vector { static_assert(!with_group_zeropoints || with_group_scales); diff --git a/csrc/quantization/machete/machete_mm_launcher.cuh b/csrc/quantization/machete/machete_mm_launcher.cuh index 4b0da5b303..cabe0af46f 100644 --- a/csrc/quantization/machete/machete_mm_launcher.cuh +++ b/csrc/quantization/machete/machete_mm_launcher.cuh @@ -13,23 +13,23 @@ struct MMArgs { torch::Tensor const& A; torch::Tensor const& B; vllm::ScalarType const& b_type; - c10::optional const& maybe_out_type; - c10::optional const& maybe_group_scales; - c10::optional const& maybe_group_zeros; - c10::optional maybe_group_size; - c10::optional const& maybe_channel_scales; - c10::optional const& maybe_token_scales; - c10::optional maybe_schedule; + std::optional const& maybe_out_type; + std::optional const& maybe_group_scales; + std::optional const& maybe_group_zeros; + std::optional maybe_group_size; + std::optional const& maybe_channel_scales; + std::optional const& maybe_token_scales; + std::optional maybe_schedule; }; struct 
SupportedSchedulesArgs { at::ScalarType a_type; vllm::ScalarType b_type; - c10::optional maybe_group_scales_type; - c10::optional maybe_group_zeros_type; - c10::optional maybe_channel_scales_type; - c10::optional maybe_token_scales_type; - c10::optional maybe_out_type; + std::optional maybe_group_scales_type; + std::optional maybe_group_zeros_type; + std::optional maybe_channel_scales_type; + std::optional maybe_token_scales_type; + std::optional maybe_out_type; }; torch::Tensor mm_dispatch(MMArgs args); diff --git a/csrc/quantization/machete/machete_prepack_launcher.cuh b/csrc/quantization/machete/machete_prepack_launcher.cuh index 3486d28be2..634b651a4d 100644 --- a/csrc/quantization/machete/machete_prepack_launcher.cuh +++ b/csrc/quantization/machete/machete_prepack_launcher.cuh @@ -10,7 +10,7 @@ struct PrepackBArgs { torch::Tensor const& B; at::ScalarType a_type; vllm::ScalarType b_type; - c10::optional maybe_group_scales_type; + std::optional maybe_group_scales_type; }; template diff --git a/csrc/quantization/machete/machete_pytorch.cu b/csrc/quantization/machete/machete_pytorch.cu index da2c2fb0d3..05a51ee21d 100644 --- a/csrc/quantization/machete/machete_pytorch.cu +++ b/csrc/quantization/machete/machete_pytorch.cu @@ -10,11 +10,11 @@ using namespace vllm; std::vector supported_schedules( at::ScalarType a_type, int64_t b_type_id, - c10::optional maybe_group_scales_type, - c10::optional maybe_group_zeros_type, - c10::optional maybe_channel_scales_type, - c10::optional maybe_token_scales_type, - c10::optional maybe_out_type) { + std::optional maybe_group_scales_type, + std::optional maybe_group_zeros_type, + std::optional maybe_channel_scales_type, + std::optional maybe_token_scales_type, + std::optional maybe_out_type) { ScalarType const b_type = ScalarType::from_id(b_type_id); return supported_schedules_dispatch({ .a_type = a_type, @@ -29,13 +29,13 @@ std::vector supported_schedules( torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B, int64_t 
b_type_id, - c10::optional const& maybe_out_type, - c10::optional const& maybe_group_scales, - c10::optional const& maybe_group_zeros, - c10::optional maybe_group_size, - c10::optional const& maybe_channel_scales, - c10::optional const& maybe_token_scales, - c10::optional maybe_schedule) { + std::optional const& maybe_out_type, + std::optional const& maybe_group_scales, + std::optional const& maybe_group_zeros, + std::optional maybe_group_size, + std::optional const& maybe_channel_scales, + std::optional const& maybe_token_scales, + std::optional maybe_schedule) { ScalarType const b_type = ScalarType::from_id(b_type_id); return mm_dispatch({.A = A, .B = B, @@ -51,7 +51,7 @@ torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B, torch::Tensor prepack_B( torch::Tensor const& B, at::ScalarType const& a_type, int64_t b_type_id, - c10::optional const& maybe_group_scales_type) { + std::optional const& maybe_group_scales_type) { ScalarType const b_type = ScalarType::from_id(b_type_id); return prepack_B_dispatch( {.B = B, diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index b48348a515..9477790629 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -218,7 +218,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel( scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, // head_size] scalar_t* __restrict__ final_out, // [num_seqs, num_heads, head_size] - int max_ctx_blocks, float k_scale, float v_scale) { + int max_ctx_blocks, const float* k_scale_ptr, const float* v_scale_ptr) { constexpr int NWARPS = NUM_THREADS / WARP_SIZE; const int warpid = threadIdx.x / WARP_SIZE; const int laneid = threadIdx.x % WARP_SIZE; @@ -406,7 +406,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel( // Vlocalb8[h][b * BLOCK_SIZE / 8 + d] = v_ptrh8be[d]; const _B8x8 Vlocalb8 = v_ptrh8be[d]; Vlocal[h][b * BLOCK_SIZE / 8 + d] = - scaled_convert_b8x8(Vlocalb8, v_scale); + 
scaled_convert_b8x8(Vlocalb8, *v_scale_ptr); } } } @@ -416,7 +416,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel( #pragma unroll for (int d = 0; d < KHELOOP; d++) { Klocal[d] = - scaled_convert_b8x8(Klocalb8[d], k_scale); + scaled_convert_b8x8(Klocalb8[d], *k_scale_ptr); } } @@ -890,7 +890,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel( scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, // head_size] scalar_t* __restrict__ final_out, // [num_seqs, num_heads, head_size] - int max_ctx_blocks, float k_scale, float v_scale) { + int max_ctx_blocks, const float* k_scale, const float* v_scale) { UNREACHABLE_CODE } @@ -919,7 +919,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq, \ alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, max_ctx_blocks, \ - k_scale, v_scale); + k_scale_ptr, v_scale_ptr); template @@ -928,8 +928,8 @@ void paged_attention_custom_launcher( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, const int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& context_lens, - int max_context_len, const c10::optional& alibi_slopes, - float k_scale, float v_scale) { + int max_context_len, const std::optional& alibi_slopes, + torch::Tensor& k_scale, torch::Tensor& v_scale) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -953,6 +953,8 @@ void paged_attention_custom_launcher( KVT* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); int* block_tables_ptr = block_tables.data_ptr(); int* context_lens_ptr = context_lens.data_ptr(); + const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); + const float* v_scale_ptr = reinterpret_cast(v_scale.data_ptr()); const int max_ctx_blocks = 
DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE); const int max_num_partitions = @@ -1086,8 +1088,9 @@ void paged_attention( torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& context_lens, // [num_seqs] int64_t block_size, int64_t max_context_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale) { + const std::optional& alibi_slopes, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale) { const int head_size = query.size(2); if (kv_cache_dtype == "auto") { if (query.dtype() == at::ScalarType::Half) { diff --git a/csrc/rocm/ops.h b/csrc/rocm/ops.h index 9f085115a3..ba16195177 100644 --- a/csrc/rocm/ops.h +++ b/csrc/rocm/ops.h @@ -9,6 +9,6 @@ void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums, double scale, torch::Tensor& block_tables, torch::Tensor& context_lens, int64_t block_size, int64_t max_context_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, - double v_scale); + const std::optional& alibi_slopes, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale); diff --git a/csrc/rocm/torch_bindings.cpp b/csrc/rocm/torch_bindings.cpp index a283d4263d..a5d2e2f97a 100644 --- a/csrc/rocm/torch_bindings.cpp +++ b/csrc/rocm/torch_bindings.cpp @@ -27,7 +27,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) { " int max_context_len," " Tensor? 
alibi_slopes," " str kv_cache_dtype," - " float k_scale, float v_scale) -> ()"); + " Tensor k_scale, Tensor v_scale) -> ()"); rocm_ops.impl("paged_attention", torch::kCUDA, &paged_attention); } diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu index 6223dc8cca..5a1879787c 100644 --- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu +++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu @@ -286,7 +286,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& bt_meta, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu index d464b045b8..371de0950b 100644 --- a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu +++ b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu @@ -22,7 +22,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& e, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); #endif void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a, @@ -30,7 +30,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& bt_meta, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { // Checks for conformality TORCH_CHECK(a.dim() == 2 && bt_nzs.dim() == 2 && c.dim() == 2); TORCH_CHECK(c.size(1) == bt_nzs.size(0) && bt_nzs.size(1) * 2 == a.size(1) && diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 1865b4d8f2..5f78ac0393 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -30,7 +30,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, 
ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? alibi_slopes," - " str kv_cache_dtype, float k_scale, float v_scale," + " str kv_cache_dtype, Tensor k_scale, Tensor v_scale," " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); @@ -44,7 +44,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? alibi_slopes," - " str kv_cache_dtype, float k_scale, float v_scale," + " str kv_cache_dtype, Tensor k_scale, Tensor v_scale," " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); @@ -55,6 +55,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()"); ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul); + ops.def("mul_and_silu(Tensor! out, Tensor input) -> ()"); + ops.impl("mul_and_silu", torch::kCUDA, &mul_and_silu); + // Activation function used in GeGLU with `none` approximation. ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()"); ops.impl("gelu_and_mul", torch::kCUDA, &gelu_and_mul); @@ -446,7 +449,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor! key_cache, Tensor! value_cache," " Tensor slot_mapping," " str kv_cache_dtype," - " float k_scale, float v_scale) -> ()"); + " Tensor k_scale, Tensor v_scale) -> ()"); cache_ops.impl("reshape_and_cache", torch::kCUDA, &reshape_and_cache); // Reshape the key and value tensors and cache them. @@ -456,7 +459,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor! 
value_cache," " Tensor slot_mapping," " str kv_cache_dtype," - " float k_scale, float v_scale) -> ()"); + " Tensor k_scale, Tensor v_scale) -> ()"); cache_ops.impl("reshape_and_cache_flash", torch::kCUDA, &reshape_and_cache_flash); diff --git a/docs/Makefile b/docs/Makefile index d0c3cbf102..5b801f79d1 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -18,3 +18,7 @@ help: # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +clean: + @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + rm -rf "$(SOURCEDIR)/getting_started/examples" diff --git a/docs/README.md b/docs/README.md index 46488c9bb0..1a44c1341f 100644 --- a/docs/README.md +++ b/docs/README.md @@ -16,4 +16,5 @@ make html ```bash python -m http.server -d build/html/ ``` + Launch your browser and open localhost:8000. diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 25a700033c..8217bc3ba3 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -3,6 +3,8 @@ sphinx-book-theme==1.0.1 sphinx-copybutton==0.5.2 myst-parser==3.0.1 sphinx-argparse==0.4.0 +sphinx-design==0.6.1 +sphinx-togglebutton==0.3.2 msgspec cloudpickle diff --git a/docs/source/dev/engine/async_llm_engine.md b/docs/source/api/engine/async_llm_engine.md similarity index 100% rename from docs/source/dev/engine/async_llm_engine.md rename to docs/source/api/engine/async_llm_engine.md diff --git a/docs/source/dev/engine/engine_index.md b/docs/source/api/engine/index.md similarity index 100% rename from docs/source/dev/engine/engine_index.md rename to docs/source/api/engine/index.md diff --git a/docs/source/dev/engine/llm_engine.md b/docs/source/api/engine/llm_engine.md similarity index 100% rename from docs/source/dev/engine/llm_engine.md rename to docs/source/api/engine/llm_engine.md diff --git a/docs/source/api/inference_params.md b/docs/source/api/inference_params.md new file mode 
100644 index 0000000000..181c30cab9 --- /dev/null +++ b/docs/source/api/inference_params.md @@ -0,0 +1,21 @@ +# Inference Parameters + +Inference parameters for vLLM APIs. + +(sampling-params)= + +## Sampling Parameters + +```{eval-rst} +.. autoclass:: vllm.SamplingParams + :members: +``` + +(pooling-params)= + +## Pooling Parameters + +```{eval-rst} +.. autoclass:: vllm.PoolingParams + :members: +``` diff --git a/docs/source/api/model/adapters.md b/docs/source/api/model/adapters.md new file mode 100644 index 0000000000..e103a51d00 --- /dev/null +++ b/docs/source/api/model/adapters.md @@ -0,0 +1,9 @@ +# Model Adapters + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.model_executor.models.adapters + :members: + :member-order: bysource +``` diff --git a/docs/source/api/model/index.md b/docs/source/api/model/index.md new file mode 100644 index 0000000000..113792147b --- /dev/null +++ b/docs/source/api/model/index.md @@ -0,0 +1,11 @@ +# Model Development + +## Submodules + +```{toctree} +:maxdepth: 1 + +interfaces_base +interfaces +adapters +``` diff --git a/docs/source/api/model/interfaces.md b/docs/source/api/model/interfaces.md new file mode 100644 index 0000000000..55bee57f64 --- /dev/null +++ b/docs/source/api/model/interfaces.md @@ -0,0 +1,9 @@ +# Optional Interfaces + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.model_executor.models.interfaces + :members: + :member-order: bysource +``` diff --git a/docs/source/api/model/interfaces_base.md b/docs/source/api/model/interfaces_base.md new file mode 100644 index 0000000000..75d58d3422 --- /dev/null +++ b/docs/source/api/model/interfaces_base.md @@ -0,0 +1,9 @@ +# Base Model Interfaces + +## Module Contents + +```{eval-rst} +.. 
automodule:: vllm.model_executor.models.interfaces_base + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md new file mode 100644 index 0000000000..14efdb506d --- /dev/null +++ b/docs/source/api/multimodal/index.md @@ -0,0 +1,28 @@ +(multi-modality)= + +# Multi-Modality + +vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. + +Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) +via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. + +Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal). + +## Module Contents + +```{eval-rst} +.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY +``` + +## Submodules + +```{toctree} +:maxdepth: 1 + +inputs +parse +processing +profiling +registry +``` diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md new file mode 100644 index 0000000000..21bd938be9 --- /dev/null +++ b/docs/source/api/multimodal/inputs.md @@ -0,0 +1,49 @@ +# Input Definitions + +## User-facing inputs + +```{eval-rst} +.. autodata:: vllm.multimodal.inputs.MultiModalDataDict +``` + +## Internal data structures + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.PlaceholderRange + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autodata:: vllm.multimodal.inputs.NestedTensors +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs + :members: + :show-inheritance: +``` + +```{eval-rst} +.. 
autoclass:: vllm.multimodal.inputs.MultiModalInputs + :members: + :show-inheritance: +``` diff --git a/docs/source/api/multimodal/parse.md b/docs/source/api/multimodal/parse.md new file mode 100644 index 0000000000..4676139efe --- /dev/null +++ b/docs/source/api/multimodal/parse.md @@ -0,0 +1,9 @@ +# Data Parsing + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.parse + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/processing.md b/docs/source/api/multimodal/processing.md new file mode 100644 index 0000000000..0d81c8d396 --- /dev/null +++ b/docs/source/api/multimodal/processing.md @@ -0,0 +1,9 @@ +# Data Processing + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.processing + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/profiling.md b/docs/source/api/multimodal/profiling.md new file mode 100644 index 0000000000..b455145212 --- /dev/null +++ b/docs/source/api/multimodal/profiling.md @@ -0,0 +1,9 @@ +# Memory Profiling + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.profiling + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/registry.md b/docs/source/api/multimodal/registry.md new file mode 100644 index 0000000000..0737a4385c --- /dev/null +++ b/docs/source/api/multimodal/registry.md @@ -0,0 +1,9 @@ +# Registry + +## Module Contents + +```{eval-rst} +.. 
automodule:: vllm.multimodal.registry + :members: + :member-order: bysource +``` diff --git a/docs/source/dev/offline_inference/offline_index.md b/docs/source/api/offline_inference/index.md similarity index 77% rename from docs/source/dev/offline_inference/offline_index.md rename to docs/source/api/offline_inference/index.md index 318a02d8c7..c32f99d59e 100644 --- a/docs/source/dev/offline_inference/offline_index.md +++ b/docs/source/api/offline_inference/index.md @@ -1,6 +1,7 @@ # Offline Inference ```{toctree} +:caption: Contents :maxdepth: 1 llm diff --git a/docs/source/dev/offline_inference/llm.md b/docs/source/api/offline_inference/llm.md similarity index 100% rename from docs/source/dev/offline_inference/llm.md rename to docs/source/api/offline_inference/llm.md diff --git a/docs/source/dev/offline_inference/llm_inputs.md b/docs/source/api/offline_inference/llm_inputs.md similarity index 100% rename from docs/source/dev/offline_inference/llm_inputs.md rename to docs/source/api/offline_inference/llm_inputs.md diff --git a/docs/source/assets/dev/dockerfile-stages-dependency.png b/docs/source/assets/contributing/dockerfile-stages-dependency.png similarity index 100% rename from docs/source/assets/dev/dockerfile-stages-dependency.png rename to docs/source/assets/contributing/dockerfile-stages-dependency.png diff --git a/docs/source/serving/architecture_helm_deployment.png b/docs/source/assets/deployment/architecture_helm_deployment.png similarity index 100% rename from docs/source/serving/architecture_helm_deployment.png rename to docs/source/assets/deployment/architecture_helm_deployment.png diff --git a/docs/source/assets/usage/disagg_prefill/abstraction.jpg b/docs/source/assets/features/disagg_prefill/abstraction.jpg similarity index 100% rename from docs/source/assets/usage/disagg_prefill/abstraction.jpg rename to docs/source/assets/features/disagg_prefill/abstraction.jpg diff --git a/docs/source/assets/usage/disagg_prefill/overview.jpg 
b/docs/source/assets/features/disagg_prefill/overview.jpg similarity index 100% rename from docs/source/assets/usage/disagg_prefill/overview.jpg rename to docs/source/assets/features/disagg_prefill/overview.jpg diff --git a/docs/source/community/meetups.md b/docs/source/community/meetups.md index 43fa9ee616..ab5ea147f4 100644 --- a/docs/source/community/meetups.md +++ b/docs/source/community/meetups.md @@ -4,6 +4,7 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing) - [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing) - [The sixth vLLM meetup](https://lu.ma/87q3nvnh), with NVIDIA, September 9th 2024. [[Slides]](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing) - [The fifth vLLM meetup](https://lu.ma/lp0gyjqr), with AWS, July 24th 2024. [[Slides]](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing) diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md index c6f83b3a92..fb93e65673 100644 --- a/docs/source/community/sponsors.md +++ b/docs/source/community/sponsors.md @@ -5,26 +5,34 @@ vLLM is a community project. 
Our compute resources for development and testing a +Cash Donations: + - a16z +- Dropbox +- Sequoia Capital +- Skywork AI +- ZhenFund + +Compute Resources: + - AMD - Anyscale - AWS - Crusoe Cloud - Databricks - DeepInfra -- Dropbox - Google Cloud - Lambda Lab - Nebius +- Novita AI - NVIDIA - Replicate - Roblox - RunPod -- Sequoia Capital -- Skywork AI - Trainy - UC Berkeley - UC San Diego -- ZhenFund + +Slack Sponsor: Anyscale We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. diff --git a/docs/source/conf.py b/docs/source/conf.py index 71394c5302..7aa52db092 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -43,6 +43,11 @@ extensions = [ "sphinx.ext.autosummary", "myst_parser", "sphinxarg.ext", + "sphinx_design", + "sphinx_togglebutton", +] +myst_enable_extensions = [ + "colon_fence", ] # Add any paths that contain templates here, relative to this directory. @@ -51,7 +56,7 @@ templates_path = ['_templates'] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns: List[str] = ["**/*.template.md"] +exclude_patterns: List[str] = ["**/*.template.md", "**/*.inc.md"] # Exclude the prompt "$" when copying code copybutton_prompt_text = r"\$ " diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md index 7ffec83333..cb142318b8 100644 --- a/docs/source/contributing/dockerfile/dockerfile.md +++ b/docs/source/contributing/dockerfile/dockerfile.md @@ -1,7 +1,7 @@ # Dockerfile We provide a to construct the image for running an OpenAI compatible server with vLLM. -More information about deploying with Docker can be found [here](../../serving/deploying_with_docker.md). 
+More information about deploying with Docker can be found [here](#deployment-docker). Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: @@ -17,7 +17,7 @@ The edges of the build graph represent: - `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head) - > ```{figure} ../../assets/dev/dockerfile-stages-dependency.png + > ```{figure} /assets/contributing/dockerfile-stages-dependency.png > :align: center > :alt: query > :width: 100% diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md new file mode 100644 index 0000000000..b9b92fd027 --- /dev/null +++ b/docs/source/contributing/model/basic.md @@ -0,0 +1,125 @@ +(new-model-basic)= + +# Implementing a Basic Model + +This guide walks you through the steps to implement a basic vLLM model. + +## 1. Bring your model code + +First, clone the PyTorch model code from the source repository. +For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from +HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. + +```{warning} +Make sure to review and adhere to the original code's copyright and licensing terms! +``` + +## 2. Make your code compatible with vLLM + +To ensure compatibility with vLLM, your model must meet the following requirements: + +### Initialization Code + +All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for: + +- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts. 
+- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode. + +The initialization code should look like this: + +```python +from torch import nn +from vllm.config import VllmConfig +from vllm.attention import Attention + +class MyAttention(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.attn = Attention(prefix=f"{prefix}.attn") + +class MyDecoderLayer(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.self_attn = MyAttention(prefix=f"{prefix}.self_attn") + +class MyModel(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.layers = nn.ModuleList( + [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)] + ) + +class MyModelForCausalLM(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.model = MyModel(vllm_config, prefix=f"{prefix}.model") +``` + +### Computation Code + +- Add a `get_input_embeddings` method inside `MyModel` module that returns the text embeddings given `input_ids`. This is equivalent to directly calling the text embedding layer, but provides a unified interface in case `MyModel` is used within a composite multimodal model. + +```python +class MyModel(nn.Module): + ... + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + ... +``` + +- Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. 
Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. + +```python +def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, +) -> torch.Tensor: + ... +``` + +```{note} +Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. +If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. +``` + +For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out [the models directory](gh-dir:vllm/model_executor/models) for more examples. + +## 3. (Optional) Implement tensor parallelism and quantization support + +If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. +To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. +For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with `VocabParallelEmbedding`. For the output LM head, you can use `ParallelLMHead`. +When it comes to the linear layers, we provide the following options to parallelize them: + +- `ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving. +- `RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer. +- `ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension).
The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer. +- `MergedColumnParallelLinear`: Column-parallel linear that merges multiple `ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices. +- `QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When the number of key/value heads is less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices. + +Note that all the linear layers above take `linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization. + +## 4. Implement the weight loading logic + +You now need to implement the `load_weights` method in your `*ForCausalLM` class. +This method should load the weights from the HuggingFace checkpoint file and assign them to the corresponding layers in your model. Specifically, for `MergedColumnParallelLinear` and `QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately. + +## 5. Register your model + +See [this page](#new-model-registration) for instructions on how to register your new model to be used by vLLM. + +## Frequently Asked Questions + +### How to support models with interleaving sliding windows? + +For models with interleaving sliding windows (e.g. `google/gemma-2-2b-it` and `mistralai/Ministral-8B-Instruct-2410`), the scheduler will treat the model as a full-attention model, i.e., kv-cache of all tokens will not be dropped. This is to make sure prefix caching works with these models.
Sliding window only appears as a parameter to the attention kernel computation. + +To support a model with interleaving sliding windows, we need to take care of the following details: + +- Make sure [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/config.py#L308) evaluates `has_interleaved_attention` to `True` for this model, and set `self.hf_text_config.interleaved_sliding_window` to the format of interleaving sliding windows the model can understand. Then, `self.hf_text_config.sliding_window` will be deleted, and the model will be treated as a full-attention model. +- In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171). + +With these two steps, interleaving sliding windows should work with the model. diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md new file mode 100644 index 0000000000..fe018b61b0 --- /dev/null +++ b/docs/source/contributing/model/index.md @@ -0,0 +1,27 @@ +(new-model)= + +# Adding a New Model + +This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM. + +```{toctree} +:caption: Contents +:maxdepth: 1 + +basic +registration +tests +multimodal +``` + +```{note} +The complexity of adding a new model depends heavily on the model's architecture. +The process is fairly straightforward if the model shares a similar architecture with an existing model in vLLM. +However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
+``` + +```{tip} +If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues) +or ask on our [developer slack](https://slack.vllm.ai). +We will be happy to help you out! +``` diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md new file mode 100644 index 0000000000..e5fd9a2877 --- /dev/null +++ b/docs/source/contributing/model/multimodal.md @@ -0,0 +1,452 @@ +(supports-multimodal)= + +# Multi-Modal Support + +This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs). + +## 1. Update the base vLLM model + +It is assumed that you have already implemented the model in vLLM according to [these steps](#new-model-basic). +Further update the model as follows: + +- Reserve a keyword parameter in {meth}`~torch.nn.Module.forward` for each input tensor that corresponds to a multi-modal input, as shown in the following example: + + ```diff + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + + pixel_values: torch.Tensor, + ) -> SamplerOutput: + ``` + + More conveniently, you can simply pass `**kwargs` to the {meth}`~torch.nn.Module.forward` method and retrieve the keyword parameters for multimodal inputs from it. + +- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings` that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs. + + ```python + class YourModelForImage2Seq(nn.Module): + ... 
+ + def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: + + assert self.vision_encoder is not None + image_features = self.vision_encoder(image_input) + return self.multi_modal_projector(image_features) + + def get_multimodal_embeddings(self, **kwargs: object) -> Optional[NestedTensors]: + + # Validate the multimodal input keyword arguments + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + + # Run multimodal inputs through encoder and projector + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + ``` + + ```{important} + The returned `multimodal_embeddings` must be either a **3D {class}`torch.Tensor`** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D {class}`torch.Tensor`'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request. + ``` + +- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings` to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings. + + ```python + from .utils import merge_multimodal_embeddings + + class YourModelForImage2Seq(nn.Module): + ... + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + + # `get_input_embeddings` should already be implemented for the language + # model as one of the requirements of basic vLLM model implementation. 
+ inputs_embeds = self.language_model.get_input_embeddings(input_ids) + + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + placeholder_token_id=self.config.image_token_index) + + return inputs_embeds + ``` + +- Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. + + ```diff + + from vllm.model_executor.models.interfaces import SupportsMultiModal + + - class YourModelForImage2Seq(nn.Module): + + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): + ``` + + ```{note} + The model class does not have to be named {code}`*ForCausalLM`. + Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples. + ``` + +## 2. Specify processing information + +Next, create a subclass of {class}`~vllm.multimodal.processing.BaseProcessingInfo` +to provide basic information related to HF processing. + +### Maximum number of input items + +You need to override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_supported_mm_limits` +to return the maximum number of input items for each modality supported by the model. + +For example, if the model supports any number of images but only one video per prompt: + +```python +def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": 1} +``` + +### Maximum number of placeholder feature tokens + +Also, override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item` +to return the maximum number of placeholder feature tokens per input item for each modality. + +When calling the model, the output embeddings from the visual encoder are assigned to the input positions +containing placeholder feature tokens. 
Therefore, the number of placeholder feature tokens should be equal +to the size of the output embeddings. + +::::{tab-set} +:::{tab-item} Basic example: LLaVA +:sync: llava + +Looking at the code of HF's `LlavaForConditionalGeneration`: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544 +n_image_tokens = (input_ids == self.config.image_token_index).sum().item() +n_image_features = image_features.shape[0] * image_features.shape[1] + +if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) +special_image_mask = ( + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) +) +image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) +inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) +``` + +The number of placeholder feature tokens per image is `image_features.shape[1]`. 
+`image_features` is calculated inside the `get_image_features` method: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300 +image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + +selected_image_feature = image_outputs.hidden_states[vision_feature_layer] +if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] +elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature +else: + raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") +image_features = self.multi_modal_projector(selected_image_feature) +return image_features +``` + +We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower +(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model). +Moreover, we only need the sequence length (the second dimension of the tensor) to get `image_features.shape[1]`. +The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention +mechanism doesn't change the sequence length of the output hidden states. 
+ +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102 +hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) +hidden_states = self.pre_layrnorm(hidden_states) + +encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, +) +``` + +To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257 +target_dtype = self.patch_embedding.weight.dtype +patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] +patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + +class_embeds = self.class_embedding.expand(batch_size, 1, -1) +embeddings = torch.cat([class_embeds, patch_embeds], dim=1) +if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) +else: + embeddings = embeddings + self.position_embedding(self.position_ids) +return embeddings +``` + +We can infer that `embeddings.shape[1] == self.num_positions`, where + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196 +self.num_patches = (self.image_size // self.patch_size) ** 2 +self.num_positions = self.num_patches + 1 +``` + +Overall, the number of placeholder feature tokens for an image can be calculated as: + +```python +def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, +) -> int: + hf_config = self.get_hf_config() + hf_processor = self.get_hf_processor() + + image_size = hf_config.vision_config.image_size + patch_size = hf_config.vision_config.patch_size + + num_image_tokens = (image_size // patch_size) ** 2 + 1 + if 
hf_processor.vision_feature_select_strategy == "default": + num_image_tokens -= 1 + + return num_image_tokens +``` + +Notice that the number of image tokens doesn't depend on the image width and height. +So, we can calculate the maximum number of image tokens using any image size: + +```python +def get_image_size_with_most_features(self) -> ImageSize: + hf_config = self.get_hf_config() + width = height = hf_config.image_size + return ImageSize(width=width, height=height) + +def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + ) +``` + +And thus, we can override the method as: + +```python +def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return {"image": self.get_max_image_tokens()} +``` + +```{note} +Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP. +``` + +::: +:::: + +## 3. Specify dummy inputs + +Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for +HF processing as well as memory profiling. + +### For memory profiling + +Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs` +to construct dummy inputs for memory profiling. This dummy input should result in the worst-case memory usage of +the model so that vLLM can reserve the correct amount of memory for it. + +Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed based +on the code for {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`. 
+ +::::{tab-set} +:::{tab-item} Basic example: LLaVA +:sync: llava +Making use of the `get_image_size_with_most_features` method implemented in the previous section: + +```python +def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], +) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + + processor = self.info.get_hf_processor() + image_token = processor.image_token + + hf_config = self.get_hf_config() + target_width, target_height = self.info.get_image_size_with_most_features() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text=image_token * num_images, + mm_data=mm_data, + ) +``` + +::: +:::: + +## 4. Specify processing details + +Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` +to fill in the missing details about HF processing. + +```{seealso} +[Multi-Modal Data Processing](#mm-processing) +``` + +### Multi-modal fields + +Override {class}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` to +return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items. 
+ +::::{tab-set} +:::{tab-item} Basic example: LLaVA +:sync: llava + +Looking at the model's `forward` method: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L387-L404 +def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[int] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, +) -> Union[Tuple, LlavaCausalLMOutputWithPast]: +``` + +The only related keyword argument is `pixel_values` which directly corresponds to input images. +The shape of `pixel_values` is `(N, C, H, W)` where `N` is the number of images. +So, we override the method as follows: + +```python +def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], +) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + ) +``` + +```{note} +Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports +pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument. +``` + +::: +:::: + +### Prompt replacements + +Override {class}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements` to +return a list of {class}`~vllm.multimodal.processing.PromptReplacement` instances.
+ +Each {class}`~vllm.multimodal.processing.PromptReplacement` instance specifies a find-and-replace +operation performed by the HF processor. + +::::{tab-set} +:::{tab-item} Basic example: LLaVA +:sync: llava + +Looking at HF's `LlavaProcessor`: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170 +prompt_strings = [] +for sample in text: + sample = sample.replace(self.image_token, self.image_token * num_image_tokens) + prompt_strings.append(sample) +``` + +It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`). +Based on this, we override the method as follows: + +```python +def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, +) -> list[PromptReplacement]: + hf_config = self.info.get_hf_config() + image_token_id = hf_config.image_token_index + + def get_replacement(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + + image_size = images.get_image_size(item_idx) + num_image_tokens = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) + + return [image_token_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=get_replacement, + ), + ] +``` + +::: +:::: + +## 5. 
Register processor-related classes + +After you have defined {class}`~vllm.multimodal.processing.BaseProcessingInfo` (Step 2), +{class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` (Step 3), +and {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` (Step 4), +decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_processor ` +to register them to the multi-modal registry: + +```diff + from vllm.model_executor.models.interfaces import SupportsMultiModal ++ from vllm.multimodal import MULTIMODAL_REGISTRY + ++ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor, ++ info=YourProcessingInfo, ++ dummy_inputs=YourDummyInputsBuilder) + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +``` diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md new file mode 100644 index 0000000000..d6c9e4181d --- /dev/null +++ b/docs/source/contributing/model/registration.md @@ -0,0 +1,55 @@ +(new-model-registration)= + +# Registering a Model to vLLM + +vLLM relies on a model registry to determine how to run each model. +A list of pre-registered architectures can be found [here](#supported-models). + +If your model is not on this list, you must register it to vLLM. +This page provides detailed instructions on how to do so. + +## Built-in models + +To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](#build-from-source). +This gives you the ability to modify the codebase and test your model. + +After you have implemented your model (see [tutorial](#new-model-basic)), put it into the directory. +Then, add your model class to `_VLLM_MODELS` in so that it is automatically registered upon importing vLLM. +Finally, update our [list of supported models](#supported-models) to promote your model! + +```{important} +The list of models in each section should be maintained in alphabetical order. 
+``` + +## Out-of-tree models + +You can load an external model using a plugin without modifying the vLLM codebase. + +```{seealso} +[vLLM's Plugin System](#plugin-system) +``` + +To register the model, use the following code: + +```python +from vllm import ModelRegistry +from your_code import YourModelForCausalLM +ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) +``` + +If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`: + +```python +from vllm import ModelRegistry + +ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") +``` + +```{important} +If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. +Read more about that [here](#supports-multimodal). +``` + +```{note} +Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. +``` diff --git a/docs/source/contributing/model/tests.md b/docs/source/contributing/model/tests.md new file mode 100644 index 0000000000..74c933b2f4 --- /dev/null +++ b/docs/source/contributing/model/tests.md @@ -0,0 +1,63 @@ +(new-model-tests)= + +# Writing Unit Tests + +This page explains how to write unit tests to verify the implementation of your model. + +## Required Tests + +These tests are necessary to get your PR merged into vLLM library. +Without them, the CI for your PR will fail. + +### Model loading + +Include an example HuggingFace repository for your model in . +This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM. + +```{important} +The list of models in each section should be maintained in alphabetical order. 
+``` + +```{tip} +If your model requires a development version of HF Transformers, you can set +`min_transformers_version` to skip the test in CI until the model is released. +``` + +## Optional Tests + +These tests are optional to get your PR merged into vLLM library. +Passing these tests provides more confidence that your implementation is correct, and helps avoid future regressions. + +### Model correctness + +These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of . + +#### Generative models + +For [generative models](#generative-models), there are two levels of correctness tests, as defined in : + +- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF. +- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa. + +#### Pooling models + +For [pooling models](#pooling-models), we simply check the cosine similarity, as defined in . + +(mm-processing-tests)= + +### Multi-modal processing + +#### Common tests + +Adding your model to verifies that the following input combinations result in the same outputs: + +- Text + multi-modal data +- Tokens + multi-modal data +- Text + cached multi-modal data +- Tokens + cached multi-modal data + +#### Model-specific tests + +You can add a new file under to run tests that only apply to your model. + +For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in . 
diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index c960790f47..36cf8e7440 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -25,10 +25,12 @@ Check out the [building from source](#build-from-source) documentation for detai ```bash pip install -r requirements-dev.txt -# linting and formatting -bash format.sh -# Static type checking -mypy +# Linting, formatting and static type checking +pre-commit install + +# You can manually run pre-commit with +pre-commit run --all-files + # Unit tests pytest tests/ ``` @@ -37,8 +39,6 @@ pytest tests/ Currently, the repository is not fully checked by `mypy`. ``` -# Contribution Guidelines - ## Issues If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. @@ -90,7 +90,8 @@ If the PR spans more than one category, please include all relevant prefixes. The PR needs to meet the following code quality standards: - We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). -- Pass all linter checks. Please use to format your code. +- Pass all linter checks. Please use `pre-commit` to format your code. See + if `pre-commit` is new to you. - The code needs to be well-documented to ensure future contributors can easily understand the code. - Include sufficient tests to ensure the project stays correct and robust. 
This diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md index 46210957c1..001db86bdf 100644 --- a/docs/source/contributing/profiling/profiling_index.md +++ b/docs/source/contributing/profiling/profiling_index.md @@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve ### Offline Inference -Refer to for an example. +Refer to for an example. ### OpenAI Server diff --git a/docs/source/contributing/vulnerability_management.md b/docs/source/contributing/vulnerability_management.md new file mode 100644 index 0000000000..a9bbfde2af --- /dev/null +++ b/docs/source/contributing/vulnerability_management.md @@ -0,0 +1,60 @@ +# Vulnerability Management + +## Reporting Vulnerabilities + +As mentioned in the [security +policy](https://github.com/vllm-project/vllm/tree/main/SECURITY.md), security +vulnerabilities may be reported privately to the project via +[GitHub](https://github.com/vllm-project/vllm/security/advisories/new). + +## Vulnerability Management Team + +Once a vulnerability has been reported to the project, the Vulnerability +Management Team (VMT) is responsible for managing the vulnerability. The VMT is +responsible for: + +- Triaging the vulnerability. +- Coordinating with reporters and project maintainers on vulnerability analysis + and resolution. +- Drafting of security advisories for confirmed vulnerabilities, as appropriate. +- Coordination with project maintainers on a coordinated release of the fix and + security advisory. + +### Security Advisories + +Advisories are published via GitHub through the same system used to report +vulnerabilities. More information on the process can be found in the [GitHub +documentation](https://docs.github.com/en/code-security/security-advisories/working-with-repository-security-advisories/about-repository-security-advisories). 
+ +### Team Members + +We prefer to keep all vulnerability-related communication on the security report +on GitHub. However, if you need to contact the VMT directly for an urgent issue, +you may contact the following individuals: + +- Simon Mo - simon.mo@hey.com +- Russell Bryant - rbryant@redhat.com + +## Slack Discussion + +You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai) +to discuss security-related topics. However, please do not disclose any +vulnerabilities in this channel. If you need to report a vulnerability, please +use the GitHub security advisory system or contact a VMT member privately. + +## Vulnerability Disclosure + +The process for disclosing vulnerabilities is the following: + +- The VMT will work with the project maintainers to develop a fix for the + vulnerability. +- The VMT will coordinate with the reporter and project maintainers to prepare a + security advisory that adequately describes the vulnerability and its impact. +- The VMT will coordinate with the project maintainers to publish a fix and + release an update that includes that fix. +- The VMT will publish the security advisory on GitHub. Release notes will be + updated to include a reference to the security advisory. + +The VMT and project maintainers will work to minimize the amount of time in +between disclosing any public information about the vulnerability and making a +release and advisory available. 
diff --git a/docs/source/serving/deploying_with_docker.md b/docs/source/deployment/docker.md similarity index 84% rename from docs/source/serving/deploying_with_docker.md rename to docs/source/deployment/docker.md index 844bd27800..438be47316 100644 --- a/docs/source/serving/deploying_with_docker.md +++ b/docs/source/deployment/docker.md @@ -1,6 +1,8 @@ -(deploying-with-docker)= +(deployment-docker)= -# Deploying with Docker +# Using Docker + +(deployment-docker-pre-built-image)= ## Use vLLM's Official Docker Image @@ -17,25 +19,32 @@ $ docker run --runtime nvidia --gpus all \ --model mistralai/Mistral-7B-v0.1 ``` +You can add any other you need after the image tag (`vllm/vllm-openai:latest`). + ```{note} You can either use the `ipc=host` flag or `--shm-size` flag to allow the container to access the host's shared memory. vLLM uses PyTorch, which uses shared memory to share data between processes under the hood, particularly for tensor parallel inference. ``` +(deployment-docker-build-image-from-source)= + ## Building vLLM's Docker Image from Source You can build and run vLLM from source via the provided . To build vLLM: ```console -$ # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 -$ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai +# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 +DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai ``` ```{note} By default vLLM will build for all GPU types for widest distribution. If you are just building for the current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""` for vLLM to find the current GPU type and build for that. 
+ +If you are using Podman instead of Docker, you might need to disable SELinux labeling by +adding `--security-opt label=disable` when running the `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184). ``` ## Building for Arm64/aarch64 diff --git a/docs/source/serving/deploying_with_bentoml.md b/docs/source/deployment/frameworks/bentoml.md similarity index 71% rename from docs/source/serving/deploying_with_bentoml.md rename to docs/source/deployment/frameworks/bentoml.md index dfa0de4f0f..2bf435bda8 100644 --- a/docs/source/serving/deploying_with_bentoml.md +++ b/docs/source/deployment/frameworks/bentoml.md @@ -1,7 +1,7 @@ -(deploying-with-bentoml)= +(deployment-bentoml)= -# Deploying with BentoML +# BentoML -[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes. +[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes. For details, see the tutorial [vLLM inference in the BentoML documentation](https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html).
diff --git a/docs/source/serving/deploying_with_cerebrium.md b/docs/source/deployment/frameworks/cerebrium.md similarity index 93% rename from docs/source/serving/deploying_with_cerebrium.md rename to docs/source/deployment/frameworks/cerebrium.md index 950064c8c1..5787c4a407 100644 --- a/docs/source/serving/deploying_with_cerebrium.md +++ b/docs/source/deployment/frameworks/cerebrium.md @@ -1,6 +1,6 @@ -(deploying-with-cerebrium)= +(deployment-cerebrium)= -# Deploying with Cerebrium +# Cerebrium ```{raw} html

@@ -13,14 +13,14 @@ vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebr To install the Cerebrium client, run: ```console -$ pip install cerebrium -$ cerebrium login +pip install cerebrium +cerebrium login ``` Next, create your Cerebrium project, run: ```console -$ cerebrium init vllm-project +cerebrium init vllm-project ``` Next, to install the required packages, add the following to your cerebrium.toml: @@ -58,10 +58,10 @@ def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): Then, run the following code to deploy it to the cloud: ```console -$ cerebrium deploy +cerebrium deploy ``` -If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case` /run`) +If successful, you should be returned a CURL command that you can call inference against. Just remember to end the URL with the function name you are calling (in our case `/run`) ```python curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ diff --git a/docs/source/serving/deploying_with_dstack.md b/docs/source/deployment/frameworks/dstack.md similarity index 95% rename from docs/source/serving/deploying_with_dstack.md rename to docs/source/deployment/frameworks/dstack.md index 381f5f786c..b42a34125c 100644 --- a/docs/source/serving/deploying_with_dstack.md +++ b/docs/source/deployment/frameworks/dstack.md @@ -1,6 +1,6 @@ -(deploying-with-dstack)= +(deployment-dstack)= -# Deploying with dstack +# dstack ```{raw} html

@@ -13,16 +13,16 @@ vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), To install dstack client, run: ```console -$ pip install "dstack[all] -$ dstack server +pip install "dstack[all]" +dstack server ``` Next, to configure your dstack project, run: ```console -$ mkdir -p vllm-dstack -$ cd vllm-dstack -$ dstack init +mkdir -p vllm-dstack +cd vllm-dstack +dstack init ``` Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: diff --git a/docs/source/serving/deploying_with_helm.md b/docs/source/deployment/frameworks/helm.md similarity index 98% rename from docs/source/serving/deploying_with_helm.md rename to docs/source/deployment/frameworks/helm.md index 7286a0a889..18ed293191 100644 --- a/docs/source/serving/deploying_with_helm.md +++ b/docs/source/deployment/frameworks/helm.md @@ -1,6 +1,6 @@ -(deploying-with-helm)= +(deployment-helm)= -# Deploying with Helm +# Helm A Helm chart to deploy vLLM for Kubernetes @@ -38,7 +38,7 @@ chart **including persistent volumes** and deletes the release.
## Architecture -```{image} architecture_helm_deployment.png +```{image} /assets/deployment/architecture_helm_deployment.png ``` ## Values diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md new file mode 100644 index 0000000000..964782763f --- /dev/null +++ b/docs/source/deployment/frameworks/index.md @@ -0,0 +1,14 @@ +# Using other frameworks + +```{toctree} +:maxdepth: 1 + +bentoml +cerebrium +dstack +helm +lws +modal +skypilot +triton +``` diff --git a/docs/source/serving/deploying_with_lws.md b/docs/source/deployment/frameworks/lws.md similarity index 91% rename from docs/source/serving/deploying_with_lws.md rename to docs/source/deployment/frameworks/lws.md index 22bab419ea..349fa83fbc 100644 --- a/docs/source/serving/deploying_with_lws.md +++ b/docs/source/deployment/frameworks/lws.md @@ -1,6 +1,6 @@ -(deploying-with-lws)= +(deployment-lws)= -# Deploying with LWS +# LWS LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. A major use case is for multi-host/multi-node distributed inference. diff --git a/docs/source/deployment/frameworks/modal.md b/docs/source/deployment/frameworks/modal.md new file mode 100644 index 0000000000..e7c42088e3 --- /dev/null +++ b/docs/source/deployment/frameworks/modal.md @@ -0,0 +1,7 @@ +(deployment-modal)= + +# Modal + +vLLM can be run on cloud GPUs with [Modal](https://modal.com), a serverless computing platform designed for fast auto-scaling. + +For details on how to deploy vLLM on Modal, see [this tutorial in the Modal documentation](https://modal.com/docs/examples/vllm_inference). 
diff --git a/docs/source/serving/run_on_sky.md b/docs/source/deployment/frameworks/skypilot.md similarity index 94% rename from docs/source/serving/run_on_sky.md rename to docs/source/deployment/frameworks/skypilot.md index 115873ae49..051fc2f2a8 100644 --- a/docs/source/serving/run_on_sky.md +++ b/docs/source/deployment/frameworks/skypilot.md @@ -1,6 +1,6 @@ -(on-cloud)= +(deployment-skypilot)= -# Deploying and scaling up with SkyPilot +# SkyPilot ```{raw} html

@@ -12,9 +12,9 @@ vLLM can be **run and scaled to multiple service replicas on clouds and Kubernet ## Prerequisites -- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and request access to the model {code}`meta-llama/Meta-Llama-3-8B-Instruct`. +- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and request access to the model `meta-llama/Meta-Llama-3-8B-Instruct`. - Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)). -- Check that {code}`sky check` shows clouds or Kubernetes are enabled. +- Check that `sky check` shows clouds or Kubernetes are enabled. ```console pip install skypilot-nightly @@ -61,7 +61,7 @@ run: | echo 'Starting gradio server...' git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ + python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ -m $MODEL_NAME \ --port 8811 \ --model-url http://localhost:8081/v1 \ @@ -321,7 +321,7 @@ run: | echo 'Starting gradio server...' git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ + python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ -m $MODEL_NAME \ --port 8811 \ --model-url http://$ENDPOINT/v1 \ @@ -334,12 +334,12 @@ run: | 1. Start the chat web UI: -```console -sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) -``` + ```console + sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) + ``` 2. 
Then, we can access the GUI at the returned gradio link: -```console -| INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live -``` + ```console + | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live + ``` diff --git a/docs/source/serving/deploying_with_triton.md b/docs/source/deployment/frameworks/triton.md similarity index 87% rename from docs/source/serving/deploying_with_triton.md rename to docs/source/deployment/frameworks/triton.md index 9b0a6f1d54..94d8712015 100644 --- a/docs/source/serving/deploying_with_triton.md +++ b/docs/source/deployment/frameworks/triton.md @@ -1,5 +1,5 @@ -(deploying-with-triton)= +(deployment-triton)= -# Deploying with NVIDIA Triton +# NVIDIA Triton The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details. 
diff --git a/docs/source/deployment/integrations/index.md b/docs/source/deployment/integrations/index.md new file mode 100644 index 0000000000..d47ede8967 --- /dev/null +++ b/docs/source/deployment/integrations/index.md @@ -0,0 +1,9 @@ +# External Integrations + +```{toctree} +:maxdepth: 1 + +kserve +kubeai +llamastack +``` diff --git a/docs/source/serving/deploying_with_kserve.md b/docs/source/deployment/integrations/kserve.md similarity index 85% rename from docs/source/serving/deploying_with_kserve.md rename to docs/source/deployment/integrations/kserve.md index feaeb5d0ec..c780fd74e8 100644 --- a/docs/source/serving/deploying_with_kserve.md +++ b/docs/source/deployment/integrations/kserve.md @@ -1,6 +1,6 @@ -(deploying-with-kserve)= +(deployment-kserve)= -# Deploying with KServe +# KServe vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving. diff --git a/docs/source/serving/deploying_with_kubeai.md b/docs/source/deployment/integrations/kubeai.md similarity index 93% rename from docs/source/serving/deploying_with_kubeai.md rename to docs/source/deployment/integrations/kubeai.md index 3609d7e05a..2f5772e075 100644 --- a/docs/source/serving/deploying_with_kubeai.md +++ b/docs/source/deployment/integrations/kubeai.md @@ -1,6 +1,6 @@ -(deploying-with-kubeai)= +(deployment-kubeai)= -# Deploying with KubeAI +# KubeAI [KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies. 
diff --git a/docs/source/serving/serving_with_llamastack.md b/docs/source/deployment/integrations/llamastack.md similarity index 92% rename from docs/source/serving/serving_with_llamastack.md rename to docs/source/deployment/integrations/llamastack.md index 71dadca7ad..a6c3569637 100644 --- a/docs/source/serving/serving_with_llamastack.md +++ b/docs/source/deployment/integrations/llamastack.md @@ -1,13 +1,13 @@ -(run-on-llamastack)= +(deployment-llamastack)= -# Serving with Llama Stack +# Llama Stack vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) . To install Llama Stack, run ```console -$ pip install llama-stack -q +pip install llama-stack -q ``` ## Inference using OpenAI Compatible API diff --git a/docs/source/deployment/k8s.md b/docs/source/deployment/k8s.md new file mode 100644 index 0000000000..cbc95c20ff --- /dev/null +++ b/docs/source/deployment/k8s.md @@ -0,0 +1,249 @@ +(deployment-k8s)= + +# Using Kubernetes + +Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing. + +## Prerequisites + +Before you begin, ensure that you have the following: + +- A running Kubernetes cluster +- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at `https://github.com/NVIDIA/k8s-device-plugin/` +- Available GPU resources in your cluster + +## Deployment Steps + +1. 
Create a PVC, Secret and Deployment for vLLM + + PVC is used to store the model cache and it is optional; you can use hostPath or other storage options + + ```yaml + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: mistral-7b + namespace: default + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: default + volumeMode: Filesystem + ``` + + Secret is optional and only required for accessing gated models; you can skip this step if you are not using gated models + + ```yaml + apiVersion: v1 + kind: Secret + metadata: + name: hf-token-secret + namespace: default + type: Opaque + stringData: + token: "REPLACE_WITH_TOKEN" + ``` + + Next, create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. + + Here are two examples for using NVIDIA GPU and AMD GPU. + + NVIDIA GPU: + + ```yaml + apiVersion: apps/v1 + kind: Deployment + metadata: + name: mistral-7b + namespace: default + labels: + app: mistral-7b + spec: + replicas: 1 + selector: + matchLabels: + app: mistral-7b + template: + metadata: + labels: + app: mistral-7b + spec: + volumes: + - name: cache-volume + persistentVolumeClaim: + claimName: mistral-7b + # vLLM needs to access the host's shared memory for tensor parallel inference. 
+ - name: shm + emptyDir: + medium: Memory + sizeLimit: "2Gi" + containers: + - name: mistral-7b + image: vllm/vllm-openai:latest + command: ["/bin/sh", "-c"] + args: [ + "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8000 + resources: + limits: + cpu: "10" + memory: 20G + nvidia.com/gpu: "1" + requests: + cpu: "2" + memory: 6G + nvidia.com/gpu: "1" + volumeMounts: + - mountPath: /root/.cache/huggingface + name: cache-volume + - name: shm + mountPath: /dev/shm + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 5 + ``` + + AMD GPU: + + You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X. + + ```yaml + apiVersion: apps/v1 + kind: Deployment + metadata: + name: mistral-7b + namespace: default + labels: + app: mistral-7b + spec: + replicas: 1 + selector: + matchLabels: + app: mistral-7b + template: + metadata: + labels: + app: mistral-7b + spec: + volumes: + # PVC + - name: cache-volume + persistentVolumeClaim: + claimName: mistral-7b + # vLLM needs to access the host's shared memory for tensor parallel inference. 
+ - name: shm + emptyDir: + medium: Memory + sizeLimit: "8Gi" + hostNetwork: true + hostIPC: true + containers: + - name: mistral-7b + image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 + securityContext: + seccompProfile: + type: Unconfined + runAsGroup: 44 + capabilities: + add: + - SYS_PTRACE + command: ["/bin/sh", "-c"] + args: [ + "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8000 + resources: + limits: + cpu: "10" + memory: 20G + amd.com/gpu: "1" + requests: + cpu: "6" + memory: 6G + amd.com/gpu: "1" + volumeMounts: + - name: cache-volume + mountPath: /root/.cache/huggingface + - name: shm + mountPath: /dev/shm + ``` + + You can get the full example with steps and sample yaml files from . + +2. Create a Kubernetes Service for vLLM + + Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: + + ```yaml + apiVersion: v1 + kind: Service + metadata: + name: mistral-7b + namespace: default + spec: + ports: + - name: http-mistral-7b + port: 80 + protocol: TCP + targetPort: 8000 + # The label selector should match the deployment labels & it is useful for prefix caching feature + selector: + app: mistral-7b + sessionAffinity: None + type: ClusterIP + ``` + +3. 
Deploy and Test + + Apply the deployment and service configurations using `kubectl apply -f <filename>`: + + ```console + kubectl apply -f deployment.yaml + kubectl apply -f service.yaml + ``` + + To test the deployment, run the following `curl` command: + + ```console + curl http://mistral-7b.default.svc.cluster.local/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "mistralai/Mistral-7B-Instruct-v0.3", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' + ``` + + If the service is correctly deployed, you should receive a response from the vLLM model. + +## Conclusion + +Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation. diff --git a/docs/source/serving/deploying_with_nginx.md b/docs/source/deployment/nginx.md similarity index 99% rename from docs/source/serving/deploying_with_nginx.md rename to docs/source/deployment/nginx.md index a1f00d8536..a58f791c29 100644 --- a/docs/source/serving/deploying_with_nginx.md +++ b/docs/source/deployment/nginx.md @@ -1,6 +1,6 @@ (nginxloadbalancer)= -# Deploying with Nginx Loadbalancer +# Using Nginx This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md index 475a3e5fa9..cec503ef2f 100644 --- a/docs/source/design/arch_overview.md +++ b/docs/source/design/arch_overview.md @@ -53,11 +53,11 @@ for output in outputs: ``` More API details can be found in the {doc}`Offline Inference -` section of the API docs. +` section of the API docs. The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>. 
-### OpenAI-compatible API server +### OpenAI-Compatible API Server The second primary interface to vLLM is via its OpenAI-compatible API server. This server can be started using the `vllm serve` command. @@ -77,8 +77,7 @@ python -m vllm.entrypoints.openai.api_server --model That code can be found in . -More details on the API server can be found in the {doc}`OpenAI Compatible -Server ` document. +More details on the API server can be found in the [OpenAI-Compatible Server](#openai-compatible-server) document. ## LLM Engine diff --git a/docs/source/automatic_prefix_caching/details.md b/docs/source/design/automatic_prefix_caching.md similarity index 86% rename from docs/source/automatic_prefix_caching/details.md rename to docs/source/design/automatic_prefix_caching.md index 17f806217a..3928e0c165 100644 --- a/docs/source/automatic_prefix_caching/details.md +++ b/docs/source/design/automatic_prefix_caching.md @@ -1,10 +1,12 @@ -# Implementation +(design-automatic-prefix-caching)= -The core idea of PagedAttention is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. +# Automatic Prefix Caching + +The core idea of [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block. 
-``` +```text Block 1 Block 2 Block 3 [A gentle breeze stirred] [the leaves as children] [laughed in the distance] Block 1: |<--- block tokens ---->| @@ -12,19 +14,16 @@ Block 2: |<------- prefix ------>| |<--- block tokens --->| Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->| ``` - In the example above, the KV cache in the first block can be uniquely identified with the tokens “A gentle breeze stirred”. The third block can be uniquely identified with the tokens in the block “laughed in the distance”, along with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the following one-to-one mapping: -``` +```text hash(prefix tokens + block tokens) <--> KV Block ``` With this mapping, we can add another indirection in vLLM’s KV cache management. Previously, each sequence in vLLM maintained a mapping from their logical KV blocks to physical blocks. To achieve automatic caching of KV blocks, we map the logical KV blocks to their hash value and maintain a global hash table of all the physical blocks. In this way, all the KV blocks sharing the same hash value (e.g., shared prefix blocks across two requests) can be mapped to the same physical block and share the memory space. - This design achieves automatic prefix caching without the need of maintaining a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed by itself, which enables us to manages the KV cache as ordinary caches in operating system. - ## Generalized Caching Policy Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. 
However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full. @@ -39,5 +38,5 @@ Note that this eviction policy effectively implements the exact policy as in [Ra However, the hash-based KV cache management gives us the flexibility to handle more complicated serving scenarios and implement more complicated eviction policies beyond the policy above: -- Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block to also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency. -- Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images. +* Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block to also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency. +* Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images. 
diff --git a/docs/source/design/input_processing/input_processing_pipeline.md b/docs/source/design/input_processing/input_processing_pipeline.md deleted file mode 100644 index bb16920e3d..0000000000 --- a/docs/source/design/input_processing/input_processing_pipeline.md +++ /dev/null @@ -1,19 +0,0 @@ -(input-processing-pipeline)= - -# Input Processing Pipeline - -1. Input data is passed to {class}`~vllm.LLMEngine` (or {class}`~vllm.AsyncLLMEngine`). - -2. Tokenize the data if necessary. - -3. Process the inputs using {meth}`INPUT_REGISTRY.process_input `. - - - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings. - -4. Send the processed inputs to {class}`~vllm.executor.executor_base.ExecutorBase`. - -5. Distribute the inputs via {class}`~vllm.worker.worker_base.WorkerBase` to {class}`~vllm.worker.model_runner_base.ModelRunnerBase`. - -6. If the data contains multi-modal data, convert it into keyword arguments using {meth}`MULTIMODAL_REGISTRY.map_input `. - - - For example, convert a {class}`PIL.Image.Image` input to its pixel values for a vision model. diff --git a/docs/source/design/input_processing/model_inputs_index.md b/docs/source/design/input_processing/model_inputs_index.md deleted file mode 100644 index cb415366e5..0000000000 --- a/docs/source/design/input_processing/model_inputs_index.md +++ /dev/null @@ -1,43 +0,0 @@ -(input-processing)= - -# Input Processing - -```{eval-rst} -.. currentmodule:: vllm.inputs -``` - -Each model can override parts of vLLM's [input processing pipeline](#input-processing-pipeline) via -{data}`~vllm.inputs.INPUT_REGISTRY` and {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`. - -Currently, this mechanism is only utilized in [multi-modal](#multi-modality) models for preprocessing multi-modal input -data in addition to input prompt, but it can be extended to text-only language models when needed. 
- ## Guides - ```{toctree} -:maxdepth: 1 - -input_processing_pipeline -``` - -## Module Contents - -### LLM Engine Inputs - -```{eval-rst} -.. autoclass:: vllm.inputs.DecoderOnlyInputs - :members: - :show-inheritance: -``` - -### Registry - -```{eval-rst} -.. autodata:: vllm.inputs.INPUT_REGISTRY -``` - -```{eval-rst} -.. automodule:: vllm.inputs.registry - :members: - :show-inheritance: -``` diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md index c21985b36e..f896f903c7 100644 --- a/docs/source/design/kernel/paged_attention.md +++ b/docs/source/design/kernel/paged_attention.md @@ -1,3 +1,5 @@ +(design-paged-attention)= + # vLLM Paged Attention - Currently, vLLM utilizes its own implementation of a multi-head query diff --git a/docs/source/design/mm_processing.md b/docs/source/design/mm_processing.md new file mode 100644 index 0000000000..a0d01205e6 --- /dev/null +++ b/docs/source/design/mm_processing.md @@ -0,0 +1,64 @@ +(mm-processing)= + +# Multi-Modal Data Processing + +To enable various optimizations in vLLM such as [chunked prefill](#chunked-prefill) and [prefix caching](#automatic-prefix-caching), we use {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor. + +Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`: + +## Prompt Replacement Detection + +One of the main responsibilities of HF processor is to replace input placeholder tokens (e.g. `<image>` for a single image) with feature placeholder tokens (e.g. `<image><image>...<image>`, the number of which is equal to the feature size). The information about which tokens have been replaced is key to finding the correspondence between placeholder feature tokens and multi-modal inputs. 
+ +In vLLM, this information is specified using {class}`~vllm.multimodal.processing.PromptReplacement` in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements`. Given this specification, we can automatically detect whether HF has replaced the input placeholder tokens by checking whether the feature placeholder tokens exist in the prompt. + +## Tokenized Prompt Inputs + +To enable tokenization in a separate process, we support passing input token IDs alongside multi-modal data. + +### The problem + +Consider that HF processors follow these main steps: + +1. Tokenize the text +2. Process multi-modal inputs +3. Perform prompt replacement + +And we require that: + +- For text + multi-modal inputs, apply all steps 1--3. +- For tokenized + multi-modal inputs, apply only steps 2--3. + +How can we achieve this without rewriting HF processors? We can try to call the HF processor several times on different inputs: + +- For text + multi-modal inputs, simply call the HF processor directly. +- For tokenized + multi-modal inputs, call the processor only on the multi-modal inputs. + +While HF processors support text + multi-modal inputs natively, this is not so for tokenized + multi-modal inputs: an error is thrown if the number of input placeholder tokens does not correspond to the number of multi-modal inputs. + +Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other. + +(mm-dummy-text)= + +### Dummy text + +We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data. 
+ +(mm-automatic-prompt-replacement)= + +### Automatic prompt replacement + +We address the second issue by implementing model-agnostic code in +{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_prompt_replacements` to automatically replace input placeholder tokens with feature placeholder tokens based on the specification outputted by {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements`. + +### Summary + +With the help of dummy text and automatic prompt replacement, our multi-modal processor can finally accept both text and token prompts with multi-modal data. The detailed logic is shown in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_main`. + +## Processor Output Caching + +Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again. + +When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache. + +Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#mm-dummy-text) to avoid HF errors. Since this skips HF's prompt replacement code, we apply [automatic prompt replacement](#mm-automatic-prompt-replacement) afterwards to keep the output tokens and multi-modal data consistent with each other. 
diff --git a/docs/source/design/multimodal/adding_multimodal_plugin.md b/docs/source/design/multimodal/adding_multimodal_plugin.md deleted file mode 100644 index bcccd28487..0000000000 --- a/docs/source/design/multimodal/adding_multimodal_plugin.md +++ /dev/null @@ -1,16 +0,0 @@ -(adding-multimodal-plugin)= - -# Adding a Multimodal Plugin - -This document teaches you how to add a new modality to vLLM. - -Each modality in vLLM is represented by a {class}`~vllm.multimodal.MultiModalPlugin` and registered to {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`. -For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to {meth}`~vllm.multimodal.MultiModalRegistry.register_plugin`. - -The remainder of this document details how to define custom {class}`~vllm.multimodal.MultiModalPlugin` s. - -```{note} -This article is a work in progress. -``` - -% TODO: Add more instructions on how to add new plugins once embeddings is in. diff --git a/docs/source/design/multimodal/multimodal_index.md b/docs/source/design/multimodal/multimodal_index.md deleted file mode 100644 index e4f2171e84..0000000000 --- a/docs/source/design/multimodal/multimodal_index.md +++ /dev/null @@ -1,83 +0,0 @@ -(multi-modality)= - -# Multi-Modality - -```{eval-rst} -.. currentmodule:: vllm.multimodal -``` - -vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. - -Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) -via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. - -Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities -by following [this guide](#adding-multimodal-plugin). - -Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs). 
- -## Guides - -```{toctree} -:maxdepth: 1 - -adding_multimodal_plugin -``` - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.multimodal -``` - -### Registry - -```{eval-rst} -.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.MultiModalRegistry - :members: - :show-inheritance: -``` - -### Base Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.base - :members: - :show-inheritance: -``` - -### Input Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.inputs - :members: - :show-inheritance: -``` - -### Audio Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.audio - :members: - :show-inheritance: -``` - -### Image Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.image - :members: - :show-inheritance: -``` - -### Video Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.video - :members: - :show-inheritance: -``` diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md index 34564413b3..c2cdb75ea0 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/source/design/multiprocessing.md @@ -2,7 +2,7 @@ ## Debugging -Please see the [Debugging Tips](#debugging-python-multiprocessing) +Please see the [Troubleshooting](#troubleshooting-python-multiprocessing) page for information on known issues and how to solve them. ## Introduction @@ -21,7 +21,7 @@ This document describes how vLLM deals with these challenges. ## Multiprocessing Methods -[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html.md#contexts-and-start-methods) include: +[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include: - `spawn` - spawn a new Python process. This will be the default as of Python 3.14. 
diff --git a/docs/source/dev/pooling_params.md b/docs/source/dev/pooling_params.md deleted file mode 100644 index 74b2c57443..0000000000 --- a/docs/source/dev/pooling_params.md +++ /dev/null @@ -1,6 +0,0 @@ -# Pooling Parameters - -```{eval-rst} -.. autoclass:: vllm.PoolingParams - :members: -``` diff --git a/docs/source/dev/sampling_params.md b/docs/source/dev/sampling_params.md deleted file mode 100644 index bdc36af515..0000000000 --- a/docs/source/dev/sampling_params.md +++ /dev/null @@ -1,6 +0,0 @@ -# Sampling Parameters - -```{eval-rst} -.. autoclass:: vllm.SamplingParams - :members: -``` diff --git a/docs/source/automatic_prefix_caching/apc.md b/docs/source/features/automatic_prefix_caching.md similarity index 97% rename from docs/source/automatic_prefix_caching/apc.md rename to docs/source/features/automatic_prefix_caching.md index c0c141c5fb..3d70cbb29c 100644 --- a/docs/source/automatic_prefix_caching/apc.md +++ b/docs/source/features/automatic_prefix_caching.md @@ -1,13 +1,13 @@ -(apc)= +(automatic-prefix-caching)= -# Introduction +# Automatic Prefix Caching -## What is Automatic Prefix Caching +## Introduction Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. ```{note} -Technical details on how vLLM implements APC are in the next page. +Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching). 
``` ## Enabling APC in vLLM diff --git a/docs/source/usage/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md similarity index 96% rename from docs/source/usage/compatibility_matrix.md rename to docs/source/features/compatibility_matrix.md index 3cefa12ea8..47ab616b30 100644 --- a/docs/source/usage/compatibility_matrix.md +++ b/docs/source/features/compatibility_matrix.md @@ -32,7 +32,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar * - Feature - [CP](#chunked-prefill) - - [APC](#apc) + - [APC](#automatic-prefix-caching) - [LoRA](#lora-adapter) - prmpt adptr - [SD](#spec_decode) @@ -64,7 +64,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - [APC](#apc) + * - [APC](#automatic-prefix-caching) - ✅ - - @@ -307,7 +307,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ? - ? - - ✅ + - [✗](gh-issue:11484) - ✅ - ✗ - ? @@ -322,7 +322,9 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar ``` -### Feature x Hardware +(feature-x-hardware)= + +## Feature x Hardware ```{list-table} :header-rows: 1 @@ -345,7 +347,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - * - [APC](#apc) + * - [APC](#automatic-prefix-caching) - [✗](gh-issue:3687) - ✅ - ✅ @@ -359,7 +361,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - - [✗](gh-pr:4830) + - ✅ - ✅ * - prmpt adptr - ✅ diff --git a/docs/source/usage/disagg_prefill.md b/docs/source/features/disagg_prefill.md similarity index 90% rename from docs/source/usage/disagg_prefill.md rename to docs/source/features/disagg_prefill.md index a61c00fad1..efa2efc661 100644 --- a/docs/source/usage/disagg_prefill.md +++ b/docs/source/features/disagg_prefill.md @@ -1,8 +1,12 @@ (disagg-prefill)= -# Disaggregated prefilling (experimental) +# Disaggregated Prefilling (experimental) -This page 
introduces you the disaggregated prefilling feature in vLLM. This feature is experimental and subject to change. +This page introduces you to the disaggregated prefilling feature in vLLM. + +```{note} +This feature is experimental and subject to change. +``` ## Why disaggregated prefilling? @@ -17,7 +21,7 @@ Disaggregated prefill DOES NOT improve throughput. ## Usage example -Please refer to `examples/disaggregated_prefill.sh` for the example usage of disaggregated prefilling. +Please refer to `examples/online_serving/disaggregated_prefill.sh` for the example usage of disaggregated prefilling. ## Benchmarks @@ -41,13 +45,13 @@ Key abstractions for disaggregated prefilling: Here is a figure illustrating how the above 3 abstractions are organized: -```{image} /assets/usage/disagg_prefill/abstraction.jpg +```{image} /assets/features/disagg_prefill/abstraction.jpg :alt: Disaggregated prefilling abstractions ``` The workflow of disaggregated prefilling is as follows: -```{image} /assets/usage/disagg_prefill/overview.jpg +```{image} /assets/features/disagg_prefill/overview.jpg :alt: Disaggregated prefilling workflow ``` diff --git a/docs/source/usage/lora.md b/docs/source/features/lora.md similarity index 97% rename from docs/source/usage/lora.md rename to docs/source/features/lora.md index cf06916d70..b00d05147b 100644 --- a/docs/source/usage/lora.md +++ b/docs/source/features/lora.md @@ -47,7 +47,7 @@ outputs = llm.generate( ) ``` -Check out for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. +Check out for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. 
## Serving LoRA Adapters diff --git a/docs/source/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md similarity index 95% rename from docs/source/quantization/auto_awq.md rename to docs/source/features/quantization/auto_awq.md index c02fbf0605..404505eb38 100644 --- a/docs/source/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -15,7 +15,7 @@ The main benefits are lower latency and memory usage. You can quantize your own models by installing AutoAWQ or picking one of the [400+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq). ```console -$ pip install autoawq +pip install autoawq ``` After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: @@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"') To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: ```console -$ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq +python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq ``` AWQ models are also supported directly through the LLM entrypoint: diff --git a/docs/source/quantization/bnb.md b/docs/source/features/quantization/bnb.md similarity index 86% rename from docs/source/quantization/bnb.md rename to docs/source/features/quantization/bnb.md index 8240eca1c7..7525e8e786 100644 --- a/docs/source/quantization/bnb.md +++ b/docs/source/features/quantization/bnb.md @@ -9,7 +9,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal Below are the steps to utilize BitsAndBytes with vLLM. 
```console -$ pip install bitsandbytes>=0.45.0 +pip install bitsandbytes>=0.45.0 ``` vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. @@ -17,7 +17,7 @@ vLLM reads the model's config file and supports both in-flight quantization and You can find bitsandbytes quantized models on . And usually, these repositories have a config.json file that includes a quantization_config section. -## Read quantized checkpoint. +## Read quantized checkpoint ```python from vllm import LLM @@ -37,3 +37,11 @@ model_id = "huggyllama/llama-7b" llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ quantization="bitsandbytes", load_format="bitsandbytes") ``` + +## OpenAI Compatible Server + +Append the following to your 4bit model arguments: + +```console +--quantization bitsandbytes --load-format bitsandbytes +``` diff --git a/docs/source/quantization/fp8.md b/docs/source/features/quantization/fp8.md similarity index 95% rename from docs/source/quantization/fp8.md rename to docs/source/features/quantization/fp8.md index b2eda74fd1..1398e8a324 100644 --- a/docs/source/quantization/fp8.md +++ b/docs/source/features/quantization/fp8.md @@ -41,7 +41,7 @@ Currently, we load the model at original precision before quantizing down to 8-b To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: ```console -$ pip install llmcompressor +pip install llmcompressor ``` ## Quantization Process @@ -54,16 +54,15 @@ The quantization process involves three main steps: ### 1. 
Loading the Model -Use `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM`, for saving and loading quantized models: +Load your model and tokenizer using the standard `transformers` AutoModel classes: ```python -from llmcompressor.transformers import SparseAutoModelForCausalLM -from transformers import AutoTokenizer +from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" - -model = SparseAutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto", +) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` @@ -98,7 +97,7 @@ tokenizer.save_pretrained(SAVE_DIR) Install `vllm` and `lm-evaluation-harness`: ```console -$ pip install vllm lm-eval==0.4.4 +pip install vllm lm-eval==0.4.4 ``` Load and run the model in `vllm`: diff --git a/docs/source/quantization/gguf.md b/docs/source/features/quantization/gguf.md similarity index 80% rename from docs/source/quantization/gguf.md rename to docs/source/features/quantization/gguf.md index eebf11dfc1..640997cf4b 100644 --- a/docs/source/quantization/gguf.md +++ b/docs/source/features/quantization/gguf.md @@ -13,16 +13,16 @@ Currently, vllm only supports loading single-file GGUF models. If you have a mul To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command: ```console -$ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf -$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. 
-$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 +wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf +# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. +vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 ``` You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs: ```console -$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. -$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 +# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. +vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 ``` ```{warning} diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md new file mode 100644 index 0000000000..56ccdb5f00 --- /dev/null +++ b/docs/source/features/quantization/index.md @@ -0,0 +1,18 @@ +(quantization-index)= + +# Quantization + +Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices. 
+ +```{toctree} +:caption: Contents +:maxdepth: 1 + +supported_hardware +auto_awq +bnb +gguf +int8 +fp8 +quantized_kvcache +``` diff --git a/docs/source/quantization/int8.md b/docs/source/features/quantization/int8.md similarity index 93% rename from docs/source/quantization/int8.md rename to docs/source/features/quantization/int8.md index 1ac50ba987..592a60d398 100644 --- a/docs/source/quantization/int8.md +++ b/docs/source/features/quantization/int8.md @@ -16,7 +16,7 @@ INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turi To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: ```console -$ pip install llmcompressor +pip install llmcompressor ``` ## Quantization Process @@ -30,14 +30,13 @@ The quantization process involves four main steps: ### 1. Loading the Model -Use `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM`, for saving and loading quantized models: +Load your model and tokenizer using the standard `transformers` AutoModel classes: ```python -from llmcompressor.transformers import SparseAutoModelForCausalLM -from transformers import AutoTokenizer +from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = SparseAutoModelForCausalLM.from_pretrained( +model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto", ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/docs/source/features/quantization/quantized_kvcache.md b/docs/source/features/quantization/quantized_kvcache.md new file mode 100644 index 0000000000..9f36c2949e --- /dev/null +++ b/docs/source/features/quantization/quantized_kvcache.md @@ -0,0 +1,147 @@ +(quantized-kvcache)= + +# Quantized KV Cache + +## FP8 KV Cache + +Quantizing the KV cache to FP8 reduces its memory footprint. 
This increases the number of tokens that can be stored in the cache, improving throughput. + +### FP8 Formats + +[OCP (Open Compute Project)](https://www.opencompute.org) specifies two common 8-bit floating point data formats: + +- E5M2 (5 exponent bits and 2 mantissa bits) +- E4M3FN (4 exponent bits and 3 mantissa bits, often shortened as E4M3) + +The E4M3 format offers higher precision compared to E5M2. However, due to its small dynamic range (±240.0), E4M3 typically requires a higher-precision (FP32) scaling factor alongside each quantized tensor. + +### Current Limitations + +For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling factors of a finer granularity (e.g. per-channel). + +### Performance Impact + +The current FP8 KV cache implementation primarily benefits throughput by allowing approximately double the amount of space for KV cache allocation. This enables either: + +- Processing longer context lengths for individual requests, or +- Handling more concurrent request batches + +However, there are currently no latency improvements as the implementation does not yet include fused dequantization and attention operations. Future releases will support quantized attention with hardware acceleration, which should provide additional performance benefits. While the most recent silicon offerings (e.g. AMD MI300, NVIDIA Hopper or later) support native hardware conversion between FP8 and other formats (fp32, fp16, bf16), this benefit is not yet fully realized. + +Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy, making it a practical choice for throughput optimization. 
+ +## Usage Example + +Here is an example of how to enable FP8 quantization: + +```python +# To calculate kv cache scales on the fly enable the calculate_kv_scales +# parameter + +from vllm import LLM, SamplingParams + +sampling_params = SamplingParams(temperature=0.7, top_p=0.8) +llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", + kv_cache_dtype="fp8", + calculate_kv_scales=True) +prompt = "London is the capital of" +out = llm.generate(prompt, sampling_params)[0].outputs[0].text +print(out) +``` + +The `kv_cache_dtype` argument specifies the data type for KV cache storage: +- `"auto"`: Uses the model's default "unquantized" data type +- `"fp8"` or `"fp8_e4m3"`: Supported on CUDA 11.8+ and ROCm (AMD GPU) +- `"fp8_e5m2"`: Supported on CUDA 11.8+ + +## Calibrated Scales for Better Accuracy + +For optimal model quality when using FP8 KV Cache, we recommend using calibrated scales tuned to representative inference data. [LLM Compressor](https://github.com/vllm-project/llm-compressor/) is the recommended tool for this process. 
+ +### Installation + +First, install the required dependencies: + +```console +pip install llmcompressor +``` + +### Example Usage + +Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern): + +```python +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor.transformers import oneshot + +# Select model and load it +MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" + +# Configure calibration parameters +NUM_CALIBRATION_SAMPLES = 512 # 512 samples is a good starting point +MAX_SEQUENCE_LENGTH = 2048 + +# Load and preprocess dataset +ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) +ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + +def process_and_tokenize(example): + text = tokenizer.apply_chat_template(example["messages"], tokenize=False) + return tokenizer( + text, + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + +ds = ds.map(process_and_tokenize, remove_columns=ds.column_names) + +# Configure quantization settings +recipe = """ +quant_stage: + quant_modifiers: + QuantizationModifier: + kv_cache_scheme: + num_bits: 8 + type: float + strategy: tensor + dynamic: false + symmetric: true +""" + +# Apply quantization +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) + +# Save quantized model +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) +``` + +The above script will create a folder in your current directory containing your quantized model (e.g., 
`Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales. + +When running the model you must specify `kv_cache_dtype="fp8"` in order to enable the kv cache quantization and use the scales. + +```python +from vllm import LLM, SamplingParams + +sampling_params = SamplingParams(temperature=0.7, top_p=0.8) +llm = LLM(model="Llama-3.1-8B-Instruct-FP8-KV", kv_cache_dtype="fp8") +prompt = "London is the capital of" +out = llm.generate(prompt, sampling_params)[0].outputs[0].text +print(out) +``` diff --git a/docs/source/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md similarity index 86% rename from docs/source/quantization/supported_hardware.md rename to docs/source/features/quantization/supported_hardware.md index 7330c2f8aa..f5c0a95ea4 100644 --- a/docs/source/quantization/supported_hardware.md +++ b/docs/source/features/quantization/supported_hardware.md @@ -1,6 +1,6 @@ -(supported-hardware-for-quantization)= +(quantization-supported-hardware)= -# Supported Hardware for Quantization Kernels +# Supported Hardware The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: @@ -113,19 +113,19 @@ The table below shows the compatibility of various quantization implementations - ✅︎ - ✅︎ - ✅︎ - - ✗ + - ✅︎ - ✗ - ✗ - ✗ - ✗ ``` -## Notes: - - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. - "✅︎" indicates that the quantization method is supported on the specified hardware. - "✗" indicates that the quantization method is not supported on the specified hardware. -Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. +```{note} +This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. 
For the most up-to-date information on hardware support and quantization methods, please refer to or consult with the vLLM development team. +``` diff --git a/docs/source/usage/spec_decode.md b/docs/source/features/spec_decode.md similarity index 66% rename from docs/source/usage/spec_decode.md rename to docs/source/features/spec_decode.md index 8302da81b6..ab7b2f302b 100644 --- a/docs/source/usage/spec_decode.md +++ b/docs/source/features/spec_decode.md @@ -1,6 +1,6 @@ (spec-decode)= -# Speculative decoding +# Speculative Decoding ```{warning} Please note that speculative decoding in vLLM is not yet optimized and does @@ -159,6 +159,70 @@ A variety of speculative models of this type are available on HF hub: - [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator) - [granite-20b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator) +## Speculating using EAGLE based draft models + +The following code configures vLLM to use speculative decoding where proposals are generated by +an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. + +```python +from vllm import LLM, SamplingParams + +prompts = [ + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +llm = LLM( + model="meta-llama/Meta-Llama-3-8B-Instruct", + tensor_parallel_size=4, + speculative_model="path/to/modified/eagle/model", + speculative_draft_tensor_parallel_size=1, +) + +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +``` + +A few important things to consider when using the EAGLE based draft models: + +1. 
The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) cannot be + used directly with vLLM due to differences in the expected layer names and model definition. + To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) + to convert them. Note that this script does not modify the model's weights. + + In the above example, use the script to first convert + the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model + and then use the converted checkpoint as the draft model in vLLM. + +2. The EAGLE based draft models need to be run without tensor parallelism + (i.e. speculative_draft_tensor_parallel_size is set to 1), although + it is possible to run the main model using tensor parallelism (see example above). + +3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is + reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under + investigation and tracked here: [https://github.com/vllm-project/vllm/issues/9565](https://github.com/vllm-project/vllm/issues/9565). 
+ +A variety of EAGLE draft models are available on the Hugging Face hub: + +| Base Model | EAGLE on Hugging Face | # EAGLE Parameters | +|---------------------------------------------------------------------|-------------------------------------------|--------------------| +| Vicuna-7B-v1.3 | yuhuili/EAGLE-Vicuna-7B-v1.3 | 0.24B | +| Vicuna-13B-v1.3 | yuhuili/EAGLE-Vicuna-13B-v1.3 | 0.37B | +| Vicuna-33B-v1.3 | yuhuili/EAGLE-Vicuna-33B-v1.3 | 0.56B | +| LLaMA2-Chat 7B | yuhuili/EAGLE-llama2-chat-7B | 0.24B | +| LLaMA2-Chat 13B | yuhuili/EAGLE-llama2-chat-13B | 0.37B | +| LLaMA2-Chat 70B | yuhuili/EAGLE-llama2-chat-70B | 0.99B | +| Mixtral-8x7B-Instruct-v0.1 | yuhuili/EAGLE-mixtral-instruct-8x7B | 0.28B | +| LLaMA3-Instruct 8B | yuhuili/EAGLE-LLaMA3-Instruct-8B | 0.25B | +| LLaMA3-Instruct 70B | yuhuili/EAGLE-LLaMA3-Instruct-70B | 0.99B | +| Qwen2-7B-Instruct | yuhuili/EAGLE-Qwen2-7B-Instruct | 0.26B | +| Qwen2-72B-Instruct | yuhuili/EAGLE-Qwen2-72B-Instruct | 1.05B | + ## Lossless guarantees of Speculative Decoding In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of @@ -182,9 +246,7 @@ speculative decoding, breaking down the guarantees into three key areas: 3. **vLLM Logprob Stability** \- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the same request across runs. For more details, see the FAQ section - titled *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs `. - -**Conclusion** + titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). 
While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding can occur due to following factors: @@ -193,9 +255,7 @@ can occur due to following factors: - **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially due to non-deterministic behavior in batched operations or numerical instability. -**Mitigation Strategies** - -For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs `. +For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). ## Resources for vLLM contributors diff --git a/docs/source/usage/structured_outputs.md b/docs/source/features/structured_outputs.md similarity index 97% rename from docs/source/usage/structured_outputs.md rename to docs/source/features/structured_outputs.md index 7292012e36..1d77c7339a 100644 --- a/docs/source/usage/structured_outputs.md +++ b/docs/source/features/structured_outputs.md @@ -5,7 +5,7 @@ vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines), [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer), or [xgrammar](https://github.com/mlc-ai/xgrammar) as backends for the guided decoding. This document shows you some examples of the different options that are available to generate structured outputs. -## Online Inference (OpenAI API) +## Online Serving (OpenAI API) You can generate structured outputs using the OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API. @@ -18,7 +18,7 @@ The following parameters are supported, which must be added as extra parameters: - `guided_whitespace_pattern`: used to override the default whitespace pattern for guided json decoding. 
- `guided_decoding_backend`: used to select the guided decoding backend to use. -You can see the complete list of supported parameters on the [OpenAI Compatible Server](../serving/openai_compatible_server.md) page. +You can see the complete list of supported parameters on the [OpenAI-Compatible Server](#openai-compatible-server) page. Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one: @@ -131,7 +131,7 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -Full example: +Full example: ## Experimental Automatic Parsing (OpenAI API) @@ -239,7 +239,7 @@ The main available options inside `GuidedDecodingParams` are: - `backend` - `whitespace_pattern` -These parameters can be used in the same way as the parameters from the Online Inference examples above. +These parameters can be used in the same way as the parameters from the Online Serving examples above. One example for the usage of the `choices` parameter is shown below: ```python @@ -257,4 +257,4 @@ outputs = llm.generate( print(outputs[0].outputs[0].text) ``` -Full example: +Full example: diff --git a/docs/source/usage/tool_calling.md b/docs/source/features/tool_calling.md similarity index 97% rename from docs/source/usage/tool_calling.md rename to docs/source/features/tool_calling.md index 34b26647a9..027ddb6d5e 100644 --- a/docs/source/usage/tool_calling.md +++ b/docs/source/features/tool_calling.md @@ -10,7 +10,7 @@ Start the server with tool calling enabled. 
This example uses Meta's Llama 3.1 8 vllm serve meta-llama/Llama-3.1-8B-Instruct \ --enable-auto-tool-choice \ --tool-call-parser llama3_json \ - --chat-template examples/tool_chat_template_llama3_json.jinja + --chat-template examples/tool_chat_template_llama3.1_json.jinja ``` Next, make a request to the model that should result in it using the available tools: @@ -55,21 +55,24 @@ print(f"Result: {get_weather(**json.loads(tool_call.arguments))}") ``` Example output: -``` + +```text Function called: get_weather Arguments: {"location": "San Francisco, CA", "unit": "fahrenheit"} Result: Getting the weather for San Francisco, CA in fahrenheit... ``` This example demonstrates: -- Setting up the server with tool calling enabled -- Defining an actual function to handle tool calls -- Making a request with `tool_choice="auto"` -- Handling the structured response and executing the corresponding function + +* Setting up the server with tool calling enabled +* Defining an actual function to handle tool calls +* Making a request with `tool_choice="auto"` +* Handling the structured response and executing the corresponding function You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests. Remember that it's the callers responsibility to: + 1. Define appropriate tools in the request 2. Include relevant context in the chat messages 3. Handle the tool calls in your application logic @@ -77,20 +80,21 @@ Remember that it's the callers responsibility to: For more advanced usage, including parallel tool calls and different model-specific parsers, see the sections below. 
## Named Function Calling + vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a high-quality one. -vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. +vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend. To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. - ## Automatic Function Calling To enable this feature, you should set the following flags: + * `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it deems appropriate. * `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers @@ -104,28 +108,28 @@ from HuggingFace; and you can find an example of this in a `tokenizer_config.jso If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! - ### Hermes Models (`hermes`) All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported. + * `NousResearch/Hermes-2-Pro-*` * `NousResearch/Hermes-2-Theta-*` * `NousResearch/Hermes-3-*` - _Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge step in their creation_. 
Flags: `--tool-call-parser hermes` - ### Mistral Models (`mistral`) Supported models: + * `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) * Additional mistral function-calling models are compatible as well. Known issues: + 1. Mistral 7B struggles to generate parallel tool calls correctly. 2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is much shorter than what vLLM generates. Since an exception is thrown when this condition @@ -136,13 +140,12 @@ it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated * `examples/tool_chat_template_mistral_parallel.jinja` - this is a "better" version that adds a tool-use system prompt when tools are provided, that results in much better reliability when working with parallel tool calling. - Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` - ### Llama Models (`llama3_json`) Supported models: + * `meta-llama/Meta-Llama-3.1-8B-Instruct` * `meta-llama/Meta-Llama-3.1-70B-Instruct` * `meta-llama/Meta-Llama-3.1-405B-Instruct` @@ -152,6 +155,7 @@ The tool calling that is supported is the [JSON based tool calling](https://llam Other tool calling formats like the built in python tool calling or custom tool calling are not supported. Known issues: + 1. Parallel tool calls are not supported. 2. The model can generate parameters with a wrong format, such as generating an array serialized as string instead of an array. 
@@ -164,6 +168,7 @@ Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool #### IBM Granite Supported models: + * `ibm-granite/granite-3.0-8b-instruct` Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja` @@ -182,42 +187,45 @@ Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/t `examples/tool_chat_template_granite_20b_fc.jinja`: this is a modified chat template from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. - ### InternLM Models (`internlm`) Supported models: + * `internlm/internlm2_5-7b-chat` (confirmed) * Additional internlm2.5 function-calling models are compatible as well Known issues: + * Although this implementation also supports InternLM2, the tool call results are not stable when testing with the `internlm/internlm2-chat-7b` model. Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja` - ### Jamba Models (`jamba`) + AI21's Jamba-1.5 models are supported. + * `ai21labs/AI21-Jamba-1.5-Mini` * `ai21labs/AI21-Jamba-1.5-Large` - Flags: `--tool-call-parser jamba` - ### Models with Pythonic Tool Calls (`pythonic`) A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models. 
As a concrete example, these models may look up the weather in San Francisco and Seattle by generating: + ```python [get_weather(city='San Francisco', metric='celsius'), get_weather(city='Seattle', metric='celsius')] ``` Limitations: + * The model must not generate both text and tool calls in the same generation. This may not be hard to change for a specific model, but the community currently lacks consensus on which tokens to emit when starting and ending tool calls. (In particular, the Llama 3.2 models emit no such tokens.) * Llama's smaller models struggle to use tools effectively. Example supported models: + * `meta-llama/Llama-3.2-1B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) * `meta-llama/Llama-3.2-3B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) * `Team-ACE/ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) @@ -231,7 +239,6 @@ Llama's smaller models frequently fail to emit tool calls in the correct format. --- - ## How to write a tool parser plugin A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py. @@ -284,7 +291,8 @@ class ExampleToolParser(ToolParser): ``` Then you can use this plugin in the command line like this. -``` + +```console --enable-auto-tool-choice \ --tool-parser-plugin --tool-call-parser example \ diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index aef32f7559..aaa13d0fb6 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -1,54 +1,239 @@ +import itertools import re +from dataclasses import dataclass, field from pathlib import Path +ROOT_DIR = Path(__file__).parent.parent.parent.resolve() +ROOT_DIR_RELATIVE = '../../../..' 
+EXAMPLE_DIR = ROOT_DIR / "examples" +EXAMPLE_DOC_DIR = ROOT_DIR / "docs/source/getting_started/examples" + def fix_case(text: str) -> str: - subs = [ - ("api", "API"), - ("llm", "LLM"), - ("vllm", "vLLM"), - ("openai", "OpenAI"), - ("multilora", "MultiLoRA"), - ] - for sub in subs: - text = re.sub(*sub, text, flags=re.IGNORECASE) + subs = { + "api": "API", + "Cli": "CLI", + "cpu": "CPU", + "llm": "LLM", + "tpu": "TPU", + "aqlm": "AQLM", + "gguf": "GGUF", + "lora": "LoRA", + "vllm": "vLLM", + "openai": "OpenAI", + "multilora": "MultiLoRA", + "mlpspeculator": "MLPSpeculator", + r"fp\d+": lambda x: x.group(0).upper(), # e.g. fp16, fp32 + r"int\d+": lambda x: x.group(0).upper(), # e.g. int8, int16 + } + for pattern, repl in subs.items(): + text = re.sub(rf'\b{pattern}\b', repl, text, flags=re.IGNORECASE) return text -def generate_title(filename: str) -> str: - # Turn filename into a title - title = filename.replace("_", " ").title() - # Handle acronyms and names - title = fix_case(title) - return f"# {title}" +@dataclass +class Index: + """ + Index class to generate a structured document index. + + Attributes: + path (Path): The path to save the index file to. + title (str): The title of the index. + description (str): A brief description of the index. + caption (str): An optional caption for the table of contents. + maxdepth (int): The maximum depth of the table of contents. Defaults to 1. + documents (list[str]): A list of document paths to include in the index. Defaults to an empty list. + + Methods: + generate() -> str: + Generates the index content as a string in the specified format. 
+ """ # noqa: E501 + path: Path + title: str + description: str + caption: str + maxdepth: int = 1 + documents: list[str] = field(default_factory=list) + + def generate(self) -> str: + content = f"# {self.title}\n\n{self.description}\n\n" + content += "```{toctree}\n" + content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n" + content += "\n".join(self.documents) + "\n```\n" + return content + + +@dataclass +class Example: + """ + Example class for generating documentation content from a given path. + + Attributes: + path (Path): The path to the main directory or file. + category (str): The category of the document. + main_file (Path): The main file in the directory. + other_files (list[Path]): List of other files in the directory. + title (str): The title of the document. + + Methods: + __post_init__(): Initializes the main_file, other_files, and title attributes. + determine_main_file() -> Path: Determines the main file in the given path. + determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file. + determine_title() -> str: Determines the title of the document. + generate() -> str: Generates the documentation content. + """ # noqa: E501 + path: Path + category: str = None + main_file: Path = field(init=False) + other_files: list[Path] = field(init=False) + title: str = field(init=False) + + def __post_init__(self): + self.main_file = self.determine_main_file() + self.other_files = self.determine_other_files() + self.title = self.determine_title() + + def determine_main_file(self) -> Path: + """ + Determines the main file in the given path. + If the path is a file, it returns the path itself. Otherwise, it searches + for Markdown files (*.md) in the directory and returns the first one found. + Returns: + Path: The main file path, either the original path if it's a file or the first + Markdown file found in the directory. + Raises: + IndexError: If no Markdown files are found in the directory. 
+ """ # noqa: E501 + return self.path if self.path.is_file() else list( + self.path.glob("*.md")).pop() + + def determine_other_files(self) -> list[Path]: + """ + Determine other files in the directory excluding the main file. + + This method checks if the given path is a file. If it is, it returns an empty list. + Otherwise, it recursively searches through the directory and returns a list of all + files that are not the main file. + + Returns: + list[Path]: A list of Path objects representing the other files in the directory. + """ # noqa: E501 + if self.path.is_file(): + return [] + is_other_file = lambda file: file.is_file() and file != self.main_file + return [file for file in self.path.rglob("*") if is_other_file(file)] + + def determine_title(self) -> str: + return fix_case(self.path.stem.replace("_", " ").title()) + + def generate(self) -> str: + # Convert the path to a relative path from __file__ + make_relative = lambda path: ROOT_DIR_RELATIVE / path.relative_to( + ROOT_DIR) + + content = f"Source .\n\n" + include = "include" if self.main_file.suffix == ".md" else \ + "literalinclude" + if include == "literalinclude": + content += f"# {self.title}\n\n" + content += f":::{{{include}}} {make_relative(self.main_file)}\n" + if include == "literalinclude": + content += f":language: {self.main_file.suffix[1:]}\n" + content += ":::\n\n" + + if not self.other_files: + return content + + content += "## Example materials\n\n" + for file in self.other_files: + include = "include" if file.suffix == ".md" else "literalinclude" + content += f":::{{admonition}} {file.relative_to(self.path)}\n" + content += ":class: dropdown\n\n" + content += f":::{{{include}}} {make_relative(file)}\n:::\n" + content += ":::\n\n" + + return content def generate_examples(): - root_dir = Path(__file__).parent.parent.parent.resolve() + # Create the EXAMPLE_DOC_DIR if it doesn't exist + if not EXAMPLE_DOC_DIR.exists(): + EXAMPLE_DOC_DIR.mkdir(parents=True) - # Source paths - script_dir = 
root_dir / "examples" - script_paths = sorted(script_dir.glob("*.py")) + # Create empty indices + examples_index = Index( + path=EXAMPLE_DOC_DIR / "examples_index.md", + title="Examples", + description= + "A collection of examples demonstrating usage of vLLM.\nAll documented examples are autogenerated using from examples found in .", # noqa: E501 + caption="Examples", + maxdepth=2) + # Category indices stored in reverse order because they are inserted into + # examples_index.documents at index 0 in order + category_indices = { + "other": + Index( + path=EXAMPLE_DOC_DIR / "examples_other_index.md", + title="Other", + description= + "Other examples that don't strongly fit into the online or offline serving categories.", # noqa: E501 + caption="Examples", + ), + "online_serving": + Index( + path=EXAMPLE_DOC_DIR / "examples_online_serving_index.md", + title="Online Serving", + description= + "Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.", # noqa: E501 + caption="Examples", + ), + "offline_inference": + Index( + path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md", + title="Offline Inference", + description= + "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.", # noqa: E501 + caption="Examples", + ), + } - # Destination paths - doc_dir = root_dir / "docs/source/getting_started/examples" - doc_paths = [doc_dir / f"{path.stem}.md" for path in script_paths] + examples = [] + glob_patterns = ["*.py", "*.md", "*.sh"] + # Find categorised examples + for category in category_indices: + category_dir = EXAMPLE_DIR / category + globs = [category_dir.glob(pattern) for pattern in glob_patterns] + for path in itertools.chain(*globs): + examples.append(Example(path, category)) + # Find examples in subdirectories + for path in category_dir.glob("*/*.md"): + examples.append(Example(path.parent, category)) + # 
Find uncategorised examples + globs = [EXAMPLE_DIR.glob(pattern) for pattern in glob_patterns] + for path in itertools.chain(*globs): + examples.append(Example(path)) + # Find examples in subdirectories + for path in EXAMPLE_DIR.glob("*/*.md"): + # Skip categorised examples + if path.parent.name in category_indices: + continue + examples.append(Example(path.parent)) - # Generate the example docs for each example script - for script_path, doc_path in zip(script_paths, doc_paths): - # Make script_path relative to doc_path and call it include_path - include_path = '../../../..' / script_path.relative_to(root_dir) - content = (f"{generate_title(doc_path.stem)}\n\n" - f"Source: .\n\n" - f"```{{literalinclude}} {include_path}\n" - ":language: python\n" - ":linenos:\n```") + # Generate the example documentation + for example in sorted(examples, key=lambda e: e.path.stem): + doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md" with open(doc_path, "w+") as f: - f.write(content) + f.write(example.generate()) + # Add the example to the appropriate index + index = category_indices.get(example.category, examples_index) + index.documents.append(example.path.stem) - # Generate the toctree for the example scripts - with open(doc_dir / "examples_index.template.md") as f: - examples_index = f.read() - with open(doc_dir / "examples_index.md", "w+") as f: - example_docs = "\n".join(path.stem + ".md" for path in script_paths) - f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs)) + # Generate the index files + for category_index in category_indices.values(): + if category_index.documents: + examples_index.documents.insert(0, category_index.path.name) + with open(category_index.path, "w+") as f: + f.write(category_index.generate()) + + with open(examples_index.path, "w+") as f: + f.write(examples_index.generate()) diff --git a/docs/source/getting_started/amd-installation.md b/docs/source/getting_started/amd-installation.md deleted file mode 100644 index 
6d01efbbf8..0000000000 --- a/docs/source/getting_started/amd-installation.md +++ /dev/null @@ -1,163 +0,0 @@ -(installation-rocm)= - -# Installation with ROCm - -vLLM supports AMD GPUs with ROCm 6.2. - -## Requirements - -- OS: Linux -- Python: 3.9 -- 3.12 -- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) -- ROCm 6.2 - -Installation options: - -1. [Build from source with docker](#build-from-source-docker-rocm) -2. [Build from source](#build-from-source-rocm) - -(build-from-source-docker-rocm)= - -## Option 1: Build from source with docker (recommended) - -You can build and install vLLM from source. - -First, build a docker image from and launch a docker container from the image. -It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: - -```console -{ - "features": { - "buildkit": true - } -} -``` - - uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. -It provides flexibility to customize the build of docker image using the following arguments: - -- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image. -- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target. -- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942` -- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). 
The default is `ae7928c` -- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1. - -Their values can be passed in when running `docker build` with `--build-arg` options. - -To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: - -```console -$ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . -``` - -To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below: - -```console -$ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . -``` - -To run the above docker image `vllm-rocm`, use the below command: - -```console -$ docker run -it \ - --network=host \ - --group-add=video \ - --ipc=host \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --device /dev/kfd \ - --device /dev/dri \ - -v :/app/model \ - vllm-rocm \ - bash -``` - -Where the `` is the location where the model is stored, for example, the weights for llama2 or llama3 models. - -(build-from-source-rocm)= - -## Option 2: Build from source - -0. Install prerequisites (skip if you are already in an environment/docker with the following installed): - -- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) -- [PyTorch](https://pytorch.org/) - -For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. - -Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/) - -1. 
Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) - -Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) - -```console -$ python3 -m pip install ninja cmake wheel pybind11 -$ pip uninstall -y triton -$ git clone https://github.com/OpenAI/triton.git -$ cd triton -$ git checkout e192dba -$ cd python -$ pip3 install . -$ cd ../.. -``` - -```{note} -- If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. -``` - -2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile) - -Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support) -Alternatively, wheels intended for vLLM use can be accessed under the releases. - -For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. - -```console -$ git clone https://github.com/ROCm/flash-attention.git -$ cd flash-attention -$ git checkout 3cea2fb -$ git submodule update --init -$ GPU_ARCHS="gfx90a" python3 setup.py install -$ cd .. -``` - -```{note} -- You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) -``` - -3. Build vLLM. 
For example, vLLM on ROCM 6.2 can be built with the following steps: - -```bash -$ pip install --upgrade pip - -# Install PyTorch -$ pip uninstall torch -y -$ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 - -# Build & install AMD SMI -$ pip install /opt/rocm/share/amd_smi - -# Install dependencies -$ pip install --upgrade numba scipy huggingface-hub[cli] -$ pip install "numpy<2" -$ pip install -r requirements-rocm.txt - -# Build vLLM for MI210/MI250/MI300. -$ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" -$ python3 setup.py develop -``` - -This may take 5-10 minutes. Currently, {code}`pip install .` does not work for ROCm installation. - -```{tip} -- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. -- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. -- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. -- The ROCm version of PyTorch, ideally, should match the ROCm driver version. -``` - -```{tip} -- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. - For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). 
-``` diff --git a/docs/source/getting_started/arm-installation.md b/docs/source/getting_started/arm-installation.md deleted file mode 100644 index 799b597b3a..0000000000 --- a/docs/source/getting_started/arm-installation.md +++ /dev/null @@ -1,46 +0,0 @@ -(installation-arm)= - -# Installation for ARM CPUs - -vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering: - -- CPU backend inference capabilities -- Relevant runtime environment variables -- Performance optimization tips - -ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. -Contents: - -1. [Requirements](#arm-backend-requirements) -2. [Quick Start with Dockerfile](#arm-backend-quick-start-dockerfile) -3. [Building from Source](#build-arm-backend-from-source) - -(arm-backend-requirements)= - -## Requirements - -- **Operating System**: Linux or macOS -- **Compiler**: `gcc/g++ >= 12.3.0` (optional, but recommended) -- **Instruction Set Architecture (ISA)**: NEON support is required - -(arm-backend-quick-start-dockerfile)= - -## Quick Start with Dockerfile - -You can quickly set up vLLM on ARM using Docker: - -```console -$ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g . -$ docker run -it \ - --rm \ - --network=host \ - --cpuset-cpus= \ - --cpuset-mems= \ - vllm-cpu-env -``` - -(build-arm-backend-from-source)= - -## Building from Source - -To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility. 
diff --git a/docs/source/getting_started/examples/examples_index.template.md b/docs/source/getting_started/examples/examples_index.template.md deleted file mode 100644 index de7a91c0ff..0000000000 --- a/docs/source/getting_started/examples/examples_index.template.md +++ /dev/null @@ -1,8 +0,0 @@ -# Examples - -```{toctree} -:maxdepth: 1 -:caption: Scripts - -%EXAMPLE_DOCS% -``` \ No newline at end of file diff --git a/docs/source/usage/faq.md b/docs/source/getting_started/faq.md similarity index 98% rename from docs/source/usage/faq.md rename to docs/source/getting_started/faq.md index fde2954f10..4751b325e6 100644 --- a/docs/source/usage/faq.md +++ b/docs/source/getting_started/faq.md @@ -30,7 +30,7 @@ changes in batch size, or batch expansion in speculative decoding. These batchin can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in different tokens being sampled. Once a different token is sampled, further divergence is likely. -**Mitigation Strategies** +## Mitigation Strategies - For improved stability and reduced variance, use `float32`. Note that this will require more memory. - If using `bfloat16`, switching to `float16` can also help. diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md similarity index 89% rename from docs/source/getting_started/gaudi-installation.md rename to docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md index 1f2ee62860..ae42dd0c0d 100644 --- a/docs/source/getting_started/gaudi-installation.md +++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md @@ -1,8 +1,13 @@ -# Installation with Intel® Gaudi® AI Accelerators +# Installation -This README provides instructions on running vLLM with Intel Gaudi devices. +This tab provides instructions on running vLLM with Intel Gaudi devices. 
-## Requirements and Installation +## Requirements + +- OS: Ubuntu 22.04 LTS +- Python: 3.10 +- Intel Gaudi accelerator +- Intel Gaudi software version 1.18.0 Please follow the instructions provided in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) @@ -10,42 +15,24 @@ to set up the execution environment. To achieve the best performance, please follow the methods outlined in the [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). -### Requirements +## Configure a new environment -- OS: Ubuntu 22.04 LTS -- Python: 3.10 -- Intel Gaudi accelerator -- Intel Gaudi software version 1.18.0 - -### Quick start using Dockerfile - -```console -$ docker build -f Dockerfile.hpu -t vllm-hpu-env . -$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env -``` - -```{tip} -If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. 
-``` - -### Build from source - -#### Environment verification +### Environment verification To verify that the Intel Gaudi software was correctly installed, run: ```console -$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible -$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed -$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed -$ pip list | grep neural # verify that neural_compressor is installed +hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible +apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed +pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed +pip list | grep neural # verify that neural_compressor is installed ``` Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) for more details. -#### Run Docker Image +### Run Docker Image It is highly recommended to use the latest Docker image from Intel Gaudi vault. Refer to the [Intel Gaudi @@ -55,33 +42,60 @@ for more details. 
Use the following commands to run a Docker image: ```console -$ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest -$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ``` -#### Build and Install vLLM +## Set up using Python + +### Pre-built wheels + +Currently, there are no pre-built Intel Gaudi wheels. + +### Build wheel from source To build and install vLLM from source, run: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ python setup.py develop +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -r requirements-hpu.txt +python setup.py develop ``` Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. 
To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: ```console -$ git clone https://github.com/HabanaAI/vllm-fork.git -$ cd vllm-fork -$ git checkout habana_main -$ python setup.py develop +git clone https://github.com/HabanaAI/vllm-fork.git +cd vllm-fork +git checkout habana_main +pip install -r requirements-hpu.txt +python setup.py develop ``` -## Supported Features +## Set up using Docker -- [Offline batched inference](#offline-batched-inference) -- Online inference via [OpenAI-Compatible Server](#openai-compatible-server) +### Pre-built images + +Currently, there are no pre-built Intel Gaudi images. + +### Build image from source + +```console +docker build -f Dockerfile.hpu -t vllm-hpu-env . +docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env +``` + +```{tip} +If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. 
+``` + +## Extra information + +## Supported features + +- [Offline inference](#offline-inference) +- Online serving via [OpenAI-Compatible Server](#openai-compatible-server) - HPU autodetection - no need to manually select device within vLLM - Paged KV cache with algorithms enabled for Intel Gaudi accelerators - Custom Intel Gaudi implementations of Paged Attention, KV cache ops, @@ -92,14 +106,14 @@ $ python setup.py develop for accelerating low-batch latency and throughput - Attention with Linear Biases (ALiBi) -## Unsupported Features +## Unsupported features - Beam search - LoRA adapters - Quantization - Prefill chunking (mixed-batch inferencing) -## Supported Configurations +## Supported configurations The following configurations have been validated to function with Gaudi2 devices. Configurations that are not listed may or may not work. @@ -135,7 +149,7 @@ Gaudi2 devices. Configurations that are not listed may or may not work. - [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -## Performance Tuning +## Performance tuning ### Execution modes @@ -179,7 +193,7 @@ Bucketing allows us to reduce the number of required graphs significantly, but i Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. 
These parameters can be observed in logs during vLLM startup: -``` +```text INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] @@ -190,7 +204,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 1 Example (with ramp-up) -``` +```text min = 2, step = 32, max = 64 => ramp_up = (2, 4, 8, 16) => stable = (32, 64) @@ -199,7 +213,7 @@ min = 2, step = 32, max = 64 Example (without ramp-up) -``` +```text min = 128, step = 128, max = 512 => ramp_up = () => stable = (128, 256, 384, 512) @@ -222,7 +236,7 @@ Bucketing is transparent to a client -- padding in sequence length dimension is Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. 
Each warmup step is logged during vLLM startup: -``` +```text INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB @@ -271,7 +285,7 @@ When there's large amount of requests pending, vLLM scheduler will attempt to fi Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): -``` +```text INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] @@ -347,26 +361,26 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi - Default values: - Prompt: - : - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` - - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` - - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` + - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` + - sequence length min 
(`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` - Decode: - : - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` - - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` - - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` + - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` + - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: - `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is default - `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs -## Troubleshooting: Tweaking HPU Graphs +## Troubleshooting: tweaking HPU graphs If you experience device out-of-memory issues or want to attempt inference at higher batch sizes, try tweaking HPU Graphs by following @@ -383,5 +397,5 @@ the below: completely. With HPU Graphs disabled, you are trading latency and throughput at lower batches for potentially higher throughput on higher batches. 
You can do that by adding `--enforce-eager` flag to - server (for online inference), or by passing `enforce_eager=True` + server (for online serving), or by passing `enforce_eager=True` argument to LLM constructor (for offline inference). diff --git a/docs/source/getting_started/installation/ai_accelerator/index.md b/docs/source/getting_started/installation/ai_accelerator/index.md new file mode 100644 index 0000000000..a6c4c44305 --- /dev/null +++ b/docs/source/getting_started/installation/ai_accelerator/index.md @@ -0,0 +1,375 @@ +# Other AI accelerators + +vLLM is a Python library that supports the following AI accelerators. Select your AI accelerator type to see vendor specific instructions: + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::: + +## Requirements + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "## Requirements" +:end-before: "## Configure a new environment" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "## Requirements" +:end-before: "## Configure a new environment" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "## Requirements" +:end-before: "## Configure a new environment" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "## Requirements" 
+:end-before: "## Set up using Python" +``` + +::: + +:::: + +## Configure a new environment + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "## Configure a new environment" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "## Configure a new environment" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "## Configure a new environment" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} ../python_env_setup.inc.md +``` + +::: + +:::: + +## Set up using Python + +### Pre-built wheels + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::: + +### Build wheel from source + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "### Build 
wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::: + +## Set up using Docker + +### Pre-built images + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::: + +### Build image from source + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "### Build image from source" +:end-before: "## Extra information" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "### Build image from source" +:end-before: "## Extra information" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "### Build image from source" +:end-before: "## Extra information" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "### Build image from source" +:end-before: "## Extra information" +``` + +::: + +:::: + +## Extra information + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "## Extra information" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md 
+:start-after: "## Extra information" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "## Extra information" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "## Extra information" +``` + +::: + +:::: diff --git a/docs/source/getting_started/neuron-installation.md b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md similarity index 83% rename from docs/source/getting_started/neuron-installation.md rename to docs/source/getting_started/installation/ai_accelerator/neuron.inc.md index baaeeb9f53..575a9f9c2e 100644 --- a/docs/source/getting_started/neuron-installation.md +++ b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md @@ -1,6 +1,4 @@ -(installation-neuron)= - -# Installation with Neuron +# Installation vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching. Paged Attention and Chunked Prefill are currently in development and will be available soon. @@ -14,28 +12,9 @@ Data types currently supported in Neuron SDK are FP16 and BF16. - Pytorch 2.0.1/2.1.1 - AWS Neuron SDK 2.16/2.17 (Verified on python 3.8) -Installation steps: +## Configure a new environment -- [Build from source](#build-from-source-neuron) - - - [Step 0. Launch Trn1/Inf2 instances](#launch-instances) - - [Step 1. Install drivers and tools](#install-drivers) - - [Step 2. Install transformers-neuronx and its dependencies](#install-tnx) - - [Step 3. Install vLLM from source](#install-vllm) - -(build-from-source-neuron)= - -```{note} -The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. 
-``` - -## Build from source - -Following instructions are applicable to Neuron SDK 2.16 and beyond. - -(launch-instances)= - -### Step 0. Launch Trn1/Inf2 instances +### Launch Trn1/Inf2 instances Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html). @@ -45,9 +24,7 @@ Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch N - When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB. - After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance -(install-drivers)= - -### Step 1. Install drivers and tools +### Install drivers and tools The installation of drivers and tools wouldn't be necessary, if [Deep Learning AMI Neuron](https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html) is installed. In case the drivers and tools are not installed on the operating system, follow the steps below: @@ -82,9 +59,21 @@ sudo apt-get install aws-neuronx-tools=2.* -y export PATH=/opt/aws/neuron/bin:$PATH ``` -(install-tnx)= +## Set up using Python -### Step 2. Install transformers-neuronx and its dependencies +### Pre-built wheels + +Currently, there are no pre-built Neuron wheels. + +### Build wheel from source + +```{note} +The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. +``` + +Following instructions are applicable to Neuron SDK 2.16 and beyond. 
+ +#### Install transformers-neuronx and its dependencies [transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) will be the backend to support inference on trn1/inf2 instances. Follow the steps below to install transformer-neuronx package and its dependencies. @@ -116,17 +105,31 @@ python -m pip install awscli python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx ``` -(install-vllm)= - -### Step 3. Install vLLM from source +#### Install vLLM from source Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ pip install -U -r requirements-neuron.txt -$ VLLM_TARGET_DEVICE="neuron" pip install . +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -U -r requirements-neuron.txt +VLLM_TARGET_DEVICE="neuron" pip install . ``` If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed. + +## Set up using Docker + +### Pre-built images + +Currently, there are no pre-built Neuron images. + +### Build image from source + +See <project:#deployment-docker-build-image-from-source> for instructions on building the Docker image. + +Make sure to use <gh-file:Dockerfile.neuron> in place of the default Dockerfile. + +## Extra information + +There is no extra information for this device. 
diff --git a/docs/source/getting_started/openvino-installation.md b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md similarity index 61% rename from docs/source/getting_started/openvino-installation.md rename to docs/source/getting_started/installation/ai_accelerator/openvino.inc.md index 8b43c0a904..a786747258 100644 --- a/docs/source/getting_started/openvino-installation.md +++ b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md @@ -1,63 +1,65 @@ -(installation-openvino)= +# Installation -# Installation with OpenVINO - -vLLM powered by OpenVINO supports all LLM models from {doc}`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features: - -- Prefix caching (`--enable-prefix-caching`) -- Chunked prefill (`--enable-chunked-prefill`) - -**Table of contents**: - -- [Requirements](#openvino-backend-requirements) -- [Quick start using Dockerfile](#openvino-backend-quick-start-dockerfile) -- [Build from source](#install-openvino-backend-from-source) -- [Performance tips](#openvino-backend-performance-tips) -- [Limitations](#openvino-backend-limitations) - -(openvino-backend-requirements)= +vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). ## Requirements - OS: Linux - Instruction set architecture (ISA) requirement: at least AVX2. 
-(openvino-backend-quick-start-dockerfile)= +## Set up using Python -## Quick start using Dockerfile +### Pre-built wheels + +Currently, there are no pre-built OpenVINO wheels. + +### Build wheel from source + +First, install Python. For example, on Ubuntu 22.04, you can run: ```console -$ docker build -f Dockerfile.openvino -t vllm-openvino-env . -$ docker run -it --rm vllm-openvino-env +sudo apt-get update -y +sudo apt-get install python3 ``` -(install-openvino-backend-from-source)= +Second, install the prerequisites for the vLLM OpenVINO backend installation: -## Install from source +```console +pip install --upgrade pip +pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu +``` -- First, install Python. For example, on Ubuntu 22.04, you can run: +Finally, install vLLM with the OpenVINO backend: - ```console - $ sudo apt-get update -y - $ sudo apt-get install python3 - ``` +```console +PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . +``` -- Second, install prerequisites vLLM OpenVINO backend installation: +:::{tip} +To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html). +::: - ```console - $ pip install --upgrade pip - $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu - ``` +## Set up using Docker -- Finally, install vLLM with OpenVINO backend: +### Pre-built images - ```console - $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . - ``` +Currently, there are no pre-built OpenVINO images. -- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up.
Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html). +### Build image from source -(openvino-backend-performance-tips)= +```console +docker build -f Dockerfile.openvino -t vllm-openvino-env . +docker run -it --rm vllm-openvino-env +``` + +## Extra information + +## Supported features + +OpenVINO vLLM backend supports the following advanced vLLM features: + +- Prefix caching (`--enable-prefix-caching`) +- Chunked prefill (`--enable-chunked-prefill`) ## Performance tips @@ -95,8 +97,6 @@ $ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json ``` -(openvino-backend-limitations)= - ## Limitations - LoRA serving is not supported. diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md similarity index 85% rename from docs/source/getting_started/tpu-installation.md rename to docs/source/getting_started/installation/ai_accelerator/tpu.inc.md index 4d3ac541c9..6a911cc6b9 100644 --- a/docs/source/getting_started/tpu-installation.md +++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md @@ -1,6 +1,4 @@ -(installation-tpu)= - -# Installation with TPU +# Installation Tensor Processing Units (TPUs) are Google's custom-developed application-specific integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs @@ -54,7 +52,16 @@ In all of the following commands, replace the ALL CAPS parameter names with appropriate values. See the parameter descriptions table for more information. 
``` -## Provision a Cloud TPU with the queued resource API +### Provision Cloud TPUs with GKE + +For more information about using TPUs with GKE, see: +- +- +- + +## Configure a new environment + +### Provision a Cloud TPU with the queued resource API Create a TPU v5e with 4 TPU chips: @@ -102,6 +109,14 @@ Connect to your TPU using SSH: gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE ``` +## Set up using Python + +### Pre-built wheels + +Currently, there are no pre-built TPU wheels. + +### Build wheel from source + Install Miniconda: ```bash @@ -142,28 +157,25 @@ Run the setup script: VLLM_TARGET_DEVICE="tpu" python setup.py develop ``` -## Provision Cloud TPUs with GKE +## Set up using Docker -For more information about using TPUs with GKE, see - - - +### Pre-built images -(build-docker-tpu)= +See for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`. -## Build a docker image with {code}`Dockerfile.tpu` +### Build image from source You can use to build a Docker image with TPU support. ```console -$ docker build -f Dockerfile.tpu -t vllm-tpu . +docker build -f Dockerfile.tpu -t vllm-tpu . ``` Run the Docker image with the following command: ```console -$ # Make sure to add `--privileged --net host --shm-size=16G`. -$ docker run --privileged --net host --shm-size=16G -it vllm-tpu +# Make sure to add `--privileged --net host --shm-size=16G`. +docker run --privileged --net host --shm-size=16G -it vllm-tpu ``` ```{note} @@ -189,3 +201,7 @@ Install OpenBLAS with the following command: $ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev ``` ```` + +## Extra information + +There is no extra information for this device. 
diff --git a/docs/source/getting_started/installation/cpu/apple.inc.md b/docs/source/getting_started/installation/cpu/apple.inc.md new file mode 100644 index 0000000000..56545253b1 --- /dev/null +++ b/docs/source/getting_started/installation/cpu/apple.inc.md @@ -0,0 +1,56 @@ +# Installation + +vLLM has experimental support for macOS with Apple silicon. For now, users must build vLLM from source to run it natively on macOS. + +Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. + +## Requirements + +- OS: `macOS Sonoma` or later +- SDK: `XCode 15.4` or later with Command Line Tools +- Compiler: `Apple Clang >= 15.0.0` + +## Set up using Python + +### Pre-built wheels + +### Build wheel from source + +After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source. + +```console +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -r requirements-cpu.txt +pip install -e . +``` + +```{note} +On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. +``` + +#### Troubleshooting + +If the build fails with an error like the following snippet, where standard C++ headers cannot be found, try removing and reinstalling your +[Command Line Tools for Xcode](https://developer.apple.com/download/all/). + +```text +[...] fatal error: 'map' file not found + 1 | #include <map> + | ^~~~~ + 1 error generated. + [2/8] Building CXX object CMakeFiles/_C.dir/csrc/cpu/pos_encoding.cpp.o + +[...] fatal error: 'cstddef' file not found + 10 | #include <cstddef> + | ^~~~~~~~~ + 1 error generated. 
+``` + +## Set up using Docker + +### Pre-built images + +### Build image from source + +## Extra information diff --git a/docs/source/getting_started/installation/cpu/arm.inc.md b/docs/source/getting_started/installation/cpu/arm.inc.md new file mode 100644 index 0000000000..08a764e1a2 --- /dev/null +++ b/docs/source/getting_started/installation/cpu/arm.inc.md @@ -0,0 +1,30 @@ +# Installation + +vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. + +The ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. + +## Requirements + +- OS: Linux +- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended) +- Instruction Set Architecture (ISA): NEON support is required + +## Set up using Python + +### Pre-built wheels + +### Build wheel from source + +:::{include} build.inc.md +::: + +Testing has been conducted on AWS Graviton3 instances for compatibility. + +## Set up using Docker + +### Pre-built images + +### Build image from source + +## Extra information diff --git a/docs/source/getting_started/installation/cpu/build.inc.md b/docs/source/getting_started/installation/cpu/build.inc.md new file mode 100644 index 0000000000..f8d1044a0d --- /dev/null +++ b/docs/source/getting_started/installation/cpu/build.inc.md @@ -0,0 +1,21 @@ +First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems.
For example, on Ubuntu 22.04, you can run: + +```console +sudo apt-get update -y +sudo apt-get install -y gcc-12 g++-12 libnuma-dev +sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 +``` + +Second, install Python packages for vLLM CPU backend building: + +```console +pip install --upgrade pip +pip install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy +pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +``` + +Finally, build and install vLLM CPU backend: + +```console +VLLM_TARGET_DEVICE=cpu python setup.py install +``` diff --git a/docs/source/getting_started/cpu-installation.md b/docs/source/getting_started/installation/cpu/index.md similarity index 59% rename from docs/source/getting_started/cpu-installation.md rename to docs/source/getting_started/installation/cpu/index.md index c3d3f715ed..4ec907c0e9 100644 --- a/docs/source/getting_started/cpu-installation.md +++ b/docs/source/getting_started/installation/cpu/index.md @@ -1,35 +1,136 @@ -(installation-cpu)= +# CPU -# Installation with CPU +vLLM is a Python library that supports the following CPU variants. Select your CPU type to see vendor specific instructions: -vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features: +::::{tab-set} +:sync-group: device -- Tensor Parallel -- Model Quantization (`INT8 W8A8, AWQ`) -- Chunked-prefill -- Prefix-caching -- FP8-E5M2 KV-Caching (TODO) +:::{tab-item} x86 +:sync: x86 -Table of contents: +```{include} x86.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` -1. [Requirements](#cpu-backend-requirements) -2. [Quick start using Dockerfile](#cpu-backend-quick-start-dockerfile) -3. [Build from source](#build-cpu-backend-from-source) -4. [Related runtime environment variables](#env-intro) -5.
[Intel Extension for PyTorch](#ipex-guidance) -6. [Performance tips](#cpu-backend-performance-tips) +::: -(cpu-backend-requirements)= +:::{tab-item} ARM +:sync: arm + +```{include} arm.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} Apple silicon +:sync: apple + +```{include} apple.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::: ## Requirements -- OS: Linux -- Compiler: `gcc/g++>=12.3.0` (optional, recommended) -- Instruction set architecture (ISA) requirement: AVX512 (optional, recommended) +- Python: 3.9 -- 3.12 -(cpu-backend-quick-start-dockerfile)= +::::{tab-set} +:sync-group: device -## Quick start using Dockerfile +:::{tab-item} x86 +:sync: x86 + +```{include} x86.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} ARM +:sync: arm + +```{include} arm.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} Apple silicon +:sync: apple + +```{include} apple.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` + +::: + +:::: + +## Set up using Python + +### Create a new Python environment + +```{include} ../python_env_setup.inc.md +``` + +### Pre-built wheels + +Currently, there are no pre-built CPU wheels. 
+ +### Build wheel from source + +::::{tab-set} +:sync-group: device + +:::{tab-item} x86 +:sync: x86 + +```{include} x86.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} ARM +:sync: arm + +```{include} arm.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} Apple silicon +:sync: apple + +```{include} apple.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::: + +## Set up using Docker + +### Pre-built images + +Currently, there are no pre-built CPU images. + +### Build image from source ```console $ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . @@ -41,69 +142,42 @@ $ docker run -it \ vllm-cpu-env ``` -(build-cpu-backend-from-source)= +:::{tip} +For ARM or Apple silicon, use `Dockerfile.arm`. +::: -## Build from source +## Supported features -- First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems.
For example, on Ubuntu 22.4, you can run: +vLLM CPU backend supports the following vLLM features: -```console -$ sudo apt-get update -y -$ sudo apt-get install -y gcc-12 g++-12 libnuma-dev -$ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 -``` - -- Second, install Python packages for vLLM CPU backend building: - -```console -$ pip install --upgrade pip -$ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy -$ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu -``` - -- Finally, build and install vLLM CPU backend: - -```console -$ VLLM_TARGET_DEVICE=cpu python setup.py install -``` - -```{note} -- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. -- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building. -``` - -(env-intro)= +- Tensor Parallel +- Model Quantization (`INT8 W8A8, AWQ, GPTQ`) +- Chunked-prefill +- Prefix-caching +- FP8-E5M2 KV-Caching (TODO) ## Related runtime environment variables - `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. 
`VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. -(ipex-guidance)= - -## Intel Extension for PyTorch - -- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware. - -(cpu-backend-performance-tips)= - ## Performance tips - We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run: ```console -$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library -$ find / -name *libtcmalloc* # find the dynamic link library path -$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD -$ python examples/offline_inference.py # run vLLM +sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library +find / -name *libtcmalloc* # find the dynamic link library path +export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD +python examples/offline_inference/basic.py # run vLLM ``` - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: ```console -$ export VLLM_CPU_KVCACHE_SPACE=40 -$ export VLLM_CPU_OMP_THREADS_BIND=0-29 -$ vllm serve facebook/opt-125m +export VLLM_CPU_KVCACHE_SPACE=40 +export VLLM_CPU_OMP_THREADS_BIND=0-29 +vllm serve facebook/opt-125m ``` - If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`. 
On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: @@ -132,23 +206,23 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ # On this platform, it is recommended to only bind OpenMP threads on logical CPU cores 0-7 or 8-15 $ export VLLM_CPU_OMP_THREADS_BIND=0-7 -$ python examples/offline_inference.py +$ python examples/offline_inference/basic.py ``` - If using vLLM CPU backend on a multi-socket machine with NUMA, be sure to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access. -## CPU Backend Considerations +## Other considerations - The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance. - Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance. -- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel. +- On a CPU-based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are recommended: Tensor Parallel or Data Parallel.
- Using Tensor Parallel for a latency-constrained deployment: following GPU backend design, Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is an example script to enable Tensor Parallel = 2 for serving: ```console - $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp + VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp ``` - - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](../serving/deploying_with_nginx.md) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md). + - Using Data Parallel for maximum throughput: launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. The Anyscale Ray project provides this feature for LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is an example of setting up scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md new file mode 100644 index 0000000000..e4f99d3ceb --- /dev/null +++ b/docs/source/getting_started/installation/cpu/x86.inc.md @@ -0,0 +1,35 @@ +# Installation + +vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. + +## Requirements + +- OS: Linux +- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended) +- Instruction Set Architecture (ISA): AVX512 (optional, recommended) + +## Set up using Python + +### Pre-built wheels + +### Build wheel from source + +:::{include} build.inc.md +::: + +```{note} +- AVX512_BF16 is an extension ISA that provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. +- If you want to force enable AVX512_BF16 for cross-compilation, please set the environment variable `VLLM_CPU_AVX512BF16=1` before building. +``` + +## Set up using Docker + +### Pre-built images + +### Build image from source + +## Extra information + +## Intel Extension for PyTorch + +- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date feature optimizations for an extra performance boost on Intel hardware.
diff --git a/docs/source/getting_started/installation/device.template.md b/docs/source/getting_started/installation/device.template.md new file mode 100644 index 0000000000..44f538da93 --- /dev/null +++ b/docs/source/getting_started/installation/device.template.md @@ -0,0 +1,17 @@ +# Installation + +## Requirements + +## Set up using Python + +### Pre-built wheels + +### Build wheel from source + +## Set up using Docker + +### Pre-built images + +### Build image from source + +## Extra information diff --git a/docs/source/getting_started/installation.md b/docs/source/getting_started/installation/gpu/cuda.inc.md similarity index 50% rename from docs/source/getting_started/installation.md rename to docs/source/getting_started/installation/gpu/cuda.inc.md index 996fb346f4..4cce65278c 100644 --- a/docs/source/getting_started/installation.md +++ b/docs/source/getting_started/installation/gpu/cuda.inc.md @@ -1,118 +1,118 @@ -(installation)= - # Installation -vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. +vLLM contains pre-compiled C++ and CUDA (12.1) binaries. ## Requirements -- OS: Linux -- Python: 3.9 -- 3.12 - GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) -## Install released versions +## Set up using Python -You can install vLLM using pip: - -```console -$ # (Recommended) Create a new conda environment. -$ conda create -n myenv python=3.12 -y -$ conda activate myenv - -$ # Install vLLM with CUDA 12.1. -$ pip install vllm -``` +### Create a new Python environment ```{note} -Although we recommend using `conda` to create and manage Python environments, it is highly recommended to use `pip` to install vLLM. This is because `pip` can install `torch` with separate library packages like `NCCL`, while `conda` installs `torch` with statically linked `NCCL`. This can cause issues when vLLM tries to use `NCCL`. See for more details. 
-``` - -````{note} -As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. -We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions: - -```console -$ # Install vLLM with CUDA 11.8. -$ export VLLM_VERSION=0.6.1.post1 -$ export PYTHON_VERSION=310 -$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See for more details. ``` In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. -Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions. -```` +Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-from-source) for more details. + +### Pre-built wheels + +You can install vLLM using either `pip` or `uv pip`: + +```console +# Install vLLM with CUDA 12.1. +pip install vllm # If you are using pip. +uv pip install vllm # If you are using uv. +``` + +As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions: + +```console +# Install vLLM with CUDA 11.8. 
+export VLLM_VERSION=0.6.1.post1 +export PYTHON_VERSION=310 +pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +``` (install-the-latest-code)= -## Install the latest code +#### Install the latest code -LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. You can download and install it with the following command: +LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. + +##### Install the latest code using `pip` ```console -$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly ``` -If you want to access the wheels for previous commits, you can specify the commit hash in the URL: +`--pre` is required for `pip` to consider pre-released versions. + +If you want to access the wheels for previous commits (e.g. 
to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL: ```console -$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl ``` -Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. +Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. 
-Another way to access the latest code is to use the docker images: +##### Install the latest code using `uv` + +Another way to install the latest code is to use `uv`: ```console -$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -$ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} +uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly ``` -These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. +If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL: -The latest code can contain bugs and may not be stable. Please use it with caution. +```console +export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch +uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} +``` -(build-from-source)= +The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. -## Build from source +### Build wheel from source -(python-only-build)= - -### Python-only build (without compilation) +#### Set up using Python-only build (without compilation) If you only need to change Python code, you can build and install vLLM without compilation. 
Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ VLLM_USE_PRECOMPILED=1 pip install --editable . +git clone https://github.com/vllm-project/vllm.git +cd vllm +VLLM_USE_PRECOMPILED=1 pip install --editable . ``` -This will download the latest nightly wheel and use the compiled libraries from there in the install. +This will download the [latest nightly wheel](https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl) and use the compiled libraries from there in the installation. The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files): ```console -$ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl -$ pip install --editable . +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl +pip install --editable . ``` -You can find more information about vLLM's wheels [above](#install-the-latest-code). +You can find more information about vLLM's wheels in . ```{note} There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. -It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [the section above](#install-the-latest-code) for instructions on how to install a specified wheel. 
+It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to for instructions on how to install a specified wheel. ``` -### Full build (with compilation) +#### Full build (with compilation) If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ pip install -e . +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -e . ``` ```{tip} @@ -125,7 +125,7 @@ As long as `which ccache` command can find the `ccache` binary, it will be used The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. ``` -#### Use an existing PyTorch installation +##### Use an existing PyTorch installation There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.: @@ -135,32 +135,32 @@ There are scenarios where the PyTorch dependency cannot be easily installed via To build vLLM using an existing PyTorch installation: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ python use_existing_torch.py -$ pip install -r requirements-build.txt -$ pip install -e . --no-build-isolation +git clone https://github.com/vllm-project/vllm.git +cd vllm +python use_existing_torch.py +pip install -r requirements-build.txt +pip install -e . --no-build-isolation ``` -#### Use the local cutlass for compilation +##### Use the local cutlass for compilation Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead. To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory. 
```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . +git clone https://github.com/vllm-project/vllm.git +cd vllm +VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . ``` -#### Troubleshooting +##### Troubleshooting To avoid your system being overloaded, you can limit the number of compilation jobs to be run simultaneously, via the environment variable `MAX_JOBS`. For example: ```console -$ export MAX_JOBS=6 -$ pip install -e . +export MAX_JOBS=6 +pip install -e . ``` This is especially useful when you are building on less powerful machines. For example, when you use WSL it only [assigns 50% of the total memory by default](https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings), so using `export MAX_JOBS=1` can avoid compiling multiple files simultaneously and running out of memory. @@ -169,31 +169,56 @@ A side effect is a much slower build process. Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. ```console -$ # Use `--ipc=host` to make sure the shared memory is large enough. -$ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 +# Use `--ipc=host` to make sure the shared memory is large enough. +docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 ``` If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). 
After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.: ```console -$ export CUDA_HOME=/usr/local/cuda -$ export PATH="${CUDA_HOME}/bin:$PATH" +export CUDA_HOME=/usr/local/cuda +export PATH="${CUDA_HOME}/bin:$PATH" ``` Here is a sanity check to verify that the CUDA Toolkit is correctly installed: ```console -$ nvcc --version # verify that nvcc is in your PATH -$ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME +nvcc --version # verify that nvcc is in your PATH +${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME ``` -### Unsupported OS build +#### Unsupported OS build vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing: ```console -$ export VLLM_TARGET_DEVICE=empty -$ pip install -e . +export VLLM_TARGET_DEVICE=empty +pip install -e . ``` + +## Set up using Docker + +### Pre-built images + +See for instructions on using the official Docker image. + +Another way to access the latest code is to use the docker images: + +```console +export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} +``` + +These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. + +The latest code can contain bugs and may not be stable. Please use it with caution. + +### Build image from source + +See for instructions on building the Docker image. + +## Supported features + +See compatibility matrix for feature support information. 
diff --git a/docs/source/getting_started/installation/gpu/index.md b/docs/source/getting_started/installation/gpu/index.md new file mode 100644 index 0000000000..6c007382b2 --- /dev/null +++ b/docs/source/getting_started/installation/gpu/index.md @@ -0,0 +1,300 @@ +# GPU + +vLLM is a Python library that supports the following GPU variants. Select your GPU type to see vendor specific instructions: + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::: + +## Requirements + +- OS: Linux +- Python: 3.9 -- 3.12 + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` + +::: + +:::: + +## Set up using Python + +### Create a new Python environment + +```{include} ../python_env_setup.inc.md +``` + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "## Create a new Python environment" +:end-before: "### Pre-built wheels" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +There is no extra information on creating a new Python environment for this device. + +::: + +:::{tab-item} XPU +:sync: xpu + +There is no extra information on creating a new Python environment for this device. 
+ +::: + +:::: + +### Pre-built wheels + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::: + +(build-from-source)= + +### Build wheel from source + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::: + +## Set up using Docker + +### Pre-built images + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::: + +### Build image from source + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "### Build image from source" +:end-before: "## Supported features" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: 
"### Build image from source" +:end-before: "## Supported features" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "### Build image from source" +:end-before: "## Supported features" +``` + +::: + +:::: + +## Supported features + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "## Supported features" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "## Supported features" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "## Supported features" +``` + +::: + +:::: diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md new file mode 100644 index 0000000000..69238f6e36 --- /dev/null +++ b/docs/source/getting_started/installation/gpu/rocm.inc.md @@ -0,0 +1,173 @@ +# Installation + +vLLM supports AMD GPUs with ROCm 6.2. + +## Requirements + +- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) +- ROCm 6.2 + +## Set up using Python + +### Pre-built wheels + +Currently, there are no pre-built ROCm wheels. + +However, the [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized +docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator. + +```{tip} +Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html) +for instructions on how to use this prebuilt docker image. +``` + +### Build wheel from source + +0. 
Install prerequisites (skip if you are already in an environment/docker with the following installed): + +- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) +- [PyTorch](https://pytorch.org/) + + For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. + + Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/) + +1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) + + Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) + + ```console + python3 -m pip install ninja cmake wheel pybind11 + pip uninstall -y triton + git clone https://github.com/OpenAI/triton.git + cd triton + git checkout e192dba + cd python + pip3 install . + cd ../.. + ``` + + ```{note} + - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. + ``` + +2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile) + + Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support) + Alternatively, wheels intended for vLLM use can be accessed under the releases. + + For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. + + ```console + git clone https://github.com/ROCm/flash-attention.git + cd flash-attention + git checkout 3cea2fb + git submodule update --init + GPU_ARCHS="gfx90a" python3 setup.py install + cd .. 
+ ``` + + ```{note} + - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) + ``` + +3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps: + + ```bash + $ pip install --upgrade pip + + # Install PyTorch + $ pip uninstall torch -y + $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/rocm6.2 + + # Build & install AMD SMI + $ pip install /opt/rocm/share/amd_smi + + # Install dependencies + $ pip install --upgrade numba scipy huggingface-hub[cli] + $ pip install "numpy<2" + $ pip install -r requirements-rocm.txt + + # Build vLLM for MI210/MI250/MI300. + $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" + $ python3 setup.py develop + ``` + + This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. + + ```{tip} + - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. + - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. + - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. + - The ROCm version of PyTorch, ideally, should match the ROCm driver version. + ``` + +```{tip} +- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. + For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). +``` + +## Set up using Docker + +### Pre-built images + +Currently, there are no pre-built ROCm images. 
+ +### Build image from source + +Building the Docker image from source is the recommended way to use vLLM with ROCm. + +First, build a docker image from `Dockerfile.rocm` and launch a docker container from the image. +It is important that the user kicks off the docker build using buildkit. Either set `DOCKER_BUILDKIT=1` as an environment variable when calling the `docker build` command, or set up buildkit in the docker daemon configuration `/etc/docker/daemon.json` as follows and restart the daemon: + +```console +{ + "features": { + "buildkit": true + } +} +``` + +`Dockerfile.rocm` uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. +It provides flexibility to customize the build of the docker image using the following arguments: + +- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using +- `USE_CYTHON`: An option to run cython compilation on a subset of python files upon docker build +- `BUILD_RPD`: Include RocmProfileData profiling tool in the image +- `ARG_PYTORCH_ROCM_ARCH`: Allows overriding the gfx architecture values from the base docker image + +Their values can be passed in when running `docker build` with `--build-arg` options. + +To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: + +```console +DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . +``` + +To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should pick the alternative base image: + +```console +DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f Dockerfile.rocm -t vllm-rocm .
+``` + +To run the above docker image `vllm-rocm`, use the below command: + +```console +docker run -it \ + --network=host \ + --group-add=video \ + --ipc=host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --device /dev/kfd \ + --device /dev/dri \ + -v <path/to/model>:/app/model \ + vllm-rocm \ + bash +``` + +Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models. + +## Supported features + +See compatibility matrix for feature support information. diff --git a/docs/source/getting_started/xpu-installation.md b/docs/source/getting_started/installation/gpu/xpu.inc.md similarity index 57% rename from docs/source/getting_started/xpu-installation.md rename to docs/source/getting_started/installation/gpu/xpu.inc.md index 9554ae4b7f..577986eba7 100644 --- a/docs/source/getting_started/xpu-installation.md +++ b/docs/source/getting_started/installation/gpu/xpu.inc.md @@ -1,26 +1,47 @@ -(installation-xpu)= - -# Installation with XPU +# Installation vLLM initially supports basic model inferencing and serving on Intel GPU platform. -Table of contents: - -1. [Requirements](#xpu-backend-requirements) -2. [Quick start using Dockerfile](#xpu-backend-quick-start-dockerfile) -3. [Build from source](#build-xpu-backend-from-source) - -(xpu-backend-requirements)= - ## Requirements -- OS: Linux - Supported Hardware: Intel Data Center GPU, Intel ARC GPU - OneAPI requirements: oneAPI 2024.2 -(xpu-backend-quick-start-dockerfile)= +## Set up using Python -## Quick start using Dockerfile +### Pre-built wheels + +Currently, there are no pre-built XPU wheels. + +### Build wheel from source + +- First, install required driver and intel OneAPI 2024.2 or later. 
+- Second, install Python packages for vLLM XPU backend building: + +```console +source /opt/intel/oneapi/setvars.sh +pip install --upgrade pip +pip install -v -r requirements-xpu.txt +``` + +- Finally, build and install vLLM XPU backend: + +```console +VLLM_TARGET_DEVICE=xpu python setup.py install +``` + +```{note} +- FP16 is the default data type in the current XPU backend. The BF16 data + type will be supported in the future. +``` + +## Set up using Docker + +### Pre-built images + +Currently, there are no pre-built XPU images. + +### Build image from source ```console $ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . @@ -32,43 +53,19 @@ $ docker run -it \ vllm-xpu-env ``` -(build-xpu-backend-from-source)= - -## Build from source - -- First, install required driver and intel OneAPI 2024.2 or later. -- Second, install Python packages for vLLM XPU backend building: - -```console -$ source /opt/intel/oneapi/setvars.sh -$ pip install --upgrade pip -$ pip install -v -r requirements-xpu.txt -``` - -- Finally, build and install vLLM XPU backend: - -```console -$ VLLM_TARGET_DEVICE=xpu python setup.py install -``` - -```{note} -- FP16 is the default data type in the current XPU backend. The BF16 data - type will be supported in the future. -``` - -## Distributed inference and serving +## Supported features XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We requires Ray as the distributed runtime backend. 
For example, a reference execution likes following: ```console -$ python -m vllm.entrypoints.openai.api_server \ -$ --model=facebook/opt-13b \ -$ --dtype=bfloat16 \ -$ --device=xpu \ -$ --max_model_len=1024 \ -$ --distributed-executor-backend=ray \ -$ --pipeline-parallel-size=2 \ -$ -tp=8 +python -m vllm.entrypoints.openai.api_server \ + --model=facebook/opt-13b \ + --dtype=bfloat16 \ + --device=xpu \ + --max_model_len=1024 \ + --distributed-executor-backend=ray \ + --pipeline-parallel-size=2 \ + -tp=8 ``` -By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. +By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md new file mode 100644 index 0000000000..bc1d268bf0 --- /dev/null +++ b/docs/source/getting_started/installation/index.md @@ -0,0 +1,13 @@ +(installation-index)= + +# Installation + +vLLM supports the following hardware platforms: + +```{toctree} +:maxdepth: 1 + +gpu/index +cpu/index +ai_accelerator/index +``` diff --git a/docs/source/getting_started/installation/python_env_setup.inc.md b/docs/source/getting_started/installation/python_env_setup.inc.md new file mode 100644 index 0000000000..25cfac5f58 --- /dev/null +++ b/docs/source/getting_started/installation/python_env_setup.inc.md @@ -0,0 +1,19 @@ +You can create a new Python environment using `conda`: + +```console +# (Recommended) Create a new conda environment. 
+conda create -n myenv python=3.12 -y +conda activate myenv +``` + +```{note} +[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages. +``` + +Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command: + +```console +# (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment. +uv venv myenv --python 3.12 --seed +source myenv/bin/activate +``` diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 9c8b7e4f59..8ac80e5e5c 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -2,34 +2,45 @@ # Quickstart -This guide will help you quickly get started with vLLM to: +This guide will help you quickly get started with vLLM to perform: -- [Run offline batched inference](#offline-batched-inference) -- [Run OpenAI-compatible inference](#openai-compatible-server) +- [Offline batched inference](#quickstart-offline) +- [Online serving using OpenAI-compatible server](#quickstart-online) ## Prerequisites - OS: Linux - Python: 3.9 -- 3.12 -- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) ## Installation -You can install vLLM using pip. It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. +If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/project/vllm/) directly. 
+ +It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands: ```console -$ conda create -n myenv python=3.10 -y -$ conda activate myenv -$ pip install vllm +uv venv myenv --python 3.12 --seed +source myenv/bin/activate +uv pip install vllm ``` -Please refer to the {ref}`installation documentation ` for more details on installing vLLM. +You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. -(offline-batched-inference)= +```console +conda create -n myenv python=3.12 -y +conda activate myenv +pip install vllm +``` + +```{note} +For non-CUDA platforms, please refer [here](#installation-index) for specific instructions on how to install vLLM. +``` + +(quickstart-offline)= ## Offline Batched Inference -With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`: @@ -40,7 +51,7 @@ The first line of this example imports the classes {class}`~vllm.LLM` and {class from vllm import LLM, SamplingParams ``` -The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. 
You can find more information about the sampling parameters [here](https://docs.vllm.ai/en/stable/dev/sampling_params.html). +The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](#sampling-params). ```python prompts = [ @@ -73,7 +84,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -(openai-compatible-server)= +(quickstart-online)= ## OpenAI-Compatible Server @@ -83,7 +94,7 @@ By default, it starts the server at `http://localhost:8000`. You can specify the Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model: ```console -$ vllm serve Qwen/Qwen2.5-1.5B-Instruct +vllm serve Qwen/Qwen2.5-1.5B-Instruct ``` ```{note} @@ -94,7 +105,7 @@ You can learn about overriding it [here](#chat-template). This server can be queried in the same format as OpenAI API. For example, to list the models: ```console -$ curl http://localhost:8000/v1/models +curl http://localhost:8000/v1/models ``` You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` to enable the server to check for API key in the header. 
@@ -104,14 +115,14 @@ You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` Once your server is started, you can query the model with input prompts: ```console -$ curl http://localhost:8000/v1/completions \ -$ -H "Content-Type: application/json" \ -$ -d '{ -$ "model": "Qwen/Qwen2.5-1.5B-Instruct", -$ "prompt": "San Francisco is a", -$ "max_tokens": 7, -$ "temperature": 0 -$ }' +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-1.5B-Instruct", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' ``` Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package: @@ -131,7 +142,7 @@ completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", print("Completion result:", completion) ``` -A more detailed client example can be found here: +A more detailed client example can be found here: ### OpenAI Chat Completions API with vLLM @@ -140,15 +151,15 @@ vLLM is designed to also support the OpenAI Chat Completions API. 
The chat inter You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model: ```console -$ curl http://localhost:8000/v1/chat/completions \ -$ -H "Content-Type: application/json" \ -$ -d '{ -$ "model": "Qwen/Qwen2.5-1.5B-Instruct", -$ "messages": [ -$ {"role": "system", "content": "You are a helpful assistant."}, -$ {"role": "user", "content": "Who won the world series in 2020?"} -$ ] -$ }' +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-1.5B-Instruct", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Who won the world series in 2020?"} + ] + }' ``` Alternatively, you can use the `openai` Python package: diff --git a/docs/source/getting_started/debugging.md b/docs/source/getting_started/troubleshooting.md similarity index 72% rename from docs/source/getting_started/debugging.md rename to docs/source/getting_started/troubleshooting.md index 19eb699572..7bfe9b4036 100644 --- a/docs/source/getting_started/debugging.md +++ b/docs/source/getting_started/troubleshooting.md @@ -1,8 +1,8 @@ -(debugging)= +(troubleshooting)= -# Debugging Tips +# Troubleshooting -This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. +This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. 
If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. ```{note} Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. @@ -22,9 +22,9 @@ It'd be better to store the model in a local disk. Additionally, have a look at To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. ``` -## Model is too large +## Out of memory -If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider [using tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. 
The model loading time should remain constant regardless of the size of tensor parallelism. ## Enable more logging @@ -47,6 +47,8 @@ You might also need to set `export NCCL_SOCKET_IFNAME=` If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph. To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. +(troubleshooting-incorrect-hardware-driver)= + ## Incorrect hardware/driver If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. @@ -117,13 +119,13 @@ dist.destroy_process_group() If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use: ```console -$ NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py +NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py ``` If you are testing with multi-nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address of the master node, reachable from all nodes. Then, run: ```console -$ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py +NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py ``` If the script runs successfully, you should see the message `sanity check is successful!`. @@ -139,7 +141,8 @@ A multi-node environment is more complicated than a single-node one. If you see Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes. 
``` -(debugging-python-multiprocessing)= +(troubleshooting-python-multiprocessing)= + ## Python multiprocessing ### `RuntimeError` Exception @@ -150,7 +153,7 @@ If you have seen a warning in your logs like this: WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously initialized. We must use the `spawn` multiprocessing start method. Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See - https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing + https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#python-multiprocessing for more information. ``` @@ -194,6 +197,63 @@ if __name__ == '__main__': llm = vllm.LLM(...) ``` +## `torch.compile` Error + +vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](https://github.com/vllm-project/vllm/pull/10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script: + +```python +import torch + +@torch.compile +def f(x): + # a simple function to test torch.compile + x = x + 1 + x = x * 2 + x = x.sin() + return x + +x = torch.randn(4, 4).cuda() +print(f(x)) +``` + +If it raises errors from `torch/_inductor` directory, usually it means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See [this issue](https://github.com/vllm-project/vllm/issues/12219) for example. + +## Model failed to be inspected + +If you see an error like: + +```text + File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported + raise ValueError( +ValueError: Model architectures [''] failed to be inspected. Please check the logs for more details. +``` + +It means that vLLM failed to import the model file. 
+Usually, it is related to missing dependencies or outdated binaries in the vLLM build. +Please read the logs carefully to determine the root cause of the error. + +## Model not supported + +If you see an error like: + +```text +Traceback (most recent call last): +... + File "vllm/model_executor/models/registry.py", line xxx, in inspect_model_cls + for arch in architectures: +TypeError: 'NoneType' object is not iterable +``` + +or: + +```text + File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported + raise ValueError( +ValueError: Model architectures [''] are not supported for now. Supported architectures: [...] +``` + +But you are sure that the model is in the [list of supported models](#supported-models), there may be some issue with vLLM's model resolution. In that case, please follow [these steps](#model-resolution) to explicitly specify the vLLM implementation for the model. + ## Known Issues - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759). diff --git a/docs/source/index.md b/docs/source/index.md index 34f9c4caeb..d7a1117df9 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,4 +1,4 @@ -# Welcome to vLLM! +# Welcome to vLLM ```{figure} ./assets/logos/vllm-logo-text-light.png :align: center @@ -23,10 +23,12 @@ vLLM is a fast and easy-to-use library for LLM inference and serving. +Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry. 
+ vLLM is fast with: - State-of-the-art serving throughput -- Efficient management of attention key and value memory with **PagedAttention** +- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Continuous batching of incoming requests - Fast model execution with CUDA/HIP graph - Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8 @@ -50,103 +52,133 @@ For more information, check out the following: - [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention) - [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023) - [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al. -- {ref}`vLLM Meetups `. +- [vLLM Meetups](#meetups) ## Documentation +% How to start using vLLM? + ```{toctree} :caption: Getting Started :maxdepth: 1 -getting_started/installation -getting_started/amd-installation -getting_started/openvino-installation -getting_started/cpu-installation -getting_started/gaudi-installation -getting_started/arm-installation -getting_started/neuron-installation -getting_started/tpu-installation -getting_started/xpu-installation +getting_started/installation/index getting_started/quickstart -getting_started/debugging getting_started/examples/examples_index +getting_started/troubleshooting +getting_started/faq ``` -```{toctree} -:caption: Serving -:maxdepth: 1 - -serving/openai_compatible_server -serving/deploying_with_docker -serving/deploying_with_k8s -serving/deploying_with_helm -serving/deploying_with_nginx -serving/distributed_serving -serving/metrics -serving/integrations -serving/tensorizer -serving/runai_model_streamer -``` +% What does vLLM support? 
```{toctree} :caption: Models :maxdepth: 1 -models/supported_models models/generative_models models/pooling_models -models/adding_model -models/enabling_multimodal_inputs +models/supported_models +models/extensions/index ``` +% Additional capabilities + ```{toctree} -:caption: Usage +:caption: Features :maxdepth: 1 -usage/lora -usage/multimodal_inputs -usage/tool_calling -usage/structured_outputs -usage/spec_decode -usage/compatibility_matrix -usage/performance -usage/faq -usage/engine_args -usage/env_vars -usage/usage_stats -usage/disagg_prefill +features/quantization/index +features/lora +features/tool_calling +features/structured_outputs +features/automatic_prefix_caching +features/disagg_prefill +features/spec_decode +features/compatibility_matrix ``` +% Details about running vLLM + ```{toctree} -:caption: Quantization +:caption: Inference and Serving :maxdepth: 1 -quantization/supported_hardware -quantization/auto_awq -quantization/bnb -quantization/gguf -quantization/int8 -quantization/fp8 -quantization/fp8_e5m2_kvcache -quantization/fp8_e4m3_kvcache +serving/offline_inference +serving/openai_compatible_server +serving/multimodal_inputs +serving/distributed_serving +serving/metrics +serving/engine_args +serving/env_vars +serving/usage_stats +serving/integrations/index ``` +% Scaling up vLLM for production + ```{toctree} -:caption: Automatic Prefix Caching +:caption: Deployment :maxdepth: 1 -automatic_prefix_caching/apc -automatic_prefix_caching/details +deployment/docker +deployment/k8s +deployment/nginx +deployment/frameworks/index +deployment/integrations/index ``` +% Making the most out of vLLM + ```{toctree} :caption: Performance :maxdepth: 1 +performance/optimization performance/benchmarks ``` -% Community: User community resources +% Explanation of vLLM internals + +```{toctree} +:caption: Design Documents +:maxdepth: 2 + +design/arch_overview +design/huggingface_integration +design/plugin_system +design/kernel/paged_attention +design/mm_processing 
+design/automatic_prefix_caching +design/multiprocessing +``` + +% How to contribute to the vLLM project + +```{toctree} +:caption: Developer Guide +:maxdepth: 2 + +contributing/overview +contributing/profiling/profiling_index +contributing/dockerfile/dockerfile +contributing/model/index +contributing/vulnerability_management +``` + +% Technical API specifications + +```{toctree} +:caption: API Reference +:maxdepth: 2 + +api/offline_inference/index +api/engine/index +api/inference_params +api/multimodal/index +api/model/index +``` + +% Latest news and acknowledgements ```{toctree} :caption: Community @@ -156,45 +188,7 @@ community/meetups community/sponsors ``` -% API Documentation: API reference aimed at vllm library usage - -```{toctree} -:caption: API Documentation -:maxdepth: 2 - -dev/sampling_params -dev/pooling_params -dev/offline_inference/offline_index -dev/engine/engine_index -``` - -% Design: docs about vLLM internals - -```{toctree} -:caption: Design -:maxdepth: 2 - -design/arch_overview -design/huggingface_integration -design/plugin_system -design/input_processing/model_inputs_index -design/kernel/paged_attention -design/multimodal/multimodal_index -design/multiprocessing -``` - -% For Developers: contributing to the vLLM project - -```{toctree} -:caption: For Developers -:maxdepth: 2 - -contributing/overview -contributing/profiling/profiling_index -contributing/dockerfile/dockerfile -``` - -# Indices and tables +## Indices and tables - {ref}`genindex` - {ref}`modindex` diff --git a/docs/source/models/adding_model.md b/docs/source/models/adding_model.md deleted file mode 100644 index 02537fba02..0000000000 --- a/docs/source/models/adding_model.md +++ /dev/null @@ -1,155 +0,0 @@ -(adding-a-new-model)= - -# Adding a New Model - -This document provides a high-level guide on integrating a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM. 
- -```{note} -The complexity of adding a new model depends heavily on the model's architecture. -The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. -However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. -``` - -```{note} -By default, vLLM models do not support multi-modal inputs. To enable multi-modal support, -please follow [this guide](#enabling-multimodal-inputs) after implementing the model here. -``` - -```{tip} -If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our [GitHub](https://github.com/vllm-project/vllm/issues) repository. -We will be happy to help you out! -``` - -## 0. Fork the vLLM repository - -Start by forking our [GitHub] repository and then [build it from source](#build-from-source). -This gives you the ability to modify the codebase and test your model. - -```{tip} -If you don't want to fork the repository and modify vLLM's codebase, please refer to the "Out-of-Tree Model Integration" section below. -``` - -## 1. Bring your model code - -Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the directory. -For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from the HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. - -```{warning} -When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. -``` - -## 2. Make your code compatible with vLLM - -To ensure compatibility with vLLM, your model must meet the following requirements: - -### Initialization Code - -All vLLM modules within the model must include a `prefix` argument in their constructor. 
This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for: - -- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts. -- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode. - -The initialization code should look like this: - -```python -from torch import nn -from vllm.config import VllmConfig -from vllm.attention import Attention - -class MyAttention(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.attn = Attention(prefix=f"{prefix}.attn") - -class MyDecoderLayer(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.self_attn = MyAttention(prefix=f"{prefix}.self_attn") - -class MyModel(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.layers = nn.ModuleList( - [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)] - ) - -class MyModelForCausalLM(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - self.model = MyModel(vllm_config, prefix=f"{prefix}.model") -``` - -### Computation Code - -Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. 
- -```python -def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, -) -> torch.Tensor: - ... -``` - -```{note} -Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. -If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. -``` - -For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out for more examples. - -## 3. (Optional) Implement tensor parallelism and quantization support - -If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. -To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. -For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with {code}`VocabParallelEmbedding`. For the output LM head, you can use {code}`ParallelLMHead`. -When it comes to the linear layers, we provide the following options to parallelize them: - -- {code}`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving. -- {code}`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer. -- {code}`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. 
Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer. -- {code}`MergedColumnParallelLinear`: Column-parallel linear that merges multiple {code}`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices. -- {code}`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When number of key/value heads are less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices. - -Note that all the linear layers above take {code}`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization. - -## 4. Implement the weight loading logic - -You now need to implement the {code}`load_weights` method in your {code}`*ForCausalLM` class. -This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for {code}`MergedColumnParallelLinear` and {code}`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately. - -## 5. Register your model - -Finally, register your {code}`*ForCausalLM` class to the {code}`_VLLM_MODELS` in . - -## 6. Out-of-Tree Model Integration - -You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For general introduction of the plugin system, see [plugin-system](#plugin-system). 
- -To register the model, use the following code: - -```python -from vllm import ModelRegistry -from your_code import YourModelForCausalLM -ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) -``` - -If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like {code}`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: - -```python -from vllm import ModelRegistry - -ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") -``` - -```{important} -If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. -Read more about that [here](#enabling-multimodal-inputs). -``` - -```{note} -Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. -``` diff --git a/docs/source/models/enabling_multimodal_inputs.md b/docs/source/models/enabling_multimodal_inputs.md deleted file mode 100644 index fdd7708879..0000000000 --- a/docs/source/models/enabling_multimodal_inputs.md +++ /dev/null @@ -1,143 +0,0 @@ -(enabling-multimodal-inputs)= - -# Enabling Multimodal Inputs - -This document walks you through the steps to extend a vLLM model so that it accepts [multi-modal inputs](#multimodal-inputs). - -```{seealso} -[Adding a New Model](adding-a-new-model) -``` - -## 1. Update the base vLLM model - -It is assumed that you have already implemented the model in vLLM according to [these steps](#adding-a-new-model). -Further update the model as follows: - -- Implement the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. 
- - ```diff - + from vllm.model_executor.models.interfaces import SupportsMultiModal - - - class YourModelForImage2Seq(nn.Module): - + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - ``` - - ```{note} - The model class does not have to be named {code}`*ForCausalLM`. - Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples. - ``` - -- If you haven't already done so, reserve a keyword parameter in {meth}`~torch.nn.Module.forward` - for each input tensor that corresponds to a multi-modal input, as shown in the following example: - - ```diff - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - + pixel_values: torch.Tensor, - ) -> SamplerOutput: - ``` - -## 2. Register input mappers - -For each modality type that the model accepts as input, decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_input_mapper `. -This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in {meth}`~torch.nn.Module.forward`. - -```diff - from vllm.model_executor.models.interfaces import SupportsMultiModal -+ from vllm.multimodal import MULTIMODAL_REGISTRY - -+ @MULTIMODAL_REGISTRY.register_image_input_mapper() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): -``` - -A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function. - -```{seealso} -[Input Processing Pipeline](#input-processing-pipeline) -``` - -## 3. Register maximum number of multi-modal tokens - -For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item -and register it via {meth}`INPUT_REGISTRY.register_dummy_data `. 
- -```diff - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY - - @MULTIMODAL_REGISTRY.register_image_input_mapper() -+ @MULTIMODAL_REGISTRY.register_max_image_tokens() - @INPUT_REGISTRY.register_dummy_data() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): -``` - -Here are some examples: - -- Image inputs (static feature size): [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py) -- Image inputs (dynamic feature size): [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py) - -```{seealso} -[Input Processing Pipeline](#input-processing-pipeline) -``` - -## 4. (Optional) Register dummy data - -During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models. -In such cases, you can define your own dummy data by registering a factory method via {meth}`INPUT_REGISTRY.register_dummy_data `. - -```diff - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY - - @MULTIMODAL_REGISTRY.register_image_input_mapper() - @MULTIMODAL_REGISTRY.register_max_image_tokens() -+ @INPUT_REGISTRY.register_dummy_data() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): -``` - -```{note} -The dummy data should have the maximum possible number of multi-modal tokens, as described in the previous step. -``` - -Here are some examples: - -- Image inputs (static feature size): [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py) -- Image inputs (dynamic feature size): [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py) - -```{seealso} -[Input Processing Pipeline](#input-processing-pipeline) -``` - -## 5. 
(Optional) Register input processor - -Sometimes, there is a need to process inputs at the {class}`~vllm.LLMEngine` level before they are passed to the model executor. -This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's {meth}`~torch.nn.Module.forward` call. -You can register input processors via {meth}`INPUT_REGISTRY.register_input_processor `. - -```diff - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY - - @MULTIMODAL_REGISTRY.register_image_input_mapper() - @MULTIMODAL_REGISTRY.register_max_image_tokens() - @INPUT_REGISTRY.register_dummy_data() -+ @INPUT_REGISTRY.register_input_processor() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): -``` - -A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation. 
-Here are some examples: - -- Insert static number of image tokens: [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py) -- Insert dynamic number of image tokens: [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py) - -```{seealso} -[Input Processing Pipeline](#input-processing-pipeline) -``` diff --git a/docs/source/models/extensions/index.md b/docs/source/models/extensions/index.md new file mode 100644 index 0000000000..cff09d12eb --- /dev/null +++ b/docs/source/models/extensions/index.md @@ -0,0 +1,8 @@ +# Built-in Extensions + +```{toctree} +:maxdepth: 1 + +runai_model_streamer +tensorizer +``` diff --git a/docs/source/serving/runai_model_streamer.md b/docs/source/models/extensions/runai_model_streamer.md similarity index 69% rename from docs/source/serving/runai_model_streamer.md rename to docs/source/models/extensions/runai_model_streamer.md index d4269050ff..75f7a9fcad 100644 --- a/docs/source/serving/runai_model_streamer.md +++ b/docs/source/models/extensions/runai_model_streamer.md @@ -1,6 +1,6 @@ (runai-model-streamer)= -# Loading Models with Run:ai Model Streamer +# Loading models with Run:ai Model Streamer Run:ai Model Streamer is a library to read tensors in concurrency, while streaming it to GPU memory. Further reading can be found in [Run:ai Model Streamer Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/README.md). 
@@ -9,25 +9,25 @@ vLLM supports loading weights in Safetensors format using the Run:ai Model Strea You first need to install vLLM RunAI optional dependency: ```console -$ pip3 install vllm[runai] +pip3 install vllm[runai] ``` To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag: ```console -$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer ``` To run model from AWS S3 object store run: ```console -$ vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer ``` To run model from a S3 compatible object store run: ```console -$ RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer ``` ## Tunable parameters @@ -38,14 +38,14 @@ You can tune `concurrency` that controls the level of concurrency and number of For reading from S3, it will be the number of client instances the host is opening to the S3 server. ```console -$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' ``` You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit). 
```console -$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' ``` ```{note} diff --git a/docs/source/serving/tensorizer.md b/docs/source/models/extensions/tensorizer.md similarity index 89% rename from docs/source/serving/tensorizer.md rename to docs/source/models/extensions/tensorizer.md index d3dd29d48f..ae17e3437b 100644 --- a/docs/source/serving/tensorizer.md +++ b/docs/source/models/extensions/tensorizer.md @@ -1,6 +1,6 @@ (tensorizer)= -# Loading Models with CoreWeave's Tensorizer +# Loading models with CoreWeave's Tensorizer vLLM supports loading models with [CoreWeave's Tensorizer](https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer). vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized @@ -9,7 +9,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor For more information on CoreWeave's Tensorizer, please refer to [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/tensorize_vllm_model.html). +the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html). ```{note} Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. 
diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index 35e0302b86..e4b4cd03a9 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -8,14 +8,14 @@ In vLLM, generative models implement the {class}`~vllm.model_executor.models.Vll Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, which are then passed through {class}`~vllm.model_executor.layers.Sampler` to obtain the final text. +For generative models, the only supported `--task` option is `"generate"`. +Usually, this is automatically inferred so you don't have to specify it. + ## Offline Inference The {class}`~vllm.LLM` class provides various methods for offline inference. See [Engine Arguments](#engine-args) for a list of options when initializing the model. -For generative models, the only supported {code}`task` option is {code}`"generate"`. -Usually, this is automatically inferred so you don't have to specify it. - ### `LLM.generate` The {class}`~vllm.LLM.generate` method is available to all generative models in vLLM. @@ -33,7 +33,7 @@ for output in outputs: ``` You can optionally control the language generation by passing {class}`~vllm.SamplingParams`. 
-For example, you can use greedy sampling by setting {code}`temperature=0`: +For example, you can use greedy sampling by setting `temperature=0`: ```python llm = LLM(model="facebook/opt-125m") @@ -46,7 +46,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: ### `LLM.beam_search` @@ -103,7 +103,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: If the model doesn't have a chat template or you want to specify another one, you can explicitly pass a chat template: @@ -118,9 +118,9 @@ print("Loaded chat template:", custom_template) outputs = llm.chat(conversation, chat_template=custom_template) ``` -## Online Inference +## Online Serving -Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs: +Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs: - [Completions API](#completions-api) is similar to `LLM.generate` but only accepts text. - [Chat API](#chat-api) is similar to `LLM.chat`, accepting both text and [multi-modal inputs](#multimodal-inputs) for models with a chat template. diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 76c96c9edc..91db694be2 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -14,31 +14,54 @@ As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM feature pooling models as they only work on the generation or decode stage, so performance may not improve as much. ``` +For pooling models, we support the following `--task` options. 
+The selected option sets the default pooler used to extract the final hidden states: + +```{list-table} +:widths: 50 25 25 25 +:header-rows: 1 + +* - Task + - Pooling Type + - Normalization + - Softmax +* - Embedding (`embed`) + - `LAST` + - ✅︎ + - ✗ +* - Classification (`classify`) + - `LAST` + - ✗ + - ✅︎ +* - Sentence Pair Scoring (`score`) + - \* + - \* + - \* +* - Reward Modeling (`reward`) + - `ALL` + - ✗ + - ✗ +``` + +\*The default pooler is always defined by the model. + +```{note} +If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table. +``` + +When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, +we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`). + +```{tip} +You can customize the model's pooling method via the `--override-pooler-config` option, +which takes priority over both the model's and Sentence Transformers's defaults. +``` + ## Offline Inference The {class}`~vllm.LLM` class provides various methods for offline inference. See [Engine Arguments](#engine-args) for a list of options when initializing the model. -For pooling models, we support the following {code}`task` options: - -- Embedding ({code}`"embed"` / {code}`"embedding"`) -- Classification ({code}`"classify"`) -- Sentence Pair Scoring ({code}`"score"`) -- Reward Modeling ({code}`"reward"`) - -The selected task determines the default {class}`~vllm.model_executor.layers.Pooler` that is used: - -- Embedding: Extract only the hidden states corresponding to the last token, and apply normalization. -- Classification: Extract only the hidden states corresponding to the last token, and apply softmax. -- Sentence Pair Scoring: Extract only the hidden states corresponding to the last token, and apply softmax. -- Reward Modeling: Extract all of the hidden states and return them directly. 
- -When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, -we attempt to override the default pooler based on its Sentence Transformers configuration file ({code}`modules.json`). - -You can customize the model's pooling method via the {code}`override_pooler_config` option, -which takes priority over both the model's and Sentence Transformers's defaults. - ### `LLM.encode` The {class}`~vllm.LLM.encode` method is available to all pooling models in vLLM. @@ -65,7 +88,7 @@ embeds = output.outputs.embedding print(f"Embeddings: {embeds!r} (size={len(embeds)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.classify` @@ -80,7 +103,7 @@ probs = output.outputs.probs print(f"Class Probabilities: {probs!r} (size={len(probs)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.score` @@ -102,11 +125,11 @@ score = output.outputs.score print(f"Score: {score}") ``` -A code example can be found here: +A code example can be found here: -## Online Inference +## Online Serving -Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs: +Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs: - [Pooling API](#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models. - [Embeddings API](#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](#multimodal-inputs) for embedding models. diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 7682ed104b..8cdc663a03 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -1,9 +1,9 @@ (supported-models)= -# Supported Models +# List of Supported Models vLLM supports generative and pooling models across various tasks. 
-If a model supports more than one task, you can set the task via the {code}`--task` argument. +If a model supports more than one task, you can set the task via the `--task` argument. For each task, we list the model architectures that have been implemented in vLLM. Alongside each architecture, we include some popular models that use it. @@ -14,8 +14,8 @@ Alongside each architecture, we include some popular models that use it. By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co/models). -To determine whether a given model is supported, you can check the {code}`config.json` file inside the HF repository. -If the {code}`"architectures"` field contains a model architecture listed below, then it should be supported in theory. +To determine whether a given model is supported, you can check the `config.json` file inside the HF repository. +If the `"architectures"` field contains a model architecture listed below, then it should be supported in theory. ````{tip} The easiest way to check if your model is really supported at runtime is to run the program below: @@ -37,7 +37,7 @@ print(output) If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. ```` -Otherwise, please refer to [Adding a New Model](#adding-a-new-model) and [Enabling Multimodal Inputs](#enabling-multimodal-inputs) for instructions on how to implement your model in vLLM. +Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM. Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. 
### ModelScope @@ -45,10 +45,10 @@ Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable: ```shell -$ export VLLM_USE_MODELSCOPE=True +export VLLM_USE_MODELSCOPE=True ``` -And use with {code}`trust_remote_code=True`. +And use with `trust_remote_code=True`. ```python from vllm import LLM @@ -216,6 +216,11 @@ See [this page](#generative-models) for more information on how to use generativ - `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. - ✅︎ - ✅︎ +* - `InternLM3ForCausalLM` + - InternLM3 + - `internlm/internlm3-8b-instruct`, etc. + - ✅︎ + - ✅︎ * - `JAISLMHeadModel` - Jais - `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. @@ -297,8 +302,8 @@ See [this page](#generative-models) for more information on how to use generativ - ✅︎ - ✅︎ * - `Phi3ForCausalLM` - - Phi-3 - - `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. + - Phi-4, Phi-3 + - `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. - ✅︎ - ✅︎ * - `Phi3SmallForCausalLM` @@ -322,7 +327,7 @@ See [this page](#generative-models) for more information on how to use generativ - ✅︎ - ✅︎ * - `Qwen2ForCausalLM` - - Qwen2 + - QwQ, Qwen2 - `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. - ✅︎ - ✅︎ @@ -420,20 +425,23 @@ you should explicitly specify the task type to ensure that the model is used in ``` ```{note} -{code}`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. -You should manually set mean pooling by passing {code}`--override-pooler-config '{"pooling_type": "MEAN"}'`. +`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. 
+You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`. ``` ```{note} -Unlike base Qwen2, {code}`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. -You can set {code}`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. +Unlike base Qwen2, `Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. +You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. -On the other hand, its 1.5B variant ({code}`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention +On the other hand, its 1.5B variant (`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention despite being described otherwise on its model card. + +Regardless of the variant, you need to enable `--trust-remote-code` for the correct tokenizer to be +loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882). ``` If your model is not in the above list, we will try to automatically convert the model using -{func}`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings +{func}`~vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings of the whole prompt are extracted from the normalized hidden state corresponding to the last token. #### Reward Modeling (`--task reward`) @@ -462,14 +470,19 @@ of the whole prompt are extracted from the normalized hidden state corresponding - `Qwen/Qwen2.5-Math-RM-72B`, etc. - ✅︎ - ✅︎ +* - `Qwen2ForProcessRewardModel` + - Qwen2-based + - `Qwen/Qwen2.5-Math-PRM-7B`, `Qwen/Qwen2.5-Math-PRM-72B`, etc. + - ✅︎ + - ✅︎ ``` If your model is not in the above list, we will try to automatically convert the model using -{func}`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. +{func}`~vllm.model_executor.models.adapters.as_reward_model`. 
By default, we return the hidden states of each token directly. ```{important} -For process-supervised reward models such as {code}`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, -e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. +For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, +e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. ``` #### Classification (`--task classify`) @@ -496,7 +509,7 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1 ``` If your model is not in the above list, we will try to automatically convert the model using -{func}`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. +{func}`~vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. #### Sentence Pair Scoring (`--task score`) @@ -537,16 +550,38 @@ The following modalities are supported depending on the model: - **V**ideo - **A**udio -Any combination of modalities joined by {code}`+` are supported. +Any combination of modalities joined by `+` is supported. -- e.g.: {code}`T + I` means that the model supports text-only, image-only, and text-with-image inputs. +- e.g.: `T + I` means that the model supports text-only, image-only, and text-with-image inputs. -On the other hand, modalities separated by {code}`/` are mutually exclusive. +On the other hand, modalities separated by `/` are mutually exclusive. -- e.g.: {code}`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. 
+- e.g.: `T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model. +````{important} +To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference) +or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt: + +Offline inference: +```python +llm = LLM( + model="Qwen/Qwen2-VL-7B-Instruct", + limit_mm_per_prompt={"image": 4}, +) +``` + +Online serving: +```bash +vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 +``` +```` + +```{note} +vLLM currently only supports adding LoRA to the language backbone of multimodal models. +``` + ### Generative Models See [this page](#generative-models) for more information on how to use generative models. @@ -585,6 +620,13 @@ See [this page](#generative-models) for more information on how to use generativ - - ✅︎ - ✅︎ +* - `DeepseekVLV2ForCausalLM` + - DeepSeek-VL2 + - T + I+ + - `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. (see note) + - + - ✅︎ + - ✅︎ * - `FuyuForCausalLM` - Fuyu - T + I @@ -640,14 +682,14 @@ See [this page](#generative-models) for more information on how to use generativ - `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. - - ✅︎ - - + - ✅︎ * - `LlavaOnevisionForConditionalGeneration` - LLaVA-Onevision - T + I+ + V+ - `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. - - ✅︎ - - + - ✅︎ * - `MiniCPMV` - MiniCPM-V - T + IE+ @@ -686,14 +728,14 @@ See [this page](#generative-models) for more information on how to use generativ * - `Phi3VForCausalLM` - Phi-3-Vision, Phi-3.5-Vision - T + IE+ - - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct` etc. + - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. 
- - ✅︎ - ✅︎ * - `PixtralForConditionalGeneration` - Pixtral - T + I+ - - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` etc. + - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` (see note), etc. - - ✅︎ - ✅︎ @@ -710,55 +752,44 @@ See [this page](#generative-models) for more information on how to use generativ - `Qwen/Qwen2-Audio-7B-Instruct` - - ✅︎ - - + - ✅︎ * - `Qwen2VLForConditionalGeneration` - - Qwen2-VL + - QVQ, Qwen2-VL - T + IE+ + VE+ - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. - ✅︎ - ✅︎ - - + - ✅︎ * - `UltravoxModel` - Ultravox - T + AE+ - `fixie-ai/ultravox-v0_3` - - ✅︎ - - + - ✅︎ ``` E Pre-computed embeddings can be inputted for this modality. + Multiple items can be inputted per text prompt for this modality. -````{important} -To enable multiple multi-modal items per text prompt, you have to set {code}`limit_mm_per_prompt` (offline inference) -or {code}`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: - -```python -llm = LLM( - model="Qwen/Qwen2-VL-7B-Instruct", - limit_mm_per_prompt={"image": 4}, -) -``` - -```bash -vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 -``` -```` - ```{note} -vLLM currently only supports adding LoRA to the language backbone of multimodal models. +To use `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. ``` ```{note} -To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. +To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. ``` ```{note} -The official {code}`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork ({code}`HwwwH/MiniCPM-V-2`) for now. 
+The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now. For more details, please see: ``` +```{note} +The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingface.co/mistral-community/pixtral-12b/discussions/22)). +A corrected version is available at . +``` + ### Pooling Models See [this page](pooling-models) for more information on how to use pooling models. @@ -770,7 +801,7 @@ you should explicitly specify the task type to ensure that the model is used in #### Text Embedding (`--task embed`) -Any text generation model can be converted into an embedding model by passing {code}`--task embed`. +Any text generation model can be converted into an embedding model by passing `--task embed`. ```{note} To get the best results, you should use pooling models that are specifically trained as such. @@ -810,19 +841,22 @@ The following table lists those that are tested in vLLM. _________________ -# Model Support Policy +## Model Support Policy At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support: 1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated! + 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. 
Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. -```{tip} -When comparing the output of {code}`model.generate` from HuggingFace Transformers with the output of {code}`llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. -``` + ```{tip} + When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. + ``` 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. + 4. 
**Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use. + 5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement. Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem. diff --git a/docs/source/usage/performance.md b/docs/source/performance/optimization.md similarity index 98% rename from docs/source/usage/performance.md rename to docs/source/performance/optimization.md index 2cd3801bfc..4fbc376e1a 100644 --- a/docs/source/usage/performance.md +++ b/docs/source/performance/optimization.md @@ -1,6 +1,6 @@ -(performance)= +(optimization-and-tuning)= -# Performance and Tuning +# Optimization and Tuning ## Preemption @@ -8,7 +8,7 @@ Due to the auto-regressive nature of transformer architecture, there are times w The vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes available again. When this occurs, the following warning is printed: -``` +```text WARNING 05-09 00:49:33 scheduler.py:1057 Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. 
total_cumulative_preemption_cnt=1 ``` diff --git a/docs/source/quantization/fp8_e4m3_kvcache.md b/docs/source/quantization/fp8_e4m3_kvcache.md deleted file mode 100644 index f200c722d1..0000000000 --- a/docs/source/quantization/fp8_e4m3_kvcache.md +++ /dev/null @@ -1,44 +0,0 @@ -(fp8-e4m3-kvcache)= - -# FP8 E4M3 KV Cache - -Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, -improving throughput. OCP (Open Compute Project www.opencompute.org) specifies two common 8-bit floating point data formats: E5M2 -(5 exponent bits and 2 mantissa bits) and E4M3FN (4 exponent bits and 3 mantissa bits), often shortened as E4M3. One benefit of -the E4M3 format over E5M2 is that floating point numbers are represented in higher precision. However, the small dynamic range of -FP8 E4M3 (±240.0 can be represented) typically necessitates the use of a higher-precision (typically FP32) scaling factor alongside -each quantized tensor. For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling -factors of a finer granularity (e.g. per-channel). - -These scaling factors can be specified by passing an optional quantization param JSON to the LLM engine at load time. If -this JSON is not specified, scaling factors default to 1.0. These scaling factors are typically obtained when running an -unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). - -To install AMMO (AlgorithMic Model Optimization): - -```console -$ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo -``` - -Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon -offerings e.g. AMD MI300, NVIDIA Hopper or later support native hardware conversion to and from fp32, fp16, bf16, etc. -Thus, LLM inference is greatly accelerated with minimal accuracy loss. 
- -Here is an example of how to enable this feature: - -```python -# two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to -# https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own. - -from vllm import LLM, SamplingParams -sampling_params = SamplingParams(temperature=1.3, top_p=0.8) -llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", - kv_cache_dtype="fp8", - quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json") -prompt = "London is the capital of" -out = llm.generate(prompt, sampling_params)[0].outputs[0].text -print(out) - -# output w/ scaling factors: England, the United Kingdom, and one of the world's leading financial, -# output w/o scaling factors: England, located in the southeastern part of the country. It is known -``` diff --git a/docs/source/quantization/fp8_e5m2_kvcache.md b/docs/source/quantization/fp8_e5m2_kvcache.md deleted file mode 100644 index 3a81ab17f3..0000000000 --- a/docs/source/quantization/fp8_e5m2_kvcache.md +++ /dev/null @@ -1,31 +0,0 @@ -(fp8-kv-cache)= - -# FP8 E5M2 KV Cache - -The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. -The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other. - -Here is an example of how to enable this feature: - -```python -from vllm import LLM, SamplingParams -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -# Create an LLM. -llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. 
-outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -``` diff --git a/docs/source/serving/deploying_with_k8s.md b/docs/source/serving/deploying_with_k8s.md deleted file mode 100644 index 77f848088e..0000000000 --- a/docs/source/serving/deploying_with_k8s.md +++ /dev/null @@ -1,248 +0,0 @@ -(deploying-with-k8s)= - -# Deploying with Kubernetes - -Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing. - -## Prerequisites - -Before you begin, ensure that you have the following: - -- A running Kubernetes cluster -- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at `https://github.com/NVIDIA/k8s-device-plugin/` -- Available GPU resources in your cluster - -## Deployment Steps - -1. **Create a PVC , Secret and Deployment for vLLM** - -PVC is used to store the model cache and it is optional, you can use hostPath or other storage options - -```yaml -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: mistral-7b - namespace: default -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi - storageClassName: default - volumeMode: Filesystem -``` - -Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models - -```yaml -apiVersion: v1 -kind: Secret -metadata: - name: hf-token-secret - namespace: default -type: Opaque -data: - token: "REPLACE_WITH_TOKEN" -``` - -Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. - -Here are two examples for using NVIDIA GPU and AMD GPU. 
- -- NVIDIA GPU - -```yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mistral-7b - namespace: default - labels: - app: mistral-7b -spec: - replicas: 1 - selector: - matchLabels: - app: mistral-7b - template: - metadata: - labels: - app: mistral-7b - spec: - volumes: - - name: cache-volume - persistentVolumeClaim: - claimName: mistral-7b - # vLLM needs to access the host's shared memory for tensor parallel inference. - - name: shm - emptyDir: - medium: Memory - sizeLimit: "2Gi" - containers: - - name: mistral-7b - image: vllm/vllm-openai:latest - command: ["/bin/sh", "-c"] - args: [ - "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" - ] - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - ports: - - containerPort: 8000 - resources: - limits: - cpu: "10" - memory: 20G - nvidia.com/gpu: "1" - requests: - cpu: "2" - memory: 6G - nvidia.com/gpu: "1" - volumeMounts: - - mountPath: /root/.cache/huggingface - name: cache-volume - - name: shm - mountPath: /dev/shm - livenessProbe: - httpGet: - path: /health - port: 8000 - initialDelaySeconds: 60 - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 8000 - initialDelaySeconds: 60 - periodSeconds: 5 -``` - -- AMD GPU - -You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X. - -```yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mistral-7b - namespace: default - labels: - app: mistral-7b -spec: - replicas: 1 - selector: - matchLabels: - app: mistral-7b - template: - metadata: - labels: - app: mistral-7b - spec: - volumes: - # PVC - - name: cache-volume - persistentVolumeClaim: - claimName: mistral-7b - # vLLM needs to access the host's shared memory for tensor parallel inference. 
- - name: shm - emptyDir: - medium: Memory - sizeLimit: "8Gi" - hostNetwork: true - hostIPC: true - containers: - - name: mistral-7b - image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 - securityContext: - seccompProfile: - type: Unconfined - runAsGroup: 44 - capabilities: - add: - - SYS_PTRACE - command: ["/bin/sh", "-c"] - args: [ - "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" - ] - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - ports: - - containerPort: 8000 - resources: - limits: - cpu: "10" - memory: 20G - amd.com/gpu: "1" - requests: - cpu: "6" - memory: 6G - amd.com/gpu: "1" - volumeMounts: - - name: cache-volume - mountPath: /root/.cache/huggingface - - name: shm - mountPath: /dev/shm -``` -You can get the full example with steps and sample yaml files from . - -2. **Create a Kubernetes Service for vLLM** - -Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: - -```yaml -apiVersion: v1 -kind: Service -metadata: - name: mistral-7b - namespace: default -spec: - ports: - - name: http-mistral-7b - port: 80 - protocol: TCP - targetPort: 8000 - # The label selector should match the deployment labels & it is useful for prefix caching feature - selector: - app: mistral-7b - sessionAffinity: None - type: ClusterIP -``` - -3. 
**Deploy and Test** - -Apply the deployment and service configurations using `kubectl apply -f `: - -```console -kubectl apply -f deployment.yaml -kubectl apply -f service.yaml -``` - -To test the deployment, run the following `curl` command: - -```console -curl http://mistral-7b.default.svc.cluster.local/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "mistralai/Mistral-7B-Instruct-v0.3", - "prompt": "San Francisco is a", - "max_tokens": 7, - "temperature": 0 - }' -``` - -If the service is correctly deployed, you should receive a response from the vLLM model. - -## Conclusion - -Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation. diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index a1dd0e89e8..daf6e2f250 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -18,13 +18,13 @@ After adding enough GPUs and nodes to hold the model, you can run vLLM first, wh There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. ``` -## Details for Distributed Inference and Serving +## Running vLLM on a single node vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). 
We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray. -Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed_executor_backend` argument or {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. +Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured `tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the `LLM` class `distributed_executor_backend` argument or `--distributed-executor-backend` API server argument. Set it to `mp` for multiprocessing or `ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. -To run multi-GPU inference with the {code}`LLM` class, set the {code}`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: +To run multi-GPU inference with the `LLM` class, set the `tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: ```python from vllm import LLM @@ -32,45 +32,45 @@ llm = LLM("facebook/opt-13b", tensor_parallel_size=4) output = llm.generate("San Franciso is a") ``` -To run multi-GPU serving, pass in the {code}`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: +To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. 
For example, to run API server on 4 GPUs: ```console -$ vllm serve facebook/opt-13b \ -$ --tensor-parallel-size 4 + vllm serve facebook/opt-13b \ + --tensor-parallel-size 4 ``` -You can also additionally specify {code}`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: +You can additionally specify `--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: ```console -$ vllm serve gpt2 \ -$ --tensor-parallel-size 4 \ -$ --pipeline-parallel-size 2 + vllm serve gpt2 \ + --tensor-parallel-size 4 \ + --pipeline-parallel-size 2 ``` -## Multi-Node Inference and Serving +## Running vLLM on multiple nodes If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration. -The first step, is to start containers and organize them into a cluster. We have provided the helper script to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command. +The first step is to start containers and organize them into a cluster. We have provided the helper script to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools.
For that purpose, the script can add the `CAP_SYS_ADMIN` capability to the docker container by using the `--cap-add` option in the docker run command. Pick a node as the head node, and run the following command: ```console -$ bash run_cluster.sh \ -$ vllm/vllm-openai \ -$ ip_of_head_node \ -$ --head \ -$ /path/to/the/huggingface/home/in/this/node +bash run_cluster.sh \ + vllm/vllm-openai \ + ip_of_head_node \ + --head \ + /path/to/the/huggingface/home/in/this/node ``` On the rest of the worker nodes, run the following command: ```console -$ bash run_cluster.sh \ -$ vllm/vllm-openai \ -$ ip_of_head_node \ -$ --worker \ -$ /path/to/the/huggingface/home/in/this/node +bash run_cluster.sh \ + vllm/vllm-openai \ + ip_of_head_node \ + --worker \ + /path/to/the/huggingface/home/in/this/node ``` Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct. @@ -80,22 +80,22 @@ Then, on any node, use `docker exec -it node /bin/bash` to enter the container, After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes.
For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: ```console -$ vllm serve /path/to/the/model/in/the/container \ -$ --tensor-parallel-size 8 \ -$ --pipeline-parallel-size 2 + vllm serve /path/to/the/model/in/the/container \ + --tensor-parallel-size 8 \ + --pipeline-parallel-size 2 ``` You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16: ```console -$ vllm serve /path/to/the/model/in/the/container \ -$ --tensor-parallel-size 16 +vllm serve /path/to/the/model/in/the/container \ + --tensor-parallel-size 16 ``` To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. ```{warning} -After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](../getting_started/debugging.md) for more information. 
If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See for more information. +After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](#troubleshooting-incorrect-hardware-driver) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See for more information. 
``` ```{warning} diff --git a/docs/source/usage/engine_args.md b/docs/source/serving/engine_args.md similarity index 100% rename from docs/source/usage/engine_args.md rename to docs/source/serving/engine_args.md diff --git a/docs/source/usage/env_vars.md b/docs/source/serving/env_vars.md similarity index 100% rename from docs/source/usage/env_vars.md rename to docs/source/serving/env_vars.md diff --git a/docs/source/serving/integrations.md b/docs/source/serving/integrations.md deleted file mode 100644 index d214c77254..0000000000 --- a/docs/source/serving/integrations.md +++ /dev/null @@ -1,17 +0,0 @@ -# Integrations - -```{toctree} -:maxdepth: 1 - -run_on_sky -deploying_with_kserve -deploying_with_kubeai -deploying_with_triton -deploying_with_bentoml -deploying_with_cerebrium -deploying_with_lws -deploying_with_dstack -serving_with_langchain -serving_with_llamaindex -serving_with_llamastack -``` diff --git a/docs/source/serving/integrations/index.md b/docs/source/serving/integrations/index.md new file mode 100644 index 0000000000..371c284981 --- /dev/null +++ b/docs/source/serving/integrations/index.md @@ -0,0 +1,8 @@ +# External Integrations + +```{toctree} +:maxdepth: 1 + +langchain +llamaindex +``` diff --git a/docs/source/serving/serving_with_langchain.md b/docs/source/serving/integrations/langchain.md similarity index 76% rename from docs/source/serving/serving_with_langchain.md rename to docs/source/serving/integrations/langchain.md index 96bd5943f3..03142d23b1 100644 --- a/docs/source/serving/serving_with_langchain.md +++ b/docs/source/serving/integrations/langchain.md @@ -1,13 +1,13 @@ -(run-on-langchain)= +(serving-langchain)= -# Serving with Langchain +# LangChain -vLLM is also available via [Langchain](https://github.com/langchain-ai/langchain) . +vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain) . 
-To install langchain, run +To install LangChain, run ```console -$ pip install langchain langchain_community -q +pip install langchain langchain_community -q ``` To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`. diff --git a/docs/source/serving/serving_with_llamaindex.md b/docs/source/serving/integrations/llamaindex.md similarity index 68% rename from docs/source/serving/serving_with_llamaindex.md rename to docs/source/serving/integrations/llamaindex.md index 98859d8e3f..8c72605202 100644 --- a/docs/source/serving/serving_with_llamaindex.md +++ b/docs/source/serving/integrations/llamaindex.md @@ -1,13 +1,13 @@ -(run-on-llamaindex)= +(serving-llamaindex)= -# Serving with llama_index +# LlamaIndex -vLLM is also available via [llama_index](https://github.com/run-llama/llama_index) . +vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index) . -To install llamaindex, run +To install LlamaIndex, run ```console -$ pip install llama-index-llms-vllm -q +pip install llama-index-llms-vllm -q ``` To run inference on a single or multiple GPUs, use `Vllm` class from `llamaindex`. diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md index 2dc78643f6..6c84f6d135 100644 --- a/docs/source/serving/metrics.md +++ b/docs/source/serving/metrics.md @@ -4,10 +4,10 @@ vLLM exposes a number of metrics that can be used to monitor the health of the system. These metrics are exposed via the `/metrics` endpoint on the vLLM OpenAI compatible API server. 
-You can start the server using Python, or using [Docker](deploying_with_docker.md): +You can start the server using Python, or using [Docker](#deployment-docker): ```console -$ vllm serve unsloth/Llama-3.2-1B-Instruct +vllm serve unsloth/Llama-3.2-1B-Instruct ``` Then query the endpoint to get the latest metrics from the server: diff --git a/docs/source/usage/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md similarity index 91% rename from docs/source/usage/multimodal_inputs.md rename to docs/source/serving/multimodal_inputs.md index 4f45a9f448..0213b0a338 100644 --- a/docs/source/usage/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -14,11 +14,11 @@ and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/ch To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`: - `prompt`: The prompt should follow the format that is documented on HuggingFace. -- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.MultiModalDataDict`. +- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.inputs.MultiModalDataDict`. ### Image -You can pass a single image to the {code}`'image'` field of the multi-modal dictionary, as shown in the following examples: +You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples: ```python llm = LLM(model="llava-hf/llava-1.5-7b-hf") @@ -60,7 +60,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: To substitute multiple images inside the same text prompt, you can pass in a list of images instead: @@ -91,7 +91,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: Multi-image input can be extended to perform video captioning. 
We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: @@ -122,21 +122,21 @@ for o in outputs: ### Video -You can pass a list of NumPy arrays directly to the {code}`'video'` field of the multi-modal dictionary +You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary instead of using multi-image input. -Full example: +Full example: ### Audio -You can pass a tuple {code}`(array, sampling_rate)` to the {code}`'audio'` field of the multi-modal dictionary. +You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary. -Full example: +Full example: ### Embedding To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, -pass a tensor of shape {code}`(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. +pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. ```python # Inference with image embeddings as input @@ -199,7 +199,7 @@ for o in outputs: print(generated_text) ``` -## Online Inference +## Online Serving Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). @@ -271,7 +271,7 @@ chat_response = client.chat.completions.create( print("Chat completion output:", chat_response.choices[0].message.content) ``` -Full example: +Full example: ```{tip} Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, @@ -294,7 +294,7 @@ $ export VLLM_IMAGE_FETCH_TIMEOUT= ### Video -Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf). 
+Instead of `image_url`, you can pass a video file via `video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf). First, launch the OpenAI-compatible server: @@ -303,6 +303,7 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model ``` Then, you can use the OpenAI client as follows: + ```python from openai import OpenAI @@ -342,7 +343,7 @@ result = chat_completion_from_url.choices[0].message.content print("Chat completion output from image url:", result) ``` -Full example: +Full example: ````{note} By default, the timeout for fetching videos through HTTP URL is `30` seconds. @@ -418,7 +419,7 @@ result = chat_completion_from_base64.choices[0].message.content print("Chat completion output from input audio:", result) ``` -Alternatively, you can pass {code}`audio_url`, which is the audio counterpart of {code}`image_url` for image input: +Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input: ```python chat_completion_from_url = client.chat.completions.create( @@ -445,7 +446,7 @@ result = chat_completion_from_url.choices[0].message.content print("Chat completion output from audio url:", result) ``` -Full example: +Full example: ````{note} By default, the timeout for fetching audios through HTTP URL is `10` seconds. @@ -529,4 +530,4 @@ Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of th example below for details. ``` -Full example: +Full example: diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md new file mode 100644 index 0000000000..8a18598665 --- /dev/null +++ b/docs/source/serving/offline_inference.md @@ -0,0 +1,103 @@ +(offline-inference)= + +# Offline Inference + +You can run vLLM in your own code on a list of prompts. + +The offline API is based on the {class}`~vllm.LLM` class. 
+To initialize the vLLM engine, create a new instance of `LLM` and specify the model to run. + +For example, the following code downloads the [`facebook/opt-125m`](https://huggingface.co/facebook/opt-125m) model from HuggingFace +and runs it in vLLM using the default configuration. + +```python +llm = LLM(model="facebook/opt-125m") +``` + +After initializing the `LLM` instance, you can perform model inference using various APIs. +The available APIs depend on the type of model that is being run: + +- [Generative models](#generative-models) output logprobs, from which the final output text is sampled. +- [Pooling models](#pooling-models) output their hidden states directly. + +Please refer to the pages linked above for more details about each API. + +```{seealso} +[API Reference](/api/offline_inference/index) +``` + +## Configuration Options + +This section lists the most common options for running the vLLM engine. +For a full list, refer to the [Engine Arguments](#engine-args) page. + +(model-resolution)= + +### Model resolution + +vLLM loads HuggingFace-compatible models by inspecting the `architectures` field in `config.json` of the model repository +and finding the corresponding implementation that is registered to vLLM. +Nevertheless, our model resolution may fail for the following reasons: + +- The `config.json` of the model repository lacks the `architectures` field. +- Unofficial repositories refer to a model using alternative names which are not recorded in vLLM. +- The same architecture name is used for multiple models, creating ambiguity as to which model should be loaded. + +To fix this, explicitly specify the model architecture by passing `config.json` overrides to the `hf_overrides` option. +For example: + +```python +model = LLM( + model="cerebras/Cerebras-GPT-1.3B", + hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2 +) +``` + +Our [list of supported models](#supported-models) shows the model architectures that are recognized by vLLM. 
+ +### Reducing memory usage + +Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem. + +#### Tensor Parallelism (TP) + +Tensor parallelism (`tensor_parallel_size` option) can be used to split the model across multiple GPUs. + +The following code splits the model across 2 GPUs. + +```python +llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", + tensor_parallel_size=2) +``` + +```{important} +To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. {func}`torch.cuda.set_device`) +before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`. + +To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable. +``` + +#### Quantization + +Quantized models take less memory at the cost of lower precision. + +Statically quantized models can be downloaded from HF Hub (some popular ones are available at [Neural Magic](https://huggingface.co/neuralmagic)) +and used directly without extra configuration. + +Dynamic quantization is also supported via the `quantization` option -- see [here](#quantization-index) for more details. + +#### Context length and batch size + +You can further reduce memory usage by limiting the context length of the model (`max_model_len` option) +and the maximum batch size (`max_num_seqs` option). + +```python +llm = LLM(model="adept/fuyu-8b", + max_model_len=2048, + max_num_seqs=2) +``` + +### Performance optimization and tuning + +You can potentially improve the performance of vLLM by tuning various options. +Please refer to [this guide](#optimization-and-tuning) for more details. 
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index caf5e8cafd..e49bbb0669 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -1,13 +1,17 @@ -# OpenAI Compatible Server +(openai-compatible-server)= -vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API, and more! +# OpenAI-Compatible Server + +vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more! + +You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](#deployment-docker): -You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](deploying_with_docker.md): ```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 ``` To call the server, you can use the [official OpenAI Python client](https://github.com/openai/openai-python), or any other HTTP client. + ```python from openai import OpenAI client = OpenAI( @@ -48,6 +52,7 @@ In addition, we have the following custom APIs: - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). (chat-template)= + ## Chat Template In order for the language model to support chat protocol, vLLM requires the model to include @@ -69,6 +74,7 @@ vLLM community provides a set of chat templates for popular models. You can find With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies both a `type` and a `text` field. 
An example is provided below: + ```python completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", @@ -78,7 +84,7 @@ completion = client.chat.completions.create( ) ``` -Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like +Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like `meta-llama/Llama-Guard-3-1B` that expect the content to be formatted according to the OpenAI schema in the request. vLLM provides best-effort support to detect this automatically, which is logged as a string like *"Detected the chat template content format to be..."*, and internally converts incoming requests to match @@ -113,12 +119,12 @@ completion = client.chat.completions.create( ## Extra HTTP Headers Only `X-Request-Id` HTTP request header is supported for now. It can be enabled -with `--enable-request-id-headers`. +with `--enable-request-id-headers`. > Note that enablement of the headers can impact performance significantly at high QPS > rates. We recommend implementing HTTP headers at the router level (e.g. via Istio), > rather than within the vLLM layer for this reason. -> See https://github.com/vllm-project/vllm/pull/11529 for more details. +> See [this PR](https://github.com/vllm-project/vllm/pull/11529) for more details. ```python completion = client.chat.completions.create( @@ -145,6 +151,7 @@ print(completion._request_id) ## CLI Reference (vllm-serve)= + ### `vllm serve` The `vllm serve` command is used to launch the OpenAI-compatible server. @@ -173,7 +180,7 @@ uvicorn-log-level: "info" To use the above config file: ```bash -$ vllm serve SOME_MODEL --config config.yaml +vllm serve SOME_MODEL --config config.yaml ``` ```{note} @@ -184,16 +191,17 @@ The order of priorities is `command line > config file values > defaults`. 
## API Reference (completions-api)= + ### Completions API Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions); you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. -Code example: +Code example: #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. +The following [sampling parameters](#sampling-params) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -210,6 +218,7 @@ The following extra parameters are supported: ``` (chat-api)= + ### Chat API Our Chat API is compatible with [OpenAI's Chat Completions API](https://platform.openai.com/docs/api-reference/chat); @@ -217,14 +226,14 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai We support both [Vision](https://platform.openai.com/docs/guides/vision)- and [Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters; -see our [Multimodal Inputs](../usage/multimodal_inputs.md) guide for more information. +see our [Multimodal Inputs](#multimodal-inputs) guide for more information. - *Note: `image_url.detail` parameter is not supported.* -Code example: +Code example: #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. +The following [sampling parameters](#sampling-params) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -241,6 +250,7 @@ The following extra parameters are supported: ``` (embeddings-api)= + ### Embeddings API Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings); @@ -253,11 +263,11 @@ which will be treated as a single prompt to the model. 
This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details. ``` -Code example: +Code example: #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. +The following [pooling parameters](#pooling-params) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -282,6 +292,7 @@ For chat-like input (i.e. if `messages` is passed), these extra parameters are s ``` (tokenizer-api)= + ### Tokenizer API Our Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer). @@ -291,15 +302,17 @@ It consists of two endpoints: - `/detokenize` corresponds to calling `tokenizer.decode()`. (pooling-api)= + ### Pooling API Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states. The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats. -Code example: +Code example: (score-api)= + ### Score API Our Score API applies a cross-encoder model to predict scores for sentence pairs. @@ -307,7 +320,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). -Code example: +Code example: #### Single inference @@ -445,7 +458,7 @@ Response: #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. +The following [pooling parameters](#pooling-params) are supported. 
```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python diff --git a/docs/source/usage/usage_stats.md b/docs/source/serving/usage_stats.md similarity index 98% rename from docs/source/usage/usage_stats.md rename to docs/source/serving/usage_stats.md index 3d02fbab92..cfc3cb2576 100644 --- a/docs/source/usage/usage_stats.md +++ b/docs/source/serving/usage_stats.md @@ -45,7 +45,7 @@ You can preview the collected data by running the following command: tail ~/.config/vllm/usage_stats.json ``` -## Opt-out of Usage Stats Collection +## Opting out You can opt-out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file: diff --git a/examples/fp8/README.md b/examples/fp8/README.md deleted file mode 100644 index 181c36558f..0000000000 --- a/examples/fp8/README.md +++ /dev/null @@ -1,96 +0,0 @@ -# FP8 KV Cache - -This utility extracts the KV cache scaling factors from a quantized HF (Hugging Face) model. The extracted scaling factors are saved to a JSON file, which can later be used by vLLM (variable-length language model) during runtime. This tool is particularly useful when the KV cache data type is FP8 and is intended for use on ROCm (AMD GPU) platforms. - -## Prerequisites - -- Python 3.x -- PyTorch -- NumPy -- Hugging Face Transformers -- Hugging Face Hub -- AMMO - -Before incorporating the FP8 datatype for inference workloads, you must adhere to the following steps: -1. Install all necessary prerequisites and dependencies. -2. Convert HF model into a quantized HF model. -3. Extract KV Cache Scaling Factors from quantized HF model. -4. Load KV Cache Scaling Factors into VLLM. - -### 2. Convert HF model into a quantized HF model. -Note: The following steps are adapted from the [TensorRT-LLM repository](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/README.md). 
- -`quantize.py` (examples/fp8/quantizer/quantize.py) uses the quantization toolkit (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format). - -The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found at `examples/fp8/quantizer/README.md`. - -### 3. Extract KV Cache Scaling Factors from quantized HF model. -`extract_scales.py` (examples/fp8/extract_scales.py) can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports Llama 2 models. It is also important to note the following: -1. **File Structure**: The utility operates under the assumption that all parameters, including KV cache scaling factors, corresponding to a particular Tensor Parallelism (TP) rank are stored in a single file. These files must adhere to a specific naming convention where the TP rank is immediately identified after a specific keyword (e.g., "rank") in the filename. - -2. **TP Decomposition**: The utility assumes consistency between the TP decomposition employed by the quantizer tool and that used by vLLM. - -3. **AMMO Compatibility**: Currently, the generated KV cache scaling factors for AMMO remain uniform across all TP ranks. - -```python -# prerequisites: -# - Quantized HF LLaMa 2 model -python3 examples/fp8/extract_scales.py --help -Usage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--load_format {auto,safetensors,npz,pt}] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE] - -KV Scale Extraction Example - -optional arguments: ---quantized_model: Specify either the local path to, or name of, a quantized HF model. It is expected that the quantization format is FP8_E4M3, for use on ROCm (AMD GPU). -Optional arguments: ---cache_dir: Specify a cache directory to use in the event of a HF model download. 
(Default: None) ---load_format: Specify the format of the model's tensor files containing the KV cache scaling factors. (Choices: auto, safetensors, npz, pt; Default: auto) ---revision: Specify the model's revision number. (Default: None) ---output_dir: Specify the output directory. By default the KV cache scaling factors will be saved in the model directory. (Default: None) ---output_name: Specify the output filename. (Default: kv_cache_scales.json) ---tp_size: Specify the tensor-parallel (TP) size that the quantized model should correspond to. If specified, during KV cache scaling factor extraction the observed TP size will be checked against this and an error will be raised if there is a mismatch. (Default: None) -``` -```python -Example: -python3 examples/fp8/extract_scales.py --quantized_model --tp_size --output_dir -``` -### 4. Load KV Cache Scaling Factors into VLLM. -This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for KV cache scaling factors to be utilized for FP8. 
-```python -# prerequisites: -# - LLaMa 2 kv_cache_scales.json file - -python3 benchmarks/benchmark_throughput.py --help -usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL] - [--tokenizer TOKENIZER] [--quantization {awq,gptq,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N] - [--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code] - [--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}] - [--quantization-param-path KV_CACHE_quantization_param_path] - -Benchmark Throughput Example -optional arguments: - -h, --help show this help message and exit - --backend {vllm,hf,mii} - --dataset DATASET Path to the dataset. - --input-len INPUT_LEN Input prompt length for each request - --output-len OUTPUT_LEN Output length for each request. Overrides the output length from the dataset. - --model MODEL - --tokenizer TOKENIZER - --quantization {awq,gptq,None}, -q {awq,gptq,None} - --tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE - --n N Number of generated sequences per prompt. - --use-beam-search - --num-prompts NUM_PROMPTS Number of prompts to process. - --seed SEED - --hf-max-batch-size HF_MAX_BATCH_SIZE Maximum batch size for HF backend. - --trust-remote-code trust remote code from huggingface - --max-model-len MAX_MODEL_LEN Maximum length of a sequence (including prompt and output). If None, will be derived from the model. - --dtype {auto,half,float16,bfloat16,float,float32} data type for model weights and activations. The "auto" option will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models. - --enforce-eager enforce eager execution - --kv-cache-dtype {auto,fp8} Data type for kv cache storage. If "auto", will use model data type. 
FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported ```for common inference criteria. - --quantization-param-path QUANT_PARAM_JSON Path to the JSON file containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. -``` -``` -Example: -python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --quantization-param-path --model -```python diff --git a/examples/fp8/extract_scales.py b/examples/fp8/extract_scales.py deleted file mode 100644 index 1dce9d7e99..0000000000 --- a/examples/fp8/extract_scales.py +++ /dev/null @@ -1,367 +0,0 @@ -import argparse -import glob -import json -import os -from typing import Any, Callable, Dict, List, Optional, Tuple - -import numpy as np -import torch -from safetensors.torch import safe_open - -from vllm.model_executor.layers.quantization.schema import QuantParamSchema - - -# Adapted from vllm/model_executor/model_loader/weight_utils.py -# The main differences are that we add the NPZ format and simplify -# its functionality drastically for our purposes (e.g. we assume that -# the quantized model exists locally and there is no need to download it) -def _prepare_hf_weights( - quantized_model_dir: str, - load_format: str = "auto", - fall_back_to_pt: bool = True, -) -> Tuple[List[str], bool]: - if not os.path.isdir(quantized_model_dir): - raise FileNotFoundError( - f"The quantized model directory `{quantized_model_dir}` " - "does not exist.") - use_safetensors = False - # Some quantized models use .pt files for storing the weights. 
- if load_format == "auto": - allow_patterns = ["*.safetensors", "*.bin"] - elif load_format == "safetensors": - use_safetensors = True - allow_patterns = ["*.safetensors"] - elif load_format == "pt": - allow_patterns = ["*.pt"] - elif load_format == "npz": - allow_patterns = ["*.npz"] - else: - raise ValueError(f"Unknown load_format: {load_format}") - if fall_back_to_pt: - allow_patterns += ["*.pt"] - - hf_weights_files: List[str] = [] - for pattern in allow_patterns: - hf_weights_files += glob.glob( - os.path.join(quantized_model_dir, pattern)) - if len(hf_weights_files) > 0: - if pattern == "*.safetensors": - use_safetensors = True - break - - if not use_safetensors: - # Exclude files that are not needed for inference. - # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233 - blacklist = [ - "training_args.bin", - "optimizer.bin", - "optimizer.pt", - "scheduler.pt", - "scaler.pt", - ] - hf_weights_files = [ - f for f in hf_weights_files - if not any(f.endswith(x) for x in blacklist) - ] - - if len(hf_weights_files) == 0: - raise RuntimeError( - f"Cannot find any model weights with `{quantized_model_dir}`") - - return hf_weights_files, use_safetensors - - -# Adapted from vllm/model_executor/model_loader/weight_utils.py -def _hf_tensorfile_iterator(filename: str, load_format: str, - use_safetensors: bool): - if load_format == "npz": - assert not use_safetensors - with np.load(filename) as data: - for name in data.files: - param = torch.from_numpy(data[name]) - yield name, param - elif use_safetensors: - with safe_open(filename, framework="pt") as f: - for name in f.keys(): # NOQA: SIM118 - param = f.get_tensor(name) - yield name, param - else: - state = torch.load(filename, map_location="cpu") - for name, param in state.items(): - yield name, param - del state - torch.cuda.empty_cache() - - -def _kv_scales_extractor( - hf_tensor_files: List[str], - use_safetensors: bool, - rank_keyword: str = "rank", - expected_tp_size: 
Optional[int] = None) -> Dict[int, Dict[int, float]]: - """ - Given a list of files containing tensor data, attempt to extract KV cache - scales from these files. Intended as a helper function taking in the output - from _prepare_hf_weights. - Args: - rank_keyword Matches the number immediately after this keyword in the - tensor filename to determine the TP rank corresponding - to said tensor file - expected_tp_size If specified, the TP size of the tensor files is checked - against this and an error is raised if they don't match. - Returns a dictionary mapping TP ranks to their relevant KV cache scales. - The per-rank scales are themselves represented as a dictionary of layer - indices to the respective per-layer scale. - """ - for char in rank_keyword: - assert not char.isdecimal( - ), f"Rank keyword {rank_keyword} contains a numeric character!" - rank_scales_map: Dict[int, Dict[int, float]] = {} - for tensor_file in hf_tensor_files: - try: - rank_idx = tensor_file.find(rank_keyword) - if rank_idx != -1: - start_idx = rank_idx + len(rank_keyword) - stop_idx = start_idx - while stop_idx < len( - tensor_file) and tensor_file[stop_idx].isdecimal(): - stop_idx += 1 - if stop_idx == start_idx: - raise RuntimeError("Did not find rank # in filename.") - rank = int(tensor_file[start_idx:stop_idx]) - elif len(hf_tensor_files) == 1: - # Since there is only one tensor file, we can assume - # that it's intended for TP rank 0 - rank = 0 - else: - raise RuntimeError( - f"Filename does not contain '{rank_keyword}'.") - except RuntimeError: - print("Unable to determine TP rank " - f"corresponding to file '{tensor_file}'") - raise - - if rank not in rank_scales_map: - layer_scales_map: Dict[int, float] = {} - rank_scales_map[rank] = layer_scales_map - else: - raise RuntimeError( - f"Tensor file '{tensor_file}' shares TP rank {rank} " - "with another tensor file.") - - module_delimiter = ":" if args.load_format == "npz" else "." 
- for name, param in _hf_tensorfile_iterator(tensor_file, - args.load_format, - use_safetensors): - if "kv_cache_scaling_factor" in name: - nums = [ - int(s) for s in name.split(module_delimiter) - if s.isdecimal() - ] - assert len( - nums) == 1, f"Could not determine layer idx for {name}" - layer_idx = nums[0] - assert layer_idx not in layer_scales_map, f"Duplicate scaling"\ - f" factor corresponding to layer {layer_idx}" - try: - layer_scales_map[layer_idx] = param.item() - except RuntimeError: - print( - "This utility supports only per-tensor scalar scales " - f"for now. The tensor\n {name} = {param} \nis an " - "invalid scale factor.") - raise - - if all( - len(layer_scales_map) == 0 - for layer_scales_map in rank_scales_map.values()): - # Note: this is true even if the rank_scales_map is empty - print("WARNING: No KV cache scale factors found. No output saved.") - return None - empirical_tp_world_size = max(rank_scales_map.keys()) + 1 - if expected_tp_size is not None: - assert expected_tp_size == empirical_tp_world_size, \ - f"User expected TP world size = {expected_tp_size} " \ - "from model but tool is expecting TP world size = " \ - f"{empirical_tp_world_size} from model instead." - for i in range(empirical_tp_world_size): - assert i in rank_scales_map, "Expected TP world size = "\ - f"{empirical_tp_world_size} but did not find KV " \ - f"cache scaling factors for TP rank {i}" - print(f"Found TP world size = {empirical_tp_world_size} " - "when extracting KV cache scales!") - return rank_scales_map - - -def _metadata_extractor(quantized_model_dir: str, - metadata_extract_fns: \ - Dict[str, Callable[[Dict[str, Any]], Any]]) \ - -> Dict[str, Any]: - """ - Given a directory containing quantized model files, this function - aims to extract metadata from the JSON files within this directory. - Each JSON file is expected to represent a dictionary in JSON - format (referred to as a "JSON-dictionary"). 
Metadata extraction is - defined by a dictionary called metadata_extract_fns, where each - metadata field name is mapped to an extraction function. - - These extraction functions are designed to take a JSON-dictionary - as their only argument and return the corresponding metadata. - While extraction functions are permitted to raise exceptions, they - should only raise a KeyError or ValueError if the metadata field - cannot be extracted from the current JSON-dictionary, yet there's - a possibility of finding it in another JSON-dictionary. - - The function returns a dictionary that maps metadata fields to - their extracted data. The keys of this dictionary correspond exactly - to those in metadata_extract_fns. If any fields fail to be extracted, - their corresponding values are set to None, and a warning is printed. - """ - if not os.path.isdir(quantized_model_dir): - raise FileNotFoundError( - f"The quantized model directory `{quantized_model_dir}` " - "does not exist.") - metadata_files = glob.glob(os.path.join(quantized_model_dir, "*.json")) - - result: Dict[str, Any] = {} - for file in metadata_files: - with open(file) as f: - try: - metadata = json.load(f) - except json.JSONDecodeError: - print(f"Could not parse `{file}` as a valid metadata file," - " skipping it.") - continue - if not isinstance(metadata, dict): - print(f"The file `{file}` does not correspond to a " - "JSON-serialized dictionary, skipping it.") - continue - for metadata_name, extract_fn in metadata_extract_fns.items(): - try: - metadata_info = extract_fn(metadata) - if metadata_name not in result: - result[metadata_name] = metadata_info - elif metadata_info != result[metadata_name]: - raise RuntimeError( - "Metadata mismatch! 
Originally found " - f"{metadata_name} = {result[metadata_name]} but " - f"now found {metadata_name} = {metadata_info} in " - f"`{file}`") - except KeyError: - # It is possible that a given file does not contain some - # of our selected metadata as it could be located in some - # other metadata file. - # 'EFINAE': extract_fn failure is not an error. - pass - except ValueError: - # See above. - pass - - # Warn if we cannot find any of the requested metadata - for metadata_name in metadata_extract_fns: - if metadata_name not in result: - print("WARNING: Unable to find requested metadata field " - f"`{metadata_name}`, setting it to None.") - result[metadata_name] = None - - return result - - -def main(args): - metadata_extract_fns = { - "model_type": lambda json_dict: json_dict["layers"][0]["decoder_type"], - "tp_size": lambda json_dict: int(json_dict["tensor_parallel"]), - "model_dtype": lambda json_dict: json_dict["dtype"] - } - recovered_metadata = _metadata_extractor(args.quantized_model, - metadata_extract_fns) - if args.tp_size is not None: - metadata_tp_size = recovered_metadata["tp_size"] - if metadata_tp_size is not None: - assert args.tp_size == metadata_tp_size, \ - f"User expected TP world size = {args.tp_size} " \ - f"but found TP world size = {metadata_tp_size} from metadata!" - expected_tp_size = args.tp_size or recovered_metadata["tp_size"] - rank_keyword = "rank" - hf_tensor_files, use_safetensors = _prepare_hf_weights( - args.quantized_model, args.load_format) - rank_scales_map = _kv_scales_extractor(hf_tensor_files, use_safetensors, - rank_keyword, expected_tp_size) - # Postprocess: formatting to the current schema. Consider pulling it - # out into a dedicated function should it ever become more complicated. 
- rank_scales_map = { - rank: {k: scale[k] - for k in sorted(scale.keys())} - for rank, scale in rank_scales_map.items() - } - # TODO: Expand this with activation and weights scaling factors when - # they are used in the future - schema = QuantParamSchema( - model_type=recovered_metadata["model_type"], - kv_cache={ - "dtype": ("float8_e4m3fn" if len(rank_scales_map) > 0 else - recovered_metadata["model_dtype"]), - "scaling_factor": - rank_scales_map - }, - ) - - if args.output_dir is None: - output_file = os.path.join(args.quantized_model, args.output_name) - else: - if not os.path.isdir(args.output_dir): - os.makedirs(args.output_dir, exist_ok=True) - output_file = os.path.join(args.output_dir, args.output_name) - - with open(output_file, 'w') as f: - f.write(schema.model_dump_json(indent=4)) - print(f"Completed! KV cache scaling factors saved to {output_file}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="This simple utility extracts the " - "KV cache scaling factors from a quantized HF model " - "and saves them to a JSON file compatible with later " - "use by vLLM (pass this file to the appropriate " - "runtime typically using the argument " - "--quantization-param-path ). This is only used " - "if the KV cache dtype is FP8 and on ROCm (AMD GPU).") - parser.add_argument( - "--quantized-model", - help="Specify the directory containing a single quantized HF model. " - "It is expected that the quantization format is FP8_E4M3, for use " - "on ROCm (AMD GPU).", - required=True) - parser.add_argument( - "--load_format", - help="Optionally specify the format of the model's tensor files " - "containing the KV cache scaling factors.", - choices=["auto", "safetensors", "npz", "pt"], - default="auto") - parser.add_argument( - "--output-dir", - help="Optionally specify the output directory. 
By default the " - "KV cache scaling factors will be saved in the model directory, " - "however you can override this behavior here.", - default=None) - parser.add_argument( - "--output-name", - help="Optionally specify the output filename.", - # TODO: Change this once additional scaling factors are enabled - default="kv_cache_scales.json") - parser.add_argument( - "--tp-size", - help="Optionally specify the tensor-parallel (TP) size that the " - "quantized model should correspond to. If specified, during KV " - "cache scaling factor extraction the observed TP size will be " - "checked against this and an error will be raised if there is " - "a mismatch. If not specified, the quantized model's expected " - "TP size is instead inferred from the largest TP rank observed. " - "The expected TP size is cross-checked against the TP ranks " - "observed in the quantized model and an error is raised if any " - "discrepancies are found.", - default=None, - type=int) - args = parser.parse_args() - - main(args) diff --git a/examples/fp8/quantizer/README.md b/examples/fp8/quantizer/README.md deleted file mode 100644 index d0895e97dc..0000000000 --- a/examples/fp8/quantizer/README.md +++ /dev/null @@ -1,32 +0,0 @@ -### Quantizer Utilities -`quantize.py`: NVIDIA Quantization utilities using TensorRT-Model-Optimizer, ported -from TensorRT-LLM: [`examples/quantization/quantize.py`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py) - -### Prerequisite - -#### AMMO (AlgorithMic Model Optimization) Installation: nvidia-ammo 0.7.1 or later -`pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo` - -#### AMMO Download (code and docs) -`https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.5.0.tar.gz` -`https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.7.1.tar.gz` - -### Usage - -#### Run on H100 system for speed if FP8; number of GPUs depends on the model size - 
-#### Example: quantize Llama2-7b model from HF to FP8 with FP8 KV Cache: -`python quantize.py --model-dir ./ll2-7b --dtype float16 --qformat fp8 --kv-cache-dtype fp8 --output-dir ./ll2_7b_fp8 --calib-size 512 --tp-size 1` - -Outputs: model structure, quantized model & parameters (with scaling factors) are in JSON and Safetensors (npz is generated only for the reference) -``` -# ll ./ll2_7b_fp8/ -total 19998244 -drwxr-xr-x 2 root root 4096 Feb 7 01:08 ./ -drwxrwxr-x 8 1060 1061 4096 Feb 7 01:08 ../ --rw-r--r-- 1 root root 176411 Feb 7 01:08 llama_tp1.json --rw-r--r-- 1 root root 13477087480 Feb 7 01:09 llama_tp1_rank0.npz --rw-r--r-- 1 root root 7000893272 Feb 7 01:08 rank0.safetensors -# -``` - diff --git a/examples/fp8/quantizer/quantize.py b/examples/fp8/quantizer/quantize.py deleted file mode 100644 index d75cc8b3d1..0000000000 --- a/examples/fp8/quantizer/quantize.py +++ /dev/null @@ -1,367 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # noqa: E501 -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" -Adapted from examples/quantization/hf_ptq.py -""" - -import argparse -import copy -import json -import random -import time - -import ammo.torch.quantization as atq -import numpy as np -import torch -from ammo.torch.export import export_model_config -from datasets import load_dataset -from torch.utils.data import DataLoader -from transformers import AutoModelForCausalLM, AutoTokenizer - -RAND_SEED = 1234 -MAX_SEQ_LEN = 2048 - -EMPTY_CFG = { - "quant_cfg": { - "*weight_quantizer": { - "enable": False, - }, - "*input_quantizer": { - "enable": False - }, - "*lm_head*": { - "enable": False - }, - "*output_layer*": { - "enable": False - }, - "default": { - "enable": False - }, - }, - "algorithm": "max", -} - -KV_CACHE_CFG = { - "*.query_key_value.output_quantizer": { - "num_bits": 8, - "axis": None, - "enable": True - }, - "*.Wqkv.output_quantizer": { - "num_bits": 8, - "axis": None, - "enable": True - }, - "*.W_pack.output_quantizer": { - "num_bits": 8, - "axis": None, - "enable": True - }, - "*.c_attn.output_quantizer": { - "num_bits": 8, - "axis": None, - "enable": True - }, - "*.k_proj.output_quantizer": { - "num_bits": 8, - "axis": None, - "enable": True - }, - "*.v_proj.output_quantizer": { - "num_bits": 8, - "axis": None, - "enable": True - }, -} - -QUANT_CFG_CHOICES = { - "int8_sq": atq.INT8_SMOOTHQUANT_CFG, - "fp8": atq.FP8_DEFAULT_CFG, - "int4_awq": atq.INT4_AWQ_CFG, - "w4a8_awq": atq.W4A8_AWQ_BETA_CFG, - "int8_wo": EMPTY_CFG, - "int4_wo": EMPTY_CFG, - "full_prec": EMPTY_CFG, -} - -MODEL_NAME_PATTERN_MAP = { - "GPT2": "gpt2", - "Xverse": "llama", - "Llama": "llama", - "Mistral": "llama", - "GPTJ": "gptj", - "FalconForCausalLM": "falcon", - "RWForCausalLM": "falcon", - "baichuan": "baichuan", - "MPT": "mpt", - "Bloom": "bloom", - "ChatGLM": "chatglm", - "QWen": "qwen", -} - - -def get_tokenizer(ckpt_path, max_seq_len=MAX_SEQ_LEN, model_type=None): - print(f"Initializing tokenizer from {ckpt_path}") - tokenizer = AutoTokenizer.from_pretrained( - ckpt_path, 
- model_max_length=max_seq_len, - padding_side="left", - trust_remote_code=True, - ) - if model_type and model_type == "qwen": - # qwen use token id 151643 as pad and eos tokens - tokenizer.pad_token = tokenizer.convert_ids_to_tokens(151643) - tokenizer.eos_token = tokenizer.convert_ids_to_tokens(151643) - - # can't set attribute 'pad_token' for "" - if tokenizer.pad_token != "": - tokenizer.pad_token = tokenizer.eos_token - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - assert (tokenizer.pad_token - is not None), f"Pad token for {model_type} cannot be set!" - - return tokenizer - - -def get_model(ckpt_path, dtype="fp16", device="cuda"): - print(f"Initializing model from {ckpt_path}") - if dtype == "bf16" or dtype == "bfloat16": - dtype = torch.bfloat16 - elif dtype == "fp16" or dtype == "float16": - dtype = torch.float16 - elif dtype == "fp32" or dtype == "float32": - dtype = torch.float32 - else: - raise NotImplementedError(f"Unknown dtype {dtype}") - - # model_kwargs = {"torch_dtype": dtype} - model_kwargs = {"torch_dtype": "auto"} - - model = AutoModelForCausalLM.from_pretrained(ckpt_path, - device_map="auto", - **model_kwargs, - trust_remote_code=True) - model.eval() - - model_dtype = next(model.parameters()).dtype - if dtype != model_dtype: - print("[TensorRT-LLM][WARNING] The manually set model data type is " - f"{dtype}, but the data type of the HuggingFace model is " - f"{model_dtype}.") - - return model - - -def get_model_type(model): - for k, v in MODEL_NAME_PATTERN_MAP.items(): - if k.lower() in type(model).__name__.lower(): - return v - return None - - -def get_calib_dataloader(data="cnn_dailymail", - tokenizer=None, - batch_size=1, - calib_size=512, - block_size=512, - device=None): - print("Loading calibration dataset") - if data == "pileval": - dataset = load_dataset( - "json", - data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", - split="train") - dataset = dataset["text"][:calib_size] - elif data == 
"cnn_dailymail": - dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train") - dataset = dataset["article"][:calib_size] - else: - raise NotImplementedError - - batch_encoded = tokenizer.batch_encode_plus(dataset, - return_tensors="pt", - padding="max_length", - truncation=True, - max_length=block_size) - if device: - batch_encoded = batch_encoded.to(device) - batch_encoded = batch_encoded["input_ids"] - - calib_dataloader = DataLoader(batch_encoded, - batch_size=batch_size, - shuffle=False) - - return calib_dataloader - - -def quantize_model(model, quant_cfg, calib_dataloader=None): - - def calibrate_loop(): - if calib_dataloader is None: - return - """Adjusts weights and scaling factors based on selected algorithms.""" - for idx, data in enumerate(calib_dataloader): - print(f"Calibrating batch {idx}") - model(data) - - print("Starting quantization...") - start_time = time.time() - atq.quantize(model, quant_cfg, forward_loop=calibrate_loop) - end_time = time.time() - print("Quantization done. Total time used: {:.2f} s.".format(end_time - - start_time)) - - return model - - -def main(args): - if not torch.cuda.is_available(): - raise OSError("GPU is required for inference.") - - random.seed(RAND_SEED) - np.random.seed(RAND_SEED) - - model = get_model(args.model_dir, args.dtype, args.device) - model_type = get_model_type(model) - tokenizer = get_tokenizer(args.model_dir, model_type=model_type) - - if args.qformat in ["full_prec", "int8_wo", "int4_wo" - ] and args.kv_cache_dtype is None: - print(f"No quantization applied, export {args.dtype} model") - else: - if "awq" in args.qformat: - if args.calib_size > 32: - print("AWQ calibration could take longer with calib_size = " - f"{args.calib_size}, Using calib_size=32 instead") - args.calib_size = 32 - print("\nAWQ calibration could take longer than other calibration " - "methods. Please increase the batch size to speed up the " - "calibration process. 
Batch size can be set by adding the " - "argument --batch_size to the command line.\n") - - calib_dataloader = get_calib_dataloader( - tokenizer=tokenizer, - batch_size=args.batch_size, - calib_size=args.calib_size, - device=args.device, - ) - - if args.qformat in QUANT_CFG_CHOICES: - quant_cfg = QUANT_CFG_CHOICES[args.qformat] - else: - raise ValueError( - f"Unsupported quantization format: {args.qformat}") - - if "awq" in args.qformat: - quant_cfg = copy.deepcopy(QUANT_CFG_CHOICES[args.qformat]) - weight_quantizer = quant_cfg["quant_cfg"][ - "*weight_quantizer"] # type: ignore - if isinstance(weight_quantizer, list): - weight_quantizer = weight_quantizer[0] - weight_quantizer["block_sizes"][-1] = args.awq_block_size - - if args.kv_cache_dtype is not None: - if args.kv_cache_dtype == "fp8": - for value in KV_CACHE_CFG.values(): - value.update({"num_bits": (4, 3)}) # type: ignore - quant_cfg["quant_cfg"].update(KV_CACHE_CFG) # type: ignore - - print(quant_cfg) - - model = quantize_model(model, quant_cfg, calib_dataloader) - - with torch.inference_mode(): - if model_type is None: - print(f"Unknown model type {type(model).__name__}. 
Continue " - "exporting...") - model_type = f"unknown:{type(model).__name__}" - - export_path = args.output_dir - start_time = time.time() - - if args.qformat == "int4_awq" and model_type == "qwen": - torch.save(model.state_dict(), export_path) - else: - export_npz = (model_type not in [ - 'gptj', 'falcon', 'chatglm', 'mpt', 'llama', 'baichuan' - ]) - - # export safetensors - export_model_config( - model, - model_type, - getattr(torch, args.dtype), - export_dir=export_path, - inference_tensor_parallel=args.tp_size, - inference_pipeline_parallel=args.pp_size, - # export_tensorrt_llm_config=(not export_npz), - export_tensorrt_llm_config=False, - export_npz=export_npz) - - # Workaround for wo quantization - if args.qformat in ["int8_wo", "int4_wo", "full_prec"]: - with open(f"{export_path}/config.json") as f: - tensorrt_llm_config = json.load(f) - if args.qformat == "int8_wo": - tensorrt_llm_config["quantization"]["quant_algo"] = 'W8A16' - elif args.qformat == "int4_wo": - tensorrt_llm_config["quantization"]["quant_algo"] = 'W4A16' - else: - tensorrt_llm_config["quantization"]["quant_algo"] = None - with open(f"{export_path}/config.json", "w") as f: - json.dump(tensorrt_llm_config, f, indent=4) - - end_time = time.time() - print("Quantized model exported to {} \nTotal time used {:.2f} s.". 
- format(export_path, end_time - start_time)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--model-dir", - help="Specify where the HuggingFace model is", - required=True) - parser.add_argument("--device", default="cuda") - parser.add_argument("--dtype", help="Model data type.", default="float16") - parser.add_argument( - "--qformat", - help="Quantization format.", - default="full_prec", - choices=[ - "fp8", "int8_sq", "int4_awq", "w4a8_awq", "int8_wo", "int4_wo", - "full_prec" - ], - ) - parser.add_argument("--batch-size", - help="Batch size for calibration.", - type=int, - default=1) - parser.add_argument("--calib-size", - help="Number of samples for calibration.", - type=int, - default=512) - parser.add_argument("--output-dir", default="exported_model") - parser.add_argument("--tp-size", type=int, default=1) - parser.add_argument("--pp-size", type=int, default=1) - parser.add_argument("--awq-block-size", type=int, default=128) - parser.add_argument("--kv-cache-dtype", - help="KV Cache dtype.", - default=None, - choices=["int8", "fp8", None]) - args = parser.parse_args() - - main(args) diff --git a/examples/gguf_inference.py b/examples/gguf_inference.py deleted file mode 100644 index 09a5fcc22e..0000000000 --- a/examples/gguf_inference.py +++ /dev/null @@ -1,38 +0,0 @@ -from huggingface_hub import hf_hub_download - -from vllm import LLM, SamplingParams - - -def run_gguf_inference(model_path): - PROMPT_TEMPLATE = "<|system|>\n{system_message}\n<|user|>\n{prompt}\n<|assistant|>\n" # noqa: E501 - system_message = "You are a friendly chatbot who always responds in the style of a pirate." # noqa: E501 - # Sample prompts. - prompts = [ - "How many helicopters can a human eat in one sitting?", - "What's the future of AI?", - ] - prompts = [ - PROMPT_TEMPLATE.format(system_message=system_message, prompt=prompt) - for prompt in prompts - ] - # Create a sampling params object. 
- sampling_params = SamplingParams(temperature=0, max_tokens=128) - - # Create an LLM. - llm = LLM(model=model_path, - tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0", - gpu_memory_utilization=0.95) - - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - -if __name__ == "__main__": - repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" - filename = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf" - model = hf_hub_download(repo_id, filename=filename) - run_gguf_inference(model) diff --git a/examples/aqlm_example.py b/examples/offline_inference/aqlm_example.py similarity index 100% rename from examples/aqlm_example.py rename to examples/offline_inference/aqlm_example.py diff --git a/examples/offline_inference_arctic.py b/examples/offline_inference/arctic.py similarity index 100% rename from examples/offline_inference_arctic.py rename to examples/offline_inference/arctic.py diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference/audio_language.py similarity index 100% rename from examples/offline_inference_audio_language.py rename to examples/offline_inference/audio_language.py diff --git a/examples/offline_inference.py b/examples/offline_inference/basic.py similarity index 100% rename from examples/offline_inference.py rename to examples/offline_inference/basic.py diff --git a/examples/offline_inference_with_default_generation_config.py b/examples/offline_inference/basic_with_model_default_sampling.py similarity index 100% rename from examples/offline_inference_with_default_generation_config.py rename to examples/offline_inference/basic_with_model_default_sampling.py diff --git a/examples/offline_inference_chat.py b/examples/offline_inference/chat.py similarity index 100% rename from examples/offline_inference_chat.py rename to examples/offline_inference/chat.py diff 
--git a/examples/offline_chat_with_tools.py b/examples/offline_inference/chat_with_tools.py similarity index 100% rename from examples/offline_chat_with_tools.py rename to examples/offline_inference/chat_with_tools.py diff --git a/examples/offline_inference_classification.py b/examples/offline_inference/classification.py similarity index 100% rename from examples/offline_inference_classification.py rename to examples/offline_inference/classification.py diff --git a/examples/offline_inference_cli.py b/examples/offline_inference/cli.py similarity index 100% rename from examples/offline_inference_cli.py rename to examples/offline_inference/cli.py diff --git a/examples/cpu_offload.py b/examples/offline_inference/cpu_offload.py similarity index 100% rename from examples/cpu_offload.py rename to examples/offline_inference/cpu_offload.py diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference/distributed.py similarity index 100% rename from examples/offline_inference_distributed.py rename to examples/offline_inference/distributed.py diff --git a/examples/offline_inference_embedding.py b/examples/offline_inference/embedding.py similarity index 100% rename from examples/offline_inference_embedding.py rename to examples/offline_inference/embedding.py diff --git a/examples/offline_inference_encoder_decoder.py b/examples/offline_inference/encoder_decoder.py similarity index 100% rename from examples/offline_inference_encoder_decoder.py rename to examples/offline_inference/encoder_decoder.py diff --git a/examples/florence2_inference.py b/examples/offline_inference/florence2_inference.py similarity index 93% rename from examples/florence2_inference.py rename to examples/offline_inference/florence2_inference.py index b58ac2e1f7..c24096e900 100644 --- a/examples/florence2_inference.py +++ b/examples/offline_inference/florence2_inference.py @@ -3,7 +3,8 @@ Demonstrate prompting of text-to-text encoder/decoder models, specifically Florence-2 ''' # 
TODO(Isotr0py): -# Move to offline_inference_vision_language.py after porting vision backbone +# Move to offline_inference/vision_language.py +# after porting vision backbone from vllm import LLM, SamplingParams dtype = "float" diff --git a/examples/offline_inference/gguf_inference.py b/examples/offline_inference/gguf_inference.py new file mode 100644 index 0000000000..aa05c4c0bf --- /dev/null +++ b/examples/offline_inference/gguf_inference.py @@ -0,0 +1,32 @@ +from huggingface_hub import hf_hub_download + +from vllm import LLM, SamplingParams + + +def run_gguf_inference(model_path, tokenizer): + # Sample prompts. + prompts = [ + "How many helicopters can a human eat in one sitting?", + "What's the future of AI?", + ] + prompts = [[{"role": "user", "content": prompt}] for prompt in prompts] + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0, max_tokens=128) + + # Create an LLM. + llm = LLM(model=model_path, tokenizer=tokenizer) + + outputs = llm.chat(prompts, sampling_params) + # Print the outputs. 
+ for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + +if __name__ == "__main__": + repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF" + filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf" + tokenizer = "microsoft/Phi-3-medium-4k-instruct" + model = hf_hub_download(repo_id, filename=filename) + run_gguf_inference(model, tokenizer) diff --git a/examples/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py similarity index 100% rename from examples/llm_engine_example.py rename to examples/offline_inference/llm_engine_example.py diff --git a/examples/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py similarity index 100% rename from examples/lora_with_quantization_inference.py rename to examples/offline_inference/lora_with_quantization_inference.py diff --git a/examples/offline_inference_mlpspeculator.py b/examples/offline_inference/mlpspeculator.py similarity index 100% rename from examples/offline_inference_mlpspeculator.py rename to examples/offline_inference/mlpspeculator.py diff --git a/examples/multilora_inference.py b/examples/offline_inference/multilora_inference.py similarity index 100% rename from examples/multilora_inference.py rename to examples/offline_inference/multilora_inference.py diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference/neuron.py similarity index 81% rename from examples/offline_inference_neuron.py rename to examples/offline_inference/neuron.py index 2856be7c86..f098c8e5fe 100644 --- a/examples/offline_inference_neuron.py +++ b/examples/offline_inference/neuron.py @@ -1,12 +1,5 @@ -import os - from vllm import LLM, SamplingParams -# creates XLA hlo graphs for all the context length buckets. -os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048" -# creates XLA hlo graphs for all the token gen buckets. 
-os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048" - # Sample prompts. prompts = [ "Hello, my name is", @@ -26,8 +19,8 @@ llm = LLM( # Currently, this is a known limitation in continuous batching support # in transformers-neuronx. # TODO(liangfu): Support paged-attention in transformers-neuronx. - max_model_len=2048, - block_size=2048, + max_model_len=1024, + block_size=1024, # The device can be automatically detected when AWS Neuron SDK is installed. # The device argument can be either unspecified for automated detection, # or explicitly assigned. diff --git a/examples/offline_inference_neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py similarity index 100% rename from examples/offline_inference_neuron_int8_quantization.py rename to examples/offline_inference/neuron_int8_quantization.py diff --git a/examples/offline_inference_openai.md b/examples/offline_inference/openai/openai_batch.md similarity index 92% rename from examples/offline_inference_openai.md rename to examples/offline_inference/openai/openai_batch.md index 2436417cb5..a4774e57cd 100644 --- a/examples/offline_inference_openai.md +++ b/examples/offline_inference/openai/openai_batch.md @@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format The OpenAI batch file format consists of a series of json objects on new lines. -[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl) +[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl) Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. 
@@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints To follow along with this example, you can download the example batch, or create your own batch file in your working directory. ``` -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ``` -$ cat openai_example_batch.jsonl +$ cat offline_inference/openai/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line. You can run the batch with the following command, which will write its results to a file called `results.jsonl` ``` -python -m vllm.entrypoints.openai.run_batch -i openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ### Step 3: Check your results @@ -66,10 +66,10 @@ $ cat results.jsonl The batch runner supports remote input and output urls that are accessible via http/https. 
-For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl`, you can run +For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run ``` -python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ## Example 3: Integrating with AWS S3 @@ -90,13 +90,13 @@ To integrate with cloud blob storage, we recommend using presigned urls. To follow along with this example, you can download the example batch, or create your own batch file in your working directory. 
``` -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ``` -$ cat openai_example_batch.jsonl +$ cat offline_inference/openai/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -104,7 +104,7 @@ $ cat openai_example_batch.jsonl Now upload your batch file to your S3 bucket. 
``` -aws s3 cp openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl +aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl ``` ### Step 2: Generate your presigned urls diff --git a/examples/openai_example_batch.jsonl b/examples/offline_inference/openai/openai_example_batch.jsonl similarity index 100% rename from examples/openai_example_batch.jsonl rename to examples/offline_inference/openai/openai_example_batch.jsonl diff --git a/examples/offline_inference_pixtral.py b/examples/offline_inference/pixtral.py similarity index 100% rename from examples/offline_inference_pixtral.py rename to examples/offline_inference/pixtral.py diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference/prefix_caching.py similarity index 100% rename from examples/offline_inference_with_prefix.py rename to examples/offline_inference/prefix_caching.py diff --git a/examples/offline_profile.py b/examples/offline_inference/profiling.py similarity index 99% rename from examples/offline_profile.py rename to examples/offline_inference/profiling.py index 46afe8aa26..8a94b5c2a8 100644 --- a/examples/offline_profile.py +++ b/examples/offline_inference/profiling.py @@ -363,7 +363,7 @@ Profile a model example: ``` - python examples/offline_profile.py \\ + python examples/offline_inference/profiling.py \\ --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\ --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\ --enforce-eager run_num_steps -n 2 diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py new file mode 100644 index 0000000000..5c4918008d --- /dev/null +++ b/examples/offline_inference/rlhf.py @@ -0,0 +1,186 @@ +""" +a simple demonstration of RLHF with vLLM, inspired by +the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF . +It follows the design that, training processes and inference processes +are different, and they live on different GPUs. 
+Training processes send prompts to inference processes to generate data, +and also synchronize the weights of the model by broadcasting the weights +from the training process to the inference process. +Note that this is a simple demonstration of one training instance and one +inference instance. In practice, there could be multiple training instances +and multiple inference instances. For the full implementation, please refer +to the OpenRLHF framework. +""" +import os + +import ray +import torch +from ray.util.placement_group import placement_group +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +from transformers import AutoModelForCausalLM + +from vllm import LLM, SamplingParams +from vllm.utils import get_ip, get_open_port +from vllm.worker.worker import Worker + + +def stateless_init_process_group(master_address, master_port, rank, world_size, + device): + """ + vLLM provides `StatelessProcessGroup` to create a process group + without considering the global process group in torch.distributed. + It is recommended to create `StatelessProcessGroup`, and then initialize + the data-plane communication (NCCL) between external (train processes) + and vLLM workers. + """ + from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator + from vllm.distributed.utils import StatelessProcessGroup + pg = StatelessProcessGroup.create(host=master_address, + port=master_port, + rank=rank, + world_size=world_size) + pynccl = PyNcclCommunicator(pg, device=device) + return pynccl + + +class MyWorker(Worker): + """ + The `MyWorker` class inherits from `Worker` to provide custom functions. + For simplicity, we define the `MyWorker` class in this self-contained + script. Normally, we should define the `MyWorker` class in a separate + file and pass the qualified name of the class to the `worker_cls` + parameter. 
+ """ + + def init_weight_update_group(self, master_address, master_port, + rank_offset, world_size): + from vllm.distributed.parallel_state import get_world_group + rank = get_world_group().rank + rank_offset + self.model_update_group = stateless_init_process_group( + master_address, + master_port, + rank, + world_size, + self.device, + ) + + def update_weight(self, name, dtype, shape): + weight = torch.empty(shape, dtype=dtype, device="cuda") + self.model_update_group.broadcast(weight, + src=0, + stream=torch.cuda.current_stream()) + + self.model_runner.model.load_weights(weights=[(name, weight)]) + + del weight + + def check_weights_changed(self): + """ + Check if the weights are updated to 0. + """ + weights_updated = True + for name, p in self.model_runner.model.named_parameters(): + weights_updated = weights_updated and torch.allclose( + p, torch.zeros_like(p)) + return weights_updated + + +class MyLLM(LLM): + + def __init__(self, *args, **kwargs): + # a hack to make the script work. + # stop ray from manipulating CUDA_VISIBLE_DEVICES + # at the top-level + del os.environ["CUDA_VISIBLE_DEVICES"] + super().__init__(*args, **kwargs) + + +""" +Start the training process, here we use huggingface transformers +as an example to hold a model on GPU 0. +""" + +train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") +train_model.to("cuda:0") +""" +Start the inference process, here we use vLLM to hold a model on GPU 1 and +GPU 2. For the details on how to use ray, please refer to the ray +documentation https://docs.ray.io/en/latest/ . +""" +os.environ["CUDA_VISIBLE_DEVICES"] = "1,2" +ray.init() + +pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2) +ray.get(pg_inference.ready()) +scheduling_inference = PlacementGroupSchedulingStrategy( + placement_group=pg_inference, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=0, +) +""" +launch the vLLM inference engine. +here we use `enforce_eager` to reduce the start time. 
+""" +llm = ray.remote( + num_cpus=0, + num_gpus=0, + scheduling_strategy=scheduling_inference, +)(MyLLM).remote( + model="facebook/opt-125m", + enforce_eager=True, + worker_cls=MyWorker, + tensor_parallel_size=2, + distributed_executor_backend="ray", +) + +# Generate texts from the prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +sampling_params = SamplingParams(temperature=0) + +outputs = ray.get(llm.generate.remote(prompts, sampling_params)) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") + +# set up the communication between the training process +# and the inference engine. +master_address = get_ip() +master_port = get_open_port() + +handle = llm.collective_rpc.remote("init_weight_update_group", + args=(master_address, master_port, 1, 3)) +model_update_group = stateless_init_process_group(master_address, master_port, + 0, 3, torch.device("cuda:0")) +ray.get(handle) + +# simulate training, modify the weights of the model. +for name, p in train_model.named_parameters(): + p.data.zero_() + +# sync weight from the training process to the inference engine. +for name, p in train_model.named_parameters(): + handle = llm.collective_rpc.remote("update_weight", + args=(name, p.dtype, p.shape)) + model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream()) + ray.get(handle) + +# check if the weights are updated. +assert all(ray.get(llm.collective_rpc.remote("check_weights_changed"))) + +# use the updated model to generate texts, they will be nonsense +# because the weights are all zeros. 
+outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params)) +for output in outputs_updated: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") diff --git a/examples/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py similarity index 100% rename from examples/save_sharded_state.py rename to examples/offline_inference/save_sharded_state.py diff --git a/examples/offline_inference_scoring.py b/examples/offline_inference/scoring.py similarity index 100% rename from examples/offline_inference_scoring.py rename to examples/offline_inference/scoring.py diff --git a/examples/offline_inference_with_profiler.py b/examples/offline_inference/simple_profiling.py similarity index 100% rename from examples/offline_inference_with_profiler.py rename to examples/offline_inference/simple_profiling.py diff --git a/examples/offline_inference_structured_outputs.py b/examples/offline_inference/structured_outputs.py similarity index 100% rename from examples/offline_inference_structured_outputs.py rename to examples/offline_inference/structured_outputs.py diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py new file mode 100644 index 0000000000..b6de73eb72 --- /dev/null +++ b/examples/offline_inference/torchrun_example.py @@ -0,0 +1,64 @@ +""" +experimental support for tensor-parallel inference with torchrun, +see https://github.com/vllm-project/vllm/issues/11400 for +the motivation and use case for this example. +run the script with `torchrun --nproc-per-node=2 torchrun_example.py`, +the argument 2 should match the `tensor_parallel_size` below. +see `tests/distributed/test_torchrun_example.py` for the unit test. 
+""" + +from vllm import LLM, SamplingParams + +# Create prompts, the same across all ranks +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +# Create sampling parameters, the same across all ranks +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Use `distributed_executor_backend="external_launcher"` so that +# this llm engine/instance only creates one worker. +llm = LLM( + model="facebook/opt-125m", + tensor_parallel_size=2, + distributed_executor_backend="external_launcher", +) + +outputs = llm.generate(prompts, sampling_params) + +# all ranks will have the same outputs +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") +""" +Further tips: + +1. to communicate control messages across all ranks, use the cpu group, +a PyTorch ProcessGroup with GLOO backend. + +```python +from vllm.distributed.parallel_state import get_world_group +cpu_group = get_world_group().cpu_group +torch_rank = dist.get_rank(group=cpu_group) +if torch_rank == 0: + # do something for rank 0, e.g. saving the results to disk. +``` + +2. to communicate data across all ranks, use the model's device group, +a PyTorch ProcessGroup with NCCL backend. +```python +from vllm.distributed.parallel_state import get_world_group +device_group = get_world_group().device_group +``` + +3. 
to access the model directly in every rank, use the following code: +```python +llm.llm_engine.model_executor.driver_worker.worker.model_runner.model +``` +""" diff --git a/examples/offline_inference_tpu.py b/examples/offline_inference/tpu.py similarity index 100% rename from examples/offline_inference_tpu.py rename to examples/offline_inference/tpu.py diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference/vision_language.py similarity index 97% rename from examples/offline_inference_vision_language.py rename to examples/offline_inference/vision_language.py index b51bfae455..415439e88e 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -26,14 +26,12 @@ def run_aria(question: str, modality: str): # NOTE: Need L40 (or equivalent) to avoid OOM llm = LLM(model=model_name, - tokenizer_mode="slow", - dtype="bfloat16", max_model_len=4096, max_num_seqs=2, - trust_remote_code=True, + dtype="bfloat16", disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - prompt = (f"<|im_start|>user\n<|img|>\n{question}" + prompt = (f"<|im_start|>user\n<|img|>{question}" "<|im_end|>\n<|im_start|>assistant\n") stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] @@ -66,6 +64,23 @@ def run_chameleon(question: str, modality: str): return llm, prompt, stop_token_ids +# Deepseek-VL2 +def run_deepseek_vl2(question: str, modality: str): + assert modality == "image" + + model_name = "deepseek-ai/deepseek-vl2-tiny" + + llm = LLM(model=model_name, + max_model_len=4096, + max_num_seqs=2, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}) + + prompt = f"<|User|>: \n{question}\n\n<|Assistant|>:" + stop_token_ids = None + return llm, prompt, stop_token_ids + + # Fuyu def run_fuyu(question: str, modality: str): assert modality == "image" @@ -308,7 +323,6 @@ def run_mllama(question: str, modality: str): 
model=model_name, max_model_len=4096, max_num_seqs=16, - enforce_eager=True, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) @@ -498,6 +512,7 @@ model_example_map = { "aria": run_aria, "blip-2": run_blip2, "chameleon": run_chameleon, + "deepseek_vl_v2": run_deepseek_vl2, "fuyu": run_fuyu, "glm4v": run_glm4v, "h2ovl_chat": run_h2ovl, diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py similarity index 100% rename from examples/offline_inference_vision_language_embedding.py rename to examples/offline_inference/vision_language_embedding.py diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py similarity index 89% rename from examples/offline_inference_vision_language_multi_image.py rename to examples/offline_inference/vision_language_multi_image.py index 6af8d7768e..43c44fa867 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -23,7 +23,7 @@ IMAGE_URLS = [ class ModelRequestData(NamedTuple): llm: LLM prompt: str - stop_token_ids: Optional[List[str]] + stop_token_ids: Optional[List[int]] image_data: List[Image] chat_template: Optional[str] @@ -44,12 +44,36 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData: prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n" "<|im_start|>assistant\n") stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] + return ModelRequestData( llm=llm, prompt=prompt, stop_token_ids=stop_token_ids, image_data=[fetch_image(url) for url in image_urls], - chat_template=None) + chat_template=None, + ) + + +def load_deepseek_vl2(question: str, image_urls: List[str]): + model_name = "deepseek-ai/deepseek-vl2-tiny" + + llm = LLM(model=model_name, + max_model_len=4096, + max_num_seqs=2, + hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}, + 
limit_mm_per_prompt={"image": len(image_urls)}) + + placeholder = "".join(f"image_{i}:\n" + for i, _ in enumerate(image_urls, start=1)) + prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:" + + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=None, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: @@ -162,11 +186,11 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: model=model_name, max_model_len=4096, max_num_seqs=16, - enforce_eager=True, limit_mm_per_prompt={"image": len(image_urls)}, ) - prompt = f"<|image|><|image|><|begin_of_text|>{question}" + placeholders = "<|image|>" * len(image_urls) + prompt = f"{placeholders}<|begin_of_text|>{question}" return ModelRequestData( llm=llm, prompt=prompt, @@ -209,6 +233,31 @@ def load_nvlm_d(question: str, image_urls: List[str]): ) +def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData: + model_name = "mistral-community/pixtral-12b" + + # Adjust this as necessary to fit in GPU + llm = LLM( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + tensor_parallel_size=2, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = "[IMG]" * len(image_urls) + prompt = f"[INST]{question}\n{placeholders}[/INST]" + stop_token_ids = None + + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: # num_crops is an override kwarg to the multimodal image processor; # For some models, e.g., Phi-3.5-vision-instruct, it is recommended @@ -244,7 +293,8 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: ) -def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData: +def load_qwen_vl_chat(question: str, + 
image_urls: List[str]) -> ModelRequestData: model_name = "Qwen/Qwen-VL-Chat" llm = LLM( model=model_name, @@ -274,6 +324,7 @@ def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData: stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + return ModelRequestData( llm=llm, prompt=prompt, @@ -342,13 +393,15 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: model_example_map = { "aria": load_aria, + "deepseek_vl_v2": load_deepseek_vl2, "h2ovl_chat": load_h2onvl, "idefics3": load_idefics3, "internvl_chat": load_internvl, "mllama": load_mllama, "NVLM_D": load_nvlm_d, "phi3_v": load_phi3v, - "qwen_vl_chat": load_qwenvl_chat, + "pixtral_hf": load_pixtral_hf, + "qwen_vl_chat": load_qwen_vl_chat, "qwen2_vl": load_qwen2_vl, } diff --git a/examples/offline_inference_whisper.py b/examples/offline_inference/whisper.py similarity index 100% rename from examples/offline_inference_whisper.py rename to examples/offline_inference/whisper.py diff --git a/examples/api_client.py b/examples/online_serving/api_client.py similarity index 100% rename from examples/api_client.py rename to examples/online_serving/api_client.py diff --git a/examples/chart-helm/.helmignore b/examples/online_serving/chart-helm/.helmignore similarity index 100% rename from examples/chart-helm/.helmignore rename to examples/online_serving/chart-helm/.helmignore diff --git a/examples/chart-helm/Chart.yaml b/examples/online_serving/chart-helm/Chart.yaml similarity index 100% rename from examples/chart-helm/Chart.yaml rename to examples/online_serving/chart-helm/Chart.yaml diff --git a/examples/online_serving/chart-helm/README.md b/examples/online_serving/chart-helm/README.md new file mode 100644 index 0000000000..6aa126d4fd --- /dev/null +++ b/examples/online_serving/chart-helm/README.md @@ -0,0 +1,21 @@ +# Helm Charts + +This directory contains a Helm chart for deploying the vllm 
application. The chart includes configurations for deployment, autoscaling, resource management, and more. + +## Files + +- Chart.yaml: Defines the chart metadata including name, version, and maintainers. +- ct.yaml: Configuration for chart testing. +- lintconf.yaml: Linting rules for YAML files. +- values.schema.json: JSON schema for validating values.yaml. +- values.yaml: Default values for the Helm chart. +- templates/_helpers.tpl: Helper templates for defining common configurations. +- templates/configmap.yaml: Template for creating ConfigMaps. +- templates/custom-objects.yaml: Template for custom Kubernetes objects. +- templates/deployment.yaml: Template for creating Deployments. +- templates/hpa.yaml: Template for Horizontal Pod Autoscaler. +- templates/job.yaml: Template for Kubernetes Jobs. +- templates/poddisruptionbudget.yaml: Template for Pod Disruption Budget. +- templates/pvc.yaml: Template for Persistent Volume Claims. +- templates/secrets.yaml: Template for Kubernetes Secrets. +- templates/service.yaml: Template for creating Services. 
\ No newline at end of file diff --git a/examples/chart-helm/ct.yaml b/examples/online_serving/chart-helm/ct.yaml similarity index 100% rename from examples/chart-helm/ct.yaml rename to examples/online_serving/chart-helm/ct.yaml diff --git a/examples/chart-helm/lintconf.yaml b/examples/online_serving/chart-helm/lintconf.yaml similarity index 100% rename from examples/chart-helm/lintconf.yaml rename to examples/online_serving/chart-helm/lintconf.yaml diff --git a/examples/chart-helm/templates/_helpers.tpl b/examples/online_serving/chart-helm/templates/_helpers.tpl similarity index 100% rename from examples/chart-helm/templates/_helpers.tpl rename to examples/online_serving/chart-helm/templates/_helpers.tpl diff --git a/examples/chart-helm/templates/configmap.yaml b/examples/online_serving/chart-helm/templates/configmap.yaml similarity index 100% rename from examples/chart-helm/templates/configmap.yaml rename to examples/online_serving/chart-helm/templates/configmap.yaml diff --git a/examples/chart-helm/templates/custom-objects.yaml b/examples/online_serving/chart-helm/templates/custom-objects.yaml similarity index 100% rename from examples/chart-helm/templates/custom-objects.yaml rename to examples/online_serving/chart-helm/templates/custom-objects.yaml diff --git a/examples/chart-helm/templates/deployment.yaml b/examples/online_serving/chart-helm/templates/deployment.yaml similarity index 100% rename from examples/chart-helm/templates/deployment.yaml rename to examples/online_serving/chart-helm/templates/deployment.yaml diff --git a/examples/chart-helm/templates/hpa.yaml b/examples/online_serving/chart-helm/templates/hpa.yaml similarity index 100% rename from examples/chart-helm/templates/hpa.yaml rename to examples/online_serving/chart-helm/templates/hpa.yaml diff --git a/examples/chart-helm/templates/job.yaml b/examples/online_serving/chart-helm/templates/job.yaml similarity index 100% rename from examples/chart-helm/templates/job.yaml rename to 
examples/online_serving/chart-helm/templates/job.yaml diff --git a/examples/chart-helm/templates/poddisruptionbudget.yaml b/examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml similarity index 100% rename from examples/chart-helm/templates/poddisruptionbudget.yaml rename to examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml diff --git a/examples/chart-helm/templates/pvc.yaml b/examples/online_serving/chart-helm/templates/pvc.yaml similarity index 100% rename from examples/chart-helm/templates/pvc.yaml rename to examples/online_serving/chart-helm/templates/pvc.yaml diff --git a/examples/chart-helm/templates/secrets.yaml b/examples/online_serving/chart-helm/templates/secrets.yaml similarity index 100% rename from examples/chart-helm/templates/secrets.yaml rename to examples/online_serving/chart-helm/templates/secrets.yaml diff --git a/examples/chart-helm/templates/service.yaml b/examples/online_serving/chart-helm/templates/service.yaml similarity index 100% rename from examples/chart-helm/templates/service.yaml rename to examples/online_serving/chart-helm/templates/service.yaml diff --git a/examples/chart-helm/values.schema.json b/examples/online_serving/chart-helm/values.schema.json similarity index 100% rename from examples/chart-helm/values.schema.json rename to examples/online_serving/chart-helm/values.schema.json diff --git a/examples/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml similarity index 100% rename from examples/chart-helm/values.yaml rename to examples/online_serving/chart-helm/values.yaml diff --git a/examples/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh similarity index 97% rename from examples/disaggregated_prefill.sh rename to examples/online_serving/disaggregated_prefill.sh index 87155273a8..2bb2824c6c 100644 --- a/examples/disaggregated_prefill.sh +++ b/examples/online_serving/disaggregated_prefill.sh @@ -3,6 +3,8 @@ # We will launch 2 vllm instances (1 
for prefill and 1 for decode), # and then transfer the KV cache between them. +set -xe + echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧" sleep 1 @@ -69,7 +71,7 @@ wait_for_server 8200 # instance # NOTE: the usage of this API is subject to change --- in the future we will # introduce "vllm connect" to connect between prefill and decode instances -python3 ../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py & +python3 ../../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py & sleep 1 # serve two example requests diff --git a/examples/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py similarity index 100% rename from examples/gradio_openai_chatbot_webserver.py rename to examples/online_serving/gradio_openai_chatbot_webserver.py diff --git a/examples/gradio_webserver.py b/examples/online_serving/gradio_webserver.py similarity index 100% rename from examples/gradio_webserver.py rename to examples/online_serving/gradio_webserver.py diff --git a/examples/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py similarity index 100% rename from examples/openai_chat_completion_client.py rename to examples/online_serving/openai_chat_completion_client.py diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py similarity index 98% rename from examples/openai_chat_completion_client_for_multimodal.py rename to examples/online_serving/openai_chat_completion_client_for_multimodal.py index 213d075542..03cc037bb6 100644 --- a/examples/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -1,5 +1,5 @@ """An example showing how to use vLLM to serve multimodal models -and run online inference with OpenAI client. +and run online serving with OpenAI client. 
Launch the vLLM server with the following command: @@ -309,7 +309,7 @@ def main(args) -> None: if __name__ == "__main__": parser = FlexibleArgumentParser( - description='Demo on using OpenAI client for online inference with ' + description='Demo on using OpenAI client for online serving with ' 'multimodal language models served with vLLM.') parser.add_argument('--chat-type', '-c', diff --git a/examples/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py similarity index 100% rename from examples/openai_chat_completion_client_with_tools.py rename to examples/online_serving/openai_chat_completion_client_with_tools.py diff --git a/examples/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py similarity index 100% rename from examples/openai_chat_completion_structured_outputs.py rename to examples/online_serving/openai_chat_completion_structured_outputs.py diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py similarity index 100% rename from examples/openai_chat_embedding_client_for_multimodal.py rename to examples/online_serving/openai_chat_embedding_client_for_multimodal.py diff --git a/examples/openai_completion_client.py b/examples/online_serving/openai_completion_client.py similarity index 100% rename from examples/openai_completion_client.py rename to examples/online_serving/openai_completion_client.py diff --git a/examples/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py similarity index 100% rename from examples/openai_cross_encoder_score.py rename to examples/online_serving/openai_cross_encoder_score.py diff --git a/examples/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py similarity index 100% rename from examples/openai_embedding_client.py rename to 
examples/online_serving/openai_embedding_client.py diff --git a/examples/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py similarity index 100% rename from examples/openai_pooling_client.py rename to examples/online_serving/openai_pooling_client.py diff --git a/examples/production_monitoring/Otel.md b/examples/online_serving/opentelemetry/Otel.md similarity index 100% rename from examples/production_monitoring/Otel.md rename to examples/online_serving/opentelemetry/Otel.md diff --git a/examples/production_monitoring/dummy_client.py b/examples/online_serving/opentelemetry/dummy_client.py similarity index 100% rename from examples/production_monitoring/dummy_client.py rename to examples/online_serving/opentelemetry/dummy_client.py diff --git a/examples/production_monitoring/README.md b/examples/online_serving/prometheus_grafana/README.md similarity index 95% rename from examples/production_monitoring/README.md rename to examples/online_serving/prometheus_grafana/README.md index 807c0470e7..c49e5306a1 100644 --- a/examples/production_monitoring/README.md +++ b/examples/online_serving/prometheus_grafana/README.md @@ -1,4 +1,4 @@ -# vLLM + Prometheus/Grafana +# Prometheus and Grafana This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can checkout other methods through [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites. @@ -6,7 +6,7 @@ Install: - [`docker`](https://docs.docker.com/engine/install/) - [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository) -### Launch +## Launch Prometheus metric logging is enabled by default in the OpenAI-compatible server. 
Launch via the entrypoint: ```bash @@ -35,11 +35,11 @@ python3 ../../benchmarks/benchmark_serving.py \ Navigating to [`http://localhost:8000/metrics`](http://localhost:8000/metrics) will show the raw Prometheus metrics being exposed by vLLM. -### Grafana Dashboard +## Grafana Dashboard Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the default username (`admin`) and password (`admin`). -#### Add Prometheus Data Source +### Add Prometheus Data Source Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus. @@ -47,7 +47,7 @@ On Prometheus configuration page, we need to add the `Prometheus Server URL` in Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.". -#### Import Dashboard +### Import Dashboard Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. 
You should see a screen that looks like the following: diff --git a/examples/production_monitoring/docker-compose.yaml b/examples/online_serving/prometheus_grafana/docker-compose.yaml similarity index 100% rename from examples/production_monitoring/docker-compose.yaml rename to examples/online_serving/prometheus_grafana/docker-compose.yaml diff --git a/examples/production_monitoring/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json similarity index 100% rename from examples/production_monitoring/grafana.json rename to examples/online_serving/prometheus_grafana/grafana.json diff --git a/examples/production_monitoring/prometheus.yaml b/examples/online_serving/prometheus_grafana/prometheus.yaml similarity index 100% rename from examples/production_monitoring/prometheus.yaml rename to examples/online_serving/prometheus_grafana/prometheus.yaml diff --git a/examples/run_cluster.sh b/examples/online_serving/run_cluster.sh similarity index 100% rename from examples/run_cluster.sh rename to examples/online_serving/run_cluster.sh diff --git a/examples/sagemaker-entrypoint.sh b/examples/online_serving/sagemaker-entrypoint.sh similarity index 100% rename from examples/sagemaker-entrypoint.sh rename to examples/online_serving/sagemaker-entrypoint.sh diff --git a/examples/logging_configuration.md b/examples/other/logging_configuration.md similarity index 100% rename from examples/logging_configuration.md rename to examples/other/logging_configuration.md diff --git a/examples/tensorize_vllm_model.py b/examples/other/tensorize_vllm_model.py similarity index 96% rename from examples/tensorize_vllm_model.py rename to examples/other/tensorize_vllm_model.py index dd77a4ad0c..5fff1fdf50 100644 --- a/examples/tensorize_vllm_model.py +++ b/examples/other/tensorize_vllm_model.py @@ -25,7 +25,7 @@ https://github.com/coreweave/tensorizer To serialize a model, install vLLM from source, then run something like this from the root level of this repository: -python -m 
examples.tensorize_vllm_model \ +python -m examples.other.tensorize_vllm_model \ --model facebook/opt-125m \ serialize \ --serialized-directory s3://my-bucket \ @@ -45,7 +45,7 @@ providing a `--keyfile` argument. To deserialize a model, you can run something like this from the root level of this repository: -python -m examples.tensorize_vllm_model \ +python -m examples.other.tensorize_vllm_model \ --model EleutherAI/gpt-j-6B \ --dtype float16 \ deserialize \ @@ -63,11 +63,11 @@ shard's rank. Sharded models serialized with this script will be named as model-rank-%03d.tensors For more information on the available arguments for serializing, run -`python -m examples.tensorize_vllm_model serialize --help`. +`python -m examples.other.tensorize_vllm_model serialize --help`. Or for deserializing: -`python -m examples.tensorize_vllm_model deserialize --help`. +`python -m examples.other.tensorize_vllm_model deserialize --help`. Once a model is serialized, tensorizer can be invoked with the `LLM` class directly to load models: @@ -88,7 +88,7 @@ TensorizerConfig arguments desired. In order to see all of the available arguments usable to configure loading with tensorizer that are given to `TensorizerConfig`, run: -`python -m examples.tensorize_vllm_model deserialize --help` +`python -m examples.other.tensorize_vllm_model deserialize --help` under the `tensorizer options` section. 
These can also be used for deserialization in this example script, although `--tensorizer-uri` and diff --git a/examples/template_deepseek_vl2.jinja b/examples/template_deepseek_vl2.jinja new file mode 100644 index 0000000000..fbf3d32009 --- /dev/null +++ b/examples/template_deepseek_vl2.jinja @@ -0,0 +1,23 @@ +{%- if messages[0]['role'] == 'system' -%} + {%- set system_message = messages[0]['content'] -%} + {%- set messages = messages[1:] -%} +{%- else -%} + {% set system_message = '' -%} +{%- endif -%} + +{{ bos_token + system_message }} +{%- for message in messages -%} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} + {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} + {%- endif -%} + + {%- if message['role'] == 'user' -%} + {{ '<|User|>: ' + message['content'] + '\n' }} + {%- elif message['role'] == 'assistant' -%} + {{ '<|Assistant|>: ' + message['content'] + eos_token + '\n' }} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {{ '<|Assistant|>: ' }} +{% endif %} diff --git a/examples/template_pixtral_hf.jinja b/examples/template_pixtral_hf.jinja new file mode 100644 index 0000000000..e94661cb39 --- /dev/null +++ b/examples/template_pixtral_hf.jinja @@ -0,0 +1,38 @@ +{%- if messages[0]["role"] == "system" %} + {%- set system_message = messages[0]["content"] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} +{%- endif %} + +{{- bos_token }} +{%- for message in loop_messages %} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} + {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }} + {%- endif %} + {%- if message["role"] == "user" %} + {%- if loop.last and system_message is defined %} + {{- "[INST]" + system_message + "\n" }} + {%- else %} + {{- "[INST]" }} + {%- endif %} + {%- if message["content"] is not string %} + {%- for chunk in 
message["content"] %} + {%- if chunk["type"] == "text" %} + {{- chunk["text"] }} + {%- elif chunk["type"] == "image" %} + {{- "[IMG]" }} + {%- else %} + {{- raise_exception("Unrecognized content type!") }} + {%- endif %} + {%- endfor %} + {%- else %} + {{- message["content"] }} + {%- endif %} + {{- "[/INST]" }} + {%- elif message["role"] == "assistant" %} + {{- message["content"] + eos_token}} + {%- else %} + {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }} + {%- endif %} +{%- endfor %} diff --git a/format.sh b/format.sh index 0b196de9d0..4bcd0be0c9 100755 --- a/format.sh +++ b/format.sh @@ -1,321 +1,5 @@ -#!/usr/bin/env bash -# YAPF formatter, adapted from ray and skypilot. -# -# Usage: -# # Do work and commit your work. +#!/bin/bash -# # Format files that differ from origin/main. -# bash format.sh - -# # Commit changed files with message 'Run yapf and ruff' -# -# -# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase. -# You are encouraged to run this locally before pushing changes for review. - -# Cause the script to exit if a single command fails -set -eo pipefail - -# this stops git rev-parse from failing if we run this from the .git directory -builtin cd "$(dirname "${BASH_SOURCE:-$0}")" -ROOT="$(git rev-parse --show-toplevel)" -builtin cd "$ROOT" || exit 1 - -check_command() { - if ! 
command -v "$1" &> /dev/null; then - echo "❓❓$1 is not installed, please run \`pip install -r requirements-lint.txt\`" - exit 1 - fi -} - -check_command yapf -check_command ruff -check_command mypy -check_command codespell -check_command isort -check_command clang-format - -YAPF_VERSION=$(yapf --version | awk '{print $2}') -RUFF_VERSION=$(ruff --version | awk '{print $2}') -MYPY_VERSION=$(mypy --version | awk '{print $2}') -CODESPELL_VERSION=$(codespell --version) -ISORT_VERSION=$(isort --vn) -CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') -SPHINX_LINT_VERSION=$(sphinx-lint --version | awk '{print $2}') - -# # params: tool name, tool version, required version -tool_version_check() { - expected=$(grep "$1" requirements-lint.txt | cut -d'=' -f3) - if [[ "$2" != "$expected" ]]; then - echo "❓❓Wrong $1 version installed: $expected is required, not $2." - exit 1 - fi -} - -tool_version_check "yapf" "$YAPF_VERSION" -tool_version_check "ruff" "$RUFF_VERSION" -tool_version_check "mypy" "$MYPY_VERSION" -tool_version_check "isort" "$ISORT_VERSION" -tool_version_check "codespell" "$CODESPELL_VERSION" -tool_version_check "clang-format" "$CLANGFORMAT_VERSION" -tool_version_check "sphinx-lint" "$SPHINX_LINT_VERSION" - -YAPF_FLAGS=( - '--recursive' - '--parallel' -) - -YAPF_EXCLUDES=( - '--exclude' 'build/**' -) - -# Format specified files -format() { - yapf --in-place "${YAPF_FLAGS[@]}" "$@" -} - -# Format files that differ from main branch. Ignores dirs that are not slated -# for autoformat yet. -format_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause yapf to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! 
git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ - yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}" - fi - -} - -# Format all files -format_all() { - yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" . -} - -## This flag formats individual files. --files *must* be the first command line -## arg to use this option. -if [[ "$1" == '--files' ]]; then - format "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is formatted. -elif [[ "$1" == '--all' ]]; then - format_all -else - # Format only the files that changed in last commit. - format_changed -fi -echo 'vLLM yapf: Done' - -# Run mypy -echo 'vLLM mypy:' -tools/mypy.sh -echo 'vLLM mypy: Done' - - -# If git diff returns a file that is in the skip list, the file may be checked anyway: -# https://github.com/codespell-project/codespell/issues/1915 -# Avoiding the "./" prefix and using "/**" globs for directories appears to solve the problem -CODESPELL_EXCLUDES=( - '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**' -) - -# check spelling of specified files -spell_check() { - codespell "$@" -} - -spell_check_all(){ - codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" -} - -# Spelling check of files that differ from main branch. -spell_check_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - if ! 
git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - codespell "${CODESPELL_EXCLUDES[@]}" - fi -} - -# Run Codespell -## This flag runs spell check of individual files. --files *must* be the first command line -## arg to use this option. -if [[ "$1" == '--files' ]]; then - spell_check "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - spell_check_all -else - # Check spelling only of the files that changed in last commit. - spell_check_changed -fi -echo 'vLLM codespell: Done' - - -# Lint specified files -lint() { - ruff check "$@" -} - -# Lint files that differ from main branch. Ignores dirs that are not slated -# for autolint yet. -lint_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - ruff check - fi - -} - -# Run Ruff -### This flag lints individual files. --files *must* be the first command line -### arg to use this option. -if [[ "$1" == '--files' ]]; then - lint "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - lint vllm tests -else - # Format only the files that changed in last commit. - lint_changed -fi -echo 'vLLM ruff: Done' - -# check spelling of specified files -isort_check() { - isort "$@" -} - -isort_check_all(){ - isort . 
-} - -# Spelling check of files that differ from main branch. -isort_check_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - isort - fi -} - -# Run Isort -# This flag runs spell check of individual files. --files *must* be the first command line -# arg to use this option. -if [[ "$1" == '--files' ]]; then - isort_check "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - isort_check_all -else - # Check spelling only of the files that changed in last commit. - isort_check_changed -fi -echo 'vLLM isort: Done' - -# Clang-format section -# Exclude some files for formatting because they are vendored -# NOTE: Keep up to date with .github/workflows/clang-format.yml -CLANG_FORMAT_EXCLUDES=( - 'csrc/moe/topk_softmax_kernels.cu' - 'csrc/quantization/gguf/ggml-common.h' - 'csrc/quantization/gguf/dequantize.cuh' - 'csrc/quantization/gguf/vecdotq.cuh' - 'csrc/quantization/gguf/mmq.cuh' - 'csrc/quantization/gguf/mmvq.cuh' -) - -# Format specified files with clang-format -clang_format() { - clang-format -i "$@" -} - -# Format files that differ from main branch with clang-format. -clang_format_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause clang-format to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that - # exist on both branches. 
- MERGEBASE="$(git merge-base origin/main HEAD)" - - # Get the list of changed files, excluding the specified ones - changed_files=$(git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.h' '*.cpp' '*.cu' '*.cuh' | (grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") || echo -e)) - if [ -n "$changed_files" ]; then - echo "$changed_files" | xargs -P 5 clang-format -i - fi -} - -# Format all files with clang-format -clang_format_all() { - find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ - | grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") \ - | xargs clang-format -i -} - -# Run clang-format -if [[ "$1" == '--files' ]]; then - clang_format "${@:2}" -elif [[ "$1" == '--all' ]]; then - clang_format_all -else - clang_format_changed -fi -echo 'vLLM clang-format: Done' - -echo 'vLLM actionlint:' -tools/actionlint.sh -color -echo 'vLLM actionlint: Done' - -echo 'vLLM shellcheck:' -tools/shellcheck.sh -echo 'vLLM shellcheck: Done' - -echo 'excalidraw png check:' -tools/png-lint.sh -echo 'excalidraw png check: Done' - -if ! git diff --quiet &>/dev/null; then - echo - echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:" - git --no-pager diff --name-only - echo "🔍🔍Format checker passed, but please add, commit and push all the files above to include changes made by the format checker." - - exit 1 -else - echo "✨🎉 Format check passed! Congratulations! 🎉✨" -fi - -echo 'vLLM sphinx-lint:' -tools/sphinx-lint.sh -echo 'vLLM sphinx-lint: Done' +echo "vLLM linting system has been moved from format.sh to pre-commit hook." +echo "Please run 'pip install -r requirements-lint.txt' and 'pre-commit install' to install the pre-commit hook." +echo "Then linters will run automatically before each commit." 
diff --git a/pyproject.toml b/pyproject.toml index 45fa4bff4e..8f2e20d0f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,12 +15,17 @@ build-backend = "setuptools.build_meta" [tool.setuptools_scm] # version_file = "vllm/_version.py" # currently handled by `setup.py:get_version()` +[tool.yapfignore] +ignore_patterns = [ + "build/**", +] + [tool.ruff] # Allow lines to be as long as 80. line-length = 80 exclude = [ # External file, leaving license intact - "examples/fp8/quantizer/quantize.py" + "examples/other/fp8/quantizer/quantize.py" ] [tool.ruff.lint.per-file-ignores] @@ -52,6 +57,9 @@ ignore = [ "B007", # f-string format "UP032", + # Python 3.8 typing + "UP006", "UP035", + ] [tool.mypy] @@ -101,3 +109,9 @@ markers = [ "skip_v1: do not run this test with v1", "optional: optional tests that are automatically skipped, include --optional to run them", ] + +[tool.pymarkdown] +plugins.md013.enabled = false # line-length +plugins.md041.enabled = false # first-line-h1 +plugins.md033.enabled = false # inline-html +plugins.md024.allow_different_nesting = true # no-duplicate-headers diff --git a/python_only_dev.py b/python_only_dev.py index f70b498402..7d95ac96e6 100644 --- a/python_only_dev.py +++ b/python_only_dev.py @@ -7,7 +7,7 @@ VLLM_USE_PRECOMPILED=1 pip install -e . or export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -export VLLM_PRECOMPILED_WHEEL_LOCATION=https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl pip install -e . 
""" # noqa diff --git a/requirements-common.txt b/requirements-common.txt index 6c390bcfd1..7051ca8cb5 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -19,7 +19,7 @@ pillow # Required for image processing prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 -outlines == 0.1.11 # Requires pytorch +outlines == 0.1.11 lark == 1.2.2 xgrammar >= 0.1.6; platform_machine == "x86_64" typing_extensions >= 4.10 @@ -34,6 +34,6 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. -compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch +compressed-tensors == 0.9.0 # required for compressed-tensors depyf==0.18.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py diff --git a/requirements-cpu.txt b/requirements-cpu.txt index e62f313297..056fbf5a7a 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt # Dependencies for CPUs -torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" -torch==2.5.1; platform_machine == "aarch64" +torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin" +torch==2.5.1; platform_machine == "aarch64" or platform_system == "Darwin" torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch -datasets # for benchmark scripts \ No newline at end of file +datasets # for benchmark scripts diff --git a/requirements-hpu.txt 
b/requirements-hpu.txt index f4fb89ef42..63a5f8b18f 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -3,7 +3,7 @@ # Dependencies for HPU code ray -triton +triton==3.1.0 pandas tabulate setuptools>=61 diff --git a/requirements-lint.txt b/requirements-lint.txt index 711bb50a0e..62446f9404 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -1,15 +1,2 @@ # formatting -yapf==0.32.0 -toml==0.10.2 -tomli==2.0.2 -ruff==0.6.5 -codespell==2.3.0 -isort==5.13.2 -clang-format==18.1.5 -sphinx-lint==1.0.0 - -# type checking -mypy==1.11.1 -types-PyYAML -types-requests -types-setuptools +pre-commit==4.0.1 diff --git a/requirements-test.in b/requirements-test.in index fb4179c3d8..bc76a91ad5 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -13,6 +13,7 @@ einops # required for MPT, qwen-vl and Mamba httpx librosa # required for audio tests peft +pqdm ray[adag]==2.40.0 sentence-transformers # required for embedding tests soundfile # required for audio tests @@ -28,4 +29,7 @@ lm-eval[api]==0.4.4 # required for model evaluation test bitsandbytes>=0.45.0 buildkite-test-collector==0.1.9 +genai_perf==0.0.8 +tritonclient==2.51.0 + numpy < 2.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index 3771577fe8..09e009c2e2 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -37,7 +37,7 @@ audioread==3.0.1 # via librosa awscli==1.35.23 # via -r requirements-test.in -bitsandbytes>=0.45.0 +bitsandbytes==0.45.0 # via -r requirements-test.in black==24.10.0 # via datamodel-code-generator @@ -48,6 +48,8 @@ botocore==1.35.57 # awscli # boto3 # s3transfer +bounded-pool-executor==0.0.3 + # via pqdm buildkite-test-collector==0.1.9 # via -r requirements-test.in certifi==2024.8.30 @@ -73,6 +75,8 @@ colorama==0.4.6 # tqdm-multiprocess contourpy==1.3.0 # via matplotlib +cramjam==2.9.0 + # via fastparquet cupy-cuda12x==13.3.0 # via ray cycler==0.12.1 @@ -107,6 +111,8 @@ email-validator==2.2.0 # via pydantic evaluate==0.4.3 # via lm-eval 
+fastparquet==2024.11.0 + # via genai-perf fastrlock==0.8.2 # via cupy-cuda12x filelock==3.16.1 @@ -128,8 +134,11 @@ fsspec[http]==2024.9.0 # via # datasets # evaluate + # fastparquet # huggingface-hub # torch +genai-perf==0.0.8 + # via -r requirements-test.in genson==1.3.0 # via datamodel-code-generator h11==0.14.0 @@ -184,6 +193,8 @@ jsonschema==4.23.0 # ray jsonschema-specifications==2024.10.1 # via jsonschema +kaleido==0.2.1 + # via genai-perf kiwisolver==1.4.7 # via matplotlib lazy-loader==0.4 @@ -198,6 +209,8 @@ lm-eval[api]==0.4.4 # via -r requirements-test.in lxml==5.3.0 # via sacrebleu +markdown-it-py==3.0.0 + # via rich markupsafe==3.0.2 # via jinja2 matplotlib==3.9.2 @@ -207,6 +220,8 @@ mbstrdecoder==1.1.3 # dataproperty # pytablewriter # typepy +mdurl==0.1.2 + # via markdown-it-py mistral-common[opencv]==1.5.1 # via # -r requirements-test.in @@ -247,6 +262,8 @@ numpy==1.26.4 # datasets # decord # evaluate + # fastparquet + # genai-perf # librosa # matplotlib # mistral-common @@ -254,15 +271,18 @@ numpy==1.26.4 # numexpr # opencv-python-headless # pandas + # patsy # peft # rouge-score # sacrebleu # scikit-learn # scipy # soxr + # statsmodels # tensorizer # torchvision # transformers + # tritonclient nvidia-cublas-cu12==12.4.5.8 # via # nvidia-cudnn-cu12 @@ -304,30 +324,39 @@ packaging==24.1 # datamodel-code-generator # datasets # evaluate + # fastparquet # huggingface-hub # lazy-loader # matplotlib # peft + # plotly # pooch # pytest # pytest-rerunfailures # ray + # statsmodels # transformers # typepy pandas==2.2.3 # via # datasets # evaluate + # fastparquet + # genai-perf + # statsmodels pathspec==0.12.1 # via black pathvalidate==3.2.1 # via pytablewriter +patsy==1.0.1 + # via statsmodels peft==0.13.2 # via # -r requirements-test.in # lm-eval pillow==10.4.0 # via + # genai-perf # matplotlib # mistral-common # sentence-transformers @@ -336,12 +365,16 @@ platformdirs==4.3.6 # via # black # pooch +plotly==5.24.1 + # via genai-perf pluggy==1.5.0 # via pytest 
pooch==1.8.2 # via librosa portalocker==2.10.1 # via sacrebleu +pqdm==0.2.0 + # via -r requirements-test.in propcache==0.2.0 # via yarl protobuf==5.28.3 @@ -356,7 +389,9 @@ psutil==6.1.0 py==1.11.0 # via pytest-forked pyarrow==18.0.0 - # via datasets + # via + # datasets + # genai-perf pyasn1==0.6.1 # via rsa pybind11==2.13.6 @@ -369,6 +404,8 @@ pydantic[email]==2.9.2 # mistral-common pydantic-core==2.23.4 # via pydantic +pygments==2.18.0 + # via rich pyparsing==3.2.0 # via matplotlib pytablewriter==1.2.0 @@ -377,14 +414,18 @@ pytest==8.3.3 # via # -r requirements-test.in # buildkite-test-collector + # genai-perf # pytest-asyncio # pytest-forked + # pytest-mock # pytest-rerunfailures # pytest-shard pytest-asyncio==0.24.0 # via -r requirements-test.in pytest-forked==1.6.0 # via -r requirements-test.in +pytest-mock==3.14.0 + # via genai-perf pytest-rerunfailures==14.0 # via -r requirements-test.in pytest-shard==0.1.2 @@ -395,6 +436,8 @@ python-dateutil==2.9.0.post0 # matplotlib # pandas # typepy +python-rapidjson==1.20 + # via tritonclient pytz==2024.2 # via # pandas @@ -405,9 +448,11 @@ pyyaml==6.0.2 # awscli # datamodel-code-generator # datasets + # genai-perf # huggingface-hub # peft # ray + # responses # timm # transformers ray[adag]==2.40.0 @@ -434,8 +479,13 @@ requests==2.32.3 # mistral-common # pooch # ray + # responses # tiktoken # transformers +responses==0.25.3 + # via genai-perf +rich==13.9.4 + # via genai-perf rouge-score==0.1.2 # via lm-eval rpds-py==0.20.1 @@ -466,6 +516,7 @@ scipy==1.13.1 # librosa # scikit-learn # sentence-transformers + # statsmodels sentence-transformers==3.2.1 # via -r requirements-test.in sentencepiece==0.2.0 @@ -486,6 +537,8 @@ soxr==0.5.0.post1 # via librosa sqlitedict==2.1.0 # via lm-eval +statsmodels==0.14.4 + # via genai-perf sympy==1.13.1 # via torch tabledata==1.3.3 @@ -495,7 +548,9 @@ tabulate==0.9.0 tcolorpy==0.1.6 # via pytablewriter tenacity==9.0.0 - # via lm-eval + # via + # lm-eval + # plotly tensorizer==2.9.0 # via 
-r requirements-test.in threadpoolctl==3.5.0 @@ -536,6 +591,7 @@ tqdm-multiprocess==0.0.11 # via lm-eval transformers==4.47.0 # via + # genai-perf # lm-eval # peft # sentence-transformers @@ -544,6 +600,10 @@ transformers-stream-generator==0.0.5 # via -r requirements-test.in triton==3.1.0 # via torch +tritonclient==2.51.0 + # via + # -r requirements-test.in + # genai-perf typepy[datetime]==1.3.2 # via # dataproperty @@ -551,6 +611,7 @@ typepy[datetime]==1.3.2 # tabledata typing-extensions==4.12.2 # via + # bitsandbytes # huggingface-hub # librosa # mistral-common @@ -559,10 +620,12 @@ typing-extensions==4.12.2 # torch tzdata==2024.2 # via pandas -urllib3==1.26.20 +urllib3==2.2.3 # via # botocore # requests + # responses + # tritonclient word2number==1.1 # via lm-eval xxhash==3.5.0 diff --git a/requirements-tpu.txt b/requirements-tpu.txt index b8f0b15469..8ab18b3770 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -18,6 +18,8 @@ ray[default] --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html torch==2.6.0.dev20241126+cpu torchvision==0.20.0.dev20241126+cpu -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl +torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" jaxlib==0.4.36.dev20241122 jax==0.4.36.dev20241122 diff --git a/setup.py b/setup.py index ba6953dbdc..36c89d435c 100644 --- a/setup.py +++ b/setup.py @@ -34,9 +34,14 @@ envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 
'envs.py')) VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE -if not sys.platform.startswith("linux"): +if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu": logger.warning( - "vLLM only supports Linux platform (including WSL). " + "VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS") + VLLM_TARGET_DEVICE = "cpu" +elif not (sys.platform.startswith("linux") + or sys.platform.startswith("darwin")): + logger.warning( + "vLLM only supports Linux platform (including WSL) and MacOS." "Building on %s, " "so vLLM may not be able to run correctly", sys.platform) VLLM_TARGET_DEVICE = "empty" @@ -223,8 +228,11 @@ class cmake_build_ext(build_ext): # CMake appends the extension prefix to the install path, # and outdir already contains that prefix, so we need to remove it. + # We assume only the final component of extension prefix is added by + # CMake, this is currently true for current extensions but may not + # always be the case. prefix = outdir - for i in range(ext.name.count('.')): + if '.' 
in ext.name: prefix = prefix.parent # prefix here should actually be the same for all components @@ -252,7 +260,7 @@ class cmake_build_ext(build_ext): class repackage_wheel(build_ext): """Extracts libraries and other files from an existing wheel.""" - default_wheel = "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + default_wheel = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" def run(self) -> None: wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", @@ -293,9 +301,11 @@ class repackage_wheel(build_ext): files_to_copy = [ "vllm/_C.abi3.so", "vllm/_moe_C.abi3.so", - "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", "vllm/vllm_flash_attn/flash_attn_interface.py", "vllm/vllm_flash_attn/__init__.py", + "vllm/cumem_allocator.abi3.so", # "vllm/_version.py", # not available in nightly wheels yet ] file_members = filter(lambda x: x.filename in files_to_copy, @@ -319,21 +329,26 @@ class repackage_wheel(build_ext): def _is_hpu() -> bool: - is_hpu_available = True + # if VLLM_TARGET_DEVICE env var was set explicitly, skip HPU autodetection + if os.getenv("VLLM_TARGET_DEVICE", None) == VLLM_TARGET_DEVICE: + return VLLM_TARGET_DEVICE == "hpu" + + # if VLLM_TARGET_DEVICE was not set explicitly, check if hl-smi succeeds, + # and if it doesn't, check if habanalabs driver is loaded + is_hpu_available = False try: - subprocess.run(["hl-smi"], capture_output=True, check=True) + out = subprocess.run(["hl-smi"], capture_output=True, check=True) + is_hpu_available = out.returncode == 0 except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): - if not os.path.exists('/dev/accel/accel0') and not os.path.exists( - '/dev/accel/accel_controlD0'): - # last resort... 
+ if sys.platform.startswith("linux"): try: output = subprocess.check_output( 'lsmod | grep habanalabs | wc -l', shell=True) is_hpu_available = int(output) > 0 except (ValueError, FileNotFoundError, PermissionError, subprocess.CalledProcessError): - is_hpu_available = False - return is_hpu_available or VLLM_TARGET_DEVICE == "hpu" + pass + return is_hpu_available def _no_device() -> bool: @@ -462,13 +477,9 @@ def get_gaudi_sw_version(): def get_vllm_version() -> str: - # TODO: Revisit this temporary approach: https://github.com/vllm-project/vllm/issues/9182#issuecomment-2404860236 - try: - version = get_version( - write_to="vllm/_version.py", # TODO: move this to pyproject.toml - ) - except LookupError: - version = "0.0.0" + version = get_version( + write_to="vllm/_version.py", # TODO: move this to pyproject.toml + ) sep = "+" if "+" not in version else "." # dev versions might contain + @@ -543,7 +554,7 @@ def get_requirements() -> List[str]: return resolved_requirements if _no_device(): - requirements = _read_requirements("requirements-cuda.txt") + requirements = _read_requirements("requirements-cpu.txt") elif _is_cuda(): requirements = _read_requirements("requirements-cuda.txt") cuda_major, cuda_minor = torch.version.cuda.split(".") @@ -586,8 +597,9 @@ if _is_hip(): ext_modules.append(CMakeExtension(name="vllm._rocm_C")) if _is_cuda(): - ext_modules.append( - CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c")) + ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C")) + ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C")) + ext_modules.append(CMakeExtension(name="vllm.cumem_allocator")) if _build_custom_ops(): ext_modules.append(CMakeExtension(name="vllm._C")) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 1c2193bb17..31a101e48e 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ 
b/tests/basic_correctness/test_basic_correctness.py @@ -44,7 +44,6 @@ def test_vllm_gc_ed(): assert weak_llm() is None -@pytest.mark.skip_v1 @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"]) @pytest.mark.parametrize("dtype", ["half"]) diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py new file mode 100644 index 0000000000..53f4ef08f3 --- /dev/null +++ b/tests/basic_correctness/test_cumem.py @@ -0,0 +1,112 @@ +import torch + +from vllm import LLM, SamplingParams +from vllm.device_allocator.cumem import CuMemAllocator +from vllm.utils import GiB_bytes + +from ..utils import fork_new_process_for_each_test + + +@fork_new_process_for_each_test +def test_basic_cumem(): + # some tensors from default memory pool + shape = (1024, 1024) + x = torch.empty(shape, device='cuda') + x.zero_() + + # some tensors from custom memory pool + allocator = CuMemAllocator.get_instance() + with allocator.use_memory_pool(): + # custom memory pool + y = torch.empty(shape, device='cuda') + y.zero_() + y += 1 + z = torch.empty(shape, device='cuda') + z.zero_() + z += 2 + + # they can be used together + output = x + y + z + assert torch.allclose(output, torch.ones_like(output) * 3) + + free_bytes = torch.cuda.mem_get_info()[0] + allocator.sleep() + free_bytes_after_sleep = torch.cuda.mem_get_info()[0] + assert free_bytes_after_sleep > free_bytes + allocator.wake_up() + + # they can be used together + output = x + y + z + assert torch.allclose(output, torch.ones_like(output) * 3) + + +@fork_new_process_for_each_test +def test_cumem_with_cudagraph(): + allocator = CuMemAllocator.get_instance() + with allocator.use_memory_pool(): + weight = torch.eye(1024, device='cuda') + with allocator.use_memory_pool(tag="discard"): + cache = torch.empty(1024, 1024, device='cuda') + + def model(x): + out = x @ weight + cache[:out.size(0)].copy_(out) + return out + 1 + + x = torch.empty(128, 1024, 
device='cuda') + + # warmup + model(x) + + # capture cudagraph + model_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(model_graph): + y = model(x) + + free_bytes = torch.cuda.mem_get_info()[0] + allocator.sleep() + free_bytes_after_sleep = torch.cuda.mem_get_info()[0] + assert free_bytes_after_sleep > free_bytes + allocator.wake_up() + + # after waking up, the content in the weight tensor + # should be restored, but the content in the cache tensor + # should be discarded + + # this operation is also compatible with cudagraph + + x.random_() + model_graph.replay() + + # cache content is as expected + assert torch.allclose(x, cache[:x.size(0)]) + + # output content is as expected + assert torch.allclose(y, x + 1) + + +@fork_new_process_for_each_test +def test_end_to_end(): + free, total = torch.cuda.mem_get_info() + used_bytes_baseline = total - free # in case other process is running + llm = LLM("meta-llama/Llama-3.2-1B", enable_sleep_mode=True) + prompt = "How are you?" + sampling_params = SamplingParams(temperature=0, max_tokens=10) + output = llm.generate(prompt, sampling_params) + + # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, + # which is difficult to measure in the test. therefore, we only + # test sleep level 1 here. 
+ llm.sleep(level=1) + + free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline + # now the memory usage is mostly cudagraph memory pool, + # and it should be less than the model weights (1B model, 2GiB weights) + assert used_bytes < 2 * GiB_bytes + + llm.wake_up() + output2 = llm.generate(prompt, sampling_params) + + # cmp output + assert output[0].outputs[0].text == output2[0].outputs[0].text diff --git a/tests/conftest.py b/tests/conftest.py index 917151ddcb..279c1bf9a3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -28,12 +28,13 @@ from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, initialize_model_parallel) from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, - to_enc_dec_tuple_list, zip_enc_dec_prompts) + TokensPrompt, to_enc_dec_tuple_list, + zip_enc_dec_prompts) from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, - identity) + identity, is_list_of) logger = init_logger(__name__) @@ -243,6 +244,7 @@ def video_assets() -> _VideoAssets: _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict) +_R = TypeVar("_R") class HfRunner: @@ -886,6 +888,12 @@ class VllmRunner: beam_width: int, max_tokens: int, ) -> List[Tuple[List[List[int]], List[str]]]: + if is_list_of(prompts, str, check="all"): + prompts = [TextPrompt(prompt=prompt) for prompt in prompts] + else: + prompts = [ + TokensPrompt(prompt_token_ids=tokens) for tokens in prompts + ] outputs = self.model.beam_search( prompts, BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) @@ -923,6 +931,10 @@ class VllmRunner: req_outputs = self.model.score(text_1, text_2) return [req_output.outputs.score for req_output in req_outputs] + def apply_model(self, func: Callable[[nn.Module], _R]) -> 
list[_R]: + executor = self.model.llm_engine.model_executor + return executor.apply_model(func) + def __enter__(self): return self diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index 29ac3a3c86..6642174c17 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -796,6 +796,44 @@ class TestPrefixCachingBlockAllocator: block_hashes=block_hashes_seq1) assert len(cached_blocks) == len(blocks_seq1) - num_evicted_blocks + # Test reset prefix cache + @staticmethod + @pytest.mark.parametrize("num_blocks", [10]) + @pytest.mark.parametrize("block_size", [16]) + def test_reset_prefix_cache(num_blocks: int, block_size: int): + """This test case simulates the case of resetting the prefix cache.""" + + allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, + block_size=block_size) + token_ids = list(range(3 * block_size)) + + first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( + block_size=block_size, + token_ids=token_ids, + allocator=allocator, + ) + second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( + block_size=block_size, + token_ids=token_ids, + allocator=allocator, + ) + + # Free each block in the first chain. + for block in first_chain: + allocator.free(block) + + # Failed to reset prefix cache because some blocks are not freed yet. + assert not allocator.reset_prefix_cache() + assert allocator.get_prefix_cache_hit_rate() > 0.0 + + # Free each block in the second chain. + for block in second_chain: + allocator.free(block) + + # Reset prefix cache. 
+ assert allocator.reset_prefix_cache() + assert allocator.get_prefix_cache_hit_rate() == 0.0 + @staticmethod def create_immutable_chain( block_size: int, diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 86ca1948ef..4072616fd3 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -50,7 +50,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): for sz in test_sizes: for dtype in [torch.float32, torch.float16, torch.bfloat16]: - with graph_capture() as graph_capture_context: + with graph_capture(device=device) as graph_capture_context: # use integers so result matches NCCL exactly inp1 = torch.randint(1, 16, (sz, ), diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 3e9b0e10a1..a8571a1157 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -59,8 +59,7 @@ def worker_fn(): device=get_world_group().device) tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank) - with pynccl_comm.change_state(enable=True): - tensor = pynccl_comm.all_reduce(tensor) + tensor = pynccl_comm.all_reduce(tensor) torch.cuda.synchronize() assert torch.all(tensor == pynccl_comm.world_size).cpu().item() @@ -81,17 +80,16 @@ def multiple_allreduce_worker_fn(): group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1] pynccl_comm = PyNcclCommunicator(group=group, device=device) tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device) - with pynccl_comm.change_state(enable=True): - # two groups can communicate independently - if torch.distributed.get_rank() in [0, 1]: - tensor = pynccl_comm.all_reduce(tensor) - tensor = pynccl_comm.all_reduce(tensor) - torch.cuda.synchronize() - assert torch.all(tensor == 4).cpu().item() - else: - tensor = pynccl_comm.all_reduce(tensor) - torch.cuda.synchronize() - assert torch.all(tensor == 2).cpu().item() + # 
two groups can communicate independently + if torch.distributed.get_rank() in [0, 1]: + tensor = pynccl_comm.all_reduce(tensor) + tensor = pynccl_comm.all_reduce(tensor) + torch.cuda.synchronize() + assert torch.all(tensor == 4).cpu().item() + else: + tensor = pynccl_comm.all_reduce(tensor) + torch.cuda.synchronize() + assert torch.all(tensor == 2).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 4, @@ -107,7 +105,7 @@ def multiple_allreduce_with_vllm_worker_fn(): device = torch.device(f"cuda:{torch.distributed.get_rank()}") ensure_model_parallel_initialized(2, 2) tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device) - with graph_capture(): + with graph_capture(device=device): # two tp groups can communicate independently if torch.distributed.get_rank() in [0, 1]: tensor = tensor_model_parallel_all_reduce(tensor) @@ -137,9 +135,7 @@ def worker_fn_with_cudagraph(): # run something in the default stream to initialize torch engine a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}') torch.cuda.synchronize() - with torch.cuda.graph( - graph, stream=pynccl_comm.stream), pynccl_comm.change_state( - enable=True): + with torch.cuda.graph(graph): a_out = pynccl_comm.all_reduce(a) torch.cuda.synchronize() graph.replay() @@ -168,8 +164,7 @@ def all_gather_worker_fn(): for r in range(world_size) ]).to(device) - with pynccl_comm.change_state(enable=True): - pynccl_comm.all_gather(result, tensor) + pynccl_comm.all_gather(result, tensor) torch.cuda.synchronize() torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) @@ -206,8 +201,7 @@ def reduce_scatter_worker_fn(): expected = sum(tensor[rank * scattered_size:(rank + 1) * scattered_size] for tensor in all_tensors).to(device) - with pynccl_comm.change_state(enable=True): - pynccl_comm.reduce_scatter(result, tensor) + pynccl_comm.reduce_scatter(result, tensor) torch.cuda.synchronize() torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) @@ -234,15 +228,13 @@ def 
send_recv_worker_fn(): else: tensor = torch.empty(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank) - with pynccl_comm.change_state(enable=True): - if pynccl_comm.rank == 0: - pynccl_comm.send(tensor, - dst=(pynccl_comm.rank + 1) % - pynccl_comm.world_size) - else: - pynccl_comm.recv(tensor, - src=(pynccl_comm.rank - 1) % - pynccl_comm.world_size) + + if pynccl_comm.rank == 0: + pynccl_comm.send(tensor, + dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size) + else: + pynccl_comm.recv(tensor, + src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) torch.cuda.synchronize() assert torch.all(tensor == 1).cpu().item() @@ -273,15 +265,12 @@ def multiple_send_recv_worker_fn(): 1024, dtype=torch.float32, device=device) - with pynccl_comm.change_state(enable=True): - if torch.distributed.get_rank() in [0, 1]: - pynccl_comm.send(tensor, - dst=(pynccl_comm.rank + 1) % - pynccl_comm.world_size) - else: - pynccl_comm.recv(tensor, - src=(pynccl_comm.rank - 1) % - pynccl_comm.world_size) + if torch.distributed.get_rank() in [0, 1]: + pynccl_comm.send(tensor, + dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size) + else: + pynccl_comm.recv(tensor, + src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) torch.cuda.synchronize() if torch.distributed.get_rank() in [0, 2]: assert torch.all(tensor == 1).cpu().item() diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py new file mode 100644 index 0000000000..7aa03d7f04 --- /dev/null +++ b/tests/distributed/test_torchrun_example.py @@ -0,0 +1,56 @@ +# unit test for `examples/offline_inference/torchrun_example.py` + +import random + +import torch.distributed as dist + +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import get_world_group + +# Create prompts +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + 
+# set different `gpu_memory_utilization` and `swap_space` for different ranks, +# to test if all ranks agree on the same kv cache configuration. +llm = LLM(model="facebook/opt-125m", + tensor_parallel_size=2, + distributed_executor_backend="external_launcher", + gpu_memory_utilization=random.uniform(0.7, 0.9), + swap_space=random.randint(1, 4)) + +outputs = llm.generate(prompts, sampling_params) + +cpu_group = get_world_group().cpu_group + +torch_rank = dist.get_rank(group=cpu_group) + + +def test_consistent_across_ranks(obj): + if torch_rank == 0: + dist.broadcast_object_list([obj], src=0, group=cpu_group) + else: + container = [None] + dist.broadcast_object_list(container, src=0, group=cpu_group) + assert container[0] == obj + + +test_consistent_across_ranks( + llm.llm_engine.vllm_config.cache_config.num_cpu_blocks) +test_consistent_across_ranks( + llm.llm_engine.vllm_config.cache_config.num_gpu_blocks) + +# all ranks should have the same outputs +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + test_consistent_across_ranks(prompt) + test_consistent_across_ranks(generated_text) + print(f"Rank {torch_rank}, Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py index bbabb936e9..0e33f3662d 100644 --- a/tests/engine/test_custom_executor.py +++ b/tests/engine/test_custom_executor.py @@ -1,12 +1,13 @@ import asyncio import os +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import pytest from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.llm_engine import LLMEngine -from vllm.executor.gpu_executor import GPUExecutor, GPUExecutorAsync +from vllm.executor.uniproc_executor import UniProcExecutor from vllm.sampling_params import SamplingParams @@ -14,21 +15,20 @@ class Mock: ... 
-class CustomGPUExecutor(GPUExecutor): +class CustomUniExecutor(UniProcExecutor): - def execute_model(self, *args, **kwargs): + def collective_rpc(self, + method: Union[str, Callable], + timeout: Optional[float] = None, + args: Tuple = (), + kwargs: Optional[Dict] = None) -> List[Any]: # Drop marker to show that this was ran with open(".marker", "w"): ... - return super().execute_model(*args, **kwargs) + return super().collective_rpc(method, timeout, args, kwargs) -class CustomGPUExecutorAsync(GPUExecutorAsync): - - async def execute_model_async(self, *args, **kwargs): - with open(".marker", "w"): - ... - return await super().execute_model_async(*args, **kwargs) +CustomUniExecutorAsync = CustomUniExecutor @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @@ -41,10 +41,6 @@ def test_custom_executor_type_checking(model): engine_args = AsyncEngineArgs(model=model, distributed_executor_backend=Mock) AsyncLLMEngine.from_engine_args(engine_args) - with pytest.raises(TypeError): - engine_args = AsyncEngineArgs( - model=model, distributed_executor_backend=CustomGPUExecutor) - AsyncLLMEngine.from_engine_args(engine_args) @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @@ -55,7 +51,9 @@ def test_custom_executor(model, tmp_path): assert not os.path.exists(".marker") engine_args = EngineArgs( - model=model, distributed_executor_backend=CustomGPUExecutor) + model=model, + distributed_executor_backend=CustomUniExecutor, + ) engine = LLMEngine.from_engine_args(engine_args) sampling_params = SamplingParams(max_tokens=1) @@ -75,7 +73,7 @@ def test_custom_executor_async(model, tmp_path): assert not os.path.exists(".marker") engine_args = AsyncEngineArgs( - model=model, distributed_executor_backend=CustomGPUExecutorAsync) + model=model, distributed_executor_backend=CustomUniExecutorAsync) engine = AsyncLLMEngine.from_engine_args(engine_args) sampling_params = SamplingParams(max_tokens=1) diff --git a/tests/engine/test_multiproc_workers.py 
b/tests/engine/test_multiproc_workers.py index e07dd6deef..04505fcaae 100644 --- a/tests/engine/test_multiproc_workers.py +++ b/tests/engine/test_multiproc_workers.py @@ -6,16 +6,15 @@ from typing import Any, List, Tuple import pytest +from vllm.config import VllmConfig from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, ResultHandler, WorkerMonitor) +from vllm.worker.worker_base import WorkerWrapperBase -class DummyWorker: +class DummyWorkerWrapper(WorkerWrapperBase): """Dummy version of vllm.worker.worker.Worker""" - def __init__(self, rank: int): - self.rank = rank - def worker_method(self, worker_input: Any) -> Tuple[int, Any]: sleep(0.05) @@ -23,14 +22,15 @@ class DummyWorker: # simulate error case raise worker_input - return self.rank, input + return self.rpc_rank, input def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]: result_handler = ResultHandler() + vllm_config = VllmConfig() workers = [ - ProcessWorkerWrapper(result_handler, partial(DummyWorker, rank=rank)) - for rank in range(8) + ProcessWorkerWrapper(result_handler, DummyWorkerWrapper, vllm_config, + rank) for rank in range(8) ] worker_monitor = WorkerMonitor(workers, result_handler) diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py new file mode 100644 index 0000000000..22473ce275 --- /dev/null +++ b/tests/entrypoints/llm/test_collective_rpc.py @@ -0,0 +1,36 @@ +import pytest + +from vllm import LLM + +from ...utils import fork_new_process_for_each_test + + +@pytest.mark.parametrize("tp_size", [1, 2]) +@pytest.mark.parametrize("backend", ["mp", "ray"]) +@fork_new_process_for_each_test +def test_collective_rpc(tp_size, backend): + if tp_size == 1 and backend == "ray": + pytest.skip("Skip duplicate test case") + if tp_size == 1: + backend = None + + # intentionally define the method and class in the test function, + # to test if they can be serialized and sent to the workers + def echo_rank(self): + 
return self.rank + + from vllm.worker.worker import Worker + + class MyWorker(Worker): + + def echo_rank(self): + return self.rank + + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + load_format="dummy", + tensor_parallel_size=tp_size, + distributed_executor_backend=backend, + worker_cls=MyWorker) + for method in ["echo_rank", echo_rank]: + assert llm.collective_rpc(method) == list(range(tp_size)) diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index 4116380923..3906ad766e 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -105,3 +105,10 @@ def test_multiple_pooling_params(llm: LLM): # pooling_params is None, default params should be applied outputs = llm.encode(PROMPTS, pooling_params=None) assert len(PROMPTS) == len(outputs) + + +@pytest.mark.skip_global_cleanup +def test_right_side_truncation(llm: LLM): + # Embeddings models should truncate the end of the prompt + tokenizer = llm.get_tokenizer() + assert tokenizer.truncation_side == "right" diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py new file mode 100644 index 0000000000..6ff99f6faa --- /dev/null +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -0,0 +1,300 @@ +import asyncio +import json +import shutil +from contextlib import suppress + +import openai # use the official client for correctness check +import pytest +import pytest_asyncio +# downloading lora to test lora requests +from huggingface_hub import snapshot_download + +from ...utils import RemoteOpenAIServer + +# any model with a chat template should work here +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +# technically this needs Mistral-7B-v0.1 as base, but we're not testing +# generation quality here +LORA_NAME = "typeof/zephyr-7b-beta-lora" + +BADREQUEST_CASES = [ + ( + "test_rank", + { + "r": 1024 + }, + "is greater than max_lora_rank", + ), + ( + "test_bias", + { + 
"bias": "all" + }, + "Adapter bias cannot be used without bias_enabled", + ), + ("test_dora", { + "use_dora": True + }, "does not yet support DoRA"), + ( + "test_modules_to_save", + { + "modules_to_save": ["lm_head"] + }, + "only supports modules_to_save being None", + ), +] + + +@pytest.fixture(scope="module") +def zephyr_lora_files(): + return snapshot_download(repo_id=LORA_NAME) + + +@pytest.fixture(scope="module") +def server_with_lora_modules_json(zephyr_lora_files): + # Define the json format LoRA module configurations + lora_module_1 = { + "name": "zephyr-lora", + "path": zephyr_lora_files, + "base_model_name": MODEL_NAME + } + + lora_module_2 = { + "name": "zephyr-lora2", + "path": zephyr_lora_files, + "base_model_name": MODEL_NAME + } + + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + json.dumps(lora_module_1), + json.dumps(lora_module_2), + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "64", + ] + + # Enable the /v1/load_lora_adapter endpoint + envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"} + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server_with_lora_modules_json): + async with server_with_lora_modules_json.get_async_client( + ) as async_client: + yield async_client + + +@pytest.mark.asyncio +async def test_static_lora_lineage(client: openai.AsyncOpenAI, + zephyr_lora_files): + models = await client.models.list() + models = models.data + served_model = models[0] + lora_models = models[1:] + assert served_model.id == MODEL_NAME + assert served_model.root == MODEL_NAME + assert served_model.parent is None + assert all(lora_model.root == zephyr_lora_files + for lora_model in lora_models) + assert all(lora_model.parent == MODEL_NAME for lora_model in 
lora_models) + assert lora_models[0].id == "zephyr-lora" + assert lora_models[1].id == "zephyr-lora2" + + +@pytest.mark.asyncio +async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, + zephyr_lora_files): + + response = await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "zephyr-lora-3", + "lora_path": zephyr_lora_files + }) + # Ensure adapter loads before querying /models + assert "success" in response + + models = await client.models.list() + models = models.data + dynamic_lora_model = models[-1] + assert dynamic_lora_model.root == zephyr_lora_files + assert dynamic_lora_model.parent == MODEL_NAME + assert dynamic_lora_model.id == "zephyr-lora-3" + + +@pytest.mark.asyncio +async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI): + with pytest.raises(openai.NotFoundError): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "notfound", + "lora_path": "/not/an/adapter" + }) + + +@pytest.mark.asyncio +async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, + tmp_path): + invalid_files = tmp_path / "invalid_files" + invalid_files.mkdir() + (invalid_files / "adapter_config.json").write_text("this is not json") + + with pytest.raises(openai.BadRequestError): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "invalid-json", + "lora_path": str(invalid_files) + }) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_name,config_change,expected_error", + BADREQUEST_CASES) +async def test_dynamic_lora_badrequests(client: openai.AsyncOpenAI, tmp_path, + zephyr_lora_files, test_name: str, + config_change: dict, + expected_error: str): + # Create test directory + test_dir = tmp_path / test_name + + # Copy adapter files + shutil.copytree(zephyr_lora_files, test_dir) + + # Load and modify configuration + config_path = test_dir / "adapter_config.json" + with open(config_path) as f: + adapter_config = json.load(f) + # Apply configuration changes + 
adapter_config.update(config_change) + + # Save modified configuration + with open(config_path, "w") as f: + json.dump(adapter_config, f) + + # Test loading the adapter + with pytest.raises(openai.BadRequestError, match=expected_error): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": test_name, + "lora_path": str(test_dir) + }) + + +@pytest.mark.asyncio +async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path, + zephyr_lora_files): + """Validate that many loras can be dynamically registered and inferenced + with concurrently""" + + # This test file configures the server with --max-cpu-loras=2 and this test + # will concurrently load 10 adapters, so it should flex the LRU cache + async def load_and_run_adapter(adapter_name: str): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": adapter_name, + "lora_path": str(zephyr_lora_files) + }) + for _ in range(3): + await client.completions.create( + model=adapter_name, + prompt=["Hello there", "Foo bar bazz buzz"], + max_tokens=5, + ) + + lora_tasks = [] + for i in range(10): + lora_tasks.append( + asyncio.create_task(load_and_run_adapter(f"adapter_{i}"))) + + results, _ = await asyncio.wait(lora_tasks) + + for r in results: + assert not isinstance(r, Exception), f"Got exception {r}" + + +@pytest.mark.asyncio +async def test_loading_invalid_adapters_does_not_break_others( + client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files): + + invalid_files = tmp_path / "invalid_files" + invalid_files.mkdir() + (invalid_files / "adapter_config.json").write_text("this is not json") + + stop_good_requests_event = asyncio.Event() + + async def run_good_requests(client): + # Run chat completions requests until event set + + results = [] + + while not stop_good_requests_event.is_set(): + try: + batch = await client.completions.create( + model="zephyr-lora", + prompt=["Hello there", "Foo bar bazz buzz"], + max_tokens=5, + ) + results.append(batch) + except 
Exception as e: + results.append(e) + + return results + + # Create task to run good requests + good_task = asyncio.create_task(run_good_requests(client)) + + # Run a bunch of bad adapter loads + for _ in range(25): + with suppress(openai.NotFoundError): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "notfound", + "lora_path": "/not/an/adapter" + }) + for _ in range(25): + with suppress(openai.BadRequestError): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "invalid", + "lora_path": str(invalid_files) + }) + + # Ensure all the running requests with lora adapters succeeded + stop_good_requests_event.set() + results = await good_task + for r in results: + assert not isinstance(r, Exception), f"Got exception {r}" + + # Ensure we can load another adapter and run it + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "valid", + "lora_path": zephyr_lora_files + }) + await client.completions.create( + model="valid", + prompt=["Hello there", "Foo bar bazz buzz"], + max_tokens=5, + ) diff --git a/tests/entrypoints/openai/test_lora_lineage.py b/tests/entrypoints/openai/test_lora_lineage.py deleted file mode 100644 index ce4f85c13f..0000000000 --- a/tests/entrypoints/openai/test_lora_lineage.py +++ /dev/null @@ -1,109 +0,0 @@ -import json - -import openai # use the official client for correctness check -import pytest -import pytest_asyncio -# downloading lora to test lora requests -from huggingface_hub import snapshot_download - -from ...utils import RemoteOpenAIServer - -# any model with a chat template should work here -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -# technically this needs Mistral-7B-v0.1 as base, but we're not testing -# generation quality here -LORA_NAME = "typeof/zephyr-7b-beta-lora" - - -@pytest.fixture(scope="module") -def zephyr_lora_files(): - return snapshot_download(repo_id=LORA_NAME) - - -@pytest.fixture(scope="module") -def 
server_with_lora_modules_json(zephyr_lora_files): - # Define the json format LoRA module configurations - lora_module_1 = { - "name": "zephyr-lora", - "path": zephyr_lora_files, - "base_model_name": MODEL_NAME - } - - lora_module_2 = { - "name": "zephyr-lora2", - "path": zephyr_lora_files, - "base_model_name": MODEL_NAME - } - - args = [ - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - # lora config below - "--enable-lora", - "--lora-modules", - json.dumps(lora_module_1), - json.dumps(lora_module_2), - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - "--max-num-seqs", - "64", - ] - - # Enable the /v1/load_lora_adapter endpoint - envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"} - - with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server: - yield remote_server - - -@pytest_asyncio.fixture -async def client_for_lora_lineage(server_with_lora_modules_json): - async with server_with_lora_modules_json.get_async_client( - ) as async_client: - yield async_client - - -@pytest.mark.asyncio -async def test_static_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI, - zephyr_lora_files): - models = await client_for_lora_lineage.models.list() - models = models.data - served_model = models[0] - lora_models = models[1:] - assert served_model.id == MODEL_NAME - assert served_model.root == MODEL_NAME - assert served_model.parent is None - assert all(lora_model.root == zephyr_lora_files - for lora_model in lora_models) - assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models) - assert lora_models[0].id == "zephyr-lora" - assert lora_models[1].id == "zephyr-lora2" - - -@pytest.mark.asyncio -async def test_dynamic_lora_lineage( - client_for_lora_lineage: openai.AsyncOpenAI, zephyr_lora_files): - - response = await client_for_lora_lineage.post("load_lora_adapter", - cast_to=str, - body={ - "lora_name": - "zephyr-lora-3", - "lora_path": 
- zephyr_lora_files - }) - # Ensure adapter loads before querying /models - assert "success" in response - - models = await client_for_lora_lineage.models.list() - models = models.data - dynamic_lora_model = models[-1] - assert dynamic_lora_model.root == zephyr_lora_files - assert dynamic_lora_model.parent == MODEL_NAME - assert dynamic_lora_model.id == "zephyr-lora-3" diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index a803ea4a8d..06e0f93dbe 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -12,6 +12,9 @@ MODEL_NAME = "BAAI/bge-reranker-v2-m3" def server(): args = [ "--enforce-eager", + # Will be used on tests to compare prompt input length + "--max-model-len", + "100" ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -20,8 +23,7 @@ def server(): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_text_1_str_text_2_list(server: RemoteOpenAIServer, - model_name: str): +def test_text_1_str_text_2_list(server: RemoteOpenAIServer, model_name: str): text_1 = "What is the capital of France?" text_2 = [ "The capital of Brazil is Brasilia.", "The capital of France is Paris." @@ -45,8 +47,7 @@ async def test_text_1_str_text_2_list(server: RemoteOpenAIServer, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_text_1_list_text_2_list(server: RemoteOpenAIServer, - model_name: str): +def test_text_1_list_text_2_list(server: RemoteOpenAIServer, model_name: str): text_1 = [ "What is the capital of the United States?", "What is the capital of France?" 
@@ -73,8 +74,7 @@ async def test_text_1_list_text_2_list(server: RemoteOpenAIServer, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_text_1_str_text_2_str(server: RemoteOpenAIServer, - model_name: str): +def test_text_1_str_text_2_str(server: RemoteOpenAIServer, model_name: str): text_1 = "What is the capital of France?" text_2 = "The capital of France is Paris." @@ -91,3 +91,36 @@ async def test_text_1_str_text_2_str(server: RemoteOpenAIServer, assert score.data is not None assert len(score.data) == 1 assert score.data[0].score >= 0.9 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_score_max_model_len(server: RemoteOpenAIServer, model_name: str): + + text_1 = "What is the capital of France?" * 20 + text_2 = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." + ] + + score_response = requests.post(server.url_for("score"), + json={ + "model": model_name, + "text_1": text_1, + "text_2": text_2, + }) + assert score_response.status_code == 400 + # Assert just a small fragments of the response + assert "Please reduce the length of the input." in \ + score_response.text + + # Test truncation + score_response = requests.post(server.url_for("score"), + json={ + "model": model_name, + "text_1": text_1, + "text_2": text_2, + "truncate_prompt_tokens": 101 + }) + assert score_response.status_code == 400 + assert "Please, select a smaller truncation size." 
in \ + score_response.text diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 97248f1150..85f485364a 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -35,6 +35,7 @@ class MockModelConfig: logits_processor_pattern = None diff_sampling_param: Optional[dict] = None allowed_local_media_path: str = "" + encoder_config = None def get_diff_sampling_param(self): return self.diff_sampling_param or {} @@ -51,7 +52,7 @@ async def _async_serving_chat_init(): engine = MockEngine() model_config = await engine.get_model_config() - models = OpenAIServingModels(model_config, BASE_MODEL_PATHS) + models = OpenAIServingModels(engine, model_config, BASE_MODEL_PATHS) serving_completion = OpenAIServingChat(engine, model_config, models, @@ -72,7 +73,8 @@ def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False - models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, model_config=MockModelConfig()) serving_chat = OpenAIServingChat(mock_engine, MockModelConfig(), @@ -115,7 +117,8 @@ def test_serving_chat_could_load_correct_generation_config(): mock_engine.errored = False # Initialize the serving chat - models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, model_config=mock_model_config) serving_chat = OpenAIServingChat(mock_engine, mock_model_config, diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py index 96897dc730..657ea20213 100644 --- a/tests/entrypoints/openai/test_serving_models.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -4,6 +4,7 @@ from unittest.mock import MagicMock import pytest from 
vllm.config import ModelConfig +from vllm.engine.protocol import EngineClient from vllm.entrypoints.openai.protocol import (ErrorResponse, LoadLoraAdapterRequest, UnloadLoraAdapterRequest) @@ -21,13 +22,16 @@ LORA_UNLOADING_SUCCESS_MESSAGE = ( async def _async_serving_models_init() -> OpenAIServingModels: mock_model_config = MagicMock(spec=ModelConfig) + mock_engine_client = MagicMock(spec=EngineClient) # Set the max_model_len attribute to avoid missing attribute mock_model_config.max_model_len = 2048 - serving_models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + serving_models = OpenAIServingModels(engine_client=mock_engine_client, + base_model_paths=BASE_MODEL_PATHS, model_config=mock_model_config, lora_modules=None, prompt_adapters=None) + await serving_models.init_static_loras() return serving_models @@ -113,5 +117,5 @@ async def test_unload_lora_adapter_not_found(): request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter") response = await serving_models.unload_lora_adapter(request) assert isinstance(response, ErrorResponse) - assert response.type == "InvalidUserInput" - assert response.code == HTTPStatus.BAD_REQUEST + assert response.type == "NotFoundError" + assert response.code == HTTPStatus.NOT_FOUND diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py index 6fcc920228..090523a836 100644 --- a/tests/entrypoints/openai/test_shutdown.py +++ b/tests/entrypoints/openai/test_shutdown.py @@ -1,6 +1,3 @@ -import json -import os - import openai import pytest @@ -10,16 +7,7 @@ MODEL_NAME = "meta-llama/Llama-3.2-1B" @pytest.mark.asyncio -async def test_shutdown_on_engine_failure(tmp_path): - # Use a bad adapter to crash the engine - # (This test will fail when that bug is fixed) - adapter_path = tmp_path / "bad_adapter" - os.mkdir(adapter_path) - with open(adapter_path / "adapter_model_config.json", "w") as f: - json.dump({"not": "real"}, f) - with open(adapter_path / "adapter_model.safetensors", 
"wb") as f: - f.write(b"this is fake") - +async def test_shutdown_on_engine_failure(): # dtype, max-len etc set so that this can run in CI args = [ "--dtype", @@ -29,9 +17,6 @@ async def test_shutdown_on_engine_failure(tmp_path): "--enforce-eager", "--max-num-seqs", "128", - "--enable-lora", - "--lora-modules", - f"bad-adapter={tmp_path / 'bad_adapter'}", ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -39,9 +24,13 @@ async def test_shutdown_on_engine_failure(tmp_path): with pytest.raises( (openai.APIConnectionError, openai.InternalServerError)): - # This crashes the engine - await client.completions.create(model="bad-adapter", - prompt="Hello, my name is") + # Asking for lots of prompt logprobs will currently crash the + # engine. This may change in the future when that bug is fixed + prompt = "Hello " * 4000 + await client.completions.create( + model=MODEL_NAME, + prompt=prompt, + extra_body={"prompt_logprobs": 10}) # Now the server should shut down return_code = remote_server.proc.wait(timeout=8) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index d63b963522..513b466c10 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -754,10 +754,12 @@ def test_resolve_content_format_hf_defined(model, expected_format): ("template_chatglm.jinja", "string"), ("template_chatglm2.jinja", "string"), ("template_chatml.jinja", "string"), + ("template_deepseek_vl2.jinja", "string"), ("template_falcon_180b.jinja", "string"), ("template_falcon.jinja", "string"), ("template_inkbot.jinja", "string"), ("template_llava.jinja", "string"), + ("template_pixtral_hf.jinja", "openai"), ("template_vlm2vec.jinja", "openai"), ("tool_chat_template_granite_20b_fc.jinja", "string"), ("tool_chat_template_hermes.jinja", "string"), diff --git a/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json b/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json deleted file mode 100644 index a548f0a961..0000000000 
--- a/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json +++ /dev/null @@ -1,90 +0,0 @@ -{ - "model_type": "llama", - "kv_cache": { - "dtype": "float8_e4m3fn", - "scaling_factor": { - "0": { - "0": 0.0230364128947258, - "1": 0.01979283057153225, - "2": 0.0241350457072258, - "3": 0.0308314748108387, - "4": 0.0430733822286129, - "5": 0.0370396226644516, - "6": 0.0306222103536129, - "7": 0.0357491634786129, - "8": 0.0358189195394516, - "9": 0.0443289652466774, - "10": 0.0433175228536129, - "11": 0.0416782945394516, - "12": 0.0366908498108387, - "13": 0.0432477705180645, - "14": 0.0410505048930645, - "15": 0.0457589291036129, - "16": 0.0418526791036129, - "17": 0.0432477705180645, - "18": 0.0469447560608387, - "19": 0.0514787957072258, - "20": 0.0541294664144516, - "21": 0.0587681382894516, - "22": 0.0625, - "23": 0.0585588738322258, - "24": 0.0600237175822258, - "25": 0.0588030144572258, - "26": 0.0531180277466774, - "27": 0.06396484375, - "28": 0.0603027381002903, - "29": 0.0582101047039032, - "30": 0.0625348836183548, - "31": 0.0585588738322258, - "32": 0.0582798570394516, - "33": 0.0575125589966774, - "34": 0.0590820349752903, - "35": 0.0614188089966774, - "36": 0.0631975457072258, - "37": 0.0615931935608387, - "38": 0.0601283498108387, - "39": 0.0571986623108387, - "40": 0.0670340433716774, - "41": 0.0523507259786129, - "42": 0.0547223798930645, - "43": 0.0631975457072258, - "44": 0.0663713738322258, - "45": 0.0603376142680645, - "46": 0.0652204304933548, - "47": 0.0734514519572258, - "48": 0.0693708211183548, - "49": 0.0725446492433548, - "50": 0.0627790242433548, - "51": 0.0691266804933548, - "52": 0.0688825398683548, - "53": 0.068429134786129, - "54": 0.0605119988322258, - "55": 0.0799386203289032, - "56": 0.0853097140789032, - "57": 0.0661969929933548, - "58": 0.0689871683716774, - "59": 0.0724051371216774, - "60": 0.0541643425822258, - "61": 0.0626743882894516, - "62": 0.0628487765789032, - "63": 0.0607212632894516, - "64": 0.0589076466858387, - "65": 
0.0451660193502903, - "66": 0.0453055277466774, - "67": 0.0414341539144516, - "68": 0.0385044664144516, - "69": 0.0414341539144516, - "70": 0.0466308631002903, - "71": 0.0399693101644516, - "72": 0.0437011756002903, - "73": 0.0434221550822258, - "74": 0.0428989976644516, - "75": 0.0401785746216774, - "76": 0.0431082621216774, - "77": 0.0484444759786129, - "78": 0.0417829267680645, - "79": 0.0418178029358387 - } - } - } -} \ No newline at end of file diff --git a/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json b/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json deleted file mode 100644 index bb734039e9..0000000000 --- a/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "model_type": "llama", - "kv_cache": { - "dtype": "float8_e4m3fn", - "scaling_factor": { - "0": { - "0": 0.0152239128947258, - "1": 0.0188860222697258, - "2": 0.0354178324341774, - "3": 0.0376674123108387, - "4": 0.0418526791036129, - "5": 0.0433175228536129, - "6": 0.0397600457072258, - "7": 0.0424455925822258, - "8": 0.0415387861430645, - "9": 0.0408412404358387, - "10": 0.0395856611430645, - "11": 0.0377371683716774, - "12": 0.0400739423930645, - "13": 0.040771484375, - "14": 0.0393415205180645, - "15": 0.0369001142680645, - "16": 0.03857421875, - "17": 0.0387486070394516, - "18": 0.0403180830180645, - "19": 0.0396205373108387, - "20": 0.0375627800822258, - "21": 0.0407366082072258, - "22": 0.0432477705180645, - "23": 0.0377022884786129, - "24": 0.0399693101644516, - "25": 0.0374581478536129, - "26": 0.0413295216858387, - "27": 0.0442243330180645, - "28": 0.0424804724752903, - "29": 0.0456891767680645, - "30": 0.0409109964966774, - "31": 0.0482352152466774 - } - } - } -} diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index a84501f9c3..dac26efe86 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -6,8 +6,9 @@ import torch from tests.kernels.utils import opcheck from 
vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, - GeluAndMul, NewGELU, - QuickGELU, SiluAndMul) + GeluAndMul, MulAndSilu, + NewGELU, QuickGELU, + SiluAndMul) from vllm.platforms import current_platform from .allclose_default import get_default_atol, get_default_rtol @@ -21,8 +22,9 @@ CUDA_DEVICES = [ ] -@pytest.mark.parametrize("activation", - ["silu", "gelu", "gelu_tanh", "fatrelu"]) +@pytest.mark.parametrize( + "activation", + ["silu_and_mul", "mul_and_silu", "gelu", "gelu_tanh", "fatrelu"]) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @@ -40,9 +42,12 @@ def test_act_and_mul( current_platform.seed_everything(seed) torch.set_default_device(device) x = torch.randn(num_tokens, 2 * d, dtype=dtype) - if activation == "silu": + if activation == "silu_and_mul": layer = SiluAndMul() fn = torch.ops._C.silu_and_mul + if activation == "mul_and_silu": + layer = MulAndSilu() + fn = torch.ops._C.mul_and_silu elif activation == "gelu": layer = GeluAndMul(approximate="none") fn = torch.ops._C.gelu_and_mul @@ -55,8 +60,9 @@ def test_act_and_mul( fn = torch.ops._C.fatrelu_and_mul out = layer(x) ref_out = layer.forward_native(x) - # The SiLU, GELU and FatReLU implementations are equivalent to the native - # PyTorch implementations, so we can do exact comparison. + # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are + # equivalent to the native PyTorch implementations, so we can do exact + # comparison. 
torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0) d = x.shape[-1] // 2 diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 3e3c066819..574a0f223e 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -31,9 +31,9 @@ NUM_GEN_SEQS = [7] # Arbitrary values for testing NUM_PREFILL_SEQS = [3] # Arbitrary values for testing NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing -# FlashAttention forward only supports head dimension at most 128 -# https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62 -HEAD_SIZES = [64, 80, 120, 256] +# This should be sync with get_supported_head_sizes() in +# vllm.attention.ops.paged_attn.PagedAttention +HEAD_SIZES = [32, 64, 80, 96, 112, 120, 128, 192, 256] BLOCK_SIZES = [16, 32] USE_ALIBI = [False, True] @@ -182,7 +182,7 @@ def test_paged_attention( key_cache, value_cache = key_caches[0], value_caches[0] # Using default kv_scale - k_scale = v_scale = 1.0 + k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) # Call the paged attention kernel. 
output = torch.empty_like(query) diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 916cc2efa3..492acb91e8 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -1,10 +1,10 @@ -from unittest.mock import patch +from unittest.mock import Mock, patch import pytest import torch from tests.kernels.utils import override_backend_env_variable -from vllm.attention.selector import which_attn_to_use +from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform from vllm.platforms.openvino import OpenVinoPlatform @@ -12,6 +12,13 @@ from vllm.platforms.rocm import RocmPlatform from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL +@pytest.fixture(autouse=True) +def clear_cache(): + """Clear lru cache to ensure each test case runs without caching. + """ + _cached_get_attn_backend.cache_clear() + + @pytest.mark.parametrize( "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"]) @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"]) @@ -24,67 +31,75 @@ def test_env(name: str, device: str, monkeypatch): if device == "cpu": with patch("vllm.attention.selector.current_platform", CpuPlatform()): - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) - assert backend.name == "TORCH_SDPA" + backend = get_attn_backend(16, torch.float16, torch.float16, 16, + False) + assert backend.get_name() == "TORCH_SDPA" elif device == "hip": with patch("vllm.attention.selector.current_platform", RocmPlatform()): - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) - assert backend.name == "ROCM_FLASH" + backend = get_attn_backend(16, torch.float16, torch.float16, 16, + False) + assert backend.get_name() == "ROCM_FLASH" elif device == "openvino": with patch("vllm.attention.selector.current_platform", - 
OpenVinoPlatform()): - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) - assert backend.name == "OPENVINO" + OpenVinoPlatform()), patch.dict('sys.modules', + {'openvino': Mock()}): + backend = get_attn_backend(16, torch.float16, torch.float16, 16, + False) + assert backend.get_name() == "OPENVINO" else: - with patch("vllm.attention.selector.current_platform", CudaPlatform()): - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) - assert backend.name == name + if name in ["XFORMERS", "FLASHINFER"]: + with patch("vllm.attention.selector.current_platform", + CudaPlatform()): + backend = get_attn_backend(16, torch.float16, torch.float16, + 16, False) + assert backend.get_name() == name def test_flash_attn(monkeypatch): """Test FlashAttn validation.""" # TODO: When testing for v1, pipe in `use_v1` as an argument to - # which_attn_to_use + # get_attn_backend override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL) # Unsupported CUDA arch with patch("torch.cuda.get_device_capability", return_value=(7, 5)): - backend = which_attn_to_use(16, torch.float16, None, 16, False) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # Unsupported data type - backend = which_attn_to_use(16, torch.float8_e4m3fn, None, 16, False) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # Unsupported kv cache data type - backend = which_attn_to_use(16, torch.float16, "fp8", 16, False) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float16, "fp8", 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # Unsupported block size - backend = which_attn_to_use(16, torch.float16, None, 8, False) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float16, 
None, 8, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # flash-attn is not installed with patch.dict('sys.modules', {'vllm_flash_attn': None}): - backend = which_attn_to_use(16, torch.float16, None, 16, False) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # Unsupported head size - backend = which_attn_to_use(17, torch.float16, None, 16, False) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(17, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # Attention-free models should bypass env and use PlaceholderAttention - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, True) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) + assert backend.get_name() != STR_FLASH_ATTN_VAL def test_invalid_env(monkeypatch): - """Throw an exception if the backend name is invalid.""" + """Ignore the invalid env variable if it is set.""" override_backend_env_variable(monkeypatch, STR_INVALID_VAL) - with pytest.raises(ValueError): - which_attn_to_use(16, torch.float16, None, 16, False) + with patch("vllm.attention.selector.current_platform", CudaPlatform()): + backend = get_attn_backend(32, torch.float16, None, 16, False) + assert backend.get_name() == "FLASH_ATTN" + + # when block size == 16, backend will fall back to XFORMERS + backend = get_attn_backend(16, torch.float16, None, 16, False) + assert backend.get_name() == "XFORMERS" diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index fad342d1b5..08f31219e3 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -210,7 +210,7 @@ def test_paged_attention( key_cache, value_cache = key_caches[0], value_caches[0] # Using default kv_scale - k_scale = v_scale = 1.0 + k_scale = 
v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) tp_rank = 0 # Call the paged attention kernel. diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 40550ed51e..c848be4f9d 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -160,7 +160,7 @@ def test_reshape_and_cache( cloned_value_cache = value_cache.clone() # Using default kv_scale - k_scale = v_scale = 1.0 + k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) # Call the reshape_and_cache kernel. opcheck(torch.ops._C_cache_ops.reshape_and_cache, @@ -258,8 +258,8 @@ def test_reshape_and_cache_flash( del key_caches del value_caches - k_scale = key.amax().item() / 256 - v_scale = value.amax().item() / 256 + k_scale = (key.amax() / 256.0).to(torch.float32) + v_scale = (value.amax() / 256.0).to(torch.float32) # Clone the KV caches. if kv_cache_dtype == "fp8": @@ -284,12 +284,12 @@ def test_reshape_and_cache_flash( result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) ops.convert_fp8(result_key_cache, key_cache, - k_scale, + k_scale.item(), kv_dtype=kv_cache_dtype) result_value_cache = torch.empty_like(value_cache, dtype=torch.float16) ops.convert_fp8(result_value_cache, value_cache, - v_scale, + v_scale.item(), kv_dtype=kv_cache_dtype) # Run the reference implementation. 
diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/test_cascade_flash_attn.py index 45ec6df4e7..00eb927205 100644 --- a/tests/kernels/test_cascade_flash_attn.py +++ b/tests/kernels/test_cascade_flash_attn.py @@ -78,6 +78,7 @@ CASES = [ @pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("soft_cap", [None, 50]) @pytest.mark.parametrize("num_blocks", [2048]) +@pytest.mark.parametrize("fa_version", [2, 3]) @torch.inference_mode() def test_cascade( seq_lens_and_common_prefix: Tuple[List[Tuple[int, int]], int], @@ -87,8 +88,14 @@ def test_cascade( block_size: int, soft_cap: Optional[float], num_blocks: int, + fa_version: int, ) -> None: torch.set_default_device("cuda") + if fa_version == 3 and (torch.cuda.get_device_capability() == (8, 6) + or torch.cuda.get_device_capability() == (8, 9)): + pytest.skip("Flash attention version 3 fails on 8.6 and 8.9 due to " + "insufficient shared memory for some shapes") + current_platform.seed_everything(0) window_size = (-1, -1) @@ -118,9 +125,7 @@ def test_cascade( cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(dim=0, dtype=torch.int32) - cu_kv_lens = torch.tensor([0] + kv_lens, - dtype=torch.int32).cumsum(dim=0, - dtype=torch.int32) + kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32) max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size block_tables = torch.randint(0, num_blocks, @@ -140,7 +145,7 @@ def test_cascade( k=key_cache, v=value_cache, cu_seqlens_q=cu_query_lens, - cu_seqlens_k=cu_kv_lens, + seqused_k=kv_lens_tensor, max_seqlen_q=max_query_len, max_seqlen_k=max_kv_len, softmax_scale=scale, @@ -154,10 +159,8 @@ def test_cascade( assert all(common_prefix_len < kv_len for kv_len in kv_lens) cu_prefix_query_lens = torch.tensor([0, total_num_query_tokens], dtype=torch.int32) - cu_prefix_kv_lens = torch.tensor([0, common_prefix_len], dtype=torch.int32) - cu_suffix_kv_lens = ( - cu_kv_lens - - torch.arange(num_seqs + 1, 
dtype=torch.int32) * common_prefix_len) + prefix_kv_lens = torch.tensor([common_prefix_len], dtype=torch.int32) + suffix_kv_lens = kv_lens_tensor - common_prefix_len output = torch.empty_like(query) cascade_attention( output=output, @@ -167,8 +170,8 @@ def test_cascade( cu_query_lens=cu_query_lens, max_query_len=max_query_len, cu_prefix_query_lens=cu_prefix_query_lens, - cu_prefix_kv_lens=cu_prefix_kv_lens, - cu_suffix_kv_lens=cu_suffix_kv_lens, + prefix_kv_lens=prefix_kv_lens, + suffix_kv_lens=suffix_kv_lens, max_kv_len=max_kv_len, softmax_scale=scale, alibi_slopes=None, @@ -176,6 +179,7 @@ def test_cascade( logits_soft_cap=soft_cap if soft_cap is not None else 0, block_table=block_tables, common_prefix_len=common_prefix_len, + fa_version=fa_version, ) # Compare the results. diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index d943b048b7..e008a56de6 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -13,8 +13,7 @@ import pytest import torch from tests.kernels.utils import * -from vllm.attention import (Attention, AttentionBackend, AttentionMetadata, - AttentionType) +from vllm.attention import Attention, AttentionMetadata, AttentionType from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) @@ -64,6 +63,7 @@ class TestPoint(NamedTuple): max_dec_seq_len: int max_enc_seq_len: int num_blocks: int + attn_type: AttentionType class TestResources(NamedTuple): @@ -96,7 +96,6 @@ class TestResources(NamedTuple): ''' scale: float - attn_backend: AttentionBackend attn: Attention kv_cache: torch.Tensor @@ -129,26 +128,33 @@ def _make_test_resources(test_pt: TestPoint, ) -> TestResources: ''' scale = float(1.0 / (test_pt.head_size**0.5)) - attn_backend = make_backend(test_pt.backend_name) attn = Attention( test_pt.num_heads, 
test_pt.head_size, scale=scale, + prefix=f"{test_pt.attn_type}", + attn_type=test_pt.attn_type, ) if test_pt.num_blocks is None or test_pt.num_heads is None: # Caller does not require a KV cache return TestResources( - scale, attn_backend, attn, + scale, attn, torch.tensor([], dtype=torch.float32, device=CUDA_DEVICE)) # Construct KV cache - kv_cache = make_kv_cache(test_pt.num_blocks, - test_pt.num_heads, - test_pt.head_size, - test_pt.block_size, - device=CUDA_DEVICE, - backend=test_pt.backend_name) - return TestResources(scale, attn_backend, attn, kv_cache) + if test_pt.attn_type in (AttentionType.DECODER, + AttentionType.ENCODER_DECODER): + kv_cache = make_kv_cache(test_pt.num_blocks, + test_pt.num_heads, + test_pt.head_size, + test_pt.block_size, + device=CUDA_DEVICE, + backend=test_pt.backend_name) + else: + kv_cache = torch.tensor([]) + + attn.kv_cache = [kv_cache] + return TestResources(scale, attn, kv_cache) def _encoder_attn_setup( @@ -193,6 +199,7 @@ def _encoder_attn_setup( _, max_q_seq_len, _, + _, ) = test_pt scale = test_rsrcs.scale @@ -301,6 +308,7 @@ def _decoder_attn_setup( max_q_seq_len, _, _, + _, ) = test_pt scale = test_rsrcs.scale @@ -488,6 +496,7 @@ def _enc_dec_cross_attn_setup_reuses_query( max_decoder_seq_len, max_encoder_seq_len, _, + _, ) = test_pt scale = test_rsrcs.scale @@ -622,7 +631,6 @@ def _run_encoder_attention_test( & attn_metadata ''' assert attn_metadata.num_decode_tokens == 0 - attn_type = AttentionType.ENCODER packed_qkv = encoder_test_params.packed_qkvo.packed_qkv assert packed_qkv is not None with set_forward_context(attn_metadata, vllm_config): @@ -635,14 +643,11 @@ def _run_encoder_attention_test( # is shaped as [num_tokens, hidden_size] and we can skip the reshape. 
reshaped_query = packed_qkv.query.view( -1, test_pt.num_heads * test_pt.head_size) - return attn.forward(reshaped_query, - packed_qkv.key, - packed_qkv.value, - torch.tensor([], - dtype=torch.float32, - device=packed_qkv.query.device), - attn_metadata, - attn_type=attn_type) + return attn.forward( + reshaped_query, packed_qkv.key, packed_qkv.value, + torch.tensor([], + dtype=torch.float32, + device=packed_qkv.query.device), attn_metadata) def _run_decoder_self_attention_test( @@ -675,7 +680,6 @@ def _run_decoder_self_attention_test( * Attention.forward() applied to packed_{query,key,value}, kv_cache & attn_metadata ''' - attn_type = AttentionType.DECODER attn = test_rsrcs.attn kv_cache = test_rsrcs.kv_cache packed_qkv = decoder_test_params.packed_qkvo.packed_qkv @@ -690,12 +694,8 @@ def _run_decoder_self_attention_test( # is shaped as [num_tokens, hidden_size] and we can skip the reshape. reshaped_query = packed_qkv.query.view( -1, test_pt.num_heads * test_pt.head_size) - return attn.forward(reshaped_query, - packed_qkv.key, - packed_qkv.value, - kv_cache, - attn_metadata, - attn_type=attn_type) + return attn.forward(reshaped_query, packed_qkv.key, packed_qkv.value, + kv_cache, attn_metadata) def _run_encoder_decoder_cross_attention_test( @@ -742,7 +742,6 @@ def _run_encoder_decoder_cross_attention_test( ''' assert decoder_test_params.packed_qkvo.packed_qkv is not None - attn_type = AttentionType.ENCODER_DECODER attn = test_rsrcs.attn kv_cache = test_rsrcs.kv_cache if cross_test_params is None: @@ -762,12 +761,8 @@ def _run_encoder_decoder_cross_attention_test( # is shaped as [num_tokens, hidden_size] and we can skip the reshape. 
reshaped_query = decoder_test_params.packed_qkvo.packed_qkv.query.view( -1, test_pt.num_heads * test_pt.head_size) - return attn.forward(reshaped_query, - key, - value, - kv_cache, - attn_metadata, - attn_type=attn_type) + return attn.forward(reshaped_query, key, value, kv_cache, + attn_metadata) @pytest.fixture(autouse=True) @@ -839,7 +834,7 @@ def test_encoder_only( # is not part of this test test_pt = TestPoint(num_heads, head_size, attn_backend.name, batch_size, block_size, max_dec_seq_len, - max_enc_seq_len, 4096) + max_enc_seq_len, 4096, AttentionType.ENCODER) # Attention scale factor, attention backend instance, attention wrapper # instance, KV cache init @@ -855,7 +850,7 @@ def test_encoder_only( # Shared prefill metadata structure prephase_attn_metadata: AttentionMetadata = make_test_metadata( - test_rsrcs.attn_backend, + attn_backend, True, None, decoder_test_params=None, @@ -961,20 +956,29 @@ def test_e2e_enc_dec_attn( # Note: KV cache size of 4096 is arbitrary & chosen intentionally # to be more than necessary, since exceeding the kv cache size # is not part of this test - test_pt = TestPoint(num_heads, head_size, attn_backend.name, - batch_size, block_size, max_dec_seq_len, - max_enc_seq_len, 4096) + enc_test_pt = TestPoint(num_heads, head_size, attn_backend.name, + batch_size, block_size, max_dec_seq_len, + max_enc_seq_len, 4096, AttentionType.ENCODER) + enc_dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name, + batch_size, block_size, max_dec_seq_len, + max_enc_seq_len, 4096, + AttentionType.ENCODER_DECODER) + dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name, + batch_size, block_size, max_dec_seq_len, + max_enc_seq_len, 4096, AttentionType.DECODER) # Attention scale factor, attention backend instance, attention wrapper # instance, KV cache init vllm_config = VllmConfig() with set_current_vllm_config(vllm_config): - test_rsrcs = _make_test_resources(test_pt) + enc_test_rsrcs = _make_test_resources(enc_test_pt) + 
enc_dec_test_rsrcs = _make_test_resources(enc_dec_test_pt) + dec_test_rsrcs = _make_test_resources(dec_test_pt) # Construct encoder attention test params (only used # during prefill) - enc_test_params = _encoder_attn_setup(test_pt, test_rsrcs) + enc_test_params = _encoder_attn_setup(enc_test_pt, enc_test_rsrcs) # Construct Decoder self-attention prefill-phase & decode-phase # test params, including query/key/value tensors, decoder self-attention @@ -987,7 +991,7 @@ def test_e2e_enc_dec_attn( prephase_dec_test_params, decphase_dec_test_params, cross_block_base_addr, - ) = _decoder_attn_setup(test_pt, test_rsrcs) + ) = _decoder_attn_setup(dec_test_pt, dec_test_rsrcs) # Construct encoder/decoder cross-attention prefill-phase # & decode-phase test params, including key/value tensors, @@ -1000,14 +1004,14 @@ def test_e2e_enc_dec_attn( dec_qkv, enc_test_params, prephase_dec_test_params, - test_pt, - test_rsrcs, + enc_dec_test_pt, + enc_dec_test_rsrcs, block_base_addr=cross_block_base_addr) # Shared prefill metadata structure assert prephase_dec_test_params.packed_qkvo.packed_qkv is not None prephase_attn_metadata: AttentionMetadata = make_test_metadata( - test_rsrcs.attn_backend, + attn_backend, True, prephase_dec_test_params.packed_qkvo.packed_qkv.q_seq_lens, decoder_test_params=prephase_dec_test_params, @@ -1017,10 +1021,10 @@ def test_e2e_enc_dec_attn( # PREFILL: encoder attention - enc_pckd_act_out = _run_encoder_attention_test(test_rsrcs.attn, + enc_pckd_act_out = _run_encoder_attention_test(enc_test_rsrcs.attn, enc_test_params, prephase_attn_metadata, - test_pt=test_pt, + test_pt=enc_test_pt, vllm_config=vllm_config) # - Is encoder attention result correct? 
@@ -1030,10 +1034,10 @@ def test_e2e_enc_dec_attn( # PREFILL: decoder self-attention test prephase_dec_pckd_act_out = _run_decoder_self_attention_test( - test_rsrcs, + dec_test_rsrcs, prephase_dec_test_params, prephase_attn_metadata, - test_pt=test_pt, + test_pt=dec_test_pt, vllm_config=vllm_config) # - Is prefill decoder self-attention correct? @@ -1044,11 +1048,11 @@ def test_e2e_enc_dec_attn( # PREFILL: encoder/decoder cross-attention test prephase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - test_rsrcs, + enc_dec_test_rsrcs, prephase_dec_test_params, prephase_cross_test_params, prephase_attn_metadata, - test_pt=test_pt, + test_pt=enc_dec_test_pt, vllm_config=vllm_config) # - Is prefill encoder/decoder cross-attention correct? @@ -1059,7 +1063,7 @@ def test_e2e_enc_dec_attn( # DECODE: build decode-phase attention metadata decphase_attn_metadata: AttentionMetadata = make_test_metadata( - test_rsrcs.attn_backend, + attn_backend, False, dec_qkv.q_seq_lens, decoder_test_params=decphase_dec_test_params, @@ -1070,10 +1074,10 @@ def test_e2e_enc_dec_attn( # DECODE: decoder self-attention test decphase_dec_pckd_act_out = _run_decoder_self_attention_test( - test_rsrcs, + dec_test_rsrcs, decphase_dec_test_params, decphase_attn_metadata, - test_pt=test_pt, + test_pt=dec_test_pt, vllm_config=vllm_config) # - Is decode-phase decoder self-attention correct? @@ -1084,11 +1088,11 @@ def test_e2e_enc_dec_attn( # DECODE: encoder/decoder cross-attention test decphase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - test_rsrcs, + enc_dec_test_rsrcs, decphase_dec_test_params, None, decphase_attn_metadata, - test_pt=test_pt, + test_pt=enc_dec_test_pt, vllm_config=vllm_config) # - Is decode-phase encoder/decoder cross-attention correct? 
diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index 1ae78d7b46..b22153c86b 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -80,6 +80,7 @@ def ref_paged_attn( @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0]) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("sliding_window", [None, 256]) +@pytest.mark.parametrize("fa_version", [2, 3]) @torch.inference_mode() def test_flash_attn_with_paged_kv( use_out: bool, @@ -91,8 +92,14 @@ def test_flash_attn_with_paged_kv( soft_cap: Optional[float], num_blocks: int, sliding_window: Optional[int], + fa_version: int, ) -> None: torch.set_default_device("cuda") + if fa_version == 3 and (torch.cuda.get_device_capability() == (8, 6) + or torch.cuda.get_device_capability() == (8, 9)): + pytest.skip("Flash attention version 3 fails on 8.6 and 8.9 due to " + "insufficient shared memory for some shapes") + current_platform.seed_everything(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] @@ -131,6 +138,7 @@ def test_flash_attn_with_paged_kv( cache_seqlens=kv_lens_tensor, softcap=soft_cap if soft_cap is not None else 0, window_size=window_size, + fa_version=fa_version, ) output = output if not use_out else out output = output.squeeze(1) @@ -159,6 +167,7 @@ def test_flash_attn_with_paged_kv( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0]) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) +@pytest.mark.parametrize("fa_version", [2, 3]) @torch.inference_mode() def test_varlen_with_paged_kv( use_out: bool, @@ -170,8 +179,14 @@ def test_varlen_with_paged_kv( block_size: int, soft_cap: Optional[float], num_blocks: int, + fa_version: int, ) -> None: torch.set_default_device("cuda") + if fa_version == 3 and (torch.cuda.get_device_capability() == (8, 6) + or torch.cuda.get_device_capability() == (8, 9)): + pytest.skip("Flash attention version 3 fails on 8.6 and 8.9 due to " + 
"insufficient shared memory for some shapes") + current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] @@ -198,9 +213,7 @@ def test_varlen_with_paged_kv( cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(dim=0, dtype=torch.int32) - cu_kv_lens = torch.tensor([0] + kv_lens, - dtype=torch.int32).cumsum(dim=0, - dtype=torch.int32) + kv_lens = torch.tensor(kv_lens, dtype=torch.int32) max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size block_tables = torch.randint(0, @@ -215,7 +228,7 @@ def test_varlen_with_paged_kv( v=value_cache, out=out, cu_seqlens_q=cu_query_lens, - cu_seqlens_k=cu_kv_lens, + seqused_k=kv_lens, max_seqlen_q=max_query_len, max_seqlen_k=max_kv_len, softmax_scale=scale, @@ -223,6 +236,7 @@ def test_varlen_with_paged_kv( window_size=window_size, block_table=block_tables, softcap=soft_cap if soft_cap is not None else 0, + fa_version=fa_version, ) output = output if not use_out else out diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 8b23b62826..7fa5de1984 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -14,6 +14,8 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, moe_align_block_size) +from vllm.model_executor.layers.fused_moe.moe_torch_iterative import ( + fused_moe as iterative_moe) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( marlin_quantize) from vllm.model_executor.models.mixtral import MixtralMoE @@ -46,6 +48,11 @@ def test_fused_moe( triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False) torch_output = torch_moe(a, w1, w2, score, topk) torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0) + iterative_output = iterative_moe(a, w1, w2, score, topk, renormalize=False) + torch.testing.assert_close(iterative_output, + torch_output, + 
atol=2e-2, + rtol=0) @pytest.mark.parametrize("dtype", diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 3fdb7996ba..10e73ab950 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -138,6 +138,7 @@ def test_contexted_kv_attention( # to V_cache[num_blocks, num_kv_heads, head_size, block_size] v_cache = v_cache.view(-1, block_size, num_kv_heads, head_size).permute(0, 2, 3, 1).contiguous() + k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) # Warm up the Triton kernel by calling it once before actually measuring # generation time @@ -153,6 +154,8 @@ def test_contexted_kv_attention( b_seq_len, b_ctx_len, max_input_len, + k_scale, + v_scale, sliding_window=sliding_window) torch.cuda.synchronize() start_time = time.time() @@ -168,6 +171,8 @@ def test_contexted_kv_attention( b_seq_len, b_ctx_len, max_input_len, + k_scale, + v_scale, sliding_window=sliding_window) torch.cuda.synchronize() end_time = time.time() @@ -366,6 +371,7 @@ def test_contexted_kv_attention_alibi( # to V_cache[num_blocks, num_kv_heads, head_size, block_size] v_cache = v_cache.view(-1, block_size, num_kv_heads, head_size).permute(0, 2, 3, 1).contiguous() + k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) # Warm up the Triton kernel by calling it once before actually measuring # generation time @@ -381,6 +387,8 @@ def test_contexted_kv_attention_alibi( b_seq_len, b_ctx_len, max_input_len, + k_scale, + v_scale, alibi_slopes=alibi_slopes) torch.cuda.synchronize() start_time = time.time() @@ -396,6 +404,8 @@ def test_contexted_kv_attention_alibi( b_seq_len, b_ctx_len, max_input_len, + k_scale, + v_scale, alibi_slopes=alibi_slopes) torch.cuda.synchronize() end_time = time.time() diff --git a/tests/kernels/test_triton_scaled_mm.py b/tests/kernels/test_triton_scaled_mm.py index 8e96a2f70d..a5aab3c2ea 100644 --- a/tests/kernels/test_triton_scaled_mm.py +++ 
b/tests/kernels/test_triton_scaled_mm.py @@ -39,6 +39,23 @@ def get_8bit_types(): return types +# This test is to check regressions for int8 support on ROCm. +@pytest.mark.parametrize("model_path", [ + "neuralmagic/Llama-3.2-1B-quantized.w8a8", +]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [10]) +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="Should only run on ROCm") +def test_rocm_compressed_tensors_w8a8(vllm_runner, example_prompts, model_path, + max_tokens, num_logprobs): + dtype = "bfloat16" + + with vllm_runner(model_path, dtype=dtype) as vllm_model: + vllm_model.generate_greedy_logprobs(example_prompts, max_tokens, + num_logprobs) + + @pytest.mark.parametrize("M", [1, 33, 64, 512]) @pytest.mark.parametrize("N", [256, 971, 20486]) @pytest.mark.parametrize("K", [128, 496, 1024]) diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index e7865fb250..8011398551 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -13,6 +13,7 @@ from torch._prims_common import TensorLikeType from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType from vllm.model_executor.layers.activation import SiluAndMul +from vllm.platforms.interface import _Backend from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_XFORMERS_ATTN_VAL, make_tensor_with_pad) @@ -790,7 +791,7 @@ def make_block_tables_slot_mapping( def make_test_metadata( - attn_backend: AttentionBackend, + attn_backend: _Backend, is_prompt: bool, seq_lens: Optional[List[int]], decoder_test_params: Optional[PhaseTestParameters], @@ -815,7 +816,7 @@ def make_test_metadata( Arguments: - * attn_backend: Backend for sourcing attention kernels + * attn_backend_name: Backend for sourcing attention kernels * is_prompt: prefill if True, o/w decode * seq_lens: list of token counts for each sequence * decoder_test_params: decoder self-attention test params; @@ -882,6 +883,8 @@ def make_test_metadata( # (kv_mmap) 
cross_kv_mmap = cross_test_params.kv_mmap + attn_backend_obj = make_backend(attn_backend.name) + if is_prompt: # Prefill-phase scenario @@ -902,11 +905,11 @@ def make_test_metadata( context_lens, encoder_seq_lens, device=device) - - return attn_backend.make_metadata( + return attn_backend_obj.make_metadata( num_prefills=num_prefills, slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping), multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=True, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, @@ -952,10 +955,11 @@ def make_test_metadata( encoder_seq_lens, device=device) - return attn_backend.make_metadata( + return attn_backend_obj.make_metadata( num_prefills=num_prefills, slot_mapping=kv_mmap.slot_mapping, multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=True, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 4beba4dc05..1cc1ced996 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -22,13 +22,13 @@ def test_run(my_rank, pipe): x2 = pipe.recv_tensor() print(f"rank {my_rank} received x2 = ", x2) y2 = pipe.recv_tensor() - print(f"rank {my_rank} received y2 = ", x2) + print(f"rank {my_rank} received y2 = ", y2) else: x2 = pipe.recv_tensor() print(f"rank {my_rank} received x2 = ", x2) y2 = pipe.recv_tensor() - print(f"rank {my_rank} received y2 = ", x2) + print(f"rank {my_rank} received y2 = ", y2) pipe.send_tensor(x) print(f"rank {my_rank} sent tensor x") pipe.send_tensor(y) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 57ebaa424f..e7378d0076 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -21,6 +21,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.sampler import Sampler from 
vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader import get_model +from vllm.platforms import current_platform class ContextIDInfo(TypedDict): @@ -65,13 +66,16 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): @pytest.fixture def dist_init(): temp_file = tempfile.mkstemp()[1] - init_distributed_environment( - world_size=1, - rank=0, - distributed_init_method=f"file://{temp_file}", - local_rank=0, - backend="nccl", - ) + + backend = "nccl" + if current_platform.is_cpu(): + backend = "gloo" + + init_distributed_environment(world_size=1, + rank=0, + distributed_init_method=f"file://{temp_file}", + local_rank=0, + backend=backend) initialize_model_parallel(1, 1) yield cleanup_dist_env_and_memory(shutdown_ray=True) @@ -81,13 +85,15 @@ def dist_init(): def dist_init_torch_only(): if torch.distributed.is_initialized(): return + backend = "nccl" + if current_platform.is_cpu(): + backend = "gloo" + temp_file = tempfile.mkstemp()[1] - torch.distributed.init_process_group( - backend="nccl", - world_size=1, - rank=0, - init_method=f"file://{temp_file}", - ) + torch.distributed.init_process_group(world_size=1, + rank=0, + init_method=f"file://{temp_file}", + backend=backend) @pytest.fixture diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index fb8c0b2a7b..08a589d7ee 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -48,10 +48,14 @@ TOLERANCES = { torch.float32: (5e-3, 5e-3), torch.bfloat16: (3e-2, 2e-2), } -# TODO: Modify this based on platform -DEVICES = [ + +pytestmark = pytest.mark.skipif( + not (current_platform.is_cuda_alike() or current_platform.is_cpu()), + reason="Backend not supported") + +DEVICES = ([ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +] if current_platform.is_cuda_alike() else ["cpu"]) #For GPU, we will launch different triton kernels between the prefill and decode # stages, so we need to verify this. 
prefill stage(True) or decode stage(False) @@ -198,6 +202,10 @@ def check_punica_wrapper(punica_wrapper) -> bool: from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU return type(punica_wrapper) is PunicaWrapperGPU + elif current_platform.is_cpu(): + from vllm.lora.punica_wrapper.punica_cpu import PunicaWrapperCPU + + return type(punica_wrapper) is PunicaWrapperCPU else: return False @@ -211,7 +219,8 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA # device, see: https://github.com/triton-lang/triton/issues/2925 # Same below. - torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) torch.set_default_device(device) max_loras = 8 @@ -313,7 +322,9 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: def test_embeddings_with_new_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: - torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) + torch.set_default_device(device) max_loras = 8 punica_wrapper = get_punica_wrapper(8192, 256, device) @@ -450,7 +461,9 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device, def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, stage) -> None: - torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) + torch.set_default_device(device) max_loras = 8 punica_wrapper = get_punica_wrapper(8192, 256, device) @@ -582,7 +595,9 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, def test_linear_replicated(dist_init, num_loras, device, stage, bias_enabled) -> None: - torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) + torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device) assert 
check_punica_wrapper(punica_wrapper) @@ -695,7 +710,9 @@ def test_linear_replicated(dist_init, num_loras, device, stage, def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, device, stage, bias_enabled) -> None: - torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) + torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device) assert check_punica_wrapper(punica_wrapper) @@ -818,7 +835,9 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, device, stage, bias_enabled) -> None: - torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) + torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device) assert check_punica_wrapper(punica_wrapper) @@ -971,6 +990,8 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, @pytest.mark.parametrize("rotary_dim", [None, 32]) @pytest.mark.parametrize("head_size", [32, 108]) @pytest.mark.parametrize("seq_len", [11, 1024]) +@pytest.mark.skipif(not current_platform.is_cuda_alike(), + reason="Only CUDA backends are supported") def test_rotary_embedding_long_context(dist_init, num_loras, device, scaling_factors, max_position, is_neox_style, rotary_dim, head_size, diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 537d95b025..b907af47d0 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -3,6 +3,7 @@ from typing import List import pytest from vllm.lora.models import LoRAModel +from vllm.lora.peft_helper import PEFTHelper from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM from vllm.model_executor.models.utils import WeightsMapper @@ -30,11 +31,14 @@ def test_load_checkpoints( else: expected_lora_modules.append(module) if lora_name == "baichuan7B": + 
peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files, + max_position_embeddings=4096) # For the baichuan7B model, load it's LoRA, # and the test should pass. LoRAModel.from_local_checkpoint( baichuan_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, @@ -43,9 +47,12 @@ def test_load_checkpoints( # Test that the target_modules contain prefix # such as "model.layers.0.self_atten.W_pack", and # the test should pass. + peft_helper = PEFTHelper.from_local_dir(baichuan_zero_lora_files, + max_position_embeddings=4096) LoRAModel.from_local_checkpoint( baichuan_zero_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, @@ -53,9 +60,12 @@ def test_load_checkpoints( elif lora_name == "baichuan7B-zero-regex": # Test that the `target_modules` in the form of regular expressions, # such as `model\\..*(W_pack|o_proj)`, and the test should pass. + peft_helper = PEFTHelper.from_local_dir(baichuan_regex_lora_files, + max_position_embeddings=4096) LoRAModel.from_local_checkpoint( baichuan_regex_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, @@ -64,10 +74,13 @@ def test_load_checkpoints( # For the baichuan7B model, load chatglm3-6b's LoRA, # and the test should raise the following error. 
expected_error = "Please verify that the loaded LoRA module is correct" # noqa: E501 + peft_helper = PEFTHelper.from_local_dir(chatglm3_lora_files, + max_position_embeddings=4096) with pytest.raises(ValueError, match=expected_error): LoRAModel.from_local_checkpoint( chatglm3_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, @@ -94,9 +107,12 @@ def test_lora_weights_mapping(baichuan_lora_files): ".layers.": ".baichuan_layers.", }, ) + peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files, + max_position_embeddings=4096) lora_model = LoRAModel.from_local_checkpoint( baichuan_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index e2daf9d135..1c0ee01c03 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -3,6 +3,7 @@ from typing import List import pytest from vllm.lora.models import LoRAModel +from vllm.lora.peft_helper import PEFTHelper from vllm.lora.utils import get_adapter_absolute_path from vllm.model_executor.models.llama import LlamaForCausalLM @@ -27,9 +28,11 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request): lora_path = get_adapter_absolute_path(lora_name) # lora loading should work for either absolute path and hugggingface id. 
+ peft_helper = PEFTHelper.from_local_dir(lora_path, 4096) lora_model = LoRAModel.from_local_checkpoint( lora_path, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index a099f36b0a..9a5b9aabf5 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,5 +1,3 @@ -import json -import math import os from typing import Dict, List @@ -20,6 +18,7 @@ from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, WorkerLoRAManager) from vllm.model_executor.layers.linear import RowParallelLinear +from vllm.platforms import current_platform EMBEDDING_MODULES = { "embed_tokens": "input_embeddings", @@ -28,73 +27,20 @@ EMBEDDING_MODULES = { EMBEDDING_PADDING_MODULES = ["lm_head"] -CUDA_DEVICES = [ +DEVICES = ([ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +] if current_platform.is_cuda_alike() else ["cpu"]) -def test_peft_helper(sql_lora_files): - lora_config_path = os.path.join(sql_lora_files, "adapter_config.json") - with open(lora_config_path) as f: - config = json.load(f) - peft_helper = PEFTHelper.from_dict(config) - assert peft_helper.r == 8 - assert peft_helper.lora_alpha == 16 - assert peft_helper.target_modules == [ - "q_proj", - "v_proj", - "k_proj", - "o_proj", - "gate_proj", - "up_proj", - "down_proj", - "embed_tokens", - "lm_head", - ] - scaling = peft_helper.lora_alpha / peft_helper.r - assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 - - # test RSLoRA - config = dict(r=8, - lora_alpha=16, - target_modules=["gate_proj"], - use_rslora=True) - peft_helper = PEFTHelper.from_dict(config) - - scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r) - assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 - - expected_error = "vLLM only supports modules_to_save being None." 
- with pytest.raises(ValueError, match=expected_error): - config = dict( - r=8, - lora_alpha=16, - target_modules=["gate_proj"], - modules_to_save=["lm_head"], - ) - PEFTHelper.from_dict(config) - - expected_error = "vLLM does not yet support DoRA." - with pytest.raises(ValueError, match=expected_error): - config = dict(r=8, - lora_alpha=16, - target_modules=["gate_proj"], - use_dora=True) - PEFTHelper.from_dict(config) - - -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_from_lora_tensors(sql_lora_files, device): tensors = load_file( os.path.join(sql_lora_files, "adapter_model.safetensors")) new_embeddings = load_file( os.path.join(sql_lora_files, "new_embeddings.safetensors")) - lora_config_path = os.path.join(sql_lora_files, "adapter_config.json") - with open(lora_config_path) as f: - config = json.load(f) - - peft_helper = PEFTHelper.from_dict(config) + peft_helper = PEFTHelper.from_local_dir(sql_lora_files, + max_position_embeddings=4096) lora_model = LoRAModel.from_lora_tensors( 1, tensors, @@ -171,7 +117,7 @@ def test_replace_submodules(dist_init, dummy_model): manager = LoRAModelManager( model, 1, 1, 1, LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8), - torch.device("cuda")) + torch.device(DEVICES[0])) model = manager.model assert isinstance(model.get_submodule("dense1"), @@ -183,7 +129,7 @@ def test_replace_submodules(dist_init, dummy_model): RowParallelLinearWithLoRA) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_lora_model_manager(dist_init, dummy_model, device): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] @@ -244,7 +190,7 @@ def test_lora_model_manager(dist_init, dummy_model, device): assert manager.punica_wrapper.device == device -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): 
model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] @@ -336,7 +282,7 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): assert manager.device == device -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_lru_lora_model_manager(dist_init, dummy_model, device): # This tests just the LRU cache functionality, everything else is # tested in test_lora_model_manager @@ -466,7 +412,7 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): assert manager.device == device -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, sql_lora_files, device): lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) @@ -545,7 +491,7 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, device) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, sql_lora_files, device): # Should remove every LoRA not specified in the request. 
@@ -621,7 +567,7 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, device) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_packed_loras(dist_init, dummy_model_gate_up, device): model = dummy_model_gate_up model.supported_lora_modules = ["gate_up_proj"] diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py deleted file mode 100644 index 78bf5a1617..0000000000 --- a/tests/lora/test_minicpmv.py +++ /dev/null @@ -1,77 +0,0 @@ -from typing import List - -import pytest - -import vllm -from vllm.assets.image import ImageAsset -from vllm.lora.request import LoRARequest -from vllm.platforms import current_platform - -MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" - -PROMPT_TEMPLATE = ( - "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" - "(./)\nWhat is in the image?<|eot_id|>" - "<|start_header_id|>assistant<|end_header_id|>\n\n") - -IMAGE_ASSETS = [ - ImageAsset("stop_sign"), - ImageAsset("cherry_blossom"), -] - -# After fine-tuning with LoRA, all generated content should start begin `A`. -EXPECTED_OUTPUT = [ - "A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.", # noqa: E501 - "A pink cherry blossom tree with a blue sky in the background.", -] - - -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: - sampling_params = vllm.SamplingParams( - temperature=0, - max_tokens=5, - stop_token_ids=[128001, 128009], # eos_id, eot_id - ) - - inputs = [{ - "prompt": PROMPT_TEMPLATE, - "multi_modal_data": { - "image": asset.pil_image - }, - } for asset in IMAGE_ASSETS] - - outputs = llm.generate( - inputs, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None, - ) - # Print the outputs. 
- generated_texts: List[str] = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text.strip() - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts - - -@pytest.mark.xfail( - current_platform.is_rocm(), - reason="MiniCPM-V dependency xformers incompatible with ROCm") -def test_minicpmv_lora(minicpmv_lora_files): - llm = vllm.LLM( - MODEL_PATH, - max_num_seqs=2, - enable_lora=True, - max_loras=4, - max_lora_rank=64, - trust_remote_code=True, - enable_chunked_prefill=True, - ) - output1 = do_sample(llm, minicpmv_lora_files, lora_id=1) - for i in range(len(EXPECTED_OUTPUT)): - assert EXPECTED_OUTPUT[i].startswith(output1[i]) - output2 = do_sample(llm, minicpmv_lora_files, lora_id=2) - for i in range(len(EXPECTED_OUTPUT)): - assert EXPECTED_OUTPUT[i].startswith(output2[i]) diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py index 930f177953..3b0f18325a 100644 --- a/tests/lora/test_minicpmv_tp.py +++ b/tests/lora/test_minicpmv_tp.py @@ -3,10 +3,10 @@ from typing import List import pytest import vllm +from tests.utils import fork_new_process_for_each_test from vllm.assets.image import ImageAsset from vllm.lora.request import LoRARequest - -from ..utils import multi_gpu_test +from vllm.platforms import current_platform MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" @@ -17,13 +17,11 @@ PROMPT_TEMPLATE = ( IMAGE_ASSETS = [ ImageAsset("stop_sign"), - ImageAsset("cherry_blossom"), ] # After fine-tuning with LoRA, all generated content should start begin `A`. EXPECTED_OUTPUT = [ "A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.", # noqa: E501 - "A pink cherry blossom tree with a blue sky in the background.", ] @@ -50,37 +48,40 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: # Print the outputs. 
generated_texts: List[str] = [] for output in outputs: - prompt = output.prompt generated_text = output.outputs[0].text.strip() generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + print(f"Generated text: {generated_text!r}") return generated_texts -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("fully_sharded", [True, False]) -def test_minicpmv_tp2(minicpmv_lora_files, fully_sharded): +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="MiniCPM-V dependency xformers incompatible with ROCm") +@fork_new_process_for_each_test +def test_minicpmv_lora(minicpmv_lora_files): llm = vllm.LLM( MODEL_PATH, - enable_lora=True, max_num_seqs=2, - max_loras=4, - max_lora_rank=64, - tensor_parallel_size=2, + enable_lora=True, + max_loras=2, + max_lora_rank=8, + enforce_eager=True, trust_remote_code=True, - fully_sharded_loras=fully_sharded, enable_chunked_prefill=True, ) - - output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) - + output1 = do_sample(llm, minicpmv_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): - assert EXPECTED_OUTPUT[i].startswith(output_tp[i]) + assert EXPECTED_OUTPUT[i].startswith(output1[i]) + output2 = do_sample(llm, minicpmv_lora_files, lora_id=2) + for i in range(len(EXPECTED_OUTPUT)): + assert EXPECTED_OUTPUT[i].startswith(output2[i]) -@multi_gpu_test(num_gpus=4) -@pytest.mark.parametrize("fully_sharded", [True, False]) -def test_minicpmv_tp4(minicpmv_lora_files, fully_sharded): +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="MiniCPM-V dependency xformers incompatible with ROCm") +@fork_new_process_for_each_test +def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files): llm = vllm.LLM( MODEL_PATH, enable_lora=True, @@ -89,9 +90,33 @@ def test_minicpmv_tp4(minicpmv_lora_files, fully_sharded): max_lora_rank=64, tensor_parallel_size=4, trust_remote_code=True, - fully_sharded_loras=fully_sharded, + enforce_eager=True, enable_chunked_prefill=True, ) 
output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): assert EXPECTED_OUTPUT[i].startswith(output_tp[i]) + + +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="MiniCPM-V dependency xformers incompatible with ROCm") +@fork_new_process_for_each_test +def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files): + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=2, + max_loras=2, + max_lora_rank=8, + tensor_parallel_size=4, + trust_remote_code=True, + fully_sharded_loras=True, + enable_chunked_prefill=True, + ) + output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) + for i in range(len(EXPECTED_OUTPUT)): + assert EXPECTED_OUTPUT[i].startswith(output_tp[i]) + output_tp = do_sample(llm, minicpmv_lora_files, lora_id=2) + for i in range(len(EXPECTED_OUTPUT)): + assert EXPECTED_OUTPUT[i].startswith(output_tp[i]) diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 797a495201..940a865228 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -5,6 +5,7 @@ import torch import vllm from vllm.lora.request import LoRARequest +from vllm.platforms import current_platform MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" @@ -31,7 +32,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, @pytest.mark.parametrize("tp_size", [4]) def test_mixtral_lora(mixtral_lora_files, tp_size): """Original test, the LoRA model has the common target modules, not all""" - if torch.cuda.device_count() < tp_size: + if torch.cuda.device_count( + ) < tp_size and tp_size > 1 and current_platform.is_cuda_alike(): pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") prompts = [ diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py new file mode 100644 index 0000000000..a524d5ce5f --- /dev/null +++ b/tests/lora/test_peft_helper.py @@ -0,0 +1,109 @@ +import json +import math +import shutil + +import pytest + +from vllm.config import LoRAConfig 
+from vllm.lora.peft_helper import PEFTHelper + +ERROR_CASES = [ + ( + "test_rank", + { + "r": 1024 + }, + "is greater than max_lora_rank", + ), + ( + "test_bias", + { + "bias": "all" + }, + "Adapter bias cannot be used without bias_enabled", + ), + ("test_dora", { + "use_dora": True + }, "does not yet support DoRA"), + ( + "test_modules_to_save", + { + "modules_to_save": ["lm_head"] + }, + "only supports modules_to_save being None", + ), +] + + +def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path): + peft_helper = PEFTHelper.from_local_dir(long_context_lora_files_16k_1, + max_position_embeddings=4096) + lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2) + peft_helper.validate_legal(lora_config) + assert peft_helper.r == 8 + assert peft_helper.lora_alpha == 16 + assert peft_helper.target_modules == [ + "q_proj", + "v_proj", + "k_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "embed_tokens", + "lm_head", + ] + assert peft_helper.context_length == 16384 + assert peft_helper.vllm_max_position_embeddings == 4096 + assert peft_helper.vllm_long_context_scaling_factor == float( + math.ceil(peft_helper.context_length / + peft_helper.vllm_max_position_embeddings)) + # test RSLoRA + rslora_config = dict(use_rslora=True) + test_dir = tmp_path / "test_rslora" + shutil.copytree(long_context_lora_files_16k_1, test_dir) + + # Load and modify configuration + config_path = test_dir / "adapter_config.json" + with open(config_path) as f: + adapter_config = json.load(f) + # Apply configuration changes + adapter_config.update(rslora_config) + + # Save modified configuration + with open(config_path, "w") as f: + json.dump(adapter_config, f) + + peft_helper = PEFTHelper.from_local_dir(test_dir, + max_position_embeddings=4096) + peft_helper.validate_legal(lora_config) + scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r) + assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 + + 
+@pytest.mark.parametrize("test_name,config_change,expected_error", ERROR_CASES) +def test_peft_helper_error( + sql_lora_files, + tmp_path, + test_name: str, + config_change: dict, + expected_error: str, +): + test_dir = tmp_path / test_name + shutil.copytree(sql_lora_files, test_dir) + + # Load and modify configuration + config_path = test_dir / "adapter_config.json" + with open(config_path) as f: + adapter_config = json.load(f) + # Apply configuration changes + adapter_config.update(config_change) + + # Save modified configuration + with open(config_path, "w") as f: + json.dump(adapter_config, f) + lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2) + # Test loading the adapter + with pytest.raises(ValueError, match=expected_error): + PEFTHelper.from_local_dir( + test_dir, max_position_embeddings=4096).validate_legal(lora_config) diff --git a/tests/lora/test_punica_sizes.py b/tests/lora/test_punica_ops_sizes.py similarity index 63% rename from tests/lora/test_punica_sizes.py rename to tests/lora/test_punica_ops_sizes.py index 66b5f82bbb..433ca7577d 100644 --- a/tests/lora/test_punica_sizes.py +++ b/tests/lora/test_punica_ops_sizes.py @@ -4,19 +4,21 @@ hidden_sizes included in the LoRA models currently supported by vLLM. It tests whether the corresponding Triton kernel can run normally when tensor parallelism is set to [1, 2, 4, 8, 16, 32, 64]. 
""" +from threading import Lock + import pytest import torch -from vllm.lora.ops.bgmv_expand import bgmv_expand -from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice -from vllm.lora.ops.bgmv_shrink import bgmv_shrink -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice -from vllm.lora.ops.sgmv_shrink import sgmv_shrink +import vllm.lora.ops.triton_ops # noqa: F401 +from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice, + bgmv_shrink, sgmv_expand, + sgmv_expand_slice, sgmv_shrink) +from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT from vllm.platforms import current_platform -from .utils import (generate_data, generate_data_for_expand_nslices, - ref_torch_groupgemm) +from .utils import (assert_close, generate_data, + generate_data_for_expand_nslices, + generate_data_for_nslices) HIDDEN_SIZES = [ 128, @@ -110,16 +112,9 @@ DTYPES = [torch.float16, torch.bfloat16] MAX_RANKS = [32] SCALES = [0.5] SEED = [0] -CUDA_DEVICES = [f"cuda:{0}"] +DEVICES = [f"cuda:{0}"] - -def assert_close(a, b): - rtol, atol = { - torch.float16: (6e-2, 6e-2), - torch.bfloat16: (6e-2, 6e-2), - torch.float32: (1e-2, 1e-2), - }[a.dtype] - torch.testing.assert_close(a, b, rtol=rtol, atol=atol) +_dict_lock = Lock() @pytest.mark.parametrize("batches", BATCHES) @@ -127,16 +122,18 @@ def assert_close(a, b): @pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("nslices", [1, 2, 3]) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_punica_sgmv( batches: int, num_loras: int, rank: int, hidden_size: int, scaling: float, + nslices: int, dtype: torch.dtype, op_type: str, seed: int, @@ -148,19 
+145,20 @@ def test_punica_sgmv( seq_length = 128 ( inputs_tensor, - lora_weights, + lora_weights_lst, our_out_tensor, ref_out_tensor, b_seq_start_loc, lora_indices_tensor, seq_len_tensor, indices, - ) = generate_data( + ) = generate_data_for_nslices( batches, hidden_size, num_loras, rank, seq_length, + nslices, dtype, op_type, device, @@ -172,43 +170,85 @@ def test_punica_sgmv( else: max_seq_length = max_seq_length.item() if op_type == "shrink": - sgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - scaling, - ) + # Preventing cache error pointer. + with _dict_lock: + _LORA_A_PTR_DICT.clear() + torch.ops.vllm.sgmv_shrink( + inputs_tensor, + lora_weights_lst, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + scaling, + ) + for index in range(nslices): + sgmv_shrink( + inputs_tensor, + lora_weights_lst[index], + ref_out_tensor[index], + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + scaling, + ) + else: - sgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - add_inputs=True, - ) - ref_torch_groupgemm( - ref_out_tensor, - inputs_tensor, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - scaling if op_type == "shrink" else 1.0, - op_type, - ) - if op_type == "shrink": - ref_out_tensor = ref_out_tensor.to(torch.float32) + with _dict_lock: + _LORA_B_PTR_DICT.clear() + torch.ops.vllm.sgmv_expand( + inputs_tensor, + lora_weights_lst, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + offset_start=0, + add_inputs=True, + ) + if nslices == 1: + # Verify the torch's sgmv_expand op + sgmv_expand( + inputs_tensor[0], + 
lora_weights_lst[0], + ref_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + add_inputs=True, + ) + else: + slice_offset = 0 + for index in range(nslices): + lora_weights = lora_weights_lst[index] + sgmv_expand_slice( + inputs_tensor[index], + lora_weights, + ref_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + slice_offset, + hidden_size, + add_inputs=True, + ) + slice_offset += hidden_size + assert_close(our_out_tensor, ref_out_tensor) @@ -220,7 +260,7 @@ def test_punica_sgmv( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_punica_bgmv( batches: int, num_loras: int, @@ -256,31 +296,38 @@ def test_punica_bgmv( device, ) if op_type == "shrink": - bgmv_shrink( + torch.ops.vllm.bgmv_shrink( inputs_tensor, lora_weights, our_out_tensor, indices, scaling, ) + + bgmv_shrink( + inputs_tensor, + lora_weights, + ref_out_tensor, + indices, + scaling, + ) + else: - bgmv_expand( + torch.ops.vllm.bgmv_expand( inputs_tensor, lora_weights, our_out_tensor, indices, add_inputs=True, ) - ref_torch_groupgemm( - ref_out_tensor, - inputs_tensor, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - scaling if op_type == "shrink" else 1.0, - op_type, - ) + bgmv_expand( + inputs_tensor, + lora_weights, + ref_out_tensor, + indices, + add_inputs=True, + ) + if op_type == "shrink": ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) @@ -292,25 +339,22 @@ def test_punica_bgmv( @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", [2, 3]) @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"]) @pytest.mark.parametrize("seed", SEED) 
-@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_punica_expand_nslices( +@pytest.mark.parametrize("device", DEVICES) +def test_punica_bgmv_expand_nslices( batches: int, num_loras: int, rank: int, hidden_size: int, nslices: int, dtype: torch.dtype, - op_type: str, seed: int, device: str, ): - torch.set_default_device(device) current_platform.seed_everything(seed) - seq_length = 128 if op_type == "sgmv" else 1 + seq_length = 1 ( inputs_tensor, lora_weights_lst, @@ -330,50 +374,26 @@ def test_punica_expand_nslices( nslices, device, ) - max_seq_length = seq_len_tensor.max() - token_nums = seq_len_tensor.sum().item() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() slice_offset = 0 for index in range(nslices): lora_weights = lora_weights_lst[index] - if op_type == "sgmv": - sgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - slice_offset, - hidden_size, - add_inputs=True, - ) - else: - - bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) - ref_torch_groupgemm( - ref_outputs[:, slice_offset:slice_offset + hidden_size], + torch.ops.vllm.bgmv_expand_slice( inputs_tensor, lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - 1.0, - op_type="expand", + our_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, + ) + bgmv_expand_slice( + inputs_tensor, + lora_weights, + ref_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, ) slice_offset += hidden_size diff --git a/tests/lora/test_punica_variation.py b/tests/lora/test_punica_ops_variation.py similarity index 58% rename from tests/lora/test_punica_variation.py rename to tests/lora/test_punica_ops_variation.py index 3b20033271..2bb84c1cf1 100644 --- 
a/tests/lora/test_punica_variation.py +++ b/tests/lora/test_punica_ops_variation.py @@ -3,22 +3,24 @@ This script is mainly used to test whether trtion kernels can run normally under different conditions, including various batches, numbers of LoRA , and maximum ranks. """ +from threading import Lock + import pytest import torch # Enable custom op register -import vllm.lora.ops.bgmv_expand -import vllm.lora.ops.bgmv_expand_slice -import vllm.lora.ops.bgmv_shrink -import vllm.lora.ops.sgmv_expand -import vllm.lora.ops.sgmv_expand_slice -import vllm.lora.ops.sgmv_shrink # noqa: F401 +import vllm.lora.ops.triton_ops # noqa: F401 +from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice, + bgmv_shrink, sgmv_expand, + sgmv_expand_slice, sgmv_shrink) +from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT from vllm.platforms import current_platform -from .utils import (generate_data, generate_data_for_expand_nslices, - ref_torch_groupgemm) +from .utils import (assert_close, generate_data, + generate_data_for_expand_nslices, + generate_data_for_nslices) -HIDDEN_SIZES = [4097] +HIDDEN_SIZES = [2049] BATCHES = [1, 4, 16, 32] NUM_LORA = [1, 8, 32, 128] @@ -26,26 +28,9 @@ DTYPES = [torch.float16, torch.bfloat16] MAX_RANKS = [1, 4, 8, 16, 32, 64, 128, 256] SCALES = [0.5] SEED = [0] -CUDA_DEVICES = [f"cuda:{0}"] +DEVICES = [f"cuda:{0}"] - -def assert_close(a, b): - rtol, atol = { - torch.float16: (6e-2, 6e-2), - torch.bfloat16: (6e-2, 6e-2), - torch.float32: (1e-2, 1e-2), - }[a.dtype] - torch.testing.assert_close(a, b, rtol=rtol, atol=atol) - - -# Unlike test_punica_sizes.py, we directly utilize custom op for -# testing, which verifies the correct registration of these ops. 
-bgmv_expand = torch.ops.vllm.bgmv_expand -bgmv_expand_slice = torch.ops.vllm.bgmv_expand_slice -bgmv_shrink = torch.ops.vllm.bgmv_shrink -sgmv_expand = torch.ops.vllm.sgmv_expand -sgmv_expand_slice = torch.ops.vllm.sgmv_expand_slice -sgmv_shrink = torch.ops.vllm.sgmv_shrink +_dict_lock = Lock() @pytest.mark.parametrize("batches", BATCHES) @@ -53,16 +38,18 @@ sgmv_shrink = torch.ops.vllm.sgmv_shrink @pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("nslices", [1, 2, 3]) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_punica_sgmv( batches: int, num_loras: int, rank: int, hidden_size: int, scaling: float, + nslices: int, dtype: torch.dtype, op_type: str, seed: int, @@ -74,19 +61,20 @@ def test_punica_sgmv( seq_length = 128 ( inputs_tensor, - lora_weights, + lora_weights_lst, our_out_tensor, ref_out_tensor, b_seq_start_loc, lora_indices_tensor, seq_len_tensor, indices, - ) = generate_data( + ) = generate_data_for_nslices( batches, hidden_size, num_loras, rank, seq_length, + nslices, dtype, op_type, device, @@ -98,43 +86,85 @@ def test_punica_sgmv( else: max_seq_length = max_seq_length.item() if op_type == "shrink": - sgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - scaling, - ) + # Preventing cache error pointer. 
+ with _dict_lock: + _LORA_A_PTR_DICT.clear() + torch.ops.vllm.sgmv_shrink( + inputs_tensor, + lora_weights_lst, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + scaling, + ) + for index in range(nslices): + sgmv_shrink( + inputs_tensor, + lora_weights_lst[index], + ref_out_tensor[index], + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + scaling, + ) + else: - sgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - add_inputs=True, - ) - ref_torch_groupgemm( - ref_out_tensor, - inputs_tensor, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - scaling if op_type == "shrink" else 1.0, - op_type, - ) - if op_type == "shrink": - ref_out_tensor = ref_out_tensor.to(torch.float32) + with _dict_lock: + _LORA_B_PTR_DICT.clear() + torch.ops.vllm.sgmv_expand( + inputs_tensor, + lora_weights_lst, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + offset_start=0, + add_inputs=True, + ) + slice_offset = 0 + if nslices == 1: + # Verify the torch's sgmv_expand op + sgmv_expand( + inputs_tensor[0], + lora_weights_lst[0], + ref_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + add_inputs=True, + ) + else: + for index in range(nslices): + lora_weights = lora_weights_lst[index] + sgmv_expand_slice( + inputs_tensor[index], + lora_weights, + ref_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + slice_offset, + hidden_size, + add_inputs=True, + ) + slice_offset += hidden_size + assert_close(our_out_tensor, ref_out_tensor) @@ -146,7 +176,7 @@ def test_punica_sgmv( @pytest.mark.parametrize("dtype", DTYPES) 
@pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_punica_bgmv( batches: int, num_loras: int, @@ -158,7 +188,6 @@ def test_punica_bgmv( seed: int, device: str, ): - torch.set_default_device(device) current_platform.seed_everything(seed) @@ -183,32 +212,38 @@ def test_punica_bgmv( device, ) if op_type == "shrink": - bgmv_shrink( + torch.ops.vllm.bgmv_shrink( inputs_tensor, lora_weights, our_out_tensor, indices, scaling, ) - else: - bgmv_expand( + bgmv_shrink( + inputs_tensor, + lora_weights, + ref_out_tensor, + indices, + scaling, + ) + + else: + torch.ops.vllm.bgmv_expand( inputs_tensor, lora_weights, our_out_tensor, indices, add_inputs=True, ) - ref_torch_groupgemm( - ref_out_tensor, - inputs_tensor, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - scaling if op_type == "shrink" else 1.0, - op_type, - ) + bgmv_expand( + inputs_tensor, + lora_weights, + ref_out_tensor, + indices, + add_inputs=True, + ) + if op_type == "shrink": ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) @@ -220,24 +255,22 @@ def test_punica_bgmv( @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", [2, 3]) @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"]) @pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_punica_expand_nslices( +@pytest.mark.parametrize("device", DEVICES) +def test_punica_bgmv_expand_nslices( batches: int, num_loras: int, rank: int, hidden_size: int, nslices: int, dtype: torch.dtype, - op_type: str, seed: int, device: str, ): torch.set_default_device(device) current_platform.seed_everything(seed) - seq_length = 128 if op_type == "sgmv" else 1 + seq_length = 1 ( inputs_tensor, lora_weights_lst, @@ -257,49 +290,26 @@ def 
test_punica_expand_nslices( nslices, device, ) - max_seq_length = seq_len_tensor.max() - token_nums = seq_len_tensor.sum().item() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() slice_offset = 0 for index in range(nslices): lora_weights = lora_weights_lst[index] - if op_type == "sgmv": - sgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - slice_offset, - hidden_size, - add_inputs=True, - ) - else: - bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) - ref_torch_groupgemm( - ref_outputs[:, slice_offset:slice_offset + hidden_size], + torch.ops.vllm.bgmv_expand_slice( inputs_tensor, lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - 1.0, - op_type="expand", + our_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, + ) + bgmv_expand_slice( + inputs_tensor, + lora_weights, + ref_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, ) slice_offset += hidden_size diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 026269667b..26bf770cc0 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -72,7 +72,8 @@ def do_sample(llm: vllm.LLM, @pytest.mark.parametrize("tp_size", [1]) def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model, tp_size): - if num_gpus_available < tp_size: + if num_gpus_available < tp_size and \ + tp_size > 1 and current_platform.is_cuda_alike(): pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") llm = vllm.LLM( diff --git a/tests/lora/utils.py b/tests/lora/utils.py index e394c33b3f..ce47546f21 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -18,11 +18,13 @@ class DummyLoRAManager: def 
get_module_lora(self, module_name: str) -> LoRALayerWeights: return self._loras[module_name] - def init_random_lora(self, - module_name: str, - weight: torch.Tensor, - rank: int = 8, - generate_embeddings_tensor: int = 0): + def init_random_lora( + self, + module_name: str, + weight: torch.Tensor, + rank: int = 8, + generate_embeddings_tensor: int = 0, + ): lora = LoRALayerWeights( module_name, rank=rank, @@ -35,21 +37,25 @@ class DummyLoRAManager: device=self._device), ) if generate_embeddings_tensor: - lora.embeddings_tensor = torch.rand(5, - generate_embeddings_tensor, - dtype=weight.dtype, - device=self._device) + lora.embeddings_tensor = torch.rand( + 5, + generate_embeddings_tensor, + dtype=weight.dtype, + device=self._device, + ) self.set_module_lora(module_name, lora) return lora - def init_lora(self, - module_name: str, - input_dim: int, - output_dim: int, - rank=8, - noop=False, - embeddings_tensor=None): + def init_lora( + self, + module_name: str, + input_dim: int, + output_dim: int, + rank=8, + noop=False, + embeddings_tensor=None, + ): lora = LoRALayerWeights( module_name, rank=rank, @@ -98,35 +104,16 @@ def assert_close(a, b): torch.testing.assert_close(a, b, rtol=rtol, atol=atol) -def ref_torch_groupgemm( - out_tensor, - inputs, - lora_weights, - lora_indices_tensor, - seq_len_tensor, +def generate_data( batches, - scaling, + hidden_size, + lora_nums, + max_rank, + seq_length, + dtype, op_type, -) -> torch.Tensor: - out_list = [] - current_offset = 0 - for lora_index, b_length in zip(range(batches), seq_len_tensor): - input_weight = inputs[current_offset:b_length + current_offset, :] - current_offset += b_length - lora_weight = lora_weights[lora_indices_tensor[lora_index]] - result = torch.nn.functional.linear(input_weight, lora_weight) - result *= scaling - out_list.append(result) - cat_result = torch.cat(out_list, dim=0) - if op_type == "expand": - out_tensor += cat_result - else: - out_tensor.copy_(cat_result) - return - - -def 
generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, dtype, - op_type, device): + device, +): seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches, )).to(device) b_seq_start_loc = torch.cumsum( @@ -187,8 +174,16 @@ def generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, dtype, ) -def generate_data_for_expand_nslices(batches, hidden_size, lora_nums, max_rank, - seq_length, dtype, nslices, device): +def generate_data_for_expand_nslices( + batches, + hidden_size, + lora_nums, + max_rank, + seq_length, + dtype, + nslices, + device, +): seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches, )).to(device) b_seq_start_loc = torch.cumsum( @@ -221,7 +216,87 @@ def generate_data_for_expand_nslices(batches, hidden_size, lora_nums, max_rank, for b_id in range(batches): lora_index = lora_indices_tensor[b_id] indices[current_offset:current_offset + - seq_len_tensor[b_id]] = lora_index.item() + seq_len_tensor[b_id]] = (lora_index.item()) + current_offset += seq_len_tensor[b_id].item() + + lora_indices_tensor = lora_indices_tensor.to(device) + return ( + inputs_tensor, + lora_weights_lst, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) + + +def generate_data_for_nslices( + batches, + hidden_size, + lora_nums, + max_rank, + seq_length, + nslices, + dtype, + op_type, + device, +): + seq_len_tensor = torch.randint(seq_length, seq_length + 1, + (batches, )).to(device) + b_seq_start_loc = torch.cumsum( + torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0, + ).to(device) + total_tokens = seq_len_tensor.sum() + + lora_weights_lst = [] + if op_type == "shrink": + + inputs_tensor = torch.rand((total_tokens, hidden_size), + dtype=dtype).to(device) + + for _ in range(nslices): + if op_type == "shrink": + lora_weights_lst.append( + torch.rand( + (lora_nums, max_rank, hidden_size), # col-major + dtype=dtype, + ).to(device)) + # NOTE shrink kernel 
using torch.float32 as output type + # shrink op need atomic_add, so output is initinized by 0 + our_out_tensor = torch.zeros( + (nslices, total_tokens, max_rank), + dtype=torch.float32, + ).to(device) + else: + inputs_tensor = torch.rand( + (nslices, total_tokens, max_rank), + dtype=dtype, + ).to(device) + for _ in range(nslices): + lora_weights_lst.append( + torch.rand( + (lora_nums, hidden_size, max_rank), # col-major + dtype=dtype, + ).to(device)) + # expand op needs to complete y+=a@lora_b, so output is + # initinized randomly + our_out_tensor = torch.rand((total_tokens, hidden_size * nslices), + dtype=dtype).to(device) + + # Ensure the same input. + ref_out_tensor = our_out_tensor.clone() + lora_indices_tensor = torch.randint(0, + lora_nums - 1 if lora_nums > 1 else 1, + (batches, )) + indices = torch.zeros((total_tokens), dtype=torch.long).to(device) + current_offset = 0 + for b_id in range(batches): + lora_index = lora_indices_tensor[b_id] + indices[current_offset:current_offset + + seq_len_tensor[b_id]] = (lora_index.item()) current_offset += seq_len_tensor[b_id].item() lora_indices_tensor = lora_indices_tensor.to(device) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index ed321ba9f0..9c1f784c1c 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -2,7 +2,7 @@ import os import pytest -from vllm.model_executor.layers.pooler import PoolingType +from vllm.model_executor.layers.pooler import CLSPool, PoolingType from vllm.model_executor.models.bert import BertEmbeddingModel from vllm.model_executor.models.roberta import RobertaEmbeddingModel from vllm.platforms import current_platform @@ -25,13 +25,12 @@ def test_model_loading_with_params(vllm_runner): with vllm_runner(model_name=MODEL_NAME, revision=REVISION, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about 
a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_config = model.model.llm_engine.model_config - - model_tokenizer = model.model.llm_engine.tokenizer + model_config = vllm_model.model.llm_engine.model_config + model_tokenizer = vllm_model.model.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -46,11 +45,13 @@ def test_model_loading_with_params(vllm_runner): assert model_tokenizer.tokenizer_config["do_lower_case"] assert model_tokenizer.tokenizer.model_max_length == 512 - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert isinstance(model, BertEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.CLS - assert model._pooler.normalize + def check_model(model): + assert isinstance(model, BertEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.CLS + assert model._pooler.normalize + + vllm_model.apply_model(check_model) + # assert output assert output @@ -64,13 +65,12 @@ def test_roberta_model_loading_with_params(vllm_runner): with vllm_runner(model_name=MODEL_NAME_ROBERTA, revision=REVISION_ROBERTA, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_config = model.model.llm_engine.model_config - - model_tokenizer = model.model.llm_engine.tokenizer + model_config = vllm_model.model.llm_engine.model_config + model_tokenizer = vllm_model.model.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -84,11 +84,38 @@ def 
test_roberta_model_loading_with_params(vllm_runner): assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-large" assert not model_tokenizer.tokenizer_config["do_lower_case"] - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert isinstance(model, RobertaEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.MEAN - assert model._pooler.normalize + def check_model(model): + assert isinstance(model, RobertaEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.MEAN + assert model._pooler.normalize + + vllm_model.apply_model(check_model) # assert output assert output + + +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Xformers backend is not supported on ROCm.") +def test_facebook_roberta_model_loading_with_params(vllm_runner): + """ + Test loading roberta-base model with no lm_head. + """ + model_name = "FacebookAI/roberta-base" + with vllm_runner(model_name=model_name, + dtype="float16", + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") + + model_tokenizer = vllm_model.model.llm_engine.tokenizer + assert model_tokenizer.tokenizer_id == model_name + + def check_model(model): + assert isinstance(model, RobertaEmbeddingModel) + assert not hasattr(model, "lm_head") + assert isinstance(model._pooler, CLSPool) + + vllm_model.apply_model(check_model) + + assert output diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index 0bb98df1b5..1e329dc4cb 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -237,8 +237,8 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, @pytest.mark.asyncio -async def test_online_inference(client, audio_assets): - """Exercises online inference with/without chunked prefill 
enabled.""" +async def test_online_serving(client, audio_assets): + """Exercises online serving with/without chunked prefill enabled.""" messages = [{ "role": diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py index 53f23e2451..5f06f1e3a2 100644 --- a/tests/models/decoder_only/language/test_fp8.py +++ b/tests/models/decoder_only/language/test_fp8.py @@ -19,18 +19,17 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true" @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize( - "kv_cache_dtype,base_model,test_model,scale_path", + "kv_cache_dtype,base_model,test_model", [ # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors. ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct", - "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None), + "nm-testing/Llama-3.2-1B-Instruct-FP8-KV"), # Test FP16 checkpoint w. fp8_e5m2 kv-cache. ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct", - "meta-llama/Llama-3.2-1B-Instruct", None), + "meta-llama/Llama-3.2-1B-Instruct"), # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json. 
("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf", - "meta-llama/Llama-2-7b-chat-hf", - "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json") + "meta-llama/Llama-2-7b-chat-hf") ]) # Due to low-precision numerical divergence, we only test logprob of 4 tokens @pytest.mark.parametrize("max_tokens", [4]) @@ -48,7 +47,6 @@ def test_models( kv_cache_dtype: str, base_model: str, test_model: str, - scale_path: Optional[str], max_tokens: int, enforce_eager: bool, backend: str, @@ -76,10 +74,6 @@ def test_models( baseline_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) - extra_kwargs = {} - if scale_path is not None: - extra_kwargs["quantization_param_path"] = scale_path - with vllm_runner( test_model, max_model_len=MAX_MODEL_LEN, @@ -87,7 +81,6 @@ def test_models( enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, disable_async_output_proc=disable_async_output_proc, - **extra_kwargs, ) as vllm_model: test_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index 2b8f5e2faa..ad8f8a0c32 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -4,6 +4,7 @@ Note: To pass the test, quantization higher than Q4 should be used """ import os +from typing import List, NamedTuple, Type import pytest from huggingface_hub import hf_hub_download @@ -11,6 +12,7 @@ from transformers import AutoTokenizer from tests.quantization.utils import is_quant_method_supported +from ....conftest import VllmRunner from ...utils import check_logprobs_close os.environ["TOKENIZERS_PARALLELISM"] = "true" @@ -18,31 +20,78 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true" MAX_MODEL_LEN = 1024 +class GGUFTestConfig(NamedTuple): + original_model: str + gguf_repo: str + gguf_filename: str + + @property + def gguf_model(self): + return 
hf_hub_download(self.gguf_repo, filename=self.gguf_filename) + + +LLAMA_CONFIG = GGUFTestConfig( + original_model="meta-llama/Llama-3.2-1B-Instruct", + gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF", + gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf", +) + +QWEN2_CONFIG = GGUFTestConfig( + original_model="Qwen/Qwen2.5-1.5B-Instruct", + gguf_repo="Qwen/Qwen2.5-1.5B-Instruct-GGUF", + gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf", +) + +PHI3_CONFIG = GGUFTestConfig( + original_model="microsoft/Phi-3.5-mini-instruct", + gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF", + gguf_filename="Phi-3.5-mini-instruct-IQ4_XS.gguf", +) + +GPT2_CONFIG = GGUFTestConfig( + original_model="openai-community/gpt2-large", + gguf_repo="QuantFactory/gpt2-large-GGUF", + gguf_filename="gpt2-large.Q4_K_M.gguf", +) + +STABLELM_CONFIG = GGUFTestConfig( + original_model="stabilityai/stablelm-3b-4e1t", + gguf_repo="afrideva/stablelm-3b-4e1t-GGUF", + gguf_filename="stablelm-3b-4e1t.q4_k_m.gguf", +) + +STARCODER_CONFIG = GGUFTestConfig( + original_model="bigcode/starcoder2-3b", + gguf_repo="QuantFactory/starcoder2-3b-GGUF", + gguf_filename="starcoder2-3b.Q6_K.gguf", +) + +DOLPHIN_CONFIG = GGUFTestConfig( + # Test VocabParallelEmbedding sharding issue. 
+ original_model="cognitivecomputations/TinyDolphin-2.8-1.1b", + gguf_repo="tsunemoto/TinyDolphin-2.8-1.1b-GGUF", + gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf", +) + +MODELS = [ + LLAMA_CONFIG, QWEN2_CONFIG, PHI3_CONFIG, GPT2_CONFIG, STABLELM_CONFIG, + DOLPHIN_CONFIG + # STARCODER_CONFIG, # broken +] + + @pytest.mark.skipif(not is_quant_method_supported("gguf"), reason="gguf is not supported on this GPU type.") -@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [ - ("meta-llama/Llama-3.2-1B-Instruct", - "bartowski/Llama-3.2-1B-Instruct-GGUF", - "Llama-3.2-1B-Instruct-Q4_K_M.gguf"), - ("meta-llama/Llama-3.2-1B-Instruct", - "bartowski/Llama-3.2-1B-Instruct-GGUF", - "Llama-3.2-1B-Instruct-IQ4_XS.gguf"), - ("Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct-GGUF", - "qwen2-1_5b-instruct-q4_k_m.gguf"), - ("Qwen/Qwen2-1.5B-Instruct", "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF", - "Qwen2-1.5B-Instruct.IQ4_XS.gguf"), -]) +@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("tp_size", [1, 2]) def test_models( - num_gpus_available, - vllm_runner, - example_prompts, - original_model, - gguf_id, - gguf_path, + num_gpus_available: int, + vllm_runner: Type[VllmRunner], + example_prompts: List[str], + model: GGUFTestConfig, dtype: str, max_tokens: int, num_logprobs: int, @@ -51,28 +100,29 @@ def test_models( if num_gpus_available < tp_size: pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") - gguf_model = hf_hub_download(gguf_id, filename=gguf_path) - - tokenizer = AutoTokenizer.from_pretrained(original_model) - messages = [[{ - 'role': 'user', - 'content': prompt - }] for prompt in example_prompts] - example_prompts = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model.original_model) + if 
tokenizer.chat_template is not None: + messages = [[{ + 'role': 'user', + 'content': prompt + }] for prompt in example_prompts] + example_prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True) # Run unquantized model. - with vllm_runner(model_name=original_model, - dtype=dtype, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=tp_size) as original_model: - + with vllm_runner( + model_name=model.original_model, + enforce_eager=True, # faster tests + dtype=dtype, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=tp_size) as original_model: original_outputs = original_model.generate_greedy_logprobs( example_prompts[:-1], max_tokens, num_logprobs) # Run gguf model. - with vllm_runner(model_name=gguf_model, + with vllm_runner(model_name=model.gguf_model, + enforce_eager=True, + tokenizer_name=model.original_model, dtype=dtype, max_model_len=MAX_MODEL_LEN, tensor_parallel_size=tp_size) as gguf_model: diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index 057b04349e..2e06b10fbb 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -33,10 +33,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. 
- model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 06739e8f02..1ad4f5aae8 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -51,10 +51,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 2a7ed8826d..c7efa4edbb 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -48,6 +48,10 @@ from ...utils import check_logprobs_close ), pytest.param("stabilityai/stablelm-3b-4e1t"), # stablelm pytest.param("bigcode/starcoder2-3b"), # starcoder2 + pytest.param( + "ehristoforu/Falcon3-MoE-2x7B-Insruct", # mixtral + marks=[pytest.mark.cpu_model], + ) ]) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @@ -69,10 +73,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. 
- model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) check_logprobs_close( outputs_0_lst=hf_outputs, diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py deleted file mode 100644 index 3edf96d111..0000000000 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py +++ /dev/null @@ -1,63 +0,0 @@ -"""Tests for phi3v's multimodal preprocessing kwargs.""" -from typing import Optional - -import pytest -from transformers import AutoTokenizer - -from vllm.inputs import InputProcessingContext -from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID - -from .....conftest import _ImageAssets -from ....utils import build_model_context - -models = ["microsoft/Phi-3.5-vision-instruct"] - - -# Wrap lazy imports to avoid initializing CUDA during test collection -@pytest.fixture() -def processor_for_phi3v(): - from vllm.model_executor.models.phi3v import Phi3VMultiModalProcessor - return Phi3VMultiModalProcessor - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "num_crops,expected_toks_per_img", - [ - (4, 757), - (16, 1921), - # the default num_crops of phi-3.5-vision is 4 - (None, 757), - ]) -@pytest.mark.parametrize("num_imgs", [1, 2]) -def test_processor_override(processor_for_phi3v, image_assets: _ImageAssets, - model: str, num_crops: Optional[int], - expected_toks_per_img: int, num_imgs: int): - """Ensure input_processor_for_phi3v handles num_crops properly.""" - # Same as the previous test - don't initialize mm_processor_kwargs - # in this test and assume that the kwargs will be correctly expanded by - # the partial when calling the custom input processor. 
- ctx = build_model_context( - model_name=model, - tokenizer_name=model, - trust_remote_code=True, - limit_mm_per_prompt={"image": num_imgs}, - ) - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) - ctx = InputProcessingContext(ctx.model_config, tokenizer) - # Build the image str / prompt based on the number of images we pass - img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)]) - prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" - images = [image_assets[0].pil_image] * num_imgs - - mm_data = {"image": images} - mm_processor_kwargs = {} - if num_crops is not None: - mm_processor_kwargs = {"num_crops": num_crops} - - processor = processor_for_phi3v(ctx) - processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) - - # Ensure we have the right number of placeholders per num_crops size - img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID) - assert img_tok_count == expected_toks_per_img * num_imgs diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py deleted file mode 100644 index 1f0b482666..0000000000 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py +++ /dev/null @@ -1,73 +0,0 @@ -from typing import Any, Dict, Tuple - -import pytest -from transformers import AutoTokenizer - -from vllm.inputs import InputProcessingContext - -from .....conftest import _ImageAssets -from ....utils import build_model_context - -MODEL = "Qwen/Qwen2-VL-2B-Instruct" -MIN_PIXELS = "min_pixels" -MAX_PIXELS = "max_pixels" - - -# Fixtures lazy import to avoid initializing CUDA during test collection -# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple -# input mappers. 
-@pytest.fixture() -def processor_for_qwen2_vl(): - from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalProcessor - return Qwen2VLMultiModalProcessor - - -@pytest.mark.parametrize( - "mm_processor_kwargs, expected_toks_per_img, expected_pixels_shape", [ - ({}, 1426, (5704, 1176)), - ({ - MIN_PIXELS: 64**2, - MAX_PIXELS: 512**2 - }, 330, (1320, 1176)), - ]) -@pytest.mark.parametrize("model", [MODEL]) -@pytest.mark.parametrize("num_imgs", [1, 2]) -def test_processor_override( - processor_for_qwen2_vl, - image_assets: _ImageAssets, - model: str, - mm_processor_kwargs: Dict[str, Any], - expected_toks_per_img: int, - expected_pixels_shape: Tuple[int, int], - num_imgs: int, -): - """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly.""" - # Same as the previous test - don't initialize mm_processor_kwargs - # in this test and assume that the kwargs will be correctly expanded by - # the partial when calling the custom input processor. - ctx = build_model_context( - model_name=model, - tokenizer_name=model, - mm_processor_kwargs=None, - limit_mm_per_prompt={"image": num_imgs}, - ) - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) - ctx = InputProcessingContext(ctx.model_config, tokenizer) - # Build the image str / prompt based on the number of images we pass - prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs - images = [image_assets[0].pil_image] * num_imgs - - mm_data = {"image": images} - - processor = processor_for_qwen2_vl(ctx) - processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) - - # Ensure we have the right number of placeholders per num_crops size - hf_processor = processor._get_hf_processor(**mm_processor_kwargs) - image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token) - img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape - - assert img_tok_count == 
expected_toks_per_img * num_imgs - assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs - assert pixel_shape[1] == expected_pixels_shape[1] diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 7db0816682..14d9a739be 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -9,7 +9,7 @@ from typing import Type import pytest from transformers import AutoModelForVision2Seq -from transformers.utils import is_flash_attn_2_available +from transformers import __version__ as TRANSFORMERS_VERSION from vllm.platforms import current_platform from vllm.utils import identity @@ -139,9 +139,7 @@ VLM_TEST_SETTINGS = { #### Extended model tests "aria": VLMTestInfo( models=["rhymes-ai/Aria"], - tokenizer_mode="slow", test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - dtype="bfloat16", prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 img_idx_to_prompt=lambda idx: "<|img|>\n", max_model_len=4096, @@ -157,8 +155,8 @@ VLM_TEST_SETTINGS = { max_tokens=64, marks=[ pytest.mark.skipif( - not is_flash_attn_2_available(), - reason="Model needs flash-attn for numeric convergence.", + TRANSFORMERS_VERSION < "4.48.0", + reason="HF model requires transformers>=4.48.0", ), large_gpu_mark(min_gb=64), ], @@ -188,6 +186,30 @@ VLM_TEST_SETTINGS = { max_tokens=8, dtype="bfloat16", ), + "deepseek_vl_v2": VLMTestInfo( + models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + single_image_prompts=IMAGE_ASSETS.prompts({ + "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501 + "cherry_blossom": "\nPlease infer the season with 
reason in details.", # noqa: E501 + }), + multi_image_prompt="image_1:\nimage_2:\nWhich image can we see the car and the tower?", # noqa: E501 + vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}}, # noqa: E501 + patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner, + postprocess_inputs=model_utils.cast_dtype_post_processor("images"), + hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output, + stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501 + image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], + marks=[ + pytest.mark.skipif( + TRANSFORMERS_VERSION >= "4.48.0", + reason="HF model is not compatible with transformers>=4.48.0", + ) + ], + ), "fuyu": VLMTestInfo( models=["adept/fuyu-8b"], test_type=VLMTestType.IMAGE, @@ -274,10 +296,8 @@ VLM_TEST_SETTINGS = { ), limit_mm_per_prompt={"image": 4}, )], - # Llava-next tests fixed sizes & the default size factors - image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))], ), - "llava_one_vision": VLMTestInfo( + "llava_onevision": VLMTestInfo( models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"], test_type=VLMTestType.CUSTOM_INPUTS, prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 @@ -288,8 +308,6 @@ VLM_TEST_SETTINGS = { ), auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, - # Llava-one-vision tests fixed sizes & the default size factors - image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))], custom_test_opts=[CustomTestOptions( inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs( formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 @@ -306,7 +324,6 @@ VLM_TEST_SETTINGS = { max_model_len=4096, auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output, - image_sizes=[((1669, 2560), (2560, 1669), (183, 
488), (488, 183))], ), "mantis": VLMTestInfo( models=["TIGER-Lab/Mantis-8B-siglip-llama3"], @@ -346,6 +363,16 @@ VLM_TEST_SETTINGS = { ), hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, ), + "molmo": VLMTestInfo( + models=["allenai/Molmo-7B-D-0924"], + test_type=(VLMTestType.IMAGE), + prompt_formatter=lambda img_prompt:"User: " + img_prompt + " Assistant:", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + image_size_factors=[(),(1.0, 1.0, 1.0)], + patch_hf_runner=model_utils.mlomo_patch_hf_runner, + postprocess_inputs=model_utils.molmo_post_processor, + ), # Tests for phi3v currently live in another file because of a bug in # transformers. Once this issue is fixed, we can enable them here instead. # https://github.com/huggingface/transformers/issues/34307 @@ -431,7 +458,7 @@ VLM_TEST_SETTINGS = { ) for inp in custom_inputs.different_patch_input_cases_internvl() ], ), - "llava_one_vision-multiple-images": VLMTestInfo( + "llava_onevision-multiple-images": VLMTestInfo( models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"], test_type=VLMTestType.CUSTOM_INPUTS, max_model_len=16384, diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index 51fe7d2ad3..5a485f3d81 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -5,7 +5,6 @@ import pytest import torch from PIL import Image -from vllm.entrypoints.llm import LLM from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import rescale_video_size, sample_frames_from_video @@ -69,7 +68,7 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict): def batch_make_image_embeddings( image_batches: List[Union[Image.Image, List[Image.Image]]], processor, - llm: LLM) -> List[Qwen2VLPromptImageEmbeddingInput]: + llm: VllmRunner) -> List[Qwen2VLPromptImageEmbeddingInput]: """batched image embeddings for Qwen2-VL This will infer all 
images' embeddings in a single batch, @@ -105,17 +104,19 @@ def batch_make_image_embeddings( pixel_values = preprocess_result["pixel_values"] image_grid_thw = preprocess_result["image_grid_thw"] - # pixel values to embeddinds & grid_thws - with torch.no_grad(): - visual = llm.llm_engine.model_executor.driver_worker. \ - model_runner.model.visual + # pixel values to embeddings & grid_thws + def get_image_embeds(model): + with torch.no_grad(): + visual = model.visual - pixel_values_on_device = pixel_values.to(visual.device, - dtype=visual.dtype) - image_grid_thw_on_device = image_grid_thw.to(visual.device, - dtype=torch.int64) - image_embeds = visual(pixel_values_on_device, - grid_thw=image_grid_thw_on_device) + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + image_grid_thw_on_device = image_grid_thw.to(visual.device, + dtype=torch.int64) + return visual(pixel_values_on_device, + grid_thw=image_grid_thw_on_device) + + image_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches result: List[Qwen2VLPromptImageEmbeddingInput] = [] @@ -124,11 +125,10 @@ def batch_make_image_embeddings( for image_batch in image_batches_: cur_batch_image_count = len(image_batch) merge_size = image_processor.merge_size - cur_batch_embed_len = sum([ - grid_thw.prod() // merge_size // merge_size + cur_batch_embed_len = sum( + grid_thw.prod(-1) // merge_size // merge_size for grid_thw in image_grid_thw[image_counter:image_counter + - cur_batch_image_count] - ]) + cur_batch_image_count]) result.append({ "image_embeds": @@ -151,7 +151,7 @@ def batch_make_image_embeddings( def batch_make_video_embeddings( video_batches: PromptVideoInput, processor, - llm: LLM) -> List[Qwen2VLPromptVideoEmbeddingInput]: + llm: VllmRunner) -> List[Qwen2VLPromptVideoEmbeddingInput]: """batched video embeddings for Qwen2-VL A NDArray represents a single video's all frames. 
@@ -187,17 +187,19 @@ def batch_make_video_embeddings( pixel_values = preprocess_result["pixel_values_videos"] video_grid_thw = preprocess_result["video_grid_thw"] - # pixel values to embeddinds & grid_thws - with torch.no_grad(): - visual = llm.llm_engine.model_executor.driver_worker.\ - model_runner.model.visual + # pixel values to embeddings & grid_thws + def get_image_embeds(model): + with torch.no_grad(): + visual = model.visual - pixel_values_on_device = pixel_values.to(visual.device, - dtype=visual.dtype) - video_grid_thw_on_device = video_grid_thw.to(visual.device, - dtype=torch.int64) - video_embeds = visual(pixel_values_on_device, - grid_thw=video_grid_thw_on_device) + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + video_grid_thw_on_device = video_grid_thw.to(visual.device, + dtype=torch.int64) + return visual(pixel_values_on_device, + grid_thw=video_grid_thw_on_device) + + video_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches result: List[Qwen2VLPromptVideoEmbeddingInput] = [] @@ -206,11 +208,10 @@ def batch_make_video_embeddings( for video_batch in video_batches_: cur_batch_video_count = len(video_batch) merge_size = image_processor.merge_size - cur_batch_embed_len = sum([ - grid_thw.prod() // merge_size // merge_size + cur_batch_embed_len = sum( + grid_thw.prod(-1) // merge_size // merge_size for grid_thw in video_grid_thw[video_counter:video_counter + - cur_batch_video_count] - ]) + cur_batch_video_count]) result.append({ "video_embeds": @@ -280,9 +281,9 @@ def run_embedding_input_test( max_tokens, num_logprobs=num_logprobs, images=batch_make_image_embeddings( - images, processor, vllm_model.model) if images else None, + images, processor, vllm_model) if images else None, videos=batch_make_video_embeddings( - videos, processor, vllm_model.model) if videos else None) + videos, processor, vllm_model) if videos else None) for prompts, images, videos in inputs ] @@ -427,130 +428,3 @@ 
def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model, mm_limit=1, tensor_parallel_size=1, ) - - -def run_chunked_prefill_test( - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - mm_limit: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Compare inference result between - chunked prefill disabled and chunked prefill enabled - """ - - # NOTE: - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - task="generate", - max_model_len=4000, - max_num_seqs=4, - dtype=dtype, - limit_mm_per_prompt={ - "image": mm_limit, - "video": mm_limit - }, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend - ) as vllm_model: - - outputs_per_case = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images or None, - videos=videos or None) - for prompts, images, videos in inputs - ] - - with vllm_runner( - model, - task="generate", - max_model_len=4000, - max_num_seqs=4, - dtype=dtype, - limit_mm_per_prompt={ - "image": mm_limit, - "video": mm_limit - }, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enable_chunked_prefill=True, - # should be small enough to ensure prefilling is chunked - max_num_batched_tokens=32, - mm_processor_kwargs={ - "max_pixels": 16 * 28 * 28, - }) as vllm_model_chunked: - outputs_per_case_chunked = [ - vllm_model_chunked.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images or None, - videos=videos or None) for prompts, images, videos in inputs - ] - - for outputs, \ - outputs_chunked \ - in zip(outputs_per_case, - outputs_per_case_chunked): - check_logprobs_close( - outputs_0_lst=outputs, - outputs_1_lst=outputs_chunked, - 
name_0="non_chunked", - name_1="chunked", - ) - - -@pytest.mark.core_model -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [1]) -@pytest.mark.parametrize("num_logprobs", [10]) -def test_qwen2_vl_mrope_chunked_prefill(vllm_runner, example_prompts, - model: str, dtype: str, - max_tokens: int, - num_logprobs: int) -> None: - """ - Test Qwen2-VL's chunked prefill with M-RoPE - """ - prompts = [ - qwen2_vl_chat_template(IMAGE_PLACEHOLDER, prompt) - for prompt in example_prompts[:1] - ] - - # 1. Qwen2-VL's M-RoPE works only when there are some multi-modal inputs, - # so an image is included in the inputs - # 2. however, Qwen2-VL currently won't work properly - # when chunked prefill is enabled and there are some multi-modal inputs, - # here use a hacky way: provide a **zero-length** image to make it happy - # - # and finally we achieved: - # (1) chunked_prefill enabled; (2) M-RoPE works; to continue our tests - zero_len_image = { - "image_embeds": torch.empty((0, MODEL_HIDDEN_SIZE)), - "image_grid_thw": torch.tensor([[0, 0, 0]]) - } - images = [zero_len_image] * len(prompts) - - inputs_per_case: List[Tuple[List[str], PromptImageInput, - PromptVideoInput]] = [ - (prompts, images, []), - ] - - run_chunked_prefill_test( - vllm_runner, - inputs_per_case, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=1, - tensor_parallel_size=1, - ) diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 3eca8fb9dc..1ca85c7bb2 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -5,17 +5,20 @@ typically specific to a small subset of models. 
import re import types from pathlib import PosixPath -from typing import Callable, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch from PIL.Image import Image -from transformers import AutoConfig, AutoTokenizer, BatchEncoding +from transformers import (AutoConfig, AutoTokenizer, BatchEncoding, + GenerationConfig) from vllm.sequence import SampleLogprobs from vllm.transformers_utils.tokenizer import patch_padding_side from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -from .....conftest import HfRunner, ImageAsset, _ImageAssets +from .....conftest import (HfRunner, ImageAsset, PromptAudioInput, + PromptImageInput, PromptVideoInput, _ImageAssets) +from ....utils import TokensTextLogprobs from .types import RunnerOutput @@ -180,6 +183,14 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, ####### Post-processors for HF outputs +def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, + model: str) -> RunnerOutput: + output_ids, output_str, out_logprobs = hf_output + if output_str.endswith("<|end▁of▁sentence|>"): + output_str = output_str.split("<|end▁of▁sentence|>")[0] + return output_ids, output_str, out_logprobs + + def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput: output_ids, output_str, out_logprobs = hf_output @@ -222,6 +233,11 @@ def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str): return {"model_inputs": hf_inputs} +def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str): + hf_inputs = cast_dtype_post_processor("images")(hf_inputs, dtype) + return {k: v.unsqueeze(0) for k, v in hf_inputs.items()} + + ####### Prompt path encoders for models that need models on disk def qwen_prompt_path_encoder( tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset], @@ -253,6 +269,34 @@ def qwen_prompt_path_encoder( ####### Model-specific HuggingFace runner patchers +def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner: +
"""Patches and returns an HfRunner instance to use for DeepSeek-VL2.""" + hf_processor = hf_model.processor + + def processor(*args, text="", images=None, **kwargs): + if isinstance(images, Image): + images = [images] + # inputs is a custom class instead of dict or BatchFeature + inputs = hf_processor( + *args, + prompt=text, + images=images, + **kwargs, + ) + inputs = { + k: inputs[k] + for k in inputs.keys() # noqa + if k not in ("seq_lens", "sft_format") + } + inputs = BatchEncoding(data=inputs, tensor_type="pt") + return inputs + + hf_model.processor = processor + hf_model.model.get_output_embeddings = lambda: \ + hf_model.model.language.model.embed_tokens + return hf_model + + def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner: """Patches and returns an instance of the HfRunner to use for GLM4.""" hf_processor = hf_model.processor @@ -451,3 +495,88 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner: hf_model.model.generate = types.MethodType(_generate, hf_model.model) return hf_model + + +def _generate_greedy_logprobs_limit( + self, + prompts: List[str], + max_tokens: int, + num_logprobs: int, + images: Optional[PromptImageInput] = None, + audios: Optional[PromptAudioInput] = None, + videos: Optional[PromptVideoInput] = None, + **kwargs: Any, +) -> List[TokensTextLogprobs]: + all_inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + # Process in batches for inference.
+ if len(all_inputs): + input_ids_lst = [] + images_lst = [] + images_input_idx_lst = [] + imges_masks_lst = [] + for inputs in all_inputs: + input_ids_lst.append(inputs["input_ids"]) + images_lst.append(inputs["images"]) + images_input_idx_lst.append(inputs["image_input_idx"]) + imges_masks_lst.append(inputs["image_masks"]) + batch_inputs = {} + batch_inputs['input_ids'] = torch.cat(input_ids_lst, dim=0) + batch_inputs['images'] = torch.cat(images_lst, dim=0) + batch_inputs['image_input_idx'] = torch.cat(images_input_idx_lst, + dim=0) + batch_inputs['image_masks'] = torch.cat(imges_masks_lst, dim=0) + + outputs = self.model.generate_from_batch( + batch=self.wrap_device(batch_inputs, + device=self.model.device.type), + generation_config=GenerationConfig( + max_new_tokens=max_tokens, + stop_strings="<|endoftext|>", + do_sample=False, + ), + tokenizer=self.tokenizer, + output_hidden_states=True, + return_dict_in_generate=True, + ) + + all_logprobs: List[List[Dict[int, float]]] = [] + all_output_ids: List[List[int]] = [] + all_output_strs: List[str] = [] + + for index in range(len(all_inputs)): + ( + seq_logprobs_lst, + output_len, + ) = self._hidden_states_to_logprobs(outputs.hidden_states, + num_logprobs) + all_logprobs.append(seq_logprobs_lst) + seq_ids = outputs.sequences[index] + output_ids = seq_ids[-output_len:] + all_output_ids.append(output_ids.tolist()) + all_output_strs.append(self.tokenizer.decode(output_ids)) + outputs = zip(all_output_ids, all_output_strs, all_logprobs) + return [(output_ids, output_str, output_logprobs) + for output_ids, output_str, output_logprobs in outputs] + + +####### Molmo-specific HuggingFace runner patchers +def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + """Patches and returns an instance of the HfRunner to use for Molmo.""" + hf_processor = hf_model.processor + + def _processor(*args, **kwargs): + return hf_processor.process(*args, **kwargs) + + hf_model.processor = _processor + + setattr( # noqa: B010 + hf_model, 
+ "generate_greedy_logprobs_limit", + types.MethodType(_generate_greedy_logprobs_limit, hf_model), + ) + + return hf_model diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index 6673a9fc22..0cbe4afe96 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -24,10 +24,13 @@ def test_classification_models( ) -> None: with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.classify(example_prompts) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) with hf_runner(model, dtype=dtype, diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index f458ef5ef5..e17198e385 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -15,15 +15,18 @@ from ..utils import check_embeddings_close # [Encoder-only] pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("intfloat/multilingual-e5-large"), - # [Encoder-decoder] - pytest.param("intfloat/e5-mistral-7b-instruct", - marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + # [Decoder-only] pytest.param("BAAI/bge-multilingual-gemma2", marks=[pytest.mark.core_model]), - pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + pytest.param("intfloat/e5-mistral-7b-instruct", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"), + pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + # [Encoder-decoder] + 
pytest.param("sentence-transformers/stsb-roberta-base-v2"), ], ) @pytest.mark.parametrize("dtype", ["half"]) @@ -59,10 +62,13 @@ def test_models( max_model_len=None, **vllm_extra_kwargs) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) check_embeddings_close( embeddings_0_lst=hf_outputs, diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py b/tests/models/multimodal/__init__.py similarity index 100% rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py rename to tests/models/multimodal/__init__.py diff --git a/tests/models/multimodal/processing/__init__.py b/tests/models/multimodal/processing/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py new file mode 100644 index 0000000000..fe5b733c75 --- /dev/null +++ b/tests/models/multimodal/processing/test_common.py @@ -0,0 +1,202 @@ +from functools import partial + +import numpy as np +import pytest +from PIL import Image + +from vllm.config import ModelConfig +from vllm.inputs import InputProcessingContext +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.processing import ProcessingCache +from vllm.multimodal.utils import cached_get_tokenizer + +from ....multimodal.utils import random_audio, random_image, random_video +from ...registry import HF_EXAMPLE_MODELS + + +def _test_processing_correctness( + model_id: str, + modalities: dict[str, bool], + hit_rate: float, + num_batches: int, + simplify_rate: float, +): + model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) + model_info.check_available_online(on_fail="skip") + 
model_info.check_transformers_version(on_fail="skip") + + limit_mm_per_prompt = { + modality: 3 if supports_multi else 1 + for modality, supports_multi in modalities.items() + } + + model_config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=model_info.trust_remote_code, + seed=0, + dtype="float16", + revision=None, + hf_overrides=model_info.hf_overrides, + limit_mm_per_prompt=limit_mm_per_prompt, + ) + + model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) + factories = MULTIMODAL_REGISTRY._processor_factories[model_cls] + ctx = InputProcessingContext( + model_config, + tokenizer=cached_get_tokenizer(model_config.tokenizer), + ) + # Ensure that it can fit all of the data + cache = ProcessingCache(capacity=1 << 30) + + baseline_processor = factories.build_processor(ctx, cache=None) + cached_processor = factories.build_processor(ctx, cache=cache) + dummy_inputs = baseline_processor.dummy_inputs + tokenizer = baseline_processor.info.get_tokenizer() + + rng = np.random.RandomState(0) + + input_to_hit = { + "image": Image.new("RGB", size=(128, 128)), + "video": np.zeros((4, 128, 128, 3), dtype=np.uint8), + "audio": (np.zeros((512, )), 16000), + } + input_factory = { + "image": + partial(random_image, rng, min_wh=128, max_wh=256), + "video": + partial(random_video, + rng, + min_frames=2, + max_frames=8, + min_wh=128, + max_wh=256), + "audio": + partial(random_audio, rng, min_len=512, max_len=1024, sr=16000), + } + + for batch_idx in range(num_batches): + mm_data = { + k: + [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) + for _ in range(rng.randint(limit_mm_per_prompt[k]))] + for k in modalities + } + + mm_counts = {k: len(vs) for k, vs in mm_data.items()} + prompt = dummy_inputs.get_dummy_processor_inputs( + model_config.max_model_len, + mm_counts, + ).prompt_text + + # Drop unnecessary keys and test single -> multi conversion + if rng.rand() < simplify_rate: + for k in 
list(mm_data.keys()): + if not mm_data[k]: + del mm_data[k] + elif len(mm_data[k]) == 1: + mm_data[k] = mm_data[k][0] + + baseline_result = baseline_processor.apply( + prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + cached_result = cached_processor.apply( + prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert baseline_result == cached_result, ( + f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") + + baseline_tokenized_result = baseline_processor.apply( + tokenizer.encode(prompt), + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert baseline_result == baseline_tokenized_result, ( + f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") + + cached_tokenized_result = cached_processor.apply( + tokenizer.encode(prompt), + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert cached_result == cached_tokenized_result, ( + f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") + + +# yapf: disable +# True if the model supports multiple data items of the modality per request +@pytest.mark.parametrize(("model_id", "modalities"), [ + ("rhymes-ai/Aria", {"image": True}), + ("Salesforce/blip2-opt-2.7b", {"image": False}), + ("facebook/chameleon-7b", {"image": False}), + ("deepseek-ai/deepseek-vl2-tiny", {"image": True}), + ("adept/fuyu-8b", {"image": False}), + ("llava-hf/llava-1.5-7b-hf", {"image": True}), + ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}), + ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}), + ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}), # noqa: E501 + ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), + ("mistral-community/pixtral-12b", {"image": True}), + ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), + ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}), + ("fixie-ai/ultravox-v0_3", {"audio": True}), +]) +@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) +@pytest.mark.parametrize("num_batches", [32]) +@pytest.mark.parametrize("simplify_rate", [1.0]) +# 
yapf: enable +def test_processing_correctness( + model_id: str, + modalities: dict[str, bool], + hit_rate: float, + num_batches: int, + simplify_rate: float, +): + _test_processing_correctness( + model_id, + modalities, + hit_rate=hit_rate, + num_batches=num_batches, + simplify_rate=simplify_rate, + ) + + +# yapf: disable +@pytest.mark.parametrize(("model_id", "modalities"), [ + ("microsoft/Phi-3-vision-128k-instruct", {"image": True}), +]) +@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) +@pytest.mark.parametrize("num_batches", [32]) +@pytest.mark.parametrize("simplify_rate", [1.0]) +# yapf: enable +def test_processing_correctness_phi3v( + model_id: str, + modalities: dict[str, bool], + hit_rate: float, + num_batches: int, + simplify_rate: float, +): + # HACK - this is an attempted workaround for the following bug + # https://github.com/huggingface/transformers/issues/34307 + from transformers import AutoImageProcessor # noqa: F401 + from transformers import AutoProcessor # noqa: F401 + + AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True) + + _test_processing_correctness( + model_id, + modalities, + hit_rate=hit_rate, + num_batches=num_batches, + simplify_rate=simplify_rate, + ) diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py similarity index 98% rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py rename to tests/models/multimodal/processing/test_idefics3.py index c71a2d3590..69b91ad4a5 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py +++ b/tests/models/multimodal/processing/test_idefics3.py @@ -8,8 +8,8 @@ from transformers import AutoImageProcessor, AutoTokenizer from vllm.inputs import InputContext, token_inputs from vllm.multimodal import MultiModalRegistry -from .....conftest import _ImageAssets -from ....utils import build_model_context +from ....conftest 
import _ImageAssets +from ...utils import build_model_context models = ["HuggingFaceM4/Idefics3-8B-Llama3"] diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py similarity index 98% rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py rename to tests/models/multimodal/processing/test_internvl.py index af0c2aa211..d6c60595ca 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -7,8 +7,8 @@ from transformers import AutoTokenizer from vllm.inputs import InputContext, token_inputs from vllm.multimodal import MultiModalRegistry -from .....conftest import _ImageAssets -from ....utils import build_model_context +from ....conftest import _ImageAssets +from ...utils import build_model_context models = ["OpenGVLab/InternVL2-2B"] diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py new file mode 100644 index 0000000000..6de649f872 --- /dev/null +++ b/tests/models/multimodal/processing/test_llava_next.py @@ -0,0 +1,193 @@ +import itertools +from functools import partial + +import pytest +from PIL import Image +from pqdm.threads import pqdm + +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.parse import ImageSize +from vllm.multimodal.processing import BaseMultiModalProcessor +from vllm.multimodal.utils import cached_get_tokenizer + +from ...utils import build_model_context + + +def _validate_image_max_tokens_one( + processor: BaseMultiModalProcessor, + max_tokens: int, + failed_size_excs: list[tuple[ImageSize, Exception]], + image_size: ImageSize, +) -> None: + info = processor.info + feature_size = info.get_num_image_tokens(image_width=image_size.width, + image_height=image_size.height) + + try: + assert feature_size <= max_tokens, f"{feature_size} <= 
{max_tokens}" + except Exception as exc: + failed_size_excs.append((image_size, exc)) + + +@pytest.mark.skip("This test takes around 5 minutes to run. " + "Comment this out to run it manually.") +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +def test_processor_max_tokens(model_id): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": 1}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) + info = processor.info + + seen_aspect_ratios = set[float]() + image_sizes = list[ImageSize]() + + # The aspect ratio of the grid layout is between 1 and 2 + # NOTE: Assumes that feature size calculation is the same if we + # swap the width and height of the image + for w, h in itertools.product(range(32, 4096), repeat=2): + aspect_ratio = w / h + if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios: + image_sizes.append(ImageSize(w, h)) + seen_aspect_ratios.add(aspect_ratio) + + failed_size_excs = list[tuple[ImageSize, Exception]]() + + validate_one = partial( + _validate_image_max_tokens_one, + processor, + info.get_max_image_tokens(), # type: ignore + failed_size_excs, + ) + pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes") + + if failed_size_excs: + msg = "Found failing image sizes:" \ + + "\n========\n".join(f"[{size}]\n{exc}" + for size, exc in failed_size_excs) + raise AssertionError(msg) + + +def _validate_image_prompt_replacements_one( + processor: BaseMultiModalProcessor, + num_imgs: int, + failed_size_excs: list[tuple[ImageSize, Exception]], + image_size: ImageSize, +) -> None: + prompt = "" * num_imgs + image = Image.new("RGB", size=image_size) + mm_data = {"image": [image] * num_imgs} + + try: + # The processor will throw an error if there is a mismatch + # in the prompt replacements + processed_inputs = 
processor.apply(prompt, mm_data, {}) + + image_placeholders = processed_inputs["mm_placeholders"]["image"] + assert len(image_placeholders) == num_imgs + + first_placeholder = image_placeholders[0] + + # NOTE: There is a BOS token + assert first_placeholder["offset"] == 1 + assert first_placeholder["length"] == ( + len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs + + except Exception as exc: + failed_size_excs.append((image_size, exc)) + + +def _test_image_prompt_replacements( + processor, + *, + num_imgs: int, + image_sizes: list[ImageSize], +) -> None: + """ + Ensure LlavaNextMultiModalProcessor + handles prompt replacement properly for input images. + """ + failed_size_excs = list[tuple[ImageSize, Exception]]() + + validate_one = partial( + _validate_image_prompt_replacements_one, + processor, + num_imgs, + failed_size_excs, + ) + pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes") + + if failed_size_excs: + msg = "Found failing image sizes:" \ + + "\n========\n".join(f"[{size}]\n{exc}" + for size, exc in failed_size_excs) + raise AssertionError(msg) + + +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_prompt_replacements_regression(model_id, num_imgs): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) + + image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), + (488, 183), (2560, 1669)] + image_sizes = [ + size for w, h in image_ratios + for size in [ImageSize(w, h), ImageSize(h, w)] + ] + + _test_image_prompt_replacements( + processor, + num_imgs=num_imgs, + image_sizes=image_sizes, + ) + + +@pytest.mark.skip("This test takes around 2 hours to run. 
" + "Comment this out to run it manually.") +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +@pytest.mark.parametrize("num_imgs", [1]) +def test_processor_prompt_replacements_all(model_id, num_imgs): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) + + seen_aspect_ratios = set[float]() + image_sizes = list[ImageSize]() + + # The aspect ratio of the grid layout is between 1 and 2 + # NOTE: Assumes that feature size calculation is the same if we + # swap the width and height of the image + for w, h in itertools.product(range(64, 1024), repeat=2): + aspect_ratio = w / h + if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios: + image_sizes.append(ImageSize(w, h)) + seen_aspect_ratios.add(aspect_ratio) + + _test_image_prompt_replacements( + processor, + num_imgs=num_imgs, + image_sizes=image_sizes, + ) diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py new file mode 100644 index 0000000000..806437d35e --- /dev/null +++ b/tests/models/multimodal/processing/test_llava_onevision.py @@ -0,0 +1,194 @@ +import itertools +from functools import partial + +import pytest +from PIL import Image +from pqdm.threads import pqdm + +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.parse import ImageSize +from vllm.multimodal.processing import BaseMultiModalProcessor +from vllm.multimodal.utils import cached_get_tokenizer + +from ...utils import build_model_context + + +def _validate_image_max_tokens_one( + processor: BaseMultiModalProcessor, + max_tokens: int, + failed_size_excs: list[tuple[ImageSize, Exception]], + image_size: ImageSize, +) -> None: + info = processor.info + feature_size = 
info.get_num_image_tokens(image_width=image_size.width, + image_height=image_size.height) + + try: + assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}" + except Exception as exc: + failed_size_excs.append((image_size, exc)) + + +@pytest.mark.skip("This test takes around 5 minutes to run. " + "Comment this out to run it manually.") +@pytest.mark.parametrize("model_id", + ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) +def test_processor_max_tokens(model_id): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": 1}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) + info = processor.info + + seen_aspect_ratios = set[float]() + image_sizes = list[ImageSize]() + + # The aspect ratio of the grid layout is between 1 and 6 + # NOTE: Assumes that feature size calculation is the same if we + # swap the width and height of the image + for w, h in itertools.product(range(32, 4096), repeat=2): + aspect_ratio = w / h + if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios: + image_sizes.append(ImageSize(w, h)) + seen_aspect_ratios.add(aspect_ratio) + + failed_size_excs = list[tuple[ImageSize, Exception]]() + + validate_one = partial( + _validate_image_max_tokens_one, + processor, + info.get_max_image_tokens(), # type: ignore + failed_size_excs, + ) + pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes") + + if failed_size_excs: + msg = "Found failing image sizes:" \ + + "\n========\n".join(f"[{size}]\n{exc}" + for size, exc in failed_size_excs) + raise AssertionError(msg) + + +def _validate_image_prompt_replacements_one( + processor: BaseMultiModalProcessor, + num_imgs: int, + failed_size_excs: list[tuple[ImageSize, Exception]], + image_size: ImageSize, +) -> None: + prompt = "" * num_imgs + image = Image.new("RGB", size=image_size) + mm_data = 
{"image": [image] * num_imgs} + + try: + # The processor will throw an error if there is a mismatch + # in the prompt replacements + processed_inputs = processor.apply(prompt, mm_data, {}) + + image_placeholders = processed_inputs["mm_placeholders"]["image"] + assert len(image_placeholders) == num_imgs + + first_placeholder = image_placeholders[0] + + assert first_placeholder["offset"] == 0 + assert first_placeholder["length"] == len( + processed_inputs["prompt_token_ids"]) // num_imgs + except Exception as exc: + failed_size_excs.append((image_size, exc)) + + +def _test_image_prompt_replacements( + processor, + *, + num_imgs: int, + image_sizes: list[ImageSize], +) -> None: + """ + Ensure LlavaOnevisionMultiModalProcessor + handles prompt replacement properly for input images. + """ + failed_size_excs = list[tuple[ImageSize, Exception]]() + + validate_one = partial( + _validate_image_prompt_replacements_one, + processor, + num_imgs, + failed_size_excs, + ) + pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes") + + if failed_size_excs: + msg = "Found failing image sizes:" \ + + "\n========\n".join(f"[{size}]\n{exc}" + for size, exc in failed_size_excs) + raise AssertionError(msg) + + +@pytest.mark.parametrize("model_id", + ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_prompt_replacements_regression(model_id, num_imgs): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) + + image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), + (488, 183), (2560, 1669)] + image_sizes = [ + size for w, h in image_ratios + for size in [ImageSize(w, h), ImageSize(h, w)] + ] + + _test_image_prompt_replacements( + processor, + 
num_imgs=num_imgs, + image_sizes=image_sizes, + ) + + +@pytest.mark.skip("This test takes around 2 hours to run. " + "Comment this out to run it manually.") +@pytest.mark.parametrize("model_id", + ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) +@pytest.mark.parametrize("num_imgs", [1]) +def test_processor_prompt_replacements_all(model_id, num_imgs): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) + + seen_aspect_ratios = set[float]() + image_sizes = list[ImageSize]() + + # The aspect ratio of the grid layout is between 1 and 6 + # NOTE: Assumes that feature size calculation is the same if we + # swap the width and height of the image + for w, h in itertools.product(range(64, 1024), repeat=2): + aspect_ratio = w / h + if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios: + image_sizes.append(ImageSize(w, h)) + seen_aspect_ratios.add(aspect_ratio) + + _test_image_prompt_replacements( + processor, + num_imgs=num_imgs, + image_sizes=image_sizes, + ) diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py new file mode 100644 index 0000000000..7f82a8f18f --- /dev/null +++ b/tests/models/multimodal/processing/test_phi3v.py @@ -0,0 +1,55 @@ +"""Tests for phi3v's multimodal preprocessing kwargs.""" +import pytest + +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.utils import cached_get_tokenizer + +from ....conftest import _ImageAssets +from ...utils import build_model_context + + +@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"]) +# yapf: disable +@pytest.mark.parametrize( + ("mm_processor_kwargs", "expected_toks_per_img"), + [ + ({"num_crops": 4}, 757), + ({"num_crops": 16}, 1921), + # the default num_crops of 
phi-3.5-vision is 4 + ({}, 757), + ]) +# yapf: enable +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_override( + image_assets: _ImageAssets, + model_id: str, + mm_processor_kwargs: dict[str, int], + expected_toks_per_img: int, + num_imgs: int, +): + """Ensure input_processor_for_phi3v handles num_crops properly.""" + # Avoid initializing CUDA early + from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID + + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + trust_remote_code=True, + limit_mm_per_prompt={"image": num_imgs}, + ) + tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=tokenizer, + ) + + # Build the image str / prompt based on the number of images we pass + img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)]) + prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" + mm_data = {"image": [image_assets[0].pil_image] * num_imgs} + + processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) + + # Ensure we have the right number of placeholders per num_crops size + img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID) + assert img_tok_count == expected_toks_per_img * num_imgs diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/multimodal/processing/test_qwen.py similarity index 98% rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py rename to tests/models/multimodal/processing/test_qwen.py index 163220c91a..af0ace711b 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py +++ b/tests/models/multimodal/processing/test_qwen.py @@ -9,8 +9,8 @@ from vllm.inputs import InputContext, token_inputs from vllm.multimodal import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer -from .....conftest import IMAGE_ASSETS -from 
....utils import build_model_context +from ....conftest import IMAGE_ASSETS +from ...utils import build_model_context ### Multimodal preprocessing tests SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py new file mode 100644 index 0000000000..de14fbbffe --- /dev/null +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -0,0 +1,54 @@ +import pytest + +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.utils import cached_get_tokenizer + +from ....conftest import _ImageAssets +from ...utils import build_model_context + + +@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) +# yapf: disable +@pytest.mark.parametrize( + ("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"), [ + ({}, 1426, (5704, 1176)), + ({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)), + ]) +# yapf: enable +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_override( + image_assets: _ImageAssets, + model_id: str, + mm_processor_kwargs: dict[str, object], + expected_toks_per_img: int, + expected_pixels_shape: tuple[int, int], + num_imgs: int, +): + """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly.""" + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=tokenizer, + ) + + # Build the image str / prompt based on the number of images we pass + prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs + mm_data = {"image": [image_assets[0].pil_image] * num_imgs} + + processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) + + # Ensure we have the right number of placeholders per num_crops size + hf_processor = 
processor.info.get_hf_processor(**mm_processor_kwargs) + image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token) + img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) + pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape + + assert img_tok_count == expected_toks_per_img * num_imgs + assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs + assert pixel_shape[1] == expected_pixels_shape[1] diff --git a/tests/models/registry.py b/tests/models/registry.py index dcb8bfa0f9..0bd06dea0e 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -1,5 +1,9 @@ from dataclasses import dataclass, field -from typing import AbstractSet, Mapping, Optional +from typing import AbstractSet, Any, Literal, Mapping, Optional + +import pytest +from packaging.version import Version +from transformers import __version__ as TRANSFORMERS_VERSION @dataclass(frozen=True) @@ -22,6 +26,11 @@ class _HfExamplesInfo: for speculative decoding. """ + min_transformers_version: Optional[str] = None + """ + The minimum version of HF Transformers that is required to run this model. + """ + is_available_online: bool = True """ Set this to ``False`` if the name of this architecture no longer exists on @@ -33,6 +42,50 @@ class _HfExamplesInfo: trust_remote_code: bool = False """The ``trust_remote_code`` level required to load the model.""" + hf_overrides: dict[str, Any] = field(default_factory=dict) + """The ``hf_overrides`` required to load the model.""" + + def check_transformers_version( + self, + *, + on_fail: Literal["error", "skip"], + ) -> None: + """ + If the installed transformers version does not meet the requirements, + perform the given action. 
+ """ + if self.min_transformers_version is None: + return + + current_version = TRANSFORMERS_VERSION + required_version = self.min_transformers_version + if Version(current_version) < Version(required_version): + msg = ( + f"You have `transformers=={current_version}` installed, but " + f"`transformers>={required_version}` is required to run this " + "model") + + if on_fail == "error": + raise RuntimeError(msg) + else: + pytest.skip(msg) + + def check_available_online( + self, + *, + on_fail: Literal["error", "skip"], + ) -> None: + """ + If the model is not available online, perform the given action. + """ + if not self.is_available_online: + msg = "Model is not available online" + + if on_fail == "error": + raise RuntimeError(msg) + else: + pytest.skip(msg) + # yapf: disable _TEXT_GENERATION_EXAMPLE_MODELS = { @@ -43,8 +96,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "ArcticForCausalLM": _HfExamplesInfo("Snowflake/snowflake-arctic-instruct", trust_remote_code=True), - "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria", - trust_remote_code=True), "BaiChuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan-7B", trust_remote_code=True), "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat", @@ -64,6 +115,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "DeepseekV3ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V3", # noqa: E501 trust_remote_code=True), "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501 + "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), "GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), @@ -80,6 +132,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "InternLM2VEForCausalLM": _HfExamplesInfo("OpenGVLab/Mono-InternVL-2B", trust_remote_code=True), + "InternLM3ForCausalLM": 
_HfExamplesInfo("internlm/internlm3-8b-instruct", + trust_remote_code=True), "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"), "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini"), "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"), @@ -147,6 +201,7 @@ _EMBEDDING_EXAMPLE_MODELS = { "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"), "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), + "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"), "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501 "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"), # noqa: E501 "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"), # noqa: E501 @@ -167,6 +222,8 @@ _CROSS_ENCODER_EXAMPLE_MODELS = { _MULTIMODAL_EXAMPLE_MODELS = { # [Decoder-only] + "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria", + min_transformers_version="4.48"), "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501 "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 "ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b", @@ -174,6 +231,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True), "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b", is_available_online=False), + "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501 + hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501 "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"), "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", @@ -184,7 +243,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "LlavaNextForConditionalGeneration": 
_HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"), # noqa: E501 "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 - "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3"), # noqa: E501 + "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3", # noqa: E501 + hf_overrides={"architectures": ["MantisForConditionalGeneration"]}), # noqa: E501 "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", trust_remote_code=True), "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", @@ -201,7 +261,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True), "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 - "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3"), + "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3", + trust_remote_code=True), # [Encoder-decoder] "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501 @@ -237,5 +298,17 @@ class HfExampleModels: def get_hf_info(self, model_arch: str) -> _HfExamplesInfo: return self.hf_models[model_arch] + def find_hf_info(self, model_id: str) -> _HfExamplesInfo: + for info in self.hf_models.values(): + if info.default == model_id: + return info + + # Fallback to extras + for info in self.hf_models.values(): + if any(extra == model_id for extra in info.extras.values()): + return info + + raise ValueError(f"No example model defined for {model_id}") + HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py 
index 3b728f2744..d3a3aaf670 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -11,11 +11,14 @@ from .registry import HF_EXAMPLE_MODELS @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) def test_can_initialize(model_arch): model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) - if not model_info.is_available_online: - pytest.skip("Model is not available online") + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") # Avoid OOM def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: + if hf_config.model_type == "deepseek_vl_v2": + hf_config.update({"architectures": ["DeepseekVLV2ForCausalLM"]}) + if hasattr(hf_config, "text_config"): text_config: PretrainedConfig = hf_config.text_config else: diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 73b70d65e8..ac0366847e 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -21,6 +21,9 @@ from .registry import HF_EXAMPLE_MODELS @pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs()) def test_registry_imports(model_arch): + model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) + model_info.check_transformers_version(on_fail="skip") + # Ensure all model classes can be imported successfully model_cls, _ = ModelRegistry.resolve_model_cls(model_arch) diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index 7203d635c2..8456a463ad 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -16,7 +16,6 @@ NUM_SCHEDULER_STEPS = [8] # Multi-step decoding steps NUM_PROMPTS = [10] DEFAULT_SERVER_ARGS: List[str] = [ - "--disable-log-requests", "--worker-use-ray", "--gpu-memory-utilization", "0.85", @@ -110,7 +109,7 @@ async def test_multi_step( # Spin up client/server & issue completion API requests. 
# Default `max_wait_seconds` is 240 but was empirically - # was raised 3x to 720 *just for this test* due to + # was raised 5x to 1200 *just for this test* due to # observed timeouts in GHA CI ref_completions = await completions_with_server_args( prompts, diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index cc1fd19252..34030d9d6a 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -5,6 +5,8 @@ from typing import Optional import pytest +from tests.kernels.utils import override_backend_env_variable + from ..models.utils import check_logprobs_close, check_outputs_equal MODELS = [ @@ -19,10 +21,11 @@ NUM_PROMPTS = [10] @pytest.mark.parametrize("tp_size", [1]) @pytest.mark.parametrize("enable_chunked_prefill", [False, True]) @pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("enforce_eager", [True]) +@pytest.mark.parametrize("enforce_eager", [True, False]) @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) @pytest.mark.parametrize("num_prompts", NUM_PROMPTS) @pytest.mark.parametrize("num_logprobs", [None, 5]) +@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN", "FLASHINFER"]) def test_multi_step_llm( hf_runner, vllm_runner, @@ -36,6 +39,8 @@ def test_multi_step_llm( num_scheduler_steps: int, num_prompts: int, num_logprobs: Optional[int], + attention_backend: str, + monkeypatch, ) -> None: """Test vLLM engine with multi-step scheduling via sync LLM Engine. @@ -63,6 +68,7 @@ def test_multi_step_llm( num_logprobs: corresponds to the `logprobs` argument to the OpenAI completions endpoint; `None` -> 1 logprob returned. 
""" + override_backend_env_variable(monkeypatch, attention_backend) prompts = example_prompts if len(prompts) < num_prompts: @@ -114,6 +120,7 @@ def test_multi_step_llm( @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) @pytest.mark.parametrize("num_prompts", NUM_PROMPTS) @pytest.mark.parametrize("num_logprobs,num_prompt_logprobs", [(5, 5)]) +@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"]) def test_multi_step_llm_w_prompt_logprobs( vllm_runner, example_prompts, @@ -126,6 +133,8 @@ def test_multi_step_llm_w_prompt_logprobs( num_prompts: int, num_logprobs: Optional[int], num_prompt_logprobs: Optional[int], + attention_backend: str, + monkeypatch, ) -> None: """Test prompt logprobs with multi-step scheduling via sync LLM Engine. @@ -155,6 +164,7 @@ def test_multi_step_llm_w_prompt_logprobs( note that this argument is not supported by the OpenAI completions endpoint. """ + override_backend_env_variable(monkeypatch, attention_backend) prompts = example_prompts if len(prompts) < num_prompts: @@ -205,6 +215,7 @@ def test_multi_step_llm_w_prompt_logprobs( @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) @pytest.mark.parametrize("num_prompts", NUM_PROMPTS) @pytest.mark.parametrize("num_logprobs", [None, 5]) +@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"]) def test_multi_step_llm_chunked_prefill_prefix_cache( vllm_runner, example_prompts, @@ -216,6 +227,8 @@ def test_multi_step_llm_chunked_prefill_prefix_cache( num_scheduler_steps: int, num_prompts: int, num_logprobs: Optional[int], + attention_backend: str, + monkeypatch, ) -> None: """Test vLLM engine with multi-step+"single-step chunked prefill"+APC. @@ -278,6 +291,8 @@ def test_multi_step_llm_chunked_prefill_prefix_cache( # # The Incorrect scheduling behavior - if it occurs - will cause an exception # in the model runner resulting from `do_sample=False`. 
+ override_backend_env_variable(monkeypatch, attention_backend) + assert len(example_prompts) >= 2 challenge_prompts = copy.deepcopy(example_prompts) challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient ' diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index f99d7556b2..13f820d013 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -1,25 +1,29 @@ from contextlib import nullcontext -from functools import partial from typing import cast from unittest.mock import MagicMock import numpy as np import pytest -from PIL import Image from vllm.config import ModelConfig -from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.processing import (ProcessingCache, PromptReplacement, - _PlaceholderInfo, find_text_matches, - find_token_matches, iter_placeholders, +# yapf conflicts with isort for this block +# yapf: disable +from vllm.multimodal.processing import (PlaceholderFeaturesInfo, + PromptReplacement, + find_mm_placeholders, + find_text_matches, find_token_matches, iter_token_matches, replace_text_matches, replace_token_matches) +# yapf: enable +from vllm.multimodal.profiling import MultiModalProfiler from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import full_groupby +from .utils import random_image + # yapf: disable @pytest.mark.parametrize( @@ -314,21 +318,27 @@ def test_find_replace_text( # Should not be used since there is nothing to convert to text mock_tokenizer = cast(AnyTokenizer, object()) - prompt_repls = [ - PromptReplacement(key, target, repl_by_key[key]).bind(mock_tokenizer) + mm_prompt_repls = { + key: [ + PromptReplacement(key, target, + repl_by_key[key]).bind(mock_tokenizer) + ] for key, target in target_by_key.items() - ] - matches = find_text_matches(prompt, prompt_repls) + } + mm_matches = { + key: 
find_text_matches(prompt, prompt_repls) + for key, prompt_repls in mm_prompt_repls.items() + } result = replace_text_matches( prompt, - matches, + mm_matches, {key: mm_count for key in repl_by_key}, ) # Only displayed on error - print("matches:", matches) + print("mm_matches:", mm_matches) print("result:", result) # Manually constructed results @@ -380,21 +390,27 @@ def test_find_replace_tokens( # Should not be used since there is nothing to convert to tokens mock_tokenizer = cast(AnyTokenizer, object()) - prompt_repls = [ - PromptReplacement(key, target, repl_by_key[key]).bind(mock_tokenizer) + mm_prompt_repls = { + key: [ + PromptReplacement(key, target, + repl_by_key[key]).bind(mock_tokenizer) + ] for key, target in target_by_key.items() - ] - matches = find_token_matches(prompt, prompt_repls) + } + mm_matches = { + key: find_token_matches(prompt, prompt_repls) + for key, prompt_repls in mm_prompt_repls.items() + } result = replace_token_matches( prompt, - matches, + mm_matches, {key: mm_count for key in repl_by_key}, ) # Only displayed on error - print("matches:", matches) + print("mm_matches:", mm_matches) print("result:", result) # Manually constructed results @@ -409,6 +425,8 @@ def test_find_replace_tokens( "pattern_1": [32000, 32000], "pattern_2": [], "pattern_3": [1550, 918, 1550], + # Test different modalities having the same tokens (32000) + "pattern_4": [32000], }, ], ) @@ -417,58 +435,93 @@ def test_find_replace_tokens( [ ( [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918], - [ - _PlaceholderInfo( - modality="pattern_1", - start_idx=6, - replacement=[32000, 32000], - ), - ], + { + "pattern_1": [ + PlaceholderFeaturesInfo( + modality="pattern_1", + item_idx=0, + start_idx=6, + tokens=[32000, 32000], + ), + ], + "pattern_4": [ + PlaceholderFeaturesInfo( + modality="pattern_4", + item_idx=0, + start_idx=3, + tokens=[32000], + ), + ], + } + ), ( [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550], - [ - _PlaceholderInfo( - 
modality="pattern_1", - start_idx=1, - replacement=[32000, 32000], - ), - _PlaceholderInfo( - modality="pattern_1", - start_idx=5, - replacement=[32000, 32000], - ), - _PlaceholderInfo( - modality="pattern_3", - start_idx=7, - replacement=[1550, 918, 1550], - ), - ], + { + "pattern_1": [ + PlaceholderFeaturesInfo( + modality="pattern_1", + item_idx=0, + start_idx=1, + tokens=[32000, 32000], + ), + PlaceholderFeaturesInfo( + modality="pattern_1", + item_idx=1, + start_idx=5, + tokens=[32000, 32000], + ), + ], + "pattern_3": [ + PlaceholderFeaturesInfo( + modality="pattern_3", + item_idx=0, + start_idx=7, + tokens=[1550, 918, 1550], + ), + ], + # No match for pattern_4 as it has lower priority than pattern_1 + } ), ( [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550], - [ - _PlaceholderInfo( - modality="pattern_1", - start_idx=1, - replacement=[32000, 32000], - ), - _PlaceholderInfo( - modality="pattern_1", - start_idx=3, - replacement=[32000, 32000], - ), - _PlaceholderInfo( - modality="pattern_3", - start_idx=6, - replacement=[1550, 918, 1550], - ), - ], + { + "pattern_1": [ + PlaceholderFeaturesInfo( + modality="pattern_1", + item_idx=0, + start_idx=1, + tokens=[32000, 32000], + ), + PlaceholderFeaturesInfo( + modality="pattern_1", + item_idx=1, + start_idx=3, + tokens=[32000, 32000], + ), + ], + "pattern_4": [ + PlaceholderFeaturesInfo( + modality="pattern_4", + item_idx=0, + start_idx=5, + tokens=[32000], + ), + ], + "pattern_3": [ + PlaceholderFeaturesInfo( + modality="pattern_3", + item_idx=0, + start_idx=6, + tokens=[1550, 918, 1550], + ), + ], + } ), ] ) # yapf: enable -def test_iter_placeholders( +def test_find_mm_placeholders( repl_by_key, prompt, expected, @@ -476,19 +529,18 @@ def test_iter_placeholders( # Should not be used since there is nothing to convert to tokens mock_tokenizer = cast(AnyTokenizer, object()) - prompt_repls = [ - PromptReplacement(key, [], repl).bind(mock_tokenizer) + mm_prompt_repls = { + key: [PromptReplacement(key, [], 
repl).bind(mock_tokenizer)] for key, repl in repl_by_key.items() - ] + } - result = list( - iter_placeholders( - prompt_repls, - prompt, - # Effectively match all occurrences in the prompt - {key: 3 - for key in repl_by_key}, - )) + result = find_mm_placeholders( + mm_prompt_repls, + prompt, + # Effectively match all occurrences in the prompt + {key: 3 + for key in repl_by_key}, + ) # Only displayed on error print("result:", result) @@ -497,37 +549,6 @@ def test_iter_placeholders( assert result == expected -def _rand_img(rng: np.random.RandomState, min_wh: int, max_wh: int): - w, h = rng.randint(min_wh, max_wh, size=(2, )) - arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8) - return Image.fromarray(arr) - - -def _rand_video( - rng: np.random.RandomState, - min_frames: int, - max_frames: int, - min_wh: int, - max_wh: int, -): - # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 - num_frames = rng.randint(min_frames, max_frames) - num_frames = (num_frames // 2) * 2 - - w, h = rng.randint(min_wh, max_wh, size=(2, )) - return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8) - - -def _rand_audio( - rng: np.random.RandomState, - min_len: int, - max_len: int, - sr: int, -): - audio_len = rng.randint(min_len, max_len) - return rng.rand(audio_len), sr - - @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize( ("limit", "num_supported", "is_valid"), @@ -548,18 +569,15 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): revision=None, limit_mm_per_prompt=limit_mm_per_prompt, ) - model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] - ctx = InputProcessingContext( + processor = MULTIMODAL_REGISTRY.create_processor( model_config, tokenizer=cached_get_tokenizer(model_config.tokenizer), ) - - processor = processor_factory(ctx, cache=None) + profiler = 
MultiModalProfiler(processor) mock_supported_mm_limits = MagicMock(return_value={"image": num_supported}) - processor.get_supported_mm_limits = mock_supported_mm_limits + processor.info.get_supported_mm_limits = mock_supported_mm_limits if is_valid: exc_ctx = nullcontext() @@ -567,7 +585,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): exc_ctx = pytest.raises(ValueError, match="this model only supports") with exc_ctx: - processor._get_and_validate_dummy_mm_counts() + profiler.get_dummy_data(model_config.max_model_len) @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @@ -590,18 +608,14 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): revision=None, limit_mm_per_prompt=limit_mm_per_prompt, ) - model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] - ctx = InputProcessingContext( + processor = MULTIMODAL_REGISTRY.create_processor( model_config, tokenizer=cached_get_tokenizer(model_config.tokenizer), ) - processor = processor_factory(ctx, cache=None) - rng = np.random.RandomState(0) - image = _rand_img(rng, min_wh=128, max_wh=256) + image = random_image(rng, min_wh=128, max_wh=256) if num_images == 0: mm_data = {} elif num_images == 1: @@ -620,166 +634,3 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): mm_data=mm_data, hf_processor_mm_kwargs={}, ) - - -def _test_processing_cache_correctness( - model_id: str, - modalities: dict[str, bool], - hit_rate: float, - num_batches: int, - simplify_rate: float, -): - if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3": - hf_overrides = {"architectures": ["MantisForConditionalGeneration"]} - else: - hf_overrides = {} - - limit_mm_per_prompt = { - modality: 3 if supports_multi else 1 - for modality, supports_multi in modalities.items() - } - - model_config = ModelConfig( - model_id, - task="auto", - tokenizer=model_id, - 
tokenizer_mode="auto", - trust_remote_code=True, - seed=0, - dtype="float16", - revision=None, - hf_overrides=hf_overrides, - limit_mm_per_prompt=limit_mm_per_prompt, - ) - model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - - processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] - ctx = InputProcessingContext( - model_config, - tokenizer=cached_get_tokenizer(model_config.tokenizer), - ) - # Ensure that it can fit all of the data - cache = ProcessingCache(capacity=1 << 30) - - baseline_processor = processor_factory(ctx, cache=None) - cached_processor = processor_factory(ctx, cache=cache) - - rng = np.random.RandomState(0) - - input_to_hit = { - "image": Image.new("RGB", size=(128, 128)), - "video": np.zeros((4, 128, 128, 3), dtype=np.uint8), - "audio": (np.zeros((512, )), 16000), - } - input_factory = { - "image": - partial(_rand_img, rng, min_wh=128, max_wh=256), - "video": - partial(_rand_video, - rng, - min_frames=2, - max_frames=8, - min_wh=128, - max_wh=256), - "audio": - partial(_rand_audio, rng, min_len=512, max_len=1024, sr=16000), - } - - for batch_idx in range(num_batches): - mm_data = { - k: - [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) - for _ in range(rng.randint(limit_mm_per_prompt[k]))] - for k in modalities - } - - mm_counts = {k: len(vs) for k, vs in mm_data.items()} - prompt = baseline_processor._get_dummy_mm_inputs(mm_counts).prompt_text - - # Drop unnecessary keys and test single -> multi conversion - if rng.rand() < simplify_rate: - for k in list(mm_data.keys()): - if not mm_data[k]: - del mm_data[k] - elif len(mm_data[k]) == 1: - mm_data[k] = mm_data[k][0] - - baseline_result = baseline_processor.apply( - prompt, - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) - cached_result = cached_processor.apply( - prompt, - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) - - assert baseline_result == cached_result, ( - f"Failed ({batch_idx=}, {mm_data=})") - - -# yapf: disable -# True if the 
model supports multiple data items of the modality per request -@pytest.mark.parametrize(("model_id", "modalities"), [ - ("rhymes-ai/Aria", {"image": True}), - ("Salesforce/blip2-opt-2.7b", {"image": False}), - ("facebook/chameleon-7b", {"image": False}), - ("adept/fuyu-8b", {"image": False}), - ("llava-hf/llava-1.5-7b-hf", {"image": True}), - ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}), - ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), - ("mistral-community/pixtral-12b", {"image": True}), - ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), - ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}), - ("fixie-ai/ultravox-v0_3", {"audio": True}), -]) -@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) -@pytest.mark.parametrize("num_batches", [32]) -@pytest.mark.parametrize("simplify_rate", [1.0]) -# yapf: enable -def test_processing_cache_correctness( - model_id: str, - modalities: dict[str, bool], - hit_rate: float, - num_batches: int, - simplify_rate: float, -): - _test_processing_cache_correctness( - model_id, - modalities, - hit_rate=hit_rate, - num_batches=num_batches, - simplify_rate=simplify_rate, - ) - - -# yapf: disable -@pytest.mark.parametrize(("model_id", "modalities"), [ - ("microsoft/Phi-3-vision-128k-instruct", {"image": True}), -]) -@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) -@pytest.mark.parametrize("num_batches", [32]) -@pytest.mark.parametrize("simplify_rate", [1.0]) -# yapf: enable -def test_processing_cache_correctness_phi3v( - model_id: str, - modalities: dict[str, bool], - hit_rate: float, - num_batches: int, - simplify_rate: float, -): - # HACK - this is an attempted workaround for the following bug - # https://github.com/huggingface/transformers/issues/34307 - from transformers import AutoImageProcessor # noqa: F401 - from transformers import AutoProcessor # noqa: F401 - - AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True) - - _test_processing_cache_correctness( - model_id, - 
modalities, - hit_rate=hit_rate, - num_batches=num_batches, - simplify_rate=simplify_rate, - ) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 6029f2e514..198344e5bd 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -2,16 +2,22 @@ import base64 import mimetypes import os from tempfile import NamedTemporaryFile, TemporaryDirectory -from typing import Dict, Tuple +from typing import TYPE_CHECKING, Dict, NamedTuple, Optional, Tuple import numpy as np import pytest from PIL import Image, ImageChops from transformers import AutoConfig, AutoTokenizer +from vllm.multimodal.inputs import PlaceholderRange from vllm.multimodal.utils import (MediaConnector, + merge_and_sort_multimodal_metadata, repeat_and_pad_placeholder_tokens) +if TYPE_CHECKING: + from vllm.multimodal.hasher import MultiModalHashDict + from vllm.multimodal.inputs import MultiModalPlaceholderDict + # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_URLS = [ "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", @@ -191,3 +197,204 @@ def test_repeat_and_pad_placeholder_tokens(model): assert new_prompt == expected_prompt assert new_token_ids == expected_token_ids assert ranges == expected_ranges + + +# Used for the next two tests related to `merge_and_sort_multimodal_metadata`. 
+class TestCase(NamedTuple): + mm_positions: "MultiModalPlaceholderDict" + mm_hashes: Optional["MultiModalHashDict"] + expected_modalities: list[str] + expected_ranges: list[PlaceholderRange] + expected_hashes: Optional[list[str]] + + +def test_merge_and_sort_multimodal_metadata(): + + test_cases = [ + # Single modality should return result as is but flattened + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=3, length=2), + ] + }, + mm_hashes={"image": ["hash1", "hash2"]}, + expected_modalities=["image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=3, length=2), + ], + expected_hashes=["hash1", "hash2"], + ), + + # Single modality without hashes return None for mm hash. + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=2), + ] + }, + mm_hashes=None, + expected_modalities=["image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=2), + ], + expected_hashes=None, + ), + + # Multiple modalities with hashes should return sorted modalities + # and flattened ranges and hashes. + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=7, length=4), + PlaceholderRange(offset=11, length=5), + ], + "audio": [ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=3), + ] + }, + mm_hashes={ + "image": ["image_hash1", "image_hash2"], + "audio": ["audio_hash1", "audio_hash2"], + }, + expected_modalities=["audio", "image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=3), + PlaceholderRange(offset=7, length=4), + PlaceholderRange(offset=11, length=5), + ], + expected_hashes=[ + "audio_hash1", "audio_hash2", "image_hash1", "image_hash2" + ], + ), + + # Multiple modalities without hashes should return sorted modalities + # and flattened ranges and None. 
+ TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=7, length=4), + PlaceholderRange(offset=11, length=5), + ], + "audio": [ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=3), + ] + }, + mm_hashes=None, + expected_modalities=["audio", "image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=3), + PlaceholderRange(offset=7, length=4), + PlaceholderRange(offset=11, length=5), + ], + expected_hashes=None, + ), + + # Three modalities + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=15, length=7), + PlaceholderRange(offset=22, length=8), + ], + "audio": [ + PlaceholderRange(offset=0, length=2), + ], + "video": [ + PlaceholderRange(offset=3, length=4), + PlaceholderRange(offset=7, length=5), + PlaceholderRange(offset=12, length=6), + ] + }, + mm_hashes={ + "image": ["image_hash1", "image_hash2"], + "audio": ["audio_hash1"], + "video": ["video_hash1", "video_hash2", "video_hash3"] + }, + expected_modalities=["audio", "video", "image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=3, length=4), + PlaceholderRange(offset=7, length=5), + PlaceholderRange(offset=12, length=6), + PlaceholderRange(offset=15, length=7), + PlaceholderRange(offset=22, length=8), + ], + expected_hashes=[ + "audio_hash1", "video_hash1", "video_hash2", "video_hash3", + "image_hash1", "image_hash2" + ], + ), + ] + + for (mm_positions, mm_hashes, expected_modalities, expected_ranges, + expected_hashes) in test_cases: + modalities, ranges, hashes = merge_and_sort_multimodal_metadata( + mm_positions, mm_hashes) + + assert modalities == expected_modalities + assert ranges == expected_ranges + assert hashes == expected_hashes + + +def test_merge_and_sort_multimodal_metadata_with_interleaving(): + + test_cases = [ + + # ", - replacement="" * max_image_tokens + "", - ) - ] - - def apply( - self, - prompt_text: str, - mm_data: MultiModalDataDict, - 
hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) - - # Only tokens should be considered as placeholders, - # so we ignore the trailing bos_token - result["mm_placeholders"] = { - modality: [ - PlaceholderRange(offset=p["offset"], length=p["length"] - 1) - for p in ps - ] - for modality, ps in result["mm_placeholders"].items() - } - - return result - - def _get_dummy_mm_inputs( + def get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - hf_config = self.ctx.get_hf_config(Blip2Config) + hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config max_image_size = vision_config.image_size @@ -480,7 +439,67 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor): ) -@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor) +class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + if not mm_data: + # HF processor always adds placeholders even when there's no image + tokenizer = self.info.get_tokenizer() + prompt_ids = tokenizer.encode(prompt) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + return super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + tokenizer = self.info.get_tokenizer() + vocab = 
tokenizer.get_vocab() + + bos_token_id = tokenizer.bos_token_id + assert isinstance(bos_token_id, int) + + image_token_id = vocab["image"] + num_image_tokens = self.info.get_num_image_tokens() + image_tokens = [image_token_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target="", + replacement=PromptReplacementDetails( + full=image_tokens + [bos_token_id], + features=image_tokens, + ), + ) + ] + + +@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor, + info=Blip2ProcessingInfo, + dummy_inputs=Blip2DummyInputsBuilder) class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 0bd0194243..e834c9004f 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -11,6 +11,7 @@ from transformers import (BatchFeature, ChameleonConfig, ChameleonProcessor, from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -27,20 +28,22 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - NestedTensors, PlaceholderRange) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import 
MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, - PromptReplacement) + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from vllm.utils import print_warning_once from .interfaces import SupportsMultiModal, SupportsPP from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, merge_multimodal_embeddings) +logger = init_logger(__name__) + class ChameleonImagePixelInputs(TypedDict): type: Literal["pixel_values"] @@ -48,53 +51,34 @@ class ChameleonImagePixelInputs(TypedDict): """Shape: `(batch_size * num_images, num_channels, height, width)`""" -class ChameleonMultiModalProcessor(BaseMultiModalProcessor): +class ChameleonProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config(ChameleonConfig) + + def get_hf_processor(self): + return self.ctx.get_hf_processor(ChameleonProcessor) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} - def _get_num_image_tokens(self) -> int: - processor = self._get_hf_processor() + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return {"image": self.get_num_image_tokens()} + + def get_num_image_tokens(self) -> int: + processor = self.get_hf_processor() return processor.image_seq_length - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: - return {"image": self._get_num_image_tokens()} - def _get_hf_processor(self) -> ChameleonProcessor: - return self.ctx.get_hf_processor(ChameleonProcessor) +class ChameleonDummyInputsBuilder( + BaseDummyInputsBuilder[ChameleonProcessingInfo]): - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return 
dict(pixel_values=MultiModalFieldConfig.batched("image")) - - def _get_prompt_replacements( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, - ) -> list[PromptReplacement]: - processor = self._get_hf_processor() - - return [ - PromptReplacement( - modality="image", - target="", - replacement="".join([ - processor.image_start_token, - processor.image_token * self._get_num_image_tokens(), - processor.image_end_token, - ]), - ) - ] - - def _get_dummy_mm_inputs( + def get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - config = self.ctx.get_hf_config(ChameleonConfig) + config = self.info.get_hf_config() width = height = config.vq_config.resolution num_images = mm_counts.get("image", 0) @@ -111,25 +95,73 @@ class ChameleonMultiModalProcessor(BaseMultiModalProcessor): mm_data=mm_data, ) - def apply( + +class ChameleonMultiModalProcessor( + BaseMultiModalProcessor[ChameleonProcessingInfo]): + + def _call_hf_processor( self, - prompt_text: str, - mm_data: MultiModalDataDict, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + if not mm_data: + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + return super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + def _apply_hf_processor_tokens_only( + self, + prompt_tokens: list[int], + ) -> list[int]: + # HF processor adds sep token for chat mode + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + + sep_token_id = vocab[tokenizer.sep_token] # type: ignore + + return prompt_tokens + [sep_token_id] + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - result = 
super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(pixel_values=MultiModalFieldConfig.batched("image")) - # Only tokens should be considered as placeholders, - # so we ignore the image_start_token and image_end_token - result["mm_placeholders"] = { - modality: [ - PlaceholderRange(offset=p["offset"] + 1, - length=p["length"] - 2) for p in ps - ] - for modality, ps in result["mm_placeholders"].items() - } + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() - return result + image_start_id = vocab[processor.image_start_token] + image_token_id = vocab[processor.image_token] + image_end_id = vocab[processor.image_end_token] + + num_image_tokens = self.info.get_num_image_tokens() + image_tokens = [image_token_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=PromptReplacementDetails( + full=([image_start_id] + image_tokens + [image_end_id]), + features=image_tokens, + ), + ) + ] class ChameleonLayerNorm(nn.LayerNorm): @@ -901,7 +933,10 @@ class ChameleonModel(nn.Module): return hidden_states -@MULTIMODAL_REGISTRY.register_processor(ChameleonMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor( + ChameleonMultiModalProcessor, + info=ChameleonProcessingInfo, + dummy_inputs=ChameleonDummyInputsBuilder) class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): @@ -1095,7 +1130,7 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - print_warning_once( + logger.warning_once( "Found kv scale in the 
checkpoint (e.g. " f"{name}), but not found the expected name in " f"the model (e.g. {remapped_kv_scale_name}). " diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index ffd6891b25..d5f9b4d19e 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -1,6 +1,6 @@ # Adapted from -# https://github.com/THUDM/GLM-4 -"""Inference-only ChatGLM model compatible with THUDM weights.""" +# https://github.com/THUDM/CogAgent +"""Inference-only CogAgent model compatible with THUDM weights.""" from argparse import Namespace from array import array from typing import (Dict, Iterable, List, Mapping, Optional, Set, Tuple, @@ -41,7 +41,7 @@ from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, from vllm.transformers_utils.configs import ChatGLMConfig from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP -from .utils import (is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -201,7 +201,6 @@ def input_processor_for_glmv(ctx: InputContext, inputs: DecoderOnlyInputs): new_input_ids = [] final_processed_position = 0 - final_processed_position = 0 for boi_position, eoi_position in zip(boi_positions, eoi_positions): assert boi_position < eoi_position @@ -275,12 +274,15 @@ class GLMAttention(nn.Module): # https://huggingface.co/THUDM/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141 rope_ratio = getattr(config, "rope_ratio", 1.0) max_positions = getattr(config, "seq_length", 8192) + # NOTE: THUDM/cogagent-9b-20241220 uses original_rope=False, + # which is equivalent to is_neox_style=True + is_neox_style = not config.original_rope self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim // 2, max_position=max_positions, base=10000 * rope_ratio, - is_neox_style=False, + is_neox_style=is_neox_style, ) 
self.attn = Attention(self.num_heads, self.head_dim, @@ -603,9 +605,50 @@ class ChatGLMModel(nn.Module): return IntermediateTensors({"hidden_states": hidden_states}) return hidden_states + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("linear_proj.merged_proj", "linear_proj.gate_proj", 0), + ("linear_proj.merged_proj", "linear_proj.dense_h_to_4h", 1), + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if "rotary_pos_emb.inv_freq" in name: + continue + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={".word_embeddings": ""}, ) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -658,52 +701,9 @@ class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP): next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: - # Merge two ColumnParallelLinear into one MergedColumnParallelLinear - 
merged_weights_dict: Dict[str, Dict[str, Optional[torch.Tensor]]] = { - "transformer.vision.linear_proj.merged_proj.weight": { - "transformer.vision.linear_proj.gate_proj.weight": None, - "transformer.vision.linear_proj.dense_h_to_4h.weight": None, - } - } - - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() - for name, loaded_weight in weights: - is_weight_to_be_merge = False - for _, merged_weight_dict in merged_weights_dict.items(): - if name in merged_weight_dict: - assert merged_weight_dict[name] is None - merged_weight_dict[name] = loaded_weight - is_weight_to_be_merge = True - if is_weight_to_be_merge: - continue - if "rotary_pos_emb.inv_freq" in name: - continue - if "word_embeddings" in name: - name = name.replace(".word_embeddings", "") - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - - for combined_name, merged_weight_dict in merged_weights_dict.items(): - if combined_name in params_dict: - param = params_dict[combined_name] - combined_weight = torch.cat(list(merged_weight_dict.values()), - dim=0) - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, combined_weight) - loaded_params.add(combined_name) - return loaded_params + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) class ChatGLM(ChatGLMBaseModel): @@ -724,6 +724,7 @@ class ChatGLM(ChatGLMBaseModel): class ChatGLMV(ChatGLMBaseModel, SupportsMultiModal): + packed_modules_mapping = { "query_key_value": ["query_key_value"], "dense_h_to_4h": ["dense_h_to_4h"], @@ -775,8 +776,8 @@ class 
ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, ) -> None: config = vllm_config.model_config.hf_config # Initialize VL - if hasattr(config, "visual"): + if hasattr(config, "vision_config"): return ChatGLMV(vllm_config=vllm_config, prefix=prefix) # Initialize LLM else: - return ChatGLM(vllm_config=vllm_config, prefix=prefix) + return ChatGLM(vllm_config=vllm_config, prefix=prefix) \ No newline at end of file diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 0188452054..dd69f6c9a5 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -20,11 +20,10 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens, - resolve_visual_encoder_outputs) + repeat_and_pad_placeholder_tokens) from vllm.sequence import SequenceData -from .vision import VisionEncoderInfo +from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -164,15 +163,18 @@ class CLIPEncoderInfo(VisionEncoderInfo[CLIPVisionConfig]): def get_max_image_tokens(self) -> int: return get_max_clip_image_tokens(self.vision_config) - def get_num_patches(self) -> int: + def get_image_size(self) -> int: + return self.vision_config.image_size + + def get_patch_size(self) -> int: + return self.vision_config.patch_size + + def get_patch_grid_length(self) -> int: return get_clip_patch_grid_length( image_size=self.vision_config.image_size, patch_size=self.vision_config.patch_size, ) - def get_image_size(self) -> int: - return self.vision_config.image_size - # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa class CLIPVisionEmbeddings(nn.Module): diff 
--git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index d22d1f3171..989056bf5c 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -45,6 +45,7 @@ from vllm.model_executor.model_loader.weight_utils import ( row_parallel_weight_loader) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -53,7 +54,7 @@ from .utils import (extract_layer_index, is_pp_missing_parameter, maybe_prefix) -@torch.compile +@torch.compile(backend=current_platform.simple_compile_backend) def layer_norm_func(hidden_states, weight, variance_epsilon): input_dtype = hidden_states.dtype hidden_states = hidden_states.to(torch.float32) @@ -436,6 +437,19 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP): params_dict = dict(self.named_parameters()) loaded_params: Set[str] = set() for name, loaded_weight in weights: + + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for param_name, shard_name, shard_id in stacked_params_mapping: if shard_name not in name: continue diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 3932d8b52a..b2aa3c0709 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -83,7 +83,7 @@ class DbrxExperts(FusedMoE): # Define custom weight loader for dbrx model def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, - 
weight_name: str): + weight_name: str, param_name: str): tp_rank = get_tensor_model_parallel_rank() param_data = param.data shard_size = self.intermediate_size @@ -91,25 +91,37 @@ class DbrxExperts(FusedMoE): # DBRX uses GLU for each experts. # GLU has 3 linear layers: w1, v1 and w2. if weight_name.endswith("w1"): - loaded_weight = torch.reshape( - loaded_weight, - [-1, self.intermediate_size * self.tp_size, self.d_model], - ) - param_data[:, 0:shard_size, :] = loaded_weight[:, shard, :] + if param_name.endswith("weight"): + loaded_weight = torch.reshape( + loaded_weight, + [-1, self.intermediate_size * self.tp_size, self.d_model], + ) + param_data[:, 0:shard_size, :] = loaded_weight[:, shard, :] + elif param_name.endswith("weight_scale"): + param_data[:, 0] = loaded_weight + else: + param_data = loaded_weight if weight_name.endswith("v1"): - loaded_weight = torch.reshape( - loaded_weight, - [-1, self.intermediate_size * self.tp_size, self.d_model], - ) - param_data[:, - shard_size:2 * shard_size, :] = loaded_weight[:, - shard, :] + if param_name.endswith("weight"): + loaded_weight = torch.reshape( + loaded_weight, + [-1, self.intermediate_size * self.tp_size, self.d_model], + ) + param_data[:, shard_size:2 * + shard_size, :] = loaded_weight[:, shard, :] + elif param_name.endswith("weight_scale"): + param_data[:, 1] = loaded_weight + else: + param_data[:] = loaded_weight if weight_name.endswith("w2"): - loaded_weight = torch.reshape( - loaded_weight, - [-1, self.intermediate_size * self.tp_size, self.d_model], - ).transpose(1, 2) - param_data[:] = loaded_weight[:, :, shard] + if param_name.endswith("weight"): + loaded_weight = torch.reshape( + loaded_weight, + [-1, self.intermediate_size * self.tp_size, self.d_model], + ).transpose(1, 2) + param_data[:] = loaded_weight[:, :, shard] + else: + param_data[:] = loaded_weight class DbrxMoE(nn.Module): @@ -430,14 +442,28 @@ class DbrxForCausalLM(nn.Module, SupportsPP): def load_weights(self, weights: Iterable[Tuple[str, 
torch.Tensor]]) -> Set[str]: - expert_params_mapping = [( - "w13_weight" if weight_name in ["w1", "v1"] else "w2_weight", + "w13" if weight_name in ["w1", "v1"] else "w2", f"mlp.{weight_name}", ) for weight_name in ["w1", "v1", "w2"]] params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: Set[str] = set() + for name, loaded_weight in weights: + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + + if name.endswith(("w1", "w2", "v1")): + name = name + "_weight" for param_name, weight_name in expert_params_mapping: if weight_name not in name: continue @@ -446,8 +472,9 @@ class DbrxForCausalLM(nn.Module, SupportsPP): continue param = params_dict[name] weight_loader = param.weight_loader - weight_loader(param, loaded_weight, weight_name) + weight_loader(param, loaded_weight, weight_name, name) break + else: # Remapping the name of FP8 kv-scale. 
name = maybe_remap_kv_scale_name(name, params_dict) @@ -456,6 +483,9 @@ class DbrxForCausalLM(nn.Module, SupportsPP): if is_pp_missing_parameter(name, self): continue + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 4cf4e6c358..af6810a140 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -45,7 +45,8 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors @@ -243,7 +244,11 @@ class DeepseekV2Attention(nn.Module): bias=False, quant_config=quant_config, prefix=f"{prefix}.o_proj") - rope_scaling["rope_type"] = 'deepseek_yarn' + if rope_scaling: + rope_scaling["rope_type"] = 'deepseek_yarn' + self.use_normal_rope = False + else: + self.use_normal_rope = True self.rotary_emb = get_rope(qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, @@ -257,14 +262,8 @@ class DeepseekV2Attention(nn.Module): mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale - # self.attn = Attention(self.num_heads, - # self.qk_head_dim, - # self.scaling, - # num_kv_heads=self.num_heads) - - # TODO, support head_size 192 self.attn = Attention(self.num_local_heads, - 256, + self.qk_head_dim, self.scaling, 
num_kv_heads=self.num_local_heads, cache_config=cache_config, @@ -298,23 +297,30 @@ class DeepseekV2Attention(nn.Module): self.qk_nope_head_dim + self.v_head_dim) k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) k_pe = latent_cache[:, :, self.kv_lora_rank:] + + if self.use_normal_rope: + seq_len = positions.size(0) + ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape + q_pe = q_pe.reshape(seq_len, -1) + k_pe = k_pe.reshape(seq_len, -1) + q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe) + + if self.use_normal_rope: + q_pe, k_pe = q_pe.view(ori_q_pe_shape), k_pe.view(ori_k_pe_shape) + q[..., self.qk_nope_head_dim:] = q_pe k = torch.empty_like(q) k[..., :self.qk_nope_head_dim] = k_nope k[..., self.qk_nope_head_dim:] = k_pe - q = torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim], - value=0).view(-1, - self.num_local_heads * 256) - k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim], - value=0).view(-1, - self.num_local_heads * 256) - v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim], - value=0).view(-1, - self.num_local_heads * 256) + # padding value to qk_head_dim for alignment + v = torch.nn.functional.pad( + v, [0, self.qk_head_dim - self.v_head_dim], + value=0).view(-1, self.num_local_heads * self.qk_head_dim) attn_output = self.attn(q, k, v, kv_cache, attn_metadata) attn_output = attn_output.view( - -1, self.num_local_heads, 256)[..., :self.v_head_dim].reshape( + -1, self.num_local_heads, + self.qk_head_dim)[..., :self.v_head_dim].reshape( -1, self.num_local_heads * self.v_head_dim) output, _ = self.o_proj(attn_output) return output @@ -355,6 +361,7 @@ class DeepseekV2DecoderLayer(nn.Module): quant_config=quant_config, prefix=f"{prefix}.self_attn", ) + if (config.n_routed_experts is not None and layer_idx >= config.first_k_dense_replace and layer_idx % config.moe_layer_freq == 0): @@ -619,6 +626,11 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP): if name.endswith(".bias") and name not in params_dict: continue 
+ # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + if is_pp_missing_parameter(name, self): continue diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py index 333dc019b4..0b44f0d062 100644 --- a/vllm/model_executor/models/deepseek_v3.py +++ b/vllm/model_executor/models/deepseek_v3.py @@ -251,7 +251,11 @@ class DeepseekV3Attention(nn.Module): bias=False, quant_config=quant_config, prefix=f"{prefix}.o_proj") - rope_scaling["rope_type"] = 'deepseek_yarn' + if rope_scaling: + rope_scaling["rope_type"] = 'deepseek_yarn' + self.use_normal_rope = False + else: + self.use_normal_rope = True self.rotary_emb = get_rope(qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, @@ -265,14 +269,8 @@ class DeepseekV3Attention(nn.Module): mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale - # self.attn = Attention(self.num_heads, - # self.qk_head_dim, - # self.scaling, - # num_kv_heads=self.num_heads) - - # TODO, support head_size 192 self.attn = Attention(self.num_local_heads, - 256, + self.qk_head_dim, self.scaling, num_kv_heads=self.num_local_heads, cache_config=cache_config, @@ -306,23 +304,30 @@ class DeepseekV3Attention(nn.Module): self.qk_nope_head_dim + self.v_head_dim) k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) k_pe = latent_cache[:, :, self.kv_lora_rank:] + + if self.use_normal_rope: + seq_len = positions.size(0) + ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape + q_pe = q_pe.reshape(seq_len, -1) + k_pe = k_pe.reshape(seq_len, -1) + q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe) + + if self.use_normal_rope: + q_pe, k_pe = q_pe.view(ori_q_pe_shape), k_pe.view(ori_k_pe_shape) + q[..., self.qk_nope_head_dim:] = q_pe k = torch.empty_like(q) k[..., :self.qk_nope_head_dim] = k_nope k[..., self.qk_nope_head_dim:] = k_pe - q = 
torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim], - value=0).view(-1, - self.num_local_heads * 256) - k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim], - value=0).view(-1, - self.num_local_heads * 256) - v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim], - value=0).view(-1, - self.num_local_heads * 256) + # padding value to qk_head_dim for alignment + v = torch.nn.functional.pad( + v, [0, self.qk_head_dim - self.v_head_dim], + value=0).view(-1, self.num_local_heads * self.qk_head_dim) attn_output = self.attn(q, k, v, kv_cache, attn_metadata) attn_output = attn_output.view( - -1, self.num_local_heads, 256)[..., :self.v_head_dim].reshape( + -1, self.num_local_heads, + self.qk_head_dim)[..., :self.v_head_dim].reshape( -1, self.num_local_heads * self.v_head_dim) output, _ = self.o_proj(attn_output) return output @@ -583,7 +588,8 @@ class DeepseekV3ForCausalLM(nn.Module, SupportsPP): continue # TODO(simon): support nextn predict layers - if self.config.num_nextn_predict_layers > 0: + if hasattr(self.config, "num_nextn_predict_layers" + ) and self.config.num_nextn_predict_layers > 0: assert self.config.num_nextn_predict_layers == 1 layer_idx = self.config.num_hidden_layers if name.startswith(f"model.layers.{layer_idx}"): @@ -639,9 +645,6 @@ class DeepseekV3ForCausalLM(nn.Module, SupportsPP): if is_pp_missing_parameter(name, self): continue - if name not in params_dict: - for key in params_dict: - print(key) param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py new file mode 100644 index 0000000000..344832d8b3 --- /dev/null +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -0,0 +1,644 @@ +# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py +"""Inference-only Deepseek-VL2 model compatible with HuggingFace 
weights.""" +import math +from functools import cached_property +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, + TypedDict, Union) + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange, repeat +from transformers import BatchFeature + +from vllm.attention import AttentionMetadata +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler +from vllm.model_executor.model_loader.utils import set_default_torch_dtype +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + ImageSize, MultiModalDataItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.multimodal.utils import cached_get_tokenizer +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config, + MlpProjectorConfig, + VisionEncoderConfig) +from vllm.transformers_utils.processors.deepseek_vl2 import ( + DeepseekVLV2Processor) +from vllm.utils import is_list_of + +from .interfaces import SupportsMultiModal, SupportsPP +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) + +logger = init_logger(__name__) + +# The image token id may be various +_IMAGE_TOKEN = "" + + +class DeepseekVL2ImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: Union[torch.Tensor, List[torch.Tensor]] + """ + Shape: `(batch_size * num_images, num_channels, height, width)` + """ + 
images_spatial_crop: torch.Tensor + """ + Shape: `(batch_size * num_images, 2)` + """ + + +class DeepseekVL2VImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: Union[torch.Tensor, List[torch.Tensor]] + """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. + """ + + +DeepseekVL2ImageInputs = Union[DeepseekVL2ImagePixelInputs, + DeepseekVL2VImageEmbeddingInputs] + + +class MlpProjector(nn.Module): + + def __init__(self, cfg: MlpProjectorConfig): + + super().__init__() + + self.cfg = cfg + assert not cfg.token_pooling, ( + "Token pooling is not supported currently.") + + if cfg.projector_type == "downsample_mlp_gelu": + mlp_depth = cfg.depth + mlp_ratio = cfg.mlp_ratio + modules = [ + nn.Linear( + cfg.input_dim * cfg.downsample_ratio * + cfg.downsample_ratio, cfg.n_embed * mlp_ratio) + ] + for _ in range(1, mlp_depth - 1): + modules.append(nn.GELU()) + modules.append( + nn.Linear(cfg.n_embed * mlp_ratio, + cfg.n_embed * mlp_ratio)) + modules.append(nn.GELU()) + modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed)) + modules = nn.Sequential(*modules) + + else: + raise NotImplementedError( + f"Unsupported projector type: {cfg.projector_type}") + + self.layers = modules + + def forward(self, x): + bs, hw, input_dim = x.shape + h = w = int((hw)**0.5) + """compute padding""" + if h % self.cfg.downsample_ratio: + pad = self.cfg.downsample_ratio - h % self.cfg.downsample_ratio + else: + pad = 0 + x = x.reshape(bs, h, w, input_dim) + if pad > 0: + x = F.pad(x, (0, 0, 0, pad, 0, pad), "constant", 0) + """4 to 1 concat""" + x = x.permute(0, 3, 1, 2) # B, C, H, W + x = F.unfold(x, + kernel_size=self.cfg.downsample_ratio, + stride=self.cfg.downsample_ratio, + padding=0) # B, C*4, HW // 4 + x = x.permute(0, 2, 1) + + return self.layers(x) + + +class DeepseekVL2ProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self): + return 
self.ctx.get_hf_config(DeepseekVLV2Config) + + def get_hf_processor(self) -> DeepseekVLV2Processor: + return self.ctx.get_hf_processor(DeepseekVLV2Processor) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_num_image_tokens(self, *, image_width: int, + image_height: int) -> int: + hf_processor = self.get_hf_processor() + image_size = hf_processor.image_size + patch_size = hf_processor.patch_size + downsample_ratio = hf_processor.downsample_ratio + + best_width, best_height = hf_processor.select_best_resolution( + (image_width, image_height)) + + num_width_tiles, num_height_tiles = (best_width // image_size, + best_height // image_size) + h = w = math.ceil((image_size // patch_size) / downsample_ratio) + + global_views_tokens = h * (w + 1) + local_views_tokens = (num_height_tiles * h) * (num_width_tiles * w + 1) + return global_views_tokens + local_views_tokens + 1 + + def get_image_size_with_most_features(self) -> ImageSize: + hf_config = self.get_hf_config() + candidate_resolutions = hf_config.candidate_resolutions + height, width = max(candidate_resolutions, + key=lambda x: self.get_num_image_tokens( + image_width=x[1], image_height=x[0])) + return ImageSize(width=width, height=height) + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + max_image_size = self.get_image_size_with_most_features() + max_image_tokens = self.get_num_image_tokens( + image_height=max_image_size.height, + image_width=max_image_size.width) + + return {"image": max_image_tokens} + + +class DeepseekVL2DummyInputsBuilder( + BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + hf_processor = self.info.get_hf_processor() + image_token: str = hf_processor.image_token + + max_image_size = self.info.get_image_size_with_most_features() + + mm_data = { + 
"image": + self._get_dummy_images(width=max_image_size.width, + height=max_image_size.height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text=image_token * num_images, + mm_data=mm_data, + ) + + +class DeepseekVL2MultiModalProcessor( + BaseMultiModalProcessor[DeepseekVL2ProcessingInfo]): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + if mm_data: + processed_outputs = self.info.ctx.call_hf_processor( + self.info.get_hf_processor(**mm_kwargs), + dict(prompt=prompt, **mm_data), + mm_kwargs, + ) + target_dtype = self.info.ctx.model_config.dtype + pixel_values = processed_outputs.pop("pixel_values").to( + target_dtype) + # split pixel values into patches corresponding to each image + images_spatial_crop = processed_outputs["images_spatial_crop"] + patches_per_image = [ + x.prod().item() + 1 for x in images_spatial_crop + ] + pixel_values = pixel_values.split(patches_per_image) + processed_outputs["pixel_values"] = pixel_values + else: + tokenizer = self.info.get_tokenizer() + processed_outputs = tokenizer(prompt, + add_special_tokens=True, + return_tensors="pt") + + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + images_spatial_crop=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + image_token_id = hf_processor.image_token_id + assert isinstance(image_token_id, int) + + def get_replacement_deepseek_vl2(item_idx: int): + images = 
mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) + + if isinstance(images, ImageEmbeddingItems): + num_image_tokens = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + + num_image_tokens = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) + return [image_token_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=get_replacement_deepseek_vl2, + ) + ] + + +@MULTIMODAL_REGISTRY.register_processor( + DeepseekVL2MultiModalProcessor, + info=DeepseekVL2ProcessingInfo, + dummy_inputs=DeepseekVL2DummyInputsBuilder) +class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): + + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ + "language.": "language_model.", + }) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config: DeepseekVLV2Config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + + self.vision_config = config.vision_config + self.projector_config = config.projector_config + self.text_config = config.text_config + + model_config = vllm_config.model_config + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + tokenizer_mode=model_config.tokenizer_mode, + tokenizer_revision=model_config.tokenizer_revision, + trust_remote_code=model_config.trust_remote_code, + ) + self.image_token_id = tokenizer.vocab.get(_IMAGE_TOKEN) + + self.vision = self._init_vision_module(self.vision_config, + quant_config, + maybe_prefix(prefix, "vision")) + + self.projector = MlpProjector(self.projector_config) + self.tile_tag = config.tile_tag + self.global_view_pos = config.global_view_pos + + # special token for image token sequence format + embed_std = 1 / torch.sqrt( + 
torch.tensor(self.projector_config.n_embed, dtype=torch.float32)) + if self.tile_tag == "2D": + # <|view_separator|>, <|\n|> + self.image_newline = nn.Parameter( + torch.randn(self.projector_config.n_embed) * embed_std) + # This is a typo in original implementation + self.view_seperator = nn.Parameter( + torch.randn(self.projector_config.n_embed) * embed_std) + else: + raise ValueError( + f"Only 2D tile_tag is supported currently, got: {self.tile_tag}" + ) + + if self.text_config.topk_method == "noaux_tc": + architectures = ["DeepseekV3ForCausalLM"] + elif not self.text_config.use_mla: + architectures = ["DeepseekForCausalLM"] + else: + architectures = ["DeepseekV2ForCausalLM"] + + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=self.text_config, + prefix=maybe_prefix(prefix, "language"), + architectures=architectures, + ) + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) + + def _init_vision_module( + self, + vision_config: VisionEncoderConfig, + quant_config: Optional[QuantizationConfig], + prefix: str = "", + ) -> nn.Module: + # TODO: refactor vision model through timm wrapper from transformers + try: + import timm + except ImportError: + raise ImportError("Please install timm") from ImportError + + with set_default_torch_dtype(torch.float16): + model = timm.create_model( + "vit_so400m_patch14_siglip_384.webli", + pretrained=False, + num_classes=0, + dynamic_img_size=True, + dynamic_img_pad=True, + ) + + model = model.to(dtype=torch.get_default_dtype()) + return model + + @cached_property + def sampler(self): + if hasattr(self.language_model, "sampler"): + return self.language_model.sampler + + return get_sampler() + + def _validate_pixel_values( + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: + + h = w = self.vision_config.image_size + expected_dims = (3, h, w) + + def _validate_shape(d: torch.Tensor): + actual_dims = 
tuple(d.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("num_patches", *map(str, expected_dims)) + raise ValueError( + "The expected shape of pixel values per image per batch " + f"is {expected_expr}. You supplied {tuple(d.shape)}.") + + for d in data: + _validate_shape(d) + + return data + + def _validate_images_spatial_crop( + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: + expected_dims = 2 + + def _validate_shape(d: torch.Tensor): + actual_dims = d.size(-1) + + if actual_dims != expected_dims: + expected_expr = str(expected_dims) + raise ValueError( + f"The expected shape of image sizes per image per batch " + f"is {expected_expr}. You supplied {tuple(d.shape)}.") + + for d in data: + _validate_shape(d) + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[DeepseekVL2ImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + images_spatial_crop = kwargs.pop("images_spatial_crop", None) + image_embeds = kwargs.pop("image_embeds", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None: + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + if not isinstance(images_spatial_crop, (torch.Tensor, list)): + raise ValueError("Incorrect type of image sizes. " + f"Got type: {type(images_spatial_crop)}") + + return DeepseekVL2ImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(flatten_bn(pixel_values)), + images_spatial_crop=self._validate_images_spatial_crop( + flatten_bn(images_spatial_crop, concat=True))) + + if image_embeds is not None: + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. 
" + f"Got type: {type(image_embeds)}") + + return DeepseekVL2VImageEmbeddingInputs( + type="image_embeds", + data=flatten_bn(image_embeds), + ) + + raise AssertionError("This line should be unreachable.") + + def _pixel_values_to_embedding( + self, + pixel_values: NestedTensors, + images_spatial_crop: torch.Tensor, + ) -> NestedTensors: + # Pixel_values: n_image * batch_size * [patch_per_img, 3, height, width] + total_tiles = [x for x in pixel_values] + + # [batch_all_tiles, 3, height, width] + total_tiles = torch.cat(total_tiles, dim=0) + + # [batch_all_tiles, vit_seq_len, c] + images_feature = self.vision.forward_features(total_tiles) + + # [batch_all_tiles, hw, D] + images_embeds = self.projector(images_feature) + + _, hw, n_dim = images_embeds.shape + h = w = int(hw**0.5) + + # 根据self.tile_tag & self.global_view_pos填充image token sequence + tile_index = 0 + vision_embeddings = [] + for jdx in range(images_spatial_crop.size(0)): + # extra global & local features + num_width_tiles, num_height_tiles = images_spatial_crop[jdx] + if num_width_tiles == 0 or num_height_tiles == 0: + break + num_tiles_in_image = num_width_tiles * num_height_tiles + + # [hw, D] + global_features = images_embeds[tile_index] + + # [num_height_tiles * num_width_tiles, hw, D] + local_features = images_embeds[tile_index + 1:tile_index + 1 + + num_tiles_in_image] + tile_index += num_tiles_in_image + 1 + + # format global and local features + # ----------------- global view add newline ----------------- + # [hw, D] -> [h, w, D] + global_features = global_features.view(h, w, n_dim) + + # [D] -> [h, 1, D] + new_lines_in_global = repeat(self.image_newline, "d -> h 1 d", h=h) + + # cat([h, w, D], [h, 1, D], dim=1) -> [h, w + 1, D] + global_features = torch.cat([global_features, new_lines_in_global], + dim=1) + + # [h, w + 1, D] -> [h * (w + 1), D] + global_features = global_features.view(-1, n_dim) + + # ----------------- local view add newline ----------------- + # [num_height_tiles * 
num_width_tiles, h * w, D] -> + # [num_height_tiles * h, num_width_tiles * w, D] + local_features = rearrange(local_features, + "(th tw) (h w) d -> (th h) (tw w) d", + th=num_height_tiles, + tw=num_width_tiles, + h=h, + w=w) + + # [D] -> [num_height_tiles * h, 1, D] + new_lines_in_local = repeat(self.image_newline, + "d -> (th h) 1 d", + th=num_height_tiles, + h=h) + + # [num_height_tiles * h, num_width_tiles * w + 1, D] + local_features = torch.cat([local_features, new_lines_in_local], + dim=1) + + # [num_height_tiles * h, num_width_tiles * w + 1, D] + # --> [(num_height_tiles * h) * (num_width_tiles * w + 1), D] + local_features = local_features.view(-1, n_dim) + + # merge global and local tiles + if self.global_view_pos == "head": + global_local_features = torch.cat([ + global_features, + self.view_seperator[None, :], + local_features, + ]) + else: + global_local_features = torch.cat([ + local_features, + self.view_seperator[None, :], + global_features, + ]) + + vision_embeddings.append(global_local_features) + return vision_embeddings + + def _process_image_input( + self, image_input: DeepseekVL2ImageInputs) -> torch.Tensor: + if image_input["type"] == "image_embeds": + image_data = image_input["data"] + if is_list_of(image_data, torch.Tensor): + # it's already a list of tensors + return image_data + if len(image_data.shape) == 3: + # 3D tensor + return list(torch.unbind(image_data, dim=0)) + raise ValueError( + "We expect batched 2D tensors;" + "this can be either a list of 2D tensors or a single 3D tensor." 
+ ) + + pixel_values = image_input["data"] + images_spatial_crop = image_input["images_spatial_crop"] + + return self._pixel_values_to_embedding( + pixel_values=pixel_values, images_spatial_crop=images_spatial_crop) + + def get_multimodal_embeddings(self, **kwargs: object) -> torch.Tensor: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.image_token_id) + return inputs_embeds + + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object): + + if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.language_model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) + + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) 
-> Optional[SamplerOutput]: + return self.language_model.sample(logits, sampling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + + loader = AutoWeightsLoader(self) + autoloaded_weights = loader.load_weights(weights, + mapper=self.hf_to_vllm_mapper) + return autoloaded_weights diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index f138d13630..948560b490 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -17,14 +17,35 @@ from vllm.sequence import IntermediateTensors from .utils import maybe_prefix +class DummyInputLayerNorm(nn.Module): + + def __init__(self, weight=None, bias=None): + super().__init__() + self.weight = nn.Parameter(weight) if weight is not None else None + self.bias = nn.Parameter(bias) if bias is not None else None + + def forward(self, x): + return x + + +class DummyOutputNorm(nn.Module): + + def forward(self, x, residual): + if residual is None: + return x + else: + return x, residual + + class EAGLE(nn.Module): """This class implements the EAGLE draft model from the paper: https://arxiv.org/pdf/2401.15077 Reference implementation: https://github.com/SafeAILab/EAGLE Differences from reference implementation: 1. In reference, LlamaDecoderLayer implementation doesn't have - input_layernorm for 1st decoder layer (https://github.com/SafeAILab/EAGLE/blob/7d065d084443fbfd386f88839efd7193c12be869/eagle/model/cnets.py#L427) - but we do as HF implementation also does. + input_layernorm for 1st decoder layer (https://github.com/SafeAILab/EAGLE/blob/7d065d084443fbfd386f88839efd7193c12be869/eagle/model/cnets.py#L427). + Following this approach, our implementation also disables + the input_layernorm for the first decoder layer. 2. We allow any decoder layer to be used in EAGLE whereas in reference decoder layer is fixed to be LlamaDecoderLayer. 3. 
We have an optional token_map which reduces draft vocab to most @@ -46,10 +67,20 @@ class EAGLE(nn.Module): self.model = model_cls(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) + self.fc = nn.Linear(config.model.hidden_size * 2, config.model.hidden_size, bias=getattr(self.config, "eagle_fc_bias", False)) + # Modify layer normalization and residual connections as suggested + # in the EAGLE framework: https://github.com/SafeAILab/EAGLE + # While weights and biases are generally not needed, + # they are retained here to support certain unit tests + # (e.g., spec_decode/e2e/test_eagle_correctness.py). + self.model.model.layers[0].input_layernorm = DummyInputLayerNorm( + weight=self.model.model.layers[0].input_layernorm.weight) + self.model.model.norm = DummyOutputNorm() + self.orig_vocab_size = config.vocab_size self.truncated_vocab_size = config.truncated_vocab_size self.unpadded_vocab_size = self.truncated_vocab_size diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 8324a563ed..bc3295da7b 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -30,8 +30,7 @@ from torch import nn from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size) +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -39,16 +38,13 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import 
QuantizationConfig -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.exaone import ExaoneConfig @@ -430,14 +426,6 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "lm_head": "output_embeddings", } embedding_padding_modules = ["lm_head"] - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "c_fc_0": ("gate_up_proj", 0), - "c_fc_1": ("gate_up_proj", 1), - } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -447,6 +435,7 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self.config = config self.lora_config = lora_config + self.quant_config = quant_config self.transformer = ExaoneModel( vllm_config=vllm_config, @@ -540,12 +529,14 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): # processed with quantization, LoRA, fine-tuning, etc. 
if self.config.tie_word_embeddings and "lm_head.weight" in name: continue - if scale_name := get_compressed_tensors_cache_scale(name): - # Loading kv cache scales for compressed-tensors quantization + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue @@ -583,32 +574,3 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params - - # If this function is called, it should always initialize KV cache scale - # factors (or else raise an exception). Thus, handled exceptions should - # make sure to leave KV cache scale factors in a known good (dummy) state - def load_kv_cache_scales(self, quantization_param_path: str) -> None: - tp_size = get_tensor_model_parallel_world_size() - tp_rank = get_tensor_model_parallel_rank() - for layer_idx, scaling_factor in kv_cache_scales_loader( - quantization_param_path, - tp_rank, - tp_size, - self.config.num_hidden_layers, - self.config.__class__.model_type, - ): - if not isinstance(self.transformer.h[layer_idx], nn.Identity): - layer_self_attn = self.transformer.h[layer_idx].attn - - if current_platform.is_rocm(): - # The scaling factor convention we are assuming is - # quantized_value * scaling_factor ~= true_value - # which is consistent with the practice of setting - # scaling_factor = tensor_amax / FPtype_max - scaling_factor *= 2 - if hasattr(layer_self_attn.attn, "_k_scale"): - layer_self_attn.attn._k_scale = scaling_factor - layer_self_attn.attn._v_scale = scaling_factor - else: - raise RuntimeError("Self attention has no KV cache scaling " - "factor attribute!") 
diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py new file mode 100644 index 0000000000..b93a686803 --- /dev/null +++ b/vllm/model_executor/models/fairseq2_llama.py @@ -0,0 +1,151 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Meta Platforms, Inc. and affiliates. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Llama model for fairseq2 weights.""" + +from typing import Iterable, Set, Tuple + +import torch +from torch.nn import Parameter + +from vllm.config import VllmConfig +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.linear import set_weight_attrs +from vllm.model_executor.models.llama import LlamaForCausalLM + +from .utils import AutoWeightsLoader, WeightsMapper + + +class Fairseq2LlamaForCausalLM(LlamaForCausalLM): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + # For the model loader to read only the relevant checkpoint files + self.allow_patterns_overrides = [ + # either the full checkpoint + "model.pt", + # or the tp-sharded checkpoint of the current rank + f"model.{self.tp_rank}.pt", + ] + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + # fairseq2's serialization 
adds a wrapper to usual .pt state_dict's: + # { "model_key": my_model_name, "my_model_name": state_dict } + # which we first need to unpack + weights_wrapped = dict(weights) + weights = weights_wrapped[ + weights_wrapped["model_key"]].items() # type: ignore + + # remap keys + fs2_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "decoder_frontend.embed.": "model.embed_tokens.", + "decoder.": "model.", + "final_proj.": "lm_head.", + }, + orig_to_new_substr={ + ".self_attn_layer_norm.": ".input_layernorm.", + ".ffn_layer_norm.": ".post_attention_layernorm.", + ".self_attn.output_proj.": ".self_attn.o_proj.", + ".ffn.gate_proj.": ".mlp.gate_proj.", + ".ffn.inner_proj.": ".mlp.up_proj.", + ".ffn.output_proj.": ".mlp.down_proj.", + ".layer_norm.": ".norm.", + }, + ) + weights = fs2_to_vllm_mapper.apply(weights) + + params = dict(self.named_parameters()) + + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights( + (self.reshape_fairseq2_weights(name, loaded_weight, params) + for name, loaded_weight in weights)) + + def flag_sharded_weights(self, params: dict[str, Parameter]): + """Sets the `is_sharded_weight` flag to True for all sharded weights""" + for name, param in params.items(): + modules = name.split(".") + if "norm" in name and len(param.size()) < 2: + # layer norms are not sharded + continue + elif any(emb in modules for emb in ["embed_tokens", "lm_head"]): + # for now we repeat embedding layers for compatibility + continue + else: + # all other layers are sharded + set_weight_attrs(param, {"is_sharded_weight": True}) + + def reshape_fairseq2_weights( + self, + name: str, + loaded_weight: torch.Tensor, + params: dict[str, Parameter], + ) -> Tuple[str, torch.Tensor]: + """Reshape fairseq2's weights.""" + + def permute(w: torch.Tensor, n_heads: int) -> torch.Tensor: + attn_in = self.config.head_dim * n_heads + # check for a sharded weight on dim 0 + if attn_in // 
self.tp_size == w.size()[0]: + attn_in //= self.tp_size + n_heads //= self.tp_size + attn_out = self.config.hidden_size + return (w.view(n_heads, attn_in // n_heads // 2, 2, + attn_out).transpose(1, + 2).reshape(attn_in, attn_out)) + + modules = name.split(".") + + # rotary embeds should be sliced + if "k_proj" in modules: + loaded_weight = permute(loaded_weight, + self.config.num_key_value_heads) + + elif "q_proj" in modules: + loaded_weight = permute(loaded_weight, + self.config.num_attention_heads) + + # We make the loaded weights compatible with both + # full checkpoints and tp sharded checkpoints. + # Embeddings are repeated to fit the vocab size. + # Other weights are flagged for the weight_loader calls. + if any(emb in modules for emb in ["embed_tokens", "lm_head"]): + # Embeddings are sharded on dim 0 + dim = 0 + # In fairseq2, vocab size has to be divisible by tp_size + # so we don't worry about padding + if self.tp_size > 1 and loaded_weight.shape[ + dim] < self.config.vocab_size: + assert loaded_weight.shape[ + dim] * self.tp_size == self.config.vocab_size, \ + "vocab_size should be divisible by tp_size." 
+ repeats = [1] * len(loaded_weight.size()) + repeats[dim] = self.tp_size + # repeat to match vocab size and to be easily 'narrow'able + loaded_weight = loaded_weight.repeat(repeats) + set_weight_attrs(params[name], {"is_sharded_weight": False}) + # if embeddings are sharded, the rest is too + if "embed_tokens" in modules: + self.flag_sharded_weights(params) + + return name, loaded_weight diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 8660cf79b9..c503a368e8 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -409,9 +409,9 @@ class FalconModel(nn.Module): class FalconForCausalLM(nn.Module, SupportsPP): - - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = {} + packed_modules_mapping = { + "query_key_value": ["query_key_value"], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 3680d01725..dbf9da50cc 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -30,13 +30,14 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - NestedTensors, PlaceholderRange) -from vllm.multimodal.parse import ImageProcessorItems, ImageSize +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, + MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, - PromptReplacement) + BaseProcessingInfo, PromptReplacement, + 
PromptReplacementDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal, SupportsPP @@ -63,26 +64,40 @@ class FuyuImagePatchInputs(TypedDict): """ -class FuyuMultiModalProcessor(BaseMultiModalProcessor): +class FuyuProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config(FuyuConfig) + + def get_hf_processor(self): + return self.ctx.get_hf_processor(FuyuProcessor) + + def get_image_processor(self) -> FuyuImageProcessor: + return self.get_hf_processor().image_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} - def _get_image_target_size(self) -> ImageSize: - processor = self._get_hf_processor() - image_processor: FuyuImageProcessor = processor.image_processor + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + target_width, target_height = self.get_image_size_with_most_features() - target_size = image_processor.size - return ImageSize(width=target_size["width"], - height=target_size["height"]) + max_ncols, max_nrows = self.get_image_feature_grid_size( + image_width=target_width, + image_height=target_height, + ) + max_image_tokens = (max_ncols + 1) * max_nrows - def _get_image_feature_grid_size( + return {"image": max_image_tokens} + + def get_image_feature_grid_size( self, *, image_width: int, image_height: int, ) -> tuple[int, int]: - target_width, target_height = self._get_image_target_size() + image_processor = self.get_image_processor() + target_width = image_processor.size["width"] + target_height = image_processor.size["height"] if not (image_width <= target_width and image_height <= target_height): height_scale_factor = target_height / image_height @@ -96,19 +111,37 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor): nrows = math.ceil(image_height / 30) return ncols, nrows - def get_mm_max_tokens_per_item(self) -> 
Mapping[str, int]: - target_width, target_height = self._get_image_target_size() + def get_image_size_with_most_features(self) -> ImageSize: + image_processor = self.get_image_processor() + return ImageSize(width=image_processor.size["width"], + height=image_processor.size["height"]) - max_ncols, max_nrows = self._get_image_feature_grid_size( - image_width=target_width, - image_height=target_height, + +class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + target_width, target_height = \ + self.info.get_image_size_with_most_features() + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="", + mm_data=mm_data, ) - max_image_tokens = (max_ncols + 1) * max_nrows - return {"image": max_image_tokens} - def _get_hf_processor(self) -> FuyuProcessor: - return self.ctx.get_hf_processor(FuyuProcessor) +class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]): def _call_hf_processor( self, @@ -116,14 +149,10 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor): mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: - if not mm_data: # Avoid warning from HF logger for text-only input - # Input_ids format: bos_token_id + prompt_token_ids + boa_token_id - # Tokenizer won't add boa_token_id by default, we add it manually. 
- tokenizer = self._get_tokenizer() - boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore - prompt_ids = tokenizer.encode(prompt) + [boa_token_id] + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") processed_outputs = super()._call_hf_processor( @@ -148,6 +177,18 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor): return processed_outputs + def _apply_hf_processor_tokens_only( + self, + prompt_tokens: list[int], + ) -> list[int]: + # HF processor adds boa_token_id + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + + boa_token_id = vocab["<0x04>"] + + return prompt_tokens + [boa_token_id] + def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -161,10 +202,11 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor): hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self.ctx.get_hf_config(FuyuConfig) + hf_config = self.info.get_hf_config() bos_token_id = hf_config.bos_token_id + assert isinstance(bos_token_id, int) - tokenizer = self._get_tokenizer() + tokenizer = self.info.get_tokenizer() eot_token_id = tokenizer.bos_token_id assert isinstance(eot_token_id, int) @@ -172,13 +214,17 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor): images = mm_items.get_items("image", ImageProcessorItems) image_size = images.get_image_size(item_idx) - ncols, nrows = self._get_image_feature_grid_size( + ncols, nrows = self.info.get_image_feature_grid_size( image_width=image_size.width, image_height=image_size.height, ) + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + + [_NEWLINE_TOKEN_ID]) * nrows - return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + - [bos_token_id]) + return PromptReplacementDetails( + full=image_tokens + [bos_token_id], + features=image_tokens, + ) return [ PromptReplacement( @@ -188,47 
+234,10 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor): ) ] - def apply( - self, - prompt_text: str, - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) - # Only |SPEAKER| (image) tokens should be considered as placeholders, - # so we ignore the trailing bos_token_id - result["mm_placeholders"] = { - modality: [ - PlaceholderRange(offset=p["offset"], length=p["length"] - 1) - for p in ps - ] - for modality, ps in result["mm_placeholders"].items() - } - - return result - - def _get_dummy_mm_inputs( - self, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - target_width, target_height = self._get_image_target_size() - num_images = mm_counts.get("image", 0) - - mm_data = { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } - - return ProcessorInputs( - prompt_text="", - mm_data=mm_data, - ) - - -@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor, + info=FuyuProcessingInfo, + dummy_inputs=FuyuDummyInputsBuilder) class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index b28715c48a..6de0c866bc 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -349,15 +349,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "gate_up_proj", "down_proj", ] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } # Gemma does not apply LoRA to the embedding layer. 
embedding_modules = {} diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index f4530e4771..f0dc769397 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -31,8 +31,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -254,6 +252,7 @@ class Gemma2Model(nn.Module): cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config + self.quant_config = quant_config self.embed_tokens = VocabParallelEmbedding( config.vocab_size, @@ -329,7 +328,8 @@ class Gemma2Model(nn.Module): params_dict = dict(self.named_parameters()) loaded_params: Set[str] = set() for name, loaded_weight in weights: - if scale_name := get_compressed_tensors_cache_scale(name): + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): # Loading kv cache scales for compressed-tensors quantization param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", @@ -399,16 +399,6 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_modules = {} embedding_padding_modules = [] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - def __init__(self, *, vllm_config: VllmConfig, prefix: str = 
""): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py index 39a5736eb1..51922e6f2d 100644 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -42,7 +42,8 @@ class PatchEmbedding(nn.Module): torch.Tensor Transformed tensor with shape (B, L, D) """ - images = images.to(self.proj.weight.device) + images = images.to(device=self.proj.weight.device, + dtype=self.proj.weight.dtype) x = self.proj(images) x = x.flatten(2).transpose(1, 2) cls_token = self.cls_embedding.expand(x.shape[0], -1, -1) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index fd926ff025..1656a3cc9e 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -198,7 +198,10 @@ class GPT2Model(nn.Module): assert not config.scale_attn_by_inverse_layer_idx assert not config.reorder_and_upcast_attn self.embed_dim = config.hidden_size - self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim) + self.wte = VocabParallelEmbedding(config.vocab_size, + self.embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.wte") self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, @@ -259,7 +262,9 @@ class GPT2LMHeadModel(nn.Module, SupportsPP): self.lm_head = self.transformer.wte else: self.lm_head = ParallelLMHead(self.config.vocab_size, - self.config.hidden_size) + self.config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.lm_head") self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( @@ -304,7 +309,7 @@ class GPT2LMHeadModel(nn.Module, SupportsPP): params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: Set[str] = 
set() for name, loaded_weight in weights: - if "lm_head.weight" in name: + if name.startswith("lm_head"): # GPT-2 ties the weights of the embedding layer and the final # linear layer. continue diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 4829578a56..08298cc0db 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -313,6 +313,19 @@ class GPTJForCausalLM(nn.Module, SupportsPP): for name, loaded_weight in weights: if "attn.bias" in name or "attn.masked_bias" in name: continue + + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index a91ed4158a..543b4e2f5e 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -29,8 +29,7 @@ from transformers import GraniteConfig from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size) +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -39,16 +38,13 @@ from vllm.model_executor.layers.linear import 
(MergedColumnParallelLinear, from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -362,14 +358,6 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "lm_head": "output_embeddings", } embedding_padding_modules = ["lm_head"] - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -379,6 +367,7 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self.config = config self.lora_config = lora_config + self.quant_config = quant_config self.model = GraniteModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) @@ -482,12 +471,14 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): # processed with quantization, LoRA, fine-tuning, etc. 
if self.config.tie_word_embeddings and "lm_head.weight" in name: continue - if scale_name := get_compressed_tensors_cache_scale(name): - # Loading kv cache scales for compressed-tensors quantization + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue @@ -525,29 +516,3 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params - - # If this function is called, it should always initialize KV cache scale - # factors (or else raise an exception). Thus, handled exceptions should - # make sure to leave KV cache scale factors in a known good (dummy) state - def load_kv_cache_scales(self, quantization_param_path: str) -> None: - tp_size = get_tensor_model_parallel_world_size() - tp_rank = get_tensor_model_parallel_rank() - for layer_idx, scaling_factor in kv_cache_scales_loader( - quantization_param_path, tp_rank, tp_size, - self.config.num_hidden_layers, - self.config.__class__.model_type): - if not isinstance(self.model.layers[layer_idx], nn.Identity): - layer_self_attn = self.model.layers[layer_idx].self_attn - - if current_platform.is_rocm(): - # The scaling factor convention we are assuming is - # quantized_value * scaling_factor ~= true_value - # which is consistent with the practice of setting - # scaling_factor = tensor_amax / FPtype_max - scaling_factor *= 2 - if hasattr(layer_self_attn.attn, "_k_scale"): - layer_self_attn.attn._k_scale = scaling_factor - layer_self_attn.attn._v_scale = scaling_factor - else: - raise RuntimeError("Self attention has no KV cache scaling " - "factor attribute!") 
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 17e772e7fa..d16a77f862 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -662,16 +662,6 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, "down_proj", ] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - embedding_modules = {} embedding_padding_modules = [] diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 70b78fe64f..c5fd0d9332 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -38,9 +38,15 @@ class SupportsMultiModal(Protocol): to be merged with text embeddings. The output embeddings must be one of the following formats: - - A list or tuple of 2D tensors, where each tensor corresponds to - each input image. + + - A list or tuple of 2D tensors, where each tensor corresponds to + each input multimodal data item (e.g, image). - A single 3D tensor, with the batch dimension grouping the 2D tensors. + + Note: + The returned multimodal embeddings must be in the same order as + the appearances of their corresponding multimodal data item in the + input prompt. """ ... @@ -55,6 +61,7 @@ class SupportsMultiModal(Protocol): ) -> torch.Tensor: ... 
+ @overload def get_input_embeddings( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index de733b6d49..37b91a803d 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -3,7 +3,6 @@ from typing import (TYPE_CHECKING, List, Optional, Protocol, Type, Union, import torch import torch.nn as nn -from transformers import PretrainedConfig from typing_extensions import TypeIs, TypeVar from vllm.logger import init_logger @@ -19,9 +18,6 @@ if TYPE_CHECKING: logger = init_logger(__name__) -# The type of HF config -C_co = TypeVar("C_co", bound=PretrainedConfig, covariant=True) - # The type of hidden states # Currently, T = torch.Tensor for all models except for Medusa # which has T = List[torch.Tensor] @@ -34,7 +30,8 @@ T_co = TypeVar("T_co", default=torch.Tensor, covariant=True) @runtime_checkable -class VllmModel(Protocol[C_co, T_co]): +class VllmModel(Protocol[T_co]): + """The interface required for all models in vLLM.""" def __init__( self, @@ -96,7 +93,8 @@ def is_vllm_model( @runtime_checkable -class VllmModelForTextGeneration(VllmModel[C_co, T], Protocol[C_co, T]): +class VllmModelForTextGeneration(VllmModel[T], Protocol[T]): + """The interface required for all generative models in vLLM.""" def compute_logits( self, @@ -141,7 +139,8 @@ def is_text_generation_model( @runtime_checkable -class VllmModelForPooling(VllmModel[C_co, T], Protocol[C_co, T]): +class VllmModelForPooling(VllmModel[T], Protocol[T]): + """The interface required for all pooling models in vLLM.""" def pooler( self, diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 7ff68bd60e..8ad009d510 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -271,7 +271,7 @@ class InternSdpaAttention(nn.Module): v = v.transpose(1, 2) x = F.scaled_dot_product_attention(q, 
k, v, scale=self.scale) - x = x.transpose(1, 2).view(B, N, -1) + x = x.transpose(1, 2).reshape(B, N, -1) x = self.proj(x) return x diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 8623da9957..e214c30f5d 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -29,8 +29,7 @@ from transformers import LlamaConfig from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size) +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -38,16 +37,13 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ 
-97,20 +93,19 @@ class LlamaMLP(nn.Module): class LlamaAttention(nn.Module): - def __init__( - self, - config: LlamaConfig, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - max_position_embeddings: int = 8192, - quant_config: Optional[QuantizationConfig] = None, - bias: bool = False, - cache_config: Optional[CacheConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, + config: LlamaConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + bias_o_proj: bool = False, + cache_config: Optional[CacheConfig] = None, + prefix: str = "") -> None: super().__init__() layer_idx = extract_layer_index(prefix) self.hidden_size = hidden_size @@ -150,13 +145,14 @@ class LlamaAttention(nn.Module): self.o_proj = RowParallelLinear( input_size=self.total_num_heads * self.head_dim, output_size=hidden_size, - bias=bias, + bias=bias_o_proj, quant_config=quant_config, prefix=f"{prefix}.o_proj", ) is_neox_style = True - if quant_config is not None and quant_config.get_name() == "gguf": + is_gguf = quant_config and quant_config.get_name() == "gguf" + if is_gguf and config.model_type == "llama": is_neox_style = False self.rotary_emb = get_rope( @@ -230,6 +226,11 @@ class LlamaDecoderLayer(nn.Module): # Support internlm/internlm-7b with bias attention_bias = getattr(config, "attention_bias", False) or getattr( config, "bias", False) + bias_o_proj = attention_bias + # support internlm/internlm3-8b with qkv_bias + if hasattr(config, 'qkv_bias'): + attention_bias = config.qkv_bias + self.self_attn = LlamaAttention( config=config, hidden_size=self.hidden_size, @@ -241,6 +242,7 @@ class LlamaDecoderLayer(nn.Module): max_position_embeddings=max_position_embeddings, quant_config=quant_config, 
bias=attention_bias, + bias_o_proj=bias_o_proj, cache_config=cache_config, prefix=f"{prefix}.self_attn", ) @@ -300,6 +302,7 @@ class LlamaModel(nn.Module): lora_config = vllm_config.lora_config self.config = config + self.quant_config = quant_config self.padding_idx = config.pad_token_id lora_vocab = (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) if lora_config else 0 @@ -390,12 +393,14 @@ class LlamaModel(nn.Module): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - if scale_name := get_compressed_tensors_cache_scale(name): - # Loading kv cache scales for compressed-tensors quantization + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue @@ -433,32 +438,6 @@ class LlamaModel(nn.Module): loaded_params.add(name) return loaded_params - # If this function is called, it should always initialize KV cache scale - # factors (or else raise an exception). 
Thus, handled exceptions should - # make sure to leave KV cache scale factors in a known good (dummy) state - def load_kv_cache_scales(self, quantization_param_path: str) -> None: - tp_size = get_tensor_model_parallel_world_size() - tp_rank = get_tensor_model_parallel_rank() - for layer_idx, scaling_factor in kv_cache_scales_loader( - quantization_param_path, tp_rank, tp_size, - self.config.num_hidden_layers, - self.config.__class__.model_type): - if not isinstance(self.layers[layer_idx], nn.Identity): - layer_self_attn = self.layers[layer_idx].self_attn - - if current_platform.is_rocm(): - # The scaling factor convention we are assuming is - # quantized_value * scaling_factor ~= true_value - # which is consistent with the practice of setting - # scaling_factor = tensor_amax / FPtype_max - scaling_factor *= 2 - if hasattr(layer_self_attn.attn, "_k_scale"): - layer_self_attn.attn._k_scale = scaling_factor - layer_self_attn.attn._v_scale = scaling_factor - else: - raise RuntimeError("Self attention has no KV cache scaling " - "factor attribute!") - class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = { @@ -477,16 +456,6 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - # Mistral/Llama models can also be loaded with --load-format mistral # from consolidated.safetensors checkpoints mistral_mapping = { @@ -596,9 +565,6 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self.maybe_remap_mistral(name, loaded_weight) for name, loaded_weight in weights) - def load_kv_cache_scales(self, quantization_param_path: str) -> None: - self.model.load_kv_cache_scales(quantization_param_path) - # This function is 
used to remap the mistral format as # used by Mistral and Llama <=2 def maybe_remap_mistral( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 78de27cd82..296af2aac5 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,18 +1,21 @@ from abc import abstractmethod from functools import cached_property from typing import (Final, Iterable, List, Literal, Mapping, Optional, - Protocol, Set, Tuple, TypedDict, Union) + Protocol, Set, Tuple, TypedDict, TypeVar, Union) import torch import torch.nn as nn +from packaging.version import Version from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, PretrainedConfig, SiglipVisionConfig) +from transformers import __version__ as TRANSFORMERS_VERSION from transformers.models.llava import LlavaProcessor from transformers.models.pixtral import PixtralProcessor from vllm.attention import AttentionMetadata from vllm.config import VllmConfig +from vllm.inputs import InputProcessingContext from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -21,15 +24,14 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, + MultiModalInputs, MultiModalKwargs, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, - ImageSize) + ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - InputProcessingContext, - MultiModalDataItems, ProcessingCache, - ProcessorInputs, PromptReplacement, - full_groupby_modality) + BaseProcessingInfo, ProcessingCache, + PromptReplacement) +from vllm.multimodal.profiling 
import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel @@ -39,7 +41,7 @@ from .pixtral import (PixtralHFVisionModel, from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from .vision import vision_encoder_info +from .vision import get_vision_encoder_info class LlavaImagePixelInputs(TypedDict): @@ -96,31 +98,33 @@ class LlavaMultiModalProjector(nn.Module): class LlavaLikeConfig(Protocol): vision_config: Final[PretrainedConfig] + image_token_index: Final[int] vision_feature_select_strategy: Final[str] - vision_feature_layer: Final[Union[int, List[int]]] + vision_feature_layer: Final[Union[int, list[int]]] -class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor): +class LlavaLikeProcessor(Protocol): + image_token: Final[str] - def __init__(self, - ctx: InputProcessingContext, - *, - cache: Optional[ProcessingCache] = None, - enable_sanity_checks: bool = True) -> None: - super().__init__(ctx, - cache=cache, - enable_sanity_checks=enable_sanity_checks) - vision_config = self._get_hf_config().vision_config - self._vision_encoder_info = vision_encoder_info(vision_config) +class BaseLlavaProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self) -> LlavaLikeConfig: + return self.ctx.get_hf_config(LlavaConfig) + + def get_vision_encoder_info(self): + return get_vision_encoder_info(self.get_hf_config()) @abstractmethod - def _get_hf_config(self) -> LlavaLikeConfig: + def get_hf_processor(self) -> LlavaLikeProcessor: raise NotImplementedError def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return {"image": self.get_max_image_tokens()} + def _apply_feature_select_strategy( self, strategy: str, @@ -134,43 +138,53 @@ class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor): 
msg = f"Unexpected feature select strategy: {strategy!r}" raise NotImplementedError(msg) - def _get_max_image_tokens(self) -> int: - hf_config = self._get_hf_config() + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self.get_hf_config() + vision_encoder_info = self.get_vision_encoder_info() return self._apply_feature_select_strategy( hf_config.vision_feature_select_strategy, - self._vision_encoder_info.get_max_image_tokens(), + vision_encoder_info.get_num_image_tokens( + image_width=image_width, + image_height=image_height, + ), ) - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: - return {"image": self._get_max_image_tokens()} + def get_image_size_with_most_features(self) -> ImageSize: + vision_encoder_info = self.get_vision_encoder_info() + width = height = vision_encoder_info.get_image_size() + return ImageSize(width=width, height=height) - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, ) - def _get_dummy_image_size(self) -> ImageSize: - image_size = self._vision_encoder_info.get_image_size() - return ImageSize(image_size, image_size) - @abstractmethod - def _get_image_token(self) -> str: - raise NotImplementedError +_I = TypeVar("_I", bound=BaseLlavaProcessingInfo) - def _get_dummy_mm_inputs( + +class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]): + + def get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: num_images = mm_counts.get("image", 0) - image_token = self._get_image_token() - target_width, 
target_height = self._get_dummy_image_size() + processor = self.info.get_hf_processor() + image_token = processor.image_token + target_width, target_height = \ + self.info.get_image_size_with_most_features() mm_data = { "image": @@ -185,32 +199,22 @@ class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor): ) -class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor): +class LlavaProcessingInfo(BaseLlavaProcessingInfo): - def _get_hf_config(self) -> LlavaConfig: - return self.ctx.get_hf_config(LlavaConfig) - - def _get_hf_processor(self) -> LlavaProcessor: + def get_hf_processor(self): return self.ctx.get_hf_processor(LlavaProcessor) - def _get_image_token(self) -> str: - return self._get_hf_processor().image_token - def _get_num_image_tokens( +class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]): + + # Copied from BaseMultiModalProcessor + @abstractmethod + def _get_mm_fields_config( self, - *, - image_width: int, - image_height: int, - ) -> int: - hf_config = self._get_hf_config() - - return self._apply_feature_select_strategy( - hf_config.vision_feature_select_strategy, - self._vision_encoder_info.get_num_image_tokens( - image_width=image_width, - image_height=image_height, - ), - ) + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + raise NotImplementedError def _get_prompt_replacements( self, @@ -218,7 +222,7 @@ class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor): hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index def get_replacement(item_idx: int): @@ -229,7 +233,7 @@ class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor): num_image_tokens = images.get_feature_size(item_idx) else: image_size = images.get_image_size(item_idx) - num_image_tokens = self._get_num_image_tokens( + 
num_image_tokens = self.info.get_num_image_tokens( image_width=image_size.width, image_height=image_size.height, ) @@ -245,16 +249,28 @@ class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor): ] -class PixtralHFMultiModalProcessor(BaseLlavaMultiModalProcessor): +class LlavaMultiModalProcessor( + BaseLlavaMultiModalProcessor[LlavaProcessingInfo]): - def _get_hf_config(self) -> LlavaConfig: - return self.ctx.get_hf_config(LlavaConfig) + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) - def _get_hf_processor(self) -> PixtralProcessor: + +class PixtralHFProcessingInfo(BaseLlavaProcessingInfo): + + def get_hf_processor(self): return self.ctx.get_hf_processor(PixtralProcessor) - def _get_image_token(self) -> str: - return self._get_hf_processor().image_token + +class PixtralHFMultiModalProcessor( + BaseMultiModalProcessor[PixtralHFProcessingInfo]): def _call_hf_processor( self, @@ -283,19 +299,30 @@ class PixtralHFMultiModalProcessor(BaseLlavaMultiModalProcessor): return processed_outputs + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + def _get_prompt_replacements( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self._get_hf_config() - image_token_id = hf_config.image_token_index + processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + hf_config = self.info.get_hf_config() + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() - processor = 
self._get_hf_processor() - image_token = processor.image_token - image_break_token = processor.image_break_token - image_end_token = processor.image_end_token + image_break_id = vocab[processor.image_break_token] + image_token_id = hf_config.image_token_index + image_end_id = vocab[processor.image_end_token] vision_config = hf_config.vision_config assert isinstance(vision_config, PixtralVisionConfig) @@ -310,10 +337,10 @@ class PixtralHFMultiModalProcessor(BaseLlavaMultiModalProcessor): image_height=image_size.height, ) - tokens = ([image_token] * ncols + [image_break_token]) * nrows - tokens[-1] = image_end_token + tokens = ([image_token_id] * ncols + [image_break_id]) * nrows + tokens[-1] = image_end_id - return "".join(tokens) + return tokens return [ PromptReplacement( @@ -324,26 +351,40 @@ class PixtralHFMultiModalProcessor(BaseLlavaMultiModalProcessor): ] -def _build_llava_or_pixtral_hf_processor( - ctx: InputProcessingContext, - *, - cache: Optional[ProcessingCache] = None, - enable_sanity_checks: bool = True, -) -> BaseLlavaMultiModalProcessor: +def _build_llava_or_pixtral_hf_info( + ctx: InputProcessingContext, ) -> BaseLlavaProcessingInfo: hf_config = ctx.get_hf_config(LlavaConfig) if isinstance(hf_config.vision_config, PixtralVisionConfig): + return PixtralHFProcessingInfo(ctx) + + return LlavaProcessingInfo(ctx) + + +def _build_llava_or_pixtral_hf_processor( + info: _I, + dummy_inputs: BaseDummyInputsBuilder[_I], + *, + cache: Optional[ProcessingCache] = None, + enable_sanity_checks: bool = True, +) -> BaseMultiModalProcessor: + if isinstance(info, PixtralHFProcessingInfo): return PixtralHFMultiModalProcessor( - ctx, + info, + dummy_inputs, # type: ignore cache=cache, enable_sanity_checks=enable_sanity_checks, ) - return LlavaMultiModalProcessor( - ctx, - cache=cache, - enable_sanity_checks=enable_sanity_checks, - ) + if isinstance(info, LlavaProcessingInfo): + return LlavaMultiModalProcessor( + info, + dummy_inputs, # type: ignore + cache=cache, + 
enable_sanity_checks=enable_sanity_checks, + ) + + raise NotImplementedError(type(info)) def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int: @@ -421,16 +462,14 @@ def init_vision_tower_for_llava( raise NotImplementedError(msg) -@MULTIMODAL_REGISTRY.register_processor(_build_llava_or_pixtral_hf_processor) +@MULTIMODAL_REGISTRY.register_processor(_build_llava_or_pixtral_hf_processor, + info=_build_llava_or_pixtral_hf_info, + dummy_inputs=LlavaDummyInputsBuilder) class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), + + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] } def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: @@ -507,6 +546,12 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): raise ValueError("Incorrect type of pixel values. 
" f"Got type: {type(pixel_values)}") + if self.config.vision_config.model_type == "pixtral": + return LlavaImagePixelInputs( + type="pixel_values", + data=flatten_bn(pixel_values), + ) + return LlavaImagePixelInputs( type="pixel_values", data=self._validate_pixel_values( @@ -674,27 +719,45 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): return loader.load_weights(weights) -class MantisMultiModalProcessor(LlavaMultiModalProcessor): +class MantisProcessingInfo(LlavaProcessingInfo): - def _get_hf_processor(self): - return self.ctx.get_hf_processor(LlavaProcessor) + def get_hf_processor(self): + hf_config = self.get_hf_config() + vision_info = self.get_vision_encoder_info() + + if Version(TRANSFORMERS_VERSION) < Version("4.48"): + # BUG: num_additional_image_tokens = 0 but treated as 1, + # so we set vision_feature_select_strategy to None to offset this + vision_feature_select_strategy = None + else: + # FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150 + vision_feature_select_strategy = hf_config.vision_feature_select_strategy # noqa: E501 + + return self.ctx.get_hf_processor( + LlavaProcessor, + patch_size=vision_info.get_patch_size(), + vision_feature_select_strategy=vision_feature_select_strategy, + ) + + +class MantisMultiModalProcessor(LlavaMultiModalProcessor): def apply( self, - prompt_text: str, + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - hf_config = self.ctx.get_hf_config(LlavaConfig) + ) -> MultiModalInputs: + hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index # Assume that it doesn't depend on the image size - num_image_tokens = self._get_num_image_tokens( + num_image_tokens = self.info.get_num_image_tokens( image_width=-1, image_height=-1, ) - result = super().apply(prompt_text, mm_data, 
hf_processor_mm_kwargs) + result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) mm_items = self._to_mm_items(mm_data) mm_item_counts = mm_items.get_all_counts() @@ -709,7 +772,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): ")", # 3 tokens ]) - mantis_repls = self._bind_prompt_replacements([ + mantis_mm_repls = self._bind_and_group_repls([ PromptReplacement( modality="image", target=[image_token_id] * num_image_tokens, @@ -717,9 +780,9 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): ) ]) - prompt_ids, prompt_text, _ = self._apply_prompt_replacements( + prompt_ids, prompt, _ = self._apply_prompt_replacements( result["prompt_token_ids"], - mantis_repls, + mantis_mm_repls, mm_item_counts, ) @@ -728,28 +791,34 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): hf_processor_mm_kwargs, mm_kwargs, ) - orig_repls = self._bind_prompt_replacements(unbound_orig_repls) + orig_repls = self._bind_and_group_repls(unbound_orig_repls) - all_placeholders = self._find_placeholders(orig_repls, prompt_ids, - mm_item_counts) - assert len(all_placeholders) == mm_item_counts.get("image", 0) + mm_placeholders = self._find_mm_placeholders( + orig_repls, + prompt_ids, + mm_item_counts, + ) - mm_placeholders = { - modality: [item.to_range() for item in items] - for modality, items in full_groupby_modality(all_placeholders) + self._validate_mm_placeholders(mm_placeholders, mm_item_counts) + + mm_placeholder_ranges = { + modality: [item.to_range() for item in placeholders] + for modality, placeholders in mm_placeholders.items() } - return MultiModalInputsV2( + return MultiModalInputs( type="multimodal", - prompt=prompt_text, + prompt=prompt, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, - mm_placeholders=mm_placeholders, + mm_placeholders=mm_placeholder_ranges, ) # To use this model, please use # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` -@MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor) 
+@MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor, + info=MantisProcessingInfo, + dummy_inputs=LlavaDummyInputsBuilder) class MantisForConditionalGeneration(LlavaForConditionalGeneration): pass diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 24debd1cbf..fda4f22d36 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,6 +1,7 @@ +from abc import abstractmethod from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import (Final, Iterable, List, Literal, Mapping, Optional, + Protocol, Set, Tuple, TypedDict, TypeVar, Union) import torch import torch.nn as nn @@ -20,8 +21,9 @@ from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP -from .llava import (LlavaMultiModalProcessor, LlavaMultiModalProjector, - init_vision_tower_for_llava) +from .llava import (BaseLlavaMultiModalProcessor, BaseLlavaProcessingInfo, + LlavaDummyInputsBuilder, LlavaLikeConfig, + LlavaMultiModalProjector, init_vision_tower_for_llava) from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn, init_vllm_registered_model, maybe_prefix) @@ -59,58 +61,40 @@ LlavaNextImageInputs = Union[LlavaNextImagePixelInputs, LlavaNextImageEmbeddingInputs] -class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): +class LlavaNextLikeConfig(LlavaLikeConfig, Protocol): + image_grid_pinpoints: Final[list[list[int]]] - def _get_hf_config(self) -> LlavaNextConfig: + +class LlavaNextProcessingInfo(BaseLlavaProcessingInfo): + + def get_hf_config(self) -> LlavaNextLikeConfig: return self.ctx.get_hf_config(LlavaNextConfig) - def _get_hf_processor(self) -> LlavaNextProcessor: + def get_hf_processor(self): return self.ctx.get_hf_processor(LlavaNextProcessor) - def 
_get_image_token(self) -> str: - return self._get_hf_processor().image_token - - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_sizes=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), - ) - - def _get_max_image_tokens(self) -> int: - largest_feature_size, _ = self._get_pinpoint_with_most_features() - return largest_feature_size - - def _get_dummy_image_size(self) -> ImageSize: - _, pinpoint = self._get_pinpoint_with_most_features() - return pinpoint - - # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106 - def _get_num_image_tokens( + # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L113 + def get_num_image_tokens( self, *, image_width: int, image_height: int, ) -> int: - hf_config = self._get_hf_config() + hf_config = self.get_hf_config() + vision_encoder_info = self.get_vision_encoder_info() base_feature_size = self._apply_feature_select_strategy( hf_config.vision_feature_select_strategy, - self._vision_encoder_info.get_num_image_tokens( + vision_encoder_info.get_num_image_tokens( image_width=image_width, image_height=image_height, ), ) - num_patches = self._vision_encoder_info.get_num_patches() num_patch_height, num_patch_width = get_anyres_image_grid_shape( image_size=(image_height, image_width), grid_pinpoints=hf_config.image_grid_pinpoints, - patch_size=self._vision_encoder_info.get_image_size(), + patch_size=vision_encoder_info.get_image_size(), ) ( @@ -119,14 +103,14 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): ) = self._get_num_unpadded_features( original_height=image_height, original_width=image_width, - npatches=num_patches, + 
npatches=vision_encoder_info.get_patch_grid_length(), num_patch_height=num_patch_height, num_patch_width=num_patch_width, ) return unpadded_feature_size + newline_feature_size + base_feature_size - # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L79 + # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86 def _get_num_unpadded_features( self, *, @@ -139,35 +123,30 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): current_height = npatches * num_patch_height current_width = npatches * num_patch_width - original_aspect_ratio = original_width / original_height + aspect_ratio = original_width / original_height current_aspect_ratio = current_width / current_height - if original_aspect_ratio > current_aspect_ratio: - scale_factor = current_width / original_width - new_height = int(original_height * scale_factor) + if aspect_ratio > current_aspect_ratio: + new_height = (original_height * current_width) // original_width padding = (current_height - new_height) // 2 - current_height -= 2 * padding + current_height = current_height - (2 * padding) else: - scale_factor = current_height / original_height - new_width = int(original_width * scale_factor) + new_width = (original_width * current_height) // original_height padding = (current_width - new_width) // 2 - current_width -= 2 * padding + current_width = current_width - (2 * padding) unpadded_features = current_height * current_width newline_features = current_height + return (unpadded_features, newline_features) - def _get_pinpoint_with_most_features(self) -> tuple[int, ImageSize]: - """ - Get the grid pinpoint with the most features and - the corresponding feature size. 
- """ - hf_config = self._get_hf_config() + def get_image_size_with_most_features(self) -> ImageSize: + hf_config = self.get_hf_config() largest_feature_size, largest_feature_pinpoint = 0, None for (height, width) in hf_config.image_grid_pinpoints: - feat_size = self._get_num_image_tokens(image_width=width, - image_height=height) + feat_size = self.get_num_image_tokens(image_width=width, + image_height=height) if feat_size > largest_feature_size: largest_feature_size = feat_size largest_feature_pinpoint = ImageSize(width=width, @@ -176,10 +155,42 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): if largest_feature_size == 0 or largest_feature_pinpoint is None: raise ValueError("Cannot have a largest feature size of 0!") - return largest_feature_size, largest_feature_pinpoint + return largest_feature_pinpoint -@MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor) +_I = TypeVar("_I", bound=LlavaNextProcessingInfo) + + +class BaseLlavaNextMultiModalProcessor(BaseLlavaMultiModalProcessor[_I]): + + # Copied from BaseMultiModalProcessor + @abstractmethod + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + raise NotImplementedError + + +class LlavaNextMultiModalProcessor( + BaseLlavaNextMultiModalProcessor[LlavaNextProcessingInfo]): + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_sizes=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + +@MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor, + info=LlavaNextProcessingInfo, + dummy_inputs=LlavaDummyInputsBuilder) class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git 
a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 0de9d8c5ea..5be85d7c0f 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -3,38 +3,34 @@ from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) -import numpy as np import torch import torch.nn as nn -from transformers import (CLIPVisionConfig, LlavaNextVideoConfig, - SiglipVisionConfig) +from transformers import (BatchFeature, LlavaNextVideoConfig, + LlavaNextVideoProcessor) from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - repeat_and_pad_placeholder_tokens) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, + VideoEmbeddingItems, VideoProcessorItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of -from .clip import dummy_image_for_clip, dummy_seq_data_for_clip from .interfaces import SupportsMultiModal, SupportsPP from .llava import init_vision_tower_for_llava -from .siglip import (SiglipVisionModel, dummy_image_for_siglip, - dummy_seq_data_for_siglip) 
+from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) - -# For profile run -_MAX_FRAMES_PER_VIDEO = 32 -_MAX_NUM_VIDEOS = 1 +from .vision import get_vision_encoder_info class LlavaNextVideoPixelInputs(TypedDict): @@ -50,149 +46,176 @@ class LlavaNextVideoPixelInputs(TypedDict): """ -def get_llava_next_video_frame_feature_size( - hf_config: LlavaNextVideoConfig) -> int: - # Support both CLIPVisionConfig and SiglipVisionConfig - image_size = hf_config.vision_config.image_size - patch_size = hf_config.vision_config.patch_size - spatial_pool_stride = hf_config.spatial_pool_stride +class LlavaNextVideoProcessingInfo(BaseProcessingInfo): - return int((image_size / patch_size / spatial_pool_stride)**2) + def get_hf_config(self): + return self.ctx.get_hf_config(LlavaNextVideoConfig) + def get_vision_encoder_info(self): + return get_vision_encoder_info(self.get_hf_config()) -def _get_max_llm_tokens(ctx: InputContext) -> int: - """ - Calculated from the maximum video frames under the context length - constraints of the language model. 
- """ - hf_text_config = ctx.model_config.hf_text_config - model_config = ctx.model_config - max_tokens = model_config.max_model_len - rope_scaling = model_config.rope_scaling + def get_hf_processor(self): + return self.ctx.get_hf_processor(LlavaNextVideoProcessor) - if rope_scaling: - rope_scaling_factor = hf_text_config.rope_scaling["factor"] - else: - rope_scaling_factor = 1 + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"video": 1} - max_tokens *= rope_scaling_factor + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + target_width, target_height = self.get_image_size_with_most_features() - return max_tokens - - -def get_max_llava_next_video_tokens(ctx: InputContext) -> int: - # Currently set to 32 frames - # TODO: max_tokens = _get_max_llm_tokens(ctx) - hf_config = ctx.get_hf_config(LlavaNextVideoConfig) - tokens_per_frame = get_llava_next_video_frame_feature_size(hf_config) - return _MAX_FRAMES_PER_VIDEO * tokens_per_frame - - -def dummy_data_for_llava_next_video(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - hf_config = ctx.get_hf_config(LlavaNextVideoConfig) - vision_config = hf_config.vision_config - - # TODO: support multiple videos - num_videos = mm_counts["video"] - if num_videos != _MAX_NUM_VIDEOS: - raise NotImplementedError( - f"Only {_MAX_NUM_VIDEOS} videos are supported") - - # TODO: support configuring the number of frames - frames_per_video = _MAX_FRAMES_PER_VIDEO - # num_images = num_videos * frames_per_video - - # fills the sequence with as longer video data as possible - tokens_per_frame = get_llava_next_video_frame_feature_size(hf_config) - video_feature_size = frames_per_video * tokens_per_frame - - if isinstance(vision_config, CLIPVisionConfig): - seq_data, ranges = dummy_seq_data_for_clip( - vision_config, - seq_len, - num_videos, - image_token_id=hf_config.video_token_index, - image_feature_size_override=video_feature_size, - mm_key="video", + max_video_tokens = 
self.get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=self.get_num_frames_with_most_features(seq_len), ) - pil_frame = dummy_image_for_clip(vision_config, num_images=1) - np_frame = np.array(pil_frame["image"]) - mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0) - mm_data = {"video": mm_data_per_video} - return DummyData(seq_data, mm_data, ranges) - elif isinstance(vision_config, SiglipVisionConfig): - seq_data, ranges = dummy_seq_data_for_siglip( - vision_config, - seq_len, - num_videos, - image_token_id=hf_config.video_token_index, - image_feature_size_override=video_feature_size, - mm_key="video", + return {"video": max_video_tokens} + + def get_image_size_with_most_features(self) -> ImageSize: + vision_encoder_info = self.get_vision_encoder_info() + width = height = vision_encoder_info.get_image_size() + return ImageSize(width=width, height=height) + + def _get_num_frame_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self.get_hf_config() + spatial_pool_stride = hf_config.spatial_pool_stride + + vision_encoder_info = self.get_vision_encoder_info() + patch_grid_length = vision_encoder_info.get_patch_grid_length() + pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride) + + return pooled_grid_length * pooled_grid_length + + def get_num_video_tokens( + self, + *, + image_width: int, + image_height: int, + num_frames: int, + ) -> int: + num_frame_tokens = self._get_num_frame_tokens( + image_width=image_width, + image_height=image_height, ) - pil_frame = dummy_image_for_siglip(vision_config, num_images=1) - np_frame = np.array(pil_frame["image"]) - mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0) - mm_data = {"video": mm_data_per_video} - return DummyData(seq_data, mm_data, ranges) + return num_frame_tokens * num_frames - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) + def 
_get_max_video_frames(self, max_tokens: int) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + num_frames = 0 + + while True: + next_num_frames = num_frames + 1 + next_max_tokens = self.get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=next_num_frames, + ) + + if next_max_tokens > max_tokens: + break + + num_frames = next_num_frames + + return num_frames + + def get_num_frames_with_most_features(self, seq_len: int) -> int: + mm_config = self.ctx.get_mm_config() + max_videos = mm_config.limit_per_prompt.get("video", 1) + + max_total_frames = self._get_max_video_frames(seq_len) + + return max(max_total_frames // max(max_videos, 1), 1) -def input_processor_for_llava_next_video(ctx: InputContext, - inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "video" not in multi_modal_data: - return inputs +class LlavaNextVideoDummyInputsBuilder( + BaseDummyInputsBuilder[LlavaNextVideoProcessingInfo]): - if "multi_modal_placeholders" in inputs and "video" in inputs[ - "multi_modal_placeholders"]: - # The inputs already have placeholders. 
- return inputs + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_videos = mm_counts.get("video", 0) - video_data = multi_modal_data["video"] + processor = self.info.get_hf_processor() + video_token = processor.video_token - model_config = ctx.model_config - hf_config = ctx.get_hf_config(LlavaNextVideoConfig) - vision_config = hf_config.vision_config + target_width, target_height = \ + self.info.get_image_size_with_most_features() + target_num_frames = \ + self.info.get_num_frames_with_most_features(seq_len) - if isinstance(video_data, np.ndarray): - # Supports both CLIP and Siglip - num_frames = video_data.shape[0] - frame_feature_size = \ - get_llava_next_video_frame_feature_size(hf_config) - video_feature_size = num_frames * frame_feature_size + mm_data = { + "video": + self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=target_num_frames, + num_videos=num_videos, + ) + } - tokenizer = cached_get_tokenizer(model_config.tokenizer) - - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=hf_config.video_token_index, - repeat_count=video_feature_size, + return ProcessorInputs( + prompt_text=video_token * num_videos, + mm_data=mm_data, ) - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"video": ranges}) - elif is_list_of(video_data, np.ndarray): - raise NotImplementedError( - "Processing multiple videos is not supported") +class LlavaNextVideoMultiModalProcessor( + BaseMultiModalProcessor[LlavaNextVideoProcessingInfo]): - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + 
return dict(pixel_values_videos=MultiModalFieldConfig.batched("video")) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_config = self.info.get_hf_config() + video_token_id = hf_config.video_token_index + + def get_replacement(item_idx: int): + videos = mm_items.get_items( + "video", (VideoEmbeddingItems, VideoProcessorItems)) + + if isinstance(videos, VideoEmbeddingItems): + num_video_tokens = videos.get_feature_size(item_idx) + else: + image_size = videos.get_frame_size(item_idx) + num_video_tokens = self.info.get_num_video_tokens( + image_width=image_size.width, + image_height=image_size.height, + num_frames=videos.get_num_frames(item_idx), + ) + + return [video_token_id] * num_video_tokens + + return [ + PromptReplacement( + modality="video", + target=[video_token_id], + replacement=get_replacement, + ), + ] # adopted from transformers modeling_llava_next_video.py class LlavaNextVideoPooler(nn.Module): - def __init__(self, config): + def __init__(self, config: LlavaNextVideoConfig): super().__init__() mode = config.spatial_pool_mode @@ -210,7 +233,7 @@ class LlavaNextVideoPooler(nn.Module): raise ValueError( f"Unknown pooling mode: {mode}. 
Expected [`average`, `max`]") - def forward(self, image_features): + def forward(self, image_features: torch.Tensor): ori_width = int( math.sqrt(image_features.shape[1] * self.image_size // self.image_size)) @@ -246,11 +269,11 @@ class LlavaNextMultiModalProjector(nn.Module): return hidden_states -@MULTIMODAL_REGISTRY.register_input_mapper("video") -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( - "video", get_max_llava_next_video_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next_video) -@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_next_video) +@MULTIMODAL_REGISTRY.register_processor( + LlavaNextVideoMultiModalProcessor, + info=LlavaNextVideoProcessingInfo, + dummy_inputs=LlavaNextVideoDummyInputsBuilder, +) class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 0bebc1c745..5b0f35b086 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -1,46 +1,40 @@ import math from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import (Final, Iterable, List, Literal, Mapping, Optional, + Protocol, Set, Tuple, TypedDict, Union) -import numpy as np import torch import torch.nn as nn -from PIL import Image -from transformers import (CLIPVisionConfig, LlavaOnevisionConfig, - SiglipVisionConfig) +from transformers import (BatchFeature, LlavaOnevisionConfig, + LlavaOnevisionProcessor) from transformers.models.llava_onevision.modeling_llava_onevision import ( get_anyres_image_grid_shape, unpad_image) from typing_extensions import NotRequired from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from 
vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - repeat_and_pad_placeholder_tokens) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, + VideoEmbeddingItems, VideoProcessorItems) +from vllm.multimodal.processing import PromptReplacement +from vllm.multimodal.profiling import ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of -from .clip import (CLIPVisionModel, dummy_seq_data_for_clip, - dummy_video_for_clip, get_clip_image_feature_size, - get_clip_patch_grid_length, input_processor_for_clip) +from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP -from .llava import init_vision_tower_for_llava -from .siglip import (SiglipVisionModel, dummy_seq_data_for_siglip, - dummy_video_for_siglip, get_siglip_image_feature_size, - get_siglip_patch_grid_length, input_processor_for_siglip) +from .llava import LlavaDummyInputsBuilder, init_vision_tower_for_llava +from .llava_next import (BaseLlavaNextMultiModalProcessor, LlavaNextLikeConfig, + LlavaNextProcessingInfo) +from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -# Result in the max possible feature size (2x2 grid of 336x336px tiles) -MAX_IMAGE_FEATURE_SIZE_HEIGHT = MAX_IMAGE_FEATURE_SIZE_WIDTH = 448 - # For profile run _MAX_FRAMES_PER_VIDEO = 16 @@ -92,286 +86,281 @@ LlavaOnevisionMultiInputs = Union[LlavaOnevisionImageInputs, LlavaOnevisionVideoPixelInputs] -def _get_llava_onevision_image_unppaded_feature_size(height, width, 
patches, - scale_height, - scale_width): - current_height = patches * scale_height - current_width = patches * scale_width - - original_aspect_ratio = width / height - current_aspect_ratio = current_width / current_height - if original_aspect_ratio > current_aspect_ratio: - new_height = int(height * (current_width / width)) - padding = (current_height - new_height) // 2 - current_height -= padding * 2 - else: - new_width = int(width * (current_height / height)) - padding = (current_width - new_width) // 2 - current_width -= padding * 2 - - unpadded_features = current_height * current_width - newline_features = current_height - - ratio = math.sqrt(current_height * current_width / (9 * patches**2)) - if ratio > 1.1: - unpadded_features = int(current_height // ratio) * int( - current_width // ratio) - newline_features = int(current_height // ratio) - - return (unpadded_features, newline_features) +class LlavaOnevisionLikeConfig(LlavaNextLikeConfig, Protocol): + video_token_index: Final[int] -def get_llava_onevision_image_feature_size( - hf_config: LlavaOnevisionConfig, - *, - input_height: int, - input_width: int, -) -> int: - vision_config = hf_config.vision_config +class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo): - if isinstance(vision_config, CLIPVisionConfig): - num_patches = get_clip_patch_grid_length( - image_size=vision_config.image_size, - patch_size=vision_config.patch_size, + def get_hf_config(self) -> LlavaOnevisionLikeConfig: + return self.ctx.get_hf_config(LlavaOnevisionConfig) + + def get_hf_processor(self): + return self.ctx.get_hf_processor(LlavaOnevisionProcessor) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return { + "image": self.get_max_image_tokens(), + "video": self.get_max_video_tokens(seq_len), + } + + # Based on: 
https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86 + # with additional logic afterwards taken from LlavaOnevisionProcessor + def _get_num_unpadded_features( + self, + *, + original_height: int, + original_width: int, + npatches: int, + num_patch_height: int, + num_patch_width: int, + ) -> tuple[int, int]: + current_height = npatches * num_patch_height + current_width = npatches * num_patch_width + + aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height + + if aspect_ratio > current_aspect_ratio: + new_height = (original_height * current_width) // original_width + padding = (current_height - new_height) // 2 + current_height = current_height - (2 * padding) + else: + new_width = (original_width * current_height) // original_height + padding = (current_width - new_width) // 2 + current_width = current_width - (2 * padding) + + unpadded_features = current_height * current_width + newline_features = current_height + + ratio = math.sqrt(current_height * current_width / (9 * npatches**2)) + if ratio > 1.1: + height_factor = int(current_height // ratio) + width_factor = int(current_width // ratio) + unpadded_features = height_factor * width_factor + newline_features = height_factor + + return (unpadded_features, newline_features) + + def get_image_size_with_most_features(self) -> ImageSize: + # NOTE: This hardcoded value is found via processor tests + return ImageSize(width=1153, height=944) + + def _get_num_frame_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self.get_hf_config() + spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2) + + vision_encoder_info = self.get_vision_encoder_info() + patch_grid_length = vision_encoder_info.get_patch_grid_length() + pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride) + + return pooled_grid_length * pooled_grid_length + + def 
get_num_video_tokens( + self, + *, + image_width: int, + image_height: int, + num_frames: int, + ) -> int: + num_frame_tokens = self._get_num_frame_tokens( + image_width=image_width, + image_height=image_height, ) - base_feature_size = get_clip_image_feature_size(vision_config) - elif isinstance(vision_config, SiglipVisionConfig): - num_patches = get_siglip_patch_grid_length( - image_size=vision_config.image_size, - patch_size=vision_config.patch_size, + + return num_frame_tokens * num_frames + 1 # Newline token + + def _get_max_video_frames(self, max_tokens: int) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + num_frames = 0 + + while True: + next_num_frames = num_frames + 1 + next_max_tokens = self.get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=next_num_frames, + ) + + if next_max_tokens > max_tokens: + break + + num_frames = next_num_frames + + return num_frames + + def get_num_frames_with_most_features(self, seq_len: int) -> int: + mm_config = self.ctx.get_mm_config() + max_images = mm_config.limit_per_prompt.get("image", 1) + max_videos = mm_config.limit_per_prompt.get("video", 1) + + max_image_tokens = self.get_max_image_tokens() * max_images + max_total_frames = self._get_max_video_frames(seq_len - + max_image_tokens) + max_frames_per_video = min(max_total_frames // max(max_videos, 1), + _MAX_FRAMES_PER_VIDEO) + + return max(max_frames_per_video, 1) + + def get_max_video_tokens(self, seq_len: int) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=self.get_num_frames_with_most_features(seq_len), ) - base_feature_size = get_siglip_image_feature_size(vision_config) - else: - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - strategy = hf_config.vision_feature_select_strategy - if strategy == 
"default": - base_feature_size -= 1 - elif strategy == "full": - pass - else: - raise ValueError(f"Unexpected select feature strategy: {strategy}") - - num_patch_height, num_patch_width = get_anyres_image_grid_shape( - image_size=(input_height, input_width), - grid_pinpoints=hf_config.image_grid_pinpoints, - patch_size=vision_config.image_size, - ) - - ( - unpadded_feature_size, - newline_feature_size, - ) = _get_llava_onevision_image_unppaded_feature_size( - input_height, input_width, num_patches, num_patch_height, - num_patch_width) - - return unpadded_feature_size + newline_feature_size + base_feature_size -def get_max_llava_onevision_image_tokens(ctx: InputContext): - return get_llava_onevision_image_feature_size( - ctx.get_hf_config(LlavaOnevisionConfig), - input_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, - input_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - ) +class LlavaOnevisionDummyInputsBuilder( + LlavaDummyInputsBuilder[LlavaOnevisionProcessingInfo]): + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) -def get_llava_onevision_video_frame_feature_size( - hf_config: LlavaOnevisionConfig) -> int: - # Support both CLIPVisionConfig and SiglipVisionConfig - image_size = hf_config.vision_config.image_size - patch_size = hf_config.vision_config.patch_size - spatial_pool_stride = hf_config.spatial_pool_stride if hasattr( - hf_config, "spatial_pool_stride") else 2 + processor = self.info.get_hf_processor() + image_token = processor.image_token + video_token = processor.video_token - height = width = image_size // patch_size - return math.ceil(height / spatial_pool_stride) * math.ceil( - width / spatial_pool_stride) + target_width, target_height = \ + self.info.get_image_size_with_most_features() + target_num_frames = \ + self.info.get_num_frames_with_most_features(seq_len) + mm_data = { + "image": + 
self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + "video": + self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=target_num_frames, + num_videos=num_videos, + ) + } -def get_llava_onevision_video_tokens(ctx: InputContext, - num_frames: int) -> int: - hf_config = ctx.get_hf_config(LlavaOnevisionConfig) - - # TODO: support configuring (not supported by HF right now) - num_token_image_newline = 1 - tokens_per_frame = get_llava_onevision_video_frame_feature_size(hf_config) - video_feature_size = num_frames * tokens_per_frame + num_token_image_newline - - return video_feature_size - - -def get_max_llava_onevision_video_tokens(ctx: InputContext) -> int: - return get_llava_onevision_video_tokens(ctx, _MAX_FRAMES_PER_VIDEO) - - -def dummy_data_for_llava_onevision(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - hf_config = ctx.get_hf_config(LlavaOnevisionConfig) - vision_config = hf_config.vision_config - - num_videos = mm_counts["video"] - - # TODO: support configuring the number of frames - num_frames = _MAX_FRAMES_PER_VIDEO - video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames) - - if isinstance(vision_config, CLIPVisionConfig): - seq_data, ranges = dummy_seq_data_for_clip( - vision_config, - seq_len, - num_videos, - image_token_id=hf_config.video_token_index, - image_feature_size_override=video_feature_size, - mm_key="video") - - mm_data = dummy_video_for_clip(vision_config, - num_frames=num_frames, - num_videos=num_videos) - return DummyData(seq_data, mm_data, ranges) - elif isinstance(vision_config, SiglipVisionConfig): - seq_data, ranges = dummy_seq_data_for_siglip( - vision_config, - seq_len, - num_videos, - image_token_id=hf_config.video_token_index, - image_feature_size_override=video_feature_size, - mm_key="video") - - mm_data = dummy_video_for_siglip(vision_config, - num_frames=num_frames, - num_videos=num_videos) - return DummyData(seq_data, 
mm_data, ranges) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - -def input_processor_when_multimodal_input_image(ctx: InputContext, - inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - model_config = ctx.model_config - hf_config = ctx.get_hf_config(LlavaOnevisionConfig) - vision_config = hf_config.vision_config - - image_data = multi_modal_data["image"] - if isinstance(image_data, Image.Image): - width, height = image_data.size - - image_feature_size = get_llava_onevision_image_feature_size( - hf_config, - input_height=height, - input_width=width, + return ProcessorInputs( + prompt_text=image_token * num_images + video_token * num_videos, + mm_data=mm_data, ) - elif is_list_of(image_data, Image.Image): - image_feature_size = [ - get_llava_onevision_image_feature_size(hf_config, - input_height=img.height, - input_width=img.width) - for img in image_data + + +class LlavaOnevisionMultiModalProcessor( + BaseLlavaNextMultiModalProcessor[LlavaOnevisionProcessingInfo]): + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_sizes=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + pixel_values_videos=MultiModalFieldConfig.batched("video"), + ) + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + mm_data = dict(mm_data) + videos = mm_data.pop("videos", []) + assert isinstance(videos, list) + + if not videos: + return super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + processor = self.info.get_hf_processor() + video_token = processor.video_token + + # 
LLaVA-OneVision processor doesn't support multiple videos + # with different sizes when converting back to tensors + text_image_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + pixel_values_videos = [] + for video in videos: + item_processor_data = dict(prompt=video_token, videos=video) + + item_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=item_processor_data, + mm_kwargs=mm_kwargs, + ) + + pixel_values_videos.append( + item_outputs.pop("pixel_values_videos")[0]) + + combined_outputs = dict( + **text_image_outputs, + pixel_values_videos=pixel_values_videos, + ) + return BatchFeature(combined_outputs) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + image_repls = super()._get_prompt_replacements( + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + out_mm_kwargs=out_mm_kwargs, + ) + + hf_config = self.info.get_hf_config() + video_token_id = hf_config.video_token_index + + def get_video_replacement(item_idx: int): + videos = mm_items.get_items( + "video", (VideoEmbeddingItems, VideoProcessorItems)) + + if isinstance(videos, VideoEmbeddingItems): + num_video_tokens = videos.get_feature_size(item_idx) + else: + image_size = videos.get_frame_size(item_idx) + num_video_tokens = self.info.get_num_video_tokens( + image_width=image_size.width, + image_height=image_size.height, + num_frames=videos.get_num_frames(item_idx), + ) + + return [video_token_id] * num_video_tokens + + return image_repls + [ + PromptReplacement( + modality="video", + target=[video_token_id], + replacement=get_video_replacement, + ), ] - elif isinstance(image_data, torch.Tensor): - num_images, image_feature_size, hidden_size = image_data.shape - elif is_list_of(image_data, torch.Tensor): - image_feature_size = [item.shape[1] for item in image_data] - else: - raise 
TypeError(f"Invalid image type: {type(image_data)}") - - vision_config = hf_config.vision_config - - if isinstance(vision_config, CLIPVisionConfig): - return input_processor_for_clip( - model_config, - vision_config, - inputs, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, - ) - elif isinstance(vision_config, SiglipVisionConfig): - return input_processor_for_siglip( - model_config, - vision_config, - inputs, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, - ) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - -def input_processor_when_multimodal_input_video(ctx: InputContext, - inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "video" not in multi_modal_data: - return inputs - video_data = multi_modal_data["video"] - - model_config = ctx.model_config - hf_config = ctx.get_hf_config(LlavaOnevisionConfig) - - if isinstance(video_data, np.ndarray): - # Supports both CLIP and Siglip - num_frames = video_data.shape[0] - video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames) - tokenizer = cached_get_tokenizer(model_config.tokenizer) - - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=hf_config.video_token_index, - repeat_count=video_feature_size, - ) - - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"video": ranges}) - - elif is_list_of(video_data, np.ndarray): - video_feature_size = [] - for video in video_data: - num_frames = video.shape[0] - video_feature_size.append( - get_llava_onevision_video_tokens(ctx, num_frames)) - - tokenizer = cached_get_tokenizer(model_config.tokenizer) - new_prompt, new_token_ids, ranges = 
repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=hf_config.video_token_index, - repeat_count=video_feature_size, - ) - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"video": ranges}) - else: - raise TypeError(f"Invalid video type: {type(video_data)}") - - msg = f"Unsupported video type: {type(video_data)}" - raise NotImplementedError(msg) - - -def input_processor_for_llava_onevision(ctx: InputContext, - inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or ("video" not in multi_modal_data - and "image" not in multi_modal_data): - return inputs - if "image" in multi_modal_data: - return input_processor_when_multimodal_input_image(ctx, inputs) - if "video" in multi_modal_data: - return input_processor_when_multimodal_input_video(ctx, inputs) - - msg = "Unsupported multi data type" - raise NotImplementedError(msg) class LlavaOnevisionMultiModalProjector(nn.Module): @@ -394,14 +383,10 @@ class LlavaOnevisionMultiModalProjector(nn.Module): return hidden_states -@MULTIMODAL_REGISTRY.register_image_input_mapper() -@MULTIMODAL_REGISTRY.register_input_mapper("video") -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( - "image", get_max_llava_onevision_image_tokens) -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( - "video", get_max_llava_onevision_video_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_onevision) -@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_onevision) +@MULTIMODAL_REGISTRY.register_processor( + LlavaOnevisionMultiModalProcessor, + info=LlavaOnevisionProcessingInfo, + dummy_inputs=LlavaOnevisionDummyInputsBuilder) class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): @@ -566,13 +551,17 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, 
SupportsMultiModal, def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: modalities = {} - if "pixel_values" in kwargs: - modalities["images"] = self._parse_and_validate_image_input( - **kwargs) - - if "pixel_values_videos" in kwargs: - modalities["videos"] = self._parse_and_validate_video_input( - **kwargs) + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. + for input_key in kwargs: + if input_key in ("pixel_values", + "image_embeds") and "images" not in modalities: + modalities["images"] = self._parse_and_validate_image_input( + **kwargs) + if input_key in ("pixel_values_videos", + "video_embeds") and "videos" not in modalities: + modalities["videos"] = self._parse_and_validate_video_input( + **kwargs) return modalities @@ -827,46 +816,67 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, return image_feature def get_multimodal_embeddings( - self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]: + self, **kwargs) -> Optional[tuple[torch.Tensor, ...]]: modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: return None - # We make a tuple of each embedding with its modality string. This is a - # temporary workaround for models to handle mixed modalities when - # get_multimodal_embeddings and get_input_embeddings are called - # separately. - # TODO(ywang96): Add support for mixed-modality inference for v1. - multimodal_embeddings: List[Tuple[NestedTensors, str]] = [] + # The result multimodal_embeddings is a tuple of tensors, with each + # tensor corresponding to a multimodal data item (image or video). + multimodal_embeddings: tuple[torch.Tensor, ...] 
= () - if "images" in modalities: - image_input = modalities["images"] - vision_embeddings = self._process_image_input(image_input) - multimodal_embeddings.append((vision_embeddings, "image")) - if "videos" in modalities: - video_input = modalities["videos"] - video_embeddings = self._process_video_pixels(video_input) - multimodal_embeddings.append((video_embeddings, "video")) + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities. + for modality in modalities: + if modality == "images": + image_input = modalities["images"] + vision_embeddings = self._process_image_input(image_input) + multimodal_embeddings += tuple(vision_embeddings) + if modality == "videos": + video_input = modalities["videos"] + video_embeddings = self._process_video_pixels(video_input) + multimodal_embeddings += tuple(video_embeddings) return multimodal_embeddings def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[List[Tuple[NestedTensors, - str]]] = None, + multimodal_embeddings: Optional[tuple[torch.Tensor, ...]] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: - for embeddings, modality in multimodal_embeddings: - if modality == "image": - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, embeddings, - self.config.image_token_index) - if modality == "video": - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, embeddings, - self.config.video_token_index) + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + [self.config.image_token_index, self.config.video_token_index]) + return inputs_embeds + + def get_input_embeddings_v0( + self, + input_ids: torch.Tensor, + image_input: Optional[NestedTensors] = None, + video_input: Optional[NestedTensors] = None, + ) -> torch.Tensor: + + inputs_embeds = 
self.get_input_embeddings(input_ids) + if image_input is not None: + image_embeds = self._process_image_input(image_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + image_embeds, + placeholder_token_id=self.config.image_token_index, + ) + + if video_input is not None: + video_embeds = self._process_video_pixels(video_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + video_embeds, + placeholder_token_id=self.config.video_token_index, + ) + return inputs_embeds def forward( @@ -888,13 +898,21 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. + # NOTE: In v1, inputs_embeds is always generated at model runner from + # `get_multimodal_embeddings` and `get_input_embeddings`, this + # condition is only for v0 compatibility. elif inputs_embeds is None: - multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - multimodal_embeddings) - input_ids = None + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) + + if image_input is None and video_input is None: + inputs_embeds = None + else: + inputs_embeds = self.get_input_embeddings_v0( + input_ids, + image_input=image_input, + video_input=video_input) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 5a0f202364..6254d26c70 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -534,16 +534,6 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] - # BitandBytes specific attributes - 
bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index e9d7eada1d..5e1e6c6fa6 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -241,11 +241,5 @@ class MiniCPM3ForCausalLM(MiniCPMForCausalLM): # `embedding_modules` and `embedding_padding_modules` # are inherited from MiniCPMForCausalLM - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""): return MiniCPM3Model(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 7120225025..1aa5290568 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -141,8 +141,6 @@ class Resampler2_5(BaseResampler): self.max_size = max_size self._set_2d_pos_cache(self.max_size) - self.apply(self._init_weights) - def _set_2d_pos_cache(self, max_size: Tuple[int, int], device: torch.types.Device = "cpu") -> None: @@ -659,7 +657,7 @@ class MiniCPMV2_0(MiniCPMVBaseModel): quant_config: Optional[QuantizationConfig], prefix: str = "", ) -> nn.Module: - # TODO: refactor this vision model + # TODO: refactor vision model through timm wrapper from transformers try: import timm except ImportError: @@ -763,16 +761,6 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): "kv_proj", ] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": 
("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - embedding_modules = {} embedding_padding_modules = [] @@ -883,16 +871,6 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): "kv_proj", ] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - embedding_modules = {} embedding_padding_modules = [] diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a5b364fe5e..da415cdae9 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -347,6 +347,7 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP): lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config + self.quant_config = quant_config self.model = MixtralModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) @@ -428,6 +429,18 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP): if "rotary_emb.inv_freq" in name: continue + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 6536f98077..61baa8e588 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ 
-123,6 +123,13 @@ def input_processor_for_mllama( assert is_list_of(image_data, Image.Image) + num_image_tokens = dec_inputs['prompt_token_ids'].count( + MLLAMA_IMAGE_TOKEN_ID) + if num_image_tokens != len(image_data): + raise ValueError( + f"The number of image tokens ({num_image_tokens}) must be" + f" the same as the number of images ({len(image_data)})") + # Since only the last group of consecutive images # are attended by the decoded tokens, we only need to # get the number of tiles for those images. @@ -770,6 +777,7 @@ class MllamaTextCrossAttention(nn.Module): self.scaling, self.num_local_key_value_heads, prefix=f"{prefix}.attn", + attn_type=AttentionType.ENCODER_DECODER, ) def forward( @@ -805,13 +813,9 @@ class MllamaTextCrossAttention(nn.Module): kv_range_for_decode, attn_metadata) else: - output = self.attn(q.view(-1, - self.num_local_heads * self.head_dim), - k, - v, - kv_cache, - attn_metadata, - attn_type=AttentionType.ENCODER_DECODER) + output = self.attn( + q.view(-1, self.num_local_heads * self.head_dim), k, v, + kv_cache, attn_metadata) out, _ = self.o_proj(output) return out @@ -827,6 +831,7 @@ class MllamaTextCrossAttention(nn.Module): ) -> torch.Tensor: # Skip writing kv-cache for the initial profiling run. if len(kv_cache.shape) > 1: + i = torch.ones(1, dtype=torch.float32) if self.attn.backend in (_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1): cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode]) @@ -839,8 +844,8 @@ class MllamaTextCrossAttention(nn.Module): attn_metadata. 
cross_slot_mapping, # type: ignore[union-attr] "auto", - 1.0, - 1.0, + i, + i, ) elif self.attn.backend in (_Backend.XFORMERS, _Backend.TORCH_SDPA): key_cache, value_cache = PagedAttention.split_kv_cache( @@ -849,7 +854,7 @@ class MllamaTextCrossAttention(nn.Module): cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode]) PagedAttention.write_to_paged_cache( cached_k, cached_v, key_cache, value_cache, - attn_metadata.cross_slot_mapping, "auto", 1.0, 1.0) + attn_metadata.cross_slot_mapping, "auto", i, i) else: raise ValueError( f"Unsupported Attention backend {self.attn.backend} " @@ -1103,20 +1108,16 @@ class MllamaForCausalLM(nn.Module): @INPUT_REGISTRY.register_dummy_encoder_data(dummy_encoder_data_for_mllama) @INPUT_REGISTRY.register_input_processor(input_processor_for_mllama) class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal): - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config + self.quant_config = quant_config self.vocab_size = config.text_config.vocab_size self.hidden_size = config.text_config.hidden_size self.max_num_tiles = config.vision_config.max_num_tiles @@ -1430,6 +1431,17 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal): name = name.replace('patch_embedding.weight', 'patch_embedding._linear.weight') loaded_weight = loaded_weight.view(loaded_weight.shape[0], -1) + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param 
= params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + updated_params.add(scale_name) + continue for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue @@ -1496,6 +1508,8 @@ def convert_sparse_cross_attention_mask_to_dense( dense_mask[seq_start + start:seq_start + end, tile_start:tile_start + tile] = 1 tile_start += tile + assert ts != -1 + assert td != 0 tile_range_for_decode.append((ts, ts + td)) seq_start += length diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index cc25be9f5b..5c7ae0deef 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -23,7 +23,8 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul +from vllm.model_executor.layers.activation import (MulAndSilu, QuickGELU, + SiluAndMul) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, @@ -462,15 +463,6 @@ class MolmoAttention(nn.Module): return output -class SwiGLU(nn.Module): - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x, gate = x.chunk(2, dim=-1) - # Note that the order is reversed compared to - # SiluAndMul. - return x * F.silu(gate) - - class LanuageModelMLP(nn.Module): """Molmo's LLM mlp.""" @@ -489,7 +481,7 @@ class LanuageModelMLP(nn.Module): quant_config=quant_config, ) # Activation function. - self.act_fn = SwiGLU() + self.act_fn = MulAndSilu() # Feed-forward output projection. 
self.down_proj = RowParallelLinear( self.intermediate_size, @@ -972,8 +964,6 @@ def image_input_mapper_for_molmo( assert len(data) == 1, "Molmo supports only one image per prompt." data = data[0] - # Remove unused dummy PIL image - data.pop('raw_mm_data', None) return MultiModalKwargs(data) @@ -1019,7 +1009,6 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int, dummy_imgdata = { "images": out["images"], "image_input_idx": out["image_input_idx"], - "raw_mm_data": dummy_image, } if "image_masks" in out: dummy_imgdata["image_masks"] = out["image_masks"] @@ -1071,7 +1060,7 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): trust_remote_code=model_config.trust_remote_code) # NOTE: message formatting for raw text prompt is only applied for - # offline inference; for online inference, the prompt is always in + # offline inference; for online serving, the prompt is always in # instruction format and tokenized. if prompt is not None and re.match(r"^User:[\s\S]*?(Assistant:)*$", prompt): @@ -1081,45 +1070,25 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): else: out = processor.process(None, image, tokens=inputs["prompt_token_ids"]) + # If there is no image, return directly. 
+ if image is None: + new_prompt_token_ids = out["input_ids"].tolist() + prompt = inputs.get("prompt") + if prompt is None: + prompt = tokenizer.decode(new_prompt_token_ids) + return token_inputs( + prompt_token_ids=new_prompt_token_ids, + prompt=prompt, + ) + image_processor = processor.image_processor max_total_crops = 1 + image_processor.max_crops - if image is not None: - images, image_input_idx, image_masks = pad_images( - max_total_crops, - out["images"], - out["image_input_idx"], - out.get("image_masks"), - ) - else: - base_image_input_size = image_processor.base_image_input_size - image_patch_size = image_processor.image_patch_size - image_num_patch = ( - base_image_input_size[0] // image_patch_size, - base_image_input_size[1] // image_patch_size, - ) - n_pixels = image_patch_size * image_patch_size * 3 - n_patches = image_num_patch[0] * image_num_patch[1] - - image_length_w = image_processor.image_token_length_w - image_length_h = image_processor.image_token_length_h - tokens_per_image = image_length_w * image_length_h - images = torch.full( - (max_total_crops, n_patches, n_pixels), - -1, - dtype=torch.float32, - ) - image_input_idx = torch.full( - (max_total_crops, tokens_per_image), - -1, - dtype=torch.int32, - ) - if image_processor.image_padding_mask: - image_masks = torch.full( - (max_total_crops, n_patches), - -1, - dtype=torch.float32, - ) - + images, image_input_idx, image_masks = pad_images( + max_total_crops, + out["images"], + out["image_input_idx"], + out.get("image_masks"), + ) image_data = dict( images=images, image_input_idx=image_input_idx, @@ -1143,11 +1112,9 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): offset = i size += 1 image_data["image_start_end"] = (offset, offset + size) - prompt = inputs.get("prompt") if prompt is None: prompt = tokenizer.decode(new_prompt_token_ids) - return token_inputs( prompt_token_ids=new_prompt_token_ids, prompt=prompt, @@ -1218,12 +1185,6 @@ class 
MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, embedding_modules = {} embedding_padding_modules = [] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - "gate_proj": ("merged_linear", 0), - "up_proj": ("merged_linear", 1), - } - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 34cb9981c1..2340283b69 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -395,12 +395,6 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "lm_head": "output_embeddings", } embedding_padding_modules = ["lm_head"] - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -411,6 +405,7 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self.config = config self.lora_config = lora_config + self.quant_config = quant_config self.model = NemotronModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) @@ -495,6 +490,17 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. 
continue + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 5d9091cfb9..fbe5d1aee0 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -20,6 +20,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -34,13 +35,14 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.utils import print_warning_once from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +logger = init_logger(__name__) + class OlmoeMoE(nn.Module): """A tensor-parallel MoE implementation for Olmoe that shards each expert @@ -446,7 +448,7 @@ class OlmoeForCausalLM(nn.Module, SupportsPP): remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in 
params_dict: - print_warning_once( + logger.warning_once( "Found kv scale in the checkpoint " f"(e.g. {name}), but not found the expected " f"name in the model " diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 7edafcd20b..ea1185aa80 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -329,13 +329,9 @@ class OPTModel(nn.Module): class OPTForCausalLM(nn.Module, SupportsPP): - - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index f9ad0c67ad..5a28b1ffbb 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -136,6 +136,17 @@ class PaliGemmaMultiModalProjector(nn.Module): @INPUT_REGISTRY.register_input_processor(input_processor_for_paligemma) class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index f9e972688d..59b7508a37 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -279,14 +279,6 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "fc2", ] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": 
("qkv_proj", 2), - } - embedding_modules = {} embedding_padding_modules = [] diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py index 937858ee3b..34141511ea 100644 --- a/vllm/model_executor/models/phi3.py +++ b/vllm/model_executor/models/phi3.py @@ -14,7 +14,3 @@ class Phi3ForCausalLM(LlamaForCausalLM): "gate_up_proj", ], } - - # BitandBytes specific attributes - # Initialize an empty dict when there is no stacked parameter mapping. - bitsandbytes_stacked_params_mapping = {} diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index da7e4cdbc6..f47676b934 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -20,6 +20,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -54,12 +55,12 @@ class HeadMajorColumnParallelLinear(MergedColumnParallelLinear): return load_column_parallel_weight(param, loaded_weight) -@torch.compile(dynamic=True) +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def quick_gelu(x): return x * torch.sigmoid(1.702 * x) -@torch.compile(dynamic=True) +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def gegelu(input, limit: Optional[float] = None): a_gelu, a_linear = input[..., ::2], input[..., 1::2] if limit is not None: diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index f2e49d8e48..0fcda81da2 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -14,7 +14,7 @@ # limitations under the License. 
from collections.abc import Iterable, Mapping, Sequence from functools import cached_property -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union import torch import torch.nn as nn @@ -28,22 +28,26 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - NestedTensors, PlaceholderRange) -from vllm.multimodal.parse import ImageEmbeddingItems, ImageProcessorItems +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + ImageSize, MultiModalDataItems) +# yapf conflicts with isort for this block +# yapf: disable from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, + BaseProcessingInfo, + BoundPromptReplacement, + PlaceholderFeaturesInfo, PromptReplacement, - _BoundPromptReplacement, - _PlaceholderInfo) + PromptReplacementDetails) +# yapf: enable +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of -from .clip import dummy_image_for_clip +from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, @@ -54,10 +58,6 @@ logger = init_logger(__name__) # Cannot find the following 2 numbers from hf 
config. _IMAGE_TOKEN_ID = 32044 -# Result in the max possible feature size (h:w = 16:1) -MAX_IMAGE_FEATURE_SIZE_HEIGHT = 8000 -MAX_IMAGE_FEATURE_SIZE_WIDTH = 50 - CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(dropout=0.0, hidden_act="quick_gelu", hidden_size=1024, @@ -305,33 +305,9 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase): return image_features_hd_newline -class Phi3VMultiModalProcessor(BaseMultiModalProcessor): +class Phi3VProcessingInfo(BaseProcessingInfo): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None} - - def _get_num_image_tokens( - self, - *, - image_width: int, - image_height: int, - ) -> int: - processor = self._get_hf_processor() - - return processor.calc_num_image_tokens_from_image_size( # type: ignore - width=image_width, - height=image_height, - ) - - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: - max_image_tokens = self._get_num_image_tokens( - image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, - ) - - return {"image": max_image_tokens} - - def _get_hf_processor( + def get_hf_processor( self, *, num_crops: Optional[int] = None, @@ -341,6 +317,70 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor): return self.ctx.get_hf_processor() + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + target_width, target_height = self.get_image_size_with_most_features() + + max_image_tokens = self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + processor=None, + ) + + return {"image": max_image_tokens} + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + processor: Optional[ProcessorMixin], + ) -> int: + if processor is None: + processor = self.get_hf_processor() + + return processor.calc_num_image_tokens_from_image_size( # type: ignore + width=image_width, + 
height=image_height, + ) + + def get_image_size_with_most_features(self) -> ImageSize: + # Result in the max possible feature size (h:w = 16:1) + return ImageSize(height=8000, width=50) + + +class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + hf_processor = self.info.get_hf_processor() + image_tokens: list[str] = hf_processor.img_tokens # type: ignore + + return ProcessorInputs( + prompt_text="".join(image_tokens[:num_images]), + mm_data=mm_data, + ) + + +class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]): + def _call_hf_processor( self, prompt: str, @@ -377,13 +417,13 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor): def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self._get_hf_processor() + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_tokens: list[str] = hf_processor.img_tokens # type: ignore - tokenizer = self._get_tokenizer() + tokenizer = self.info.get_tokenizer() bos_token_id = tokenizer.bos_token_id assert isinstance(bos_token_id, int) @@ -395,12 +435,18 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor): num_image_tokens = images.get_feature_size(item_idx) else: image_size = images.get_image_size(item_idx) - num_image_tokens = self._get_num_image_tokens( + num_image_tokens = self.info.get_num_image_tokens( image_width=image_size.width, image_height=image_size.height, + processor=hf_processor, ) - return 
[_IMAGE_TOKEN_ID] * num_image_tokens + [bos_token_id] + image_tokens = [_IMAGE_TOKEN_ID] * num_image_tokens + + return PromptReplacementDetails( + full=image_tokens + [bos_token_id], + features=image_tokens, + ) num_images = mm_items.get_count("image", strict=False) @@ -415,12 +461,12 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor): def _apply_prompt_replacements( self, token_ids: list[int], - prompt_repls: Sequence[_BoundPromptReplacement], + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, list[_PlaceholderInfo]]: + ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: token_ids, text, placeholders = super()._apply_prompt_replacements( token_ids=token_ids, - prompt_repls=prompt_repls, + mm_prompt_repls=mm_prompt_repls, mm_item_counts=mm_item_counts, ) @@ -428,56 +474,24 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor): if text.startswith(" <|image|>"): text = text.replace(" <|image|>", "<|image|>", 1) token_ids = [token_ids[0], *token_ids[2:]] - placeholders = [ - _PlaceholderInfo(p.modality, p.start_idx - 1, p.replacement) - for p in placeholders - ] + placeholders = { + modality: [ + PlaceholderFeaturesInfo( + modality=p.modality, + item_idx=p.item_idx, + start_idx=p.start_idx - 1, + tokens=p.tokens, + ) for p in ps + ] + for modality, ps in placeholders.items() + } return token_ids, text, placeholders - def _get_dummy_mm_inputs( - self, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - num_images = mm_counts.get("image", 0) - data = dummy_image_for_clip( - CLIP_VIT_LARGE_PATCH14_336_CONFIG, - num_images, - image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH, - image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT, - ) - - hf_processor = self._get_hf_processor() - image_tokens: list[str] = hf_processor.img_tokens # type: ignore - - return ProcessorInputs( - prompt_text="".join(image_tokens[:num_images]), - mm_data=data, - ) - - def apply( 
- self, - prompt_text: str, - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) - - # Only <|image|> tokens should be considered as placeholders, - # so we ignore the trailing bos_token_id - result["mm_placeholders"] = { - modality: [ - PlaceholderRange(offset=p["offset"], length=p["length"] - 1) - for p in ps - ] - for modality, ps in result["mm_placeholders"].items() - } - - return result - - -@MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor, + info=Phi3VProcessingInfo, + dummy_inputs=Phi3VDummyInputsBuilder) class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 1febd62f2f..881c09ea9d 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -546,6 +546,7 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP): lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config + self.quant_config = vllm_config.quant_config self.model = PhiMoEModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) @@ -623,6 +624,18 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP): if "rotary_emb.inv_freq" in name: continue + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in 
name: continue diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index d7233bd602..37b9989e48 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -31,14 +31,13 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges, - resolve_visual_encoder_outputs) + consecutive_placeholder_ranges) from vllm.sequence import IntermediateTensors, SequenceData from .interfaces import SupportsMultiModal, SupportsPP from .utils import (init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from .vision import VisionEncoderInfo +from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs try: from xformers import ops as xops @@ -774,21 +773,24 @@ class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]): ) -> int: return get_pixtral_hf_image_feature_size( image_size=self.vision_config.image_size, - patch_size=self.get_image_size(), + patch_size=self.vision_config.patch_size, ) def get_max_image_tokens(self) -> int: return get_max_pixtral_hf_image_tokens(self.vision_config) - def get_num_patches(self) -> int: + def get_image_size(self) -> int: + return self.vision_config.image_size + + def get_patch_size(self) -> int: + return self.vision_config.patch_size + + def get_patch_grid_length(self) -> int: return get_pixtral_hf_patch_grid_length( image_size=self.vision_config.image_size, patch_size=self.vision_config.patch_size, ) - def get_image_size(self) -> int: - return self.vision_config.image_size - class PixtralHFMLP(nn.Module): diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index baf955f6b5..1345b381f0 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ 
-1028,13 +1028,6 @@ class QWenLLM(QWenBaseModel): embedding_modules = {} embedding_padding_modules = [] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "w2": ("gate_up_proj", 0), - "w1": ("gate_up_proj", 1), - } - class QWenVL(QWenBaseModel, SupportsMultiModal): packed_modules_mapping = { diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 88f4ea4352..82de1c3574 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -107,7 +107,8 @@ class Qwen2Attention(nn.Module): cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, rope_scaling: Optional[Tuple] = None, - prefix: str = "") -> None: + prefix: str = "", + attn_type: str = AttentionType.DECODER) -> None: super().__init__() self.hidden_size = hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -160,7 +161,8 @@ class Qwen2Attention(nn.Module): num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, - prefix=f"{prefix}.attn") + prefix=f"{prefix}.attn", + attn_type=attn_type) def forward( self, @@ -168,17 +170,11 @@ class Qwen2Attention(nn.Module): hidden_states: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, - attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, - k, - v, - kv_cache, - attn_metadata, - attn_type=attn_type) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) output, _ = self.o_proj(attn_output) return output @@ -197,6 +193,16 @@ class Qwen2DecoderLayer(nn.Module): # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 1000000) rope_scaling = getattr(config, "rope_scaling", None) + + # By default, Qwen2 uses causal attention as it is a 
decoder-only model. + # You can override the HF config with `is_causal=False` to enable + # bidirectional attention, which is used in some embedding models + # (e.g. Alibaba-NLP/gte-Qwen2-7B-instruct) + if getattr(config, "is_causal", True): + attn_type = AttentionType.DECODER + else: + attn_type = AttentionType.ENCODER_ONLY + self.self_attn = Qwen2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -207,6 +213,7 @@ class Qwen2DecoderLayer(nn.Module): quant_config=quant_config, rope_scaling=rope_scaling, prefix=f"{prefix}.self_attn", + attn_type=attn_type, ) self.mlp = Qwen2MLP( hidden_size=self.hidden_size, @@ -220,15 +227,6 @@ class Qwen2DecoderLayer(nn.Module): self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - # By default, Qwen2 uses causal attention as it is a decoder-only model. - # You can override the HF config with `is_causal=False` to enable - # bidirectional attention, which is used in some embedding models - # (e.g. Alibaba-NLP/gte-Qwen2-7B-instruct) - if getattr(config, "is_causal", True): - self._attn_type = AttentionType.DECODER - else: - self._attn_type = AttentionType.ENCODER_ONLY - def forward( self, positions: torch.Tensor, @@ -249,7 +247,6 @@ class Qwen2DecoderLayer(nn.Module): hidden_states=hidden_states, kv_cache=kv_cache, attn_metadata=attn_metadata, - attn_type=self._attn_type, ) # Fully Connected @@ -259,7 +256,15 @@ class Qwen2DecoderLayer(nn.Module): return hidden_states, residual -@support_torch_compile +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, + # otherwise (seq_len, ). 
+ "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + }) class Qwen2Model(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -282,6 +287,7 @@ class Qwen2Model(nn.Module): )) self.config = config + self.quant_config = quant_config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -301,7 +307,7 @@ class Qwen2Model(nn.Module): lambda prefix: Qwen2DecoderLayer(config=config, cache_config=cache_config, quant_config=quant_config, - prefix=f"{prefix}.layers"), + prefix=prefix), prefix=f"{prefix}.layers", ) @@ -367,6 +373,17 @@ class Qwen2Model(nn.Module): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue @@ -421,16 +438,6 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_modules = {} embedding_padding_modules = [] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index d050fd0603..fc5aed5c94 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ 
b/vllm/model_executor/models/qwen2_audio.py @@ -20,8 +20,8 @@ # limitations under the License. """Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" from functools import cached_property -from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, - Union) +from typing import (Any, Iterable, List, Mapping, Optional, Set, Tuple, + TypedDict, Union) import torch import torch.nn as nn @@ -38,10 +38,12 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) -from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataParser +from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, + MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, - PromptReplacement) + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal, SupportsPP @@ -79,19 +81,12 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor): return feat_lengths, output_lengths -class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): +class Qwen2AudioProcessingInfo(BaseProcessingInfo): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"audio": None} + def get_hf_config(self): + return self.ctx.get_hf_config(Qwen2AudioConfig) - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: - hf_config = self.ctx.get_hf_config(Qwen2AudioConfig) - max_source_positions = hf_config.audio_config.max_source_positions - max_output_lengths = (max_source_positions - 2) // 2 + 1 - - return {"audio": max_output_lengths} - - def _get_hf_processor( + def get_hf_processor( self, *, # Ignored in initialization @@ -99,42 
+94,84 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): ) -> Qwen2AudioProcessor: return self.ctx.get_hf_processor(Qwen2AudioProcessor) - def _get_feature_extractor(self) -> WhisperFeatureExtractor: - return self._get_hf_processor().feature_extractor # type: ignore + def get_feature_extractor( + self, + *, + # Ignored in initialization + sampling_rate: Optional[int] = None, + ) -> WhisperFeatureExtractor: + hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) + feature_extractor = hf_processor.feature_extractor # type: ignore + assert isinstance(feature_extractor, WhisperFeatureExtractor) + return feature_extractor + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"audio": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + hf_config = self.get_hf_config() + max_source_positions = hf_config.audio_config.max_source_positions + max_output_lengths = (max_source_positions - 2) // 2 + 1 + + return {"audio": max_output_lengths} + + +class Qwen2AudioDummyInputsBuilder( + BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + feature_extractor = self.info.get_feature_extractor() + + sampling_rate = feature_extractor.sampling_rate + audio_len = feature_extractor.chunk_length * sampling_rate + num_audios = mm_counts.get("audio", 0) + + mm_data = { + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } + + return ProcessorInputs( + prompt_text="<|AUDIO|>" * num_audios, + mm_data=mm_data, + ) + + +class Qwen2AudioMultiModalProcessor( + BaseMultiModalProcessor[Qwen2AudioProcessingInfo]): def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self._get_feature_extractor() + feature_extractor = self.info.get_feature_extractor() return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) def _call_hf_processor( self, 
prompt: str, mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], + mm_kwargs: Mapping[str, Any], ) -> BatchFeature: - mm_data = dict(mm_data) - audios = mm_data.pop("audios", []) + # Text-only input not supported in composite processor + if not mm_data or not mm_data.get("audios", []): + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") - if audios: - mm_data["audios"] = audios + feature_extractor = self.info.get_feature_extractor(**mm_kwargs) + mm_kwargs = dict( + **mm_kwargs, + sampling_rate=feature_extractor.sampling_rate, + ) - feature_extractor = self._get_feature_extractor() - mm_kwargs = dict( - **mm_kwargs, - sampling_rate=feature_extractor.sampling_rate, - ) - else: - # NOTE: WhisperFeatureExtractor cannot handle empty list of audios - pass - - processed_outputs = super()._call_hf_processor( + return super()._call_hf_processor( prompt=prompt, mm_data=mm_data, mm_kwargs=mm_kwargs, ) - return processed_outputs - def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -151,8 +188,20 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self.ctx.get_hf_config(Qwen2AudioConfig) - placeholder = hf_config.audio_token_index + processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + + # Use getattr with default to be compatible with transformers<4.48 + audio_token = getattr(processor, "audio_token", "<|AUDIO|>") + audio_bos_token = getattr(processor, "audio_bos_token", + "<|audio_bos|>") + audio_eos_token = getattr(processor, "audio_eos_token", + "<|audio_eos|>") + + audio_token_id = vocab[audio_token] + audio_bos_id = vocab[audio_bos_token] + audio_eos_id = vocab[audio_eos_token] 
feature_attention_mask = out_mm_kwargs.get("feature_attention_mask") if feature_attention_mask is None: @@ -165,54 +214,44 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): audio_output_lengths = audio_output_lens.tolist() def get_replacement_qwen2_audio(item_idx: int): - num_placeholders = audio_output_lengths[item_idx] - if num_placeholders == 0: + num_features = audio_output_lengths[item_idx] + if num_features == 0: audios = mm_items.get_items("audio", AudioProcessorItems) audio = audios.get(item_idx) raise ValueError( f"The audio {audio} (len={len(audio)}) is too short " "to be represented inside the model") - return [placeholder] * num_placeholders + audio_tokens = [audio_token_id] * num_features + + return PromptReplacementDetails( + full=[audio_bos_id] + audio_tokens + [audio_eos_id], + features=audio_tokens, + ) return [ PromptReplacement( modality="audio", - target=[placeholder], + target=audio_token, replacement=get_replacement_qwen2_audio, ) ] def _always_apply_prompt_replacements(self) -> bool: - # HF never applies prompt replacements, so we have to do it ourselves - # _find_placeholders may incorrectly think that HF has already performed - # processing for multi-audio input when the input audios are short - # (the corresponding placeholders may take up fewer tokens than - # the number of audio items) - return True - - def _get_dummy_mm_inputs( - self, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - feature_extractor = self._get_feature_extractor() - - sampling_rate = feature_extractor.sampling_rate - audio_len = feature_extractor.chunk_length * sampling_rate - num_audios = mm_counts.get("audio", 0) - - mm_data = { - "audio": - self._get_dummy_audios(length=audio_len, num_audios=num_audios) - } - - return ProcessorInputs( - prompt_text="<|AUDIO|>" * num_audios, - mm_data=mm_data, - ) + # Qwen2-Audio processor will start inserting placeholder tokens + # in an upcoming release: + # 
https://github.com/huggingface/transformers/pull/35534 + # NOTE: `_find_placeholders_by_modality` may incorrectly think that HF + # has already performed processing for multi-audio input when the input + # audios are short (the corresponding placeholders may take up fewer + # tokens than the number of audio items) + return not hasattr(self.info.get_hf_processor(), "audio_token") -@MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor( + Qwen2AudioMultiModalProcessor, + info=Qwen2AudioProcessingInfo, + dummy_inputs=Qwen2AudioDummyInputsBuilder) class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): @@ -310,13 +349,16 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, selected_audio_feature = audio_outputs.last_hidden_state audio_features = self.multi_modal_projector(selected_audio_feature) num_audios, max_audio_tokens, embed_dim = audio_features.shape + audio_output_lengths = audio_output_lengths.unsqueeze(1) audio_features_mask = torch.arange(max_audio_tokens).expand( - num_audios, max_audio_tokens - ).to(audio_output_lengths.device) < audio_output_lengths.unsqueeze(1) + num_audios, max_audio_tokens).to( + audio_output_lengths.device) < audio_output_lengths masked_audio_features = audio_features[audio_features_mask].view( -1, embed_dim) - return masked_audio_features + # Split to tuple of embeddings for individual audio input. 
+ return torch.split(masked_audio_features, + audio_output_lengths.flatten().tolist()) def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: audio_input = self._parse_and_validate_audio_input(**kwargs) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index ba70243c65..95de6c2187 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -34,6 +34,7 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) +from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -50,13 +51,14 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.utils import print_warning_once from .interfaces import SupportsPP from .utils import (extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +logger = init_logger(__name__) + class Qwen2MoeMLP(nn.Module): @@ -524,7 +526,7 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP): remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - print_warning_once( + logger.warning_once( "Found kv scale in the checkpoint " f"(e.g. 
{name}), but not found the expected " f"name in the model " diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 988d682d36..593ce4857a 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -12,7 +12,7 @@ from vllm.attention import AttentionMetadata from vllm.config import VllmConfig from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.layers.pooler import Pooler, PoolingType, SimplePooler from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput @@ -32,7 +32,7 @@ class ReLU(nn.Module): return self.activation(input) -class Qwen2ForRewardModel(nn.Module, SupportsLoRA, SupportsPP): +class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -60,7 +60,6 @@ class Qwen2ForRewardModel(nn.Module, SupportsLoRA, SupportsPP): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config - pooler_config = vllm_config.model_config.pooler_config self.config = config self.lora_config = lora_config @@ -74,14 +73,11 @@ class Qwen2ForRewardModel(nn.Module, SupportsLoRA, SupportsPP): config.hidden_size, quant_config=quant_config), ReLU(), - RowParallelLinear(config.hidden_size, 1, + RowParallelLinear(config.hidden_size, + config.num_labels, quant_config=quant_config), ) - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.ALL, - normalize=False, - softmax=False) + self._pooler: SimplePooler self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -115,3 +111,31 @@ class Qwen2ForRewardModel(nn.Module, SupportsLoRA, SupportsPP): loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."]) return 
loader.load_weights(weights) + + +class Qwen2ForRewardModel(Qwen2RewardBaseModel): + + def __init__(self, *, vllm_config, prefix=""): + vllm_config.model_config.hf_config.num_labels = 1 + super().__init__(vllm_config=vllm_config, prefix=prefix) + pooler_config = vllm_config.model_config.pooler_config + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.ALL, + normalize=False, + softmax=False) + + +class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel): + + def __init__(self, *, vllm_config, prefix=""): + vllm_config.model_config.hf_config.num_labels = 2 + super().__init__(vllm_config=vllm_config, prefix=prefix) + pooler_config = vllm_config.model_config.pooler_config + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.STEP, + normalize=False, + softmax=True, + step_tag_id=151651, + ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 26b6d768ad..a2778ee738 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -38,7 +38,7 @@ from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.distributed import parallel_state +from vllm.distributed import parallel_state, tensor_model_parallel_all_gather from vllm.distributed import utils as dist_utils from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata @@ -55,21 +55,27 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalFieldConfig, MultiModalKwargs, - NestedTensors, VideoItem) -from vllm.multimodal.parse import ModalityDataItems, MultiModalDataParser + VideoItem) +from vllm.multimodal.parse import (ImageSize, ModalityDataItems, + MultiModalDataItems, 
MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, - PromptReplacement) + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP -from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend, - init_vllm_registered_model, maybe_prefix) +from .utils import (AutoWeightsLoader, WeightsMapper, + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) +from .vision import get_vit_attn_backend logger = init_logger(__name__) +# For profile run +_MAX_FRAMES_PER_VIDEO = 16 + # === Vision Inputs === # @@ -133,7 +139,7 @@ class Qwen2VLVideoEmbeddingInputs(TypedDict): - List[`torch.Tensor`]: A list of tensors holding all videos' features. Each tensor holds an video's features. - `torch.Tensor`: A tensor holding all videos' features - (concatenation of all videos' feature tensors). + (concatenation of all videos' feature tensors). Tensor shape: `(num_image_features, hidden_size)` - `num_image_features` varies based on @@ -239,6 +245,8 @@ class Qwen2VisionAttention(nn.Module): super().__init__() # Per attention head and per partition values. 
world_size = parallel_state.get_tensor_model_parallel_world_size() + self.tp_size = world_size + self.tp_rank = parallel_state.get_tensor_model_parallel_rank() self.hidden_size_per_attention_head = dist_utils.divide( projection_size, num_heads) self.num_attention_heads_per_partition = dist_utils.divide( @@ -261,24 +269,41 @@ class Qwen2VisionAttention(nn.Module): raise RuntimeError( f"Qwen2-VL does not support {self.attn_backend} backend now.") + def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: + # [s, b, 3 * head * head_dim] + seq_len, bs, _ = qkv.shape + if self.tp_size > 1: + qkv = tensor_model_parallel_all_gather(qkv) + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] + q, k, v = qkv.chunk(3, dim=2) + + # 3 * [s, b, head * head_dim] + if self.tp_size > 1: + splitter = partial(dist_utils.split_tensor_along_last_dim, + num_partitions=self.tp_size) + q = splitter(q)[self.tp_rank] + k = splitter(k)[self.tp_rank] + v = splitter(v)[self.tp_rank] + + # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] + new_shape = (seq_len, bs, self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + q, k, v = (x.view(*new_shape) for x in (q, k, v)) + return q, k, v + def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, ) -> torch.Tensor: - # [s, b, c] --> [s, b, head * 3 * head_dim] + + # [s, b, c] --> [s, b, 3 * head * head_dim] x, _ = self.qkv(x) - # [s, b, head * 3 * head_dim] --> [s, b, head, 3 * head_dim] - new_x_shape = x.size()[:-1] + ( - self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head, - ) - x = x.view(*new_x_shape) - - # [s, b, head, 3 * head_dim] --> 3 [s, b, head, head_dim] - q, k, v = dist_utils.split_tensor_along_last_dim(x, 3) + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] + q, k, v = self.split_qkv(x) batch_size = q.shape[1] q, k, v = (rearrange(x, "s b ... 
-> b s ...").contiguous() @@ -590,6 +615,7 @@ class Qwen2VisionTransformer(nn.Module): # adapter x = self.merger(x) + return x def load_weights(self, weights: Iterable[Tuple[str, @@ -614,24 +640,6 @@ class Qwen2VisionTransformer(nn.Module): weight_loader(param, loaded_weight, shard_id) break else: - if name.endswith("qkv.weight"): - visual_num_heads = self.num_heads - visual_embed_dim = self.embed_dim - head_size = visual_embed_dim // visual_num_heads - loaded_weight = loaded_weight.view(3, visual_num_heads, - head_size, - visual_embed_dim) - loaded_weight = loaded_weight.transpose(0, 1) - loaded_weight = loaded_weight.reshape(-1, visual_embed_dim) - elif name.endswith("qkv.bias"): - visual_num_heads = self.num_heads - visual_embed_dim = self.embed_dim - head_size = visual_embed_dim // visual_num_heads - loaded_weight = loaded_weight.view(3, visual_num_heads, - head_size) - loaded_weight = loaded_weight.transpose(0, 1) - loaded_weight = loaded_weight.reshape(-1) - param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) @@ -640,58 +648,6 @@ class Qwen2VisionTransformer(nn.Module): return loaded_params -# === Vision input helpers === # - - -def _get_vision_info( - vision_config: Qwen2VLVisionConfig, - height: int, - width: int, - min_pixels: int, - max_pixels: int, - *, - do_resize: bool = True, - modality: str = "image", - mm_count: int = 1, -): - """Get information (resized height / width and number of vision tokens) - of input image / video frame.""" - patch_size = vision_config.patch_size - merge_size = vision_config.spatial_merge_size - temporal_patch_size = vision_config.temporal_patch_size - - if do_resize: - resized_height, resized_width = smart_resize( - height=height, - width=width, - factor=patch_size * merge_size, - min_pixels=min_pixels, - max_pixels=max_pixels, - ) - else: - resized_height, resized_width = height, width - - if modality == "image": - grid_t = mm_count - elif modality == "video": - grid_t = 
max(mm_count // temporal_patch_size, 1) - else: - raise ValueError(f"Modality {modality} is not supported") - - grid_h = resized_height // patch_size - grid_w = resized_width // patch_size - vision_tokens = grid_t * grid_h * grid_w - llm_num_vision_tokens = vision_tokens // (merge_size**2) - - return resized_height, resized_width, llm_num_vision_tokens - - -def _get_image_processor(hf_processor: Qwen2VLProcessor): - image_processor = hf_processor.image_processor # type: ignore - assert isinstance(image_processor, Qwen2VLImageProcessor) - return image_processor - - class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], dict[str, torch.Tensor]]): @@ -758,45 +714,20 @@ class Qwen2MultiModalDataParser(MultiModalDataParser): return super()._parse_video_data(data) -class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): +class Qwen2VLProcessingInfo(BaseProcessingInfo): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None, "video": None} + def get_hf_config(self): + return self.ctx.get_hf_config(Qwen2VLConfig) - def _get_max_mm_tokens(self, modality: str) -> int: - hf_config = self.ctx.get_hf_config(Qwen2VLConfig) - vision_config = hf_config.vision_config - - hf_processor = self._get_hf_processor() - image_processor = _get_image_processor(hf_processor) - - _, _, max_llm_image_tokens = _get_vision_info( - vision_config, - height=9999999, - width=9999999, - min_pixels=image_processor.min_pixels, - max_pixels=image_processor.max_pixels, - modality=modality, - ) - return max_llm_image_tokens - - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: - return { - "image": self._get_max_mm_tokens("image"), - "video": self._get_max_mm_tokens("video"), - } - - def _get_data_parser(self) -> MultiModalDataParser: - return Qwen2MultiModalDataParser() - - def _get_hf_processor( + def get_hf_processor( self, *, min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, ) -> Qwen2VLProcessor: hf_processor = 
self.ctx.get_hf_processor(Qwen2VLProcessor) - image_processor = _get_image_processor(hf_processor) + image_processor = hf_processor.image_processor # type: ignore + assert isinstance(image_processor, Qwen2VLImageProcessor) if min_pixels: image_processor.min_pixels = min_pixels @@ -810,34 +741,241 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): return hf_processor + def get_image_processor( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + ): + hf_processor = self.get_hf_processor(min_pixels=min_pixels, + max_pixels=max_pixels) + image_processor = hf_processor.image_processor # type: ignore + assert isinstance(image_processor, Qwen2VLImageProcessor) + return image_processor + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return { + "image": self.get_max_image_tokens(), + "video": self.get_max_video_tokens(seq_len), + } + + def _get_vision_info( + self, + *, + image_width: int, + image_height: int, + num_frames: int = 1, + do_resize: bool = True, + image_processor: Optional[Qwen2VLImageProcessor], + ) -> tuple[ImageSize, int]: + if image_processor is None: + image_processor = self.get_image_processor() + + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + patch_size = vision_config.patch_size + merge_size = vision_config.spatial_merge_size + temporal_patch_size = vision_config.temporal_patch_size + + if do_resize: + resized_height, resized_width = smart_resize( + height=image_height, + width=image_width, + factor=patch_size * merge_size, + min_pixels=image_processor.min_pixels, + max_pixels=image_processor.max_pixels, + ) + preprocessed_size = ImageSize(width=resized_width, + height=resized_height) + else: + preprocessed_size = ImageSize(width=image_width, + height=image_height) + + grid_t = max(num_frames // temporal_patch_size, 1) + grid_h = 
preprocessed_size.height // patch_size + grid_w = preprocessed_size.width // patch_size + + num_patches = grid_t * grid_h * grid_w + num_vision_tokens = num_patches // (merge_size**2) + + return preprocessed_size, num_vision_tokens + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + image_processor: Optional[Qwen2VLImageProcessor], + ) -> int: + _, num_image_tokens = self._get_vision_info( + image_width=image_width, + image_height=image_height, + image_processor=image_processor, + ) + return num_image_tokens + + def get_num_video_tokens( + self, + *, + image_width: int, + image_height: int, + num_frames: int, + image_processor: Optional[Qwen2VLImageProcessor], + ) -> int: + _, num_video_tokens = self._get_vision_info( + image_width=image_width, + image_height=image_height, + num_frames=num_frames, + image_processor=image_processor, + ) + return num_video_tokens + + def get_image_size_with_most_features(self) -> ImageSize: + max_image_size, _ = self._get_vision_info( + image_width=9999999, + image_height=9999999, + image_processor=None, + ) + return max_image_size + + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + image_processor=None, + ) + + def _get_max_video_frames(self, max_tokens: int) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + num_frames = 0 + + while True: + next_num_frames = num_frames + 1 + next_max_tokens = self.get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=next_num_frames, + image_processor=None, + ) + + if next_max_tokens > max_tokens: + break + + num_frames = next_num_frames + + return num_frames + + def get_num_frames_with_most_features(self, seq_len: int) -> int: + mm_config = self.ctx.get_mm_config() + max_images = mm_config.limit_per_prompt.get("image", 1) + 
max_videos = mm_config.limit_per_prompt.get("video", 1) + + max_image_tokens = self.get_max_image_tokens() * max_images + max_total_frames = self._get_max_video_frames(seq_len - + max_image_tokens) + num_frames = min(max(max_total_frames // max(max_videos, 1), 1), + _MAX_FRAMES_PER_VIDEO) + + # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 + if num_frames > 1 and num_frames % 2 == 1: + num_frames += 1 + + return num_frames + + def get_max_video_tokens(self, seq_len: int) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=self.get_num_frames_with_most_features(seq_len), + image_processor=None, + ) + + +class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + hf_processor = self.info.get_hf_processor() + image_token: str = hf_processor.image_token + video_token: str = hf_processor.video_token + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + target_num_frames = \ + self.info.get_num_frames_with_most_features(seq_len) + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + "video": + self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=target_num_frames, + num_videos=num_videos, + ) + } + + return ProcessorInputs( + prompt_text=image_token * num_images + video_token * num_videos, + mm_data=mm_data, + ) + + +class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] + ): + + def _get_data_parser(self) -> MultiModalDataParser: + return Qwen2MultiModalDataParser() + def _get_prompt_replacements( self, mm_items: 
MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self._get_hf_processor() - image_processor = _get_image_processor(hf_processor) + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + image_processor = self.info.get_image_processor( + **hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has # image_token and video_token registered placeholder = { - "image": hf_processor.image_token, - "video": hf_processor.video_token, + "image": vocab[hf_processor.image_token], + "video": vocab[hf_processor.video_token], } + merge_length = image_processor.merge_size**2 def get_replacement_qwen2vl(item_idx: int, modality: str): grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx] assert isinstance(grid_thw, torch.Tensor) - num_tokens = grid_thw.prod() // merge_length - return placeholder[modality] * num_tokens + num_tokens = int(grid_thw.prod()) // merge_length + return [placeholder[modality]] * num_tokens return [ PromptReplacement( modality=modality, - target=placeholder[modality], + target=[placeholder[modality]], replacement=partial(get_replacement_qwen2vl, modality=modality), ) for modality in ("image", "video") @@ -872,37 +1010,10 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): video_grid_thw=MultiModalFieldConfig.batched("video"), ) - def _get_dummy_mm_inputs( - self, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - hf_processor = self._get_hf_processor() - image_processor = _get_image_processor(hf_processor) - image_token: str = hf_processor.image_token - resized_height, resized_width = smart_resize( - height=9999999, - width=9999999, - factor=image_processor.patch_size * image_processor.merge_size, - min_pixels=image_processor.min_pixels, - max_pixels=image_processor.max_pixels, - ) - 
num_images = mm_counts.get("image", 0) - - mm_data = { - "image": - self._get_dummy_images(width=resized_width, - height=resized_height, - num_images=num_images) - } - - return ProcessorInputs( - prompt_text=image_token * num_images, - mm_data=mm_data, - ) - - -@MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor, + info=Qwen2VLProcessingInfo, + dummy_inputs=Qwen2VLDummyInputsBuilder) class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP): packed_modules_mapping = { @@ -944,11 +1055,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config: Qwen2VLConfig = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config - assert not cache_config.enable_prefix_caching, \ - "Qwen2-VL currently does not support prefix caching" self.config = config self.multimodal_config = multimodal_config @@ -1070,85 +1178,122 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, video_embeds=video_embeds, video_grid_thw=video_grid_thw) - def _process_image_input(self, - image_input: Qwen2VLImageInputs) -> torch.Tensor: + def _process_image_input( + self, image_input: Qwen2VLImageInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + if image_input["type"] == "image_embeds": - return image_input["image_embeds"].type(self.visual.dtype) + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + pixel_values = image_input["pixel_values"].type(self.visual.dtype) + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) - pixel_values = image_input["pixel_values"].type(self.visual.dtype) - image_embeds = self.visual(pixel_values, - 
grid_thw=image_input["image_grid_thw"]) - return image_embeds + # Split concatenated embeddings for each image item. + merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + + return image_embeds.split(sizes.tolist()) + + def _process_video_input( + self, video_input: Qwen2VLVideoInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 - def _process_video_input(self, - video_input: Qwen2VLVideoInputs) -> torch.Tensor: if video_input["type"] == "video_embeds": - return video_input["video_embeds"].type(self.visual.dtype) + video_embeds = video_input["video_embeds"].type(self.visual.dtype) + else: + pixel_values_videos = video_input["pixel_values_videos"].type( + self.visual.dtype) + video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) - pixel_values_videos = video_input["pixel_values_videos"].type( - self.visual.dtype) - video_embeds = self.visual(pixel_values_videos, - grid_thw=video_input["video_grid_thw"]) - return video_embeds + # Split concatenated embeddings for each video item. + merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size - def _merge_multimodal_embeddings( - self, - input_ids: torch.Tensor, - inputs_embeds: torch.Tensor, - multimodal_embeddings: torch.Tensor, - placeholder_token_id: int, - ) -> torch.Tensor: - mask = (input_ids == placeholder_token_id) - inputs_embeds[mask, :] = multimodal_embeddings - return inputs_embeds + return video_embeds.split(sizes.tolist()) + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + modalities = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. 
+ for input_key in kwargs: + if input_key in ("pixel_values", + "image_embeds") and "images" not in modalities: + modalities["images"] = self._parse_and_validate_image_input( + **kwargs) + if input_key in ("pixel_values_videos", + "video_embeds") and "videos" not in modalities: + modalities["videos"] = self._parse_and_validate_video_input( + **kwargs) + + return modalities def get_multimodal_embeddings( - self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]: + self, **kwargs) -> Optional[tuple[torch.Tensor, ...]]: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - if image_input is None and video_input is None: + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: return None - # We make a tuple of each embedding with its modality string. This is a - # temporary workaround for models to handle mixed modalities when - # get_multimodal_embeddings and get_input_embeddings are called - # separately. - # TODO(ywang96): Add support for mixed-modality inference for v1. - multimodal_embeddings: List[Tuple[NestedTensors, str]] = [] + # The result multimodal_embeddings is tuple of tensors, with each + # tensor correspoending to a multimodal data item (image or video). + multimodal_embeddings: tuple[torch.Tensor, ...] = () - if image_input is not None: - image_embeds = self._process_image_input(image_input) - multimodal_embeddings.append((image_embeds, "image")) - if video_input is not None: - video_embeds = self._process_video_input(video_input) - multimodal_embeddings.append((video_embeds, "video")) + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities. 
+ for modality in modalities: + if modality == "images": + image_input = modalities["images"] + vision_embeddings = self._process_image_input(image_input) + multimodal_embeddings += vision_embeddings + if modality == "videos": + video_input = modalities["videos"] + video_embeddings = self._process_video_input(video_input) + multimodal_embeddings += video_embeddings return multimodal_embeddings def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[List[Tuple[NestedTensors, - str]]] = None, + multimodal_embeddings: Optional[tuple[torch.Tensor, ...]] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: - for embeddings, modality in multimodal_embeddings: - if modality == "image": - inputs_embeds = self._merge_multimodal_embeddings( - input_ids, - inputs_embeds, - embeddings, - placeholder_token_id=self.config.image_token_id, - ) - if modality == "video": - inputs_embeds = self._merge_multimodal_embeddings( - input_ids, - inputs_embeds, - embeddings, - placeholder_token_id=self.config.video_token_id, - ) + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + [self.config.image_token_id, self.config.video_token_id]) + return inputs_embeds + + def get_input_embeddings_v0( + self, + input_ids: torch.Tensor, + image_input: Optional[tuple[torch.Tensor, ...]] = None, + video_input: Optional[tuple[torch.Tensor, ...]] = None, + ) -> torch.Tensor: + + inputs_embeds = self.get_input_embeddings(input_ids) + if image_input is not None: + image_embeds = self._process_image_input(image_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + image_embeds, + placeholder_token_id=self.config.image_token_id, + ) + + if video_input is not None: + video_embeds = self._process_video_input(video_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + video_embeds, + 
placeholder_token_id=self.config.video_token_id, + ) return inputs_embeds def forward( @@ -1184,22 +1329,25 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. + # NOTE: In v1, inputs_embeds is always generated at model runner from + # `get_multimodal_embeddings` and `get_input_embeddings`, this + # condition is only for v0 compatibility. elif inputs_embeds is None: - multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) - # We need to check for usage of mrope here in case there is - # multimodal data. - # TODO (ywang96): move this to model runner in V1. - if multimodal_embeddings is not None and uses_mrope(self.config): - assert positions.ndim == 2 and positions.size(0) == 3, ( - "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}") - - inputs_embeds = self.get_input_embeddings(input_ids, - multimodal_embeddings) - input_ids = None + if image_input is None and video_input is None: + inputs_embeds = None + else: + if uses_mrope(self.config): + assert positions.ndim == 2 and positions.size(0) == 3, ( + "multimodal section rotary embedding requires " + f"(3, seq_len) positions, but got {positions.size()}") + inputs_embeds = self.get_input_embeddings_v0( + input_ids, + image_input=image_input, + video_input=video_input) + input_ids = None hidden_states = self.language_model.model( input_ids=input_ids, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 62840b8c1b..8d2719ca2d 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -47,6 +47,7 @@ _TEXT_GENERATION_MODELS = { "DeepseekV3ForCausalLM": 
("deepseek_v3", "DeepseekV3ForCausalLM"), "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), + "Fairseq2LlamaForCausalLM": ("fairseq2_llama", "Fairseq2LlamaForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), "GlmForCausalLM": ("glm", "GlmForCausalLM"), @@ -60,6 +61,7 @@ _TEXT_GENERATION_MODELS = { "InternLMForCausalLM": ("llama", "LlamaForCausalLM"), "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"), "InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"), + "InternLM3ForCausalLM": ("llama", "LlamaForCausalLM"), "JAISLMHeadModel": ("jais", "JAISLMHeadModel"), "JambaForCausalLM": ("jamba", "JambaForCausalLM"), "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), @@ -125,6 +127,7 @@ _EMBEDDING_MODELS = { "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), + "Qwen2ForProcessRewardModel": ("qwen2_rm", "Qwen2ForProcessRewardModel"), "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 @@ -149,6 +152,7 @@ _MULTIMODAL_MODELS = { "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501 "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"), + "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"), "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index ba1a78ac64..5997a76890 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -1,3 +1,4 @@ 
+import itertools from typing import Iterable, List, Optional, Tuple import torch @@ -20,6 +21,30 @@ from vllm.transformers_utils.config import ( from .interfaces import SupportsCrossEncoding +def roberta_task_weights_filter( + all_weights: Iterable[Tuple[str, torch.Tensor]] +) -> Tuple[Iterable[Tuple[str, torch.Tensor]], Iterable[Tuple[str, + torch.Tensor]]]: + """ + Separate task-specific weights that are applied on top + of the encoder-decoder bert base. + To do so, return two generators over the original iterator. + Also, remove the "roberta." prefix to make it loadable + from vanilla BertModel. + """ + # Copy of a lazy iterator without in-memory overhead so both + # iterators can be iterated upon independently. + all_weights1, all_weights2 = itertools.tee(all_weights) + + def encoder_decoder_weights(): + for name, weight in all_weights1: + if name.startswith("roberta."): + yield (name[len("roberta."):], weight) + + return encoder_decoder_weights(), ((n, w) for n, w in all_weights2 + if not n.startswith("roberta.")) + + class RobertaEmbedding(nn.Module): def __init__(self, config: RobertaConfig): @@ -152,6 +177,18 @@ class RobertaEmbeddingModel(BertEmbeddingModel): prefix=prefix, embedding_class=RobertaEmbedding) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + weights = self.hf_to_vllm_mapper.apply(weights) + # Separate weights in "roberta"-prefixed and all else (not in memory). + # For use with models like FacebookAI/roberta-base. + bert_weights, task_weights = roberta_task_weights_filter(weights) + loaded = self.model.load_weights(bert_weights) + if not len(loaded): + # Fix for models like `sentence-transformers/stsb-roberta-base-v2` + # which use the same architecture, but have no "roberta" prefix. 
+ loaded = self.model.load_weights(task_weights) + assert len(loaded), "Unable to load RobertaEmbeddingModel" + class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding): """A model that uses Roberta to provide embedding functionalities. @@ -181,20 +218,12 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding): def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - self_weights = [] - - def weight_filter(): - for name, weight in weights: - if name.startswith("roberta."): - yield (name[len("roberta."):], weight) - else: - self_weights.append((name, weight)) - - self.roberta.load_weights(weight_filter()) + bert_weights, task_weights = roberta_task_weights_filter(weights) + self.roberta.load_weights(bert_weights) params_dict = dict(self.named_parameters()) - for name, loaded_weight in self_weights: + for name, loaded_weight in task_weights: if name.startswith("classifier"): param = params_dict[name] weight_loader = getattr(param, "weight_loader", diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 115eaaac90..1e51018973 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -24,11 +24,10 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens, - resolve_visual_encoder_outputs) + repeat_and_pad_placeholder_tokens) from vllm.sequence import SequenceData -from .vision import VisionEncoderInfo +from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs def get_siglip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -171,15 +170,18 @@ class SiglipEncoderInfo(VisionEncoderInfo[SiglipVisionConfig]): def get_max_image_tokens(self) -> int: return get_max_siglip_image_tokens(self.vision_config) - def 
get_num_patches(self) -> int: + def get_image_size(self) -> int: + return self.vision_config.image_size + + def get_patch_size(self) -> int: + return self.vision_config.patch_size + + def get_patch_grid_length(self) -> int: return get_siglip_patch_grid_length( image_size=self.vision_config.image_size, patch_size=self.vision_config.patch_size, ) - def get_image_size(self) -> int: - return self.vision_config.image_size - # Adapted from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L249 # noqa class SiglipVisionEmbeddings(nn.Module): @@ -342,10 +344,14 @@ class SiglipMLP(nn.Module): self.config = config self.activation_fn = get_act_fn(config.hidden_act) - - # For quantization, we require the hidden size to be a multiple of 64 - quantizable = (config.hidden_size % 64 == 0 - and config.intermediate_size % 64 == 0) + # Special handling for BNB quantization + if quant_config and quant_config.get_name() == "bitsandbytes": + quantizable = True + else: + # For other quantization, we require the hidden size to be a + # multiple of 64 + quantizable = (config.hidden_size % 64 == 0 + and config.intermediate_size % 64 == 0) self.fc1 = ColumnParallelLinear( config.hidden_size, config.intermediate_size, diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index a7cf65a0e3..e6d919f23c 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -30,8 +30,7 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size) +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from 
vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -39,16 +38,13 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -401,14 +397,6 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "lm_head": "output_embeddings", } embedding_padding_modules = ["lm_head"] - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -417,6 +405,7 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config + self.quant_config = quant_config self.model = SolarModel( vllm_config=vllm_config, @@ -499,12 +488,14 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): # Models 
trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - if scale_name := get_compressed_tensors_cache_scale(name): - # Loading kv cache scales for compressed-tensors quantization + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue @@ -542,32 +533,3 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params - - # If this function is called, it should always initialize KV cache scale - # factors (or else raise an exception). Thus, handled exceptions should - # make sure to leave KV cache scale factors in a known good (dummy) state - def load_kv_cache_scales(self, quantization_param_path: str) -> None: - tp_size = get_tensor_model_parallel_world_size() - tp_rank = get_tensor_model_parallel_rank() - for layer_idx, scaling_factor in kv_cache_scales_loader( - quantization_param_path, - tp_rank, - tp_size, - self.config.num_hidden_layers, - self.config.__class__.model_type, - ): - if not isinstance(self.model.layers[layer_idx], nn.Identity): - layer_self_attn = self.model.layers[layer_idx].self_attn - - if current_platform.is_rocm(): - # The scaling factor convention we are assuming is - # quantized_value * scaling_factor ~= true_value - # which is consistent with the practice of setting - # scaling_factor = tensor_amax / FPtype_max - scaling_factor *= 2 - if hasattr(layer_self_attn.attn, "_k_scale"): - layer_self_attn.attn._k_scale = scaling_factor - layer_self_attn.attn._v_scale = scaling_factor - else: - raise RuntimeError("Self attention has no KV cache scaling " - 
"factor attribute!") diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 6b2107bef0..c9d1af7824 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -22,7 +22,7 @@ from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn -from transformers import PretrainedConfig +from transformers import StableLmConfig from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig @@ -50,8 +50,9 @@ from .utils import (is_pp_missing_parameter, class StablelmMLP(nn.Module): def __init__(self, - config: PretrainedConfig, - quant_config: Optional[QuantizationConfig] = None) -> None: + config: StableLmConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: super().__init__() self.config = config self.hidden_size = config.hidden_size @@ -59,10 +60,13 @@ class StablelmMLP(nn.Module): self.gate_up_proj = MergedColumnParallelLinear( config.hidden_size, [config.intermediate_size] * 2, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") self.down_proj = RowParallelLinear(config.intermediate_size, config.hidden_size, - bias=False) + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") self.act_fn = SiluAndMul() def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -75,7 +79,7 @@ class StablelmMLP(nn.Module): class StablelmAttention(nn.Module): def __init__(self, - config: PretrainedConfig, + config: StableLmConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "") -> None: @@ -116,11 +120,13 @@ class StablelmAttention(nn.Module): self.total_num_heads, self.total_num_key_value_heads, self.qkv_bias, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj") self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, 
self.hidden_size, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.o_proj") self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.rotary_ndims, @@ -154,7 +160,7 @@ class StablelmDecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: StableLmConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -164,7 +170,7 @@ class StablelmDecoderLayer(nn.Module): cache_config, quant_config, prefix=f"{prefix}.self_attn") - self.mlp = StablelmMLP(config, quant_config) + self.mlp = StablelmMLP(config, quant_config, prefix=f"{prefix}.mlp") norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05)) self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps) @@ -210,6 +216,8 @@ class StableLMEpochModel(nn.Module): self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens", ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, @@ -270,7 +278,8 @@ class StablelmForCausalLM(nn.Module, SupportsPP): prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.lm_head") if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 22189a517d..1cd0dedfed 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -88,12 +88,14 @@ class Starcoder2Attention(nn.Module): self.total_num_kv_heads, bias=self.use_bias, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, 
self.hidden_size, bias=self.use_bias, quant_config=quant_config, + prefix=f"{prefix}.o_proj", ) self.rotary_emb = get_rope( self.head_dim, @@ -129,19 +131,22 @@ class Starcoder2MLP(nn.Module): def __init__(self, config: Starcoder2Config, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): super().__init__() self.c_fc = ColumnParallelLinear( config.hidden_size, config.intermediate_size, bias=config.use_bias, quant_config=quant_config, + prefix=f"{prefix}.c_fc", ) self.c_proj = RowParallelLinear( config.intermediate_size, config.hidden_size, bias=config.use_bias, quant_config=quant_config, + prefix=f"{prefix}.c_proj", ) self.act = get_act_fn(config.hidden_act) @@ -165,7 +170,9 @@ class Starcoder2DecoderLayer(nn.Module): cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn") - self.mlp = Starcoder2MLP(config, quant_config=quant_config) + self.mlp = Starcoder2MLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon) self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, @@ -213,8 +220,11 @@ class Starcoder2Model(nn.Module): self.vocab_size = config.vocab_size # TODO: consider padding_idx (currently removed) - self.embed_tokens = VocabParallelEmbedding(config.vocab_size, - config.hidden_size) + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens") self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Starcoder2DecoderLayer( @@ -279,6 +289,7 @@ class Starcoder2ForCausalLM(nn.Module, SupportsPP): org_num_embeddings=config.vocab_size, padding_size=DEFAULT_VOCAB_PADDING_SIZE, quant_config=quant_config, + prefix=f"{prefix}.lm_head", ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) diff --git 
a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 0b83684c9b..d577e545a4 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -1,12 +1,10 @@ # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py """PyTorch Ultravox model.""" - import math from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set, + Tuple, TypedDict, Union) -import numpy as np import torch import torch.utils.checkpoint from torch import nn @@ -15,9 +13,10 @@ from transformers import BatchFeature, ProcessorMixin from transformers.models.whisper import WhisperFeatureExtractor from transformers.models.whisper.modeling_whisper import WhisperEncoder +from vllm import envs from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn +from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import DefaultModelLoader @@ -25,19 +24,21 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) -from vllm.multimodal.parse import MultiModalDataParser +from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, - PromptReplacement) + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from 
vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig -from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings, merge_multimodal_embeddings_from_map) +_AUDIO_PLACEHOLDER_OVERRIDE = "<|reserved_special_token_0|>" +_AUDIO_PLACEHOLDER_TOKEN = 128002 _AUDIO_TOKENS_PER_SECOND = 6.25 @@ -57,32 +58,76 @@ UltravoxAudioInputs = Union[UltravoxAudioFeatureInputs, UltravoxAudioEmbeddingInputs] -class UltravoxMultiModalProcessor(BaseMultiModalProcessor): +class UltravoxProcessingInfo(BaseProcessingInfo): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"audio": None} - - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: - feature_extractor = self._get_feature_extractor() - max_audio_tokens = math.ceil(feature_extractor.chunk_length * - _AUDIO_TOKENS_PER_SECOND) - - return {"audio": max_audio_tokens} - - def _get_hf_processor( + def get_hf_processor( self, *, # Ignored in initialization sampling_rate: Optional[int] = None, ) -> ProcessorMixin: - return self.ctx.get_hf_processor() + hf_processor = self.ctx.get_hf_processor() - def _get_feature_extractor(self) -> WhisperFeatureExtractor: - hf_processor = self._get_hf_processor() - return hf_processor.audio_processor.feature_extractor # type: ignore + # NOTE: Ultravox processing definition uses '<|eot_id|>' as the + # placeholder that will cause confusion with the actual end of turn + # token, thus we override placeholder with a reserved special + # token. 
+ hf_processor.audio_token_replacement = _AUDIO_PLACEHOLDER_OVERRIDE + return hf_processor + + def get_feature_extractor( + self, + *, + # Ignored in initialization + sampling_rate: Optional[int] = None, + ) -> WhisperFeatureExtractor: + hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) + audio_processor = hf_processor.audio_processor # type: ignore + feature_extractor = audio_processor.feature_extractor # type: ignore + assert isinstance(feature_extractor, WhisperFeatureExtractor) + return feature_extractor + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"audio": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + feature_extractor = self.get_feature_extractor() + max_audio_tokens = math.ceil(feature_extractor.chunk_length * + _AUDIO_TOKENS_PER_SECOND) + + return {"audio": max_audio_tokens} + + +class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo] + ): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + feature_extractor = self.info.get_feature_extractor() + + sampling_rate = feature_extractor.sampling_rate + audio_len = feature_extractor.chunk_length * sampling_rate + num_audios = mm_counts.get("audio", 0) + + mm_data = { + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } + + return ProcessorInputs( + prompt_text="<|audio|>" * num_audios, + mm_data=mm_data, + ) + + +class UltravoxMultiModalProcessor( + BaseMultiModalProcessor[UltravoxProcessingInfo]): def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self._get_feature_extractor() + feature_extractor = self.info.get_feature_extractor() return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) def _call_hf_processor( @@ -92,34 +137,21 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): mm_kwargs: Mapping[str, object], ) -> BatchFeature: # Text-only input not supported 
in composite processor - if not mm_data: - tokenizer = self._get_tokenizer() - - prompt_ids = tokenizer.encode( - prompt, - add_special_tokens=False, # type: ignore - ) + if not mm_data or not mm_data.get("audios", []): + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") mm_data = dict(mm_data) audios = mm_data.pop("audios", []) + assert isinstance(audios, list) - if not audios: - return super()._call_hf_processor( - prompt=prompt, - mm_data=mm_data, - mm_kwargs=mm_kwargs, - ) - - feature_extractor = self._get_feature_extractor() + feature_extractor = self.info.get_feature_extractor() mm_kwargs = dict( **mm_kwargs, sampling_rate=feature_extractor.sampling_rate, ) - # Already resampled by _get_hf_mm_data - assert is_list_of(audios, np.ndarray) - # Ultravox processor doesn't support multiple inputs, # therefore we need to input text and audio one by one audio_features, audio_token_len = [], [] @@ -145,6 +177,16 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): ) return BatchFeature(combined_outputs) + def _apply_hf_processor_tokens_only( + self, + prompt_tokens: list[int], + ) -> list[int]: + # HF processor omits bos_token_id by setting add_special_tokens=False + tokenizer = self.info.get_tokenizer() + assert prompt_tokens[0] == tokenizer.bos_token_id + + return prompt_tokens[1:] + def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -159,15 +201,19 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self._get_hf_processor() - placeholder = hf_processor.audio_token_replacement # type: ignore + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + 
tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + + replacement_id = vocab[ + hf_processor.audio_token_replacement] # type: ignore def get_replacement_ultravox(item_idx: int): audio_token_len = out_mm_kwargs["audio_token_len"][item_idx] - return placeholder * audio_token_len + return [replacement_id] * int(audio_token_len) # type: ignore return [ PromptReplacement( @@ -177,26 +223,6 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): ) ] - def _get_dummy_mm_inputs( - self, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - feature_extractor = self._get_feature_extractor() - - sampling_rate = feature_extractor.sampling_rate - audio_len = feature_extractor.chunk_length * sampling_rate - num_audios = mm_counts.get("audio", 0) - - mm_data = { - "audio": - self._get_dummy_audios(length=audio_len, num_audios=num_audios) - } - - return ProcessorInputs( - prompt_text="<|audio|>" * num_audios, - mm_data=mm_data, - ) - class StackAudioFrames(nn.Module): """ @@ -219,15 +245,6 @@ class StackAudioFrames(nn.Module): return audio_embeds -class FlippedSiluAndMul(SiluAndMul): - """Ultravox is trained with SwiGLU with flipped halves.""" - - def forward(self, x: torch.Tensor): - a, b = x.chunk(2, dim=-1) - flipped = torch.cat((b, a), dim=-1) - return super().forward(flipped) - - class UltravoxProjector(nn.Module): def __init__(self, config: UltravoxConfig): @@ -240,7 +257,7 @@ class UltravoxProjector(nn.Module): dim = self.hidden_dim if config.projector_act == "swiglu": - self.act = FlippedSiluAndMul() + self.act = MulAndSilu() dim = dim // 2 else: self.act = get_act_fn(config.projector_act) @@ -316,7 +333,10 @@ class ModifiedWhisperEncoder(WhisperEncoder): return hidden_states -@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor, + info=UltravoxProcessingInfo, + dummy_inputs=UltravoxDummyInputsBuilder + ) class UltravoxModel(nn.Module, SupportsMultiModal, 
SupportsPP): hf_to_vllm_mapper = WeightsMapper( @@ -449,11 +469,15 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: - # TODO(ywang96): use merge_multimodal_embeddings after - # v0 is deprecated - merge_multimodal_embeddings_from_map( - inputs_embeds, multimodal_embeddings, - attn_metadata.multi_modal_placeholder_index_maps["audio"]) + # TODO(ywang96): remove this block after v0 is deprecated. + if not envs.VLLM_USE_V1: + merge_multimodal_embeddings_from_map( + inputs_embeds, multimodal_embeddings, + attn_metadata.multi_modal_placeholder_index_maps["audio"]) + else: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + _AUDIO_PLACEHOLDER_TOKEN) return inputs_embeds def forward(self, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 31017f16d3..43b3c973c9 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -8,16 +8,12 @@ import torch.nn as nn from torch.func import functional_call from transformers import PretrainedConfig -import vllm.envs as envs -from vllm.attention.selector import (backend_name_to_enum, - get_global_forced_attn_backend) from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors -from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors -from vllm.utils import is_pin_memory_available, print_warning_once +from vllm.utils import is_pin_memory_available logger = init_logger(__name__) @@ -281,6 +277,15 @@ def flatten_bn( ... +@overload +def flatten_bn( + x: Union[List[torch.Tensor], torch.Tensor], + *, + concat: bool = False, +) -> Union[List[torch.Tensor], torch.Tensor]: + ... 
+ + def flatten_bn( x: Union[List[torch.Tensor], torch.Tensor], *, @@ -603,37 +608,6 @@ def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int): return make_empty_intermediate_tensors -def get_vit_attn_backend(support_fa: bool = False) -> _Backend: - """ - Get the available attention backend for Vision Transformer. - """ - # TODO(Isotr0py): Remove `support_fa` after support FA for all ViTs attn. - selected_backend: Optional[_Backend] = get_global_forced_attn_backend() - if selected_backend is None: - backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND - if backend_by_env_var is not None: - selected_backend = backend_name_to_enum(backend_by_env_var) - if selected_backend is None: - # For Volta and Turing GPUs, use xformers instead. - device_available = current_platform.has_device_capability(80) - if device_available and support_fa: - from transformers.utils import is_flash_attn_2_available - if is_flash_attn_2_available(): - selected_backend = _Backend.FLASH_ATTN - else: - print_warning_once( - "Current `vllm-flash-attn` has a bug inside vision module, " - "so we use xformers backend instead. You can run " - "`pip install flash-attn` to use flash-attention backend.") - selected_backend = _Backend.XFORMERS - elif current_platform.is_cpu() or current_platform.is_rocm(): - # ROCM doesn't support xformers - selected_backend = _Backend.TORCH_SDPA - else: - selected_backend = _Backend.XFORMERS - return selected_backend - - def maybe_prefix(prefix: str, name: str) -> str: """Add a prefix to a name if the prefix is non-empty. 
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 65a773480d..57166f05cd 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -1,8 +1,17 @@ from abc import ABC, abstractmethod -from typing import Generic, TypeVar +from typing import Final, Generic, Optional, Protocol, TypeVar, Union +import torch from transformers import PretrainedConfig +import vllm.envs as envs +from vllm.attention.selector import (backend_name_to_enum, + get_global_forced_attn_backend) +from vllm.logger import init_logger +from vllm.platforms import _Backend, current_platform + +logger = init_logger(__name__) + _C = TypeVar("_C", bound=PretrainedConfig) @@ -26,21 +35,31 @@ class VisionEncoderInfo(ABC, Generic[_C]): def get_max_image_tokens(self) -> int: raise NotImplementedError - @abstractmethod - def get_num_patches(self) -> int: - raise NotImplementedError - @abstractmethod def get_image_size(self) -> int: raise NotImplementedError + @abstractmethod + def get_patch_size(self) -> int: + raise NotImplementedError -def vision_encoder_info(vision_config: PretrainedConfig) -> VisionEncoderInfo: + @abstractmethod + def get_patch_grid_length(self) -> int: + raise NotImplementedError + + +class VisionLanguageConfig(Protocol): + vision_config: Final[PretrainedConfig] + + +def get_vision_encoder_info( + hf_config: VisionLanguageConfig) -> VisionEncoderInfo: # Avoid circular imports from .clip import CLIPEncoderInfo, CLIPVisionConfig from .pixtral import PixtralHFEncoderInfo, PixtralVisionConfig from .siglip import SiglipEncoderInfo, SiglipVisionConfig + vision_config = hf_config.vision_config if isinstance(vision_config, CLIPVisionConfig): return CLIPEncoderInfo(vision_config) if isinstance(vision_config, PixtralVisionConfig): @@ -50,3 +69,79 @@ def vision_encoder_info(vision_config: PretrainedConfig) -> VisionEncoderInfo: msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) + + 
+def get_vit_attn_backend(support_fa: bool = False) -> _Backend: + """ + Get the available attention backend for Vision Transformer. + """ + # TODO(Isotr0py): Remove `support_fa` after support FA for all ViTs attn. + selected_backend: Optional[_Backend] = get_global_forced_attn_backend() + if selected_backend is None: + backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND + if backend_by_env_var is not None: + selected_backend = backend_name_to_enum(backend_by_env_var) + if selected_backend is None: + if current_platform.is_cuda(): + device_available = current_platform.has_device_capability(80) + if device_available and support_fa: + from transformers.utils import is_flash_attn_2_available + if is_flash_attn_2_available(): + selected_backend = _Backend.FLASH_ATTN + else: + logger.warning_once( + "Current `vllm-flash-attn` has a bug inside vision " + "module, so we use xformers backend instead. You can " + "run `pip install flash-attn` to use flash-attention " + "backend.") + selected_backend = _Backend.XFORMERS + else: + # For Volta and Turing GPUs, use xformers instead. + selected_backend = _Backend.XFORMERS + else: + # Default to torch SDPA for other non-GPU platforms. + selected_backend = _Backend.TORCH_SDPA + return selected_backend + + +def resolve_visual_encoder_outputs( + encoder_outputs: Union[torch.Tensor, list[torch.Tensor]], + feature_sample_layers: Optional[list[int]], + post_layer_norm: Optional[torch.nn.LayerNorm], + max_possible_layers: int, +) -> torch.Tensor: + """Given the outputs a visual encoder module that may correspond to the + output of the last layer, or a list of hidden states to be stacked, + handle post normalization and resolve it into a single output tensor. + + Args: + encoder_outputs: Output of encoder's last layer or all hidden states. + feature_sample_layers: Optional layer indices to grab from the encoder + outputs; if provided, encoder outputs must be a list. 
+ post_layer_norm: Post norm to apply to the output of the encoder. + max_possible_layers: Total layers in the fully loaded visual encoder. + + """ + if feature_sample_layers is None: + if post_layer_norm is not None: + return post_layer_norm(encoder_outputs) + return encoder_outputs + + # Get the hidden states corresponding to the layer indices. + # Negative values are relative to the full visual encoder, + # so offset them depending on how many layers were loaded. + # NOTE: this assumes that encoder_outputs contains a list + # of hidden states in the same order as the encoder layers + # that produced them. + offset = max_possible_layers - len(encoder_outputs) + hs_pool = [ + encoder_outputs[layer_idx] + if layer_idx >= 0 else encoder_outputs[layer_idx + offset] + for layer_idx in feature_sample_layers + ] + + # Apply post-norm on the final hidden state if we are using it + uses_last_layer = feature_sample_layers[-1] in (len(hs_pool) - 1, -1) + if post_layer_norm is not None and uses_last_layer: + hs_pool[-1] = post_layer_norm(encoder_outputs) + return torch.cat(hs_pool, dim=-1) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index cb54b4c3ba..b8512b735d 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -106,6 +106,7 @@ class WhisperAttention(nn.Module): cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + attn_type=self.attn_type, ) def _init_qkv( @@ -134,12 +135,7 @@ class WhisperAttention(nn.Module): qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - attn_output = self.attn(q, - k, - v, - kv_cache, - attn_metadata, - attn_type=self.attn_type) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) output, _ = self.out_proj(attn_output) @@ -164,6 +160,7 @@ class WhisperCrossAttention(WhisperAttention): cache_config=cache_config, quant_config=quant_config, prefix=prefix, + 
attn_type=AttentionType.ENCODER_DECODER, ) def _init_qkv( @@ -207,12 +204,13 @@ class WhisperCrossAttention(WhisperAttention): else: k = v = None - attn_output = self.attn(q, - k, - v, - kv_cache, - attn_metadata, - attn_type=AttentionType.ENCODER_DECODER) + attn_output = self.attn( + q, + k, + v, + kv_cache, + attn_metadata, + ) output, _ = self.out_proj(attn_output) @@ -731,7 +729,22 @@ class WhisperForConditionalGeneration(nn.Module, SupportsMultiModal): def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."]) - loaded_weights = [(name, loaded_weight) - for name, loaded_weight in weights] mapper = WeightsMapper({".fc1.": ".mlp.fc1.", ".fc2.": ".mlp.fc2."}) - return loader.load_weights(loaded_weights, mapper=mapper) \ No newline at end of file + # add fake zeros bias for k_proj to state_dict + weights = _create_fake_bias_for_k_proj(weights) + return loader.load_weights(weights, mapper=mapper) + + +def _create_fake_bias_for_k_proj( + weights: Iterable[Tuple[str, torch.Tensor]] +) -> Iterable[Tuple[str, torch.Tensor]]: + """ + Create full zeros bias for k_proj weight in self-attention layers. + So that the bias for k_proj in qkv_proj can be initialized with zeros. 
+ """ + for name, weight in weights: + if ".self_attn.k_proj.weight" in name: + bias = torch.zeros(weight.size(0)) + bias_name = name.replace("weight", "bias") + yield from [(name, weight), (bias_name, bias)] + yield name, weight diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py index 02d22a5ca6..a9ce8af15d 100644 --- a/vllm/model_executor/parameter.py +++ b/vllm/model_executor/parameter.py @@ -6,6 +6,7 @@ from torch.nn import Parameter from vllm.distributed import get_tensor_model_parallel_rank from vllm.logger import init_logger +from vllm.model_executor.utils import _make_synced_weight_loader __all__ = [ "BasevLLMParameter", "PackedvLLMParameter", "PerTensorScaleParameter", @@ -37,14 +38,32 @@ class BasevLLMParameter(Parameter): :returns: a torch.nn.parameter """ + # During weight loading, we often do something like: + # narrowed_tensor = param.data.narrow(0, offset, len) + # narrowed_tensor.copy_(real_weight) + # expecting narrowed_tensor and param.data to share the same storage. + # However, on TPUs, narrowed_tensor will lazily propagate to the base + # tensor, which is param.data, leading to the redundant memory usage. + # This sometimes causes OOM errors during model loading. To avoid this, + # we sync the param tensor after its weight loader is called. 
+ from vllm.platforms import current_platform + if current_platform.is_tpu(): + weight_loader = _make_synced_weight_loader(weight_loader) + self._weight_loader = weight_loader @property def weight_loader(self): return self._weight_loader + def _is_1d_and_scalar(self, loaded_weight: torch.Tensor): + cond1 = self.data.ndim == 1 and self.data.numel() == 1 + cond2 = loaded_weight.ndim == 0 and loaded_weight.numel() == 1 + return (cond1 and cond2) + def _assert_and_load(self, loaded_weight: torch.Tensor): - assert self.data.shape == loaded_weight.shape + assert (self.data.shape == loaded_weight.shape + or self._is_1d_and_scalar(loaded_weight)) self.data.copy_(loaded_weight) def load_column_parallel_weight(self, loaded_weight: torch.Tensor): diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index e58bbe8171..1d7f5d57fa 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,4 +1,5 @@ from .base import MultiModalPlaceholderMap, MultiModalPlugin +from .hasher import MultiModalHashDict, MultiModalHasher from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins, MultiModalDataDict, MultiModalKwargs, MultiModalPlaceholderDict, NestedTensors) @@ -7,10 +8,10 @@ from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() """ The global :class:`~MultiModalRegistry` is used by model runners to -dispatch data processing according to its modality and the target model. +dispatch data processing according to the target model. 
See also: - :ref:`input-processing-pipeline` + :ref:`mm-processing` """ __all__ = [ @@ -18,6 +19,8 @@ __all__ = [ "ModalityData", "MultiModalDataBuiltins", "MultiModalDataDict", + "MultiModalHashDict", + "MultiModalHasher", "MultiModalKwargs", "MultiModalPlaceholderDict", "MultiModalPlaceholderMap", diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 7f4029e726..fd3ec7e0ec 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -49,9 +49,6 @@ class MultiModalPlugin(ABC): process the same data differently). This registry is in turn used by :class:`~MultiModalRegistry` which acts at a higher level (i.e., the modality of the data). - - See also: - :ref:`adding-multimodal-plugin` """ def __init__(self) -> None: @@ -93,10 +90,6 @@ class MultiModalPlugin(ABC): invoked to transform the data into a dictionary of model inputs. If `None` is provided, then the default input mapper is used instead. - - See also: - - :ref:`input-processing-pipeline` - - :ref:`enabling-multimodal-inputs` """ def wrapper(model_cls: N) -> N: @@ -129,10 +122,6 @@ class MultiModalPlugin(ABC): Raises: TypeError: If the data type is not supported. - - See also: - - :ref:`input-processing-pipeline` - - :ref:`enabling-multimodal-inputs` """ # Avoid circular import @@ -189,9 +178,6 @@ class MultiModalPlugin(ABC): for a model class. If `None` is provided, then the default calculation is used instead. - - See also: - :ref:`enabling-multimodal-inputs` """ def wrapper(model_cls: N) -> N: @@ -221,9 +207,6 @@ class MultiModalPlugin(ABC): If this registry is not applicable to the model, `0` is returned. The model is identified by ``model_config``. 
- - See also: - :ref:`enabling-multimodal-inputs` """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py new file mode 100644 index 0000000000..24aa1ca658 --- /dev/null +++ b/vllm/multimodal/hasher.py @@ -0,0 +1,100 @@ +import pickle +from typing import TYPE_CHECKING, Iterable, Mapping, Optional + +import numpy as np +import torch +from blake3 import blake3 +from PIL import Image + +from vllm.logger import init_logger + +if TYPE_CHECKING: + from vllm.inputs import TokensPrompt + +logger = init_logger(__name__) + +MultiModalHashDict = Mapping[str, list[str]] +""" +A dictionary containing hashes for items in each modality. +""" + + +class MultiModalHasher: + + @classmethod + def serialize_item(cls, obj: object) -> bytes: + # Simple cases + if isinstance(obj, str): + return obj.encode("utf-8") + if isinstance(obj, bytes): + return obj + if isinstance(obj, Image.Image): + return obj.tobytes() + + # Convertible to NumPy arrays + if isinstance(obj, torch.Tensor): + obj = obj.numpy() + if isinstance(obj, (int, float)): + obj = np.array(obj) + if isinstance(obj, np.ndarray): + return obj.tobytes() + + logger.warning( + "No serialization method found for %s. 
" + "Falling back to pickle.", type(obj)) + + return pickle.dumps(obj) + + @classmethod + def item_to_bytes( + cls, + key: str, + obj: object, + ) -> Iterable[tuple[bytes, bytes]]: + # Recursive cases + if isinstance(obj, (list, tuple)): + for i, elem in enumerate(obj): + yield from cls.item_to_bytes(f"{key}.{i}", elem) + elif isinstance(obj, dict): + for k, v in obj.items(): + yield from cls.item_to_bytes(f"{key}.{k}", v) + else: + key_bytes = cls.serialize_item(key) + value_bytes = cls.serialize_item(obj) + yield key_bytes, value_bytes + + @classmethod + def hash_kwargs(cls, **kwargs: object) -> str: + hasher = blake3() + + for k, v in kwargs.items(): + for k_bytes, v_bytes in cls.item_to_bytes(k, v): + hasher.update(k_bytes) + hasher.update(v_bytes) + + return hasher.hexdigest() + + @classmethod + def hash_prompt_mm_data( + cls, prompt: "TokensPrompt") -> Optional["MultiModalHashDict"]: + """Hash multimodal data in the user input prompt if they exist.""" + + if "multi_modal_data" not in prompt: + return None + + mm_data = prompt["multi_modal_data"] + if not mm_data: + # mm_data can be None or an empty dict. 
+ return None + + mm_items = { + modality: items if isinstance(items, list) else [items] + for modality, items in mm_data.items() + } + + mm_hashes = { + modality: [cls.hash_kwargs(**{modality: item}) for item in items] + for modality, items in mm_items.items() + } + + return mm_hashes diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index b0a1104546..b35184f685 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -2,8 +2,8 @@ from abc import ABC, abstractmethod from collections import UserDict, defaultdict from collections.abc import Mapping, Sequence from dataclasses import dataclass -from typing import (Any, Literal, Optional, TypedDict, TypeVar, Union, cast, - final) +from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar, + Union, cast, final) import numpy as np import torch @@ -14,6 +14,9 @@ from typing_extensions import NotRequired, TypeAlias from vllm.utils import JSONTree, full_groupby, is_list_of, json_map_leaves +if TYPE_CHECKING: + from .hasher import MultiModalHashDict + _T = TypeVar("_T") HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor] @@ -97,11 +100,7 @@ MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]] """ A dictionary containing an entry for each modality type to input. -Note: - This dictionary also accepts modality keys defined outside - :class:`MultiModalDataBuiltins` as long as a customized plugin - is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - Read more on that :ref:`here `. +The built-in modalities are defined by :class:`MultiModalDataBuiltins`. """ @@ -488,13 +487,14 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] """ -A dictionary containing placeholder ranges. +A dictionary containing placeholder ranges for each modality. 
""" -class MultiModalInputsV2(TypedDict): +class MultiModalInputs(TypedDict): """ - Represents the outputs of :class:`vllm.multimodal.MultiModalProcessor`, + Represents the outputs of + :class:`vllm.multimodal.processing.BaseMultiModalProcessor`, ready to be passed to vLLM internals. """ @@ -513,7 +513,7 @@ class MultiModalInputsV2(TypedDict): mm_kwargs: MultiModalKwargs """Keyword arguments to be directly passed to the model after batching.""" - mm_hashes: NotRequired[list[str]] + mm_hashes: NotRequired[Optional["MultiModalHashDict"]] """The hashes of the multi-modal data.""" mm_placeholders: MultiModalPlaceholderDict diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 00acb77435..ccff0e857e 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -13,14 +13,16 @@ from vllm.utils import is_list_of from .audio import resample_audio from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem, - ImageItem, ModalityData, MultiModalDataDict, - NestedTensors, VideoItem) + ImageItem, ModalityData, MultiModalDataDict, VideoItem) _T = TypeVar("_T") _I = TypeVar("_I") class ModalityDataItems(ABC, Generic[_T, _I]): + """ + Represents data items for a modality in :class:`MultiModalDataItems`. + """ def __init__(self, data: _T, modality: str) -> None: super().__init__() @@ -69,6 +71,7 @@ class ModalityDataItems(ABC, Generic[_T, _I]): class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]): + """Base class for data items that are arranged in a list.""" def get_count(self) -> int: return len(self.data) @@ -83,7 +86,12 @@ class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]): return {} -class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]): +class EmbeddingItems(ModalityDataItems[Union[torch.Tensor, list[torch.Tensor]], + torch.Tensor]): + """ + Base class for data items that are expressed as a batched embedding tensor, + or a list of embedding tensors (one per item). 
+ """ def get_count(self) -> int: return len(self.data) @@ -109,7 +117,7 @@ class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]): class AudioEmbeddingItems(EmbeddingItems): - def __init__(self, data: NestedTensors) -> None: + def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None: super().__init__(data, "audio") @@ -137,7 +145,7 @@ class ImageProcessorItems(ProcessorBatchItems[HfImageItem]): class ImageEmbeddingItems(EmbeddingItems): - def __init__(self, data: NestedTensors) -> None: + def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None: super().__init__(data, "image") @@ -146,10 +154,24 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]): def __init__(self, data: Sequence[HfVideoItem]) -> None: super().__init__(data, "video") + def get_num_frames(self, item_idx: int) -> int: + return len(self.get(item_idx)) + + def get_frame_size(self, item_idx: int) -> ImageSize: + image = self.get(item_idx)[0] # Assume that the video isn't empty + + if isinstance(image, Image): + return ImageSize(*image.size) + if isinstance(image, (np.ndarray, torch.Tensor)): + _, h, w = image.shape + return ImageSize(w, h) + + assert_never(image) + class VideoEmbeddingItems(EmbeddingItems): - def __init__(self, data: NestedTensors) -> None: + def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None: super().__init__(data, "video") @@ -158,8 +180,8 @@ _D = TypeVar("_D", bound=ModalityDataItems[Any, Any]) class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]): """ - As :class:`MultiModalDataDict`, but normalized such that each entry - corresponds to a list. + As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized + such that each entry corresponds to a list. 
""" def get_count(self, modality: str, *, strict: bool = True) -> int: @@ -212,7 +234,8 @@ ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]], class MultiModalDataParser: """ - Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`. + Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into + :class:`MultiModalDataItems`. Args: target_sr (float, optional): Enables automatic resampling of audio @@ -224,7 +247,9 @@ class MultiModalDataParser: self.target_sr = target_sr - def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]: + def _is_embeddings( + self, data: object + ) -> TypeGuard[Union[torch.Tensor, list[torch.Tensor]]]: if isinstance(data, torch.Tensor): return data.ndim == 3 if is_list_of(data, torch.Tensor): diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index eb7552176e..750646ac6e 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,55 +1,139 @@ -import pickle import re from abc import ABC, abstractmethod from collections import defaultdict -from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence +from collections.abc import (Callable, Generator, ItemsView, Iterable, Mapping, + Sequence) from dataclasses import dataclass, field from functools import lru_cache -from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union +from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol, + TypeVar, Union) -import numpy as np -import numpy.typing as npt -import torch -from blake3 import blake3 -from PIL import Image -from transformers import BatchFeature, ProcessorMixin +from transformers import BatchFeature, PretrainedConfig, ProcessorMixin -from vllm.inputs import DummyData, InputProcessingContext +import vllm.envs as envs +from vllm.inputs import InputProcessingContext from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer, encode_tokens +from vllm.transformers_utils.tokenizer 
import (AnyTokenizer, decode_tokens, + encode_tokens) from vllm.utils import LRUCache, flatten_2d_lists, full_groupby +from .hasher import MultiModalHasher from .inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - MultiModalKwargsItem, PlaceholderRange) + MultiModalInputs, MultiModalKwargs, MultiModalKwargsItem, + PlaceholderRange) from .parse import MultiModalDataItems, MultiModalDataParser +if TYPE_CHECKING: + from .profiling import BaseDummyInputsBuilder + logger = init_logger(__name__) _S = TypeVar("_S", str, list[int]) -_PromptSeq = Union[str, list[int]] + +PromptSeq = Union[str, list[int]] +"""A token sequence (list of token IDs) or text.""" + + +@dataclass +class PromptReplacementDetails: + """Details about the replacement token sequence or text.""" + + full: PromptSeq + """The full replacement.""" + + features: PromptSeq + """ + The part of the replacement that corresponds to feature placeholders; + this will be replaced by the output of the vision encoder during model + inference. + """ + + @staticmethod + def from_seq(seq: PromptSeq) -> "PromptReplacementDetails": + return PromptReplacementDetails(full=seq, features=seq) + + +PromptRepl = Union[PromptSeq, PromptReplacementDetails] +""" +The replacement token sequence or text. + +If only part of the replacement corresponds to feature placeholders, you can +use :class:`PromptReplacementDetails` to specify which part. +""" @dataclass class PromptReplacement: + """ + Defines how to replace portions of an input prompt with placeholder tokens. + + Example: + + For each image, replace one ``<image>`` input placeholder in the prompt + with a number of ``<image>`` feature placeholders + equal to the feature size of the vision encoder: + + ..
code-block:: python + + PromptReplacement( + modality="image", + target="<image>", + replacement="<image>" * image_feature_size, + ) + + As above, but further pad the feature placeholders with ``<image_bos>`` + and ``<image_eos>``, which are not supposed to be passed to the vision + encoder: + + .. code-block:: python + + PromptReplacement( + modality="image", + target="<image>", + replacement=PromptReplacementDetails( + full="".join([ + "<image_bos>", + "<image>" * image_feature_size, + "<image_eos>", + ]), + features="<image>" * image_feature_size, + ), + ) + + To avoid unnecessary tokenization during prompt replacement, + we recommend passing token sequences instead of text: + + .. code-block:: python + + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=PromptReplacementDetails( + full=([image_bos_id] + [image_token_id] * image_feature_size + + [image_eos_id]), + features=[image_token_id] * image_feature_size, + ), + ) + """ + modality: str """The modality for which the replacement is made.""" - target: _PromptSeq - """The text or token sequence to find and replace.""" + target: PromptSeq + """The token sequence (or text) to find and replace.""" - replacement: Union[Callable[[int], _PromptSeq], - _PromptSeq] = field(repr=False) + replacement: Union[Callable[[int], PromptRepl], + PromptRepl] = field(repr=False) """ - Given the index of the processed item within :attr:`modality`, output the - replacement text or token sequence. + Given the index of the processed item within :attr:`modality`, + output the replacement token sequence (or text). - For convenience, you can pass in the replacement instead of a function - if it does not depend on the input. + For convenience, you can directly pass in the replacement token sequence + (or text) instead of a function if it does not depend on the input.
""" - def bind(self, tokenizer: AnyTokenizer) -> "_BoundPromptReplacement": - return _BoundPromptReplacement( + def bind(self, tokenizer: AnyTokenizer) -> "BoundPromptReplacement": + return BoundPromptReplacement( tokenizer=tokenizer, modality=self.modality, _target=self.target, @@ -69,19 +153,6 @@ def _cached_encode( add_special_tokens=add_special_tokens) -def _decode( - tokenizer: AnyTokenizer, - token_ids: list[int], - *, - skip_special_tokens: bool = False, -) -> str: - """ - Backend-agnostic equivalent of HF's - :code:`tokenizer.decode(token_ids, skip_special_tokens=...)`. - """ - return tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) - - @lru_cache(maxsize=2048) def _cached_decode( tokenizer: AnyTokenizer, @@ -89,9 +160,9 @@ def _cached_decode( *, skip_special_tokens: bool = False, ) -> str: - return _decode(tokenizer, - list(token_ids), - skip_special_tokens=skip_special_tokens) + return decode_tokens(tokenizer, + list(token_ids), + skip_special_tokens=skip_special_tokens) class _HasModalityAttr(Protocol): @@ -115,11 +186,26 @@ def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]: @dataclass class _BoundPromptSequence: + """ + A :data:`_PromptSeq` bound to a tokenizer to automatically + convert between token sequence and text representations. 
+ """ tokenizer: AnyTokenizer = field(repr=False) _text: Optional[str] _token_ids: Optional[list[int]] + @staticmethod + def from_seq( + tokenizer: AnyTokenizer, + seq: PromptSeq, + ) -> "_BoundPromptSequence": + return _BoundPromptSequence( + tokenizer=tokenizer, + _text=seq if isinstance(seq, str) else None, + _token_ids=seq if isinstance(seq, list) else None, + ) + def __post_init__(self) -> None: if self._text is None and self._token_ids is None: raise ValueError("At least one of 'text' and 'token_ids' must be " @@ -143,28 +229,38 @@ class _BoundPromptSequence: @dataclass -class _BoundPromptReplacement: +class _BoundPromptReplacementGroup: + full: _BoundPromptSequence + features: _BoundPromptSequence + + +@dataclass +class BoundPromptReplacement: + """ + A :class:`PromptReplacement` bound to a tokenizer to automatically + convert :attr:`target` and the result of :meth:`get_replacement` between + token sequence and text representations. + """ tokenizer: AnyTokenizer = field(repr=False) modality: str - _target: _PromptSeq - _replacement: Union[Callable[[int], _PromptSeq], - _PromptSeq] = field(repr=False) + _target: PromptSeq + _replacement: Union[Callable[[int], PromptRepl], + PromptRepl] = field(repr=False) def __post_init__(self) -> None: - self._replacement_cache = dict[int, _BoundPromptSequence]() + self._replacement_cache = dict[int, _BoundPromptReplacementGroup]() @property def target(self) -> _BoundPromptSequence: - target = self._target + """The token sequence (or text) to find and replace.""" + return _BoundPromptSequence.from_seq(self.tokenizer, self._target) - return _BoundPromptSequence( - tokenizer=self.tokenizer, - _text=target if isinstance(target, str) else None, - _token_ids=target if isinstance(target, list) else None, - ) - - def get_replacement(self, item_idx: int) -> _BoundPromptSequence: + def get_replacement(self, item_idx: int) -> _BoundPromptReplacementGroup: + """ + Given the index of the processed item within :attr:`modality`, + output 
the replacement token sequence (or text). + """ replacement = self._replacement if callable(replacement): cache_key = item_idx @@ -175,10 +271,16 @@ class _BoundPromptReplacement: else: cache_key = None - bound_replacement = _BoundPromptSequence( - tokenizer=self.tokenizer, - _text=replacement if isinstance(replacement, str) else None, - _token_ids=replacement if isinstance(replacement, list) else None, + if not isinstance(replacement, PromptReplacementDetails): + replacement = PromptReplacementDetails.from_seq(replacement) + + bound_full = _BoundPromptSequence.from_seq(self.tokenizer, + replacement.full) + bound_features = _BoundPromptSequence.from_seq(self.tokenizer, + replacement.features) + bound_replacement = _BoundPromptReplacementGroup( + full=bound_full, + features=bound_features, ) if cache_key is not None: @@ -195,7 +297,7 @@ class _TokenMatch(NamedTuple): def iter_token_matches( token_ids: list[int], match_ids: list[int], -) -> Iterable[_TokenMatch]: +) -> Generator[_TokenMatch]: """ Yield each occurrence of :code:`match_ids` in :code:`token_ids`. 
@@ -222,7 +324,7 @@ def iter_token_matches( @dataclass(repr=False) class _PromptReplacementMatch(ABC): - prompt_repl: _BoundPromptReplacement + prompt_repl: BoundPromptReplacement @property def modality(self) -> str: @@ -269,14 +371,16 @@ class _PromptReplacementTextMatch(_PromptReplacementMatch): return self.match.end() -class _PlaceholderInfo(NamedTuple): +@dataclass +class PlaceholderFeaturesInfo: modality: str + item_idx: int start_idx: int - replacement: list[int] + tokens: list[int] @property def length(self) -> int: - return len(self.replacement) + return len(self.tokens) def to_range(self) -> PlaceholderRange: return PlaceholderRange( @@ -287,7 +391,7 @@ class _PlaceholderInfo(NamedTuple): def find_token_matches( prompt: list[int], - prompt_repls: Sequence[_BoundPromptReplacement], + prompt_repls: Sequence[BoundPromptReplacement], ) -> list[_PromptReplacementTokenMatch]: """Return each target of :code:`prompt_repls` found in :code:`prompt`.""" return [ @@ -299,7 +403,7 @@ def find_token_matches( def find_text_matches( prompt: str, - prompt_repls: Sequence[_BoundPromptReplacement], + prompt_repls: Sequence[BoundPromptReplacement], ) -> list[_PromptReplacementTextMatch]: """Return each target of :code:`prompt_repls` found in :code:`prompt`.""" return [ @@ -310,13 +414,15 @@ def find_text_matches( def _resolve_matches( - prompt: _PromptSeq, - matches: Sequence[_PromptReplacementMatch], + prompt: PromptSeq, + mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]], ) -> list[_PromptReplacementMatch]: """ - Resolve :code:`matches` to ensure that there are no overlapping matches, + Resolve :code:`mm_matches` to ensure that there are no overlapping matches, and sort them such that earlier matches take priority over later ones. 
""" + matches = [m for matches in mm_matches.values() for m in matches] + seen_matches: list[Optional[_PromptReplacementMatch]] = [None ] * len(prompt) @@ -334,14 +440,15 @@ def _resolve_matches( def _replace_matches( prompt: _S, - matches: Sequence[_PromptReplacementMatch], + mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]], mm_item_counts: Mapping[str, int], ) -> list[_S]: + """Apply the replacements in :code:`mm_matches` to :code:`prompt`.""" out_seqs = list[_S]() prev_end_idx = 0 next_idx_by_modality = defaultdict[str, int](lambda: 0) - for match in _resolve_matches(prompt, matches): + for match in _resolve_matches(prompt, mm_matches): modality = match.modality item_idx = next_idx_by_modality[modality] @@ -355,10 +462,10 @@ def _replace_matches( replacement = repl_info.get_replacement(item_idx) if isinstance(prompt, str): - repl_seq = replacement.text + repl_seq = replacement.full.text out_seqs.append(prompt[prev_end_idx:start_idx] + repl_seq) else: - repl_seq = replacement.token_ids + repl_seq = replacement.full.token_ids out_seqs.append(prompt[prev_end_idx:start_idx] + repl_seq) prev_end_idx = end_idx @@ -371,105 +478,105 @@ def _replace_matches( def replace_token_matches( prompt: list[int], - matches: Sequence[_PromptReplacementTokenMatch], + mm_matches: Mapping[str, Sequence[_PromptReplacementTokenMatch]], mm_item_counts: Mapping[str, int], ) -> list[int]: - """Apply :code:`prompt_repls` to :code:`prompt`.""" - if not matches: + """Apply the replacements in :code:`mm_matches` to :code:`prompt`.""" + if not mm_matches: return prompt - token_id_seqs = _replace_matches(prompt, matches, mm_item_counts) + token_id_seqs = _replace_matches(prompt, mm_matches, mm_item_counts) return flatten_2d_lists(token_id_seqs) def replace_text_matches( prompt: str, - matches: Sequence[_PromptReplacementTextMatch], + mm_matches: Mapping[str, Sequence[_PromptReplacementTextMatch]], mm_item_counts: Mapping[str, int], ) -> str: - """Apply :code:`prompt_repls` to 
:code:`prompt`.""" - if not matches: + """Apply the replacements in :code:`mm_matches` to :code:`prompt`.""" + if not mm_matches: return prompt - texts = _replace_matches(prompt, matches, mm_item_counts) + texts = _replace_matches(prompt, mm_matches, mm_item_counts) return "".join(texts) -def _iter_modality_placeholders( +def _iter_placeholders( + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], prompt: list[int], - modality: str, - modality_repls: Sequence[_BoundPromptReplacement], - modal_item_count: int, -) -> Iterable[_PlaceholderInfo]: - if modal_item_count == 0: - return + mm_item_counts: Mapping[str, int], +) -> Iterable[PlaceholderFeaturesInfo]: + """ + Yield each set of placeholder tokens found in :code:`prompt`. + Matches are exclusive even when multiple modalities share + the same placeholder tokens. In that case, the modality that + appears earlier in `mm_prompt_repls` takes priority. + + Note that empty matches are ignored. + """ prompt_len = len(prompt) - item_index = 0 + item_idx_by_modality = defaultdict[str, int](lambda: 0) start_idx = 0 while start_idx < prompt_len: found = False - for repl_info in modality_repls: - replacement = repl_info.get_replacement(item_index) - repl_tokens = replacement.token_ids - repl_len = len(repl_tokens) - end_idx = start_idx + repl_len - - if repl_len == 0 or end_idx > prompt_len: + for modality, modality_repls in mm_prompt_repls.items(): + item_idx = item_idx_by_modality[modality] + if item_idx >= mm_item_counts.get(modality, 0): continue - if prompt[start_idx:end_idx] == repl_tokens: - yield _PlaceholderInfo( - modality=modality, - start_idx=start_idx, - replacement=repl_tokens, - ) + for repl_info in modality_repls: + replacement = repl_info.get_replacement(item_idx) + repl_tokens_full = replacement.full.token_ids + repl_len_full = len(repl_tokens_full) + end_idx_full = start_idx + repl_len_full - item_index += 1 - if item_index >= modal_item_count: - return + if repl_len_full == 0 or end_idx_full 
> prompt_len: + continue - # Exclude overlapping matches - start_idx = end_idx - found = True - break + if prompt[start_idx:end_idx_full] == repl_tokens_full: + repl_tokens_feat = replacement.features.token_ids + + try: + match = next( + iter_token_matches(repl_tokens_full, + repl_tokens_feat)) + yield PlaceholderFeaturesInfo( + modality=modality, + item_idx=item_idx, + start_idx=start_idx + match.start_idx, + tokens=repl_tokens_feat, + ) + except StopIteration: + raise AssertionError( + f"{repl_tokens_feat=} should be a " + f"subsequence of {repl_tokens_full=}") from None + + # Exclude overlapping matches + start_idx = end_idx_full + item_idx_by_modality[modality] += 1 + found = True + break + + if found: + break # Go back to the outer while loop if not found: start_idx += 1 -def iter_placeholders( - prompt_repls: Sequence[_BoundPromptReplacement], +def find_mm_placeholders( + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], prompt: list[int], mm_item_counts: Mapping[str, int], -) -> Iterable[_PlaceholderInfo]: - """ - Yield each set of placeholder tokens found in :code:`prompt`. - - Note that empty matches are ignored. 
- """ - repls_by_modality = dict(full_groupby_modality(prompt_repls)) - - for modality, modal_item_count in mm_item_counts.items(): - if modality in repls_by_modality: - yield from _iter_modality_placeholders( - prompt, - modality, - repls_by_modality[modality], - modal_item_count, - ) - - -@dataclass -class ProcessorInputs: - """Keyword arguments to :meth:`BaseMultiModalProcessor`.""" - prompt_text: str - mm_data: MultiModalDataDict - hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) +) -> Mapping[str, list[PlaceholderFeaturesInfo]]: + it = _iter_placeholders(mm_prompt_repls, prompt, mm_item_counts) + return dict(full_groupby_modality(it)) class ProcessingCache: @@ -492,56 +599,6 @@ class ProcessingCache: logger.debug("ProcessingCache: hit_ratio = %.2f", cache_stats.hit_ratio) - def _serialize_item(self, obj: object) -> bytes: - # Simple cases - if isinstance(obj, str): - return obj.encode("utf-8") - if isinstance(obj, bytes): - return obj - if isinstance(obj, Image.Image): - return obj.tobytes() - - # Convertible to NumPy arrays - if isinstance(obj, torch.Tensor): - obj = obj.numpy() - if isinstance(obj, (int, float)): - obj = np.array(obj) - if isinstance(obj, np.ndarray): - return obj.tobytes() - - logger.warning( - "No serialization method found for %s. 
" - "Falling back to pickle.", type(obj)) - - return pickle.dumps(obj) - - def _item_to_bytes( - self, - key: str, - obj: object, - ) -> Iterable[tuple[bytes, bytes]]: - # Recursive cases - if isinstance(obj, (list, tuple)): - for i, elem in enumerate(obj): - yield from self._item_to_bytes(f"{key}.{i}", elem) - elif isinstance(obj, dict): - for k, v in obj.items(): - yield from self._item_to_bytes(f"{key}.{k}", v) - else: - key_bytes = self._serialize_item(key) - value_bytes = self._serialize_item(obj) - yield key_bytes, value_bytes - - def _hash_kwargs(self, **kwargs: object) -> str: - hasher = blake3() - - for k, v in kwargs.items(): - for k_bytes, v_bytes in self._item_to_bytes(k, v): - hasher.update(k_bytes) - hasher.update(v_bytes) - - return hasher.hexdigest() - def get( self, model_id: str, @@ -560,9 +617,9 @@ class ProcessingCache: """ self._maybe_log_cache_stats() - cache_key = self._hash_kwargs(model_id=model_id, - **{modality: input_item}, - **input_kwargs) + cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, + **{modality: input_item}, + **input_kwargs) return self._cache.get(cache_key) def put( @@ -577,35 +634,36 @@ class ProcessingCache: Put a processed multi-modal item into the cache according to its dependencies (see :meth:`get`). """ - cache_key = self._hash_kwargs(model_id=model_id, - **{modality: input_item}, - **input_kwargs) + cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, + **{modality: input_item}, + **input_kwargs) self._cache.put(cache_key, output_kwargs) -class BaseMultiModalProcessor(ABC): - """ - Abstract base class to process multi-modal inputs to be used in vLLM. 
- """ +class BaseProcessingInfo: + """Base class to provide the information necessary for data processing.""" - def __init__(self, - ctx: InputProcessingContext, - *, - cache: Optional[ProcessingCache] = None, - enable_sanity_checks: bool = True) -> None: + def __init__(self, ctx: InputProcessingContext) -> None: super().__init__() self.ctx = ctx - self.cache = cache - self.enable_sanity_checks = enable_sanity_checks - def __call__( - self, - prompt: str, - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - return self.apply(prompt, mm_data, hf_processor_mm_kwargs) + @property + def model_id(self) -> str: + return self.ctx.model_config.model + + def get_tokenizer(self) -> AnyTokenizer: + return self.ctx.tokenizer + + def get_hf_config(self) -> PretrainedConfig: + return self.ctx.get_hf_config() + + def get_hf_processor(self, **kwargs: object) -> ProcessorMixin: + """ + Subclasses can override this method to handle + specific kwargs from model config or user inputs. + """ + return self.ctx.get_hf_processor(**kwargs) @abstractmethod def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: @@ -620,7 +678,7 @@ class BaseMultiModalProcessor(ABC): raise NotImplementedError @abstractmethod - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: """ Get the maximum possible number of tokens per data item for each modality. @@ -630,9 +688,43 @@ class BaseMultiModalProcessor(ABC): """ raise NotImplementedError + +_I = TypeVar("_I", bound=BaseProcessingInfo) + + +class BaseMultiModalProcessor(ABC, Generic[_I]): + """ + Abstract base class to process multi-modal inputs to be used in vLLM. + + Not to be confused with :class:`transformers.ProcessorMixin`. 
+ """ + + def __init__(self, + info: _I, + dummy_inputs: "BaseDummyInputsBuilder[_I]", + *, + cache: Optional[ProcessingCache] = None, + enable_sanity_checks: bool = True) -> None: + super().__init__() + + self.info = info + self.dummy_inputs = dummy_inputs + self.cache = cache + self.enable_sanity_checks = enable_sanity_checks + + self.data_parser = self._get_data_parser() + + def __call__( + self, + prompt: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputs: + return self.apply(prompt, mm_data, hf_processor_mm_kwargs) + def _get_data_parser(self) -> MultiModalDataParser: """ - Construct a data parser to preprocess multi-modal data items + Construct a parser to preprocess multi-modal data items before passing them to :meth:`_get_hf_mm_data`. You can support additional modalities by creating a subclass @@ -640,16 +732,6 @@ class BaseMultiModalProcessor(ABC): """ return MultiModalDataParser() - def _get_hf_processor(self) -> ProcessorMixin: - """ - Subclasses can add keyword arguments to this method to accept - additional kwargs from model config or user inputs. - """ - return self.ctx.get_hf_processor() - - def _get_tokenizer(self) -> AnyTokenizer: - return self.ctx.tokenizer - def _to_mm_items( self, mm_data: MultiModalDataDict, @@ -658,10 +740,9 @@ class BaseMultiModalProcessor(ABC): Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems` before passing them to :meth:`_get_hf_mm_data`. 
""" - parser = self._get_data_parser() - mm_items = parser.parse_mm_data(mm_data) + mm_items = self.data_parser.parse_mm_data(mm_data) - mm_limits = self.ctx.get_mm_config().limit_per_prompt + mm_limits = self.info.ctx.get_mm_config().limit_per_prompt for modality, items in mm_items.items(): limit = mm_limits.get(modality, 1) if len(items) > limit: @@ -703,21 +784,21 @@ class BaseMultiModalProcessor(ABC): """ raise NotImplementedError - def _find_placeholders( + def _find_mm_placeholders( self, - all_prompt_repls: Sequence[_BoundPromptReplacement], + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], new_token_ids: list[int], mm_item_counts: Mapping[str, int], - ) -> list[_PlaceholderInfo]: - return list( - iter_placeholders(all_prompt_repls, new_token_ids, mm_item_counts)) + ) -> Mapping[str, list[PlaceholderFeaturesInfo]]: + return find_mm_placeholders(mm_prompt_repls, new_token_ids, + mm_item_counts) def _get_hf_mm_data( self, mm_items: MultiModalDataItems, - ) -> tuple[dict[str, Any], dict[str, Any]]: - processor_data = dict[str, Any]() - passthrough_data = dict[str, Any]() + ) -> tuple[Mapping[str, object], Mapping[str, object]]: + processor_data = dict[str, object]() + passthrough_data = dict[str, object]() for items in mm_items.values(): processor_data.update(items.get_processor_data()) @@ -737,21 +818,21 @@ class BaseMultiModalProcessor(ABC): Call the HF processor on the prompt text and associated multi-modal data. """ - return self.ctx.call_hf_processor( - self._get_hf_processor(**mm_kwargs), + return self.info.ctx.call_hf_processor( + self.info.get_hf_processor(**mm_kwargs), dict(text=prompt, **mm_data), mm_kwargs, ) - def _apply_hf_processor( + def _apply_hf_processor_text_mm( self, prompt_text: str, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], ) -> tuple[list[int], MultiModalKwargs]: """ - Wrapper of :meth:`_call_hf_processor` that applies - additional pre-processing and post-processing. 
+ Apply the HF processor on the prompt text and multi-modal data + together. """ processor_data, passthrough_data = self._get_hf_mm_data(mm_items) @@ -771,37 +852,93 @@ class BaseMultiModalProcessor(ABC): return prompt_ids, mm_kwargs - def _apply_hf_processor_missing( - self, - prompt_text: str, - mm_missing_data_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - ): + def _apply_hf_processor_text_only(self, prompt_text: str) -> list[int]: """ - Apply the HF processor on the full prompt text, but only on the - multi-modal data that are missing from the cache. + Apply the HF processor on the prompt text only. - Note: - We pass prompt text and multi-modal data into the HF processor - in separate calls to avoid HF prompt replacement being done for - cached items; instead, we rely on our own prompt replacement logic - (:meth:`_get_prompt_replacements`) for the full text. + Since HF processor requires that text and multi-modal items + correspond to each other, we create dummy multi-modal items + to go along with the text. """ - mm_missing_counts = mm_missing_data_items.get_all_counts() - - prompt_ids, _ = self._apply_hf_processor( + prompt_ids, _ = self._apply_hf_processor_text_mm( prompt_text=prompt_text, mm_items=MultiModalDataItems({}), hf_processor_mm_kwargs={}, ) - # Some HF processors (e.g. Qwen2-VL) expect corresponding - # multi-modal tokens to be in the prompt text - dummy_inputs = self._get_dummy_mm_inputs(mm_missing_counts) + return prompt_ids - _, mm_missing_kwargs = self._apply_hf_processor( + def _apply_hf_processor_tokens_only( + self, + prompt_tokens: list[int], + ) -> list[int]: + """ + Apply the HF processor on the prompt tokens only. + + Most HF processors accept prompt text but not prompt tokens. + If the HF processor adds or removes tokens that are not related to + multi-modal data, you should override this method so it is consistent + with the output of :meth:`_apply_hf_processor_text_only` on the + corresponding text. 
+ """ + return prompt_tokens + + def _apply_hf_processor_mm_only( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalKwargs: + """ + Apply the HF processor on the multi-modal data only. + + Since HF processor requires that text and multi-modal items + correspond to each other, we generate dummy text using + :class:`DummyInputsBuilder` to go along with the multi-modal data. + """ + mm_counts = mm_items.get_all_counts() + + dummy_inputs = self.dummy_inputs.get_dummy_processor_inputs( + self.info.ctx.model_config.max_model_len, + mm_counts, + ) + + _, mm_kwargs = self._apply_hf_processor_text_mm( prompt_text=dummy_inputs.prompt_text, - mm_items=mm_missing_data_items, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + return mm_kwargs + + def _apply_hf_processor_main( + self, + prompt: Union[str, list[int]], + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + *, + enable_hf_prompt_replacement: bool, + ) -> tuple[list[int], MultiModalKwargs]: + """ + Apply the HF processor on the prompt text and multi-modal data. + + Note: + If :code:`enable_hf_prompt_replacement=False`, the prompt should + correspond to the multi-modal items. 
+ """ + if isinstance(prompt, str): + if enable_hf_prompt_replacement: + return self._apply_hf_processor_text_mm( + prompt_text=prompt, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + prompt_ids = self._apply_hf_processor_text_only(prompt) + else: + prompt_ids = self._apply_hf_processor_tokens_only(prompt) + + mm_missing_kwargs = self._apply_hf_processor_mm_only( + mm_items=mm_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, ) @@ -809,7 +946,7 @@ class BaseMultiModalProcessor(ABC): def _cached_apply_hf_processor( self, - prompt_text: str, + prompt: Union[str, list[int]], mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], ) -> tuple[list[int], MultiModalKwargs]: @@ -818,14 +955,15 @@ class BaseMultiModalProcessor(ABC): caching the results and reusing cached results. """ cache = self.cache - model_id = self.ctx.model_config.model + model_id = self.info.model_id _, passthrough_data = self._get_hf_mm_data(mm_data_items) if cache is None or passthrough_data: - return self._apply_hf_processor( - prompt_text=prompt_text, + return self._apply_hf_processor_main( + prompt=prompt, mm_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, + enable_hf_prompt_replacement=True, ) mm_maybe_cached_kw_items = { @@ -847,10 +985,13 @@ class BaseMultiModalProcessor(ABC): } mm_missing_data_items = self._to_mm_items(mm_missing_data) - prompt_ids, mm_missing_kwargs = self._apply_hf_processor_missing( - prompt_text=prompt_text, - mm_missing_data_items=mm_missing_data_items, + # NOTE: `prompt` does not correspond to `mm_missing_data_items`, + # so we need to pass `enable_hf_prompt_replacement=False` + prompt_ids, mm_missing_kwargs = self._apply_hf_processor_main( + prompt=prompt, + mm_items=mm_missing_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, + enable_hf_prompt_replacement=False, ) mm_missing_next_idx = { @@ -889,50 +1030,44 @@ class BaseMultiModalProcessor(ABC): mm_kwargs = 
MultiModalKwargs.from_items(merged_kw_items) - if self.enable_sanity_checks: - mm_item_counts = mm_data_items.get_all_counts() - - for modality, item_count in mm_item_counts.items(): - for item_idx in range(item_count): - try: - mm_kwargs.get_item(modality, item_idx) - except Exception as e: - # Make it easy to set a breakpoint in the debugger - raise e - return prompt_ids, mm_kwargs - def _bind_prompt_replacements( + def _bind_and_group_repls( self, prompt_repls: list[PromptReplacement], - ) -> list[_BoundPromptReplacement]: - tokenizer = self._get_tokenizer() + ) -> dict[str, list[BoundPromptReplacement]]: + tokenizer = self.info.get_tokenizer() - return [prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls] + it = (prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls) + return dict(full_groupby_modality(it)) def _always_apply_prompt_replacements(self) -> bool: """ A flag which can be overridden so that :meth:`_apply_prompt_replacements` is always called even if we - detect that HF has performed processing via :meth:`_find_placeholders`. + detect that HF has performed processing via + :meth:`_find_placeholders_by_modality`. - This is useful in cases where :meth:`_find_placeholders` cannot be - reliably used to detect whether HF has performed processing or not. + This is useful in cases where :meth:`_find_placeholders_by_modality` + cannot be reliably used to detect whether HF has performed processing. 
""" return False def _apply_prompt_replacements( self, token_ids: list[int], - prompt_repls: Sequence[_BoundPromptReplacement], + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, list[_PlaceholderInfo]]: - tokenizer = self._get_tokenizer() + ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: + tokenizer = self.info.get_tokenizer() - token_matches = find_token_matches(token_ids, prompt_repls) + mm_token_matches = { + modality: find_token_matches(token_ids, prompt_repls) + for modality, prompt_repls in mm_prompt_repls.items() + } mm_match_counts = { modality: len(matches) - for modality, matches in full_groupby_modality(token_matches) + for modality, matches in mm_token_matches.items() } # If the search text does not represent a special token, @@ -951,38 +1086,98 @@ class BaseMultiModalProcessor(ABC): ): # yapf: disable token_ids = replace_token_matches( token_ids, - token_matches, + mm_token_matches, mm_item_counts, ) - text = _decode(tokenizer, token_ids) - matched_repls = [match.prompt_repl for match in token_matches] + text = decode_tokens(tokenizer, token_ids) + matched_repls = { + modality: [match.prompt_repl for match in token_matches] + for modality, token_matches in mm_token_matches.items() + } else: - text = _decode(tokenizer, token_ids) + text = decode_tokens(tokenizer, token_ids) - text_matches = find_text_matches(text, prompt_repls) + mm_text_matches = { + modality: find_text_matches(text, prompt_repls) + for modality, prompt_repls in mm_prompt_repls.items() + } text = replace_text_matches( text, - text_matches, + mm_text_matches, mm_item_counts, ) token_ids = encode_tokens(tokenizer, text, add_special_tokens=False) - matched_repls = [match.prompt_repl for match in text_matches] + matched_repls = { + modality: [match.prompt_repl for match in token_matches] + for modality, token_matches in mm_text_matches.items() + } - placeholders = 
self._find_placeholders(matched_repls, token_ids, - mm_item_counts) + placeholders = self._find_mm_placeholders( + matched_repls, + token_ids, + mm_item_counts, + ) return token_ids, text, placeholders + def _validate_mm_kwargs( + self, + mm_kwargs: MultiModalKwargs, + mm_item_counts: Mapping[str, int], + ) -> None: + for modality, item_count in mm_item_counts.items(): + if modality in mm_kwargs.modalities: + items = mm_kwargs.get_items(modality) + else: + items = [] + + if len(items) != item_count: + raise RuntimeError( + f"Expected there to be {item_count} {modality} items in " + f"keyword arguments corresponding to {item_count} " + f"{modality} data items, but only found {len(items)}! " + "There is likely a problem with your " + "implementation of merged multi-modal processor for this " + "model (usually arising from an inconsistency between " + "`_call_hf_processor` and `_get_mm_fields_config`).") + + def _validate_mm_placeholders( + self, + mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]], + mm_item_counts: Mapping[str, int], + *, + allow_missing: bool = False, + ) -> Mapping[str, int]: + missing_repl_counts = dict[str, int]() + + for modality, item_count in mm_item_counts.items(): + placeholders = mm_placeholders.get(modality, []) + + if len(placeholders) != item_count and not allow_missing: + raise RuntimeError( + f"Expected there to be {item_count} prompt replacements " + f"corresponding to {item_count} {modality} items, but only " + f"found {len(placeholders)} prompt replacements! 
Either " + "the prompt text has missing/incorrect tokens for " + "multi-modal inputs, or there is a problem with your " + "implementation of merged multi-modal processor for this " + "model (usually arising from an inconsistency between " + "`_call_hf_processor` and `_get_prompt_replacements`).") + + missing_repl_counts[modality] = item_count - len(placeholders) + + return missing_repl_counts + def apply( self, - prompt_text: str, + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -998,8 +1193,26 @@ class BaseMultiModalProcessor(ABC): """ mm_items = self._to_mm_items(mm_data) + # Create MM hashes (only used in V1) + # TODO: Use these hash keys for caching operations in apply_hf_processor + # instead of rehashing. + + if envs.VLLM_USE_V1: + model_id = self.info.model_id + mm_hashes = { + modality: [ + MultiModalHasher.hash_kwargs(model_id=model_id, + **{modality: item}, + **hf_processor_mm_kwargs) + for item in items + ] + for modality, items in mm_items.items() + } + else: + mm_hashes = None + prompt_ids, mm_kwargs = self._cached_apply_hf_processor( - prompt_text, + prompt, mm_items, hf_processor_mm_kwargs, ) @@ -1009,172 +1222,68 @@ class BaseMultiModalProcessor(ABC): hf_processor_mm_kwargs, mm_kwargs, ) - prompt_repls = self._bind_prompt_replacements(unbound_prompt_repls) + mm_prompt_repls = self._bind_and_group_repls(unbound_prompt_repls) + + mm_item_counts = mm_items.get_all_counts() + self._validate_mm_kwargs(mm_kwargs, mm_item_counts) + + hf_mm_placeholders = self._find_mm_placeholders( + mm_prompt_repls, + prompt_ids, + mm_item_counts, + ) + + if self._always_apply_prompt_replacements(): + mm_missing_repl_counts = mm_item_counts + mm_missing_repls = dict(mm_prompt_repls) + else: + mm_missing_repl_counts = self._validate_mm_placeholders( + hf_mm_placeholders, + mm_item_counts, + allow_missing=True, 
+ ) + + mm_missing_repls = dict[str, list[BoundPromptReplacement]]() + for modality, missing_repl_count in mm_missing_repl_counts.items(): + if missing_repl_count == 0: + mm_missing_repls[modality] = [] + elif missing_repl_count == mm_item_counts.get(modality, 0): + mm_missing_repls[modality] = mm_prompt_repls[modality] + else: + raise ValueError("Partial prompt replacement within " + f"{modality=} is not supported") # If HF processor already inserts placeholder tokens, # there is no need for us to insert them - mm_item_counts = mm_items.get_all_counts() - all_placeholders = self._find_placeholders(prompt_repls, prompt_ids, - mm_item_counts) - - if all_placeholders and not self._always_apply_prompt_replacements(): - tokenizer = self._get_tokenizer() - prompt_text = _decode(tokenizer, prompt_ids) + if all(len(repls) == 0 for repls in mm_missing_repls.values()): + tokenizer = self.info.get_tokenizer() + prompt = decode_tokens(tokenizer, prompt_ids) + mm_placeholders = hf_mm_placeholders else: ( prompt_ids, - prompt_text, - all_placeholders, + prompt, + missing_mm_placeholders, ) = self._apply_prompt_replacements( prompt_ids, - prompt_repls, - mm_item_counts, + mm_missing_repls, + mm_missing_repl_counts, ) - mm_placeholders = dict[str, list[PlaceholderRange]]() - err_suffix = ("This suggests a problem with your implementation of " - "the merged multi-modal processor for this model, " - "particularly in the `_get_prompt_replacements` method.") + mm_placeholders = {**hf_mm_placeholders, **missing_mm_placeholders} - for modality, placeholders in full_groupby_modality(all_placeholders): - if modality not in mm_items: - raise AssertionError( - f"Expected no placeholders for {modality=}, " - f"but found {placeholders=}. 
Input items: {mm_items}" - f"\n{err_suffix}") + self._validate_mm_placeholders(mm_placeholders, mm_item_counts) - if len(placeholders) != len(mm_items[modality]): - raise AssertionError( - f"Expected length of {placeholders=} for {modality=} " - f"to equal that of input items: {mm_items[modality]}" - f"\n{err_suffix}") + mm_placeholder_ranges = { + modality: [item.to_range() for item in placeholders] + for modality, placeholders in mm_placeholders.items() + } - mm_placeholders[modality] = [ - item.to_range() for item in placeholders - ] - - return MultiModalInputsV2( + return MultiModalInputs( type="multimodal", - prompt=prompt_text, + prompt=prompt, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, - mm_placeholders=mm_placeholders, - ) - - def _get_dummy_audios( - self, - *, - length: int, - num_audios: int, - ) -> list[npt.NDArray]: - audio = np.zeros((length, )) - return [audio] * num_audios - - def _get_dummy_images( - self, - *, - width: int, - height: int, - num_images: int, - ) -> list[Image.Image]: - image = Image.new("RGB", (width, height), color=0) - return [image] * num_images - - def _get_dummy_videos( - self, - *, - width: int, - height: int, - num_frames: int, - num_videos: int, - ) -> list[npt.NDArray]: - video = np.zeros((num_frames, width, height, 3)) - return [video] * num_videos - - @abstractmethod - def _get_dummy_mm_inputs( - self, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - """ - Build the multi-modal portion of the input which, after processing, - results in `mm_max_tokens` in :meth:`get_dummy_data`. 
- """ - raise NotImplementedError - - def _get_and_validate_dummy_mm_counts(self) -> Mapping[str, int]: - mm_limit_per_prompt = self.ctx.get_mm_config().limit_per_prompt - supported_mm_limits = self.get_supported_mm_limits() - - mm_limits = { - modality: mm_limit_per_prompt.get(modality, 1) - for modality in supported_mm_limits - } - - for modality, supported_limit in supported_mm_limits.items(): - limit = mm_limits[modality] - if supported_limit is not None and supported_limit < limit: - raise ValueError( - f"You set {modality}={limit} (or defaulted to 1) in " - f"`--limit-mm-per-prompt`, but this model only supports " - f"at most {supported_limit} {modality} items.") - - return mm_limits - - def get_dummy_data(self, seq_len: int) -> DummyData: - # Avoid circular import - from vllm.sequence import SequenceData - - mm_counts = self._get_and_validate_dummy_mm_counts() - mm_max_tokens_per_item = self.get_mm_max_tokens_per_item() - if mm_counts.keys() != mm_max_tokens_per_item.keys(): - raise AssertionError( - "The keys returned by `get_supported_mm_limits`" - f"({set(mm_counts.keys())}) should be the same as those " - "returned by `get_mm_max_tokens_per_item` " - f"({set(mm_max_tokens_per_item.keys())})") - - processor_inputs = self._get_dummy_mm_inputs(mm_counts) - mm_inputs = self.apply( - prompt_text=processor_inputs.prompt_text, - mm_data=processor_inputs.mm_data, - hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, - ) - - prompt_token_ids = mm_inputs["prompt_token_ids"] - placeholders_by_modality = mm_inputs["mm_placeholders"] - - total_placeholders_by_modality = { - modality: sum(item["length"] for item in placeholders) - for modality, placeholders in placeholders_by_modality.items() - } - expected_placeholders_by_modality = { - modality: mm_max_tokens_per_item[modality] * mm_counts[modality] - for modality in placeholders_by_modality - } - if total_placeholders_by_modality != expected_placeholders_by_modality: - raise AssertionError( - f"The 
processed dummy data has a total of " - f"{total_placeholders_by_modality} placeholder tokens, which " - f"is not the expected {expected_placeholders_by_modality} " - "tokens.") - - total_len = len(prompt_token_ids) - if total_len > seq_len: - logger.warning( - "The context length (%d) of the model is too short " - "to hold the multi-modal embeddings in the worst case " - "(%d tokens in total, out of which %s are reserved for " - "multi-modal embeddings). This may cause certain multi-modal " - "inputs to fail during inference, even when the input text is " - "short. To avoid this, you should increase `max_model_len`, " - "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len, - total_len, total_placeholders_by_modality) - - prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids))) - - return DummyData( - seq_data=SequenceData.from_seqs(prompt_token_ids), - multi_modal_data=mm_inputs["mm_kwargs"], - multi_modal_placeholders=placeholders_by_modality, + mm_hashes=mm_hashes, + mm_placeholders=mm_placeholder_ranges, ) diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py new file mode 100644 index 0000000000..c68edaff80 --- /dev/null +++ b/vllm/multimodal/profiling.py @@ -0,0 +1,206 @@ +from abc import ABC, abstractmethod +from collections.abc import Mapping +from dataclasses import dataclass, field +from typing import Generic, TypeVar + +import numpy as np +import numpy.typing as npt +from PIL import Image + +import vllm.envs as envs +from vllm.inputs import DummyData +from vllm.logger import init_logger + +from .inputs import MultiModalDataDict, MultiModalInputs +from .processing import BaseMultiModalProcessor, BaseProcessingInfo + +logger = init_logger(__name__) + + +@dataclass +class ProcessorInputs: + """ + Represents the keyword arguments to + :meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`. 
+ """ + prompt_text: str + mm_data: MultiModalDataDict + hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) + + +_I = TypeVar("_I", bound=BaseProcessingInfo) + + +class BaseDummyInputsBuilder(ABC, Generic[_I]): + """ + Abstract base class that constructs the dummy data to profile + multi-modal models. + """ + + def __init__(self, info: _I) -> None: + super().__init__() + + self.info = info + + @abstractmethod + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + """ + Build the input which, after processing, results in + :code:`self.info.get_mm_max_tokens_per_item()` placeholder tokens. + """ + raise NotImplementedError + + def _get_dummy_audios( + self, + *, + length: int, + num_audios: int, + ) -> list[npt.NDArray]: + audio = np.zeros((length, )) + return [audio] * num_audios + + def _get_dummy_images( + self, + *, + width: int, + height: int, + num_images: int, + ) -> list[Image.Image]: + image = Image.new("RGB", (width, height), color=0) + return [image] * num_images + + def _get_dummy_videos( + self, + *, + width: int, + height: int, + num_frames: int, + num_videos: int, + ) -> list[npt.NDArray]: + video = np.zeros((num_frames, width, height, 3)) + return [video] * num_videos + + +class MultiModalProfiler(Generic[_I]): + """ + Contains code for running memory profiling for multi-modal models. 
+ """ + + def __init__( + self, + processor: BaseMultiModalProcessor[_I], + ) -> None: + super().__init__() + + self.processor = processor + + @property + def processing_info(self) -> BaseProcessingInfo: + return self.processor.info + + @property + def dummy_inputs(self) -> BaseDummyInputsBuilder[_I]: + return self.processor.dummy_inputs + + def get_mm_limits(self) -> Mapping[str, int]: + mm_config = self.processing_info.ctx.get_mm_config() + mm_limit_per_prompt = mm_config.limit_per_prompt + + supported_mm_limits = self.processing_info.get_supported_mm_limits() + + mm_limits = { + modality: mm_limit_per_prompt.get(modality, 1) + for modality in supported_mm_limits + } + + for modality, supported_limit in supported_mm_limits.items(): + limit = mm_limits[modality] + if supported_limit is not None and supported_limit < limit: + raise ValueError( + f"You set {modality}={limit} (or defaulted to 1) in " + f"`--limit-mm-per-prompt`, but this model only supports " + f"at most {supported_limit} {modality} items.") + + return mm_limits + + def _get_dummy_mm_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalInputs: + factory = self.dummy_inputs + processor_inputs = factory.get_dummy_processor_inputs( + seq_len, mm_counts) + + return self.processor.apply( + prompt=processor_inputs.prompt_text, + mm_data=processor_inputs.mm_data, + hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, + ) + + def get_dummy_data(self, seq_len: int) -> DummyData: + # Avoid circular import + from vllm.sequence import SequenceData + + mm_counts = self.get_mm_limits() + + info = self.processing_info + mm_max_tokens_per_item = info.get_mm_max_tokens_per_item(seq_len) + + if mm_counts.keys() != mm_max_tokens_per_item.keys(): + raise AssertionError( + "The keys returned by `get_supported_mm_limits`" + f"({set(mm_counts.keys())}) should be the same as those " + "returned by `get_mm_max_tokens_per_item` " + f"({set(mm_max_tokens_per_item.keys())})") + + mm_inputs 
= self._get_dummy_mm_inputs(seq_len, mm_counts) + prompt_token_ids = mm_inputs["prompt_token_ids"] + placeholders_by_modality = mm_inputs["mm_placeholders"] + + total_placeholders_by_modality = { + modality: sum(item["length"] for item in placeholders) + for modality, placeholders in placeholders_by_modality.items() + } + expected_placeholders_by_modality = { + modality: mm_max_tokens_per_item[modality] * mm_counts[modality] + for modality in placeholders_by_modality + } + if total_placeholders_by_modality != expected_placeholders_by_modality: + raise AssertionError( + f"The processed dummy data has a total of " + f"{total_placeholders_by_modality} placeholder tokens, which " + f"is not the expected {expected_placeholders_by_modality} " + "tokens.") + + total_len = len(prompt_token_ids) + + # V0 does not support chunked prefill. + if total_len > seq_len and not envs.VLLM_USE_V1: + logger.warning( + "The context length (%d) of the model is too short " + "to hold the multi-modal embeddings in the worst case " + "(%d tokens in total, out of which %s are reserved for " + "multi-modal embeddings). This may cause certain multi-modal " + "inputs to fail during inference, even when the input text is " + "short. 
To avoid this, you should increase `max_model_len`, " + "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len, + total_len, total_placeholders_by_modality) + + return DummyData( + seq_data=SequenceData.from_prompt_token_counts((0, seq_len)), + multi_modal_data=None, + multi_modal_placeholders=None, + ) + + prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids))) + + return DummyData( + seq_data=SequenceData.from_seqs(prompt_token_ids), + multi_modal_data=mm_inputs["mm_kwargs"], + multi_modal_placeholders=placeholders_by_modality, + ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 073d49d7d2..7a4b85385c 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,7 +1,8 @@ import functools from collections import UserDict -from typing import (TYPE_CHECKING, Any, Dict, Mapping, Optional, Protocol, - Sequence, Type, TypeVar) +from dataclasses import dataclass +from typing import (TYPE_CHECKING, Any, Dict, Generic, Mapping, Optional, + Protocol, Sequence, Type, TypeVar) import torch.nn as nn @@ -14,7 +15,9 @@ from .audio import AudioPlugin from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc from .image import ImagePlugin from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors -from .processing import BaseMultiModalProcessor, ProcessingCache +from .processing import (BaseMultiModalProcessor, BaseProcessingInfo, + ProcessingCache) +from .profiling import BaseDummyInputsBuilder, MultiModalProfiler from .utils import cached_get_tokenizer from .video import VideoPlugin @@ -27,20 +30,59 @@ logger = init_logger(__name__) MM_CACHE_SIZE = 256 N = TypeVar("N", bound=Type[nn.Module]) +_I = TypeVar("_I", bound=BaseProcessingInfo) +_I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True) -class MultiModalProcessorFactory(Protocol): +class ProcessingInfoFactory(Protocol[_I_co]): """Constructs a :class:`MultiModalProcessor` instance from the context.""" def __call__( 
self, ctx: InputProcessingContext, + ) -> _I_co: + ... + + +class DummyInputsBuilderFactory(Protocol[_I]): + """ + Constructs a :class:`BaseDummyInputsBuilder` instance from the context. + """ + + def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]: + ... + + +class MultiModalProcessorFactory(Protocol[_I]): + """Constructs a :class:`MultiModalProcessor` instance from the context.""" + + def __call__( + self, + info: _I, + dummy_inputs: BaseDummyInputsBuilder[_I], *, cache: Optional[ProcessingCache] = None, - ) -> BaseMultiModalProcessor: + ) -> BaseMultiModalProcessor[_I]: ... +@dataclass(frozen=True) +class _ProcessorFactories(Generic[_I]): + info: ProcessingInfoFactory[_I] + processor: MultiModalProcessorFactory[_I] + dummy_inputs: DummyInputsBuilderFactory[_I] + + def build_processor( + self, + ctx: InputProcessingContext, + *, + cache: Optional[ProcessingCache] = None, + ): + info = self.info(ctx) + dummy_inputs_builder = self.dummy_inputs(info) + return self.processor(info, dummy_inputs_builder, cache=cache) + + class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]): """ Wraps `_limits_by_model` for a more informative error message @@ -58,8 +100,7 @@ class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]): class MultiModalRegistry: """ - A registry that dispatches data processing to the - :class:`~vllm.multimodal.MultiModalPlugin` for each modality. + A registry that dispatches data processing according to the model. 
""" DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin(), VideoPlugin()) @@ -71,7 +112,7 @@ class MultiModalRegistry: self._plugins = {p.get_data_key(): p for p in plugins} self._processor_factories = ClassRegistry[nn.Module, - MultiModalProcessorFactory]() + _ProcessorFactories]() # This is used for non-multimodal models self._disabled_limits_per_plugin = {k: 0 for k in self._plugins} @@ -83,9 +124,6 @@ class MultiModalRegistry: def register_plugin(self, plugin: MultiModalPlugin) -> None: """ Register a multi-modal plugin so it can be recognized by vLLM. - - See also: - :ref:`adding-multimodal-plugin` """ data_type_key = plugin.get_data_key() @@ -214,22 +252,45 @@ class MultiModalRegistry: model_config: "ModelConfig", ) -> Mapping[str, int]: """ - Get the maximum number of tokens per data item from each modality - for profiling the memory usage of a model. - - Note: - This is currently directly used only in V1. + Get the maximum number of tokens per data item from each modality based + on underlying model configuration. """ if self.has_processor(model_config): - tokenizer = cached_get_tokenizer(model_config.tokenizer) + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code, + ) processor = self.create_processor(model_config, tokenizer) - return processor.get_mm_max_tokens_per_item() + seq_len = model_config.max_model_len + return processor.info.get_mm_max_tokens_per_item(seq_len) return { key: plugin.get_max_multimodal_tokens(model_config) for key, plugin in self._plugins.items() } + def get_max_tokens_per_item_by_nonzero_modality( + self, + model_config: "ModelConfig", + ) -> Mapping[str, int]: + """ + Get the maximum number of tokens per data item from each modality based + on underlying model configuration, excluding modalities that user + explicitly disabled via `limit_mm_per_prompt`. + + Note: + This is currently directly used only in V1 for profiling the memory + usage of a model. 
+ """ + mm_limits = self.get_mm_limits_per_prompt(model_config) + + return { + key: max_tokens_per_mm_item + for key, max_tokens_per_mm_item in + self.get_max_tokens_per_item_by_modality(model_config).items() + if mm_limits[key] > 0 + } + def get_max_tokens_by_modality( self, model_config: "ModelConfig", @@ -243,10 +304,10 @@ class MultiModalRegistry: Note: This should be called after :meth:`init_mm_limits_per_prompt`. """ - limits_per_plugin = self._limits_by_model[model_config] + mm_limits = self.get_mm_limits_per_prompt(model_config) return { - key: limits_per_plugin[key] * max_tokens_per_mm_item + key: mm_limits[key] * max_tokens_per_mm_item for key, max_tokens_per_mm_item in self.get_max_tokens_per_item_by_modality(model_config).items() } @@ -310,11 +371,23 @@ class MultiModalRegistry: Note: This should be called after :meth:`init_mm_limits_per_prompt`. """ + if self.has_processor(model_config): + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code, + ) + processor = self.create_processor(model_config, tokenizer) + profiler = MultiModalProfiler(processor) + return profiler.get_mm_limits() + return self._limits_by_model[model_config] def register_processor( self, - factory: MultiModalProcessorFactory, + processor: MultiModalProcessorFactory[_I], + *, + info: ProcessingInfoFactory[_I], + dummy_inputs: DummyInputsBuilderFactory[_I], ): """ Register a multi-modal processor to a model class. The processor @@ -324,8 +397,7 @@ class MultiModalRegistry: invoked to transform the data into a dictionary of model inputs. See also: - - :ref:`input-processing-pipeline` - - :ref:`enabling-multimodal-inputs` + :ref:`mm-processing` """ def wrapper(model_cls: N) -> N: @@ -335,7 +407,11 @@ class MultiModalRegistry: "registered to %s. 
It is overwritten by the new one.", model_cls, self) - self._processor_factories[model_cls] = factory + self._processor_factories[model_cls] = _ProcessorFactories( + info=info, + dummy_inputs=dummy_inputs, + processor=processor, + ) return model_cls @@ -351,6 +427,9 @@ class MultiModalRegistry: def has_processor(self, model_config: "ModelConfig") -> bool: """ Test whether a multi-modal processor is defined for a specific model. + + See also: + :ref:`mm-processing` """ return self._get_model_cls(model_config) in self._processor_factories @@ -358,15 +437,18 @@ class MultiModalRegistry: self, model_config: "ModelConfig", tokenizer: AnyTokenizer, - ) -> BaseMultiModalProcessor: + ) -> BaseMultiModalProcessor[BaseProcessingInfo]: """ Create a multi-modal processor for a specific model and tokenizer. + + See also: + :ref:`mm-processing` """ model_cls = self._get_model_cls(model_config) - processor_factory = self._processor_factories[model_cls] + factories = self._processor_factories[model_cls] ctx = InputProcessingContext(model_config, tokenizer) cache = (None if model_config.disable_mm_preprocessor_cache else self._processing_cache) - return processor_factory(ctx, cache=cache) + return factories.build_processor(ctx, cache=cache) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 7b6ded6a27..900bed5929 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,11 +1,11 @@ from functools import lru_cache +from itertools import groupby from pathlib import Path -from typing import Optional, TypeVar, Union +from typing import TYPE_CHECKING, Optional, TypeVar, Union from urllib.parse import ParseResult, urlparse import numpy as np import numpy.typing as npt -import torch from PIL import Image import vllm.envs as envs @@ -25,6 +25,10 @@ cached_get_tokenizer = lru_cache(get_tokenizer) _M = TypeVar("_M") +if TYPE_CHECKING: + from .hasher import MultiModalHashDict + from .inputs import MultiModalKwargs, MultiModalPlaceholderDict + class 
MediaConnector: @@ -281,49 +285,6 @@ def encode_video_base64(frames: npt.NDArray) -> str: return video_io.encode_base64(frames) -def resolve_visual_encoder_outputs( - encoder_outputs: Union[torch.Tensor, list[torch.Tensor]], - feature_sample_layers: Optional[list[int]], - post_layer_norm: Optional[torch.nn.LayerNorm], - max_possible_layers: int, -) -> torch.Tensor: - """Given the outputs a visual encoder module that may correspond to the - output of the last layer, or a list of hidden states to be stacked, - handle post normalization and resolve it into a single output tensor. - - Args: - encoder_outputs: Output of encoder's last layer or all hidden states. - feature_sample_layers: Optional layer indices to grab from the encoder - outputs; if provided, encoder outputs must be a list. - post_layer_norm: Post norm to apply to the output of the encoder. - max_possible_layers: Total layers in the fully loaded visual encoder. - - """ - if feature_sample_layers is None: - if post_layer_norm is not None: - return post_layer_norm(encoder_outputs) - return encoder_outputs - - # Get the hidden states corresponding to the layer indices. - # Negative values are relative to the full visual encoder, - # so offset them depending on how many layers were loaded. - # NOTE: this assumes that encoder_outputs contains a list - # of hidden states in the same order as the encoder layers - # that produced them. 
- offset = max_possible_layers - len(encoder_outputs) - hs_pool = [ - encoder_outputs[layer_idx] - if layer_idx >= 0 else encoder_outputs[layer_idx + offset] - for layer_idx in feature_sample_layers - ] - - # Apply post-norm on the final hidden state if we are using it - uses_last_layer = feature_sample_layers[-1] in (len(hs_pool) - 1, -1) - if post_layer_norm is not None and uses_last_layer: - hs_pool[-1] = post_layer_norm(encoder_outputs) - return torch.cat(hs_pool, dim=-1) - - # Utilities for input processors _T = TypeVar("_T", str, int) @@ -437,3 +398,119 @@ def consecutive_placeholder_ranges( PlaceholderRange(offset=initial_offset + i * item_size, length=item_size) for i in range(num_items) ] + + +def merge_and_sort_multimodal_metadata( + mm_positions: "MultiModalPlaceholderDict", + mm_hashes: Optional["MultiModalHashDict"], +) -> tuple[list[str], list[PlaceholderRange], Optional[list[str]]]: + """Given a MultiModalPlaceholderDict, merge all PlaceholderRange + objects from all available modalities into a single list of + PlaceholderRange, sorted by their offset (starting index in the input + sequence) in the ascending order. + + Optionally if a MultiModalHashDict is given, same operation will be + applied to the object and the sorted list of hashes will be returned. + + Raises: + ValueError: If the input prompt has interleaved placeholders from + different modalities (e.g, "