Compare commits
155 Commits
v0.7.3
...
reduce_sca
| Author | SHA1 | Date | |
|---|---|---|---|
| 3679753af5 | |||
| 9b61dd41e7 | |||
| f7bee5c815 | |||
| e0734387fb | |||
| f58f8b5c96 | |||
| b3f7aaccd0 | |||
| b91660ddb8 | |||
| 76c89fcadd | |||
| b9e41734c5 | |||
| 1088f06242 | |||
| 73e0225ee9 | |||
| 6c85da3a18 | |||
| 67fc426845 | |||
| 9804145cac | |||
| 2e94b9cfbb | |||
| 8294773e48 | |||
| cd813c6d4d | |||
| 38acae6e97 | |||
| a2dd48c386 | |||
| 126f6beeb4 | |||
| 58d1b2aa77 | |||
| f1579b229d | |||
| 7864875879 | |||
| 1dd422b64a | |||
| 06c8f8d885 | |||
| 5677c9bb3e | |||
| 512d77d582 | |||
| 7f0be2aa24 | |||
| edf309ebbe | |||
| 788f284b53 | |||
| 4b1d141f49 | |||
| 10c3b8c1cf | |||
| a7f37314b7 | |||
| cd711c48b2 | |||
| 378b3ef6f8 | |||
| c9944acbf9 | |||
| ca377cf1b9 | |||
| a31614e386 | |||
| f95903909f | |||
| b382a7f28f | |||
| 4cb6fa0a9c | |||
| d08b285adf | |||
| b27122acc2 | |||
| 934bb99c71 | |||
| 3f808cc044 | |||
| ec8a5e5386 | |||
| 215bf150a6 | |||
| 0ecdd98031 | |||
| 7b700ec8c8 | |||
| 7ca1da020f | |||
| 5157338ed9 | |||
| e206b54331 | |||
| 1d35662e6d | |||
| e656f638de | |||
| 145944cb94 | |||
| 094b7d9496 | |||
| e1fe7591f2 | |||
| 5629f26df7 | |||
| 9ba28043b5 | |||
| 24679788ed | |||
| 07c4353057 | |||
| 34e3494e70 | |||
| f75aa72732 | |||
| 340e39e387 | |||
| f4133ce4e5 | |||
| 6522d55b6f | |||
| 6ff518626c | |||
| fa82074167 | |||
| 75e9d49796 | |||
| 32c3b6bfd1 | |||
| 37b6cb4985 | |||
| aabeb2688f | |||
| 2f42a4888c | |||
| 3173c3b34e | |||
| 2d87d7d1ac | |||
| aab392774b | |||
| 6724e79164 | |||
| 03f48b3db6 | |||
| 4d251ad00e | |||
| 18e505930d | |||
| 4a8cfc7551 | |||
| bc32bc73aa | |||
| ab1091d5f2 | |||
| 1e15aaef56 | |||
| 51010a1807 | |||
| 7196a3b1db | |||
| cdc1fa12eb | |||
| f61528d46d | |||
| 1f0ae3ed0a | |||
| db986c19ea | |||
| 227578480d | |||
| befc402d34 | |||
| 444b0f0f62 | |||
| ccc00515fd | |||
| 781096e385 | |||
| 7940d8a6a7 | |||
| c0e3ecd6d2 | |||
| 23eca9cf68 | |||
| 437b76ff59 | |||
| f90a375593 | |||
| e7ef74e26e | |||
| cbae7af552 | |||
| eb24dc4a45 | |||
| 9bebc9512f | |||
| 5a2ba16f5c | |||
| ba5106e519 | |||
| d5ca2110f1 | |||
| 2c5e637b57 | |||
| 322d2a27d6 | |||
| 82e0d601fc | |||
| 78ac0f591d | |||
| b56155e7f3 | |||
| 382f66fb08 | |||
| 8354f6640c | |||
| c904fdddf6 | |||
| 558db8083c | |||
| e109e598c7 | |||
| 8db1b9d0a1 | |||
| 2382ad29d1 | |||
| 3e472d882a | |||
| 7f6bae561c | |||
| 105b8ce4c0 | |||
| 2cb8c1540e | |||
| 1cd981da4f | |||
| fca20841c2 | |||
| da31b5333e | |||
| bb78fb318e | |||
| 8aca27fa11 | |||
| 95c617e04b | |||
| 9a1f1da5d1 | |||
| 68d630a0c7 | |||
| 68d535ef44 | |||
| c6ed93860f | |||
| 0ffdf8ce0c | |||
| 8c0dd3d4df | |||
| ada7c780d5 | |||
| 288cc6c234 | |||
| 900edbfa48 | |||
| b2c3fc5d65 | |||
| 839b27c6cc | |||
| 34ad27fe83 | |||
| 1c3c975766 | |||
| 1cdc88614a | |||
| 31aa045c11 | |||
| a30c093502 | |||
| c7b07a95a6 | |||
| 27a09dc52c | |||
| 981f3c831e | |||
| 44c33f01f3 | |||
| 33170081f1 | |||
| 71face8540 | |||
| bfbc0b32c6 | |||
| 6a417b8600 | |||
| d3ea50113c | |||
| 34aad515c8 |
@ -84,8 +84,13 @@ if __name__ == "__main__":
|
||||
# this result is generated via `benchmark_serving.py`
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
try:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
except OSError as e:
|
||||
print(e)
|
||||
continue
|
||||
|
||||
raw_result.update(command)
|
||||
|
||||
# update the test name of this result
|
||||
@ -99,8 +104,13 @@ if __name__ == "__main__":
|
||||
# this result is generated via `benchmark_latency.py`
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
try:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
except OSError as e:
|
||||
print(e)
|
||||
continue
|
||||
|
||||
raw_result.update(command)
|
||||
|
||||
# update the test name of this result
|
||||
@ -121,8 +131,13 @@ if __name__ == "__main__":
|
||||
# this result is generated via `benchmark_throughput.py`
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
try:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
except OSError as e:
|
||||
print(e)
|
||||
continue
|
||||
|
||||
raw_result.update(command)
|
||||
|
||||
# update the test name of this result
|
||||
|
||||
@ -309,11 +309,14 @@ run_serving_tests() {
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
# pass the tensor parallel size to the client so that it can be displayed
|
||||
# on the benchmark dashboard
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--metadata "tensor_parallel_size=$tp" \
|
||||
$client_args"
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
|
||||
@ -32,4 +32,4 @@
|
||||
"backend": "vllm"
|
||||
}
|
||||
}
|
||||
]
|
||||
]
|
||||
|
||||
@ -1,4 +1,15 @@
|
||||
steps:
|
||||
- label: "Build wheel - CUDA 12.4"
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - CUDA 12.1"
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
@ -37,7 +48,7 @@ steps:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Build and publish TPU release image"
|
||||
|
||||
@ -92,7 +92,9 @@ if [[ $commands == *" kernels "* ]]; then
|
||||
--ignore=kernels/test_moe.py \
|
||||
--ignore=kernels/test_prefix_prefill.py \
|
||||
--ignore=kernels/test_rand.py \
|
||||
--ignore=kernels/test_sampler.py"
|
||||
--ignore=kernels/test_sampler.py \
|
||||
--ignore=kernels/test_cascade_flash_attn.py \
|
||||
--ignore=kernels/test_mamba_mixer2.py"
|
||||
fi
|
||||
|
||||
#ignore certain Entrypoints tests
|
||||
|
||||
@ -134,7 +134,9 @@ steps:
|
||||
- tests/compile/test_basic_correctness
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
commands:
|
||||
- VLLM_USE_V1=1 python3 ../examples/offline_inference/data_parallel.py
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
@ -276,7 +278,7 @@ steps:
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
|
||||
parallelism: 4
|
||||
|
||||
- label: "PyTorch Fullgraph Smoke Test" # 9min
|
||||
- label: PyTorch Fullgraph Smoke Test # 9min
|
||||
fast_check: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -287,7 +289,7 @@ steps:
|
||||
- pytest -v -s compile/piecewise/test_simple.py
|
||||
- pytest -v -s compile/piecewise/test_toy_llama.py
|
||||
|
||||
- label: "PyTorch Fullgraph Test" # 18min
|
||||
- label: PyTorch Fullgraph Test # 18min
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
@ -501,6 +503,7 @@ steps:
|
||||
- entrypoints/llm/test_collective_rpc.py
|
||||
commands:
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
|
||||
- torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
|
||||
@ -50,8 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
if [[ $normal_wheel == *"cu118"* ]]; then
|
||||
# if $normal_wheel matches cu118, do not upload the index.html
|
||||
echo "Skipping index files for cu118 wheels"
|
||||
elif [[ $normal_wheel == *"cu121"* ]]; then
|
||||
# if $normal_wheel matches cu121, do not upload the index.html
|
||||
echo "Skipping index files for cu121 wheels"
|
||||
else
|
||||
# only upload index.html for cu12 wheels (default wheels)
|
||||
# only upload index.html for cu124 wheels (default wheels)
|
||||
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
|
||||
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
|
||||
fi
|
||||
@ -63,8 +66,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
|
||||
if [[ $normal_wheel == *"cu118"* ]]; then
|
||||
# if $normal_wheel matches cu118, do not upload the index.html
|
||||
echo "Skipping index files for cu118 wheels"
|
||||
elif [[ $normal_wheel == *"cu121"* ]]; then
|
||||
# if $normal_wheel matches cu121, do not upload the index.html
|
||||
echo "Skipping index files for cu121 wheels"
|
||||
else
|
||||
# only upload index.html for cu12 wheels (default wheels)
|
||||
# only upload index.html for cu124 wheels (default wheels)
|
||||
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
|
||||
fi
|
||||
|
||||
|
||||
1
.github/mergify.yml
vendored
1
.github/mergify.yml
vendored
@ -5,6 +5,7 @@ pull_request_rules:
|
||||
- or:
|
||||
- files~=^[^/]+\.md$
|
||||
- files~=^docs/
|
||||
- files~=^examples/
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
|
||||
2
.github/workflows/lint-and-deploy.yaml
vendored
2
.github/workflows/lint-and-deploy.yaml
vendored
@ -12,7 +12,7 @@ jobs:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Helm
|
||||
uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
|
||||
uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
|
||||
with:
|
||||
version: v3.14.4
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
default_stages:
|
||||
- pre-commit # Run locally
|
||||
- manual # Run in CI
|
||||
exclude: 'vllm/third_party/.*'
|
||||
repos:
|
||||
- repo: https://github.com/google/yapf
|
||||
rev: v0.43.0
|
||||
@ -8,13 +9,11 @@ repos:
|
||||
- id: yapf
|
||||
args: [--in-place, --verbose]
|
||||
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.9.3
|
||||
hooks:
|
||||
- id: ruff
|
||||
args: [--output-format, github, --fix]
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- repo: https://github.com/codespell-project/codespell
|
||||
rev: v2.4.0
|
||||
hooks:
|
||||
@ -22,10 +21,9 @@ repos:
|
||||
additional_dependencies: ['tomli']
|
||||
args: ['--toml', 'pyproject.toml']
|
||||
- repo: https://github.com/PyCQA/isort
|
||||
rev: 5.13.2
|
||||
rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
|
||||
hooks:
|
||||
- id: isort
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- repo: https://github.com/pre-commit/mirrors-clang-format
|
||||
rev: v19.1.7
|
||||
hooks:
|
||||
@ -38,12 +36,16 @@ repos:
|
||||
hooks:
|
||||
- id: pymarkdown
|
||||
args: [fix]
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- repo: https://github.com/rhysd/actionlint
|
||||
rev: v1.7.7
|
||||
hooks:
|
||||
- id: actionlint
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- repo: https://github.com/astral-sh/uv-pre-commit
|
||||
rev: 0.6.2
|
||||
hooks:
|
||||
- id: pip-compile
|
||||
args: [requirements-test.in, -o, requirements-test.txt]
|
||||
files: ^requirements-test\.(in|txt)$
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: mypy-local
|
||||
@ -53,7 +55,6 @@ repos:
|
||||
types: [python]
|
||||
additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
|
||||
stages: [pre-commit] # Don't run in CI
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||
name: Run mypy for Python 3.9
|
||||
entry: tools/mypy.sh 1 "3.9"
|
||||
@ -61,7 +62,6 @@ repos:
|
||||
types: [python]
|
||||
additional_dependencies: *mypy_deps
|
||||
stages: [manual] # Only run in CI
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||
name: Run mypy for Python 3.10
|
||||
entry: tools/mypy.sh 1 "3.10"
|
||||
@ -69,7 +69,6 @@ repos:
|
||||
types: [python]
|
||||
additional_dependencies: *mypy_deps
|
||||
stages: [manual] # Only run in CI
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||
name: Run mypy for Python 3.11
|
||||
entry: tools/mypy.sh 1 "3.11"
|
||||
@ -77,7 +76,6 @@ repos:
|
||||
types: [python]
|
||||
additional_dependencies: *mypy_deps
|
||||
stages: [manual] # Only run in CI
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||
name: Run mypy for Python 3.12
|
||||
entry: tools/mypy.sh 1 "3.12"
|
||||
@ -85,19 +83,16 @@ repos:
|
||||
types: [python]
|
||||
additional_dependencies: *mypy_deps
|
||||
stages: [manual] # Only run in CI
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: shellcheck
|
||||
name: Lint shell scripts
|
||||
entry: tools/shellcheck.sh
|
||||
language: script
|
||||
types: [shell]
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: png-lint
|
||||
name: Lint PNG exports from excalidraw
|
||||
entry: tools/png-lint.sh
|
||||
language: script
|
||||
types: [png]
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: signoff-commit
|
||||
name: Sign-off Commit
|
||||
entry: bash
|
||||
@ -110,13 +105,11 @@ repos:
|
||||
language: system
|
||||
verbose: true
|
||||
stages: [commit-msg]
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: check-spdx-header
|
||||
name: Check SPDX headers
|
||||
entry: python tools/check_spdx_header.py
|
||||
language: python
|
||||
types: [python]
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: check-filenames
|
||||
name: Check for spaces in all filenames
|
||||
entry: bash
|
||||
@ -126,7 +119,6 @@ repos:
|
||||
language: system
|
||||
always_run: true
|
||||
pass_filenames: false
|
||||
exclude: 'vllm/third_party/.*'
|
||||
# Keep `suggestion` last
|
||||
- id: suggestion
|
||||
name: Suggestion
|
||||
@ -134,5 +126,4 @@ repos:
|
||||
language: system
|
||||
verbose: true
|
||||
pass_filenames: false
|
||||
exclude: 'vllm/third_party/.*'
|
||||
# Insert new entries above the `suggestion` entry
|
||||
|
||||
106
CMakeLists.txt
106
CMakeLists.txt
@ -174,6 +174,25 @@ include(FetchContent)
|
||||
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
|
||||
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
|
||||
|
||||
#
|
||||
# Set rocm version dev int.
|
||||
#
|
||||
if(VLLM_GPU_LANG STREQUAL "HIP")
|
||||
#
|
||||
# Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
|
||||
#
|
||||
set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
|
||||
|
||||
|
||||
#
|
||||
# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
|
||||
# a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
|
||||
#
|
||||
set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS} -Wno-unused-result")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Define other extension targets
|
||||
#
|
||||
@ -229,7 +248,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
|
||||
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
|
||||
# Please keep this in sync with FetchContent_Declare line below.
|
||||
set(CUTLASS_REVISION "v3.7.0" CACHE STRING "CUTLASS revision to use")
|
||||
set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")
|
||||
|
||||
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
||||
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
||||
@ -247,7 +266,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
cutlass
|
||||
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
||||
# Please keep this in sync with CUTLASS_REVISION line above.
|
||||
GIT_TAG v3.7.0
|
||||
GIT_TAG v3.8.0
|
||||
GIT_PROGRESS TRUE
|
||||
|
||||
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
|
||||
@ -267,6 +286,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
"csrc/permute_cols.cu"
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
|
||||
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
|
||||
"csrc/cutlass_extensions/common.cpp")
|
||||
|
||||
@ -301,7 +321,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
|
||||
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
|
||||
set(SRCS
|
||||
set(SRCS
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
|
||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
|
||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
|
||||
@ -381,8 +401,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# FP4 Archs and flags
|
||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
|
||||
set(SRCS
|
||||
set(SRCS
|
||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
|
||||
)
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
@ -554,77 +575,8 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
|
||||
WITH_SOABI)
|
||||
endif()
|
||||
|
||||
# vllm-flash-attn currently only supported on CUDA
|
||||
if (NOT VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
return()
|
||||
# For CUDA we also build and ship some external projects.
|
||||
if (VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
include(cmake/external_projects/flashmla.cmake)
|
||||
include(cmake/external_projects/vllm_flash_attn.cmake)
|
||||
endif ()
|
||||
|
||||
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
|
||||
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
|
||||
# arches in the CUDA case (and instead set the gencodes on a per file basis)
|
||||
# we need to manually set VLLM_GPU_ARCHES here.
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
foreach(_ARCH ${CUDA_ARCHS})
|
||||
string(REPLACE "." "" _ARCH "${_ARCH}")
|
||||
list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
#
|
||||
# Build vLLM flash attention from source
|
||||
#
|
||||
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
|
||||
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs.
|
||||
# They should be identical but if they aren't, this is a massive footgun.
|
||||
#
|
||||
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
|
||||
# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
|
||||
# If no component is specified, vllm-flash-attn is still installed.
|
||||
|
||||
# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
|
||||
# This is to enable local development of vllm-flash-attn within vLLM.
|
||||
# It can be set as an environment variable or passed as a cmake argument.
|
||||
# The environment variable takes precedence.
|
||||
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
|
||||
set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
|
||||
endif()
|
||||
|
||||
if(VLLM_FLASH_ATTN_SRC_DIR)
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn SOURCE_DIR
|
||||
${VLLM_FLASH_ATTN_SRC_DIR}
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
)
|
||||
else()
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn
|
||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||
GIT_TAG 720c94869cf2e0ff5a706e9c7f1dce0939686ade
|
||||
GIT_PROGRESS TRUE
|
||||
# Don't share the vllm-flash-attn build between build types
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
# Fetch the vllm-flash-attn library
|
||||
FetchContent_MakeAvailable(vllm-flash-attn)
|
||||
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
|
||||
|
||||
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
|
||||
# case only one is built, in the case both are built redundant work is done)
|
||||
install(
|
||||
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||
DESTINATION vllm_flash_attn
|
||||
COMPONENT _vllm_fa2_C
|
||||
FILES_MATCHING PATTERN "*.py"
|
||||
)
|
||||
|
||||
install(
|
||||
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||
DESTINATION vllm_flash_attn
|
||||
COMPONENT _vllm_fa3_C
|
||||
FILES_MATCHING PATTERN "*.py"
|
||||
)
|
||||
|
||||
# Nothing after vllm-flash-attn, see comment about macros above
|
||||
|
||||
32
Dockerfile
32
Dockerfile
@ -28,7 +28,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
|
||||
&& python3 --version && python3 -m pip --version
|
||||
# Install uv for faster pip installs
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
python3 -m pip install uv
|
||||
|
||||
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
|
||||
@ -53,14 +53,14 @@ WORKDIR /workspace
|
||||
# we need to install torch and torchvision from the nightly builds first,
|
||||
# pytorch will not appear as a vLLM dependency in all of the following steps
|
||||
# after this step
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
|
||||
fi
|
||||
|
||||
COPY requirements-common.txt requirements-common.txt
|
||||
COPY requirements-cuda.txt requirements-cuda.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements-cuda.txt
|
||||
|
||||
# cuda arch list used by torch
|
||||
@ -81,7 +81,7 @@ ARG TARGETPLATFORM
|
||||
# install build dependencies
|
||||
COPY requirements-build.txt requirements-build.txt
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements-build.txt
|
||||
|
||||
COPY . .
|
||||
@ -101,7 +101,7 @@ ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
|
||||
ARG SCCACHE_REGION_NAME=us-west-2
|
||||
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
echo "Installing sccache..." \
|
||||
@ -121,7 +121,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
if [ "$USE_SCCACHE" != "1" ]; then \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
||||
@ -146,7 +146,7 @@ FROM base as dev
|
||||
COPY requirements-lint.txt requirements-lint.txt
|
||||
COPY requirements-test.txt requirements-test.txt
|
||||
COPY requirements-dev.txt requirements-dev.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements-dev.txt
|
||||
#################### DEV IMAGE ####################
|
||||
|
||||
@ -178,7 +178,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
|
||||
&& python3 --version && python3 -m pip --version
|
||||
# Install uv for faster pip installs
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
python3 -m pip install uv
|
||||
|
||||
# Workaround for https://github.com/openai/triton/issues/2507 and
|
||||
@ -191,14 +191,14 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||
# we need to install torch and torchvision from the nightly builds first,
|
||||
# pytorch will not appear as a vLLM dependency in all of the following steps
|
||||
# after this step
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
|
||||
fi
|
||||
|
||||
# Install vllm wheel first, so that torch etc will be installed.
|
||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
||||
--mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system dist/*.whl --verbose
|
||||
|
||||
# If we need to build FlashInfer wheel before its release:
|
||||
@ -213,7 +213,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
||||
# $ ls dist
|
||||
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
. /etc/environment && \
|
||||
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
|
||||
uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \
|
||||
@ -225,7 +225,7 @@ COPY examples examples
|
||||
# install build dependencies for JIT compilation.
|
||||
# TODO: Remove this once FlashInfer AOT wheel is fixed
|
||||
COPY requirements-build.txt requirements-build.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements-build.txt
|
||||
|
||||
#################### vLLM installation IMAGE ####################
|
||||
@ -238,15 +238,15 @@ FROM vllm-base AS test
|
||||
ADD . /vllm-workspace/
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements-dev.txt
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -e tests/vllm_test_utils
|
||||
|
||||
# enable fast downloads from hf (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system hf_transfer
|
||||
ENV HF_HUB_ENABLE_HF_TRANSFER 1
|
||||
|
||||
@ -266,7 +266,7 @@ RUN mv vllm test_docs/
|
||||
FROM vllm-base AS vllm-openai-base
|
||||
|
||||
# install additional dependencies for openai api server
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
||||
else \
|
||||
|
||||
@ -46,6 +46,12 @@ def run_vllm(requests: List[SampleRequest],
|
||||
warmup: bool = False) -> float:
|
||||
from vllm import LLM, SamplingParams
|
||||
llm = LLM(**vars(engine_args))
|
||||
assert all(
|
||||
llm.llm_engine.model_config.max_model_len >= (
|
||||
request.prompt_len + request.expected_output_len)
|
||||
for request in requests), (
|
||||
"Please ensure that max_model_len is greater than the sum of"
|
||||
" prompt_len and expected_output_len for all requests.")
|
||||
|
||||
# Add the requests to the engine.
|
||||
prompts: List[str] = []
|
||||
@ -115,6 +121,13 @@ async def run_vllm_async(
|
||||
async with build_async_engine_client_from_engine_args(
|
||||
engine_args, disable_frontend_multiprocessing) as llm:
|
||||
|
||||
assert all(
|
||||
llm.model_config.max_model_len >= (request.prompt_len +
|
||||
request.expected_output_len)
|
||||
for request in requests), (
|
||||
"Please ensure that max_model_len is greater than the sum of"
|
||||
" prompt_len and expected_output_len for all requests.")
|
||||
|
||||
# Add the requests to the engine.
|
||||
prompts: List[str] = []
|
||||
sampling_params: List[SamplingParams] = []
|
||||
|
||||
@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from benchmark_utils import convert_to_pytorch_benchmark_format
|
||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||
from tqdm import tqdm
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
@ -30,8 +30,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||
for k in ["avg_latency", "percentiles"]})
|
||||
if pt_records:
|
||||
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
|
||||
with open(pt_file, "w") as f:
|
||||
json.dump(pt_records, f)
|
||||
write_to_json(pt_file, pt_records)
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
@ -42,6 +41,10 @@ def main(args: argparse.Namespace):
|
||||
# NOTE(woosuk): If the request cannot be processed in a single batch,
|
||||
# the engine will automatically process the request in multiple batches.
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
assert llm.llm_engine.model_config.max_model_len >= (
|
||||
args.input_len +
|
||||
args.output_len), ("Please ensure that max_model_len is greater than"
|
||||
" the sum of input_len and output_len.")
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
n=args.n,
|
||||
|
||||
@ -13,6 +13,11 @@ from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
#Select a equi-probable random priority
|
||||
def get_random_flag():
|
||||
return 0 if random.random() < 0.5 else 1
|
||||
|
||||
|
||||
def sample_requests(
|
||||
dataset_path: str,
|
||||
num_requests: int,
|
||||
@ -55,8 +60,7 @@ def sample_requests(
|
||||
# Prune too long sequences.
|
||||
continue
|
||||
|
||||
#Select a equi-probable random priority
|
||||
priority = 0 if random.random() < 0.5 else 1
|
||||
priority = get_random_flag()
|
||||
|
||||
filtered_dataset.append((prompt, prompt_len, output_len, priority))
|
||||
|
||||
@ -71,6 +75,12 @@ def run_vllm(
|
||||
from vllm import LLM, SamplingParams
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
|
||||
assert all(
|
||||
llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
|
||||
for request in requests), (
|
||||
"Please ensure that max_model_len is greater than the sum of"
|
||||
" input_len and output_len for all requests.")
|
||||
|
||||
# Add the requests to the engine.
|
||||
prompts = []
|
||||
sampling_params = []
|
||||
@ -103,8 +113,8 @@ def main(args: argparse.Namespace):
|
||||
if args.dataset is None:
|
||||
# Synthesize a prompt with the given input length.
|
||||
prompt = "hi" * (args.input_len - 1)
|
||||
requests = [(prompt, args.input_len, args.output_len)
|
||||
for _ in range(args.num_prompts)]
|
||||
requests = [(prompt, args.input_len, args.output_len,
|
||||
get_random_flag()) for _ in range(args.num_prompts)]
|
||||
else:
|
||||
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
|
||||
args.output_len)
|
||||
|
||||
@ -56,7 +56,7 @@ try:
|
||||
except ImportError:
|
||||
from argparse import ArgumentParser as FlexibleArgumentParser
|
||||
|
||||
from benchmark_utils import convert_to_pytorch_benchmark_format
|
||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
|
||||
|
||||
@ -841,8 +841,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||
if pt_records:
|
||||
# Don't use json suffix here as we don't want CI to pick it up
|
||||
pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
|
||||
with open(pt_file, "w") as f:
|
||||
json.dump(pt_records, f)
|
||||
write_to_json(pt_file, pt_records)
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
@ -867,18 +866,10 @@ def main(args: argparse.Namespace):
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
trust_remote_code=args.trust_remote_code)
|
||||
|
||||
if args.dataset is not None:
|
||||
warnings.warn(
|
||||
"The '--dataset' argument will be deprecated in the next "
|
||||
"release. Please use '--dataset-name' and "
|
||||
"'--dataset-path' in the future runs.",
|
||||
stacklevel=2)
|
||||
input_requests = sample_sharegpt_requests(
|
||||
dataset_path=args.dataset,
|
||||
num_requests=args.num_prompts,
|
||||
tokenizer=tokenizer,
|
||||
fixed_output_len=args.sharegpt_output_len,
|
||||
)
|
||||
if args.dataset_name is None:
|
||||
raise ValueError(
|
||||
"Please specify '--dataset-name' and the corresponding "
|
||||
"'--dataset-path' if required.")
|
||||
|
||||
elif args.dataset_name == "sharegpt":
|
||||
input_requests = sample_sharegpt_requests(
|
||||
@ -1052,13 +1043,6 @@ if __name__ == "__main__":
|
||||
default="/v1/completions",
|
||||
help="API endpoint.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the ShareGPT dataset, will be deprecated in the "
|
||||
"next release.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset-name",
|
||||
type=str,
|
||||
|
||||
@ -9,7 +9,7 @@ On the server side, run one of the following commands:
|
||||
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
|
||||
|
||||
On the client side, run:
|
||||
python benchmarks/benchmark_serving.py \
|
||||
python benchmarks/benchmark_serving_guided.py \
|
||||
--backend <backend> \
|
||||
--model <your_model> \
|
||||
--dataset json \
|
||||
@ -31,7 +31,7 @@ import random
|
||||
import time
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from typing import AsyncGenerator, List, Optional, Tuple
|
||||
from typing import AsyncGenerator, Dict, List, Optional, Tuple
|
||||
|
||||
import datasets
|
||||
import numpy as np
|
||||
@ -264,6 +264,7 @@ def calculate_metrics(
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
selected_percentile_metrics: List[str],
|
||||
selected_percentiles: List[float],
|
||||
goodput_config_dict: Optional[Dict[str, float]] = None,
|
||||
) -> Tuple[BenchmarkMetrics, List[int]]:
|
||||
actual_output_lens: List[int] = []
|
||||
total_input = 0
|
||||
@ -287,10 +288,10 @@ def calculate_metrics(
|
||||
total_input += input_requests[i].prompt_len
|
||||
tpot = 0
|
||||
if output_len > 1:
|
||||
tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
|
||||
1)
|
||||
latency_minus_ttft = outputs[i].latency - outputs[i].ttft
|
||||
tpot = latency_minus_ttft / (output_len - 1)
|
||||
tpots.append(tpot)
|
||||
outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0
|
||||
outputs[i].tpot = tpot
|
||||
# Note: if output_len <= 1, we regard tpot as 0 for goodput
|
||||
all_tpots.append(tpot)
|
||||
itls += outputs[i].itl
|
||||
@ -300,6 +301,28 @@ def calculate_metrics(
|
||||
else:
|
||||
actual_output_lens.append(0)
|
||||
|
||||
if goodput_config_dict:
|
||||
valid_metrics = []
|
||||
slo_values = []
|
||||
|
||||
if "ttft" in goodput_config_dict:
|
||||
valid_metrics.append(ttfts)
|
||||
slo_values.append(goodput_config_dict["ttft"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
if "tpot" in goodput_config_dict:
|
||||
valid_metrics.append(all_tpots)
|
||||
slo_values.append(goodput_config_dict["tpot"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
if "e2el" in goodput_config_dict:
|
||||
valid_metrics.append(e2els)
|
||||
slo_values.append(goodput_config_dict["e2el"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
|
||||
for req_metric in zip(*valid_metrics):
|
||||
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
|
||||
if is_good_req:
|
||||
good_completed += 1
|
||||
|
||||
if completed == 0:
|
||||
warnings.warn(
|
||||
"All requests failed. This is likely due to a misconfiguration "
|
||||
@ -356,6 +379,7 @@ async def benchmark(
|
||||
max_concurrency: Optional[int],
|
||||
guided_decoding_ratio: float,
|
||||
guided_decoding_backend: str,
|
||||
goodput_config_dict: Optional[Dict[str, float]] = None,
|
||||
):
|
||||
if backend in ASYNC_REQUEST_FUNCS:
|
||||
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||
@ -483,6 +507,7 @@ async def benchmark(
|
||||
tokenizer=tokenizer,
|
||||
selected_percentile_metrics=selected_percentile_metrics,
|
||||
selected_percentiles=selected_percentiles,
|
||||
goodput_config_dict=goodput_config_dict,
|
||||
)
|
||||
|
||||
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
|
||||
@ -494,6 +519,9 @@ async def benchmark(
|
||||
metrics.total_output))
|
||||
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
|
||||
metrics.request_throughput))
|
||||
if goodput_config_dict:
|
||||
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
|
||||
metrics.request_goodput))
|
||||
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
|
||||
metrics.output_throughput))
|
||||
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
|
||||
@ -617,6 +645,40 @@ def evaluate(ret, args):
|
||||
100) if len(not_none_scores) > 0 else None
|
||||
|
||||
|
||||
def parse_goodput(slo_pairs):
|
||||
goodput_config_dict = {}
|
||||
try:
|
||||
for slo_pair in slo_pairs:
|
||||
slo_name, slo_val = slo_pair.split(":")
|
||||
goodput_config_dict[slo_name] = float(slo_val)
|
||||
except ValueError as err:
|
||||
raise argparse.ArgumentTypeError(
|
||||
"Invalid format found for service level objectives. "
|
||||
"Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||
"pairs, where the key is a metric name, and the value is a "
|
||||
"number in milliseconds.") from err
|
||||
return goodput_config_dict
|
||||
|
||||
|
||||
def check_goodput_args(args):
|
||||
goodput_config_dict = {}
|
||||
VALID_NAMES = ["ttft", "tpot", "e2el"]
|
||||
if args.goodput:
|
||||
goodput_config_dict = parse_goodput(args.goodput)
|
||||
for slo_name, slo_val in goodput_config_dict.items():
|
||||
if slo_name not in VALID_NAMES:
|
||||
raise ValueError(
|
||||
f"Invalid metric name found, {slo_name}: {slo_val}. "
|
||||
"The service level objective name should be one of "
|
||||
f"{str(VALID_NAMES)}. ")
|
||||
if slo_val < 0:
|
||||
raise ValueError(
|
||||
f"Invalid value found, {slo_name}: {slo_val}. "
|
||||
"The service level objective value should be "
|
||||
"non-negative.")
|
||||
return goodput_config_dict
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
random.seed(args.seed)
|
||||
@ -661,6 +723,8 @@ def main(args: argparse.Namespace):
|
||||
|
||||
input_requests = sample_requests(tokenizer, args)
|
||||
|
||||
goodput_config_dict = check_goodput_args(args)
|
||||
|
||||
benchmark_result, ret = asyncio.run(
|
||||
benchmark(
|
||||
backend=backend,
|
||||
@ -681,6 +745,7 @@ def main(args: argparse.Namespace):
|
||||
max_concurrency=args.max_concurrency,
|
||||
guided_decoding_ratio=args.guided_decoding_ratio,
|
||||
guided_decoding_backend=args.guided_decoding_backend,
|
||||
goodput_config_dict=goodput_config_dict,
|
||||
))
|
||||
|
||||
# Save config and results to json
|
||||
@ -865,6 +930,18 @@ if __name__ == "__main__":
|
||||
"Default value is \"99\". "
|
||||
"Use \"--percentile-metrics\" to select metrics.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--goodput",
|
||||
nargs="+",
|
||||
required=False,
|
||||
help="Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||
"pairs, where the key is a metric name, and the value is in "
|
||||
"milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
|
||||
"separated by spaces. Allowed request level metric names are "
|
||||
"\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
|
||||
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
|
||||
"and the blog: https://hao-ai-lab.github.io/blogs/distserve")
|
||||
|
||||
parser.add_argument("--no-guided-decoding",
|
||||
action='store_true',
|
||||
default=False,
|
||||
|
||||
@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
import uvloop
|
||||
from benchmark_utils import convert_to_pytorch_benchmark_format
|
||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||
from PIL import Image
|
||||
from tqdm import tqdm
|
||||
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
||||
@ -171,7 +171,12 @@ def run_vllm(
|
||||
) -> float:
|
||||
from vllm import LLM, SamplingParams
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
|
||||
assert all(
|
||||
llm.llm_engine.model_config.max_model_len >= (
|
||||
request.prompt_len + request.expected_output_len)
|
||||
for request in requests), (
|
||||
"Please ensure that max_model_len is greater than the sum of"
|
||||
" prompt_len and expected_output_len for all requests.")
|
||||
# Add the requests to the engine.
|
||||
prompts: List[TextPrompt] = []
|
||||
sampling_params: List[SamplingParams] = []
|
||||
@ -229,6 +234,12 @@ async def run_vllm_async(
|
||||
|
||||
async with build_async_engine_client_from_engine_args(
|
||||
engine_args, disable_frontend_multiprocessing) as llm:
|
||||
assert all(
|
||||
llm.model_config.max_model_len >= (request.prompt_len +
|
||||
request.expected_output_len)
|
||||
for request in requests), (
|
||||
"Please ensure that max_model_len is greater than the sum of"
|
||||
" prompt_len and expected_output_len for all requests.")
|
||||
|
||||
# Add the requests to the engine.
|
||||
prompts: List[TextPrompt] = []
|
||||
@ -355,8 +366,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||
if pt_records:
|
||||
# Don't use json suffix here as we don't want CI to pick it up
|
||||
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
|
||||
with open(pt_file, "w") as f:
|
||||
json.dump(pt_records, f)
|
||||
write_to_json(pt_file, pt_records)
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
from typing import Any, Dict, List
|
||||
|
||||
@ -34,6 +36,34 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||
"extra_info": extra_info,
|
||||
},
|
||||
}
|
||||
|
||||
tp = record["benchmark"]["extra_info"]["args"].get(
|
||||
"tensor_parallel_size")
|
||||
# Save tensor_parallel_size parameter if it's part of the metadata
|
||||
if not tp and "tensor_parallel_size" in extra_info:
|
||||
record["benchmark"]["extra_info"]["args"][
|
||||
"tensor_parallel_size"] = extra_info["tensor_parallel_size"]
|
||||
|
||||
records.append(record)
|
||||
|
||||
return records
|
||||
|
||||
|
||||
class InfEncoder(json.JSONEncoder):
|
||||
|
||||
def clear_inf(self, o: Any):
|
||||
if isinstance(o, dict):
|
||||
return {k: self.clear_inf(v) for k, v in o.items()}
|
||||
elif isinstance(o, list):
|
||||
return [self.clear_inf(v) for v in o]
|
||||
elif isinstance(o, float) and math.isinf(o):
|
||||
return "inf"
|
||||
return o
|
||||
|
||||
def iterencode(self, o: Any, *args, **kwargs) -> Any:
|
||||
return super().iterencode(self.clear_inf(o), *args, **kwargs)
|
||||
|
||||
|
||||
def write_to_json(filename: str, records: List) -> None:
|
||||
with open(filename, "w") as f:
|
||||
json.dump(records, f, cls=InfEncoder)
|
||||
|
||||
@ -89,7 +89,7 @@ def make_prompt_lora_mapping(num_prompts: int, num_active_loras: int,
|
||||
sort_by_lora_id: bool,
|
||||
device: str) -> torch.Tensor:
|
||||
"""
|
||||
All prompts are mapped to a Lora ID in range [0, num_active_loras).
|
||||
All prompts are mapped to a LoRA ID in range [0, num_active_loras).
|
||||
where 0 refers to first lora, 1 refers to second lora and so on.
|
||||
"""
|
||||
assert num_active_loras > 0
|
||||
|
||||
@ -468,7 +468,8 @@ def main(args: argparse.Namespace):
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.intermediate_size
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
elif config.architectures[0] == "DeepseekV3ForCausalLM":
|
||||
elif (config.architectures[0] == "DeepseekV3ForCausalLM"
|
||||
or config.architectures[0] == "DeepseekV2ForCausalLM"):
|
||||
E = config.n_routed_experts
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.moe_intermediate_size
|
||||
|
||||
66
cmake/external_projects/flashmla.cmake
Normal file
66
cmake/external_projects/flashmla.cmake
Normal file
@ -0,0 +1,66 @@
|
||||
include(FetchContent)
|
||||
|
||||
# If FLASH_MLA_SRC_DIR is set, flash-mla is installed from that directory
|
||||
# instead of downloading.
|
||||
# It can be set as an environment variable or passed as a cmake argument.
|
||||
# The environment variable takes precedence.
|
||||
if (DEFINED ENV{FLASH_MLA_SRC_DIR})
|
||||
set(FLASH_MLA_SRC_DIR $ENV{FLASH_MLA_SRC_DIR})
|
||||
endif()
|
||||
|
||||
if(FLASH_MLA_SRC_DIR)
|
||||
FetchContent_Declare(
|
||||
flashmla
|
||||
SOURCE_DIR ${FLASH_MLA_SRC_DIR}
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
)
|
||||
else()
|
||||
FetchContent_Declare(
|
||||
flashmla
|
||||
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
|
||||
GIT_TAG 575f7724b9762f265bbee5889df9c7d630801845
|
||||
GIT_PROGRESS TRUE
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
FetchContent_MakeAvailable(flashmla)
|
||||
message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
|
||||
|
||||
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
|
||||
# Only build FlashMLA kernels if we are building for something compatible with
|
||||
# sm90a
|
||||
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
|
||||
set(FlashMLA_SOURCES
|
||||
${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
|
||||
${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_bf16_sm90.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_fp16_sm90.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_metadata.cu)
|
||||
|
||||
set(FlashMLA_INCLUDES
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/include
|
||||
${flashmla_SOURCE_DIR}/csrc/include)
|
||||
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${FlashMLA_SOURCES}"
|
||||
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
|
||||
|
||||
define_gpu_extension_target(
|
||||
_flashmla_C
|
||||
DESTINATION vllm
|
||||
LANGUAGE ${VLLM_GPU_LANG}
|
||||
SOURCES ${FlashMLA_SOURCES}
|
||||
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
||||
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
||||
INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
|
||||
USE_SABI 3
|
||||
WITH_SOABI)
|
||||
else()
|
||||
# Create an empty target for setup.py when not targeting sm90a systems
|
||||
add_custom_target(_flashmla_C)
|
||||
endif()
|
||||
|
||||
67
cmake/external_projects/vllm_flash_attn.cmake
Normal file
67
cmake/external_projects/vllm_flash_attn.cmake
Normal file
@ -0,0 +1,67 @@
|
||||
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
|
||||
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
|
||||
# arches in the CUDA case (and instead set the gencodes on a per file basis)
|
||||
# we need to manually set VLLM_GPU_ARCHES here.
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
foreach(_ARCH ${CUDA_ARCHS})
|
||||
string(REPLACE "." "" _ARCH "${_ARCH}")
|
||||
list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
#
|
||||
# Build vLLM flash attention from source
|
||||
#
|
||||
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
|
||||
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs.
|
||||
# They should be identical but if they aren't, this is a massive footgun.
|
||||
#
|
||||
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
|
||||
# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
|
||||
# If no component is specified, vllm-flash-attn is still installed.
|
||||
|
||||
# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
|
||||
# This is to enable local development of vllm-flash-attn within vLLM.
|
||||
# It can be set as an environment variable or passed as a cmake argument.
|
||||
# The environment variable takes precedence.
|
||||
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
|
||||
set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
|
||||
endif()
|
||||
|
||||
if(VLLM_FLASH_ATTN_SRC_DIR)
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn SOURCE_DIR
|
||||
${VLLM_FLASH_ATTN_SRC_DIR}
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
)
|
||||
else()
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn
|
||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||
GIT_TAG 720c94869cf2e0ff5a706e9c7f1dce0939686ade
|
||||
GIT_PROGRESS TRUE
|
||||
# Don't share the vllm-flash-attn build between build types
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
# Fetch the vllm-flash-attn library
|
||||
FetchContent_MakeAvailable(vllm-flash-attn)
|
||||
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
|
||||
|
||||
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
|
||||
# case only one is built, in the case both are built redundant work is done)
|
||||
install(
|
||||
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||
DESTINATION vllm_flash_attn
|
||||
COMPONENT _vllm_fa2_C
|
||||
FILES_MATCHING PATTERN "*.py"
|
||||
)
|
||||
|
||||
install(
|
||||
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||
DESTINATION vllm_flash_attn
|
||||
COMPONENT _vllm_fa3_C
|
||||
FILES_MATCHING PATTERN "*.py"
|
||||
)
|
||||
@ -39,3 +39,10 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
|
||||
// Just for unittest
|
||||
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||
const double scale, const std::string& kv_cache_dtype);
|
||||
|
||||
void gather_cache(
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
|
||||
@ -2,6 +2,7 @@
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
|
||||
#include "cuda_utils.h"
|
||||
#include "cuda_compat.h"
|
||||
#include "dispatch_utils.h"
|
||||
|
||||
@ -374,7 +375,7 @@ void reshape_and_cache(
|
||||
torch::Tensor& slot_mapping, // [num_tokens]
|
||||
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
|
||||
torch::Tensor& v_scale) {
|
||||
int num_tokens = key.size(0);
|
||||
int num_tokens = slot_mapping.size(0);
|
||||
int num_heads = key.size(1);
|
||||
int head_size = key.size(2);
|
||||
int block_size = key_cache.size(3);
|
||||
@ -570,3 +571,161 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||
TORCH_CHECK(false, "Unsupported data type: ", kv_cache_dtype);
|
||||
}
|
||||
}
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// grid is launched with dimensions (batch, num_splits)
|
||||
template <typename scalar_t>
|
||||
__global__ void gather_cache(
|
||||
const scalar_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
|
||||
// ENTRIES...]
|
||||
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
|
||||
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
||||
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
|
||||
const int32_t block_size, const int32_t entry_size,
|
||||
const int64_t block_table_stride, const int64_t cache_block_stride,
|
||||
const int64_t cache_entry_stride, const int64_t dst_entry_stride,
|
||||
const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per
|
||||
// batch
|
||||
|
||||
const int64_t bid = blockIdx.x; // Batch ID
|
||||
const int32_t num_splits = gridDim.y;
|
||||
const int32_t split = blockIdx.y;
|
||||
const int32_t seq_start = cu_seq_lens[bid];
|
||||
const int32_t seq_end = cu_seq_lens[bid + 1];
|
||||
const int32_t seq_len = seq_end - seq_start;
|
||||
const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size);
|
||||
const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits);
|
||||
|
||||
const int32_t split_start = split * split_blocks;
|
||||
const int32_t split_end = min((split + 1) * split_blocks, tot_blocks);
|
||||
|
||||
const bool is_active_split = (split_start < tot_blocks);
|
||||
const bool is_last_split = (split_end == tot_blocks);
|
||||
|
||||
if (!is_active_split) return;
|
||||
|
||||
int32_t full_blocks_end = split_end;
|
||||
int32_t partial_block_size = 0;
|
||||
|
||||
// Adjust the pointer for the block_table for this batch.
|
||||
// If seq_starts is provided, compute an offset based on (seq_starts[bid] /
|
||||
// page_size)
|
||||
const int32_t batch_offset = bid * block_table_stride;
|
||||
int32_t offset = 0;
|
||||
if (seq_starts != nullptr) {
|
||||
offset = seq_starts[bid] / block_size;
|
||||
}
|
||||
const int32_t* batch_block_table = block_table + batch_offset + offset;
|
||||
|
||||
// Adjust dst pointer based on the cumulative sequence lengths.
|
||||
dst += seq_start * dst_entry_stride;
|
||||
|
||||
if (is_last_split) {
|
||||
partial_block_size = seq_len % block_size;
|
||||
if (partial_block_size) full_blocks_end -= 1;
|
||||
}
|
||||
|
||||
auto copy_entry = [&](const scalar_t* __restrict__ _src,
|
||||
scalar_t* __restrict__ _dst) {
|
||||
for (int i = threadIdx.x; i < entry_size; i += blockDim.x)
|
||||
_dst[i] = _src[i];
|
||||
};
|
||||
|
||||
for (int pid = split_start; pid < full_blocks_end; ++pid) {
|
||||
auto block_id = batch_block_table[pid];
|
||||
auto block_start_ptr = src_cache + block_id * cache_block_stride;
|
||||
auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
|
||||
for (int eid = 0; eid < block_size; ++eid) {
|
||||
copy_entry(block_start_ptr + eid * cache_entry_stride,
|
||||
block_dst_ptr + eid * dst_entry_stride);
|
||||
}
|
||||
}
|
||||
|
||||
if (partial_block_size) {
|
||||
auto block_id = batch_block_table[full_blocks_end];
|
||||
auto block_start_ptr = src_cache + block_id * cache_block_stride;
|
||||
auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride;
|
||||
for (int eid = 0; eid < partial_block_size; ++eid) {
|
||||
copy_entry(block_start_ptr + eid * cache_entry_stride,
|
||||
block_dst_ptr + eid * dst_entry_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
// Macro to dispatch the kernel based on the data type.
|
||||
#define CALL_GATHER_CACHE(CPY_DTYPE) \
|
||||
vllm::gather_cache<CPY_DTYPE><<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<CPY_DTYPE*>(src_cache.data_ptr()), \
|
||||
reinterpret_cast<CPY_DTYPE*>(dst.data_ptr()), \
|
||||
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
|
||||
block_size, entry_size, block_table_stride, cache_block_stride, \
|
||||
cache_entry_stride, dst_entry_stride, seq_starts_ptr);
|
||||
|
||||
// Gather sequences from the cache into the destination tensor.
|
||||
// - cu_seq_lens contains the cumulative sequence lengths for each batch
|
||||
// - block_table contains the cache block indices for each sequence
|
||||
// - Optionally, seq_starts (if provided) offsets the starting block index by
|
||||
// (seq_starts[bid] / page_size)
|
||||
void gather_cache(
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
int64_t batch_size,
|
||||
std::optional<torch::Tensor> seq_starts = std::nullopt) {
|
||||
at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
int32_t block_size = src_cache.size(1);
|
||||
int32_t entry_size = src_cache.flatten(2, -1).size(2);
|
||||
|
||||
TORCH_CHECK(block_table.dtype() == torch::kInt32,
|
||||
"block_table must be int32");
|
||||
TORCH_CHECK(cu_seq_lens.dtype() == torch::kInt32,
|
||||
"cu_seq_lens must be int32");
|
||||
if (seq_starts.has_value()) {
|
||||
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
|
||||
"seq_starts must be int32");
|
||||
}
|
||||
|
||||
TORCH_CHECK(src_cache.device() == dst.device(),
|
||||
"src_cache and dst must be on the same device");
|
||||
TORCH_CHECK(src_cache.device() == block_table.device(),
|
||||
"src_cache and block_table must be on the same device");
|
||||
TORCH_CHECK(src_cache.device() == cu_seq_lens.device(),
|
||||
"src_cache and cu_seq_lens must be on the same device");
|
||||
if (seq_starts.has_value()) {
|
||||
TORCH_CHECK(src_cache.device() == seq_starts.value().device(),
|
||||
"src_cache and seq_starts must be on the same device");
|
||||
}
|
||||
|
||||
int64_t block_table_stride = block_table.stride(0);
|
||||
int64_t cache_block_stride = src_cache.stride(0);
|
||||
int64_t cache_entry_stride = src_cache.stride(1);
|
||||
int64_t dst_entry_stride = dst.stride(0);
|
||||
|
||||
// Decide on the number of splits based on the batch size.
|
||||
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
|
||||
dim3 grid(batch_size, num_splits);
|
||||
dim3 block(1024);
|
||||
|
||||
TORCH_CHECK(src_cache.dtype() == dst.dtype(),
|
||||
"src_cache and dst must have the same dtype");
|
||||
|
||||
const int dtype_bits = src_cache.element_size() * 8;
|
||||
const int32_t* seq_starts_ptr =
|
||||
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
|
||||
|
||||
if (dtype_bits == 32) {
|
||||
CALL_GATHER_CACHE(uint32_t);
|
||||
} else if (dtype_bits == 16) {
|
||||
CALL_GATHER_CACHE(uint16_t);
|
||||
} else if (dtype_bits == 8) {
|
||||
CALL_GATHER_CACHE(uint8_t);
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
|
||||
}
|
||||
}
|
||||
|
||||
@ -7,8 +7,3 @@ inline constexpr uint32_t next_pow_2(uint32_t const num) {
|
||||
if (num <= 1) return num;
|
||||
return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline constexpr std::enable_if_t<std::is_integral_v<T>, T> ceil_div(T a, T b) {
|
||||
return (a + b - 1) / b;
|
||||
}
|
||||
@ -2,10 +2,14 @@
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
|
||||
#define HOST_DEVICE_INLINE __forceinline__ __host__ __device__
|
||||
#define DEVICE_INLINE __forceinline__ __device__
|
||||
#define HOST_INLINE __forceinline__ __host__
|
||||
#if defined(__HIPCC__)
|
||||
#define HOST_DEVICE_INLINE __host__ __device__
|
||||
#define DEVICE_INLINE __device__
|
||||
#define HOST_INLINE __host__
|
||||
#elif defined(__CUDACC__) || defined(_NVHPC_CUDA)
|
||||
#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
|
||||
#define DEVICE_INLINE __device__ __forceinline__
|
||||
#define HOST_INLINE __host__ __forceinline__
|
||||
#else
|
||||
#define HOST_DEVICE_INLINE inline
|
||||
#define DEVICE_INLINE inline
|
||||
@ -25,3 +29,13 @@
|
||||
int64_t get_device_attribute(int64_t attribute, int64_t device_id);
|
||||
|
||||
int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);
|
||||
|
||||
namespace cuda_utils {
|
||||
|
||||
template <typename T>
|
||||
HOST_DEVICE_INLINE constexpr std::enable_if_t<std::is_integral_v<T>, T>
|
||||
ceil_div(T a, T b) {
|
||||
return (a + b - 1) / b;
|
||||
}
|
||||
|
||||
}; // namespace cuda_utils
|
||||
@ -122,8 +122,8 @@ struct ScaledEpilogue
|
||||
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
|
||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||
|
||||
typename EVTCompute0::Arguments evt0_args{b_args};
|
||||
return ArgumentType{a_args, evt0_args};
|
||||
typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
|
||||
return ArgumentType{a_args, evt0_args, {}};
|
||||
}
|
||||
};
|
||||
|
||||
@ -167,8 +167,8 @@ struct ScaledEpilogueBias
|
||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
|
||||
|
||||
typename EVTCompute0::Arguments evt0_args{b_args};
|
||||
return ArgumentType{a_args, evt0_args, bias_args};
|
||||
typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
|
||||
return ArgumentType{a_args, evt0_args, bias_args, {}};
|
||||
}
|
||||
};
|
||||
|
||||
@ -230,9 +230,10 @@ struct ScaledEpilogueBiasAzp
|
||||
auto azp_adj_args =
|
||||
SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
|
||||
|
||||
typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args};
|
||||
typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args};
|
||||
return ArgumentType{a_args, evt_scale_b_args, bias_args};
|
||||
typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args, {}};
|
||||
typename EVTComputeScaleB::Arguments evt_scale_b_args{
|
||||
b_args, evt_azp_args, {}};
|
||||
return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
|
||||
}
|
||||
};
|
||||
|
||||
@ -309,11 +310,12 @@ struct ScaledEpilogueBiasAzpToken
|
||||
auto azp_adj_args =
|
||||
SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
|
||||
|
||||
typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args};
|
||||
typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args};
|
||||
typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args};
|
||||
return ArgumentType{a_args, evt_scale_b_args, bias_args};
|
||||
typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args, {}};
|
||||
typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args, {}};
|
||||
typename EVTComputeScaleB::Arguments evt_scale_b_args{
|
||||
b_args, evt_acc_args, {}};
|
||||
return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
|
||||
}
|
||||
};
|
||||
|
||||
}; // namespace vllm::c2x
|
||||
}; // namespace vllm::c2x
|
||||
|
||||
@ -146,8 +146,8 @@ struct ScaledEpilogue
|
||||
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
|
||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||
|
||||
typename EVTCompute0::Arguments evt0_args{b_args};
|
||||
return ArgumentType{a_args, evt0_args};
|
||||
typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
|
||||
return ArgumentType{a_args, evt0_args, {}};
|
||||
}
|
||||
};
|
||||
|
||||
@ -193,8 +193,8 @@ struct ScaledEpilogueBias
|
||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
|
||||
|
||||
typename EVTCompute0::Arguments evt0_args{b_args};
|
||||
return ArgumentType{a_args, evt0_args, bias_args};
|
||||
typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
|
||||
return ArgumentType{a_args, evt0_args, bias_args, {}};
|
||||
}
|
||||
};
|
||||
|
||||
@ -236,8 +236,8 @@ struct ScaledEpilogueColumnBias
|
||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
|
||||
|
||||
typename EVTCompute0::Arguments evt0_args{b_args};
|
||||
return ArgumentType{a_args, evt0_args, bias_args};
|
||||
typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
|
||||
return ArgumentType{a_args, evt0_args, bias_args, {}};
|
||||
}
|
||||
};
|
||||
|
||||
@ -297,9 +297,10 @@ struct ScaledEpilogueBiasAzp
|
||||
auto azp_adj_args =
|
||||
SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
|
||||
|
||||
typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args};
|
||||
typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args};
|
||||
return ArgumentType{a_args, evt_scale_b_args, bias_args};
|
||||
typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args, {}};
|
||||
typename EVTComputeScaleB::Arguments evt_scale_b_args{
|
||||
b_args, evt_azp_args, {}};
|
||||
return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
|
||||
}
|
||||
};
|
||||
|
||||
@ -374,10 +375,11 @@ struct ScaledEpilogueBiasAzpToken
|
||||
auto azp_adj_args =
|
||||
SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
|
||||
|
||||
typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args};
|
||||
typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args};
|
||||
typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args};
|
||||
return ArgumentType{a_args, evt_scale_b_args, bias_args};
|
||||
typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args, {}};
|
||||
typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args, {}};
|
||||
typename EVTComputeScaleB::Arguments evt_scale_b_args{
|
||||
b_args, evt_acc_args, {}};
|
||||
return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -152,6 +152,11 @@ torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type,
|
||||
int64_t row);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A,
|
||||
torch::Tensor const& B, torch::Tensor const& A_sf,
|
||||
torch::Tensor const& B_sf,
|
||||
torch::Tensor const& alpha);
|
||||
|
||||
bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
|
||||
bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability);
|
||||
|
||||
|
||||
@ -30,12 +30,18 @@ static inline cute::Shape<int, int, int, int> get_problem_shape(
|
||||
}
|
||||
|
||||
template <typename GemmKernel>
|
||||
void cutlass_gemm_caller(torch::Device device,
|
||||
cute::Shape<int, int, int, int> prob_shape,
|
||||
typename GemmKernel::MainloopArguments mainloop_args,
|
||||
typename GemmKernel::EpilogueArguments epilogue_args) {
|
||||
void cutlass_gemm_caller(
|
||||
torch::Device device, cute::Shape<int, int, int, int> prob_shape,
|
||||
typename GemmKernel::MainloopArguments mainloop_args,
|
||||
typename GemmKernel::EpilogueArguments epilogue_args,
|
||||
typename GemmKernel::TileSchedulerArguments scheduler = {}) {
|
||||
cutlass::KernelHardwareInfo hw_info;
|
||||
typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm,
|
||||
prob_shape, mainloop_args, epilogue_args};
|
||||
prob_shape,
|
||||
mainloop_args,
|
||||
epilogue_args,
|
||||
hw_info,
|
||||
scheduler};
|
||||
|
||||
// Launch the CUTLASS GEMM kernel.
|
||||
using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
|
||||
@ -22,8 +22,9 @@ namespace vllm {
|
||||
|
||||
using namespace cute;
|
||||
|
||||
template <typename OutType, int GroupSizeM_, int GroupSizeN_, int GroupSizeK_,
|
||||
int TileSizeM_ = 128, class ClusterShape = Shape<_1, _2, _1>>
|
||||
template <typename SchedulerType, typename OutType, int GroupSizeM_,
|
||||
int GroupSizeN_, int GroupSizeK_, int TileSizeM_ = 128,
|
||||
class ClusterShape = Shape<_1, _2, _1>>
|
||||
struct cutlass_3x_gemm_fp8_blockwise {
|
||||
using GroupSizeM = Int<GroupSizeM_>;
|
||||
using GroupSizeN = Int<GroupSizeN_>;
|
||||
@ -84,7 +85,7 @@ struct cutlass_3x_gemm_fp8_blockwise {
|
||||
|
||||
using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
|
||||
cutlass::gemm::PersistentScheduler>>;
|
||||
SchedulerType>>;
|
||||
|
||||
struct GemmKernel : public KernelType {};
|
||||
|
||||
@ -150,8 +151,24 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
|
||||
typename GemmKernel::EpilogueArguments epilogue_args{
|
||||
{}, c_ptr, c_stride, c_ptr, c_stride};
|
||||
|
||||
typename GemmKernel::TileSchedulerArguments scheduler;
|
||||
|
||||
static constexpr bool UsesStreamKScheduler =
|
||||
cute::is_same_v<typename GemmKernel::TileSchedulerTag,
|
||||
cutlass::gemm::StreamKScheduler>;
|
||||
|
||||
if constexpr (UsesStreamKScheduler) {
|
||||
using DecompositionMode = typename cutlass::gemm::kernel::detail::
|
||||
PersistentTileSchedulerSm90StreamKParams::DecompositionMode;
|
||||
using ReductionMode = typename cutlass::gemm::kernel::detail::
|
||||
PersistentTileSchedulerSm90StreamKParams::ReductionMode;
|
||||
|
||||
scheduler.decomposition_mode = DecompositionMode::StreamK;
|
||||
scheduler.reduction_mode = ReductionMode::Nondeterministic;
|
||||
}
|
||||
|
||||
c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
|
||||
epilogue_args);
|
||||
epilogue_args, scheduler);
|
||||
}
|
||||
|
||||
template <typename OutType>
|
||||
@ -160,9 +177,18 @@ void cutlass_gemm_blockwise_sm90_fp8_dispatch(torch::Tensor& out,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales) {
|
||||
cutlass_gemm_caller_blockwise<
|
||||
cutlass_3x_gemm_fp8_blockwise<OutType, 1, 128, 128>>(out, a, b, a_scales,
|
||||
b_scales);
|
||||
auto k = a.size(1);
|
||||
auto n = b.size(1);
|
||||
|
||||
if (k > 3 * n) {
|
||||
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
||||
cutlass::gemm::StreamKScheduler, OutType, 1, 128, 128>>(
|
||||
out, a, b, a_scales, b_scales);
|
||||
} else {
|
||||
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
||||
cutlass::gemm::PersistentScheduler, OutType, 1, 128, 128>>(
|
||||
out, a, b, a_scales, b_scales);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
@ -1,7 +1,7 @@
|
||||
#include <cudaTypedefs.h>
|
||||
#include "c3x/scaled_mm_kernels.hpp"
|
||||
|
||||
#include "core/math.hpp"
|
||||
#include "cuda_utils.h"
|
||||
|
||||
/*
|
||||
This file defines quantized GEMM operations using the CUTLASS 3.x API, for
|
||||
@ -33,7 +33,8 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
|
||||
auto make_group_shape = [](torch::Tensor const& x,
|
||||
torch::Tensor const& s) -> GroupShape {
|
||||
TORCH_CHECK(s.dim() == 2, "cutlass_scaled_mm group scales must be 2D");
|
||||
return {ceil_div(x.size(0), s.size(0)), ceil_div(x.size(1), s.size(1))};
|
||||
return {cuda_utils::ceil_div(x.size(0), s.size(0)),
|
||||
cuda_utils::ceil_div(x.size(1), s.size(1))};
|
||||
};
|
||||
|
||||
GroupShape a_scale_group_shape = make_group_shape(a, a_scales);
|
||||
|
||||
@ -348,10 +348,7 @@ void scaled_fp4_quant_sm100a(torch::Tensor const& output,
|
||||
auto sf_out = static_cast<int32_t*>(output_sf.data_ptr());
|
||||
auto output_ptr = static_cast<int64_t*>(output.data_ptr());
|
||||
at::cuda::CUDAGuard device_guard{(char)input.get_device()};
|
||||
auto stream = at::cuda::getStreamFromPool(false, input.get_device());
|
||||
if (stream == nullptr) {
|
||||
std::cerr << "Warning: Null CUDA stream" << std::endl;
|
||||
}
|
||||
auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
|
||||
|
||||
// We don't support e8m0 scales at this moment.
|
||||
bool useUE8M0 = false;
|
||||
|
||||
38
csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu
Normal file
38
csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu
Normal file
@ -0,0 +1,38 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <torch/all.h>
|
||||
|
||||
#if defined ENABLE_NVFP4 && ENABLE_NVFP4
|
||||
void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A,
|
||||
torch::Tensor const& B,
|
||||
torch::Tensor const& A_sf,
|
||||
torch::Tensor const& B_sf,
|
||||
torch::Tensor const& alpha);
|
||||
#endif
|
||||
|
||||
void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A,
|
||||
torch::Tensor const& B, torch::Tensor const& A_sf,
|
||||
torch::Tensor const& B_sf,
|
||||
torch::Tensor const& alpha) {
|
||||
#if defined ENABLE_NVFP4 && ENABLE_NVFP4
|
||||
return cutlass_scaled_fp4_mm_sm100a(D, A, B, A_sf, B_sf, alpha);
|
||||
#endif
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false,
|
||||
"No compiled nvfp4 mm kernel, vLLM should "
|
||||
"be compiled using CUDA 12.8 and target "
|
||||
"compute capability 100 or above.");
|
||||
}
|
||||
281
csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu
Normal file
281
csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu
Normal file
@ -0,0 +1,281 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <torch/all.h>
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
|
||||
#include "cutlass_extensions/common.hpp"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
|
||||
#include "cutlass/util/packed_stride.hpp"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
// Kernel Perf config
|
||||
template <typename T>
|
||||
struct KernelTraits;
|
||||
|
||||
template <>
|
||||
struct KernelTraits<float> {
|
||||
using MmaTileShape = Shape<_128, _128, _256>;
|
||||
using ClusterShape = Shape<_1, _1, _1>;
|
||||
using PerSmTileShape_MNK = Shape<_128, _128, _256>;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct KernelTraits<cutlass::half_t> {
|
||||
using MmaTileShape = Shape<_256, _256, _256>;
|
||||
using ClusterShape = Shape<_4, _4, _1>;
|
||||
using PerSmTileShape_MNK = Shape<_128, _256, _256>;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct KernelTraits<cutlass::bfloat16_t> {
|
||||
using MmaTileShape = Shape<_256, _256, _256>;
|
||||
using ClusterShape = Shape<_4, _4, _1>;
|
||||
using PerSmTileShape_MNK = Shape<_128, _256, _256>;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct Fp4GemmSm100 {
|
||||
// A matrix configuration
|
||||
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
|
||||
using LayoutATag = cutlass::layout::RowMajor;
|
||||
static constexpr int AlignmentA = 32;
|
||||
|
||||
// B matrix configuration
|
||||
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
|
||||
using LayoutBTag = cutlass::layout::ColumnMajor;
|
||||
static constexpr int AlignmentB = 32;
|
||||
|
||||
// C/D matrix configuration
|
||||
using ElementD = T;
|
||||
using ElementC = T;
|
||||
using LayoutCTag = cutlass::layout::RowMajor;
|
||||
using LayoutDTag = cutlass::layout::RowMajor;
|
||||
static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
|
||||
static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
|
||||
// Kernel functional config
|
||||
using ElementAccumulator = float;
|
||||
using ArchTag = cutlass::arch::Sm100;
|
||||
using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;
|
||||
|
||||
// Kernel Perf config
|
||||
using MmaTileShape = typename KernelTraits<T>::MmaTileShape;
|
||||
using ClusterShape = typename KernelTraits<T>::ClusterShape;
|
||||
using PerSmTileShape_MNK = typename KernelTraits<T>::PerSmTileShape_MNK;
|
||||
|
||||
using CollectiveEpilogue =
|
||||
typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass, PerSmTileShape_MNK, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
|
||||
ElementAccumulator, ElementC, LayoutCTag, AlignmentC, ElementD,
|
||||
LayoutDTag, AlignmentD,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop =
|
||||
typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass, ElementA, LayoutATag, AlignmentA, ElementB,
|
||||
LayoutBTag, AlignmentB, ElementAccumulator, MmaTileShape,
|
||||
ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
|
||||
sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
using StrideA = typename Gemm::GemmKernel::StrideA;
|
||||
using LayoutA = decltype(cute::make_layout(make_shape(0, 0, 0), StrideA{}));
|
||||
using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFA;
|
||||
using StrideB = typename Gemm::GemmKernel::StrideB;
|
||||
using LayoutB = decltype(cute::make_layout(make_shape(0, 0, 0), StrideB{}));
|
||||
using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFB;
|
||||
using StrideC = typename Gemm::GemmKernel::StrideC;
|
||||
using LayoutC = decltype(cute::make_layout(make_shape(0, 0, 0), StrideC{}));
|
||||
using StrideD = typename Gemm::GemmKernel::StrideD;
|
||||
using LayoutD = decltype(cute::make_layout(make_shape(0, 0, 0), StrideD{}));
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
typename T::Gemm::Arguments args_from_options(
|
||||
at::Tensor& D, at::Tensor const& A, at::Tensor const& B,
|
||||
at::Tensor const& A_sf, at::Tensor const& B_sf, at::Tensor const& alpha,
|
||||
int64_t M, int64_t N, int64_t K) {
|
||||
using ElementA = typename T::Gemm::ElementA;
|
||||
using ElementB = typename T::Gemm::ElementB;
|
||||
using ElementSFA = cutlass::float_ue4m3_t;
|
||||
using ElementSFB = cutlass::float_ue4m3_t;
|
||||
using ElementD = typename T::Gemm::ElementD;
|
||||
using ElementCompute = float;
|
||||
using StrideA = typename T::StrideA;
|
||||
using StrideB = typename T::StrideB;
|
||||
using StrideD = typename T::StrideD;
|
||||
using Sm100BlkScaledConfig =
|
||||
typename T::Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;
|
||||
|
||||
int m = static_cast<int>(M);
|
||||
int n = static_cast<int>(N);
|
||||
int k = static_cast<int>(K);
|
||||
auto stride_A = cutlass::make_cute_packed_stride(StrideA{}, {m, k, 1});
|
||||
auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {n, k, 1});
|
||||
auto stride_D = cutlass::make_cute_packed_stride(StrideD{}, {m, n, 1});
|
||||
|
||||
auto layout_SFA = Sm100BlkScaledConfig::tile_atom_to_shape_SFA(
|
||||
cute::make_shape(m, n, k, 1));
|
||||
auto layout_SFB = Sm100BlkScaledConfig::tile_atom_to_shape_SFB(
|
||||
cute::make_shape(m, n, k, 1));
|
||||
|
||||
typename T::Gemm::Arguments arguments{
|
||||
cutlass::gemm::GemmUniversalMode::kGemm,
|
||||
{m, n, k, 1},
|
||||
{// Mainloop arguments
|
||||
static_cast<ElementA const*>(A.data_ptr()), stride_A,
|
||||
static_cast<ElementB const*>(B.data_ptr()), stride_B,
|
||||
static_cast<ElementSFA const*>(A_sf.data_ptr()), layout_SFA,
|
||||
static_cast<ElementSFB const*>(B_sf.data_ptr()), layout_SFB},
|
||||
{ // Epilogue arguments
|
||||
{}, // epilogue.thread
|
||||
static_cast<ElementD const*>(D.data_ptr()),
|
||||
stride_D,
|
||||
static_cast<ElementD*>(D.data_ptr()),
|
||||
stride_D}};
|
||||
auto& fusion_args = arguments.epilogue.thread;
|
||||
fusion_args.alpha_ptr = static_cast<ElementCompute const*>(alpha.data_ptr());
|
||||
return arguments;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void runGemm(at::Tensor& D, at::Tensor const& A, at::Tensor const& B,
|
||||
at::Tensor const& A_sf, at::Tensor const& B_sf,
|
||||
at::Tensor const& alpha, int64_t m, int64_t n, int64_t k,
|
||||
cudaStream_t stream) {
|
||||
typename Fp4GemmSm100<T>::Gemm gemm;
|
||||
|
||||
auto arguments =
|
||||
args_from_options<Fp4GemmSm100<T>>(D, A, B, A_sf, B_sf, alpha, m, n, k);
|
||||
|
||||
size_t workspace_size = Fp4GemmSm100<T>::Gemm::get_workspace_size(arguments);
|
||||
auto const workspace_options =
|
||||
torch::TensorOptions().dtype(torch::kUInt8).device(A.device());
|
||||
auto workspace = torch::empty(workspace_size, workspace_options);
|
||||
|
||||
CUTLASS_CHECK(gemm.can_implement(arguments));
|
||||
|
||||
CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream));
|
||||
|
||||
CUTLASS_CHECK(gemm.run(arguments, workspace.data_ptr(), stream));
|
||||
}
|
||||
#else
|
||||
template <typename T>
|
||||
void runGemm(at::Tensor& D, at::Tensor const& A, at::Tensor const& B,
|
||||
at::Tensor const& A_sf, at::Tensor const& B_sf,
|
||||
at::Tensor const& alpha, int64_t m, int64_t n, int64_t k,
|
||||
cudaStream_t stream) {
|
||||
TORCH_CHECK(false,
|
||||
"Unsupported CUTLASS version. Set VLLM_CUTLASS_SRC_DIR to "
|
||||
"a CUTLASS 3.8 source directory to enable support.");
|
||||
}
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
#define CHECK_TYPE(x, st, m) \
|
||||
TORCH_CHECK(x.scalar_type() == st, "Inconsistency of Tensor type:", m)
|
||||
#define CHECK_TH_CUDA(x, m) TORCH_CHECK(x.is_cuda(), m, "must be a CUDA tensor")
|
||||
#define CHECK_CONTIGUOUS(x, m) \
|
||||
TORCH_CHECK(x.is_contiguous(), m, "must be contiguous")
|
||||
#define CHECK_INPUT(x, st, m) \
|
||||
CHECK_TH_CUDA(x, m); \
|
||||
CHECK_CONTIGUOUS(x, m); \
|
||||
CHECK_TYPE(x, st, m)
|
||||
|
||||
constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
|
||||
constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
|
||||
|
||||
void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A,
|
||||
torch::Tensor const& B,
|
||||
torch::Tensor const& A_sf,
|
||||
torch::Tensor const& B_sf,
|
||||
torch::Tensor const& alpha) {
|
||||
CHECK_INPUT(A, FLOAT4_E2M1X2, "a");
|
||||
CHECK_INPUT(B, FLOAT4_E2M1X2, "b");
|
||||
|
||||
CHECK_INPUT(A_sf, SF_DTYPE, "scale_a");
|
||||
CHECK_INPUT(B_sf, SF_DTYPE, "scale_b");
|
||||
|
||||
CHECK_INPUT(alpha, at::ScalarType::Float, "alpha");
|
||||
|
||||
TORCH_CHECK(A.dim() == 2, "a must be a matrix");
|
||||
TORCH_CHECK(B.dim() == 2, "b must be a matrix");
|
||||
TORCH_CHECK(A.sizes()[1] == B.sizes()[1],
|
||||
"a and b shapes cannot be multiplied (", A.sizes()[0], "x",
|
||||
A.sizes()[1], " and ", B.sizes()[0], "x", B.sizes()[1], ")");
|
||||
|
||||
auto const m = A.sizes()[0];
|
||||
auto const n = B.sizes()[0];
|
||||
auto const k = A.sizes()[1] * 2;
|
||||
|
||||
constexpr int alignment = 32;
|
||||
TORCH_CHECK(k % alignment == 0, "Expected k to be divisible by ", alignment,
|
||||
", but got a shape: (", A.sizes()[0], "x", A.sizes()[1],
|
||||
"), k: ", k, ".");
|
||||
TORCH_CHECK(n % alignment == 0, "Expected n to be divisible by ", alignment,
|
||||
", but got b shape: (", B.sizes()[0], "x", B.sizes()[1], ").");
|
||||
|
||||
auto round_up = [](int x, int y) { return (x + y - 1) / y * y; };
|
||||
int rounded_m = round_up(m, 128);
|
||||
int rounded_n = round_up(n, 128);
|
||||
// Since k is divisible by 32 (alignment), k / 16 is guaranteed to be an
|
||||
// integer.
|
||||
int rounded_k = round_up(k / 16, 4);
|
||||
|
||||
TORCH_CHECK(A_sf.dim() == 2, "scale_a must be a matrix");
|
||||
TORCH_CHECK(B_sf.dim() == 2, "scale_b must be a matrix");
|
||||
TORCH_CHECK(A_sf.sizes()[1] == B_sf.sizes()[1],
|
||||
"scale_a and scale_b shapes cannot be multiplied (",
|
||||
A_sf.sizes()[0], "x", A_sf.sizes()[1], " and ", B_sf.sizes()[0],
|
||||
"x", B_sf.sizes()[1], ")");
|
||||
TORCH_CHECK(A_sf.sizes()[0] == rounded_m && A_sf.sizes()[1] == rounded_k,
|
||||
"scale_a must be padded and swizzled to a shape (", rounded_m,
|
||||
"x", rounded_k, "), but got a shape (", A_sf.sizes()[0], "x",
|
||||
A_sf.sizes()[1], ")");
|
||||
TORCH_CHECK(B_sf.sizes()[0] == rounded_n && B_sf.sizes()[1] == rounded_k,
|
||||
"scale_b must be padded and swizzled to a shape (", rounded_n,
|
||||
"x", rounded_k, "), but got a shape (", B_sf.sizes()[0], "x",
|
||||
B_sf.sizes()[1], ")");
|
||||
|
||||
auto out_dtype = D.dtype();
|
||||
at::cuda::CUDAGuard device_guard{(char)A.get_device()};
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(A.get_device());
|
||||
|
||||
if (out_dtype == at::ScalarType::Half) {
|
||||
runGemm<cutlass::half_t>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
|
||||
} else if (out_dtype == at::ScalarType::BFloat16) {
|
||||
runGemm<cutlass::bfloat16_t>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
|
||||
} else if (out_dtype == at::ScalarType::Float) {
|
||||
runGemm<float>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported output data type of nvfp4 mm");
|
||||
}
|
||||
}
|
||||
@ -1,137 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef __HIPCC__
|
||||
#include <hip/hip_runtime.h>
|
||||
#else
|
||||
#include <type_traits>
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
#include <iostream>
|
||||
#endif
|
||||
|
||||
#include "hip_float8_impl.h"
|
||||
|
||||
struct alignas(1) hip_fp8 {
|
||||
struct from_bits_t {};
|
||||
HIP_FP8_HOST_DEVICE static constexpr from_bits_t from_bits() {
|
||||
return from_bits_t();
|
||||
}
|
||||
uint8_t data;
|
||||
|
||||
hip_fp8() = default;
|
||||
HIP_FP8_HOST_DEVICE constexpr hip_fp8(const hip_fp8&) = default;
|
||||
HIP_FP8_HOST_DEVICE constexpr hip_fp8(uint8_t v) = delete;
|
||||
explicit HIP_FP8_HOST_DEVICE constexpr hip_fp8(uint8_t v, from_bits_t)
|
||||
: data(v) {}
|
||||
|
||||
#ifdef __HIP__MI300__
|
||||
// NOTE: ON-DEVICE... always optimal bias
|
||||
explicit HIP_FP8_DEVICE hip_fp8(float v)
|
||||
: data(hip_fp8_impl::to_fp8_from_fp32(v)) {}
|
||||
|
||||
explicit HIP_FP8_DEVICE hip_fp8(_Float16 v)
|
||||
: hip_fp8(static_cast<float>(v)) {}
|
||||
|
||||
// Host only implementation using s/w simulation
|
||||
explicit HIP_FP8_HOST
|
||||
#else // __HIP__MI300__
|
||||
// both Host and DEVICE for non-MI300 using s/w simulation
|
||||
explicit HIP_FP8_HOST_DEVICE
|
||||
#endif // __HIP__MI300__
|
||||
hip_fp8(float v) {
|
||||
data = hip_fp8_impl::to_float8<4, 3, float, true /*negative_zero_nan*/,
|
||||
true /*clip*/>(v);
|
||||
}
|
||||
|
||||
explicit HIP_FP8_HOST_DEVICE hip_fp8(double v)
|
||||
: hip_fp8(static_cast<float>(v)) {}
|
||||
|
||||
#ifdef __HIP__MI300__
|
||||
// upcast using device specific intrinsic
|
||||
explicit inline HIP_FP8_DEVICE operator float() const {
|
||||
float fval;
|
||||
uint32_t i32val = static_cast<uint32_t>(data);
|
||||
|
||||
// upcast
|
||||
asm volatile("v_cvt_f32_fp8 %0, %1 src0_sel:BYTE_0"
|
||||
: "=v"(fval)
|
||||
: "v"(i32val));
|
||||
|
||||
return fval;
|
||||
}
|
||||
|
||||
explicit inline HIP_FP8_HOST operator float() const
|
||||
#else // __HIP__MI300__
|
||||
explicit inline HIP_FP8_HOST_DEVICE operator float() const
|
||||
#endif // __HIP__MI300__
|
||||
{
|
||||
return hip_fp8_impl::from_float8<4, 3, float, true /*negative_zero_nan*/>(
|
||||
data);
|
||||
}
|
||||
};
|
||||
|
||||
namespace std {
|
||||
inline hip_fp8 sin(hip_fp8 a) { return hip_fp8(sinf(float(a))); }
|
||||
inline hip_fp8 cos(hip_fp8 a) { return hip_fp8(cosf(float(a))); }
|
||||
HIP_FP8_HOST_DEVICE constexpr hip_fp8 real(const hip_fp8& a) { return a; }
|
||||
} // namespace std
|
||||
|
||||
// Special operator overloading
|
||||
inline std::ostream& operator<<(std::ostream& os, const hip_fp8& f8) {
|
||||
return os << float(f8);
|
||||
}
|
||||
|
||||
// all + operator overloading with mixed types
|
||||
// mixed types, always converts to f32, does computation in f32, and returns
|
||||
// float
|
||||
inline HIP_FP8_HOST_DEVICE float operator+(const float fa, hip_fp8 b) {
|
||||
return (fa + float(b));
|
||||
}
|
||||
|
||||
inline HIP_FP8_HOST_DEVICE float operator+(hip_fp8 a, const float fb) {
|
||||
return (float(a) + fb);
|
||||
}
|
||||
|
||||
inline HIP_FP8_HOST_DEVICE hip_fp8 operator+(hip_fp8 a, hip_fp8 b) {
|
||||
return hip_fp8(float(a) + float(b));
|
||||
}
|
||||
|
||||
inline HIP_FP8_HOST_DEVICE hip_fp8& operator+=(hip_fp8& a, hip_fp8 b) {
|
||||
return a = hip_fp8(float(a) + float(b));
|
||||
}
|
||||
|
||||
// overloading multiplication, always returns float,
|
||||
inline HIP_FP8_HOST_DEVICE float operator*(hip_fp8 a, hip_fp8 b) {
|
||||
return float(a) * float(b);
|
||||
}
|
||||
|
||||
inline HIP_FP8_HOST_DEVICE float operator*(float a, hip_fp8 b) {
|
||||
return (a * float(b));
|
||||
}
|
||||
|
||||
inline HIP_FP8_HOST_DEVICE float operator*(hip_fp8 a, float b) {
|
||||
return (float(a) * b);
|
||||
}
|
||||
|
||||
inline HIP_FP8_HOST_DEVICE float operator*(int32_t a, hip_fp8 b) {
|
||||
return ((float)a * float(b));
|
||||
}
|
||||
|
||||
inline HIP_FP8_HOST_DEVICE float operator*(double a, hip_fp8 b) {
|
||||
return ((float)a * float(b));
|
||||
}
|
||||
|
||||
// overloading for compare
|
||||
inline HIP_FP8_HOST_DEVICE bool operator==(hip_fp8 a, hip_fp8 b) {
|
||||
return (a.data == b.data);
|
||||
}
|
||||
inline HIP_FP8_HOST_DEVICE bool operator!=(hip_fp8 a, hip_fp8 b) {
|
||||
return (a.data != b.data);
|
||||
}
|
||||
|
||||
inline HIP_FP8_HOST_DEVICE bool operator>=(hip_fp8 a, hip_fp8 b) {
|
||||
return static_cast<float>(a) >= static_cast<float>(b);
|
||||
}
|
||||
inline HIP_FP8_HOST_DEVICE bool operator>(hip_fp8 a, hip_fp8 b) {
|
||||
return static_cast<float>(a) > static_cast<float>(b);
|
||||
}
|
||||
@ -1,315 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#if defined(__HIPCC__) && defined(__gfx942__)
|
||||
#define __HIP__MI300__
|
||||
#endif
|
||||
|
||||
#ifdef __HIPCC__
|
||||
#define HIP_FP8_HOST_DEVICE __host__ __device__
|
||||
#define HIP_FP8_HOST __host__
|
||||
#define HIP_FP8_DEVICE __device__
|
||||
#else
|
||||
#define HIP_FP8_HOST_DEVICE
|
||||
#define HIP_FP8_HOST
|
||||
#define HIP_FP8_DEVICE
|
||||
#endif
|
||||
|
||||
namespace hip_fp8_impl {
|
||||
|
||||
#ifdef __HIP__MI300__
|
||||
HIP_FP8_DEVICE uint8_t to_fp8_from_fp32(float v) {
|
||||
uint8_t i8data;
|
||||
union {
|
||||
float fval;
|
||||
uint32_t i32val;
|
||||
uint8_t i8val[4]; // NOTE: not endian independent
|
||||
} val;
|
||||
|
||||
uint32_t ival = 0;
|
||||
val.fval = v;
|
||||
|
||||
if ((val.i32val & 0x7F800000) !=
|
||||
0x7F800000) { /// propagate NAN/INF, no clipping
|
||||
val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0);
|
||||
}
|
||||
|
||||
ival = __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival,
|
||||
false); // false -> WORD0
|
||||
val.i32val = ival;
|
||||
i8data = val.i8val[0];
|
||||
|
||||
return i8data;
|
||||
}
|
||||
#endif // __HIP__MI300__
|
||||
|
||||
HIP_FP8_HOST inline int clz(uint32_t x) { return __builtin_clz(x); }
|
||||
#if defined(__HIPCC__) || defined(__CUDA_ARCH__)
|
||||
HIP_FP8_DEVICE inline int clz(uint32_t x) { return __clz(x); }
|
||||
#endif
|
||||
|
||||
template <int we, int wm, typename T, bool negative_zero_nan, bool clip>
|
||||
HIP_FP8_HOST_DEVICE uint8_t to_float8(T _x, bool stoch = false,
|
||||
uint32_t rng = 0) {
|
||||
#ifdef __HIPCC__
|
||||
constexpr bool is_half = std::is_same<T, _Float16>::value;
|
||||
#else
|
||||
constexpr bool is_half = false;
|
||||
#endif
|
||||
constexpr bool is_float = std::is_same<T, float>::value;
|
||||
static_assert(wm + we == 7, "wm+we==7");
|
||||
static_assert(is_half || is_float, "Only half and float can be cast to f8");
|
||||
|
||||
const int mfmt = (sizeof(T) == 4) ? 23 : 10;
|
||||
uint32_t x;
|
||||
if (sizeof(T) == 4) {
|
||||
x = reinterpret_cast<uint32_t&>(_x);
|
||||
} else {
|
||||
x = reinterpret_cast<uint16_t&>(_x);
|
||||
}
|
||||
|
||||
uint32_t head, mantissa;
|
||||
int exponent, bias;
|
||||
uint32_t sign;
|
||||
|
||||
if (sizeof(T) == 4) {
|
||||
head = x & 0xFF800000;
|
||||
mantissa = x & 0x7FFFFF;
|
||||
exponent = (head >> 23) & 0xFF;
|
||||
sign = head >> 31;
|
||||
bias = 127;
|
||||
} else {
|
||||
head = x & 0xFC00;
|
||||
mantissa = x & 0x3FF;
|
||||
exponent = (head >> 10) & 0x1F;
|
||||
sign = head >> 15;
|
||||
bias = 15;
|
||||
}
|
||||
|
||||
uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm);
|
||||
|
||||
// Deal with inf and NaNs
|
||||
if (negative_zero_nan) {
|
||||
if (sizeof(T) == 4) {
|
||||
if ((x & 0x7F800000) == 0x7F800000) {
|
||||
return 0x80;
|
||||
}
|
||||
} else {
|
||||
// if(__hisinf(x) || __hisnan(x))
|
||||
if ((x & 0x7C00) == 0x7C00) {
|
||||
return 0x80;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (sizeof(T) == 4) {
|
||||
if ((x & 0x7F800000) == 0x7F800000) {
|
||||
return signed_inf + (mantissa != 0 ? 1 : 0);
|
||||
}
|
||||
} else {
|
||||
if ((x & 0x7C00) == 0x7C00) {
|
||||
return signed_inf + (mantissa != 0 ? 1 : 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (x == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// First need to check if it is normal or denorm as there is a difference of
|
||||
// implicit 1 Then need to adjust the exponent to align with the F8 exponent,
|
||||
// in the meanwhile, shift The mantissa. Then for stochastic rounding, add rng
|
||||
// to mantissa and truncate. And for RNE, no need to add rng. Then probably
|
||||
// need to check whether there is carry and adjust exponent and mantissa again
|
||||
|
||||
// For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent
|
||||
// bits
|
||||
const int f8_bias = (1 << (we - 1)) - 1 + (negative_zero_nan ? 1 : 0);
|
||||
const int f8_denormal_act_exponent =
|
||||
1 - f8_bias; // actual exponent of f8 denormal
|
||||
// act_exponent is the actual exponent of fp32/fp16 (after subtracting bias)
|
||||
// f8_exponent is the converted f8 exponent with bias encoding
|
||||
// exponent_diff is the diff between fp32/fp16 exponent and f8 exponent,
|
||||
// the difference needs to be adjusted and mantissa shifted
|
||||
int act_exponent, f8_exponent, exponent_diff;
|
||||
|
||||
if (exponent == 0) { // fp32/fp16 is in denormal.
|
||||
/* fp32 denormal is below 2^-127 so it is usually not a concern here, we
|
||||
mostly concern fp16 here. In this case, f8 is usually in denormal. But there
|
||||
could be exceptions. fp16 denormal has exponent bias 15 while bf8 with NANOO has
|
||||
exponent bias 16. It means that there are some numbers in fp16 denormal but they
|
||||
are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers
|
||||
where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8
|
||||
(NANOO) normal. In this case, the fp16 mantissa should be shift left by 1 */
|
||||
act_exponent = exponent - bias + 1;
|
||||
exponent_diff =
|
||||
f8_denormal_act_exponent -
|
||||
act_exponent; // actual exponent is exponent-bias+1 as it is denormal
|
||||
} else { // fp32/fp16 is normal with implicit 1
|
||||
act_exponent = exponent - bias;
|
||||
if (act_exponent <= f8_denormal_act_exponent) {
|
||||
/* This is the case where fp32/fp16 is normal but it is in f8 denormal
|
||||
range. For example fp8 nanoo mode, denormal exponent is -7, but if the
|
||||
fp32/fp16 actual exponent is -7, it is actually larger due to the implicit 1,
|
||||
Therefore it needs to be adjust to -6 and mantissa shift right by 1.
|
||||
So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */
|
||||
exponent_diff = f8_denormal_act_exponent - act_exponent;
|
||||
} else { // both fp32/fp16 and f8 are in normal range
|
||||
exponent_diff = 0; // exponent_diff=0 does not mean there is no
|
||||
// difference for this case, act_exponent could be
|
||||
// larger. Just that it does not need shift mantissa
|
||||
}
|
||||
mantissa += (1 << mfmt); // Add the implicit 1 into mantissa
|
||||
}
|
||||
|
||||
bool midpoint = (mantissa & ((1 << (mfmt - wm + exponent_diff)) - 1)) ==
|
||||
static_cast<uint32_t>(1 << (mfmt - wm + exponent_diff - 1));
|
||||
/* This part is a bit tricky. The judgment of whether it is a tie needs to be
|
||||
done before we shift right as shift right could rip off some residual part
|
||||
and make something not midpoint look like midpoint. For example, the fp16
|
||||
number 0x1002 (0 00100 0000000010), it is larger than midpoint, but after
|
||||
shift right by 4 bits, it would look like midpoint.
|
||||
*/
|
||||
|
||||
if (exponent_diff > 0) {
|
||||
mantissa >>= exponent_diff;
|
||||
} else if (exponent_diff == -1) {
|
||||
mantissa <<= -exponent_diff;
|
||||
}
|
||||
bool implicit_one = mantissa & (1 << mfmt);
|
||||
// if there is no implicit 1, it means the f8 is denormal and need to adjust
|
||||
// to denorm exponent
|
||||
f8_exponent = (act_exponent + exponent_diff) /*actual f8 exponent*/ +
|
||||
f8_bias - (implicit_one ? 0 : 1);
|
||||
|
||||
// Now we have the exponent and mantissa adjusted
|
||||
uint32_t drop_mask = (1 << (mfmt - wm)) - 1;
|
||||
bool odd = mantissa & (1 << (mfmt - wm)); // if the least significant bit
|
||||
// that is not truncated is 1
|
||||
mantissa +=
|
||||
(stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1) : mantissa)) &
|
||||
drop_mask;
|
||||
|
||||
// Now we deal with overflow
|
||||
if (f8_exponent == 0) {
|
||||
if ((1 << mfmt) & mantissa) {
|
||||
f8_exponent = 1; // denormal overflow to become normal, promote exponent
|
||||
}
|
||||
} else {
|
||||
if ((1 << (mfmt + 1)) & mantissa) {
|
||||
mantissa >>= 1;
|
||||
f8_exponent++;
|
||||
}
|
||||
}
|
||||
|
||||
mantissa >>= (mfmt - wm);
|
||||
|
||||
// above range: quantize to maximum possible float of the same sign
|
||||
const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2);
|
||||
if (f8_exponent > max_exp) {
|
||||
if (clip) {
|
||||
mantissa = (1 << wm) - 1;
|
||||
f8_exponent = max_exp;
|
||||
} else {
|
||||
return signed_inf;
|
||||
}
|
||||
}
|
||||
|
||||
if (f8_exponent == 0 && mantissa == 0) {
|
||||
return negative_zero_nan ? 0 : (sign << 7);
|
||||
}
|
||||
mantissa &= (1 << wm) - 1;
|
||||
return (sign << 7) | (f8_exponent << wm) | mantissa;
|
||||
}
|
||||
|
||||
template <int we, int wm, typename T = float, bool negative_zero_nan = true>
|
||||
inline HIP_FP8_HOST_DEVICE T from_float8(uint8_t x) {
|
||||
#ifdef __HIPCC__
|
||||
constexpr bool is_half = std::is_same<T, _Float16>::value;
|
||||
#else
|
||||
constexpr bool is_half = false;
|
||||
#endif
|
||||
constexpr bool is_float = std::is_same<T, float>::value;
|
||||
static_assert(is_half || is_float, "only half and float are supported");
|
||||
|
||||
constexpr int weo = is_half ? 5 : 8;
|
||||
constexpr int wmo = is_half ? 10 : (is_float ? 23 : 7);
|
||||
|
||||
T fInf, fNegInf, fNaN, fNeg0;
|
||||
|
||||
#ifdef __HIPCC__
|
||||
if (is_half) {
|
||||
const uint16_t ihInf = 0x7C00;
|
||||
const uint16_t ihNegInf = 0xFC00;
|
||||
const uint16_t ihNaN = 0x7C01;
|
||||
const uint16_t ihNeg0 = 0x8000;
|
||||
fInf = reinterpret_cast<const _Float16&>(ihInf);
|
||||
fNegInf = reinterpret_cast<const _Float16&>(ihNegInf);
|
||||
fNaN = reinterpret_cast<const _Float16&>(ihNaN);
|
||||
fNeg0 = reinterpret_cast<const _Float16&>(ihNeg0);
|
||||
} else
|
||||
#endif
|
||||
if (is_float) {
|
||||
const uint32_t ifInf = 0x7F800000;
|
||||
const uint32_t ifNegInf = 0xFF800000;
|
||||
const uint32_t ifNaN = 0x7F800001;
|
||||
const uint32_t ifNeg0 = 0x80000000;
|
||||
fInf = reinterpret_cast<const float&>(ifInf);
|
||||
fNegInf = reinterpret_cast<const float&>(ifNegInf);
|
||||
fNaN = reinterpret_cast<const float&>(ifNaN);
|
||||
fNeg0 = reinterpret_cast<const float&>(ifNeg0);
|
||||
}
|
||||
|
||||
if (x == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint32_t sign = x >> 7;
|
||||
uint32_t mantissa = x & ((1 << wm) - 1);
|
||||
int exponent = (x & 0x7F) >> wm;
|
||||
if (negative_zero_nan) {
|
||||
if (x == 0x80) {
|
||||
return fNaN;
|
||||
}
|
||||
} else {
|
||||
if (x == 0x80) {
|
||||
return fNeg0;
|
||||
}
|
||||
if (exponent == ((1 << we) - 1)) {
|
||||
return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN;
|
||||
}
|
||||
}
|
||||
typename std::conditional<sizeof(T) == 2, uint16_t, uint32_t>::type retval;
|
||||
if (we == 5 && is_half && !negative_zero_nan) {
|
||||
retval = x << 8;
|
||||
return reinterpret_cast<const T&>(retval);
|
||||
}
|
||||
|
||||
const int exp_low_cutoff =
|
||||
(1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 1 : 0);
|
||||
|
||||
// subnormal input
|
||||
if (exponent == 0) {
|
||||
// guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
|
||||
int sh = 1 + clz(mantissa) - (32 - wm);
|
||||
mantissa <<= sh;
|
||||
exponent += 1 - sh;
|
||||
mantissa &= ((1 << wm) - 1);
|
||||
}
|
||||
exponent += exp_low_cutoff - 1;
|
||||
mantissa <<= wmo - wm;
|
||||
|
||||
// subnormal output (occurs when T=half, we=5, negative_zero_nan=true)
|
||||
if (exponent <= 0) {
|
||||
mantissa |= 1 << wmo;
|
||||
mantissa >>= 1 - exponent;
|
||||
exponent = 0;
|
||||
}
|
||||
|
||||
if (sizeof(T) == 2) {
|
||||
retval = (sign << 15) | (exponent << 10) | mantissa;
|
||||
} else {
|
||||
retval = (sign << 31) | (exponent << 23) | mantissa;
|
||||
}
|
||||
return reinterpret_cast<const T&>(retval);
|
||||
}
|
||||
|
||||
} // namespace hip_fp8_impl
|
||||
@ -1,13 +1,11 @@
|
||||
#pragma once
|
||||
#include "hip_float8.h"
|
||||
#include <hip/hip_fp8.h>
|
||||
|
||||
#include <hip/hip_fp16.h>
|
||||
#include <hip/hip_bf16.h>
|
||||
#include <hip/hip_bfloat16.h>
|
||||
|
||||
#include "../../../attention/dtype_fp8.cuh"
|
||||
#include "../../../attention/dtype_float32.cuh"
|
||||
#include "../../../attention/dtype_bfloat16.cuh"
|
||||
#include "../../../attention/attention_dtypes.h"
|
||||
|
||||
namespace vllm {
|
||||
#ifdef USE_ROCM
|
||||
@ -26,40 +24,31 @@ __inline__ __device__ Tout scaled_vec_conversion(const Tin& x,
|
||||
return x;
|
||||
}
|
||||
|
||||
#if HIP_FP8_TYPE_OCP
|
||||
using fp8_type = __hip_fp8_e4m3;
|
||||
using fp8x2_type = __hip_fp8x2_e4m3;
|
||||
#else
|
||||
using fp8_type = __hip_fp8_e4m3_fnuz;
|
||||
using fp8x2_type = __hip_fp8x2_e4m3_fnuz;
|
||||
#endif
|
||||
|
||||
// fp8 -> half
|
||||
template <>
|
||||
__inline__ __device__ uint16_t
|
||||
vec_conversion<uint16_t, uint8_t>(const uint8_t& a) {
|
||||
hip_fp8 f8{a, hip_fp8::from_bits()};
|
||||
__half_raw res;
|
||||
res.data = static_cast<float>(f8);
|
||||
return res.x;
|
||||
return __hip_cvt_fp8_to_halfraw(a, fp8_type::__default_interpret).x;
|
||||
}
|
||||
|
||||
// fp8x2 -> half2
|
||||
template <>
|
||||
__inline__ __device__ uint32_t
|
||||
vec_conversion<uint32_t, uint16_t>(const uint16_t& a) {
|
||||
#if defined(__HIP__MI300__) && \
|
||||
defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__)
|
||||
const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0);
|
||||
union {
|
||||
__half2_raw h2r;
|
||||
uint32_t ui32;
|
||||
} tmp;
|
||||
tmp.h2r.x.data = f2[0];
|
||||
tmp.h2r.y.data = f2[1];
|
||||
tmp.h2r = __hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret);
|
||||
return tmp.ui32;
|
||||
#else
|
||||
union {
|
||||
uint16_t u16[2];
|
||||
uint32_t u32;
|
||||
} tmp;
|
||||
|
||||
tmp.u16[0] = vec_conversion<uint16_t, uint8_t>(static_cast<uint8_t>(a));
|
||||
tmp.u16[1] = vec_conversion<uint16_t, uint8_t>(static_cast<uint8_t>(a >> 8U));
|
||||
return tmp.u32;
|
||||
#endif
|
||||
}
|
||||
|
||||
// fp8x4 -> half2x2
|
||||
@ -92,9 +81,9 @@ using __nv_bfloat16 = __hip_bfloat16;
|
||||
template <>
|
||||
__inline__ __device__ __nv_bfloat16
|
||||
vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a) {
|
||||
hip_fp8 f8{a, hip_fp8::from_bits()};
|
||||
float f{f8};
|
||||
return __float2bfloat16(f);
|
||||
fp8_type f8;
|
||||
f8.__x = a;
|
||||
return __float2bfloat16(static_cast<float>(f8));
|
||||
}
|
||||
|
||||
using __nv_bfloat162 = __hip_bfloat162;
|
||||
@ -136,27 +125,18 @@ __inline__ __device__ bf16_8_t vec_conversion<bf16_8_t, uint2>(const uint2& a) {
|
||||
// fp8 -> float
|
||||
template <>
|
||||
__inline__ __device__ float vec_conversion<float, uint8_t>(const uint8_t& a) {
|
||||
hip_fp8 fp8{a, hip_fp8::from_bits()};
|
||||
return static_cast<float>(fp8);
|
||||
fp8_type f8;
|
||||
f8.__x = a;
|
||||
return static_cast<float>(f8);
|
||||
}
|
||||
|
||||
// fp8x2 -> float2
|
||||
template <>
|
||||
__inline__ __device__ float2
|
||||
vec_conversion<float2, uint16_t>(const uint16_t& a) {
|
||||
#if defined(__HIP__MI300__) && \
|
||||
defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__)
|
||||
float2 res;
|
||||
const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0);
|
||||
res.x = f2[0];
|
||||
res.y = f2[1];
|
||||
return res;
|
||||
#else
|
||||
float2 res;
|
||||
res.x = vec_conversion<float, uint8_t>(static_cast<uint8_t>(a));
|
||||
res.y = vec_conversion<float, uint8_t>(static_cast<uint8_t>(a >> 8U));
|
||||
return res;
|
||||
#endif
|
||||
fp8x2_type f8x2;
|
||||
f8x2.__x = a;
|
||||
return static_cast<float2>(f8x2);
|
||||
}
|
||||
|
||||
// fp8x4 -> float4
|
||||
@ -169,6 +149,15 @@ vec_conversion<Float4_, uint32_t>(const uint32_t& a) {
|
||||
return res;
|
||||
}
|
||||
|
||||
// fp8x4 -> float4
|
||||
template <>
|
||||
__inline__ __device__ float4
|
||||
vec_conversion<float4, uint32_t>(const uint32_t& a) {
|
||||
Float4_ tmp = vec_conversion<Float4_, uint32_t>(a);
|
||||
float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y);
|
||||
return res;
|
||||
}
|
||||
|
||||
// fp8x8 -> float8
|
||||
template <>
|
||||
__inline__ __device__ Float8_ vec_conversion<Float8_, uint2>(const uint2& a) {
|
||||
@ -189,33 +178,36 @@ __inline__ __device__ uint8_t
|
||||
vec_conversion<uint8_t, uint16_t>(const uint16_t& a) {
|
||||
__half_raw tmp;
|
||||
tmp.x = a;
|
||||
return __hip_cvt_halfraw_to_fp8(tmp, fp8_type::__default_saturation,
|
||||
fp8_type::__default_interpret);
|
||||
}
|
||||
|
||||
hip_fp8 f8{static_cast<float>(tmp.data)};
|
||||
return f8.data;
|
||||
template <>
|
||||
__inline__ __device__ uint16_t
|
||||
vec_conversion<uint16_t, uint32_t>(const uint32_t& a) {
|
||||
union {
|
||||
uint32_t ui32;
|
||||
__half2_raw h2r;
|
||||
} tmp;
|
||||
tmp.ui32 = a;
|
||||
return __hip_cvt_halfraw2_to_fp8x2(tmp.h2r, fp8_type::__default_saturation,
|
||||
fp8_type::__default_interpret);
|
||||
}
|
||||
|
||||
// bf16 -> fp8
|
||||
template <>
|
||||
__inline__ __device__ uint8_t
|
||||
vec_conversion<uint8_t, __nv_bfloat16>(const __nv_bfloat16& a) {
|
||||
hip_fp8 res{__bfloat162float(a)};
|
||||
return res.data;
|
||||
return __hip_cvt_float_to_fp8(__bfloat162float(a),
|
||||
fp8_type::__default_saturation,
|
||||
fp8_type::__default_interpret);
|
||||
}
|
||||
|
||||
// float -> fp8
|
||||
template <>
|
||||
__inline__ __device__ uint8_t vec_conversion<uint8_t, float>(const float& a) {
|
||||
hip_fp8 f8(a);
|
||||
return f8.data;
|
||||
}
|
||||
|
||||
// fp8x4 -> float4
|
||||
template <>
|
||||
__inline__ __device__ float4
|
||||
vec_conversion<float4, uint32_t>(const uint32_t& a) {
|
||||
Float4_ tmp = vec_conversion<Float4_, uint32_t>(a);
|
||||
float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y);
|
||||
return res;
|
||||
return __hip_cvt_float_to_fp8(a, fp8_type::__default_saturation,
|
||||
fp8_type::__default_interpret);
|
||||
}
|
||||
|
||||
// float2 -> half2
|
||||
@ -307,90 +299,22 @@ vec_conversion<bf16_8_t, Float8_>(const Float8_& a) {
|
||||
|
||||
*/
|
||||
|
||||
// fp8 -> half
|
||||
template <>
|
||||
__inline__ __device__ uint16_t
|
||||
scaled_vec_conversion<uint16_t, uint8_t>(const uint8_t& a, const float scale) {
|
||||
hip_fp8 f8{a, hip_fp8::from_bits()};
|
||||
__half_raw res;
|
||||
res.data = static_cast<float>(f8) * scale;
|
||||
return res.x;
|
||||
}
|
||||
|
||||
// fp8x2 -> half2
|
||||
template <>
|
||||
__inline__ __device__ uint32_t scaled_vec_conversion<uint32_t, uint16_t>(
|
||||
const uint16_t& a, const float scale) {
|
||||
#if defined(__HIP__MI300__) && \
|
||||
defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__)
|
||||
const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0);
|
||||
union {
|
||||
__half2_raw h2r;
|
||||
uint32_t ui32;
|
||||
} tmp;
|
||||
tmp.h2r.x.data = f2[0] * scale;
|
||||
tmp.h2r.y.data = f2[1] * scale;
|
||||
return tmp.ui32;
|
||||
#else
|
||||
union {
|
||||
uint16_t u16[2];
|
||||
uint32_t u32;
|
||||
} tmp;
|
||||
|
||||
tmp.u16[0] =
|
||||
scaled_vec_conversion<uint16_t, uint8_t>(static_cast<uint8_t>(a), scale);
|
||||
tmp.u16[1] = scaled_vec_conversion<uint16_t, uint8_t>(
|
||||
static_cast<uint8_t>(a >> 8U), scale);
|
||||
return tmp.u32;
|
||||
#endif
|
||||
}
|
||||
|
||||
// fp8x4 -> half2x2
|
||||
template <>
|
||||
__inline__ __device__ uint2
|
||||
scaled_vec_conversion<uint2, uint32_t>(const uint32_t& a, const float scale) {
|
||||
union {
|
||||
uint2 u32x2;
|
||||
uint32_t u32[2];
|
||||
} tmp;
|
||||
tmp.u32[0] = scaled_vec_conversion<uint32_t, uint16_t>((uint16_t)a, scale);
|
||||
tmp.u32[1] =
|
||||
scaled_vec_conversion<uint32_t, uint16_t>((uint16_t)(a >> 16U), scale);
|
||||
return tmp.u32x2;
|
||||
}
|
||||
|
||||
// fp8x8 -> half2x4
|
||||
template <>
|
||||
__inline__ __device__ uint4
|
||||
scaled_vec_conversion<uint4, uint2>(const uint2& a, const float scale) {
|
||||
union {
|
||||
uint4 u64x2;
|
||||
uint2 u64[2];
|
||||
} tmp;
|
||||
tmp.u64[0] = scaled_vec_conversion<uint2, uint32_t>(a.x, scale);
|
||||
tmp.u64[1] = scaled_vec_conversion<uint2, uint32_t>(a.y, scale);
|
||||
return tmp.u64x2;
|
||||
}
|
||||
|
||||
using __nv_bfloat16 = __hip_bfloat16;
|
||||
|
||||
// fp8 -> __nv_bfloat16
|
||||
template <>
|
||||
__inline__ __device__ __nv_bfloat16
|
||||
scaled_vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a,
|
||||
const float scale) {
|
||||
hip_fp8 f8{a, hip_fp8::from_bits()};
|
||||
float f{f8};
|
||||
return __float2bfloat16(f * scale);
|
||||
scaled_vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a, float scale) {
|
||||
fp8_type f8;
|
||||
f8.__x = a;
|
||||
return __float2bfloat16(static_cast<float>(f8) * scale);
|
||||
}
|
||||
|
||||
using __nv_bfloat162 = __hip_bfloat162;
|
||||
|
||||
// fp8x2 -> __nv_bfloat162
|
||||
template <>
|
||||
__inline__ __device__ __nv_bfloat162
|
||||
scaled_vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a,
|
||||
const float scale) {
|
||||
float scale) {
|
||||
__nv_bfloat162 res;
|
||||
res.x = scaled_vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)a, scale);
|
||||
res.y =
|
||||
@ -400,8 +324,8 @@ scaled_vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a,
|
||||
|
||||
// fp8x4 -> bf16_4_t
|
||||
template <>
|
||||
__inline__ __device__ bf16_4_t scaled_vec_conversion<bf16_4_t, uint32_t>(
|
||||
const uint32_t& a, const float scale) {
|
||||
__inline__ __device__ bf16_4_t
|
||||
scaled_vec_conversion<bf16_4_t, uint32_t>(const uint32_t& a, float scale) {
|
||||
bf16_4_t res;
|
||||
res.x = scaled_vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)a, scale);
|
||||
res.y = scaled_vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)(a >> 16U),
|
||||
@ -412,7 +336,7 @@ __inline__ __device__ bf16_4_t scaled_vec_conversion<bf16_4_t, uint32_t>(
|
||||
// fp8x8 -> bf16_8_t
|
||||
template <>
|
||||
__inline__ __device__ bf16_8_t
|
||||
scaled_vec_conversion<bf16_8_t, uint2>(const uint2& a, const float scale) {
|
||||
scaled_vec_conversion<bf16_8_t, uint2>(const uint2& a, float scale) {
|
||||
bf16_4_t tmp1, tmp2;
|
||||
tmp1 = scaled_vec_conversion<bf16_4_t, uint32_t>(a.x, scale);
|
||||
tmp2 = scaled_vec_conversion<bf16_4_t, uint32_t>(a.y, scale);
|
||||
@ -427,29 +351,19 @@ scaled_vec_conversion<bf16_8_t, uint2>(const uint2& a, const float scale) {
|
||||
// fp8 -> float
|
||||
template <>
|
||||
__inline__ __device__ float scaled_vec_conversion<float, uint8_t>(
|
||||
const uint8_t& a, const float scale) {
|
||||
hip_fp8 fp8{a, hip_fp8::from_bits()};
|
||||
return static_cast<float>(fp8) * scale;
|
||||
const uint8_t& a, float scale) {
|
||||
fp8_type f8;
|
||||
f8.__x = a;
|
||||
return static_cast<float>(f8) * scale;
|
||||
}
|
||||
|
||||
// fp8x2 -> float2
|
||||
template <>
|
||||
__inline__ __device__ float2
|
||||
scaled_vec_conversion<float2, uint16_t>(const uint16_t& a, const float scale) {
|
||||
#if defined(__HIP__MI300__) && \
|
||||
defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__)
|
||||
float2 res;
|
||||
const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0);
|
||||
res.x = f2[0] * scale;
|
||||
res.y = f2[1] * scale;
|
||||
return res;
|
||||
#else
|
||||
float2 res;
|
||||
res.x = scaled_vec_conversion<float, uint8_t>(static_cast<uint8_t>(a), scale);
|
||||
res.y = scaled_vec_conversion<float, uint8_t>(static_cast<uint8_t>(a >> 8U),
|
||||
scale);
|
||||
return res;
|
||||
#endif
|
||||
scaled_vec_conversion<float2, uint16_t>(const uint16_t& a, float scale) {
|
||||
fp8x2_type f8x2;
|
||||
f8x2.__x = a;
|
||||
return static_cast<float2>(f8x2) * scale;
|
||||
}
|
||||
|
||||
// fp8x4 -> float4
|
||||
@ -462,10 +376,18 @@ scaled_vec_conversion<Float4_, uint32_t>(const uint32_t& a, const float scale) {
|
||||
return res;
|
||||
}
|
||||
|
||||
// fp8x4 -> float4
|
||||
template <>
|
||||
__inline__ __device__ float4
|
||||
scaled_vec_conversion<float4, uint32_t>(const uint32_t& a, float scale) {
|
||||
Float4_ res = scaled_vec_conversion<Float4_, uint32_t>(a, scale);
|
||||
return {res.x.x, res.x.y, res.y.x, res.y.y};
|
||||
}
|
||||
|
||||
// fp8x8 -> float8
|
||||
template <>
|
||||
__inline__ __device__ Float8_
|
||||
scaled_vec_conversion<Float8_, uint2>(const uint2& a, const float scale) {
|
||||
scaled_vec_conversion<Float8_, uint2>(const uint2& a, float scale) {
|
||||
Float4_ tmp1, tmp2;
|
||||
tmp1 = scaled_vec_conversion<Float4_, uint32_t>(a.x, scale);
|
||||
tmp2 = scaled_vec_conversion<Float4_, uint32_t>(a.y, scale);
|
||||
@ -477,44 +399,184 @@ scaled_vec_conversion<Float8_, uint2>(const uint2& a, const float scale) {
|
||||
return res;
|
||||
}
|
||||
|
||||
/* Quantize(HP / scale) => FP8 */
|
||||
// fp8 -> half
|
||||
template <>
|
||||
__inline__ __device__ uint16_t
|
||||
scaled_vec_conversion<uint16_t, uint8_t>(const uint8_t& a, float scale) {
|
||||
__half_raw res;
|
||||
res.data = scaled_vec_conversion<float, uint8_t>(a, scale);
|
||||
return res.x;
|
||||
}
|
||||
|
||||
// TODO(Hai): vectorized to add
|
||||
// fp8x2 -> half2
|
||||
template <>
|
||||
__inline__ __device__ uint32_t
|
||||
scaled_vec_conversion<uint32_t, uint16_t>(const uint16_t& a, float scale) {
|
||||
__half2_raw h2r =
|
||||
__hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret);
|
||||
union {
|
||||
__half2_raw h2r;
|
||||
uint32_t ui32;
|
||||
} tmp;
|
||||
tmp.h2r = __hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret);
|
||||
tmp.h2r.x.data *= scale;
|
||||
tmp.h2r.y.data *= scale;
|
||||
return tmp.ui32;
|
||||
}
|
||||
|
||||
// fp8x4 -> half2x2
|
||||
template <>
|
||||
__inline__ __device__ uint2
|
||||
scaled_vec_conversion<uint2, uint32_t>(const uint32_t& a, float scale) {
|
||||
union {
|
||||
uint2 u32x2;
|
||||
uint32_t u32[2];
|
||||
} tmp;
|
||||
tmp.u32[0] = scaled_vec_conversion<uint32_t, uint16_t>((uint16_t)a, scale);
|
||||
tmp.u32[1] =
|
||||
scaled_vec_conversion<uint32_t, uint16_t>((uint16_t)(a >> 16U), scale);
|
||||
return tmp.u32x2;
|
||||
}
|
||||
|
||||
// fp8x8 -> half2x4
|
||||
template <>
|
||||
__inline__ __device__ uint4 scaled_vec_conversion<uint4, uint2>(const uint2& a,
|
||||
float scale) {
|
||||
union {
|
||||
uint4 u64x2;
|
||||
uint2 u64[2];
|
||||
} tmp;
|
||||
tmp.u64[0] = scaled_vec_conversion<uint2, uint32_t>(a.x, scale);
|
||||
tmp.u64[1] = scaled_vec_conversion<uint2, uint32_t>(a.y, scale);
|
||||
return tmp.u64x2;
|
||||
}
|
||||
|
||||
// half -> fp8
|
||||
template <>
|
||||
__inline__ __device__ uint8_t
|
||||
scaled_vec_conversion<uint8_t, uint16_t>(const uint16_t& a, const float scale) {
|
||||
scaled_vec_conversion<uint8_t, uint16_t>(const uint16_t& a, float scale) {
|
||||
__half_raw tmp;
|
||||
tmp.x = a;
|
||||
tmp.data /= scale;
|
||||
return __hip_cvt_halfraw_to_fp8(tmp, fp8_type::__default_saturation,
|
||||
fp8_type::__default_interpret);
|
||||
}
|
||||
|
||||
hip_fp8 f8{static_cast<float>(tmp.data) / scale};
|
||||
return f8.data;
|
||||
// halfx2 -> fp8x2
|
||||
template <>
|
||||
__inline__ __device__ uint16_t
|
||||
scaled_vec_conversion<uint16_t, uint32_t>(const uint32_t& a, float scale) {
|
||||
union {
|
||||
uint32_t ui32;
|
||||
__half2_raw h2r;
|
||||
} tmp;
|
||||
tmp.ui32 = a;
|
||||
tmp.h2r.x.data /= scale;
|
||||
tmp.h2r.y.data /= scale;
|
||||
return __hip_cvt_halfraw2_to_fp8x2(tmp.h2r, fp8_type::__default_saturation,
|
||||
fp8_type::__default_interpret);
|
||||
}
|
||||
|
||||
// half2x2 -> fp8x4
|
||||
template <>
|
||||
__inline__ __device__ uint32_t
|
||||
scaled_vec_conversion<uint32_t, uint2>(const uint2& a, float scale) {
|
||||
union {
|
||||
uint16_t ui16[2];
|
||||
uint32_t ui32;
|
||||
} tmp;
|
||||
tmp.ui16[0] = scaled_vec_conversion<uint16_t, uint32_t>(a.x, scale);
|
||||
tmp.ui16[1] = scaled_vec_conversion<uint16_t, uint32_t>(a.y, scale);
|
||||
return tmp.ui32;
|
||||
}
|
||||
|
||||
// half2x4 -> fp8x8
|
||||
template <>
|
||||
__inline__ __device__ uint2 scaled_vec_conversion<uint2, uint4>(const uint4& a,
|
||||
float scale) {
|
||||
union {
|
||||
uint2 ui2[2];
|
||||
uint4 ui4;
|
||||
} tmp;
|
||||
tmp.ui4 = a;
|
||||
uint2 res;
|
||||
res.x = scaled_vec_conversion<uint32_t, uint2>(tmp.ui2[0], scale);
|
||||
res.y = scaled_vec_conversion<uint32_t, uint2>(tmp.ui2[1], scale);
|
||||
return res;
|
||||
}
|
||||
|
||||
// bf16 -> fp8
|
||||
template <>
|
||||
__inline__ __device__ uint8_t scaled_vec_conversion<uint8_t, __nv_bfloat16>(
|
||||
const __nv_bfloat16& a, const float scale) {
|
||||
hip_fp8 res{__bfloat162float(a) / scale};
|
||||
return res.data;
|
||||
const __nv_bfloat16& a, float scale) {
|
||||
return __hip_cvt_float_to_fp8(__bfloat162float(a) / scale,
|
||||
fp8_type::__default_saturation,
|
||||
fp8_type::__default_interpret);
|
||||
}
|
||||
|
||||
// bf16x2 -> fp8x2
|
||||
template <>
|
||||
__inline__ __device__ uint16_t scaled_vec_conversion<uint16_t, __nv_bfloat162>(
|
||||
const __nv_bfloat162& a, float scale) {
|
||||
union {
|
||||
uint8_t ui8[2];
|
||||
uint16_t ui16;
|
||||
} tmp;
|
||||
tmp.ui8[0] = scaled_vec_conversion<uint8_t, __nv_bfloat16>(a.x, scale);
|
||||
tmp.ui8[1] = scaled_vec_conversion<uint8_t, __nv_bfloat16>(a.y, scale);
|
||||
return tmp.ui16;
|
||||
}
|
||||
|
||||
// bf16x4 -> fp8x4
|
||||
template <>
|
||||
__inline__ __device__ uint32_t
|
||||
scaled_vec_conversion<uint32_t, bf16_4_t>(const bf16_4_t& a, float scale) {
|
||||
union {
|
||||
uint16_t ui16[2];
|
||||
uint32_t ui32;
|
||||
} tmp;
|
||||
tmp.ui16[0] = scaled_vec_conversion<uint16_t, __nv_bfloat162>(a.x, scale);
|
||||
tmp.ui16[1] = scaled_vec_conversion<uint16_t, __nv_bfloat162>(a.y, scale);
|
||||
return tmp.ui32;
|
||||
}
|
||||
|
||||
// bf16x8 -> fp8x8
|
||||
template <>
|
||||
__inline__ __device__ uint2
|
||||
scaled_vec_conversion<uint2, bf16_8_t>(const bf16_8_t& a, float scale) {
|
||||
uint2 res;
|
||||
res.x = scaled_vec_conversion<uint32_t, bf16_4_t>({a.x, a.y}, scale);
|
||||
res.y = scaled_vec_conversion<uint32_t, bf16_4_t>({a.z, a.w}, scale);
|
||||
return res;
|
||||
}
|
||||
|
||||
// float -> fp8
|
||||
template <>
|
||||
__inline__ __device__ uint8_t
|
||||
scaled_vec_conversion<uint8_t, float>(const float& a, const float scale) {
|
||||
hip_fp8 f8(a / scale);
|
||||
return f8.data;
|
||||
scaled_vec_conversion<uint8_t, float>(const float& a, float scale) {
|
||||
return __hip_cvt_float_to_fp8(a / scale, fp8_type::__default_saturation,
|
||||
fp8_type::__default_interpret);
|
||||
}
|
||||
|
||||
// fp8x4 -> float4
|
||||
// floatx2 -> fp8x2
|
||||
template <>
|
||||
__inline__ __device__ float4
|
||||
scaled_vec_conversion<float4, uint32_t>(const uint32_t& a, const float scale) {
|
||||
Float4_ tmp = scaled_vec_conversion<Float4_, uint32_t>(a, scale);
|
||||
float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y);
|
||||
return res;
|
||||
__inline__ __device__ uint16_t
|
||||
scaled_vec_conversion<uint16_t, float2>(const float2& a, float scale) {
|
||||
return __hip_cvt_float2_to_fp8x2(a / scale, fp8_type::__default_saturation,
|
||||
fp8_type::__default_interpret);
|
||||
}
|
||||
|
||||
// floatx4 -> fp8x4
|
||||
template <>
|
||||
__inline__ __device__ uint32_t
|
||||
scaled_vec_conversion<uint32_t, float4>(const float4& a, float scale) {
|
||||
union {
|
||||
uint16_t ui16[2];
|
||||
uint32_t ui32;
|
||||
} tmp;
|
||||
tmp.ui16[0] = scaled_vec_conversion<uint16_t, float2>({a.x, a.y}, scale);
|
||||
tmp.ui16[1] = scaled_vec_conversion<uint16_t, float2>({a.z, a.w}, scale);
|
||||
return tmp.ui32;
|
||||
}
|
||||
#endif // ENABLE_FP8
|
||||
|
||||
|
||||
@ -12,7 +12,7 @@ C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX =
|
||||
std::numeric_limits<FP8_TYPE>::max();
|
||||
#else
|
||||
#include <c10/util/Float8_e4m3fnuz.h>
|
||||
#include "amd/hip_float8.h"
|
||||
#include "amd/quant_utils.cuh"
|
||||
using FP8_TYPE = c10::Float8_e4m3fnuz;
|
||||
// Using the default max value from pytorch (240.0) will cause accuracy
|
||||
// issue when running dynamic quantization. Here use 224.0f for rocm.
|
||||
@ -47,8 +47,10 @@ __device__ __forceinline__ FP8_TYPE scaled_fp8_conversion(float const val,
|
||||
return static_cast<c10::Float8_e4m3fn>(r);
|
||||
#else
|
||||
// Use hardware cvt instruction for fp8 on rocm
|
||||
return c10::Float8_e4m3fnuz(hip_fp8(r).data,
|
||||
c10::Float8_e4m3fnuz::from_bits());
|
||||
return c10::Float8_e4m3fnuz(
|
||||
__hip_cvt_float_to_fp8(r, fp8::fp8_type::__default_saturation,
|
||||
fp8::fp8_type::__default_interpret),
|
||||
c10::Float8_e4m3fnuz::from_bits());
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -37,6 +37,8 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t *
|
||||
return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
|
||||
}
|
||||
|
||||
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
|
||||
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
|
||||
|
||||
#define VDR_Q4_0_Q8_1_MMVQ 2
|
||||
#define VDR_Q4_0_Q8_1_MMQ 4
|
||||
|
||||
@ -302,6 +302,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
"SymInt size_k) -> Tensor");
|
||||
// conditionally compiled so impl registration is in source file
|
||||
|
||||
// CUTLASS nvfp4 block scaled GEMM
|
||||
ops.def(
|
||||
"cutlass_scaled_fp4_mm(Tensor! out, Tensor a, Tensor b,"
|
||||
" Tensor block_scale_a, Tensor block_scale_b,"
|
||||
" Tensor alpha) -> ()");
|
||||
ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm);
|
||||
|
||||
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
|
||||
// quantization, as well as bias
|
||||
ops.def(
|
||||
@ -493,6 +500,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
||||
"convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
|
||||
"str kv_cache_dtype) -> ()");
|
||||
cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8);
|
||||
|
||||
// Gather cache blocks from src_cache to dst.
|
||||
cache_ops.def(
|
||||
"gather_cache(Tensor src_cache, Tensor! dst, Tensor block_table, "
|
||||
"Tensor cu_seq_lens, int batch_size, Tensor? seq_starts) -> ()");
|
||||
cache_ops.impl("gather_cache", torch::kCUDA, &gather_cache);
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
|
||||
|
||||
@ -74,8 +74,6 @@ def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
kv_caches: List[torch.Tensor],
|
||||
attn_metadata: AttentionMetadata,
|
||||
) -> torch.Tensor:
|
||||
...
|
||||
```
|
||||
|
||||
@ -16,8 +16,6 @@ Further update the model as follows:
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
kv_caches: List[torch.Tensor],
|
||||
attn_metadata: AttentionMetadata,
|
||||
+ pixel_values: torch.Tensor,
|
||||
) -> SamplerOutput:
|
||||
```
|
||||
@ -722,13 +720,13 @@ def _get_mm_fields_config(
|
||||
|
||||
:::::
|
||||
|
||||
### Prompt replacements
|
||||
### Prompt updates
|
||||
|
||||
Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements` to
|
||||
return a list of {class}`~vllm.multimodal.processing.PromptReplacement` instances.
|
||||
Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` to
|
||||
return a list of {class}`~vllm.multimodal.processing.PromptUpdate` instances.
|
||||
|
||||
Each {class}`~vllm.multimodal.processing.PromptReplacement` instance specifies a find-and-replace
|
||||
operation performed by the HF processor.
|
||||
Each {class}`~vllm.multimodal.processing.PromptUpdate` instance specifies an update operation
|
||||
(e.g.: insertion, replacement) performed by the HF processor.
|
||||
|
||||
::::{tab-set}
|
||||
:::{tab-item} Basic example: LLaVA
|
||||
@ -745,15 +743,15 @@ for sample in text:
|
||||
```
|
||||
|
||||
It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
|
||||
Based on this, we override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements` as follows:
|
||||
Based on this, we override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` as follows:
|
||||
|
||||
```python
|
||||
def _get_prompt_replacements(
|
||||
def _get_prompt_updates(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
) -> list[PromptReplacement]:
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_config = self.info.get_hf_config()
|
||||
image_token_id = hf_config.image_token_index
|
||||
|
||||
@ -861,7 +859,7 @@ prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
|
||||
)
|
||||
```
|
||||
|
||||
To accommodate this, instead of a string you can return an instance of `PromptReplacementDetails`
|
||||
To accommodate this, instead of a string you can return an instance of `PromptUpdateDetails`
|
||||
with different `full` and `feature` attributes:
|
||||
|
||||
```python
|
||||
@ -880,7 +878,7 @@ def get_replacement_fuyu(item_idx: int):
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
||||
[_NEWLINE_TOKEN_ID]) * nrows
|
||||
|
||||
return PromptReplacementDetails(
|
||||
return PromptUpdateDetails(
|
||||
full=image_tokens + [bos_token_id],
|
||||
features=image_tokens,
|
||||
)
|
||||
@ -890,12 +888,12 @@ Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the
|
||||
we can search for it to conduct the replacement at the start of the string:
|
||||
|
||||
```python
|
||||
def _get_prompt_replacements(
|
||||
def _get_prompt_updates(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
) -> list[PromptReplacement]:
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_config = self.info.get_hf_config()
|
||||
bos_token_id = hf_config.bos_token_id
|
||||
assert isinstance(bos_token_id, int)
|
||||
@ -915,7 +913,7 @@ def _get_prompt_replacements(
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
||||
[_NEWLINE_TOKEN_ID]) * nrows
|
||||
|
||||
return PromptReplacementDetails(
|
||||
return PromptUpdateDetails(
|
||||
full=image_tokens + [bos_token_id],
|
||||
features=image_tokens,
|
||||
)
|
||||
|
||||
@ -145,6 +145,9 @@ review process:
|
||||
- Please respond to all comments within a reasonable time frame. If a comment
|
||||
isn't clear or you disagree with a suggestion, feel free to ask for
|
||||
clarification or discuss the suggestion.
|
||||
- Note that not all CI checks will be executed due to limited computational
|
||||
resources. The reviewer will add `ready` label to the PR when the PR is
|
||||
ready to merge or a full CI run is needed.
|
||||
|
||||
## Thank You
|
||||
|
||||
|
||||
@ -27,6 +27,36 @@ container to access the host's shared memory. vLLM uses PyTorch, which uses shar
|
||||
memory to share data between processes under the hood, particularly for tensor parallel inference.
|
||||
:::
|
||||
|
||||
:::{note}
|
||||
Optional dependencies are not included in order to avoid licensing issues (e.g. <gh-issue:8030>).
|
||||
|
||||
If you need to use those dependencies (having accepted the license terms),
|
||||
create a custom Dockerfile on top of the base image with an extra layer that installs them:
|
||||
|
||||
```Dockerfile
|
||||
FROM vllm/vllm-openai:v0.7.3
|
||||
|
||||
# e.g. install the `audio` and `video` optional dependencies
|
||||
# NOTE: Make sure the version of vLLM matches the base image!
|
||||
RUN uv pip install --system vllm[audio,video]==0.7.3
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
:::{tip}
|
||||
Some new models may only be available on the main branch of [HF Transformers](https://github.com/huggingface/transformers).
|
||||
|
||||
To use the development version of `transformers`, create a custom Dockerfile on top of the base image
|
||||
with an extra layer that installs their code from source:
|
||||
|
||||
```Dockerfile
|
||||
FROM vllm/vllm-openai:latest
|
||||
|
||||
RUN uv pip install --system git+https://github.com/huggingface/transformers.git
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
(deployment-docker-build-image-from-source)=
|
||||
|
||||
## Building vLLM's Docker Image from Source
|
||||
|
||||
@ -6,4 +6,5 @@
|
||||
kserve
|
||||
kubeai
|
||||
llamastack
|
||||
llmaz
|
||||
:::
|
||||
|
||||
7
docs/source/deployment/integrations/llmaz.md
Normal file
7
docs/source/deployment/integrations/llmaz.md
Normal file
@ -0,0 +1,7 @@
|
||||
(deployment-llmaz)=
|
||||
|
||||
# llmaz
|
||||
|
||||
[llmaz](https://github.com/InftyAI/llmaz) is an easy-to-use and advanced inference platform for large language models on Kubernetes, aimed for production use. It uses vLLM as the default model serving backend.
|
||||
|
||||
Please refer to the [Quick Start](https://github.com/InftyAI/llmaz?tab=readme-ov-file#quick-start) for more details.
|
||||
@ -6,11 +6,16 @@ To enable various optimizations in vLLM such as [chunked prefill](#chunked-prefi
|
||||
|
||||
Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`:
|
||||
|
||||
## Prompt Replacement Detection
|
||||
## Prompt Update Detection
|
||||
|
||||
One of the main responsibilies of HF processor is to replace input placeholder tokens (e.g. `<image>` for a single image) with feature placeholder tokens (e.g. `<image><image>...<image>`, the number of which equals to the feature size). The information about which tokens have been replaced is key to finding the correspondence between placeholder feature tokens and multi-modal inputs.
|
||||
One of the main responsibilies of HF processor is to update the prompt with placeholder tokens. For example:
|
||||
|
||||
In vLLM, this information is specified using {class}`~vllm.multimodal.processing.PromptReplacement` in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements`. Given this specification, we can automatically detect whether HF has replaced the input placeholder tokens by checking whether the feature placeholder tokens exist in the prompt.
|
||||
- Insert feature placeholder tokens (e.g. `<image><image>...<image>`, the number of which equals to the feature size) at the start of the string.
|
||||
- Replace existing input placeholder tokens (e.g. `<image>` for a single image) with feature placeholder tokens (e.g. `<image><image>...<image>`, the number of which equals to the feature size).
|
||||
|
||||
The information about which tokens have been updated is key to finding the correspondence between placeholder feature tokens and multi-modal inputs.
|
||||
|
||||
In vLLM, this information is specified using {class}`~vllm.multimodal.processing.PromptUpdate` in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates`. We can automatically detect whether HF has updated the prompt by checking the existence of the updated tokens.
|
||||
|
||||
## Tokenized Prompt Inputs
|
||||
|
||||
@ -22,7 +27,7 @@ Consider that HF processors follow these main steps:
|
||||
|
||||
1. Tokenize the text
|
||||
2. Process multi-modal inputs
|
||||
3. Perform prompt replacement
|
||||
3. Perform prompt updates
|
||||
|
||||
And we require that:
|
||||
|
||||
@ -44,16 +49,16 @@ Moreover, since the tokenized text has not passed through the HF processor, we h
|
||||
|
||||
We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
|
||||
|
||||
(mm-automatic-prompt-replacement)=
|
||||
(mm-automatic-prompt-updating)=
|
||||
|
||||
### Automatic prompt replacement
|
||||
### Automatic prompt updating
|
||||
|
||||
We address the second issue by implementing model-agnostic code in
|
||||
{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_prompt_replacements` to automatically replace input placeholder tokens with feature placeholder tokens based on the specification outputted by {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements`.
|
||||
{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_prompt_updates` to automatically update the prompt with feature placeholder tokens based on the specification outputted by {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates`.
|
||||
|
||||
### Summary
|
||||
|
||||
With the help of dummy text and automatic prompt replacement, our multi-modal processor can finally accept both text and token prompts with multi-modal data. The detailed logic is shown in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_main`.
|
||||
With the help of dummy text and automatic prompt updating, our multi-modal processor can finally accept both text and token prompts with multi-modal data. The detailed logic is shown in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_main`.
|
||||
|
||||
## Processor Output Caching
|
||||
|
||||
@ -61,4 +66,4 @@ Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238)
|
||||
|
||||
When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
|
||||
|
||||
Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#mm-dummy-text) to avoid HF errors. Since this skips HF's prompt replacement code, we apply [automatic prompt replacement](#mm-automatic-prompt-replacement) afterwards to keep the output tokens and multi-modal data consistent with each other.
|
||||
Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#mm-dummy-text) to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating](#mm-automatic-prompt-updating) afterwards to keep the output tokens and multi-modal data consistent with each other.
|
||||
|
||||
@ -170,7 +170,7 @@ Now, you can specify a base_model_name alongside the name and path using JSON fo
|
||||
|
||||
To provide the backward compatibility support, you can still use the old key-value format (name=path), but the `base_model_name` will remain unspecified in that case.
|
||||
|
||||
## Lora model lineage in model card
|
||||
## LoRA model lineage in model card
|
||||
|
||||
The new format of `--lora-modules` is mainly to support the display of parent model information in the model card. Here's an explanation of how your current response supports this:
|
||||
|
||||
|
||||
@ -6,13 +6,13 @@ To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github
|
||||
Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%.
|
||||
The main benefits are lower latency and memory usage.
|
||||
|
||||
You can quantize your own models by installing AutoAWQ or picking one of the [400+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq).
|
||||
You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq).
|
||||
|
||||
```console
|
||||
pip install autoawq
|
||||
```
|
||||
|
||||
After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
|
||||
After installing AutoAWQ, you are ready to quantize a model. Please refer to the `AutoAWQ documentation <https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization>`_ for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
|
||||
|
||||
```python
|
||||
from awq import AutoAWQForCausalLM
|
||||
|
||||
@ -29,6 +29,13 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlam
|
||||
We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size.
|
||||
:::
|
||||
|
||||
GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-confing-path
|
||||
|
||||
```console
|
||||
# If you model is not supported by huggingface you can manually provide a huggingface compatible config path
|
||||
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --hf-config-path Tinyllama/TInyLlama-1.1B-Chat-v1.0
|
||||
```
|
||||
|
||||
You can also use the GGUF model directly through the LLM entrypoint:
|
||||
|
||||
```python
|
||||
|
||||
@ -16,7 +16,7 @@ The following parameters are supported, which must be added as extra parameters:
|
||||
- `guided_json`: the output will follow the JSON schema.
|
||||
- `guided_grammar`: the output will follow the context free grammar.
|
||||
- `guided_whitespace_pattern`: used to override the default whitespace pattern for guided json decoding.
|
||||
- `guided_decoding_backend`: used to select the guided decoding backend to use.
|
||||
- `guided_decoding_backend`: used to select the guided decoding backend to use. Additional backend-specific options can be supplied in a comma separated list following a colon after the backend name. For example `"xgrammar:no-fallback"` will not allow vLLM to fallback to a different backend on error.
|
||||
|
||||
You can see the complete list of supported parameters on the [OpenAI-Compatible Server](#openai-compatible-server)page.
|
||||
|
||||
|
||||
@ -23,12 +23,12 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I
|
||||
You can install vLLM using either `pip` or `uv pip`:
|
||||
|
||||
```console
|
||||
# Install vLLM with CUDA 12.1.
|
||||
# Install vLLM with CUDA 12.4.
|
||||
pip install vllm # If you are using pip.
|
||||
uv pip install vllm # If you are using uv.
|
||||
```
|
||||
|
||||
As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
|
||||
As of now, vLLM's binaries are compiled with CUDA 12.4 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.1, 11.8, and public PyTorch release versions:
|
||||
|
||||
```console
|
||||
# Install vLLM with CUDA 11.8.
|
||||
|
||||
@ -24,6 +24,12 @@ source myenv/bin/activate
|
||||
uv pip install vllm
|
||||
```
|
||||
|
||||
Another delightful way is to use `uv run` with `--with [dependency]` option, which allows you to run commands such as `vllm serve` without creating an environment:
|
||||
|
||||
```console
|
||||
uv run --with vllm vllm --help
|
||||
```
|
||||
|
||||
You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
|
||||
|
||||
```console
|
||||
@ -184,3 +190,13 @@ chat_response = client.chat.completions.create(
|
||||
)
|
||||
print("Chat response:", chat_response)
|
||||
```
|
||||
|
||||
## On Attention Backends
|
||||
|
||||
Currently, vLLM supports multiple backends for efficient Attention computation across different platforms and accelerator architectures. It automatically selects the most performant backend compatible with your system and model specifications.
|
||||
|
||||
If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.
|
||||
|
||||
```{attention}
|
||||
There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see [Dockerfile](https://github.com/vllm-project/vllm/blob/main/Dockerfile) for instructions on how to install it.
|
||||
```
|
||||
|
||||
@ -108,8 +108,7 @@ A code example can be found here: <gh-file:examples/offline_inference/basic/clas
|
||||
### `LLM.score`
|
||||
|
||||
The {class}`~vllm.LLM.score` method outputs similarity scores between sentence pairs.
|
||||
It is primarily designed for [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html).
|
||||
These types of models serve as rerankers between candidate query-document pairs in RAG systems.
|
||||
It is designed for embedding models and cross encoder models. Embedding models use cosine similarity, and [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html) serve as rerankers between candidate query-document pairs in RAG systems.
|
||||
|
||||
:::{note}
|
||||
vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG.
|
||||
|
||||
@ -286,6 +286,11 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* `parasail-ai/GritLM-7B-vllm`.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `Grok1ModelForCausalLM`
|
||||
* Grok1
|
||||
* `hpcai-tech/grok-1`.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `InternLMForCausalLM`
|
||||
* InternLM
|
||||
* `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.
|
||||
@ -710,6 +715,13 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
*
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `Florence2ForConditionalGeneration`
|
||||
* Florence-2
|
||||
* T + I
|
||||
* `microsoft/Florence-2-base`, `microsoft/Florence-2-large` etc.
|
||||
*
|
||||
*
|
||||
*
|
||||
- * `FuyuForCausalLM`
|
||||
* Fuyu
|
||||
* T + I
|
||||
|
||||
@ -36,3 +36,11 @@ The following metrics are exposed:
|
||||
:language: python
|
||||
:start-after: begin-metrics-definitions
|
||||
:::
|
||||
|
||||
The following metrics are deprecated and due to be removed in a future version:
|
||||
|
||||
- *(No metrics are currently deprecated)*
|
||||
|
||||
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
|
||||
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
|
||||
and are then removed in version `X.Y+2`.
|
||||
|
||||
@ -16,7 +16,7 @@ To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`
|
||||
- `prompt`: The prompt should follow the format that is documented on HuggingFace.
|
||||
- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.inputs.MultiModalDataDict`.
|
||||
|
||||
### Image
|
||||
### Image Inputs
|
||||
|
||||
You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
|
||||
|
||||
@ -120,20 +120,20 @@ for o in outputs:
|
||||
print(generated_text)
|
||||
```
|
||||
|
||||
### Video
|
||||
### Video Inputs
|
||||
|
||||
You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
|
||||
instead of using multi-image input.
|
||||
|
||||
Full example: <gh-file:examples/offline_inference/vision_language.py>
|
||||
|
||||
### Audio
|
||||
### Audio Inputs
|
||||
|
||||
You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
|
||||
|
||||
Full example: <gh-file:examples/offline_inference/audio_language.py>
|
||||
|
||||
### Embedding
|
||||
### Embedding Inputs
|
||||
|
||||
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
|
||||
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
|
||||
@ -211,7 +211,7 @@ The chat template can be inferred based on the documentation on the model's Hugg
|
||||
For example, LLaVA-1.5 (`llava-hf/llava-1.5-7b-hf`) requires a chat template that can be found here: <gh-file:examples/template_llava.jinja>
|
||||
:::
|
||||
|
||||
### Image
|
||||
### Image Inputs
|
||||
|
||||
Image input is supported according to [OpenAI Vision API](https://platform.openai.com/docs/guides/vision).
|
||||
Here is a simple example using Phi-3.5-Vision.
|
||||
@ -293,7 +293,7 @@ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
|
||||
|
||||
:::
|
||||
|
||||
### Video
|
||||
### Video Inputs
|
||||
|
||||
Instead of `image_url`, you can pass a video file via `video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf).
|
||||
|
||||
@ -356,7 +356,7 @@ export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
|
||||
|
||||
:::
|
||||
|
||||
### Audio
|
||||
### Audio Inputs
|
||||
|
||||
Audio input is supported according to [OpenAI Audio API](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in).
|
||||
Here is a simple example using Ultravox-v0.5-1B.
|
||||
@ -460,77 +460,6 @@ export VLLM_AUDIO_FETCH_TIMEOUT=<timeout>
|
||||
|
||||
:::
|
||||
|
||||
### Embedding
|
||||
### Embedding Inputs
|
||||
|
||||
vLLM's Embeddings API is a superset of OpenAI's [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings),
|
||||
where a list of chat `messages` can be passed instead of batched `inputs`. This enables multi-modal inputs to be passed to embedding models.
|
||||
|
||||
:::{tip}
|
||||
The schema of `messages` is exactly the same as in Chat Completions API.
|
||||
You can refer to the above tutorials for more details on how to pass each type of multi-modal data.
|
||||
:::
|
||||
|
||||
Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images.
|
||||
Refer to the examples below for illustration.
|
||||
|
||||
Here is an end-to-end example using VLM2Vec. To serve the model:
|
||||
|
||||
```bash
|
||||
vllm serve TIGER-Lab/VLM2Vec-Full --task embed \
|
||||
--trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja
|
||||
```
|
||||
|
||||
:::{important}
|
||||
Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed`
|
||||
to run this model in embedding mode instead of text generation mode.
|
||||
|
||||
The custom chat template is completely different from the original one for this model,
|
||||
and can be found here: <gh-file:examples/template_vlm2vec.jinja>
|
||||
:::
|
||||
|
||||
Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
|
||||
response = requests.post(
|
||||
"http://localhost:8000/v1/embeddings",
|
||||
json={
|
||||
"model": "TIGER-Lab/VLM2Vec-Full",
|
||||
"messages": [{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{"type": "text", "text": "Represent the given image."},
|
||||
],
|
||||
}],
|
||||
"encoding_format": "float",
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
response_json = response.json()
|
||||
print("Embedding output:", response_json["data"][0]["embedding"])
|
||||
```
|
||||
|
||||
Below is another example, this time using the `MrLight/dse-qwen2-2b-mrl-v1` model.
|
||||
|
||||
```bash
|
||||
vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \
|
||||
--trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja
|
||||
```
|
||||
|
||||
:::{important}
|
||||
Like with VLM2Vec, we have to explicitly pass `--task embed`.
|
||||
|
||||
Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
|
||||
by a custom chat template: <gh-file:examples/template_dse_qwen2_vl.jinja>
|
||||
:::
|
||||
|
||||
:::{important}
|
||||
Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
|
||||
example below for details.
|
||||
:::
|
||||
|
||||
Full example: <gh-file:examples/online_serving/openai_chat_embedding_client_for_multimodal.py>
|
||||
TBD
|
||||
|
||||
@ -51,7 +51,7 @@ In addition, we have the following custom APIs:
|
||||
- [Pooling API](#pooling-api) (`/pooling`)
|
||||
- Applicable to all [pooling models](../models/pooling_models.md).
|
||||
- [Score API](#score-api) (`/score`)
|
||||
- Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`).
|
||||
- Applicable to embedding models and [cross-encoder models](../models/pooling_models.md) (`--task score`).
|
||||
- [Re-rank API](#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
|
||||
- Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/)
|
||||
- Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank)
|
||||
@ -266,11 +266,85 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
|
||||
If the model has a [chat template](#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api))
|
||||
which will be treated as a single prompt to the model.
|
||||
|
||||
:::{tip}
|
||||
This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details.
|
||||
Code example: <gh-file:examples/online_serving/openai_embedding_client.py>
|
||||
|
||||
#### Multi-modal inputs
|
||||
|
||||
You can pass multi-modal inputs to embedding models by defining a custom chat template for the server
|
||||
and passing a list of `messages` in the request. Refer to the examples below for illustration.
|
||||
|
||||
:::::{tab-set}
|
||||
::::{tab-item} VLM2Vec
|
||||
|
||||
To serve the model:
|
||||
|
||||
```bash
|
||||
vllm serve TIGER-Lab/VLM2Vec-Full --task embed \
|
||||
--trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja
|
||||
```
|
||||
|
||||
:::{important}
|
||||
Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed`
|
||||
to run this model in embedding mode instead of text generation mode.
|
||||
|
||||
The custom chat template is completely different from the original one for this model,
|
||||
and can be found here: <gh-file:examples/template_vlm2vec.jinja>
|
||||
:::
|
||||
|
||||
Code example: <gh-file:examples/online_serving/openai_embedding_client.py>
|
||||
Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
|
||||
response = requests.post(
|
||||
"http://localhost:8000/v1/embeddings",
|
||||
json={
|
||||
"model": "TIGER-Lab/VLM2Vec-Full",
|
||||
"messages": [{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{"type": "text", "text": "Represent the given image."},
|
||||
],
|
||||
}],
|
||||
"encoding_format": "float",
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
response_json = response.json()
|
||||
print("Embedding output:", response_json["data"][0]["embedding"])
|
||||
```
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} DSE-Qwen2-MRL
|
||||
|
||||
To serve the model:
|
||||
|
||||
```bash
|
||||
vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \
|
||||
--trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja
|
||||
```
|
||||
|
||||
:::{important}
|
||||
Like with VLM2Vec, we have to explicitly pass `--task embed`.
|
||||
|
||||
Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
|
||||
by a custom chat template: <gh-file:examples/template_dse_qwen2_vl.jinja>
|
||||
:::
|
||||
|
||||
:::{important}
|
||||
`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
|
||||
example below for details.
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
Full example: <gh-file:examples/online_serving/openai_chat_embedding_client_for_multimodal.py>
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
@ -333,10 +407,10 @@ Code example: <gh-file:examples/online_serving/openai_pooling_client.py>
|
||||
|
||||
### Score API
|
||||
|
||||
Our Score API applies a cross-encoder model to predict scores for sentence pairs.
|
||||
Our Score API can apply a cross-encoder model or an embedding model to predict scores for sentence pairs. When using an embedding model the score corresponds to the cosine similarity between each embedding pair.
|
||||
Usually, the score for a sentence pair refers to the similarity between two sentences, on a scale of 0 to 1.
|
||||
|
||||
You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
|
||||
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
|
||||
|
||||
Code example: <gh-file:examples/online_serving/openai_cross_encoder_score.py>
|
||||
|
||||
@ -496,11 +570,11 @@ The following extra parameters are supported:
|
||||
|
||||
### Re-rank API
|
||||
|
||||
Our Re-rank API applies a cross-encoder model to predict relevant scores between a single query, and
|
||||
Our Re-rank API can apply an embedding model or a cross-encoder model to predict relevant scores between a single query, and
|
||||
each of a list of documents. Usually, the score for a sentence pair refers to the similarity between two sentences, on
|
||||
a scale of 0 to 1.
|
||||
|
||||
You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
|
||||
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
|
||||
|
||||
The rerank endpoints support popular re-rank models such as `BAAI/bge-reranker-base` and other models supporting the
|
||||
`score` task. Additionally, `/rerank`, `/v1/rerank`, and `/v2/rerank`
|
||||
|
||||
65
examples/offline_inference/cpu_offload_lmcache.py
Normal file
65
examples/offline_inference/cpu_offload_lmcache.py
Normal file
@ -0,0 +1,65 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""
|
||||
This file demonstrates the example usage of cpu offloading
|
||||
with LMCache.
|
||||
|
||||
Note that `pip install lmcache` is needed to run this example.
|
||||
Learn more about LMCache in https://github.com/LMCache/LMCache.
|
||||
"""
|
||||
import os
|
||||
import time
|
||||
|
||||
from lmcache.experimental.cache_engine import LMCacheEngineBuilder
|
||||
from lmcache.integration.vllm.utils import ENGINE_NAME
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import KVTransferConfig
|
||||
|
||||
# LMCache-related environment variables
|
||||
# Use experimental features in LMCache
|
||||
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
|
||||
# LMCache is set to use 256 tokens per chunk
|
||||
os.environ["LMCACHE_CHUNK_SIZE"] = "256"
|
||||
# Enable local CPU backend in LMCache
|
||||
os.environ["LMCACHE_LOCAL_CPU"] = "True"
|
||||
# Set local CPU memory limit to 5.0 GB
|
||||
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
|
||||
|
||||
# This example script runs two requests with a shared prefix.
|
||||
shared_prompt = "Hello, how are you?" * 1000
|
||||
first_prompt = [
|
||||
shared_prompt + "Hello, my name is",
|
||||
]
|
||||
second_prompt = [
|
||||
shared_prompt + "Tell me a very long story",
|
||||
]
|
||||
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
|
||||
|
||||
ktc = KVTransferConfig.from_cli(
|
||||
'{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
|
||||
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
|
||||
# memory. Reduce the value if your GPU has less memory.
|
||||
# Note that LMCache is not compatible with chunked prefill for now.
|
||||
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
|
||||
kv_transfer_config=ktc,
|
||||
max_model_len=8000,
|
||||
enable_chunked_prefill=False,
|
||||
gpu_memory_utilization=0.8)
|
||||
|
||||
outputs = llm.generate(first_prompt, sampling_params)
|
||||
for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Generated text: {generated_text!r}")
|
||||
print("First request done.")
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
outputs = llm.generate(second_prompt, sampling_params)
|
||||
for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Generated text: {generated_text!r}")
|
||||
print("Second request done.")
|
||||
|
||||
# Clean up lmcache backend
|
||||
LMCacheEngineBuilder.destroy(ENGINE_NAME)
|
||||
77
examples/offline_inference/data_parallel.py
Normal file
77
examples/offline_inference/data_parallel.py
Normal file
@ -0,0 +1,77 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# usage: VLLM_USE_V1=1 python examples/offline_inference/data_parallel.py
|
||||
# we need to have a launcher to create multiple data parallel
|
||||
# ranks. And each rank will create a vLLM instance to process its own prompts.
|
||||
import os
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.utils import get_open_port
|
||||
|
||||
|
||||
def main(dp_size, dp_rank, dp_master_ip, dp_master_port, GPUs_per_dp_rank):
|
||||
os.environ["VLLM_DP_RANK"] = str(dp_rank)
|
||||
os.environ["VLLM_DP_SIZE"] = str(dp_size)
|
||||
os.environ["VLLM_DP_MASTER_IP"] = dp_master_ip
|
||||
os.environ["VLLM_DP_MASTER_PORT"] = str(dp_master_port)
|
||||
# set devices for each dp_rank
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
|
||||
str(i) for i in range(dp_rank * GPUs_per_dp_rank, (dp_rank + 1) *
|
||||
GPUs_per_dp_rank))
|
||||
|
||||
# Sample prompts.
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
# with DP, each rank should process different prompts.
|
||||
# usually all the DP ranks process a full dataset,
|
||||
# and each rank processes a different part of the dataset.
|
||||
promts_per_rank = len(prompts) // dp_size
|
||||
start = dp_rank * promts_per_rank
|
||||
end = start + promts_per_rank
|
||||
prompts = prompts[start:end]
|
||||
if len(prompts) == 0:
|
||||
# if any rank has no prompts to process,
|
||||
# we need to set a placeholder prompt
|
||||
prompts = ["Placeholder"]
|
||||
print(f"DP rank {dp_rank} needs to process {len(prompts)} prompts")
|
||||
|
||||
# Create a sampling params object.
|
||||
# since we are doing data parallel, every rank can have different
|
||||
# sampling params. here we set different max_tokens for different
|
||||
# ranks for demonstration.
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=16 * (dp_rank + 1))
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(model="facebook/opt-125m",
|
||||
tensor_parallel_size=2,
|
||||
enforce_eager=True)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
# Print the outputs.
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"DP rank {dp_rank}, Prompt: {prompt!r}, "
|
||||
f"Generated text: {generated_text!r}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from multiprocessing import Process
|
||||
dp_size = 2
|
||||
GPUs_per_dp_rank = 2
|
||||
dp_master_ip = "127.0.0.1"
|
||||
dp_master_port = get_open_port()
|
||||
procs = []
|
||||
for i in range(dp_size):
|
||||
proc = Process(target=main,
|
||||
args=(dp_size, i, dp_master_ip, dp_master_port,
|
||||
GPUs_per_dp_rank))
|
||||
proc.start()
|
||||
procs.append(proc)
|
||||
for proc in procs:
|
||||
proc.join()
|
||||
130
examples/offline_inference/disaggregated_prefill_lmcache.py
Normal file
130
examples/offline_inference/disaggregated_prefill_lmcache.py
Normal file
@ -0,0 +1,130 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""
|
||||
This file demonstrates the example usage of disaggregated prefilling
|
||||
with LMCache.
|
||||
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
|
||||
and launch an additional LMCache server.
|
||||
KV cache is transferred in the following manner:
|
||||
VLLM prefill node -> LMCache server -> VLLM decode node.
|
||||
|
||||
Note that `pip install lmcache` is needed to run this example.
|
||||
Learn more about LMCache in https://github.com/LMCache/LMCache.
|
||||
"""
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
from multiprocessing import Event, Process
|
||||
|
||||
from lmcache.experimental.cache_engine import LMCacheEngineBuilder
|
||||
from lmcache.integration.vllm.utils import ENGINE_NAME
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import KVTransferConfig
|
||||
|
||||
# LMCache-related environment variables
|
||||
# The port to start LMCache server
|
||||
port = 8100
|
||||
# Use experimental features in LMCache
|
||||
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
|
||||
# LMCache is set to use 256 tokens per chunk
|
||||
os.environ["LMCACHE_CHUNK_SIZE"] = "256"
|
||||
# Disable local CPU backend in LMCache
|
||||
os.environ["LMCACHE_LOCAL_CPU"] = "False"
|
||||
# Set local CPU memory buffer limit to 5.0 GB
|
||||
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
|
||||
# Set the remote URL for LMCache server
|
||||
os.environ["LMCACHE_REMOTE_URL"] = f"lm://localhost:{port}"
|
||||
# Set the serializer/deserializer between vllm and LMCache server
|
||||
# `naive` indicates using raw bytes of the tensor without any compression
|
||||
os.environ["LMCACHE_REMOTE_SERDE"] = "naive"
|
||||
|
||||
|
||||
def run_prefill(prefill_done, prompts):
|
||||
# We use GPU 0 for prefill node.
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
||||
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
|
||||
|
||||
ktc = KVTransferConfig.from_cli(
|
||||
'{"kv_connector":"LMCacheConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
|
||||
)
|
||||
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
|
||||
# memory. Reduce the value if your GPU has less memory.
|
||||
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
|
||||
kv_transfer_config=ktc,
|
||||
max_model_len=8000,
|
||||
gpu_memory_utilization=0.8,
|
||||
enforce_eager=True)
|
||||
|
||||
#llm.generate(prompts, sampling_params)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Generated text: {generated_text!r}")
|
||||
print("Prefill node is finished.")
|
||||
prefill_done.set()
|
||||
|
||||
# Clean up lmcache backend
|
||||
LMCacheEngineBuilder.destroy(ENGINE_NAME)
|
||||
|
||||
|
||||
def run_decode(prefill_done, prompts, timeout=1):
|
||||
# We use GPU 1 for decode node.
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
|
||||
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
|
||||
|
||||
ktc = KVTransferConfig.from_cli(
|
||||
'{"kv_connector":"LMCacheConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
|
||||
)
|
||||
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
|
||||
# of memory. Reduce the value if your GPU has less memory.
|
||||
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
|
||||
kv_transfer_config=ktc,
|
||||
max_model_len=8000,
|
||||
gpu_memory_utilization=0.8,
|
||||
enforce_eager=True)
|
||||
|
||||
print("Waiting for prefill node to finish...")
|
||||
prefill_done.wait()
|
||||
time.sleep(timeout)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Generated text: {generated_text!r}")
|
||||
|
||||
# Clean up lmcache backend
|
||||
LMCacheEngineBuilder.destroy(ENGINE_NAME)
|
||||
|
||||
|
||||
def run_lmcache_server(port):
|
||||
server_proc = subprocess.Popen([
|
||||
"python", "-m", "lmcache.experimental.server", "localhost",
|
||||
str(port)
|
||||
])
|
||||
return server_proc
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
prompts = [
|
||||
"Hello, how are you?" * 1000,
|
||||
]
|
||||
|
||||
prefill_done = Event()
|
||||
prefill_process = Process(target=run_prefill, args=(prefill_done, prompts))
|
||||
decode_process = Process(target=run_decode, args=(prefill_done, prompts))
|
||||
lmcache_server_process = run_lmcache_server(port)
|
||||
|
||||
# Start prefill node
|
||||
prefill_process.start()
|
||||
|
||||
# Start decode node
|
||||
decode_process.start()
|
||||
|
||||
# Clean up the processes
|
||||
decode_process.join()
|
||||
prefill_process.terminate()
|
||||
lmcache_server_process.terminate()
|
||||
lmcache_server_process.wait()
|
||||
@ -1,34 +1,45 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
'''
|
||||
"""
|
||||
Demonstrate prompting of text-to-text
|
||||
encoder/decoder models, specifically Florence-2
|
||||
'''
|
||||
"""
|
||||
# TODO(Isotr0py):
|
||||
# Move to offline_inference/vision_language.py
|
||||
# after porting vision backbone
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
dtype = "float"
|
||||
from vllm.assets.image import ImageAsset
|
||||
|
||||
# Create a Florence-2 encoder/decoder model instance
|
||||
llm = LLM(
|
||||
model="microsoft/Florence-2-base",
|
||||
tokenizer="facebook/bart-base",
|
||||
dtype=dtype,
|
||||
model="microsoft/Florence-2-large",
|
||||
tokenizer="facebook/bart-large",
|
||||
max_num_seqs=8,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
prompts = [
|
||||
"<CAPTION>", "<DETAILED_CAPTION>", "<MORE_DETAILED_CAPTION>",
|
||||
"<CAPTION_TO_PHRASE_GROUNDING>", "<OD>", "<DENSE_REGION_CAPTION>",
|
||||
"<REGION_PROPOSAL>", "<OCR>", "<OCR_WITH_REGION>"
|
||||
{ # implicit prompt with task token
|
||||
"prompt": "<DETAILED_CAPTION>",
|
||||
"multi_modal_data": {
|
||||
"image": ImageAsset("stop_sign").pil_image
|
||||
},
|
||||
},
|
||||
{ # explicit encoder/decoder prompt
|
||||
"encoder_prompt": {
|
||||
"prompt": "Describe in detail what is shown in the image.",
|
||||
"multi_modal_data": {
|
||||
"image": ImageAsset("cherry_blossom").pil_image
|
||||
},
|
||||
},
|
||||
"decoder_prompt": "",
|
||||
},
|
||||
]
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
top_p=1.0,
|
||||
min_tokens=0,
|
||||
max_tokens=20,
|
||||
max_tokens=128,
|
||||
)
|
||||
|
||||
# Generate output tokens from the prompts. The output is a list of
|
||||
@ -38,9 +49,5 @@ outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
# Print the outputs.
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
encoder_prompt = output.encoder_prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Encoder prompt: {encoder_prompt!r}, "
|
||||
f"Decoder prompt: {prompt!r}, "
|
||||
f"Generated text: {generated_text!r}")
|
||||
print(f"Generated text: {generated_text!r}")
|
||||
|
||||
@ -82,6 +82,22 @@ def run_deepseek_vl2(question: str, modality: str):
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# Florence2
|
||||
def run_florence2(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
llm = LLM(model="microsoft/Florence-2-large",
|
||||
tokenizer="facebook/bart-large",
|
||||
max_num_seqs=8,
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
|
||||
prompt = "<MORE_DETAILED_CAPTION>"
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# Fuyu
|
||||
def run_fuyu(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
@ -571,6 +587,7 @@ model_example_map = {
|
||||
"blip-2": run_blip2,
|
||||
"chameleon": run_chameleon,
|
||||
"deepseek_vl_v2": run_deepseek_vl2,
|
||||
"florence2": run_florence2,
|
||||
"fuyu": run_fuyu,
|
||||
"glm4v": run_glm4v,
|
||||
"h2ovl_chat": run_h2ovl,
|
||||
|
||||
@ -439,7 +439,7 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
|
||||
image_data = [fetch_image(url) for url in image_urls]
|
||||
else:
|
||||
image_data, _ = process_vision_info(messages,
|
||||
return_video_sample_fps=False)
|
||||
return_video_kwargs=False)
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
|
||||
94
examples/online_serving/multi-node-serving.sh
Normal file
94
examples/online_serving/multi-node-serving.sh
Normal file
@ -0,0 +1,94 @@
|
||||
#!/bin/bash
|
||||
|
||||
subcommand=$1
|
||||
shift
|
||||
|
||||
ray_port=6379
|
||||
ray_init_timeout=300
|
||||
declare -a start_params
|
||||
|
||||
case "$subcommand" in
|
||||
worker)
|
||||
ray_address=""
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
--ray_address=*)
|
||||
ray_address="${1#*=}"
|
||||
;;
|
||||
--ray_port=*)
|
||||
ray_port="${1#*=}"
|
||||
;;
|
||||
--ray_init_timeout=*)
|
||||
ray_init_timeout="${1#*=}"
|
||||
;;
|
||||
*)
|
||||
start_params+=("$1")
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
if [ -z "$ray_address" ]; then
|
||||
echo "Error: Missing argument --ray_address"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
for (( i=0; i < $ray_init_timeout; i+=5 )); do
|
||||
ray start --address=$ray_address:$ray_port --block "${start_params[@]}"
|
||||
if [ $? -eq 0 ]; then
|
||||
echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
|
||||
exit 0
|
||||
fi
|
||||
echo "Waiting until the ray worker is active..."
|
||||
sleep 5s;
|
||||
done
|
||||
echo "Ray worker starts timeout, head address: $ray_address:$ray_port"
|
||||
exit 1
|
||||
;;
|
||||
|
||||
leader)
|
||||
ray_cluster_size=""
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
--ray_port=*)
|
||||
ray_port="${1#*=}"
|
||||
;;
|
||||
--ray_cluster_size=*)
|
||||
ray_cluster_size="${1#*=}"
|
||||
;;
|
||||
--ray_init_timeout=*)
|
||||
ray_init_timeout="${1#*=}"
|
||||
;;
|
||||
*)
|
||||
start_params+=("$1")
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
if [ -z "$ray_cluster_size" ]; then
|
||||
echo "Error: Missing argument --ray_cluster_size"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# start the ray daemon
|
||||
ray start --head --port=$ray_port "${start_params[@]}"
|
||||
|
||||
# wait until all workers are active
|
||||
for (( i=0; i < $ray_init_timeout; i+=5 )); do
|
||||
active_nodes=`python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))'`
|
||||
if [ $active_nodes -eq $ray_cluster_size ]; then
|
||||
echo "All ray workers are active and the ray cluster is initialized successfully."
|
||||
exit 0
|
||||
fi
|
||||
echo "Wait for all ray workers to be active. $active_nodes/$ray_cluster_size is active"
|
||||
sleep 5s;
|
||||
done
|
||||
|
||||
echo "Waiting for all ray workers to be active timed out."
|
||||
exit 1
|
||||
;;
|
||||
|
||||
*)
|
||||
echo "unknown subcommand: $subcommand"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
from enum import Enum
|
||||
|
||||
from openai import OpenAI
|
||||
from openai import BadRequestError, OpenAI
|
||||
from pydantic import BaseModel
|
||||
|
||||
client = OpenAI(
|
||||
@ -94,3 +94,26 @@ completion = client.chat.completions.create(
|
||||
extra_body={"guided_grammar": simplified_sql_grammar},
|
||||
)
|
||||
print(completion.choices[0].message.content)
|
||||
|
||||
# Extra backend options
|
||||
prompt = ("Generate an email address for Alan Turing, who works in Enigma."
|
||||
"End in .com and new line. Example result:"
|
||||
"alan.turing@enigma.com\n")
|
||||
|
||||
try:
|
||||
# The no-fallback option forces vLLM to use xgrammar, so when it fails
|
||||
# you get a 400 with the reason why
|
||||
completion = client.chat.completions.create(
|
||||
model="Qwen/Qwen2.5-3B-Instruct",
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={
|
||||
"guided_regex": "\w+@\w+\.com\n",
|
||||
"stop": ["\n"],
|
||||
"guided_decoding_backend": "xgrammar:no-fallback"
|
||||
},
|
||||
)
|
||||
except BadRequestError as e:
|
||||
print("This error is expected:", e)
|
||||
|
||||
@ -102,7 +102,7 @@ if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(
|
||||
"Script to call a specified VLM through the API. Make sure to serve "
|
||||
"the model with --task embed before running this.")
|
||||
parser.add_argument("model",
|
||||
parser.add_argument("--model",
|
||||
type=str,
|
||||
choices=["vlm2vec", "dse_qwen2_vl"],
|
||||
required=True,
|
||||
|
||||
@ -1260,7 +1260,7 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "edx8memhpd9tsa"
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
@ -1360,7 +1360,7 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "edx8memhpd9tsa"
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
@ -1473,7 +1473,7 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "edx8memhpd9tsa"
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
@ -1523,7 +1523,7 @@
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "edx8memhpd9tsa"
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"definition": "label_values(model_name)",
|
||||
"hide": 0,
|
||||
|
||||
@ -49,7 +49,8 @@ disabled, an error will occur while starting vLLM.
|
||||
### Example 1: Customize vLLM root logger
|
||||
|
||||
For this example, we will customize the vLLM root logger to use
|
||||
[`python-json-logger`](https://github.com/madzak/python-json-logger) to log to
|
||||
[`python-json-logger`](https://github.com/nhairs/python-json-logger)
|
||||
(which is part of the container image) to log to
|
||||
STDOUT of the console in JSON format with a log level of `INFO`.
|
||||
|
||||
To begin, first, create an appropriate JSON logging configuration file:
|
||||
@ -82,12 +83,6 @@ To begin, first, create an appropriate JSON logging configuration file:
|
||||
}
|
||||
```
|
||||
|
||||
Next, install the `python-json-logger` package if it's not already installed:
|
||||
|
||||
```bash
|
||||
pip install python-json-logger
|
||||
```
|
||||
|
||||
Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
|
||||
to the path of the custom logging configuration JSON file:
|
||||
|
||||
|
||||
@ -27,7 +27,7 @@ https://github.com/coreweave/tensorizer
|
||||
To serialize a model, install vLLM from source, then run something
|
||||
like this from the root level of this repository:
|
||||
|
||||
python -m examples.offline_inference.tensorize_vllm_model \
|
||||
python -m examples.other.tensorize_vllm_model \
|
||||
--model facebook/opt-125m \
|
||||
serialize \
|
||||
--serialized-directory s3://my-bucket \
|
||||
@ -47,7 +47,7 @@ providing a `--keyfile` argument.
|
||||
To deserialize a model, you can run something like this from the root
|
||||
level of this repository:
|
||||
|
||||
python -m examples.offline_inference.tensorize_vllm_model \
|
||||
python -m examples.other.tensorize_vllm_model \
|
||||
--model EleutherAI/gpt-j-6B \
|
||||
--dtype float16 \
|
||||
deserialize \
|
||||
@ -65,11 +65,11 @@ shard's rank. Sharded models serialized with this script will be named as
|
||||
model-rank-%03d.tensors
|
||||
|
||||
For more information on the available arguments for serializing, run
|
||||
`python -m examples.offline_inference.tensorize_vllm_model serialize --help`.
|
||||
`python -m examples.other.tensorize_vllm_model serialize --help`.
|
||||
|
||||
Or for deserializing:
|
||||
|
||||
`python -m examples.offline_inference.tensorize_vllm_model deserialize --help`.
|
||||
`python -m examples.other.tensorize_vllm_model deserialize --help`.
|
||||
|
||||
Once a model is serialized, tensorizer can be invoked with the `LLM` class
|
||||
directly to load models:
|
||||
@ -90,7 +90,7 @@ TensorizerConfig arguments desired.
|
||||
In order to see all of the available arguments usable to configure
|
||||
loading with tensorizer that are given to `TensorizerConfig`, run:
|
||||
|
||||
`python -m examples.offline_inference.tensorize_vllm_model deserialize --help`
|
||||
`python -m examples.other.tensorize_vllm_model deserialize --help`
|
||||
|
||||
under the `tensorizer options` section. These can also be used for
|
||||
deserialization in this example script, although `--tensorizer-uri` and
|
||||
|
||||
@ -9,8 +9,7 @@ py-cpuinfo
|
||||
transformers >= 4.48.2 # Required for Bamba model and Transformers backend.
|
||||
tokenizers >= 0.19.1 # Required for Llama 3.
|
||||
protobuf # Required by LlamaTokenizer.
|
||||
fastapi[standard] >= 0.107.0, < 0.113.0; python_version < '3.9'
|
||||
fastapi[standard] >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
|
||||
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
|
||||
aiohttp
|
||||
openai >= 1.52.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
|
||||
pydantic >= 2.9
|
||||
@ -18,9 +17,9 @@ prometheus_client >= 0.18.0
|
||||
pillow # Required for image processing
|
||||
prometheus-fastapi-instrumentator >= 7.0.0
|
||||
tiktoken >= 0.6.0 # Required for DBRX tokenizer
|
||||
lm-format-enforcer >= 0.10.9, < 0.11
|
||||
lm-format-enforcer >= 0.10.11, < 0.11
|
||||
outlines == 0.1.11
|
||||
lark == 1.2.2
|
||||
lark == 1.2.2
|
||||
xgrammar == 0.1.11; platform_machine == "x86_64"
|
||||
typing_extensions >= 4.10
|
||||
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
||||
@ -34,6 +33,8 @@ pyyaml
|
||||
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
|
||||
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
|
||||
einops # Required for Qwen2-VL.
|
||||
compressed-tensors == 0.9.1 # required for compressed-tensors
|
||||
compressed-tensors == 0.9.2 # required for compressed-tensors
|
||||
depyf==0.18.0 # required for profiling and debugging with compilation config
|
||||
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
|
||||
watchfiles # required for http server to monitor the updates of TLS files
|
||||
python-json-logger # Used by logging as per examples/other/logging_configuration.md
|
||||
|
||||
@ -1,9 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.12
|
||||
# by the following command:
|
||||
#
|
||||
# python3.12 -m piptools compile requirements-test.in -o requirements-test.txt
|
||||
#
|
||||
# This file was autogenerated by uv via the following command:
|
||||
# uv pip compile requirements-test.in -o requirements-test.txt
|
||||
absl-py==2.1.0
|
||||
# via rouge-score
|
||||
accelerate==1.0.1
|
||||
@ -141,7 +137,7 @@ frozenlist==1.5.0
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
# ray
|
||||
fsspec[http]==2024.9.0
|
||||
fsspec==2024.9.0
|
||||
# via
|
||||
# datasets
|
||||
# evaluate
|
||||
@ -221,7 +217,7 @@ librosa==0.10.2.post1
|
||||
# via -r requirements-test.in
|
||||
llvmlite==0.43.0
|
||||
# via numba
|
||||
lm-eval[api]==0.4.4
|
||||
lm-eval==0.4.4
|
||||
# via -r requirements-test.in
|
||||
lxml==5.3.0
|
||||
# via sacrebleu
|
||||
@ -238,10 +234,8 @@ mbstrdecoder==1.1.3
|
||||
# typepy
|
||||
mdurl==0.1.2
|
||||
# via markdown-it-py
|
||||
mistral-common[opencv]==1.5.1
|
||||
# via
|
||||
# -r requirements-test.in
|
||||
# mistral-common
|
||||
mistral-common==1.5.1
|
||||
# via -r requirements-test.in
|
||||
more-itertools==10.5.0
|
||||
# via lm-eval
|
||||
mpmath==1.3.0
|
||||
@ -418,7 +412,7 @@ pybind11==2.13.6
|
||||
# via lm-eval
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pydantic[email]==2.9.2
|
||||
pydantic==2.9.2
|
||||
# via
|
||||
# datamodel-code-generator
|
||||
# mistral-common
|
||||
@ -478,7 +472,7 @@ pyyaml==6.0.2
|
||||
# vocos
|
||||
rapidfuzz==3.12.1
|
||||
# via jiwer
|
||||
ray[adag]==2.40.0
|
||||
ray==2.40.0
|
||||
# via -r requirements-test.in
|
||||
redis==5.2.0
|
||||
# via tensorizer
|
||||
@ -549,6 +543,10 @@ sentence-transformers==3.2.1
|
||||
# via -r requirements-test.in
|
||||
sentencepiece==0.2.0
|
||||
# via mistral-common
|
||||
setuptools==75.8.0
|
||||
# via
|
||||
# pytablewriter
|
||||
# torch
|
||||
six==1.16.0
|
||||
# via
|
||||
# python-dateutil
|
||||
@ -646,7 +644,7 @@ tritonclient==2.51.0
|
||||
# via
|
||||
# -r requirements-test.in
|
||||
# genai-perf
|
||||
typepy[datetime]==1.3.2
|
||||
typepy==1.3.2
|
||||
# via
|
||||
# dataproperty
|
||||
# pytablewriter
|
||||
@ -683,6 +681,3 @@ yarl==1.17.1
|
||||
# via aiohttp
|
||||
zstandard==0.23.0
|
||||
# via lm-eval
|
||||
|
||||
# The following packages are considered to be unsafe in a requirements file:
|
||||
# setuptools
|
||||
|
||||
@ -17,7 +17,9 @@ ray[default]
|
||||
--find-links https://storage.googleapis.com/libtpu-releases/index.html
|
||||
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
|
||||
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
||||
torch==2.6.0.dev20241216+cpu
|
||||
torch @ https://download.pytorch.org/whl/nightly/cpu/torch-2.6.0.dev20241216%2Bcpu-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch @ https://download.pytorch.org/whl/nightly/cpu/torch-2.6.0.dev20241216%2Bcpu-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch @ https://download.pytorch.org/whl/nightly/cpu/torch-2.6.0.dev20241216%2Bcpu-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
|
||||
@ -6,6 +6,7 @@ cmake>=3.26
|
||||
ninja
|
||||
packaging
|
||||
setuptools-scm>=8
|
||||
setuptools>=75.8.0
|
||||
wheel
|
||||
jinja2
|
||||
|
||||
|
||||
13
setup.py
13
setup.py
@ -54,7 +54,7 @@ elif (sys.platform.startswith("linux") and torch.version.cuda is None
|
||||
# fallback to cpu
|
||||
VLLM_TARGET_DEVICE = "cpu"
|
||||
|
||||
MAIN_CUDA_VERSION = "12.1"
|
||||
MAIN_CUDA_VERSION = "12.4"
|
||||
|
||||
|
||||
def is_sccache_available() -> bool:
|
||||
@ -328,6 +328,7 @@ class repackage_wheel(build_ext):
|
||||
files_to_copy = [
|
||||
"vllm/_C.abi3.so",
|
||||
"vllm/_moe_C.abi3.so",
|
||||
"vllm/_flashmla_C.abi3.so",
|
||||
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
|
||||
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
|
||||
"vllm/vllm_flash_attn/flash_attn_interface.py",
|
||||
@ -570,9 +571,8 @@ def get_requirements() -> List[str]:
|
||||
cuda_major, cuda_minor = torch.version.cuda.split(".")
|
||||
modified_requirements = []
|
||||
for req in requirements:
|
||||
if ("vllm-flash-attn" in req
|
||||
and not (cuda_major == "12" and cuda_minor == "1")):
|
||||
# vllm-flash-attn is built only for CUDA 12.1.
|
||||
if ("vllm-flash-attn" in req and cuda_major != "12"):
|
||||
# vllm-flash-attn is built only for CUDA 12.x.
|
||||
# Skip for other versions.
|
||||
continue
|
||||
modified_requirements.append(req)
|
||||
@ -612,6 +612,11 @@ if _is_cuda():
|
||||
# FA3 requires CUDA 12.0 or later
|
||||
ext_modules.append(
|
||||
CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
|
||||
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
|
||||
# Optional since this doesn't get built (produce an .so file) when
|
||||
# not targeting a hopper system
|
||||
ext_modules.append(
|
||||
CMakeExtension(name="vllm._flashmla_C", optional=True))
|
||||
ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
|
||||
|
||||
if _build_custom_ops():
|
||||
|
||||
@ -9,7 +9,6 @@ import weakref
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..conftest import VllmRunner
|
||||
@ -34,7 +33,7 @@ def v1(run_with_both_engines):
|
||||
|
||||
def test_vllm_gc_ed():
|
||||
"""Verify vllm instance is GC'ed when it is deleted"""
|
||||
llm = LLM("distilbert/distilgpt2", load_format=LoadFormat.RUNAI_STREAMER)
|
||||
llm = LLM("distilbert/distilgpt2")
|
||||
weak_llm = weakref.ref(llm)
|
||||
del llm
|
||||
# If there's any circular reference to vllm, this fails
|
||||
@ -43,10 +42,10 @@ def test_vllm_gc_ed():
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
|
||||
@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [5])
|
||||
@pytest.mark.parametrize("enforce_eager", [False, True])
|
||||
@pytest.mark.parametrize("enforce_eager", [False])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
model: str,
|
||||
@ -97,8 +96,8 @@ def test_models(
|
||||
"test_suite", [
|
||||
("distilbert/distilgpt2", "ray", "", "L4"),
|
||||
("distilbert/distilgpt2", "mp", "", "L4"),
|
||||
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
|
||||
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
|
||||
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
|
||||
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
|
||||
("distilbert/distilgpt2", "ray", "", "A100"),
|
||||
("distilbert/distilgpt2", "mp", "", "A100"),
|
||||
("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
|
||||
@ -118,7 +117,7 @@ def test_models_distributed(
|
||||
pytest.skip(f"Skip test for {test_suite}")
|
||||
|
||||
if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
|
||||
# test ray adag
|
||||
# test Ray Compiled Graph
|
||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
||||
|
||||
|
||||
@ -93,7 +93,7 @@ def test_models_distributed(
|
||||
|
||||
if (model == "meta-llama/Llama-3.2-1B-Instruct"
|
||||
and distributed_executor_backend == "ray"):
|
||||
# test ray adag
|
||||
# test Ray Compiled Graph
|
||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
||||
|
||||
|
||||
@ -4,11 +4,9 @@ import pytest
|
||||
import torch
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.device_allocator.cumem import CuMemAllocator
|
||||
from vllm.utils import GiB_bytes
|
||||
|
||||
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
|
||||
|
||||
@ -121,7 +119,7 @@ def test_cumem_with_cudagraph():
|
||||
"model, use_v1",
|
||||
[
|
||||
# sleep mode with safetensors
|
||||
(f"{MODEL_WEIGHTS_S3_BUCKET}/meta-llama/Llama-3.2-1B", True),
|
||||
("meta-llama/Llama-3.2-1B", True),
|
||||
# sleep mode with pytorch checkpoint
|
||||
("facebook/opt-125m", False),
|
||||
])
|
||||
@ -130,10 +128,7 @@ def test_end_to_end(model: str, use_v1: bool):
|
||||
os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
|
||||
free, total = torch.cuda.mem_get_info()
|
||||
used_bytes_baseline = total - free # in case other process is running
|
||||
load_format = LoadFormat.AUTO
|
||||
if "Llama" in model:
|
||||
load_format = LoadFormat.RUNAI_STREAMER
|
||||
llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
|
||||
llm = LLM(model, enable_sleep_mode=True)
|
||||
prompt = "How are you?"
|
||||
sampling_params = SamplingParams(temperature=0, max_tokens=10)
|
||||
output = llm.generate(prompt, sampling_params)
|
||||
|
||||
@ -24,7 +24,7 @@ from tests.models.utils import (TokensTextLogprobs,
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig
|
||||
from vllm.config import TaskOption, TokenizerPoolConfig
|
||||
from vllm.connections import global_http_connection
|
||||
from vllm.distributed import (cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
@ -47,70 +47,6 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
|
||||
|
||||
_M = TypeVar("_M")
|
||||
|
||||
MODELS_ON_S3 = [
|
||||
"distilbert/distilgpt2",
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
"meta-llama/Meta-Llama-3-8B",
|
||||
"meta-llama/Llama-3.2-1B",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"openai-community/gpt2",
|
||||
"ArthurZ/Ilama-3.2-1B",
|
||||
"llava-hf/llava-1.5-7b-hf",
|
||||
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
"ai21labs/Jamba-tiny-random",
|
||||
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
|
||||
"nm-testing/Phi-3-mini-128k-instruct-FP8",
|
||||
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
|
||||
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
|
||||
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
|
||||
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
|
||||
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
|
||||
"AMead10/Llama-3.2-1B-Instruct-AWQ",
|
||||
"shuyuej/Llama-3.2-1B-Instruct-GPTQ",
|
||||
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
|
||||
"ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
|
||||
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
|
||||
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
|
||||
"amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
|
||||
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
|
||||
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
|
||||
"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
|
||||
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
|
||||
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
|
||||
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
|
||||
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
|
||||
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
|
||||
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
|
||||
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
|
||||
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
|
||||
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
|
||||
"nm-testing/tinyllama-oneshot-w4a16-channel-v2",
|
||||
"nm-testing/tinyllama-oneshot-w4a16-group128-v2",
|
||||
"nm-testing/tinyllama-oneshot-w8a16-per-channel",
|
||||
"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
|
||||
"nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
|
||||
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
|
||||
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
|
||||
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
|
||||
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
|
||||
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
|
||||
"nm-testing/llama2.c-stories42M-pruned2.4-compressed",
|
||||
]
|
||||
|
||||
MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
|
||||
|
||||
_PromptMultiModalInput = Union[List[_M], List[List[_M]]]
|
||||
|
||||
PromptImageInput = _PromptMultiModalInput[Image.Image]
|
||||
@ -664,8 +600,8 @@ class HfRunner:
|
||||
if images is not None and images[i] is not None:
|
||||
processor_kwargs["images"] = images[i]
|
||||
|
||||
encoder_input_ids = self.wrap_device(
|
||||
self.processor(**processor_kwargs).input_ids,
|
||||
encoder_inputs = self.wrap_device(
|
||||
self.processor(**processor_kwargs),
|
||||
device=self.model.device.type,
|
||||
)
|
||||
|
||||
@ -679,13 +615,13 @@ class HfRunner:
|
||||
)
|
||||
|
||||
output = self.model.generate(
|
||||
encoder_input_ids,
|
||||
decoder_input_ids=decoder_input_ids,
|
||||
use_cache=True,
|
||||
do_sample=False,
|
||||
max_new_tokens=max_tokens,
|
||||
output_hidden_states=True,
|
||||
return_dict_in_generate=True,
|
||||
**encoder_inputs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@ -742,14 +678,8 @@ class VllmRunner:
|
||||
enable_chunked_prefill: bool = False,
|
||||
swap_space: int = 4,
|
||||
enforce_eager: Optional[bool] = False,
|
||||
load_format: Optional[LoadFormat] = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
if model_name in MODELS_ON_S3 and not load_format:
|
||||
model_name = (f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name}")
|
||||
load_format = LoadFormat.RUNAI_STREAMER
|
||||
if not load_format:
|
||||
load_format = LoadFormat.AUTO
|
||||
self.model = LLM(
|
||||
model=model_name,
|
||||
task=task,
|
||||
@ -764,7 +694,6 @@ class VllmRunner:
|
||||
max_model_len=max_model_len,
|
||||
block_size=block_size,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
load_format=load_format,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@ -7,6 +7,7 @@ import pytest
|
||||
|
||||
from tests.kernels.utils import override_backend_env_variable
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from .conftest import get_text_from_llm_generator
|
||||
|
||||
@ -42,6 +43,11 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
|
||||
|
||||
Additionally, we compare the results of the v1 and v2 managers.
|
||||
"""
|
||||
if backend == "FLASHINFER" and current_platform.is_rocm():
|
||||
pytest.skip("Flashinfer does not support ROCm/HIP.")
|
||||
if backend == "XFORMERS" and current_platform.is_rocm():
|
||||
pytest.skip("Xformers does not support ROCm/HIP.")
|
||||
|
||||
override_backend_env_variable(monkeypatch, backend)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
@ -101,6 +107,10 @@ def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
|
||||
The results with and without chunked prefill are not the same due to
|
||||
numerical instabilities.
|
||||
"""
|
||||
if backend == "FLASHINFER" and current_platform.is_rocm():
|
||||
pytest.skip("Flashinfer does not support ROCm/HIP.")
|
||||
if backend == "XFORMERS" and current_platform.is_rocm():
|
||||
pytest.skip("Xformers does not support ROCm/HIP.")
|
||||
override_backend_env_variable(monkeypatch, backend)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
|
||||
@ -491,7 +491,7 @@ def test_prefill_schedule_max_lora():
|
||||
lora_path="abc"))
|
||||
scheduler.add_seq_group(seq_group)
|
||||
# Add two more requests to verify lora is prioritized.
|
||||
# 0: Lora, 1: Lora, 2: regular, 3: regular
|
||||
# 0: LoRA, 1: LoRA, 2: regular, 3: regular
|
||||
# In the first iteration, index 0, 2 is scheduled.
|
||||
# If a request is not scheduled because it hits max lora, it is
|
||||
# prioritized. Verify that.
|
||||
|
||||
227
tests/distributed/test_expert_parallel.py
Normal file
227
tests/distributed/test_expert_parallel.py
Normal file
@ -0,0 +1,227 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, NamedTuple, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import TaskOption
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from ..utils import compare_two_settings, fork_new_process_for_each_test
|
||||
|
||||
logger = init_logger("test_expert_parallel")
|
||||
|
||||
|
||||
class ParallelSetup(NamedTuple):
|
||||
tp_size: int
|
||||
eager_mode: bool
|
||||
chunked_prefill: bool
|
||||
|
||||
|
||||
class EPTestOptions(NamedTuple):
|
||||
trust_remote_code: bool
|
||||
tokenizer_mode: Optional[str]
|
||||
load_format: Optional[str] = None
|
||||
hf_overrides: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class EPTestSettings:
|
||||
parallel_setups: List[ParallelSetup]
|
||||
distributed_backends: List[str]
|
||||
task: TaskOption
|
||||
test_options: EPTestOptions
|
||||
|
||||
@staticmethod
|
||||
def detailed(
|
||||
*,
|
||||
tp_base: int = 2,
|
||||
task: TaskOption = "auto",
|
||||
trust_remote_code: bool = False,
|
||||
tokenizer_mode: Optional[str] = None,
|
||||
load_format: Optional[str] = None,
|
||||
hf_overrides: Optional[str] = None,
|
||||
):
|
||||
return EPTestSettings(
|
||||
parallel_setups=[
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
eager_mode=False,
|
||||
chunked_prefill=False),
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
eager_mode=False,
|
||||
chunked_prefill=True),
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
eager_mode=True,
|
||||
chunked_prefill=False),
|
||||
ParallelSetup(tp_size=2 * tp_base,
|
||||
eager_mode=False,
|
||||
chunked_prefill=True),
|
||||
ParallelSetup(tp_size=2 * tp_base,
|
||||
eager_mode=True,
|
||||
chunked_prefill=False),
|
||||
],
|
||||
distributed_backends=["mp", "ray"],
|
||||
task=task,
|
||||
test_options=EPTestOptions(trust_remote_code=trust_remote_code,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
load_format=load_format,
|
||||
hf_overrides=hf_overrides),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def fast(
|
||||
*,
|
||||
tp_base: int = 2,
|
||||
task: TaskOption = "auto",
|
||||
trust_remote_code: bool = False,
|
||||
tokenizer_mode: Optional[str] = None,
|
||||
load_format: Optional[str] = None,
|
||||
hf_overrides: Optional[str] = None,
|
||||
):
|
||||
return EPTestSettings(
|
||||
parallel_setups=[
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
eager_mode=True,
|
||||
chunked_prefill=False),
|
||||
],
|
||||
distributed_backends=["mp"],
|
||||
task=task,
|
||||
test_options=EPTestOptions(trust_remote_code=trust_remote_code,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
load_format=load_format,
|
||||
hf_overrides=hf_overrides),
|
||||
)
|
||||
|
||||
def iter_params(self, model_name: str):
|
||||
opts = self.test_options
|
||||
|
||||
for parallel_setup in self.parallel_setups:
|
||||
for distributed_backend in self.distributed_backends:
|
||||
yield (model_name, parallel_setup, distributed_backend,
|
||||
self.task, opts)
|
||||
|
||||
|
||||
# NOTE: You can adjust tp_base locally to fit the model in GPU
|
||||
# The values displayed here are only a rough indicator of the size of the model
|
||||
|
||||
# yapf: disable
|
||||
TEST_MODELS = {
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": EPTestSettings.fast(
|
||||
trust_remote_code=True),
|
||||
"mistralai/Mixtral-8x7B-Instruct-v0.1": EPTestSettings.fast(tp_base=4),
|
||||
}
|
||||
|
||||
|
||||
def _compare_tp(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
task: TaskOption,
|
||||
test_options: EPTestOptions,
|
||||
num_gpus_available: int,
|
||||
*,
|
||||
method: Literal["generate"],
|
||||
):
|
||||
(
|
||||
tp_size,
|
||||
eager_mode,
|
||||
chunked_prefill,
|
||||
) = parallel_setup
|
||||
(
|
||||
trust_remote_code,
|
||||
tokenizer_mode,
|
||||
load_format,
|
||||
hf_overrides,
|
||||
) = test_options
|
||||
|
||||
if num_gpus_available < tp_size:
|
||||
pytest.skip(f"Need at least {tp_size} GPUs")
|
||||
|
||||
common_args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"float16",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--max-num-seqs",
|
||||
"8",
|
||||
"--load-format",
|
||||
"auto",
|
||||
]
|
||||
if chunked_prefill:
|
||||
common_args.append("--enable-chunked-prefill")
|
||||
if eager_mode:
|
||||
common_args.append("--enforce-eager")
|
||||
if task != "auto":
|
||||
common_args.extend(["--task", task])
|
||||
if trust_remote_code:
|
||||
common_args.append("--trust-remote-code")
|
||||
if tokenizer_mode:
|
||||
common_args.extend(["--tokenizer-mode", tokenizer_mode])
|
||||
if load_format:
|
||||
common_args.extend(["--load-format", load_format])
|
||||
if hf_overrides:
|
||||
common_args.extend(["--hf-overrides", hf_overrides])
|
||||
|
||||
ep_env = {
|
||||
"VLLM_TEST_ENABLE_EP": "1",
|
||||
}
|
||||
|
||||
ep_args = [
|
||||
*common_args,
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"--distributed-executor-backend",
|
||||
distributed_backend,
|
||||
]
|
||||
|
||||
# compare without expert parallelism
|
||||
tp_env = {
|
||||
"VLLM_TEST_ENABLE_EP": "0",
|
||||
}
|
||||
|
||||
tp_args = [
|
||||
*common_args,
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"--distributed-executor-backend",
|
||||
"mp",
|
||||
]
|
||||
|
||||
try:
|
||||
compare_two_settings(model_name,
|
||||
ep_args,
|
||||
tp_args,
|
||||
ep_env,
|
||||
tp_env,
|
||||
method=method,
|
||||
max_wait_seconds=360)
|
||||
except Exception:
|
||||
raise
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_name", "parallel_setup", "distributed_backend", "task",
|
||||
"test_options"),
|
||||
[
|
||||
params for model_name, settings in TEST_MODELS.items()
|
||||
for params in settings.iter_params(model_name)
|
||||
],
|
||||
)
|
||||
@fork_new_process_for_each_test
|
||||
def test_ep(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
task: TaskOption,
|
||||
test_options: EPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_tp(model_name,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
task,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate")
|
||||
@ -324,8 +324,8 @@ def _compare_tp(
|
||||
specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
|
||||
if distributed_backend == "ray" and (vllm_major_version == "1"
|
||||
or specific_case):
|
||||
# For V1, test Ray ADAG for all the tests
|
||||
# For V0, test Ray ADAG for a subset of the tests
|
||||
# For V1, test Ray Compiled Graph for all the tests
|
||||
# For V0, test Ray Compiled Graph for a subset of the tests
|
||||
pp_env = {
|
||||
"VLLM_USE_V1": vllm_major_version,
|
||||
"VLLM_USE_RAY_COMPILED_DAG": "1",
|
||||
@ -333,7 +333,7 @@ def _compare_tp(
|
||||
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
|
||||
}
|
||||
# Temporary. Currently when zeromq + SPMD is used, it does not properly
|
||||
# terminate because of aDAG issue.
|
||||
# terminate because of a Ray Compiled Graph issue.
|
||||
common_args.append("--disable-frontend-multiprocessing")
|
||||
else:
|
||||
pp_env = None
|
||||
@ -367,8 +367,9 @@ def _compare_tp(
|
||||
if pp_env is None:
|
||||
raise
|
||||
else:
|
||||
# Ray ADAG tests are flaky, so we don't want to fail the test
|
||||
logger.exception("Ray ADAG tests failed")
|
||||
# Ray Compiled Graph tests are flaky,
|
||||
# so we don't want to fail the test
|
||||
logger.exception("Ray Compiled Graph tests failed")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@ -34,3 +34,27 @@ def test_custom_layer_partition():
|
||||
# Wrong number of layers
|
||||
with pytest.raises(ValueError):
|
||||
_verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"num_hidden_layers,pp_size,pp_rank,indices",
|
||||
[
|
||||
# pp_size 2
|
||||
(2, 2, 0, (0, 1)),
|
||||
(2, 2, 1, (1, 2)),
|
||||
(3, 2, 0, (0, 2)),
|
||||
(3, 2, 1, (2, 3)),
|
||||
# pp_size 3
|
||||
(3, 3, 0, (0, 1)),
|
||||
(3, 3, 1, (1, 2)),
|
||||
(3, 3, 2, (2, 3)),
|
||||
(4, 3, 0, (0, 1)),
|
||||
(4, 3, 1, (1, 3)),
|
||||
(4, 3, 2, (3, 4)),
|
||||
(5, 3, 0, (0, 2)),
|
||||
(5, 3, 1, (2, 4)),
|
||||
(5, 3, 2, (4, 5)),
|
||||
])
|
||||
def test_uneven_auto_partition(num_hidden_layers: int, pp_size: int,
|
||||
pp_rank: int, indices: tuple[int, int]):
|
||||
assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size)
|
||||
|
||||
@ -48,6 +48,12 @@ test_consistent_across_ranks(
|
||||
test_consistent_across_ranks(
|
||||
llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
|
||||
|
||||
# make sure we can access the model parameters from the calling process
|
||||
# of the `LLM` instance.
|
||||
params = list(llm.llm_engine.model_executor.driver_worker.worker.model_runner.
|
||||
model.parameters())
|
||||
test_consistent_across_ranks(len(params))
|
||||
|
||||
# all ranks should have the same outputs
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
|
||||
@ -2,16 +2,12 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model",
|
||||
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
|
||||
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
def test_computed_prefix_blocks(model: str, block_size: int):
|
||||
# This test checks if we are able to run the engine to completion
|
||||
@ -28,7 +24,6 @@ def test_computed_prefix_blocks(model: str, block_size: int):
|
||||
"decoration.")
|
||||
|
||||
engine_args = EngineArgs(model=model,
|
||||
load_format=LoadFormat.RUNAI_STREAMER,
|
||||
block_size=block_size,
|
||||
enable_prefix_caching=True)
|
||||
|
||||
|
||||
@ -2,15 +2,11 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.entrypoints.llm import LLM
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model",
|
||||
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
|
||||
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
|
||||
def test_computed_prefix_blocks(model: str):
|
||||
# This test checks if the engine generates completions both with and
|
||||
# without optional detokenization, that detokenization includes text
|
||||
@ -21,7 +17,7 @@ def test_computed_prefix_blocks(model: str):
|
||||
"paper clips? Is there an easy to follow video tutorial available "
|
||||
"online for free?")
|
||||
|
||||
llm = LLM(model=model, load_format=LoadFormat.RUNAI_STREAMER)
|
||||
llm = LLM(model=model)
|
||||
sampling_params = SamplingParams(max_tokens=10,
|
||||
temperature=0.0,
|
||||
detokenize=False)
|
||||
|
||||
@ -6,17 +6,12 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.executor.uniproc_executor import UniProcExecutor
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
|
||||
|
||||
RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
|
||||
|
||||
|
||||
class Mock:
|
||||
...
|
||||
@ -38,12 +33,10 @@ class CustomUniExecutor(UniProcExecutor):
|
||||
CustomUniExecutorAsync = CustomUniExecutor
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model",
|
||||
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
|
||||
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
|
||||
def test_custom_executor_type_checking(model):
|
||||
with pytest.raises(ValueError):
|
||||
engine_args = EngineArgs(model=model,
|
||||
load_format=RUNAI_STREAMER_LOAD_FORMAT,
|
||||
distributed_executor_backend=Mock)
|
||||
LLMEngine.from_engine_args(engine_args)
|
||||
with pytest.raises(ValueError):
|
||||
@ -52,8 +45,7 @@ def test_custom_executor_type_checking(model):
|
||||
AsyncLLMEngine.from_engine_args(engine_args)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model",
|
||||
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
|
||||
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
|
||||
def test_custom_executor(model, tmp_path):
|
||||
cwd = os.path.abspath(".")
|
||||
os.chdir(tmp_path)
|
||||
@ -62,7 +54,6 @@ def test_custom_executor(model, tmp_path):
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model,
|
||||
load_format=RUNAI_STREAMER_LOAD_FORMAT,
|
||||
distributed_executor_backend=CustomUniExecutor,
|
||||
enforce_eager=True, # reduce test time
|
||||
)
|
||||
@ -77,8 +68,7 @@ def test_custom_executor(model, tmp_path):
|
||||
os.chdir(cwd)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model",
|
||||
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
|
||||
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
|
||||
def test_custom_executor_async(model, tmp_path):
|
||||
cwd = os.path.abspath(".")
|
||||
os.chdir(tmp_path)
|
||||
@ -87,7 +77,6 @@ def test_custom_executor_async(model, tmp_path):
|
||||
|
||||
engine_args = AsyncEngineArgs(
|
||||
model=model,
|
||||
load_format=RUNAI_STREAMER_LOAD_FORMAT,
|
||||
distributed_executor_backend=CustomUniExecutorAsync,
|
||||
enforce_eager=True, # reduce test time
|
||||
)
|
||||
@ -106,8 +95,7 @@ def test_custom_executor_async(model, tmp_path):
|
||||
os.chdir(cwd)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model",
|
||||
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
|
||||
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
|
||||
def test_respect_ray(model):
|
||||
# even for TP=1 and PP=1,
|
||||
# if users specify ray, we should use ray.
|
||||
@ -116,7 +104,6 @@ def test_respect_ray(model):
|
||||
engine_args = EngineArgs(
|
||||
model=model,
|
||||
distributed_executor_backend="ray",
|
||||
load_format=RUNAI_STREAMER_LOAD_FORMAT,
|
||||
enforce_eager=True, # reduce test time
|
||||
)
|
||||
engine = LLMEngine.from_engine_args(engine_args)
|
||||
|
||||
@ -2,22 +2,19 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.entrypoints.llm import LLM
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model",
|
||||
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
|
||||
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
|
||||
def test_skip_tokenizer_initialization(model: str):
|
||||
# This test checks if the flag skip_tokenizer_init skips the initialization
|
||||
# of tokenizer and detokenizer. The generated output is expected to contain
|
||||
# token ids.
|
||||
llm = LLM(model=model,
|
||||
skip_tokenizer_init=True,
|
||||
load_format=LoadFormat.RUNAI_STREAMER)
|
||||
llm = LLM(
|
||||
model=model,
|
||||
skip_tokenizer_init=True,
|
||||
)
|
||||
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
|
||||
|
||||
with pytest.raises(ValueError, match="cannot pass text prompts when"):
|
||||
|
||||
@ -5,17 +5,12 @@ from typing import List
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.config import LoadFormat
|
||||
|
||||
from ...conftest import MODEL_WEIGHTS_S3_BUCKET
|
||||
from ..openai.test_vision import TEST_IMAGE_URLS
|
||||
|
||||
RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
|
||||
|
||||
|
||||
def test_chat():
|
||||
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
|
||||
load_format=RUNAI_STREAMER_LOAD_FORMAT)
|
||||
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
|
||||
|
||||
prompt1 = "Explain the concept of entropy."
|
||||
messages = [
|
||||
@ -33,8 +28,7 @@ def test_chat():
|
||||
|
||||
|
||||
def test_multi_chat():
|
||||
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
|
||||
load_format=RUNAI_STREAMER_LOAD_FORMAT)
|
||||
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
|
||||
|
||||
prompt1 = "Explain the concept of entropy."
|
||||
prompt2 = "Explain what among us is."
|
||||
@ -71,8 +65,7 @@ def test_multi_chat():
|
||||
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
|
||||
def test_chat_multi_image(image_urls: List[str]):
|
||||
llm = LLM(
|
||||
model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct",
|
||||
load_format=RUNAI_STREAMER_LOAD_FORMAT,
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
dtype="bfloat16",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
|
||||
@ -28,7 +28,7 @@ def test_collective_rpc(tp_size, backend):
|
||||
def echo_rank(self):
|
||||
return self.rank
|
||||
|
||||
llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct",
|
||||
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
|
||||
enforce_eager=True,
|
||||
load_format="dummy",
|
||||
tensor_parallel_size=tp_size,
|
||||
|
||||
@ -6,10 +6,9 @@ from typing import List
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, PoolingParams, PoolingRequestOutput
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
MODEL_NAME = "s3://vllm-ci-model-weights/e5-mistral-7b-instruct"
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
|
||||
PROMPTS = [
|
||||
"Hello, my name is",
|
||||
@ -33,7 +32,6 @@ def llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(model=MODEL_NAME,
|
||||
load_format=LoadFormat.RUNAI_STREAMER,
|
||||
max_num_batched_tokens=32768,
|
||||
tensor_parallel_size=1,
|
||||
gpu_memory_utilization=0.75,
|
||||
|
||||
@ -6,10 +6,9 @@ from typing import List
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, RequestOutput, SamplingParams
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
MODEL_NAME = "s3://vllm-ci-model-weights/distilgpt2"
|
||||
MODEL_NAME = "distilbert/distilgpt2"
|
||||
|
||||
PROMPTS = [
|
||||
"Hello, my name is",
|
||||
@ -31,7 +30,6 @@ def llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(model=MODEL_NAME,
|
||||
load_format=LoadFormat.RUNAI_STREAMER,
|
||||
max_num_batched_tokens=4096,
|
||||
tensor_parallel_size=1,
|
||||
gpu_memory_utilization=0.10,
|
||||
|
||||
@ -7,11 +7,10 @@ import pytest
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
MODEL_NAME = "s3://vllm-ci-model-weights/zephyr-7b-beta"
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
|
||||
PROMPTS = [
|
||||
"Hello, my name is",
|
||||
@ -28,7 +27,6 @@ def llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(model=MODEL_NAME,
|
||||
load_format=LoadFormat.RUNAI_STREAMER,
|
||||
tensor_parallel_size=1,
|
||||
max_model_len=8192,
|
||||
enable_lora=True,
|
||||
|
||||
@ -6,14 +6,14 @@ import weakref
|
||||
|
||||
import jsonschema
|
||||
import pytest
|
||||
from pydantic import BaseModel
|
||||
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.entrypoints.llm import LLM
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
|
||||
MODEL_NAME = "s3://vllm-ci-model-weights/Qwen2.5-1.5B-Instruct"
|
||||
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
|
||||
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
|
||||
|
||||
|
||||
@ -21,9 +21,7 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
|
||||
def llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(model=MODEL_NAME,
|
||||
load_format=LoadFormat.RUNAI_STREAMER,
|
||||
max_model_len=1024)
|
||||
llm = LLM(model=MODEL_NAME, max_model_len=1024)
|
||||
|
||||
with llm.deprecate_legacy_api():
|
||||
yield weakref.proxy(llm)
|
||||
@ -280,6 +278,22 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm):
|
||||
guided_options_request=dict(guided_regex=sample_regex))
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_disable_guided_decoding_fallback(sample_regex, llm):
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
regex=sample_regex,
|
||||
backend="xgrammar:no-fallback"))
|
||||
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match="xgrammar does not support regex guided decoding"):
|
||||
llm.generate(prompts="This should fail",
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
def test_guided_json_object(llm, guided_decoding_backend: str):
|
||||
@ -309,3 +323,56 @@ def test_guided_json_object(llm, guided_decoding_backend: str):
|
||||
# Parse to verify it is valid JSON
|
||||
parsed_json = json.loads(generated_text)
|
||||
assert isinstance(parsed_json, dict)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_json_with_any_whitespace_disabled(llm):
|
||||
|
||||
class ResponseSchema(BaseModel):
|
||||
clarifying_question: str
|
||||
cost_per_serving: str
|
||||
calories: str
|
||||
type_dish_ids: str
|
||||
type_meal_ids: str
|
||||
product_ids: list[str]
|
||||
exclude_product_ids: list[str]
|
||||
allergen_ids: list[str]
|
||||
total_cooking_time: str
|
||||
kitchen_ids: str
|
||||
holiday_ids: str
|
||||
|
||||
# Note: Without this setting, the response is sometimes full of `\n`
|
||||
# for some models. This option prevents that.
|
||||
guided_decoding_backend = 'xgrammar:disable-any-whitespace'
|
||||
|
||||
schema = ResponseSchema.model_json_schema()
|
||||
guided_params = GuidedDecodingParams(json=schema,
|
||||
backend=\
|
||||
guided_decoding_backend)
|
||||
sampling_params = SamplingParams(max_tokens=2000,
|
||||
frequency_penalty=0,
|
||||
presence_penalty=-1.1,
|
||||
repetition_penalty=1.3,
|
||||
guided_decoding=guided_params)
|
||||
|
||||
prompt = ("<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You"
|
||||
"are a helpful assistant.<|im_end|>\n<|im_start|>user\nI want a "
|
||||
"quick launch fast with $10.<|im_end|>\n<|im_start|>assistant\n")
|
||||
outputs = llm.generate(prompts=prompt,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True)
|
||||
|
||||
assert outputs is not None
|
||||
|
||||
for output in outputs:
|
||||
assert output is not None
|
||||
assert isinstance(output, RequestOutput)
|
||||
|
||||
generated_text = output.outputs[0].text
|
||||
assert generated_text is not None
|
||||
assert "\n" not in generated_text
|
||||
|
||||
# Parse to verify it is valid JSON
|
||||
parsed_json = json.loads(generated_text)
|
||||
assert isinstance(parsed_json, dict)
|
||||
jsonschema.validate(instance=parsed_json, schema=schema)
|
||||
|
||||
@ -6,7 +6,6 @@ from contextlib import nullcontext
|
||||
from vllm_test_utils import BlameResult, blame
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
|
||||
@ -44,8 +43,7 @@ def run_normal():
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM without guided decoding as a baseline.
|
||||
llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
|
||||
load_format=LoadFormat.RUNAI_STREAMER,
|
||||
llm = LLM(model="distilbert/distilgpt2",
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=0.3)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
@ -61,8 +59,7 @@ def run_normal():
|
||||
|
||||
def run_lmfe(sample_regex):
|
||||
# Create an LLM with guided decoding enabled.
|
||||
llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
|
||||
load_format=LoadFormat.RUNAI_STREAMER,
|
||||
llm = LLM(model="distilbert/distilgpt2",
|
||||
enforce_eager=True,
|
||||
guided_decoding_backend="lm-format-enforcer",
|
||||
gpu_memory_utilization=0.3)
|
||||
|
||||
@ -3,7 +3,6 @@
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.config import LoadFormat
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
@ -15,17 +14,13 @@ def v1(run_with_both_engines):
|
||||
|
||||
|
||||
def test_empty_prompt():
|
||||
llm = LLM(model="s3://vllm-ci-model-weights/gpt2",
|
||||
load_format=LoadFormat.RUNAI_STREAMER,
|
||||
enforce_eager=True)
|
||||
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
|
||||
with pytest.raises(ValueError, match='Prompt cannot be empty'):
|
||||
llm.generate([""])
|
||||
|
||||
|
||||
@pytest.mark.skip_v1
|
||||
def test_out_of_vocab_token():
|
||||
llm = LLM(model="s3://vllm-ci-model-weights/gpt2",
|
||||
load_format=LoadFormat.RUNAI_STREAMER,
|
||||
enforce_eager=True)
|
||||
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
|
||||
with pytest.raises(ValueError, match='out of vocabulary'):
|
||||
llm.generate({"prompt_token_ids": [999999]})
|
||||
|
||||
@ -83,7 +83,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.finish_reason == "length"
|
||||
assert chat_completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=10, prompt_tokens=201, total_tokens=211)
|
||||
completion_tokens=10, prompt_tokens=202, total_tokens=212)
|
||||
|
||||
message = choice.message
|
||||
message = chat_completion.choices[0].message
|
||||
@ -140,7 +140,7 @@ async def test_single_chat_session_audio_base64encoded(
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.finish_reason == "length"
|
||||
assert chat_completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=10, prompt_tokens=201, total_tokens=211)
|
||||
completion_tokens=10, prompt_tokens=202, total_tokens=212)
|
||||
|
||||
message = choice.message
|
||||
message = chat_completion.choices[0].message
|
||||
@ -196,7 +196,7 @@ async def test_single_chat_session_input_audio(
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.finish_reason == "length"
|
||||
assert chat_completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=10, prompt_tokens=201, total_tokens=211)
|
||||
completion_tokens=10, prompt_tokens=202, total_tokens=212)
|
||||
|
||||
message = choice.message
|
||||
message = chat_completion.choices[0].message
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user