Compare commits


75 Commits

SHA1 Message Date
c00ddd6834 Add buffer donation to benchmark 2024-04-30 21:58:47 +00:00
881b884046 Add block size 2024-04-27 22:35:28 +00:00
98a3df0f8d Disable memory tracking 2024-04-26 08:56:26 +00:00
3f6288cc89 Fix for binary cache 2024-04-26 08:56:12 +00:00
408ff4950c Tune pages_per_compute_block 2024-04-26 08:55:23 +00:00
278e8a1adc Add tpu 2024-04-26 08:54:52 +00:00
07be6ed3eb Improve benchmark 2024-04-26 08:54:41 +00:00
f6637dba18 Use persistent cache 2024-04-26 07:09:44 +00:00
707a5f6473 Move JAX-smi to worker 2024-04-26 07:05:51 +00:00
57690a9c09 Fix bucketing 2024-04-26 07:05:27 +00:00
b15db234ba Add precompilation step 2024-04-26 05:43:08 +00:00
d1591f0f1f Add op benchmark scripts 2024-04-26 05:35:19 +00:00
85d4488458 yapf 2024-04-26 05:31:31 +00:00
8d072dbfbd yapf 2024-04-26 05:30:25 +00:00
d830766c0c yapf 2024-04-26 05:30:08 +00:00
5ae2f81c2b Add warmup + formatting 2024-04-26 05:28:09 +00:00
4ea41d01a9 yapf 2024-04-26 05:27:38 +00:00
d16a348477 Add comment 2024-04-26 05:27:27 +00:00
aa092834bb Format gemma.py 2024-04-26 05:26:38 +00:00
d2c6a32c0c Fix is_tpu 2024-04-26 05:26:24 +00:00
21f35c2289 Change version 2024-04-26 05:00:26 +00:00
2aa9831dd3 Minor 2024-04-25 23:40:44 +00:00
028f528aad Fix KV cache shape 2024-04-25 23:38:07 +00:00
fa5bacd5b0 Add warmup 2024-04-25 05:06:41 +00:00
b62170e4e3 Fix scheduler 2024-04-25 05:06:22 +00:00
98eda57899 Add timer 2024-04-25 05:06:11 +00:00
81b8b813f1 Pad to avoid recompilation 2024-04-25 04:43:33 +00:00
e2c7dedb3a Minor 2024-04-25 03:28:53 +00:00
5323969fcf Increase #blocks 2024-04-24 08:56:58 +00:00
f42b4c27d8 Include argmax to jit 2024-04-24 08:56:45 +00:00
620e7646d3 Fix cache write 2024-04-24 08:56:30 +00:00
d5fb1c20c1 Fix JAX jit OOM 2024-04-24 07:52:56 +00:00
092e3d6d6d Remove hardcoded path 2024-04-19 08:18:10 +00:00
84284302d8 Minor 2024-04-19 08:08:25 +00:00
743695f586 Fix write_to_kv_cache 2024-04-19 07:51:54 +00:00
62b870fa07 Use FlashAttention kernel 2024-04-17 20:24:45 +00:00
7e3a230c38 Fix paged_attn 2024-04-17 20:06:26 +00:00
186c88c497 Explicitly return new_kv_caches 2024-04-17 18:42:34 +00:00
ef762cb110 Write KV 2024-04-17 18:21:39 +00:00
756c4e78d3 Add write_to_cache ops 2024-04-17 18:20:55 +00:00
4880de35d2 Add attn_mask 2024-04-17 18:12:20 +00:00
0fb07c08d0 Minor 2024-04-17 18:08:33 +00:00
e4377dd698 Add model runner 2024-04-17 18:04:54 +00:00
5cb213c85e Add flash-attn op 2024-04-17 18:02:28 +00:00
25bbc21ef6 Minor 2024-04-17 18:02:16 +00:00
b25fcc06c2 Minor 2024-04-17 18:02:13 +00:00
6661c030c4 Add paged_attn op 2024-04-17 18:02:00 +00:00
8888d1c474 Fix logit indices 2024-04-17 18:01:43 +00:00
cedb67028a Add gemma 2024-04-17 17:00:10 +00:00
91b47e3f2f JAX-based TPU worker 2024-04-16 17:37:11 +00:00
6d62e4c6aa Add torch to dependencies 2024-04-16 17:06:35 +00:00
de82e95787 Minor 2024-04-16 17:04:46 +00:00
b3b89cf755 Renew TPU executor 2024-04-16 09:42:15 +00:00
6692a30266 Minor 2024-04-16 09:41:53 +00:00
eb0a0466a9 Add JAX requirements 2024-04-16 08:05:54 +00:00
c59c1e7b2c Remove 2024-04-16 08:05:36 +00:00
d4adf92beb Merge branch 'main' into woosuk-tpu 2024-04-16 07:56:53 +00:00
363e6a950f Fix flashattn 2024-04-10 08:02:40 +00:00
696b653193 yapf 2024-04-10 08:02:21 +00:00
0d6402ddfd Fix requirements 2024-04-10 07:52:45 +00:00
60ff6b8c5c Merge branch 'main' into woosuk-tpu 2024-04-10 07:51:35 +00:00
d899009a63 [WIP] Add TPU worker 2024-04-01 08:24:23 +00:00
6894d3efef Add JAX to requirements.txt 2024-04-01 08:23:59 +00:00
38e3d33a62 Add TPU to device config 2024-04-01 08:23:44 +00:00
02e614d922 [WIP] Add Pallas backend 2024-04-01 08:23:32 +00:00
46b31ed98d Fix RoPE output shape 2024-04-01 08:22:47 +00:00
31d05f7edb yapf 2024-04-01 07:07:57 +00:00
4cdb732cef Add TPU to setup 2024-04-01 07:07:38 +00:00
27c592b97b Add get_dtype_size 2024-04-01 06:33:06 +00:00
5083aa9092 Add TPUExecutor 2024-04-01 03:24:07 +00:00
824521c987 Add TPU to DeviceConfig 2024-04-01 03:19:17 +00:00
3b8f43024f Add is_tpu 2024-04-01 03:18:36 +00:00
d148c2ef00 Add requirements 2024-04-01 03:17:43 +00:00
86f073edd6 Add reference 2024-04-01 02:02:13 +00:00
52a1e908e4 Add TPU gemma 2024-04-01 02:01:28 +00:00
340 changed files with 7958 additions and 21602 deletions

View File

@ -1,36 +0,0 @@
import os
import zipfile

MAX_SIZE_MB = 100


def print_top_10_largest_files(zip_file):
    with zipfile.ZipFile(zip_file, 'r') as z:
        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
        file_sizes.sort(key=lambda x: x[1], reverse=True)
        for f, size in file_sizes[:10]:
            print(f"{f}: {size / (1024 * 1024):.2f} MB uncompressed.")


def check_wheel_size(directory):
    for root, _, files in os.walk(directory):
        for f in files:
            if f.endswith(".whl"):
                wheel_path = os.path.join(root, f)
                wheel_size = os.path.getsize(wheel_path)
                wheel_size_mb = wheel_size / (1024 * 1024)
                if wheel_size_mb > MAX_SIZE_MB:
                    print(f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
                          f"compared to the allowed size ({MAX_SIZE_MB} MB).")
                    print_top_10_largest_files(wheel_path)
                    return 1
                else:
                    print(f"Wheel {wheel_path} is within the allowed size "
                          f"({wheel_size_mb} MB).")
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(check_wheel_size(sys.argv[1]))

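Note: as the Dockerfile hunk further below shows, this helper is invoked during the image build as "python3 check-wheel-size.py dist", so the build fails whenever a wheel under dist/ exceeds MAX_SIZE_MB.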
View File

@ -1,44 +1,38 @@
# This script builds the ROCm docker image and runs tests inside it.
# This script builds the ROCm docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex
# Print ROCm version
echo "--- ROCm info"
rocminfo
echo "--- Resetting GPUs"
# Try building the docker image
docker build -t rocm -f Dockerfile.rocm .
echo "reset" > /opt/amdgpu/etc/gpu_state
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- Building container"
sha=$(git rev-parse --short HEAD)
container_name=rocm_${sha}
docker build \
-t ${container_name} \
-f Dockerfile.rocm \
--progress plain \
.
remove_docker_container() {
docker rm -f ${container_name} || docker image rm -f ${container_name} || true
}
# Setup cleanup
remove_docker_container() { docker rm -f rocm || true; }
trap remove_docker_container EXIT
remove_docker_container
echo "--- Running container"
# Run the image
docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--rm \
-e HF_TOKEN \
--name ${container_name} \
${container_name} \
/bin/bash -c $(echo $1 | sed "s/^'//" | sed "s/'$//")
# Wait for the server to start
wait_for_server_to_start() {
timeout=300
counter=0
while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
sleep 1
counter=$((counter + 1))
if [ $counter -ge $timeout ]; then
echo "Timeout after $timeout seconds"
break
fi
done
}
wait_for_server_to_start
# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
localhost:8000/generate \
-d '{"prompt": "San Francisco is a"}'

View File

@ -53,11 +53,6 @@ echo '```' >> benchmark_results.md
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
echo '```' >> benchmark_results.md
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
exit 0
fi
# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md

View File

@ -1,51 +0,0 @@
# This script builds the Neuron docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -e
# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
if [ -f /tmp/neuron-docker-build-timestamp ]; then
last_build=$(cat /tmp/neuron-docker-build-timestamp)
current_time=$(date +%s)
if [ $((current_time - last_build)) -gt 86400 ]; then
docker system prune -f
echo $current_time > /tmp/neuron-docker-build-timestamp
fi
else
echo $(date +%s) > /tmp/neuron-docker-build-timestamp
fi
docker build -t neuron -f Dockerfile.neuron .
# Setup cleanup
remove_docker_container() { docker rm -f neuron || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
# Wait for the server to start
wait_for_server_to_start() {
timeout=300
counter=0
while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
sleep 1
counter=$((counter + 1))
if [ $counter -ge $timeout ]; then
echo "Timeout after $timeout seconds"
break
fi
done
}
wait_for_server_to_start
# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
localhost:8000/generate \
-d '{"prompt": "San Francisco is a"}'

View File

@ -15,41 +15,31 @@ steps:
commands:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=ROCM_FLASH pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- VLLM_ATTENTION_BACKEND=ROCM_FLASH pytest -v -s basic_correctness/test_chunked_prefill.py
- label: Core Test
mirror_hardwares: [amd]
command: pytest -v -s core
- label: Distributed Comm Ops Test
command: pytest -v -s test_comm_ops.py
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2
num_gpus: 2 # only support 1 or 2 for now.
- label: Distributed Tests
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now.
mirror_hardwares: [amd]
commands:
- pytest -v -s test_pynccl_library.py
- pytest -v -s test_pynccl.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
- label: Distributed Tests (Multiple Groups)
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 4
commands:
- pytest -v -s test_pynccl.py
- label: Engine Test
mirror_hardwares: [amd]
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
command: pytest -v -s engine tokenization test_sequence.py test_config.py
- label: Entrypoints Test
commands:
@ -59,7 +49,6 @@ steps:
- label: Examples Test
working_dir: "/vllm-workspace/examples"
mirror_hardwares: [amd]
commands:
# install aws cli for llava_example.py
- pip install awscli
@ -73,19 +62,16 @@ steps:
parallelism: 4
- label: Models Test
mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
- label: Llava Test
mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models/test_llava.py
- label: Prefix Caching Test
mirror_hardwares: [amd]
commands:
- pytest -v -s prefix_caching
@ -93,15 +79,12 @@ steps:
command: pytest -v -s samplers
- label: LogitsProcessor Test
mirror_hardwares: [amd]
command: pytest -v -s test_logits_processor.py
- label: Worker Test
mirror_hardwares: [amd]
command: pytest -v -s worker
- label: Speculative decoding tests
mirror_hardwares: [amd]
command: pytest -v -s spec_decode
- label: LoRA Test %N
@ -109,17 +92,13 @@ steps:
parallelism: 4
- label: Tensorizer Test
command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
command: apt-get install curl libsodium23 && pytest -v -s tensorizer
- label: Metrics Test
command: pytest -v -s metrics
- label: Quantization Test
command: pytest -v -s quantization
- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
commands:
- pip install aiohttp
- bash run-benchmarks.sh

View File

@ -3,6 +3,13 @@
{% set default_working_dir = "/vllm-workspace/tests" %}
steps:
- label: "AMD Test"
agents:
queue: amd
command: bash .buildkite/run-amd-test.sh
- label: "CPU Test"
command: bash .buildkite/run-cpu-test.sh
- label: ":docker: build image"
commands:
@ -16,31 +23,6 @@ steps:
limit: 5
- wait
- group: "AMD Tests"
depends_on: ~
steps:
{% for step in steps %}
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
- label: "AMD: {{ step.label }}"
agents:
queue: amd
command: bash .buildkite/run-amd-test.sh "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
env:
DOCKER_BUILDKIT: "1"
{% endif %}
{% endfor %}
- label: "Neuron Test"
depends_on: ~
agents:
queue: neuron
command: bash .buildkite/run-neuron-test.sh
soft_fail: true
- label: "Intel Test"
depends_on: ~
command: bash .buildkite/run-cpu-test.sh
{% for step in steps %}
- label: "{{ step.label }}"
agents:
@ -56,9 +38,6 @@ steps:
plugins:
- kubernetes:
podSpec:
{% if step.num_gpus %}
priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
{% endif %}
volumes:
- name: dshm
emptyDir:

View File

@ -18,7 +18,6 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and run the latest version of the script, as vLLM may update the diagnostic information needed to respond to issues accurately and quickly.
value: |
```text
The output of `python collect_env.py`

View File

@ -18,7 +18,6 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and run the latest version of the script, as vLLM may update the diagnostic information needed to respond to issues accurately and quickly.
value: |
```text
The output of `python collect_env.py`

View File

@ -18,7 +18,6 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and run the latest version of the script, as vLLM may update the diagnostic information needed to respond to issues accurately and quickly.
value: |
```text
The output of `python collect_env.py`
@ -58,8 +57,6 @@ body:
If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
If you experience crashes or hangs, it would be helpful to run vLLM with `export VLLM_TRACE_FUNCTION=1`. All function calls in vLLM will be recorded. Inspect these log files and report which function crashes or hangs.
placeholder: |
A clear and concise description of what the bug is.

View File

@ -39,7 +39,6 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and run the latest version of the script, as vLLM may update the diagnostic information needed to respond to issues accurately and quickly.
value: |
```text
The output of `python collect_env.py`

View File

@ -1,49 +0,0 @@
name: 💬 Request for comments (RFC).
description: Ask for feedback on major architectural changes or design choices.
title: "[RFC]: "
labels: ["RFC"]
body:
- type: markdown
attributes:
value: >
#### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference.
- type: textarea
attributes:
label: Motivation.
description: >
The motivation of the RFC.
validations:
required: true
- type: textarea
attributes:
label: Proposed Change.
description: >
The proposed change of the RFC.
validations:
required: true
- type: textarea
attributes:
label: Feedback Period.
description: >
The feedback period of the RFC. Usually at least one week.
validations:
required: false
- type: textarea
attributes:
label: CC List.
description: >
The list of people you want to CC.
validations:
required: false
- type: textarea
attributes:
label: Any Other Things.
description: >
Any other things you would like to mention.
validations:
required: false
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!

View File

@ -32,19 +32,19 @@ jobs:
pip install types-setuptools
- name: Mypy
run: |
mypy vllm/attention --config-file pyproject.toml
mypy vllm/core --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml
mypy vllm/engine --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml
mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/attention/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/distributed/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/entrypoints/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/executor/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/usage/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/transformers_utils/*.py --follow-imports=skip --config-file pyproject.toml
# TODO(sang): Follow up
# mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
# mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
# mypy vllm/spec_decoding/*.py --follow-imports=skip --config-file pyproject.toml
# mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
# mypy vllm/lora/*.py --follow-imports=skip --config-file pyproject.toml

View File

@ -49,16 +49,13 @@ jobs:
matrix:
os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11']
pytorch-version: ['2.3.0'] # Must be the most recent version that meets requirements-cuda.txt.
pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Setup ccache
uses: hendrikmuhs/ccache-action@v1.2
- name: Set up Linux Env
if: ${{ runner.os == 'Linux' }}
run: |
@ -79,8 +76,6 @@ jobs:
- name: Build wheel
shell: bash
env:
CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
run: |
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
wheel_name=$(ls dist/*whl | xargs -n 1 basename)

View File

@ -8,7 +8,7 @@ module.exports = async (github, context, core) => {
generate_release_notes: true,
name: process.env.RELEASE_TAG,
owner: context.repo.owner,
prerelease: true,
prerelease: false,
repo: context.repo.repo,
tag_name: process.env.RELEASE_TAG,
});

.gitignore (vendored), 2 changes
View File

@ -70,8 +70,6 @@ instance/
# Sphinx documentation
docs/_build/
docs/source/getting_started/examples/*.rst
!**/*.template.rst
# PyBuilder
.pybuilder/

View File

@ -31,7 +31,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
set(TORCH_SUPPORTED_VERSION_CUDA "2.2.1")
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
@ -167,18 +167,14 @@ set(VLLM_EXT_SRC
"csrc/layernorm_kernels.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/fp8/fp8_cuda_kernels.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
"csrc/pybind.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/marlin/marlin_cuda_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/custom_all_reduce.cu")
endif()
@ -214,11 +210,23 @@ define_gpu_extension_target(
set(VLLM_PUNICA_EXT_SRC
"csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu"
"csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu"
"csrc/punica/punica_ops.cc")
#

View File

@ -1,13 +1,9 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.
# Please update any changes made here to
# docs/source/dev/dockerfile/dockerfile.rst and
# docs/source/assets/dev/dockerfile-stages-dependency.png
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
RUN apt-get update -y \
&& apt-get install -y python3-pip git
@ -16,7 +12,7 @@ RUN apt-get update -y \
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.4/compat/
RUN ldconfig /usr/local/cuda-12.1/compat/
WORKDIR /workspace
@ -75,10 +71,6 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
python3 setup.py bdist_wheel --dist-dir=dist
# check the size of the wheel; we cannot upload wheels larger than 100 MB
COPY .buildkite/check-wheel-size.py check-wheel-size.py
RUN python3 check-wheel-size.py dist
# the `vllm_nccl` package must be installed from source distribution
# pip is too smart to store a wheel in the cache, and other CI jobs
# will directly use the wheel from the cache, which is not what we want.
@ -93,7 +85,7 @@ FROM dev as flash-attn-builder
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# flash attention version
ARG flash_attn_version=v2.5.8
ARG flash_attn_version=v2.5.6
ENV FLASH_ATTN_VERSION=${flash_attn_version}
WORKDIR /usr/src/flash-attention-v2
@ -106,7 +98,7 @@ RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
WORKDIR /vllm-workspace
RUN apt-get update -y \
@ -116,7 +108,7 @@ RUN apt-get update -y \
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.4/compat/
RUN ldconfig /usr/local/cuda-12.1/compat/
# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \

View File

@ -1,36 +0,0 @@
# default base image
ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"
FROM $BASE_IMAGE
RUN echo "Base image is $BASE_IMAGE"
# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
COPY ./vllm /app/vllm/vllm
COPY ./setup.py /app/vllm/setup.py
COPY ./requirements-common.txt /app/vllm/requirements-common.txt
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
RUN cd /app/vllm \
&& python3 -m pip install -U -r requirements-neuron.txt
ENV VLLM_BUILD_WITH_NEURON 1
RUN cd /app/vllm \
&& pip install -e . \
&& cd ..
CMD ["/bin/bash"]

View File

@ -14,7 +14,7 @@ RUN echo "Base image is $BASE_IMAGE"
ARG FA_GFX_ARCHS="gfx90a;gfx942"
RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
ARG FA_BRANCH="ae7928c"
ARG FA_BRANCH="3d2b6f5"
RUN echo "FA_BRANCH is $FA_BRANCH"
# whether to build flash-attention
@ -46,7 +46,7 @@ RUN apt-get update && apt-get install -y \
### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/vllm-workspace
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}
@ -89,16 +89,18 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \
&& cd ../..; \
fi
WORKDIR /vllm-workspace
COPY . .
COPY ./ /app/vllm
RUN python3 -m pip install --upgrade pip numba
RUN python3 -m pip install xformers==0.0.23 --no-deps
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -U -r requirements-rocm.txt \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
RUN cd /app \
&& cd vllm \
&& pip install -U -r requirements-rocm.txt \
&& if [ "$BUILD_FA" = "1" ]; then \
bash patch_xformers.rocm.sh; fi \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
&& python3 setup.py install \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
&& cd ..
RUN python3 -m pip install --upgrade pip

View File

@ -1,9 +1,6 @@
include LICENSE
include requirements-common.txt
include requirements-cuda.txt
include requirements-rocm.txt
include requirements-neuron.txt
include requirements-cpu.txt
include CMakeLists.txt
recursive-include cmake *

View File

@ -69,16 +69,15 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
- LLaMA, Llama 2, and Meta Llama 3 (`meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
- MiniCPM (`openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, etc.)
- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.)
- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
- OLMo (`allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.)
- OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.)
- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.)
- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
- Phi-3 (`microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, etc.)
- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
- Qwen2 (`Qwen/Qwen1.5-7B`, `Qwen/Qwen1.5-7B-Chat`, etc.)
- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)

View File

@ -135,7 +135,6 @@ async def async_request_trt_llm(
"data:")
data = json.loads(chunk)
output.generated_text += data["text_output"]
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
@ -150,6 +149,7 @@ async def async_request_trt_llm(
most_recent_timestamp = timestamp
output.latency = most_recent_timestamp - st
output.generated_text = json.loads(data)["text_output"]
output.success = True
else:

View File

@ -0,0 +1,148 @@
import functools
import time
from typing import Tuple

import chex
import jax
import jax.numpy as jnp

_PAD_SLOT_ID = -1


@jax.jit
def write_to_kv_cache1(
    key: jax.Array,           # [batch_size, seq_len, num_heads, head_size]
    value: jax.Array,         # [batch_size, seq_len, num_heads, head_size]
    k_cache: jax.Array,       # [num_heads, num_blocks * block_size, head_size]
    v_cache: jax.Array,       # [num_heads, num_blocks * block_size, head_size]
    slot_mapping: jax.Array,  # [batch_size, seq_len]
) -> Tuple[jax.Array, jax.Array]:
    num_heads = key.shape[-2]
    head_size = key.shape[-1]

    key = key.reshape(-1, num_heads, head_size)
    key = key.transpose((1, 0, 2))
    value = value.reshape(-1, num_heads, head_size)
    value = value.transpose((1, 0, 2))

    # Scatter every (batch, seq) position into the flat slot dimension in a
    # single indexed update.
    k_cache = k_cache.at[:, slot_mapping.reshape(-1), :].set(key)
    v_cache = v_cache.at[:, slot_mapping.reshape(-1), :].set(value)
    return k_cache, v_cache


# Donate the cache buffers so XLA may reuse them for the outputs instead of copying.
@functools.partial(jax.jit, donate_argnums=(2, 3))
def write_to_kv_cache2(
    key: jax.Array,           # [batch_size, seq_len, num_heads, head_size]
    value: jax.Array,         # [batch_size, seq_len, num_heads, head_size]
    k_cache: jax.Array,       # [num_heads, num_blocks * block_size, head_size]
    v_cache: jax.Array,       # [num_heads, num_blocks * block_size, head_size]
    slot_mapping: jax.Array,  # [batch_size, seq_len]
) -> Tuple[jax.Array, jax.Array]:
    batch_size = slot_mapping.shape[0]

    def cond(val: _IteratorState):
        return val.idx < batch_size

    def body(val: _IteratorState):
        k_cache, v_cache = _write_seq_to_kv_cache(
            key[val.idx],
            value[val.idx],
            val.k_cache,
            val.v_cache,
            slot_mapping[val.idx],
        )
        val.k_cache = k_cache
        val.v_cache = v_cache
        val.idx += 1
        return val

    # Loop over the sequences with lax.while_loop, threading the K/V caches
    # through the loop carry.
    iterator = _IteratorState(idx=0, k_cache=k_cache, v_cache=v_cache)
    iterator = jax.lax.while_loop(cond, body, iterator)
    return iterator.k_cache, iterator.v_cache


@functools.partial(jax.jit, donate_argnums=(2, 3))
def _write_seq_to_kv_cache(
    key: jax.Array,           # [seq_len, num_heads, head_size]
    value: jax.Array,         # [seq_len, num_heads, head_size]
    k_cache: jax.Array,       # [num_heads, num_blocks * block_size, head_size]
    v_cache: jax.Array,       # [num_heads, num_blocks * block_size, head_size]
    slot_mapping: jax.Array,  # [seq_len]
) -> Tuple[jax.Array, jax.Array]:
    seq_len = slot_mapping.shape[0]
    num_heads, _, head_size = k_cache.shape

    # Reshape to match the rank of kv_cache.
    key = key.reshape(seq_len, num_heads, 1, head_size)
    value = value.reshape(seq_len, num_heads, 1, head_size)

    def cond(val: _IteratorState):
        return jnp.logical_and(
            val.idx < seq_len, slot_mapping[val.idx] != _PAD_SLOT_ID)

    def body(val: _IteratorState):
        slot_idx = slot_mapping[val.idx]
        val.k_cache = jax.lax.dynamic_update_slice(
            val.k_cache,
            key[val.idx],
            (0, slot_idx, 0),
        )
        val.v_cache = jax.lax.dynamic_update_slice(
            val.v_cache,
            value[val.idx],
            (0, slot_idx, 0),
        )
        val.idx += 1
        return val

    iterator = _IteratorState(idx=0, k_cache=k_cache, v_cache=v_cache)
    iterator = jax.lax.while_loop(cond, body, iterator)
    return iterator.k_cache, iterator.v_cache


@chex.dataclass
class _IteratorState:
    idx: jnp.int32
    k_cache: jnp.ndarray  # [num_heads, num_blocks, block_size, head_size]
    v_cache: jnp.ndarray  # [num_heads, num_blocks, block_size, head_size]


def benchmark_write_to_kv_cache(
    batch_size: int,
    seq_len: int,
    num_kv_heads: int,
    head_size: int,
    num_blocks: int,
    block_size: int,
    version: int = 1,
):
    if version == 1:
        f = write_to_kv_cache1
    elif version == 2:
        f = write_to_kv_cache2
    else:
        raise ValueError(f"Invalid version: {version}")

    rng_key = jax.random.PRNGKey(0)
    key = jax.random.normal(rng_key, (batch_size, seq_len, num_kv_heads, head_size), dtype=jnp.bfloat16)
    value = jax.random.normal(rng_key, (batch_size, seq_len, num_kv_heads, head_size), dtype=jnp.bfloat16)
    k_cache = jax.random.normal(rng_key, (num_kv_heads, num_blocks * block_size, head_size), dtype=jnp.bfloat16)
    v_cache = jax.random.normal(rng_key, (num_kv_heads, num_blocks * block_size, head_size), dtype=jnp.bfloat16)
    slot_mapping = jax.random.randint(rng_key, (batch_size, seq_len), 0, num_blocks * block_size, dtype=jnp.int32)

    # For JIT compilation.
    k_cache, v_cache = f(key, value, k_cache, v_cache, slot_mapping)
    k_cache.block_until_ready()

    start = time.time()
    for _ in range(100):
        k_cache, v_cache = f(key, value, k_cache, v_cache, slot_mapping)
    k_cache.block_until_ready()
    end = time.time()

    print(f"Time taken: {(end - start) * 10:.2f} ms")


if __name__ == "__main__":
    for num_blocks in [16, 256, 512, 1024, 2048, 8192, 16384]:
        print(f"Benchmarking Write to KV Cache w/ {num_blocks} blocks")
        benchmark_write_to_kv_cache(16, 256, 16, 256, num_blocks, 16, version=1)

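Aside (not part of the diff): a minimal sketch of what donate_argnums buys in the benchmark above. Donating the cache arguments tells XLA it may reuse their device buffers for the outputs instead of allocating and copying fresh caches on every call. The function and variable names below are illustrative only.

import functools
import jax
import jax.numpy as jnp

@functools.partial(jax.jit, donate_argnums=(1,))
def set_first_row(x, cache):
    # With donation, XLA is free to write the result into the buffer that was
    # passed in as `cache` rather than allocating a new array.
    return cache.at[0].set(x)

cache = jnp.zeros((4, 8))
cache = set_first_row(jnp.ones(8), cache)  # the donated input must not be reused afterwards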
View File

@ -0,0 +1,101 @@
import argparse
import functools
import time

import jax
import jax.numpy as jnp
from jax.experimental.pallas.ops.tpu.paged_attention import paged_attention

BLOCK_SIZE = 16
MAX_NUM_BLOCKS_PER_SEQ = 512


@functools.partial(jax.jit, static_argnums=(6, 7))
def paged_attn(
    q: jax.Array,             # [batch, 1, num_heads, head_size]
    k_cache: jax.Array,       # [num_kv_heads, num_blocks * block_size, head_size]
    v_cache: jax.Array,       # [num_kv_heads, num_blocks * block_size, head_size]
    sm_scale: float,
    block_tables: jax.Array,  # [batch, max_num_blocks_per_batch]
    context_lens: jax.Array,  # [batch]
    block_size: int,
    pages_per_compute_block: int,
) -> jax.Array:               # [batch, 1, num_heads, head_size]
    q = q.squeeze(1)
    q = q * sm_scale

    head_size = q.shape[-1]
    num_slots = k_cache.shape[-2]
    # Reshape the flat caches into the blocked layout
    # [num_kv_heads, num_blocks, block_size, head_size] consumed by the kernel.
    k_cache = k_cache.reshape(-1, num_slots // block_size, block_size, head_size)
    v_cache = v_cache.reshape(-1, num_slots // block_size, block_size, head_size)

    output = paged_attention(
        q,
        k_cache,
        v_cache,
        context_lens,
        block_tables,
        pages_per_compute_block=pages_per_compute_block,
    )
    return output.reshape(q.shape[0], 1, q.shape[1], q.shape[2])


def benchmark_paged_attn(
    batch_size: int,
    num_heads: int,
    num_kv_heads: int,
    head_size: int,
    context_len: int,
    num_blocks: int,
    block_size: int,
    pages_per_compute_block: int,
):
    rng_key = jax.random.PRNGKey(0)
    query = jax.random.normal(rng_key, (batch_size, 1, num_heads, head_size), dtype=jnp.bfloat16)
    k_cache = jax.random.normal(rng_key, (num_kv_heads, num_blocks * block_size, head_size), dtype=jnp.bfloat16)
    v_cache = jax.random.normal(rng_key, (num_kv_heads, num_blocks * block_size, head_size), dtype=jnp.bfloat16)
    sm_scale = head_size ** -0.5
    block_tables = jax.random.randint(rng_key, (batch_size, MAX_NUM_BLOCKS_PER_SEQ), 0, num_blocks, dtype=jnp.int32)
    context_lens = jnp.array([context_len] * batch_size, dtype=jnp.int32)

    # For JIT compilation.
    output = paged_attn(query, k_cache, v_cache, sm_scale, block_tables, context_lens, block_size, pages_per_compute_block)
    output.block_until_ready()

    start = time.time()
    for _ in range(100):
        output = paged_attn(query, k_cache, v_cache, sm_scale, block_tables, context_lens, block_size, pages_per_compute_block)
    output.block_until_ready()
    end = time.time()

    print(f"Time taken: {(end - start) * 10000:.2f} us")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch-size", type=int, default=8)
    parser.add_argument("--num-heads", type=int, default=16)
    parser.add_argument("--num-kv-heads", type=int, default=16)
    parser.add_argument("--head-size", type=int, default=256)
    parser.add_argument("--context-len", type=int, default=512)
    parser.add_argument("--num-blocks", type=int, default=2048)
    args = parser.parse_args()
    print(args)

    for block_size in [16, 32, 64, 128]:
        for pages_per_compute_block in [1, 2, 4, 8, 16, 32, 64, 128]:
            if pages_per_compute_block > MAX_NUM_BLOCKS_PER_SEQ:
                continue
            if block_size * pages_per_compute_block > 1024:
                continue
            print(f"block_size {block_size}, pages_per_compute_block: {pages_per_compute_block}")
            benchmark_paged_attn(
                args.batch_size,
                args.num_heads,
                args.num_kv_heads,
                args.head_size,
                args.context_len,
                args.num_blocks,
                block_size,
                pages_per_compute_block,
            )

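Aside (illustrative only, not part of the diff): the reshape inside paged_attn converts the flat slot layout [num_kv_heads, num_blocks * block_size, head_size] into the blocked layout the Pallas kernel consumes, so a flat slot index s maps to block s // block_size at offset s % block_size. A tiny check of that equivalence with made-up sizes:

import jax.numpy as jnp

block_size = 16
num_kv_heads, num_blocks, head_size = 2, 4, 8
k_flat = jnp.arange(num_kv_heads * num_blocks * block_size * head_size,
                    dtype=jnp.float32).reshape(num_kv_heads, num_blocks * block_size, head_size)
k_blocked = k_flat.reshape(num_kv_heads, num_blocks, block_size, head_size)

slot = 21  # arbitrary flat slot index
assert (k_flat[:, slot] == k_blocked[:, slot // block_size, slot % block_size]).all()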
View File

@ -9,7 +9,6 @@ import torch
from tqdm import tqdm
from vllm import LLM, SamplingParams
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
def main(args: argparse.Namespace):
@ -102,7 +101,7 @@ if __name__ == '__main__':
parser.add_argument('--tokenizer', type=str, default=None)
parser.add_argument('--quantization',
'-q',
choices=[*QUANTIZATION_METHODS, None],
choices=['awq', 'gptq', 'squeezellm', None],
default=None)
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--input-len', type=int, default=32)

View File

@ -16,22 +16,20 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
def main(args):
llm = LLM(model=args.model,
llm = LLM(model="baichuan-inc/Baichuan2-13B-Chat",
tokenizer_mode='auto',
trust_remote_code=True,
enforce_eager=True,
use_v2_block_manager=args.use_v2_block_manager,
tensor_parallel_size=args.tensor_parallel_size,
enable_prefix_caching=args.enable_prefix_caching)
num_prompts = 100
prompts = [PROMPT] * num_prompts
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
sampling_params = SamplingParams(temperature=0, max_tokens=100)
print("------warm up------")
test_prefix(
llm=llm,
prompts=prompts,
prompts=prompts[:1],
sampling_params=sampling_params,
)
@ -47,16 +45,8 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Benchmark the performance with or without automatic '
'prefix caching.')
parser.add_argument('--model',
type=str,
default='baichuan-inc/Baichuan2-13B-Chat')
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--output-len', type=int, default=10)
parser.add_argument('--enable-prefix-caching',
action='store_true',
help='enable prefix caching')
parser.add_argument('--use-v2-block-manager',
action='store_true',
help='Use BlockSpaceMangerV2')
args = parser.parse_args()
main(args)

View File

@ -27,7 +27,7 @@ import time
import warnings
from dataclasses import dataclass
from datetime import datetime
from typing import AsyncGenerator, List, Optional, Tuple
from typing import AsyncGenerator, List, Tuple
import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@ -58,11 +58,7 @@ def sample_sharegpt_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
@ -72,32 +68,38 @@ def sample_sharegpt_requests(
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
# Shuffle the dataset.
random.shuffle(dataset)
# some of these will be filtered out, so sample more than we need
sampled_indices = random.sample(range(len(dataset)),
int(num_requests * 1.2))
dataset = [dataset[i] for i in sampled_indices]
# Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
# Tokenize the prompts and completions.
prompts = [prompt for prompt, _ in dataset]
prompt_token_ids = tokenizer(prompts).input_ids
completions = [completion for _, completion in dataset]
completion_token_ids = tokenizer(completions).input_ids
tokenized_dataset = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
output_len = len(completion_token_ids[i])
tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
# Filter out too long sequences.
filtered_dataset: List[Tuple[str, int, int]] = []
for prompt, prompt_token_ids, output_len in tokenized_dataset:
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
# This is because TGI causes errors when the input or output length
# is too short.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
filtered_dataset.append((prompt, prompt_len, output_len))
return filtered_dataset
# Sample the requests.
sampled_requests = random.sample(filtered_dataset, num_requests)
return sampled_requests
def sample_sonnet_requests(
@ -359,7 +361,6 @@ def main(args: argparse.Namespace):
dataset_path=args.dataset,
num_requests=args.num_prompts,
tokenizer=tokenizer,
fixed_output_len=args.sharegpt_output_len,
)
elif args.dataset_name == "sharegpt":
@ -367,7 +368,6 @@ def main(args: argparse.Namespace):
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
tokenizer=tokenizer,
fixed_output_len=args.sharegpt_output_len,
)
elif args.dataset_name == "sonnet":
@ -524,12 +524,6 @@ if __name__ == "__main__":
default=1000,
help="Number of prompts to process.",
)
parser.add_argument(
"--sharegpt-output-len",
type=int,
default=None,
help="Output length for each request. Overrides the output length "
"from the ShareGPT dataset.")
parser.add_argument(
"--sonnet-input-len",
type=int,

View File

@ -10,8 +10,6 @@ from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
def sample_requests(
dataset_path: str,
@ -103,22 +101,25 @@ def run_vllm(
)
# Add the requests to the engine.
prompts = []
sampling_params = []
for prompt, _, output_len in requests:
prompts.append(prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=0.0 if use_beam_search else 1.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
max_tokens=output_len,
))
sampling_params = SamplingParams(
n=n,
temperature=0.0 if use_beam_search else 1.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
max_tokens=output_len,
)
# FIXME(woosuk): Do not use internal method.
llm._add_request(
prompt=prompt,
prompt_token_ids=None,
sampling_params=sampling_params,
)
start = time.perf_counter()
llm.generate(prompts, sampling_params, use_tqdm=True)
# FIXME(woosuk): Do not use internal method.
llm._run_engine(use_tqdm=True)
end = time.perf_counter()
return end - start
@ -266,7 +267,7 @@ if __name__ == "__main__":
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument('--quantization',
'-q',
choices=[*QUANTIZATION_METHODS, None],
choices=['awq', 'gptq', 'squeezellm', None],
default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n",
@ -334,7 +335,7 @@ if __name__ == "__main__":
"--device",
type=str,
default="cuda",
choices=["cuda", "cpu"],
choices=["cuda", "cpu", "tpu"],
help='device type for vLLM execution, supporting CUDA, CPU, and TPU.')
parser.add_argument(
"--enable-prefix-caching",

View File

@ -1,302 +0,0 @@
import argparse
import os
import sys
from typing import Optional
import torch
import torch.nn.functional as F
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.aqlm import (
dequantize_weight, generic_dequantize_gemm, get_int_dtype,
optimized_dequantize_gemm)
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
def torch_mult(
input: torch.Tensor, # [..., in_features]
weights: torch.Tensor,
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
) -> torch.Tensor:
output = F.linear(input, weights)
return output
def dequant_out_scale(
input: torch.Tensor, # [..., in_features]
codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
codebooks: torch.
Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
output_partition_sizes: torch.IntTensor,
bias: Optional[torch.Tensor],
) -> torch.Tensor:
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
if bias is None:
output = F.linear(input, weights, bias)
orig_shape = output.shape
flattened_output = output.view(-1, output.size(-1))
f_scales = scales.view(-1, scales.shape[0])
b_scales = f_scales.expand(flattened_output.shape[0], -1)
flattened_output *= b_scales
return flattened_output.view(orig_shape)
else:
b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
-1, weights.shape[1])
weights *= b_scales
return F.linear(input, weights, bias)
def dequant_weight_scale(
input: torch.Tensor, # [..., in_features]
codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
codebooks: torch.
Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
output_partition_sizes: torch.IntTensor,
bias: Optional[torch.Tensor],
) -> torch.Tensor:
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
-1, weights.shape[1])
weights *= b_scales
return F.linear(input, weights, bias)
def dequant_no_scale(
input: torch.Tensor, # [..., in_features]
codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
codebooks: torch.
Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
output_partition_sizes: torch.IntTensor,
bias: Optional[torch.Tensor],
) -> torch.Tensor:
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
return F.linear(input, weights, bias)
# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
# the generic pytorch version.
# Just visual comparison.
def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
n = parts.sum().item()
device = torch.device('cuda:0')
code_range = (1 << bits) // 2
ingroups = 8
codes = torch.randint(-code_range,
code_range,
size=(n, k // ingroups, nbooks),
dtype=get_int_dtype(bits),
device=device)
codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
dtype=torch.float16,
device=device)
count = 0
for index in range(16):
for i in range(8):
for book in range(nbooks):
codebooks[book, index, 0, i] = count * (10**book)
count += 1
print("codes shape", codes.shape)
for i in range(16):
for book in range(nbooks):
codes[0, i, book] = i
codes[0, -i, book] = i
weights = dequantize_weight(codes, codebooks, None)
weights2 = ops.aqlm_dequant(codes, codebooks, parts)
print("weights shape:", weights.shape)
print("weights2 shape:", weights2.shape)
print("weights are:", weights)
print("weights2 are:", weights2)
print("first 128 weights are", weights[0, 0:128].to(torch.int32))
print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
print("last 128 weights are", weights[0, -128:])
print("last 128 weights2 are:", weights2[0, -128:])
def main():
parser = argparse.ArgumentParser(description="Benchmark aqlm performance.")
# Add arguments
parser.add_argument("--nbooks",
type=int,
default=1,
help="Number of codebooks (default: 1)")
parser.add_argument("--bits",
type=int,
default=16,
help="Number of bits per code element (default: 16)")
parser.add_argument(
"--test",
type=bool,
default=False,
help="Run the decompression/dequant tester rather than benchmarking "
"(default: False)")
# Parse the arguments
args = parser.parse_args()
# Extract values
nbooks = args.nbooks
bits = args.bits
if args.test:
dequant_test(4096, torch.tensor((4096, )), nbooks, bits)
return
# Otherwise, benchmark.
methods = [
ops.aqlm_gemm,
dequant_out_scale,
generic_dequantize_gemm,
optimized_dequantize_gemm,
dequant_weight_scale,
torch_mult,
dequant_no_scale,
]
filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
print(f"writing benchmarks to file {filename}")
with open(filename, "w") as f:
sys.stdout = f
print('m | k | n | n parts', end='')
for method in methods:
print(f" | {method.__name__.replace('_', ' ')} (µs)", end='')
print('')
# These are reasonable prefill sizes.
ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )),
(4096, (11008, 11008)), (11008, (4096, )))
# reasonable ranges for m.
for m in [
1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112,
128, 256, 512, 1024, 1536, 2048, 3072, 4096
]:
print(f'{m}', file=sys.__stdout__)
for ksp in ksandpartions:
run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits,
methods)
sys.stdout = sys.__stdout__
def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
methods):
# I didn't see visible improvements from increasing these, but feel free :)
num_warmup_trials = 1
num_trials = 1
num_calls = 100
# warmup.
for method in methods:
for _ in range(num_warmup_trials):
run_timing(
num_calls=num_calls,
m=m,
k=k,
parts=parts,
nbooks=nbooks,
bits=bits,
method=method,
)
n = parts.sum().item()
print(f'{m} | {k} | {n} | {parts.tolist()}', end='')
for method in methods:
best_time_us = 1e20
for _ in range(num_trials):
kernel_dur_ms = run_timing(
num_calls=num_calls,
m=m,
k=k,
parts=parts,
nbooks=nbooks,
bits=bits,
method=method,
)
kernel_dur_us = 1000 * kernel_dur_ms
if kernel_dur_us < best_time_us:
best_time_us = kernel_dur_us
print(f' | {kernel_dur_us:.0f}', end='')
print('')
def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
nbooks: int, bits: int, method) -> float:
n = parts.sum().item()
device = torch.device('cuda:0')
input = torch.randn((1, m, k), dtype=torch.float16, device=device)
code_range = (1 << bits) // 2
ingroups = 8
codes = torch.randint(-code_range,
code_range,
size=(n, k // ingroups, nbooks),
dtype=get_int_dtype(bits),
device=device)
codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
dtype=torch.float16,
device=device)
scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
# for comparison to just a pytorch mult.
weights = torch.randn((n, k), dtype=torch.float16, device=device)
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
if method is torch_mult:
for i in range(num_calls):
torch_mult(input, weights, scales)
else:
for i in range(num_calls):
method(input, codes, codebooks, scales, parts, None)
end_event.record()
end_event.synchronize()
dur_ms = start_event.elapsed_time(end_event) / num_calls
return dur_ms
if __name__ == "__main__":
sys.exit(main())

View File

@ -1,4 +1,3 @@
import argparse
import json
import os
import sys
@ -6,7 +5,6 @@ import sys
import torch
import torch.nn.functional as F
import triton
from tqdm import tqdm
from vllm.model_executor.layers.fused_moe import (fused_moe,
get_config_file_name)
@ -14,16 +12,16 @@ from vllm.model_executor.layers.fused_moe import (fused_moe,
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
def main(dtype: str):
def main():
method = fused_moe
for bs in [
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
2048, 3072, 4096
]:
run_grid(bs, method=method, dtype=dtype)
run_grid(bs, method=method)
def run_grid(bs, method, dtype: str):
def run_grid(bs, method):
d_model = 4096
num_total_experts = 8
top_k = 2
@ -36,29 +34,39 @@ def run_grid(bs, method, dtype: str):
num_trials = 1
configs = []
if bs <= 16:
BLOCK_SIZES_M = [16]
elif bs <= 32:
BLOCK_SIZES_M = [16, 32]
elif bs <= 64:
BLOCK_SIZES_M = [16, 32, 64]
elif bs <= 128:
BLOCK_SIZES_M = [16, 32, 64, 128]
else:
BLOCK_SIZES_M = [16, 32, 64, 128, 256]
for block_size_n in [32, 64, 128, 256]:
for block_size_m in [16, 32, 64, 128, 256]:
for block_size_m in BLOCK_SIZES_M:
for block_size_k in [64, 128, 256]:
for group_size_m in [1, 16, 32, 64]:
for num_warps in [4, 8]:
for num_stages in [2, 3, 4, 5]:
configs.append({
"BLOCK_SIZE_M": block_size_m,
"BLOCK_SIZE_N": block_size_n,
"BLOCK_SIZE_K": block_size_k,
"GROUP_SIZE_M": group_size_m,
"num_warps": num_warps,
"num_stages": num_stages,
})
configs.append({
"BLOCK_SIZE_M": block_size_m,
"BLOCK_SIZE_N": block_size_n,
"BLOCK_SIZE_K": block_size_k,
"GROUP_SIZE_M": group_size_m,
"num_warps": num_warps,
"num_stages": 4,
})
best_config = None
best_time_us = 1e20
print(f'{tp_size=} {bs=}')
for config in tqdm(configs):
for config in configs:
print(f'{tp_size=} {bs=}')
print(f'{config}')
# warmup
print('warming up')
try:
for _ in range(num_warmup_trials):
run_timing(
@ -71,12 +79,12 @@ def run_grid(bs, method, dtype: str):
model_intermediate_size=model_intermediate_size,
method=method,
config=config,
dtype=dtype,
)
except triton.runtime.autotuner.OutOfResources:
continue
# trial
print('benchmarking')
for _ in range(num_trials):
kernel_dur_ms = run_timing(
num_calls=num_calls,
@ -88,7 +96,6 @@ def run_grid(bs, method, dtype: str):
model_intermediate_size=model_intermediate_size,
method=method,
config=config,
dtype=dtype,
)
kernel_dur_us = 1000 * kernel_dur_ms
@ -98,18 +105,16 @@ def run_grid(bs, method, dtype: str):
best_config = config
best_time_us = kernel_dur_us
tqdm.write(
f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
f'{d_model=} {model_intermediate_size=} {num_layers=}')
print(f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
f'{d_model=} {model_intermediate_size=} {num_layers=}')
print("best_time_us", best_time_us)
print("best_config", best_config)
# holds Dict[str, Dict[str, int]]
filename = get_config_file_name(num_total_experts,
model_intermediate_size // tp_size,
"float8" if dtype == "float8" else None)
model_intermediate_size // tp_size)
print(f"writing config to file {filename}")
existing_content = {}
if os.path.exists(filename):
@ -123,48 +128,27 @@ def run_grid(bs, method, dtype: str):
def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
top_k: int, tp_size: int, model_intermediate_size: int, method,
config, dtype: str) -> float:
config) -> float:
shard_intermediate_size = model_intermediate_size // tp_size
hidden_states = torch.rand(
(bs, d_model),
device="cuda:0",
dtype=torch.float16,
dtype=torch.bfloat16,
)
w1 = torch.rand(
ws = torch.rand(
(num_total_experts, 2 * shard_intermediate_size, d_model),
device=hidden_states.device,
dtype=hidden_states.dtype,
)
w2 = torch.rand(
w2s = torch.rand(
(num_total_experts, d_model, shard_intermediate_size),
device=hidden_states.device,
dtype=hidden_states.dtype,
)
w1_scale = None
w2_scale = None
a1_scale = None
a2_scale = None
if dtype == "float8":
w1 = w1.to(torch.float8_e4m3fn)
w2 = w2.to(torch.float8_e4m3fn)
w1_scale = torch.ones(num_total_experts,
device=hidden_states.device,
dtype=torch.float32)
w2_scale = torch.ones(num_total_experts,
device=hidden_states.device,
dtype=torch.float32)
a1_scale = torch.ones(1,
device=hidden_states.device,
dtype=torch.float32)
a2_scale = torch.ones(1,
device=hidden_states.device,
dtype=torch.float32)
gating_output = F.softmax(torch.rand(
(num_calls, bs, num_total_experts),
device=hidden_states.device,
@ -179,18 +163,13 @@ def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
for i in range(num_calls):
hidden_states = method(
hidden_states=hidden_states,
w1=w1,
w2=w2,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
w1=ws,
w2=w2s,
gating_output=gating_output[i],
topk=2,
renormalize=True,
inplace=True,
override_config=config,
use_fp8=dtype == "float8",
)
end_event.record()
end_event.synchronize()
@ -200,16 +179,4 @@ def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog='benchmark_mixtral_moe',
description='Benchmark and tune the fused_moe kernel',
)
parser.add_argument(
'--dtype',
type=str,
default='auto',
choices=['float8', 'float16'],
help='Data type used for fused_moe kernel computations',
)
args = parser.parse_args()
sys.exit(main(args.dtype))
sys.exit(main())

View File

@ -16,7 +16,7 @@ PARTITION_SIZE = 512
def main(
version: str,
num_seqs: int,
seq_len: int,
context_len: int,
num_query_heads: int,
num_kv_heads: int,
head_size: int,
@ -48,12 +48,12 @@ def main(
dtype=torch.float,
device=device)
seq_lens = [seq_len for _ in range(num_seqs)]
max_seq_len = max(seq_lens)
seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device)
context_lens = [context_len for _ in range(num_seqs)]
max_context_len = max(context_lens)
context_lens = torch.tensor(context_lens, dtype=torch.int, device=device)
# Create the block tables.
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
block_tables = []
for _ in range(num_seqs):
block_table = [
@ -77,7 +77,8 @@ def main(
# Prepare for the paged attention kernel.
output = torch.empty_like(query)
if version == "v2":
num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
PARTITION_SIZE)
tmp_output = torch.empty(
size=(num_seqs, num_query_heads, num_partitions, head_size),
dtype=output.dtype,
@ -109,9 +110,9 @@ def main(
num_kv_heads,
scale,
block_tables,
seq_lens,
context_lens,
block_size,
max_seq_len,
max_context_len,
alibi_slopes,
kv_cache_dtype,
kv_scale,
@ -128,9 +129,9 @@ def main(
num_kv_heads,
scale,
block_tables,
seq_lens,
context_lens,
block_size,
max_seq_len,
max_context_len,
alibi_slopes,
kv_cache_dtype,
kv_scale,
@ -165,7 +166,7 @@ if __name__ == '__main__':
choices=["v1", "v2"],
default="v2")
parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument("--seq_len", type=int, default=4096)
parser.add_argument("--context-len", type=int, default=4096)
parser.add_argument("--num-query-heads", type=int, default=64)
parser.add_argument("--num-kv-heads", type=int, default=8)
parser.add_argument("--head-size",
@ -198,7 +199,7 @@ if __name__ == '__main__':
main(
version=args.version,
num_seqs=args.batch_size,
seq_len=args.seq_len,
context_len=args.context_len,
num_query_heads=args.num_query_heads,
num_kv_heads=args.num_kv_heads,
head_size=args.head_size,

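A small sketch of the block-table sizing used in this benchmark (plain Python, no vLLM imports; the block size is only an example value): each sequence needs ceil(context_len / block_size) physical KV cache blocks.

```python
def num_blocks_needed(context_len: int, block_size: int) -> int:
    # Ceiling division: a partially filled last block still occupies a slot.
    return (context_len + block_size - 1) // block_size

# With the benchmark default (--context-len 4096) and a block size of 16:
assert num_blocks_needed(4096, 16) == 256
assert num_blocks_needed(4097, 16) == 257
```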
View File

@ -63,7 +63,6 @@ DEFAULT_CONDA_PATTERNS = {
"magma",
"triton",
"optree",
"nccl",
}
DEFAULT_PIP_PATTERNS = {
@ -74,7 +73,6 @@ DEFAULT_PIP_PATTERNS = {
"triton",
"optree",
"onnx",
"nccl",
}

View File

@ -104,7 +104,7 @@ __device__ void paged_attention_kernel(
const int num_kv_heads, // [num_heads]
const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ seq_lens, // [num_seqs]
const int* __restrict__ context_lens, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
const int q_stride,
@ -115,23 +115,23 @@ __device__ void paged_attention_kernel(
const int partition_idx = blockIdx.z;
const int max_num_partitions = gridDim.z;
constexpr bool USE_PARTITIONING = PARTITION_SIZE > 0;
const int seq_len = seq_lens[seq_idx];
if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= seq_len) {
const int context_len = context_lens[seq_idx];
if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= context_len) {
// No work to do. Terminate the thread block.
return;
}
const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_seq_blocks;
const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_context_blocks;
// [start_block_idx, end_block_idx) is the range of blocks to process.
const int start_block_idx = USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_seq_blocks);
const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_context_blocks);
const int num_blocks = end_block_idx - start_block_idx;
// [start_token_idx, end_token_idx) is the range of tokens to process.
const int start_token_idx = start_block_idx * BLOCK_SIZE;
const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, seq_len);
const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, context_len);
const int num_tokens = end_token_idx - start_token_idx;
constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1);
@ -245,12 +245,12 @@ __device__ void paged_attention_kernel(
// This includes a reduction across the threads in the same thread group.
float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
// Add the ALiBi bias if slopes are given.
qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0;
qk += (alibi_slope != 0) ? alibi_slope * (token_idx - context_len + 1) : 0;
if (thread_group_offset == 0) {
// Store the partial reductions to shared memory.
// NOTE(woosuk): It is required to zero out the masked logits.
const bool mask = token_idx >= seq_len;
const bool mask = token_idx >= context_len;
logits[token_idx - start_token_idx] = mask ? 0.f : qk;
// Update the max value.
qk_max = mask ? qk_max : fmaxf(qk_max, qk);
@ -364,14 +364,14 @@ __device__ void paged_attention_kernel(
} else {
v_vec = *reinterpret_cast<const V_vec*>(v_ptr + offset);
}
if (block_idx == num_seq_blocks - 1) {
if (block_idx == num_context_blocks - 1) {
// NOTE(woosuk): When v_vec contains the tokens that are out of the context,
// we should explicitly zero out the values since they may contain NaNs.
// See https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472
scalar_t* v_vec_ptr = reinterpret_cast<scalar_t*>(&v_vec);
#pragma unroll
for (int j = 0; j < V_VEC_SIZE; j++) {
v_vec_ptr[j] = token_idx + j < seq_len ? v_vec_ptr[j] : zero_value;
v_vec_ptr[j] = token_idx + j < context_len ? v_vec_ptr[j] : zero_value;
}
}
accs[i] += dot(logits_vec, v_vec);
@ -457,7 +457,7 @@ __global__ void paged_attention_v1_kernel(
const int num_kv_heads, // [num_heads]
const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ seq_lens, // [num_seqs]
const int* __restrict__ context_lens, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
const int q_stride,
@ -466,7 +466,7 @@ __global__ void paged_attention_v1_kernel(
const float kv_scale) {
paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_KV_CACHE>(
/* exp_sums */ nullptr, /* max_logits */ nullptr,
out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, seq_lens,
out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens,
max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride, kv_scale);
}
@ -489,7 +489,7 @@ __global__ void paged_attention_v2_kernel(
const int num_kv_heads, // [num_heads]
const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ seq_lens, // [num_seqs]
const int* __restrict__ context_lens, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
const int q_stride,
@ -498,7 +498,7 @@ __global__ void paged_attention_v2_kernel(
const float kv_scale) {
paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_KV_CACHE, PARTITION_SIZE>(
exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes,
block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes,
q_stride, kv_block_stride, kv_head_stride, kv_scale);
}
@ -513,13 +513,13 @@ __global__ void paged_attention_v2_reduce_kernel(
const float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
const int* __restrict__ seq_lens, // [num_seqs]
const int* __restrict__ context_lens, // [num_seqs]
const int max_num_partitions) {
const int num_heads = gridDim.x;
const int head_idx = blockIdx.x;
const int seq_idx = blockIdx.y;
const int seq_len = seq_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE);
const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
if (num_partitions == 1) {
// No need to reduce. Only copy tmp_out to out.
scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
@ -616,7 +616,7 @@ __global__ void paged_attention_v2_reduce_kernel(
num_kv_heads, \
scale, \
block_tables_ptr, \
seq_lens_ptr, \
context_lens_ptr, \
max_num_blocks_per_seq, \
alibi_slopes_ptr, \
q_stride, \
@ -639,8 +639,8 @@ void paged_attention_v1_launcher(
int num_kv_heads,
float scale,
torch::Tensor& block_tables,
torch::Tensor& seq_lens,
int max_seq_len,
torch::Tensor& context_lens,
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes,
float kv_scale) {
int num_seqs = query.size(0);
@ -664,11 +664,11 @@ void paged_attention_v1_launcher(
CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
int* block_tables_ptr = block_tables.data_ptr<int>();
int* seq_lens_ptr = seq_lens.data_ptr<int>();
int* context_lens_ptr = context_lens.data_ptr<int>();
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
int padded_max_seq_len = DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
int logits_size = padded_max_seq_len * sizeof(float);
int padded_max_context_len = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE) * BLOCK_SIZE;
int logits_size = padded_max_context_len * sizeof(float);
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
// Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len
// Keep that in sync with the logic here!
@ -715,8 +715,8 @@ void paged_attention_v1_launcher(
num_kv_heads, \
scale, \
block_tables, \
seq_lens, \
max_seq_len, \
context_lens, \
max_context_len, \
alibi_slopes, \
kv_scale);
@ -746,9 +746,9 @@ void paged_attention_v1(
int num_kv_heads, // [num_heads]
float scale,
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
torch::Tensor& seq_lens, // [num_seqs]
torch::Tensor& context_lens, // [num_seqs]
int block_size,
int max_seq_len,
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype,
float kv_scale) {
@ -790,7 +790,7 @@ void paged_attention_v1(
num_kv_heads, \
scale, \
block_tables_ptr, \
seq_lens_ptr, \
context_lens_ptr, \
max_num_blocks_per_seq, \
alibi_slopes_ptr, \
q_stride, \
@ -803,7 +803,7 @@ void paged_attention_v1(
exp_sums_ptr, \
max_logits_ptr, \
tmp_out_ptr, \
seq_lens_ptr, \
context_lens_ptr, \
max_num_partitions);
template<
@ -824,8 +824,8 @@ void paged_attention_v2_launcher(
int num_kv_heads,
float scale,
torch::Tensor& block_tables,
torch::Tensor& seq_lens,
int max_seq_len,
torch::Tensor& context_lens,
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes,
float kv_scale) {
int num_seqs = query.size(0);
@ -852,10 +852,10 @@ void paged_attention_v2_launcher(
CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
int* block_tables_ptr = block_tables.data_ptr<int>();
int* seq_lens_ptr = seq_lens.data_ptr<int>();
int* context_lens_ptr = context_lens.data_ptr<int>();
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
int max_num_partitions = DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
int logits_size = PARTITION_SIZE * sizeof(float);
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
@ -909,8 +909,8 @@ void paged_attention_v2_launcher(
num_kv_heads, \
scale, \
block_tables, \
seq_lens, \
max_seq_len, \
context_lens, \
max_context_len, \
alibi_slopes, \
kv_scale);
@ -943,9 +943,9 @@ void paged_attention_v2(
int num_kv_heads, // [num_heads]
float scale,
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
torch::Tensor& seq_lens, // [num_seqs]
torch::Tensor& context_lens, // [num_seqs]
int block_size,
int max_seq_len,
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype,
float kv_scale) {

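To make the partitioning arithmetic above easier to follow, here is an illustrative Python transcription of the index math in `paged_attention_kernel` for the v2 path (not the kernel itself; the `PARTITION_SIZE` and `BLOCK_SIZE` values are just examples):

```python
PARTITION_SIZE = 512
BLOCK_SIZE = 16

def divide_round_up(a: int, b: int) -> int:
    return (a + b - 1) // b

def partition_token_range(partition_idx: int, context_len: int):
    """[start_token_idx, end_token_idx) processed by one v2 partition."""
    num_context_blocks = divide_round_up(context_len, BLOCK_SIZE)
    blocks_per_partition = PARTITION_SIZE // BLOCK_SIZE
    start_block = partition_idx * blocks_per_partition
    end_block = min(start_block + blocks_per_partition, num_context_blocks)
    start_token = start_block * BLOCK_SIZE
    end_token = min(start_token + (end_block - start_block) * BLOCK_SIZE, context_len)
    return start_token, end_token

# A 1000-token context is split into ceil(1000 / 512) = 2 partitions.
assert divide_round_up(1000, PARTITION_SIZE) == 2
assert partition_token_range(0, 1000) == (0, 512)
assert partition_token_range(1, 1000) == (512, 1000)
```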
View File

@ -24,14 +24,6 @@ void reshape_and_cache(
const std::string& kv_cache_dtype,
const float kv_scale);
void reshape_and_cache_flash(
torch::Tensor& key,
torch::Tensor& value,
torch::Tensor& key_cache,
torch::Tensor& value_cache,
torch::Tensor& slot_mapping,
const std::string& kv_cache_dtype);
// Just for unittest
void convert_fp8(
torch::Tensor& src_cache,

View File

@ -215,41 +215,6 @@ __global__ void reshape_and_cache_kernel(
}
}
template<typename scalar_t>
__global__ void reshape_and_cache_flash_kernel(
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size]
scalar_t* __restrict__ k_cache, // [num_blocks, block_size, num_heads, head_size]
scalar_t* __restrict__ v_cache, // [num_blocks, block_size, num_heads, head_size]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int block_stride,
const int key_stride,
const int value_stride,
const int num_heads,
const int head_size,
const int block_size) {
const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx];
// NOTE: slot_idx can be -1 if the token is padded
if (slot_idx < 0) {
return;
}
const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size;
const int n = num_heads * head_size;
for (int i = threadIdx.x; i < n; i += blockDim.x) {
const int64_t src_key_idx = token_idx * key_stride + i;
const int64_t src_value_idx = token_idx * value_stride + i;
const int head_idx = i / head_size;
const int head_offset = i % head_size;
const int64_t tgt_value_idx = block_idx * block_stride
+ block_offset * num_heads * head_size
+ head_idx * head_size
+ head_offset;
k_cache[tgt_value_idx] = key[src_key_idx];
v_cache[tgt_value_idx] = value[src_value_idx];
}
}
} // namespace vllm
#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_KV_CACHE) \
@ -310,51 +275,6 @@ void reshape_and_cache(
}
}
void reshape_and_cache_flash(
torch::Tensor& key, // [num_tokens, num_heads, head_size]
torch::Tensor& value, // [num_tokens, num_heads, head_size]
torch::Tensor& k_cache, // [num_blocks, block_size, num_heads, head_size]
torch::Tensor& v_cache, // [num_blocks, block_size, num_heads, head_size]
torch::Tensor& slot_mapping, // [num_tokens]
const std::string& kv_cache_dtype)
{
  // FIXME: only the "auto" kv cache dtype is supported; fp8 is not.
if (kv_cache_dtype != "auto") {
TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
}
int num_tokens = key.size(0);
int num_heads = key.size(1);
int head_size = key.size(2);
int block_size = k_cache.size(1);
int key_stride = key.stride(0);
int value_stride = value.stride(0);
int block_stride = k_cache.stride(0);
TORCH_CHECK(k_cache.stride(0) == v_cache.stride(0));
dim3 grid(num_tokens);
dim3 block(std::min(num_heads * head_size, 512));
const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(
key.scalar_type(),
"reshape_and_cache_flash",
[&] {
vllm::reshape_and_cache_flash_kernel<scalar_t><<<grid, block, 0, stream>>>(
key.data_ptr<scalar_t>(),
value.data_ptr<scalar_t>(),
k_cache.data_ptr<scalar_t>(),
v_cache.data_ptr<scalar_t>(),
slot_mapping.data_ptr<int64_t>(),
block_stride,
key_stride,
value_stride,
num_heads,
head_size,
block_size);
});
}
namespace vllm {
template<typename Tout, typename Tin>

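The removed kernel's indexing can be summarized in a few lines of Python (illustrative only; the slot values are hypothetical): each flat slot index maps to a (block, offset) pair, and padded tokens are marked with -1 and skipped.

```python
def slot_to_block(slot_idx: int, block_size: int):
    """Mirror of the kernel's block_idx / block_offset computation."""
    if slot_idx < 0:
        return None  # padded token: nothing is written to the cache
    return slot_idx // block_size, slot_idx % block_size

assert slot_to_block(-1, 16) is None
assert slot_to_block(35, 16) == (2, 3)  # third block, fourth slot within it
```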
View File

@ -70,11 +70,11 @@ template <typename T>
FORCE_INLINE std::pair<T, T>
reduceSoftmaxAlibi(T *data, const int size, const int capacity,
const float alibi_slope, const int start_index,
const int seq_len) {
data[0] += alibi_slope * (start_index - seq_len + 1);
const int context_len) {
data[0] += alibi_slope * (start_index - context_len + 1);
T max = data[0];
for (int i = 1; i < size; ++i) {
T qk = data[i] + alibi_slope * (start_index + i - seq_len + 1);
T qk = data[i] + alibi_slope * (start_index + i - context_len + 1);
data[i] = qk;
max = max >= qk ? max : qk;
}
@ -225,7 +225,7 @@ struct paged_attention_v1_impl {
const int num_kv_heads, const float scale,
const int
*__restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int *__restrict__ seq_lens, // [num_seqs]
const int *__restrict__ context_lens, // [num_seqs]
const int max_num_blocks_per_seq,
const float *__restrict__ alibi_slopes, // [num_heads]
const int q_stride, const int kv_block_stride, const int kv_head_stride,
@ -235,32 +235,32 @@ struct paged_attention_v1_impl {
static_assert(BLOCK_SIZE == 16);
int max_seq_len = max_num_blocks_per_seq * BLOCK_SIZE;
int max_seq_len_padded = (max_seq_len + 15) & 0xFFFFFFF0;
TORCH_CHECK((max_seq_len_padded * sizeof(float)) % 64 == 0);
int max_context_len = max_num_blocks_per_seq * BLOCK_SIZE;
int max_context_len_padded = (max_context_len + 15) & 0xFFFFFFF0;
TORCH_CHECK((max_context_len_padded * sizeof(float)) % 64 == 0);
const int parallel_work_item_num = omp_get_max_threads();
size_t logits_bytes =
parallel_work_item_num * max_seq_len_padded * sizeof(float);
parallel_work_item_num * max_context_len_padded * sizeof(float);
float *logits = (float *)std::aligned_alloc(
64, logits_bytes); // Cacheline alignment for each context token.
// [parallel_work_item_num, max_seq_len_padded]
// [parallel_work_item_num, max_context_len_padded]
#pragma omp parallel for collapse(2) schedule(dynamic, 1)
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
int seq_len = seq_lens[seq_idx];
int context_len = context_lens[seq_idx];
const int *seq_block_table =
block_tables + max_num_blocks_per_seq * seq_idx;
const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
const int block_num = (context_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
const int64_t kv_head_idx = head_idx / num_queries_per_kv;
const scalar_t *__restrict__ q_vec_ptr =
q + seq_idx * q_stride + head_idx * HEAD_SIZE;
const int last_block_token_num =
seq_len - (block_num - 1) * BLOCK_SIZE;
context_len - (block_num - 1) * BLOCK_SIZE;
float *__restrict__ thread_block_logits =
logits + omp_get_thread_num() * max_seq_len_padded;
logits + omp_get_thread_num() * max_context_len_padded;
// Compute logits
for (int block_idx = 0; block_idx < block_num; ++block_idx) {
@ -278,11 +278,11 @@ struct paged_attention_v1_impl {
// Compute softmax
if (alibi_slopes) {
reduceSoftmaxAlibi(thread_block_logits, seq_len,
reduceSoftmaxAlibi(thread_block_logits, context_len,
block_num * BLOCK_SIZE, alibi_slopes[head_idx], 0,
seq_len);
context_len);
} else {
reduceSoftmax(thread_block_logits, seq_len,
reduceSoftmax(thread_block_logits, context_len,
block_num * BLOCK_SIZE);
}
@ -340,7 +340,7 @@ struct paged_attention_v1_impl {
#define LAUNCH_V1_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \
paged_attention_v1_impl<T, HEAD_SIZE, BLOCK_SIZE>::call( \
out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \
block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq, \
alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, num_seqs, \
num_heads);
@ -348,8 +348,8 @@ template <typename T, int BLOCK_SIZE>
void paged_attention_v1_impl_launcher(
torch::Tensor &out, torch::Tensor &query, torch::Tensor &key_cache,
torch::Tensor &value_cache, int num_kv_heads, float scale,
torch::Tensor &block_tables, torch::Tensor &seq_lens,
int max_seq_len, const c10::optional<torch::Tensor> &alibi_slopes) {
torch::Tensor &block_tables, torch::Tensor &context_lens,
int max_context_len, const c10::optional<torch::Tensor> &alibi_slopes) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@ -369,7 +369,7 @@ void paged_attention_v1_impl_launcher(
T *key_cache_ptr = reinterpret_cast<T *>(key_cache.data_ptr());
T *value_cache_ptr = reinterpret_cast<T *>(value_cache.data_ptr());
int *block_tables_ptr = block_tables.data_ptr<int>();
int *seq_lens_ptr = seq_lens.data_ptr<int>();
int *context_lens_ptr = context_lens.data_ptr<int>();
switch (head_size) {
case 64:
@ -399,7 +399,7 @@ void paged_attention_v1_impl_launcher(
#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE) \
paged_attention_v1_impl_launcher<T, BLOCK_SIZE>( \
out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
seq_lens, max_seq_len, alibi_slopes);
context_lens, max_context_len, alibi_slopes);
#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T) \
switch (block_size) { \
@ -416,8 +416,8 @@ void paged_attention_v1(torch::Tensor &out, torch::Tensor &query,
torch::Tensor &key_cache, torch::Tensor &value_cache,
int num_kv_heads, float scale,
torch::Tensor &block_tables,
torch::Tensor &seq_lens, int block_size,
int max_seq_len,
torch::Tensor &context_lens, int block_size,
int max_context_len,
const c10::optional<torch::Tensor> &alibi_slopes,
const std::string &kv_cache_dtype, float kv_scale) {
TORCH_CHECK(kv_scale == 1.0f);
@ -448,7 +448,7 @@ struct paged_attention_v2_impl {
const int num_kv_heads, const float scale,
const int
*__restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int *__restrict__ seq_lens, // [num_seqs]
const int *__restrict__ context_lens, // [num_seqs]
const int max_num_blocks_per_seq,
const float *__restrict__ alibi_slopes, // [num_heads]
const int q_stride, const int kv_block_stride, const int kv_head_stride,
@ -465,22 +465,22 @@ struct paged_attention_v2_impl {
for (int partition_idx = 0; partition_idx < max_num_partitions;
++partition_idx) {
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
const int seq_len = seq_lens[seq_idx];
const int context_len = context_lens[seq_idx];
const int start_token_idx = partition_idx * PARTITION_SIZE;
if (start_token_idx >= seq_len)
if (start_token_idx >= context_len)
continue;
const int partition_num =
(seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
(context_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
const bool no_reduce = (partition_num == 1);
const int token_num =
(std::min(seq_len, start_token_idx + PARTITION_SIZE) -
const int context_token_num =
(std::min(context_len, start_token_idx + PARTITION_SIZE) -
start_token_idx);
const int block_num =
(token_num + BLOCK_SIZE - 1) / BLOCK_SIZE;
(context_token_num + BLOCK_SIZE - 1) / BLOCK_SIZE;
const int last_block_token_num =
token_num - (block_num - 1) * BLOCK_SIZE;
context_token_num - (block_num - 1) * BLOCK_SIZE;
const int *seq_block_table = block_tables +
max_num_blocks_per_seq * seq_idx +
start_token_idx / BLOCK_SIZE;
@ -507,10 +507,10 @@ struct paged_attention_v2_impl {
std::pair<float, float> max_and_sum;
if (alibi_slopes) {
max_and_sum = reduceSoftmaxAlibi(
logits, token_num, block_num * BLOCK_SIZE,
alibi_slopes[head_idx], start_token_idx, seq_len);
logits, context_token_num, block_num * BLOCK_SIZE,
alibi_slopes[head_idx], start_token_idx, context_len);
} else {
max_and_sum = reduceSoftmax(logits, token_num,
max_and_sum = reduceSoftmax(logits, context_token_num,
block_num * BLOCK_SIZE);
}
@ -583,9 +583,9 @@ struct paged_attention_v2_impl {
#pragma omp parallel for collapse(2) schedule(static, 1)
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
const int seq_len = seq_lens[seq_idx];
const int context_len = context_lens[seq_idx];
const int partition_num =
(seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
(context_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
if (partition_num == 1)
continue;
@ -612,9 +612,9 @@ struct paged_attention_v2_impl {
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
for (int group_idx = 0; group_idx < head_group_num; ++group_idx) {
const int seq_len = seq_lens[seq_idx];
const int context_len = context_lens[seq_idx];
const int partition_num =
(seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
(context_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
if (partition_num == 1)
continue;
@ -649,7 +649,7 @@ struct paged_attention_v2_impl {
paged_attention_v2_impl<T, HEAD_SIZE, BLOCK_SIZE, PARTITION_SIZE>::call( \
out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, \
key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
context_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
kv_block_stride, kv_head_stride, num_seqs, num_heads, \
max_num_partitions);
@ -658,8 +658,8 @@ void paged_attention_v2_impl_launcher(
torch::Tensor &out, torch::Tensor &exp_sums, torch::Tensor &max_logits,
torch::Tensor &tmp_out, torch::Tensor &query, torch::Tensor &key_cache,
torch::Tensor &value_cache, int num_kv_heads, float scale,
torch::Tensor &block_tables, torch::Tensor &seq_lens, int block_size,
int max_seq_len, const c10::optional<torch::Tensor> &alibi_slopes) {
torch::Tensor &block_tables, torch::Tensor &context_lens, int block_size,
int max_context_len, const c10::optional<torch::Tensor> &alibi_slopes) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@ -683,7 +683,7 @@ void paged_attention_v2_impl_launcher(
T *key_cache_ptr = reinterpret_cast<T *>(key_cache.data_ptr());
T *value_cache_ptr = reinterpret_cast<T *>(value_cache.data_ptr());
int *block_tables_ptr = block_tables.data_ptr<int>();
int *seq_lens_ptr = seq_lens.data_ptr<int>();
int *context_lens_ptr = context_lens.data_ptr<int>();
switch (head_size) {
case 64:
@ -713,8 +713,8 @@ void paged_attention_v2_impl_launcher(
#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE) \
paged_attention_v2_impl_launcher<T, BLOCK_SIZE>( \
out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
num_kv_heads, scale, block_tables, seq_lens, block_size, \
max_seq_len, alibi_slopes);
num_kv_heads, scale, block_tables, context_lens, block_size, \
max_context_len, alibi_slopes);
#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T) \
switch (block_size) { \
@ -732,8 +732,8 @@ void paged_attention_v2(torch::Tensor &out, torch::Tensor &exp_sums,
torch::Tensor &query, torch::Tensor &key_cache,
torch::Tensor &value_cache, int num_kv_heads,
float scale, torch::Tensor &block_tables,
torch::Tensor &seq_lens, int block_size,
int max_seq_len,
torch::Tensor &context_lens, int block_size,
int max_context_len,
const c10::optional<torch::Tensor> &alibi_slopes,
const std::string &kv_cache_dtype, float kv_scale) {
TORCH_CHECK(kv_scale == 1.0f);

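As a reference for `reduceSoftmaxAlibi` above (a rough Python sketch under the same bias convention, not the vectorized CPU implementation): the logit for position start_index + i gets an additive bias of slope * (position - context_len + 1) before a max-subtracted softmax, and the max/sum pair is returned alongside the probabilities.

```python
import math

def softmax_with_alibi(logits, alibi_slope, start_index, context_len):
    biased = [x + alibi_slope * (start_index + i - context_len + 1)
              for i, x in enumerate(logits)]
    m = max(biased)                       # subtract the max for numerical stability
    exps = [math.exp(x - m) for x in biased]
    s = sum(exps)
    return [e / s for e in exps], (m, s)  # probabilities plus (max, sum), as in the C++ pair
```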
View File

@ -10,9 +10,9 @@ void paged_attention_v1(
int num_kv_heads,
float scale,
torch::Tensor& block_tables,
torch::Tensor& seq_lens,
torch::Tensor& context_lens,
int block_size,
int max_seq_len,
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype,
float kv_scale);
@ -28,9 +28,9 @@ void paged_attention_v2(
int num_kv_heads,
float scale,
torch::Tensor& block_tables,
torch::Tensor& seq_lens,
torch::Tensor& context_lens,
int block_size,
int max_seq_len,
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype,
float kv_scale);
@ -86,21 +86,6 @@ void gelu_fast(
torch::Tensor& input);
#ifndef USE_ROCM
torch::Tensor aqlm_gemm(
const torch::Tensor& input,
const torch::Tensor& codes,
const torch::Tensor& codebooks,
const torch::Tensor& scales,
const torch::Tensor& codebook_partition_sizes,
const std::optional<torch::Tensor>& bias
);
torch::Tensor aqlm_dequant(
const torch::Tensor& codes,
const torch::Tensor& codebooks,
const torch::Tensor& codebook_partition_sizes
);
torch::Tensor awq_gemm(
torch::Tensor _in_feats,
torch::Tensor _kernel,
@ -124,26 +109,6 @@ torch::Tensor marlin_gemm(
int64_t size_m,
int64_t size_n,
int64_t size_k);
torch::Tensor gptq_marlin_gemm(
torch::Tensor &a,
torch::Tensor &b_q_weight,
torch::Tensor &b_scales,
torch::Tensor &g_idx,
torch::Tensor &perm,
torch::Tensor &workspace,
int64_t num_bits,
int64_t size_m,
int64_t size_n,
int64_t size_k,
bool is_k_full);
torch::Tensor gptq_marlin_repack(
torch::Tensor &b_q_weight,
torch::Tensor &perm,
int64_t size_k,
int64_t size_n,
int64_t num_bits);
#endif
void squeezellm_gemm(
@ -166,16 +131,6 @@ void gptq_shuffle(
torch::Tensor q_perm,
int bit);
void static_scaled_fp8_quant(
torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& scale);
void dynamic_scaled_fp8_quant(
torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& scale);
void moe_align_block_size(
torch::Tensor topk_ids,
int num_experts,

View File

@ -2,4 +2,3 @@
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16)
FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16)

View File

@ -0,0 +1,4 @@
#include "bgmv_config.h"
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_half)

View File

@ -0,0 +1,4 @@
#include "bgmv_config.h"
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_bfloat16)

View File

@ -0,0 +1,4 @@
#include "bgmv_config.h"
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_half)

View File

@ -2,4 +2,3 @@
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_bfloat16)
FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_bfloat16, float, nv_bfloat16)

View File

@ -0,0 +1,4 @@
#include "bgmv_config.h"
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_half)

View File

@ -60,7 +60,6 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
f(in_T, out_T, W_T, narrow, 32768) \
f(in_T, out_T, W_T, narrow, 33024) \
f(in_T, out_T, W_T, narrow, 36864) \
f(in_T, out_T, W_T, narrow, 43264) \
f(in_T, out_T, W_T, narrow, 49152) \
f(in_T, out_T, W_T, narrow, 64000) \
f(in_T, out_T, W_T, narrow, 64256) \
@ -74,74 +73,6 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
// Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA
// and vllm/tests/lora/test_punica.py
// Used for defining kernels going from the variety of
// dim in to the narrow dim out
// Using it for the fully sharded column
// parallel LoRA A which splits the rank dim
#define FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, narrow) \
f(in_T, out_T, W_T, 128, narrow) \
f(in_T, out_T, W_T, 256, narrow) \
f(in_T, out_T, W_T, 512, narrow) \
f(in_T, out_T, W_T, 640, narrow) \
f(in_T, out_T, W_T, 768, narrow) \
f(in_T, out_T, W_T, 1024, narrow) \
f(in_T, out_T, W_T, 1152, narrow) \
f(in_T, out_T, W_T, 1280, narrow) \
f(in_T, out_T, W_T, 1536, narrow) \
f(in_T, out_T, W_T, 1728, narrow) \
f(in_T, out_T, W_T, 1792, narrow) \
f(in_T, out_T, W_T, 2048, narrow) \
f(in_T, out_T, W_T, 2304, narrow) \
f(in_T, out_T, W_T, 2560, narrow) \
f(in_T, out_T, W_T, 2752, narrow) \
f(in_T, out_T, W_T, 2816, narrow) \
f(in_T, out_T, W_T, 3072, narrow) \
f(in_T, out_T, W_T, 3456, narrow) \
f(in_T, out_T, W_T, 3584, narrow) \
f(in_T, out_T, W_T, 4096, narrow) \
f(in_T, out_T, W_T, 4608, narrow) \
f(in_T, out_T, W_T, 5120, narrow) \
f(in_T, out_T, W_T, 5504, narrow) \
f(in_T, out_T, W_T, 5632, narrow) \
f(in_T, out_T, W_T, 6144, narrow) \
f(in_T, out_T, W_T, 6848, narrow) \
f(in_T, out_T, W_T, 6912, narrow) \
f(in_T, out_T, W_T, 7168, narrow) \
f(in_T, out_T, W_T, 8192, narrow) \
f(in_T, out_T, W_T, 9216, narrow) \
f(in_T, out_T, W_T, 10240, narrow) \
f(in_T, out_T, W_T, 11008, narrow) \
f(in_T, out_T, W_T, 12288, narrow) \
f(in_T, out_T, W_T, 13696, narrow) \
f(in_T, out_T, W_T, 13824, narrow) \
f(in_T, out_T, W_T, 14336, narrow) \
f(in_T, out_T, W_T, 15360, narrow) \
f(in_T, out_T, W_T, 16384, narrow) \
f(in_T, out_T, W_T, 20480, narrow) \
f(in_T, out_T, W_T, 22016, narrow) \
f(in_T, out_T, W_T, 24576, narrow) \
f(in_T, out_T, W_T, 27392, narrow) \
f(in_T, out_T, W_T, 28672, narrow) \
f(in_T, out_T, W_T, 32000, narrow) \
f(in_T, out_T, W_T, 32256, narrow) \
f(in_T, out_T, W_T, 32512, narrow) \
f(in_T, out_T, W_T, 32768, narrow) \
f(in_T, out_T, W_T, 33024, narrow) \
f(in_T, out_T, W_T, 36864, narrow) \
f(in_T, out_T, W_T, 43264, narrow) \
f(in_T, out_T, W_T, 49152, narrow) \
f(in_T, out_T, W_T, 64000, narrow) \
f(in_T, out_T, W_T, 64256, narrow) \
f(in_T, out_T, W_T, 64512, narrow) \
f(in_T, out_T, W_T, 102400, narrow) \
f(in_T, out_T, W_T, 102656, narrow) \
f(in_T, out_T, W_T, 102912, narrow) \
f(in_T, out_T, W_T, 128000, narrow) \
f(in_T, out_T, W_T, 128256, narrow) \
f(in_T, out_T, W_T, 128512, narrow) \
// Keep above in sync with vllm/lora/layers::SamplerWithLoRA
// Keep this in sync with vllm/config::LoRAConfig
#define FOR_BGMV_WIDE_NARROW(f, in_T, out_T, W_T) \
FOR_BGMV_WIDE(f, in_T, out_T, W_T, 8) \
@ -149,14 +80,4 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
FOR_BGMV_WIDE(f, in_T, out_T, W_T, 32) \
FOR_BGMV_WIDE(f, in_T, out_T, W_T, 64)
#define FOR_INST_BGMV_WIDE_NARROW(f, in_T, out_T, W_T) \
FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 1) \
FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 2) \
FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 4) \
f(in_T, out_T, W_T, 8, 64) \
f(in_T, out_T, W_T, 16, 64) \
f(in_T, out_T, W_T, 32, 64) \
f(in_T, out_T, W_T, 64, 64)
// clang-format on

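To illustrate what the remaining FOR_BGMV_WIDE_NARROW / INST_BGMV_TWOSIDE expansion covers after this change, here is a hypothetical enumeration in Python (the dim lists are an abbreviated, illustrative subset, not the full set in bgmv_config.h):

```python
NARROW_DIMS = [8, 16, 32, 64]           # LoRA ranks, kept in sync with vllm/config::LoRAConfig
WIDE_DIMS = [1024, 4096, 11008, 32000]  # abbreviated subset of FOR_BGMV_WIDE

def twoside_instantiations(narrows, wides):
    """Each (narrow, wide) pair instantiates both directions, mirroring INST_BGMV_TWOSIDE:
    expand (narrow -> wide, LoRA B-like) and shrink (wide -> narrow, LoRA A-like)."""
    pairs = []
    for narrow in narrows:
        for wide in wides:
            pairs.append((narrow, wide))
            pairs.append((wide, narrow))
    return pairs

print(len(twoside_instantiations(NARROW_DIMS, WIDE_DIMS)))  # 32 kernel variants for this subset
```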
View File

@ -0,0 +1,4 @@
#include "bgmv_config.h"
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_bfloat16)

View File

@ -0,0 +1,4 @@
#include "bgmv_config.h"
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_half)

View File

@ -0,0 +1,4 @@
#include "bgmv_config.h"
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_bfloat16)

View File

@ -2,4 +2,3 @@
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_half)
FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, nv_half, nv_half)

View File

@ -0,0 +1,4 @@
#include "bgmv_config.h"
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_bfloat16)

View File

@ -2,4 +2,3 @@
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_half)
FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, float, nv_half)

View File

@ -2,4 +2,3 @@
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_bfloat16)
FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_bfloat16, nv_bfloat16)

View File

@ -0,0 +1,4 @@
#include "bgmv_config.h"
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_half)

View File

@ -0,0 +1,4 @@
#include "bgmv_config.h"
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_bfloat16)

View File

@ -2,4 +2,3 @@
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_half)
FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_half, nv_half)

View File

@ -0,0 +1,4 @@
#include "bgmv_config.h"
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_bfloat16)

View File

@ -0,0 +1,4 @@
#include "bgmv_config.h"
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_half)

View File

@ -199,7 +199,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
constexpr int tz = 4;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
if constexpr (feat_in <= feat_out) {
if constexpr (feat_in < feat_out) {
static_assert(feat_in % vec_size == 0);
constexpr int tx = feat_in / vec_size;
@ -289,9 +289,6 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
int64_t y_offset, int64_t full_y_size, int64_t batch_size, \
int64_t num_layers, int64_t layer_idx, float scale);
#define INST_BGMV_ONESIDE(in_T, out_T, W_T, feat_in, feat_out) \
INST_BGMV(feat_in, feat_out, in_T, out_T, W_T)
#define INST_BGMV_TWOSIDE(in_T, out_T, W_T, narrow, wide) \
INST_BGMV(narrow, wide, in_T, out_T, W_T) \
INST_BGMV(wide, narrow, in_T, out_T, W_T)

View File

@ -10,7 +10,6 @@ TEMPLATE = """
#include "bgmv_impl.cuh"
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype})
FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, {input_dtype}, {output_dtype}, {weight_dtype})
""".lstrip() # noqa: E501
for input_dtype in DTYPES:
@ -19,26 +18,6 @@ for input_dtype in DTYPES:
if weight_dtype == "fp32":
# FP32 weights are not supported.
continue
if output_dtype == "fp32":
# LoRA A matrix.
if input_dtype != weight_dtype:
# NOTE(woosuk): While Punica supports the case where the
# input and weight dtypes are different, we only generate
# the kernels the same dtypes to reduce the binary size.
continue
elif input_dtype == "fp32":
# LoRA B matrix.
if output_dtype != weight_dtype:
# NOTE(woosuk): While Punica supports the case where the
# output and weight dtypes are different, we only generate
# the kernels the same dtypes to reduce the binary size.
continue
elif not (input_dtype == output_dtype == weight_dtype):
# NOTE(woosuk): While Punica supports mixed data types for
# input, output, and weight, we only generate the kernels with
# the same data types to reduce the binary size.
continue
kernel_definition = TEMPLATE.format(
input_dtype=DTYPE_MAP[input_dtype],
output_dtype=DTYPE_MAP[output_dtype],

View File

@ -50,23 +50,6 @@ inline bool launch_bgmv_kernel(out_T *Y, const in_T *X, const W_T *W,
int64_t y_offset, int64_t full_y_size,
int64_t batch_size, int64_t num_layers,
int64_t layer_idx, float scale) {
// NOTE(woosuk): While Punica supports various combinations of input/output
// data types, we limit the supported data types to reduce the binary size.
constexpr bool is_input_float = std::is_same<in_T, float>::value;
constexpr bool is_output_float = std::is_same<out_T, float>::value;
if (is_input_float) {
if (!std::is_same<out_T, W_T>::value) {
return false;
}
} else if (is_output_float) {
if (!std::is_same<in_T, W_T>::value) {
return false;
}
} else if (!(std::is_same<in_T, W_T>::value &&
std::is_same<out_T, W_T>::value)) {
return false;
}
switch (pack_u32(in_features, out_features)) {
#define CASE_ONESIDE(_in_T, _out_T, _W_T, feat_in, feat_out) \
case pack_u32(feat_in, feat_out): \
@ -79,12 +62,12 @@ inline bool launch_bgmv_kernel(out_T *Y, const in_T *X, const W_T *W,
CASE_ONESIDE(in_T, out_T, W_T, wide, narrow)
FOR_BGMV_WIDE_NARROW(CASE, _, _, _)
FOR_INST_BGMV_WIDE_NARROW(CASE_ONESIDE, _, _, _)
#undef CASE
#undef CASE_ONESIDE
default:
return false;
}
return true;
}

View File

@ -63,20 +63,14 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// Quantization ops
#ifndef USE_ROCM
ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM");
ops.def("aqlm_dequant", &aqlm_dequant, "Decompression method for AQLM");
ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
ops.def("marlin_gemm", &marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ");
ops.def("gptq_marlin_gemm", &gptq_marlin_gemm, "gptq_marlin Optimized Quantized GEMM for GPTQ");
ops.def("gptq_marlin_repack", &gptq_marlin_repack, "gptq_marlin repack from GPTQ");
ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ");
#endif
ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
ops.def("static_scaled_fp8_quant", &static_scaled_fp8_quant, "Compute FP8 quantized tensor for given scaling factor");
ops.def("dynamic_scaled_fp8_quant", &dynamic_scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor");
ops.def(
"moe_align_block_size",
&moe_align_block_size,
@ -96,10 +90,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
"reshape_and_cache",
&reshape_and_cache,
"Reshape the key and value tensors and cache them");
cache_ops.def(
"reshape_and_cache_flash",
&reshape_and_cache_flash,
"Reshape the key and value tensors and cache them");
cache_ops.def(
"convert_fp8",
&convert_fp8,

View File

@ -1,712 +0,0 @@
/*
* Modified by Neural Magic
* Adapted from https://github.com/Vahe1994/AQLM
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <torch/extension.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/cuda/CUDAGuard.h>
#include <iostream>
#include <cstdlib>
namespace vllm {
namespace aqlm {
__global__ void Code1x16MatVec(
const int4* __restrict__ A,
const int4* __restrict__ B,
int4* __restrict__ C,
const int4* __restrict__ codebook,
const int prob_m,
const int prob_k,
const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long.
const int codebook_stride // as int4.
) {
int a_gl_stride = prob_k / 8 / 8;
int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
bool pred = a_gl_rd < prob_m;
if (pred)
{
// advance to the correct codebook; this is easy because we only multiply one column of the codebook.
auto codebook_size = &codebook_a_sizes.x;
while (a_gl_rd >= *codebook_size)
{
codebook += codebook_stride;
++codebook_size;
}
}
int b_gl_rd = 0;
int c_gl_wr = a_gl_rd;
a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
__shared__ int4 sh_b[32 * 9];
float res = 0;
int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32);
while (iters--) {
// We pad shared memory to avoid bank conflicts during reads
__syncthreads();
for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) {
if (b_gl_rd + i < prob_k / 8)
sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i];
}
__syncthreads();
b_gl_rd += 32 * 8;
int b_sh_rd = 9 * (threadIdx.x % 32);
if (pred && a_gl_rd < a_gl_end) {
const uint16_t* enc = reinterpret_cast<const uint16_t*>(&A[a_gl_rd]);
#pragma unroll
for (int i = 0; i < 8; i++) {
uint32_t dec[4];
// We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't
// actually help us; this brings > 2x speedup.
asm volatile (
"ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
: "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3])
: "l"((void*) &codebook[enc[i]])
);
half2* a = reinterpret_cast<half2*>(&dec);
half2* b = reinterpret_cast<half2*>(&sh_b[b_sh_rd]);
half2 res2 = {};
#pragma unroll
for (int j = 0; j < 4; j++)
res2 = __hfma2(a[j], b[j], res2);
res += __half2float(res2.x) + __half2float(res2.y);
b_sh_rd++;
}
a_gl_rd += 32;
}
}
if (pred) {
#pragma unroll
for (int i = 16; i > 0; i /= 2)
res += __shfl_down_sync(0xffffffff, res, i);
if (threadIdx.x % 32 == 0)
reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res);
}
}
__global__ void Code2x8MatVec(
const int4* __restrict__ A,
const int4* __restrict__ B,
int4* __restrict__ C,
const int4* __restrict__ codebook,
int prob_m,
int prob_k,
const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long.
const int codebook_stride // as int4.
) {
int a_gl_stride = prob_k / 8 / 8;
int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
bool pred = a_gl_rd < prob_m;
if (pred)
{
// advance to the correct codebook; this is easy because we only multiply one column of the codebook.
auto codebook_size = &codebook_a_sizes.x;
while (a_gl_rd >= *codebook_size)
{
codebook += codebook_stride;
++codebook_size;
}
}
int b_gl_rd = 0;
int c_gl_wr = a_gl_rd;
a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
int lane = threadIdx.x % 8;
extern __shared__ int4 sh[];
int4* sh_b = sh;
int4* sh_code = sh_b + 32 * 9;
int4* sh_code0 = sh_code;
int4* sh_code1 = sh_code + 256 * 8;
for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) {
int4 dec = codebook[i];
#pragma unroll
for (int j = 0; j < 8; j++)
sh_code[8 * i + (j + lane) % 8] = dec;
}
__syncthreads();
float res = 0;
int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32);
while (iters--) {
// We pad shared memory to avoid bank conflicts during reads
__syncthreads();
for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) {
if (b_gl_rd + i < prob_k / 8)
sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i];
}
__syncthreads();
b_gl_rd += 32 * 8;
int b_sh_rd = 9 * (threadIdx.x % 32);
if (pred && a_gl_rd < a_gl_end) {
const uint8_t* enc = reinterpret_cast<const uint8_t*>(&A[a_gl_rd]);
#pragma unroll
for (int i = 0; i < 8; i++) {
half2* a0 = reinterpret_cast<half2*>(&sh_code0[8 * enc[2 * i + 0] + lane]);
half2* a1 = reinterpret_cast<half2*>(&sh_code1[8 * enc[2 * i + 1] + lane]);
half2* b = reinterpret_cast<half2*>(&sh_b[b_sh_rd]);
half2 res2 = {};
#pragma unroll
for (int j = 0; j < 4; j++)
res2 = __hfma2(__hadd2(a0[j], a1[j]), b[j], res2);
res += __half2float(res2.x) + __half2float(res2.y);
b_sh_rd++;
}
a_gl_rd += 32;
}
}
if (pred) {
#pragma unroll
for (int i = 16; i > 0; i /= 2)
res += __shfl_down_sync(0xffffffff, res, i);
if (threadIdx.x % 32 == 0)
reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res);
}
}
__global__ void Code1x16Dequant(
const int4* __restrict__ A,
int4* __restrict__ C,
const int4* __restrict__ codebook,
int prob_m,
int prob_k,
const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m.
const int codebook_stride // as int4
) {
int a_gl_stride = prob_k / 8 / 8;
int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
bool pred = a_gl_rd < prob_m;
if (pred)
{
// advance to the correct codebook; this is easy because we only multiply one column of the codebook.
auto codebook_size = &codebook_a_sizes.x;
while (a_gl_rd >= *codebook_size)
{
codebook += codebook_stride;
++codebook_size;
}
}
a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
int c_gl_stride = prob_k / 8;
int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8;
int iters = (prob_k / 8 - 1) / (8 * 32) + 1;
while (iters--) {
if (pred && a_gl_rd < a_gl_end) {
const uint16_t* enc = reinterpret_cast<const uint16_t*>(&A[a_gl_rd]);
#pragma unroll
for (int i = 0; i < 8; i++) {
int4 chunk;
auto dec = reinterpret_cast<uint32_t*>(&chunk);
// We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't
// actually help us; this brings > 2x speedup.
asm volatile (
"ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
: "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3])
: "l"((void*) &codebook[enc[i]])
);
C[a_gl_rd * 8 + i] = chunk;
}
}
a_gl_rd += 32;
}
}
__global__ void Code2x8Dequant(
const int4* __restrict__ A,
int4* __restrict__ C,
const int4* __restrict__ codebook,
int prob_m,
int prob_k,
const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols.
const int codebook_stride // as int4
) {
int a_gl_stride = prob_k / 8 / 8;
int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
bool pred = a_gl_rd < prob_m;
if (pred)
{
// advance to the correct codebook; this is easy because we only multiply one column of the codebook.
auto codebook_size = &codebook_a_sizes.x;
while (a_gl_rd >= *codebook_size)
{
codebook += codebook_stride;
++codebook_size;
}
}
a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
int lane = threadIdx.x % 8;
int c_gl_stride = prob_k / 8;
int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8;
extern __shared__ int4 sh[];
int4* sh_code = sh;
int4* sh_code0 = sh_code;
int4* sh_code1 = sh_code + 256 * 8;
for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) {
int4 dec = codebook[i];
#pragma unroll
for (int j = 0; j < 8; j++)
sh_code[8 * i + (j + lane) % 8] = dec;
}
__syncthreads();
float res = 0;
int iters = (prob_k / 8 - 1) / (8 * 32) + 1;
while (iters--) {
if (pred && a_gl_rd < a_gl_end) {
const uint8_t* enc = reinterpret_cast<const uint8_t*>(&A[a_gl_rd]);
#pragma unroll
for (int i = 0; i < 8; i++) {
int4 chunk;
half2* a0 = reinterpret_cast<half2*>(&sh_code0[8 * enc[2 * i + 0] + lane]);
half2* a1 = reinterpret_cast<half2*>(&sh_code1[8 * enc[2 * i + 1] + lane]);
#pragma unroll
for (int j = 0; j < 4; j++)
reinterpret_cast<half2*>(&chunk)[j] = __hadd2(a0[j], a1[j]);
C[a_gl_rd * 8 + i] = chunk;
}
}
a_gl_rd += 32;
}
}
inline int ceildiv(int a, int b) {
return (a + b - 1) / b;
}
const int THREAD_M = 16;
void code1x16_matvec_cuda(
const void* __restrict__ A,
const void* __restrict__ B,
void* __restrict__ C,
const void* __restrict__ codebook,
int prob_m,
int prob_k,
const int4 codebook_a_sizes,
const int codebook_stride
) {
int sms;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
int waves = 0;
int thread_m;
do {
waves++;
thread_m = ceildiv(prob_m, waves * sms);
} while (thread_m > THREAD_M);
int blocks = ceildiv(prob_m, thread_m);
int threads = 32 * thread_m;
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
Code1x16MatVec<<<blocks, threads, 16*32*9, stream>>>(
(const int4*) A,
(const int4*) B,
(int4*) C,
(const int4*) codebook,
prob_m,
prob_k,
codebook_a_sizes,
codebook_stride
);
}
void code2x8_matvec_cuda(
const void* __restrict__ A,
const void* __restrict__ B,
void* __restrict__ C,
const void* __restrict__ codebook,
int prob_m,
int prob_k,
const int4 codebook_a_sizes,
const int codebook_stride
) {
int sms;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
int waves = 0;
int thread_m;
do {
waves++;
thread_m = ceildiv(prob_m, waves * sms);
} while (thread_m > THREAD_M);
int blocks = ceildiv(prob_m, thread_m);
int threads = 32 * thread_m;
int shared = 16 * (2 * 256 * 8 + 32 * 9);
cudaFuncSetAttribute(
Code2x8MatVec, cudaFuncAttributeMaxDynamicSharedMemorySize, shared
);
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
Code2x8MatVec<<<blocks, threads, shared, stream>>>(
(const int4*) A,
(const int4*) B,
(int4*) C,
(const int4*) codebook,
prob_m,
prob_k,
codebook_a_sizes,
codebook_stride
);
}
void code1x16_dequant_cuda(
const void* __restrict__ A,
void* __restrict__ C,
const void* __restrict__ codebook,
int prob_m,
int prob_k,
const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long.
const int codebook_stride // as int4.
) {
int sms;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
int waves = 0;
int thread_m;
do {
waves++;
thread_m = ceildiv(prob_m, waves * sms);
} while (thread_m > THREAD_M);
int blocks = ceildiv(prob_m, thread_m);
int threads = 32 * thread_m;
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
Code1x16Dequant<<<blocks, threads, 0, stream>>>(
(const int4*) A,
(int4*) C,
(const int4*) codebook,
prob_m,
prob_k,
codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long.
codebook_stride // as int4.
);
}
// Dequantizes the code and codebook into weights.
void code2x8_dequant_cuda(
const void* __restrict__ A,
void* __restrict__ C,
const void* __restrict__ codebook,
int prob_m,
int prob_k,
const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols.
const int codebook_stride // as int4
) {
int sms;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
int waves = 0;
int thread_m;
do {
waves++;
thread_m = ceildiv(prob_m, waves * sms);
} while (thread_m > THREAD_M);
int blocks = ceildiv(prob_m, thread_m);
int threads = 32 * thread_m;
int shared = 16 * (2 * 256 * 8 + 32 * 9);
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
cudaFuncSetAttribute(
Code2x8Dequant, cudaFuncAttributeMaxDynamicSharedMemorySize, shared
);
Code2x8Dequant<<<blocks, threads, shared, stream>>>(
(const int4*) A,
(int4*) C,
(const int4*) codebook,
prob_m,
prob_k,
codebook_a_sizes,
codebook_stride
);
}
int codebook_stride(const torch::Tensor& codebooks)
{
return codebooks.stride(0) * codebooks.element_size() / sizeof(int4);
}
void code1x16_matvec(
const torch::Tensor& A,
const torch::Tensor& B,
torch::Tensor& C,
const torch::Tensor& codebook,
const int4 codebook_a_sizes // cumulative sizes of A spanning each codebook, at most 3 long.
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
int prob_m = C.size(0);
int prob_k = B.size(0);
code1x16_matvec_cuda(
A.data_ptr(),
B.data_ptr(),
C.data_ptr(),
codebook.data_ptr(),
prob_m,
prob_k,
codebook_a_sizes,
codebook_stride(codebook)
);
}
torch::Tensor code1x16_matmat(
const torch::Tensor& input,
const torch::Tensor& codes,
const torch::Tensor& codebooks,
const torch::Tensor& scales,
const int4 codebook_a_sizes,
const std::optional<torch::Tensor>& bias) {
auto input_sizes = input.sizes();
auto out_features = codes.size(0) * codebooks.size(2);
auto flat_input = input.reshape({-1, input.size(-1)});
auto flat_output = torch::empty({flat_input.size(0), out_features},
torch::TensorOptions()
.dtype(input.dtype())
.device(input.device())
);
for (int i = 0; i < flat_input.size(0); ++i) {
auto input_vec = flat_input.index({i});
auto output_vec = flat_output.index({i});
code1x16_matvec(
codes.squeeze(2),
input_vec,
output_vec,
codebooks,
codebook_a_sizes
);
}
flat_output *= scales.flatten().unsqueeze(0);
if (bias.has_value()) {
flat_output += bias->unsqueeze(0);
}
auto output_sizes = input_sizes.vec();
output_sizes.pop_back();
output_sizes.push_back(-1);
auto output = flat_output.reshape(output_sizes);
return output;
}
void code2x8_matvec(
const torch::Tensor& A,
const torch::Tensor& B,
torch::Tensor& C,
const torch::Tensor& codebook,
const int4 codebook_a_sizes
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
int prob_m = C.size(0);
int prob_k = B.size(0);
code2x8_matvec_cuda(
A.data_ptr(),
B.data_ptr(),
C.data_ptr(),
codebook.data_ptr(),
prob_m,
prob_k,
codebook_a_sizes,
2 * codebook_stride(codebook)
);
}
torch::Tensor code2x8_matmat(
const torch::Tensor& input,
const torch::Tensor& codes,
const torch::Tensor& codebooks,
const torch::Tensor& scales,
const int4 codebook_a_sizes,
const std::optional<torch::Tensor>& bias
) {
auto input_sizes = input.sizes();
auto out_features = codes.size(0) * codebooks.size(2);
auto flat_input = input.reshape({-1, input.size(-1)});
auto flat_output = torch::empty({flat_input.size(0), out_features},
torch::TensorOptions()
.dtype(input.dtype())
.device(input.device())
);
for (int i = 0; i < flat_input.size(0); ++i) {
auto input_vec = flat_input.index({i});
auto output_vec = flat_output.index({i});
code2x8_matvec(
codes.squeeze(2),
input_vec,
output_vec,
codebooks,
codebook_a_sizes
);
}
flat_output *= scales.flatten().unsqueeze(0);
if (bias.has_value()) {
flat_output += bias->unsqueeze(0);
}
auto output_sizes = input_sizes.vec();
output_sizes.pop_back();
output_sizes.push_back(-1);
auto output = flat_output.reshape(output_sizes);
return output;
}
// Accumulate the partition sizes.
int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes)
{
int4 cumulative_sizes;
auto cumulative_size = &cumulative_sizes.x;
int i = 0;
int last = 0;
assert(codebook_partition_sizes.size(0) <= 4);
for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size)
{
*cumulative_size = codebook_partition_sizes[i].item<int>() + last;
last = *cumulative_size;
}
// fill in the rest with unreachable.
for (; i < 4; ++i, ++cumulative_size)
{
*cumulative_size = last*10;
}
return cumulative_sizes;
}
} // namespace aqlm
} // namespace vllm
torch::Tensor aqlm_gemm(
const torch::Tensor& input,
const torch::Tensor& codes,
const torch::Tensor& codebooks,
const torch::Tensor& scales,
const torch::Tensor& codebook_partition_sizes,
const std::optional<torch::Tensor>& bias
)
{
int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
int const entries = codebooks.size(1);
if (nbooks == 1 && entries == (1 << 16))
{
return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias);
}
if (nbooks == 2 && entries == (1 << 8))
{
return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales, cumulative_sizes, bias);
}
TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.")
return {};
}
torch::Tensor aqlm_dequant(
const torch::Tensor& codes,
const torch::Tensor& codebooks,
const torch::Tensor& codebook_partition_sizes
)
{
int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
int const entries = codebooks.size(1);
const at::cuda::OptionalCUDAGuard device_guard(device_of(codes));
int rows = codes.size(1);
int cols = codes.size(0);
auto in_features = codes.size(1) * 8;
auto out_features = codes.size(0);
assert(out_features == codebook_partition_sizes.sum().item<int>());
auto weights = torch::empty({out_features, in_features},
torch::TensorOptions()
.dtype(codebooks.dtype())
.device(codebooks.device())
);
if (nbooks == 1 && entries == (1 << 16))
{
vllm::aqlm::code1x16_dequant_cuda(
codes.data_ptr(),
weights.data_ptr(),
codebooks.data_ptr(),
out_features,
in_features,
cumulative_sizes,
vllm::aqlm::codebook_stride(codebooks));
// If you wanted to flip to scaling the weights instead, uncomment the following (though it is roughly 30% slower and not consistent with the gemv implementation):
// weights *= scales.index({"...", 0, 0});
return weights;
}
if (nbooks == 2 && entries == (1 << 8))
{
vllm::aqlm::code2x8_dequant_cuda(
codes.data_ptr(),
weights.data_ptr(),
codebooks.data_ptr(),
out_features,
in_features,
cumulative_sizes,
vllm::aqlm::codebook_stride(codebooks));
// If you wanted to flip to scaling the weights instead, uncomment the following (though it is roughly 30% slower and not consistent with the gemv implementation):
// weights *= scales.index({"...", 0, 0});
return weights;
}
TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.")
return {};
}
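A small Python sketch of the shape bookkeeping that `aqlm_gemm` and `aqlm_dequant` above rely on (no kernel launch, just the derived sizes and the dispatch condition):
def aqlm_layout(codes, codebooks, codebook_partition_sizes):
    in_features = codes.size(1) * 8          # each code packs 8 weights along K
    out_features = codes.size(0)
    nbooks = codebooks.size(0) // codebook_partition_sizes.size(0)
    entries = codebooks.size(1)
    # Only the 1x16 and 2x8 configurations have CUDA kernels at the moment.
    supported = (nbooks, entries) in {(1, 1 << 16), (2, 1 << 8)}
    return in_features, out_features, nbooks, entries, supported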

View File

@ -1,126 +0,0 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <cmath>
#include "cuda_compat.h"
#include "dispatch_utils.h"
namespace vllm {
__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
float old;
old = (value >= 0) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) :
__uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
return old;
}
// Compute the absolute maximum m of the input tensor and store
// m / float8_e4m3::max() in *scale. Each thread block performs a
// reduction tree and the memory in scale is atomically updated.
// So to get the right answer, *scale needs to be initialized to
// a value <= 0.0 and we need to wait for all thread blocks to
// finish before consuming *scale.
template<typename scalar_t>
__global__ void segmented_max_reduction(
float* __restrict__ scale,
const scalar_t* __restrict__ input,
int64_t num_elems) {
__shared__ float cache[1024];
int i = blockDim.x * blockIdx.x + threadIdx.x;
// First, store the maximum of all values processed by
// the current thread in cache[threadIdx.x].
scalar_t tmp = 0.0;
while (i < num_elems) {
float x = static_cast<float>(input[i]);
tmp = max(tmp, fabs(x));
i += blockDim.x * gridDim.x;
}
cache[threadIdx.x] = tmp;
__syncthreads();
// Now perform parallel reduction within the thread block
int ib = blockDim.x / 2;
while (ib != 0) {
if (threadIdx.x < ib && cache[threadIdx.x + ib] > cache[threadIdx.x]) {
cache[threadIdx.x] = cache[threadIdx.x + ib];
}
__syncthreads();
ib /= 2;
}
// Finally, since cache[0] contains the maximum for this thread block,
// atomically write the max to the target location
if (threadIdx.x == 0) {
atomicMaxFloat(scale, cache[0] / std::numeric_limits<c10::Float8_e4m3fn>::max());
}
}
template<typename scalar_t>
__global__ void scaled_fp8_quant_kernel(
c10::Float8_e4m3fn* __restrict__ out,
const scalar_t* __restrict__ input,
const float* __restrict__ scale,
int64_t num_elems) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
while (i < num_elems) {
out[i] = static_cast<c10::Float8_e4m3fn>(input[i] / *scale);
i += blockDim.x * gridDim.x;
}
}
} // namespace vllm
void static_scaled_fp8_quant(
torch::Tensor& out, // [..., d]
torch::Tensor& input, // [..., d]
torch::Tensor& scale) // [1]
{
int64_t num_tokens = input.numel() / input.size(-1);
int64_t num_elems = input.numel();
dim3 grid(num_tokens);
dim3 block(1024);
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(),
"scaled_fp8_quant_kernel",
[&] {
vllm::scaled_fp8_quant_kernel<scalar_t><<<grid, block, 0, stream>>>(
out.data_ptr<c10::Float8_e4m3fn>(),
input.data_ptr<scalar_t>(),
scale.data_ptr<float>(),
num_elems);
});
}
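At the tensor level, the static path above amounts to dividing by the precomputed scale and casting to FP8. A minimal PyTorch sketch of that semantics (the kernel streams elements in place rather than materializing intermediates):
import torch

def static_scaled_fp8_quant_reference(input: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Elementwise: out[i] = fp8(input[i] / scale), as in scaled_fp8_quant_kernel.
    return (input / scale).to(torch.float8_e4m3fn)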
void dynamic_scaled_fp8_quant(
torch::Tensor& out, // [..., d]
torch::Tensor& input, // [..., d]
torch::Tensor& scale) // [1]
{
int64_t num_tokens = input.numel() / input.size(-1);
int64_t num_elems = input.numel();
dim3 grid(num_tokens);
dim3 block(1024);
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(),
"scaled_fp8_quant_kernel",
[&] {
vllm::segmented_max_reduction<scalar_t><<<grid, block, 0, stream>>>(
scale.data_ptr<float>(),
input.data_ptr<scalar_t>(),
num_elems);
vllm::scaled_fp8_quant_kernel<scalar_t><<<grid, block, 0, stream>>>(
out.data_ptr<c10::Float8_e4m3fn>(),
input.data_ptr<scalar_t>(),
scale.data_ptr<float>(),
num_elems);
});
}
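The dynamic path first derives the scale from the absolute maximum of the input (what `segmented_max_reduction` computes) and then quantizes; again, only a sketch of the semantics:
import torch

def dynamic_scaled_fp8_quant_reference(input: torch.Tensor):
    # scale = max|x| / FP8_E4M3_MAX, reduced over the whole tensor.
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    scale = input.abs().max().float() / fp8_max
    return (input / scale).to(torch.float8_e4m3fn), scale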

File diff suppressed because it is too large.

View File

@ -1,70 +0,0 @@
#pragma once
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>
namespace gptq_marlin {
// 8 warps are a good choice since every SM has 4 schedulers and having more than 1 warp per
// scheduler allows some more latency hiding. At the same time, we want relatively few warps to have
// many registers per warp and small tiles.
static constexpr int default_threads = 256;
static constexpr int pipe_stages = 4; // 4 pipeline stages fit into shared memory
static constexpr int min_thread_n = 64;
static constexpr int min_thread_k = 64;
static constexpr int tile_size = 16;
static constexpr int max_par = 16;
template <typename T, int n>
struct Vec {
T elems[n];
__device__ T& operator[](int i) { return elems[i]; }
};
using I4 = Vec<int, 4>;
constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
// No support for async
#else
__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, bool pred = true) {
const int BYTES = 16;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile("{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %0, 0;\n"
" @p cp.async.cg.shared.global [%1], [%2], %3;\n"
"}\n" ::"r"((int)pred),
"r"(smem), "l"(glob_ptr), "n"(BYTES));
}
__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
const int BYTES = 16;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile("{\n"
" cp.async.cg.shared.global [%0], [%1], %2;\n"
"}\n" ::"r"(smem),
"l"(glob_ptr), "n"(BYTES));
}
__device__ inline void cp_async_fence() { asm volatile("cp.async.commit_group;\n" ::); }
template <int n>
__device__ inline void cp_async_wait() {
asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
}
#endif
} // namespace gptq_marlin

View File

@ -1,352 +0,0 @@
#include "gptq_marlin.cuh"
namespace gptq_marlin {
static constexpr int repack_stages = 8;
static constexpr int repack_threads = 256;
static constexpr int tile_k_size = tile_size;
static constexpr int tile_n_size = tile_k_size * 4;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
template <int const num_threads, int const num_bits, bool const has_perm>
__global__ void
marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr,
uint32_t const *__restrict__ perm_ptr,
uint32_t *__restrict__ out_ptr, int size_k, int size_n) {}
} // namespace gptq_marlin
torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm,
int64_t size_k, int64_t size_n,
int64_t num_bits) {
TORCH_CHECK_NOT_IMPLEMENTED(
false, "marlin_repack_from_gptq(..) requires CUDA_ARCH >= 8.0");
return torch::empty({1, 1});
}
#else
template <int const num_threads, int const num_bits, bool const has_perm>
__global__ void
marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr,
uint32_t const *__restrict__ perm_ptr,
uint32_t *__restrict__ out_ptr, int size_k, int size_n) {
constexpr int pack_factor = 32 / num_bits;
int k_tiles = size_k / tile_k_size;
int n_tiles = size_n / tile_n_size;
int block_k_tiles = div_ceil(k_tiles, gridDim.x);
int start_k_tile = blockIdx.x * block_k_tiles;
if (start_k_tile >= k_tiles) {
return;
}
int finish_k_tile = min(start_k_tile + block_k_tiles, k_tiles);
// Wait until the next thread tile has been loaded to shared memory.
auto wait_for_stage = [&]() {
// We only have `stages - 2` active fetches since we are double buffering
// and can only issue the next fetch when it is guaranteed that the previous
// shared memory load is fully complete (as it may otherwise be
// overwritten).
cp_async_wait<repack_stages - 2>();
__syncthreads();
};
extern __shared__ int4 sh[];
constexpr int perm_size = tile_k_size / 4;
int4 *sh_perm_ptr = sh;
int4 *sh_pipe_ptr = sh_perm_ptr;
if constexpr (has_perm) {
sh_pipe_ptr += perm_size;
}
constexpr int tile_ints = tile_k_size / pack_factor;
constexpr int stage_n_threads = tile_n_size / 4;
constexpr int stage_k_threads = has_perm ? tile_k_size : tile_ints;
constexpr int stage_size = stage_k_threads * stage_n_threads;
auto load_perm_to_shared = [&](int k_tile_id) {
int first_k_int4 = (k_tile_id * tile_k_size) / 4;
int4 const *perm_int4_ptr = reinterpret_cast<int4 const *>(perm_ptr);
if (threadIdx.x < perm_size) {
sh_perm_ptr[threadIdx.x] = perm_int4_ptr[first_k_int4 + threadIdx.x];
}
__syncthreads();
};
auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) {
if (n_tile_id >= n_tiles) {
cp_async_fence();
return;
}
int first_n = n_tile_id * tile_n_size;
int4 *sh_ptr = sh_pipe_ptr + stage_size * pipe;
if constexpr (has_perm) {
if (threadIdx.x < stage_size) {
int k_id = threadIdx.x / stage_n_threads;
int n_id = threadIdx.x % stage_n_threads;
uint32_t const *sh_perm_int_ptr =
reinterpret_cast<uint32_t const *>(sh_perm_ptr);
int src_k = sh_perm_int_ptr[k_id];
int src_k_packed = src_k / pack_factor;
cp_async4(
&sh_ptr[k_id * stage_n_threads + n_id],
reinterpret_cast<int4 const *>(&(
b_q_weight_ptr[src_k_packed * size_n + first_n + (n_id * 4)])));
}
} else {
if (threadIdx.x < stage_size) {
int k_id = threadIdx.x / stage_n_threads;
int n_id = threadIdx.x % stage_n_threads;
int first_k = k_tile_id * tile_k_size;
int first_k_packed = first_k / pack_factor;
cp_async4(&sh_ptr[k_id * stage_n_threads + n_id],
reinterpret_cast<int4 const *>(
&(b_q_weight_ptr[(first_k_packed + k_id) * size_n +
first_n + (n_id * 4)])));
}
}
cp_async_fence();
};
auto repack_tile = [&](int pipe, int k_tile_id, int n_tile_id) {
if (n_tile_id >= n_tiles) {
return;
}
int warp_id = threadIdx.x / 32;
int th_id = threadIdx.x % 32;
if (warp_id >= 4) {
return;
}
int tc_col = th_id / 4;
int tc_row = (th_id % 4) * 2;
constexpr int tc_offsets[4] = {0, 1, 8, 9};
int cur_n = warp_id * 16 + tc_col;
constexpr int sh_stride = 64;
constexpr uint32_t mask = (1 << num_bits) - 1;
int4 *sh_stage_ptr = sh_pipe_ptr + stage_size * pipe;
uint32_t *sh_stage_int_ptr = reinterpret_cast<uint32_t *>(sh_stage_ptr);
uint32_t *sh_perm_int_ptr = reinterpret_cast<uint32_t *>(sh_perm_ptr);
uint32_t vals[8];
if constexpr (has_perm) {
for (int i = 0; i < 4; i++) {
int k_idx = tc_row + tc_offsets[i];
uint32_t src_k = sh_perm_int_ptr[k_idx];
uint32_t src_k_pos = src_k % pack_factor;
uint32_t b1_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n];
uint32_t b1_cur_val = (b1_val >> (src_k_pos * num_bits)) & mask;
uint32_t b2_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n + 8];
uint32_t b2_cur_val = (b2_val >> (src_k_pos * num_bits)) & mask;
vals[i] = b1_cur_val;
vals[4 + i] = b2_cur_val;
}
} else {
uint32_t b1_vals[tile_ints];
uint32_t b2_vals[tile_ints];
#pragma unroll
for (int i = 0; i < tile_ints; i++) {
b1_vals[i] = sh_stage_int_ptr[cur_n + sh_stride * i];
b2_vals[i] = sh_stage_int_ptr[cur_n + 8 + sh_stride * i];
}
#pragma unroll
for (int i = 0; i < 4; i++) {
int cur_elem = tc_row + tc_offsets[i];
int cur_int = cur_elem / pack_factor;
int cur_pos = cur_elem % pack_factor;
vals[i] = (b1_vals[cur_int] >> (cur_pos * num_bits)) & mask;
vals[4 + i] = (b2_vals[cur_int] >> (cur_pos * num_bits)) & mask;
}
}
constexpr int tile_size = tile_k_size * tile_n_size / pack_factor;
int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;
// Result of:
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
if constexpr (num_bits == 4) {
constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
uint32_t res = 0;
#pragma unroll
for (int i = 0; i < 8; i++) {
res |= vals[pack_idx[i]] << (i * 4);
}
out_ptr[out_offset + th_id * 4 + warp_id] = res;
} else {
constexpr int pack_idx[4] = {0, 2, 1, 3};
uint32_t res1 = 0;
uint32_t res2 = 0;
#pragma unroll
for (int i = 0; i < 4; i++) {
res1 |= vals[pack_idx[i]] << (i * 8);
res2 |= vals[4 + pack_idx[i]] << (i * 8);
}
out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 1] = res2;
}
};
auto start_pipes = [&](int k_tile_id, int n_tile_id) {
#pragma unroll
for (int pipe = 0; pipe < repack_stages - 1; pipe++) {
fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe);
}
wait_for_stage();
};
#pragma unroll
for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) {
int n_tile_id = 0;
if constexpr (has_perm) {
load_perm_to_shared(k_tile_id);
}
start_pipes(k_tile_id, n_tile_id);
while (n_tile_id < n_tiles) {
#pragma unroll
for (int pipe = 0; pipe < repack_stages; pipe++) {
fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id,
n_tile_id + pipe + repack_stages - 1);
repack_tile(pipe, k_tile_id, n_tile_id + pipe);
wait_for_stage();
}
n_tile_id += repack_stages;
}
}
}
} // namespace gptq_marlin
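As a sanity check on the bit-packing step in `repack_tile`, the interleaved packing of one group of values can be written out in Python (pack orders taken from the `pack_idx` tables above; this is a sketch, not part of the kernel):
def pack_interleaved(vals, num_bits):
    # vals holds 8 already-masked values: for 4-bit they form one 32-bit word;
    # for 8-bit, b1 values are in vals[0:4] and b2 values in vals[4:8].
    if num_bits == 4:
        pack_idx = [0, 2, 4, 6, 1, 3, 5, 7]
        res = 0
        for i in range(8):
            res |= (vals[pack_idx[i]] & 0xF) << (i * 4)
        return res
    else:  # num_bits == 8
        pack_idx = [0, 2, 1, 3]
        res1 = res2 = 0
        for i in range(4):
            res1 |= (vals[pack_idx[i]] & 0xFF) << (i * 8)
            res2 |= (vals[4 + pack_idx[i]] & 0xFF) << (i * 8)
        return res1, res2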
#define CALL_IF(NUM_BITS, HAS_PERM) \
else if (num_bits == NUM_BITS && has_perm == HAS_PERM) { \
cudaFuncSetAttribute( \
gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, \
NUM_BITS, HAS_PERM>, \
cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \
gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, NUM_BITS, \
HAS_PERM> \
<<<blocks, gptq_marlin::repack_threads, max_shared_mem, stream>>>( \
b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n); \
}
torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm,
int64_t size_k, int64_t size_n,
int64_t num_bits) {
// Verify compatibility with marlin tile of 16x64
TORCH_CHECK(size_k % gptq_marlin::tile_k_size == 0, "size_k = ", size_k,
" is not divisible by tile_k_size = ", gptq_marlin::tile_k_size);
TORCH_CHECK(size_n % gptq_marlin::tile_n_size == 0, "size_n = ", size_n,
" is not divisible by tile_n_size = ", gptq_marlin::tile_n_size);
TORCH_CHECK(num_bits == 4 || num_bits == 8,
"num_bits must be 4 or 8. Got = ", num_bits);
int const pack_factor = 32 / num_bits;
// Verify B
TORCH_CHECK((size_k / pack_factor) == b_q_weight.size(0),
"Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
", size_k = ", size_k, ", pack_factor = ", pack_factor);
TORCH_CHECK(b_q_weight.size(1) == size_n,
"b_q_weight.size(1) = ", b_q_weight.size(1),
" is not size_n = ", size_n);
// Verify device and strides
TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
TORCH_CHECK(b_q_weight.dtype() == at::kInt, "b_q_weight type is not kInt");
TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
TORCH_CHECK(perm.dtype() == at::kInt, "perm type is not at::kInt");
// Alloc buffers
const at::cuda::OptionalCUDAGuard device_guard(device_of(b_q_weight));
auto options = torch::TensorOptions()
.dtype(b_q_weight.dtype())
.device(b_q_weight.device());
torch::Tensor out =
torch::empty({size_k / gptq_marlin::tile_size,
size_n * gptq_marlin::tile_size / pack_factor},
options);
// Detect if there is act_order
bool has_perm = perm.size(0) != 0;
// Get ptrs
uint32_t const *b_q_weight_ptr =
reinterpret_cast<uint32_t const *>(b_q_weight.data_ptr());
uint32_t const *perm_ptr =
reinterpret_cast<uint32_t const *>(perm.data_ptr());
uint32_t *out_ptr = reinterpret_cast<uint32_t *>(out.data_ptr());
// Get dev info
int dev = b_q_weight.get_device();
cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
int blocks;
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
int max_shared_mem = 0;
cudaDeviceGetAttribute(&max_shared_mem,
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
TORCH_CHECK(max_shared_mem > 0);
if (false) {
}
CALL_IF(4, false)
CALL_IF(4, true)
CALL_IF(8, false)
CALL_IF(8, true)
else {
TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits,
", has_perm = ", has_perm);
}
return out;
}
#endif
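The host-side shape checks and output allocation in `gptq_marlin_repack` reduce to the following bookkeeping (a Python sketch of the assertions and the resulting output shape only):
def marlin_repack_output_shape(size_k, size_n, num_bits, tile_size=16):
    assert num_bits in (4, 8), "num_bits must be 4 or 8"
    pack_factor = 32 // num_bits
    # b_q_weight is expected to be (size_k // pack_factor, size_n), int32.
    assert size_k % tile_size == 0          # tile_k_size
    assert size_n % (tile_size * 4) == 0    # tile_n_size
    return (size_k // tile_size, size_n * tile_size // pack_factor)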

View File

@ -67,13 +67,20 @@ __device__ inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr,
"r"(smem), "l"(glob_ptr), "n"(BYTES));
}
// Asynchronous global->shared copy
__device__ inline void cp_async4(void *smem_ptr, const void *glob_ptr) {
// Asynchronous global->shared copy with a cache hint indicating that the values
// may be evicted immediately; used for quantized weights B, which are only
// accessed precisely once and should thus not pollute the L2 cache which we
// need for inputs A and outputs C.
__device__ inline void cp_async4_stream(void *smem_ptr, const void *glob_ptr) {
const int BYTES = 16;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile("{\n"
" cp.async.cg.shared.global [%0], [%1], %2;\n"
"}\n" :: "r"(smem), "l"(glob_ptr), "n"(BYTES));
asm volatile(
"{\n"
" .reg .b64 p;\n"
" createpolicy.fractional.L2::evict_first.b64 p, 1.0;"
" cp.async.cg.shared.global.L2::cache_hint [%0], [%1], %2, p;\n"
"}\n" ::"r"(smem),
"l"(glob_ptr), "n"(BYTES));
}
// Async copy fence.
@ -441,14 +448,14 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
int4 *sh_b_stage = sh_b + b_sh_stage * pipe;
#pragma unroll
for (int i = 0; i < b_sh_wr_iters; i++) {
cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]);
cp_async4_stream(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]);
B_ptr[i] += b_gl_rd_delta_o;
}
// Only fetch scales if this tile starts a new group
if (group_blocks != -1 && pipe % (group_blocks / thread_k_blocks) == 0) {
int4 *sh_s_stage = sh_s + s_sh_stage * pipe;
if (s_sh_wr_pred)
cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]);
cp_async4_stream(&sh_s_stage[s_sh_wr], &s[s_gl_rd]);
s_gl_rd += s_gl_rd_delta;
}
}
@ -743,7 +750,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
// write-out
if (group_blocks == -1 && last) {
if (s_sh_wr_pred)
cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]);
cp_async4_stream(&sh_s[s_sh_wr], &s[s_gl_rd]);
cp_async_fence();
}
thread_block_reduce();

Binary file not shown.

View File

@ -11,14 +11,12 @@
# documentation root, use os.path.abspath to make it absolute, like shown here.
import logging
import os
import sys
from typing import List
from sphinx.ext import autodoc
logger = logging.getLogger(__name__)
sys.path.append(os.path.abspath("../.."))
# -- Project information -----------------------------------------------------
@ -48,7 +46,7 @@ templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns: List[str] = ["**/*.template.rst"]
exclude_patterns: List[str] = []
# Exclude the prompt "$" when copying code
copybutton_prompt_text = r"\$ "
@ -73,13 +71,6 @@ html_theme_options = {
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']
# Generate additional rst documentation here.
def setup(app):
from docs.source.generate_examples import generate_examples
generate_examples()
# Mock out external dependencies here.
autodoc_mock_imports = [
"cpuinfo",
@ -98,10 +89,9 @@ autodoc_mock_imports = [
for mock_target in autodoc_mock_imports:
if mock_target in sys.modules:
logger.info(
"Potentially problematic mock target (%s) found; "
f"Potentially problematic mock target ({mock_target}) found; "
"autodoc_mock_imports cannot mock modules that have already "
"been loaded into sys.modules when the sphinx build starts.",
mock_target)
"been loaded into sys.modules when the sphinx build starts.")
class MockedClassDocumenter(autodoc.ClassDocumenter):

View File

@ -1,50 +0,0 @@
Dockerfile
====================
See `here <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`_ for the main Dockerfile to construct
the image for running an OpenAI compatible server with vLLM.
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
- All build stages
- The default build target (highlighted in grey)
- External images (with dashed borders)
The edges of the build graph represent:
- FROM ... dependencies (with a solid line and a full arrow head)
- COPY --from=... dependencies (with a dashed line and an empty arrow head)
- RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head)
.. figure:: ../../assets/dev/dockerfile-stages-dependency.png
:alt: query
:width: 100%
:align: center
Made using: https://github.com/patrickhoefler/dockerfilegraph
Commands to regenerate the build graph (make sure to run it **from the `root` directory of the vLLM repository** where the dockerfile is present):
.. code:: bash
dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile
or in case you want to run it directly with the docker image:
.. code:: bash
docker run \
--rm \
--user "$(id -u):$(id -g)" \
--workdir /workspace \
--volume "$(pwd)":/workspace \
ghcr.io/patrickhoefler/dockerfilegraph:alpine \
--output png \
--dpi 200 \
--max-label-length 50 \
--filename Dockerfile \
--legend
(To run it for a different file, you can pass in a different argument to the flag `--filename`.)

View File

@ -1,6 +1,7 @@
AsyncLLMEngine
=================================
.. autoclass:: vllm.AsyncLLMEngine
:members:
.. autoclass:: vllm.engine.async_llm_engine.AsyncLLMEngine
:members: generate, abort
:show-inheritance:

View File

@ -1,6 +1,6 @@
LLMEngine
=================================
.. autoclass:: vllm.LLMEngine
:members:
:show-inheritance:
.. autoclass:: vllm.engine.llm_engine.LLMEngine
:members: add_request, abort_request, step
:show-inheritance:

View File

@ -1,5 +1,4 @@
Sampling Params
===============
.. autoclass:: vllm.SamplingParams
:members:
.. automodule:: vllm.sampling_params.SamplingParams

View File

@ -1,61 +0,0 @@
import re
from pathlib import Path
def fix_case(text: str) -> str:
subs = [
("api", "API"),
("llm", "LLM"),
("vllm", "vLLM"),
("openai", "OpenAI"),
("multilora", "MultiLoRA"),
]
for sub in subs:
text = re.sub(*sub, text, flags=re.IGNORECASE)
return text
def underline(title: str, character: str = "=") -> str:
return f"{title}\n{character * len(title)}"
def generate_title(filename: str) -> str:
# Turn filename into a title
title = filename.replace("_", " ").title()
# Handle acronyms and names
title = fix_case(title)
# Underline title
title = underline(title)
return title
def generate_examples():
root_dir = Path(__file__).parent.parent.parent.resolve()
# Source paths
script_dir = root_dir / "examples"
script_paths = sorted(script_dir.glob("*.py"))
# Destination paths
doc_dir = root_dir / "docs/source/getting_started/examples"
doc_paths = [doc_dir / f"{path.stem}.rst" for path in script_paths]
# Generate the example docs for each example script
for script_path, doc_path in zip(script_paths, doc_paths):
script_url = f"https://github.com/vllm-project/vllm/blob/main/examples/{script_path.name}"
# Make script_path relative to doc_path and call it include_path
include_path = '../../../..' / script_path.relative_to(root_dir)
content = (f"{generate_title(doc_path.stem)}\n\n"
f"Source {script_url}.\n\n"
f".. literalinclude:: {include_path}\n"
" :language: python\n"
" :linenos:\n")
with open(doc_path, "w+") as f:
f.write(content)
# Generate the toctree for the example scripts
with open(doc_dir / "examples_index.template.rst") as f:
examples_index = f.read()
with open(doc_dir / "examples_index.rst", "w+") as f:
example_docs = "\n ".join(path.stem for path in script_paths)
f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs))

View File

@ -3,7 +3,9 @@
Installation with ROCm
======================
vLLM supports AMD GPUs with ROCm 5.7 and 6.0.
vLLM 0.2.4 onwards supports model inferencing and serving on AMD GPUs with ROCm.
At the moment AWQ quantization is not supported in ROCm, but SqueezeLLM quantization has been ported.
Data types currently supported in ROCm are FP16 and BF16.
Requirements
------------
@ -11,57 +13,114 @@ Requirements
* OS: Linux
* Python: 3.8 -- 3.11
* GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
* ROCm 6.0 and ROCm 5.7
* Pytorch 2.0.1/2.1.1/2.2
* ROCm 5.7 (Verified on python 3.10) or ROCm 6.0 (Verified on python 3.9)
Installation options:
#. :ref:`Build from source with docker <build_from_source_docker_rocm>`
#. :ref:`(Recommended) Quick start with vLLM pre-installed in Docker Image <quick_start_docker_rocm>`
#. :ref:`Build from source <build_from_source_rocm>`
#. :ref:`Build from source with docker <build_from_source_docker_rocm>`
.. _quick_start_docker_rocm:
(Recommended) Option 1: Quick start with vLLM pre-installed in Docker Image
---------------------------------------------------------------------------
This option is for ROCm 5.7 only:
.. code-block:: console
$ docker pull embeddedllminfo/vllm-rocm:vllm-v0.2.4
$ docker run -it \
--network=host \
--group-add=video \
--ipc=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v <path/to/model>:/app/model \
embeddedllminfo/vllm-rocm \
bash
.. _build_from_source_rocm:
Option 2: Build from source
---------------------------
You can build and install vLLM from source:
The instructions below are for ROCm 5.7 only.
At the time of this documentation update, PyTorch on ROCm 6.0 wheel is not yet available on the PyTorch website.
0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
- `ROCm <https://rocm.docs.amd.com/en/latest/deploy/linux/index.html>`_
- `Pytorch <https://pytorch.org/>`_
.. code-block:: console
$ pip install torch==2.2.0.dev20231206+rocm5.7 --index-url https://download.pytorch.org/whl/nightly/rocm5.7 # tested version
1. Install `flash attention for ROCm <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm>`_
Install ROCm's flash attention (v2.0.4) following the instructions from `ROCmSoftwarePlatform/flash-attention <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm#amd-gpurocm-support>`_
.. note::
- If you are using rocm5.7 with pytorch 2.1.0 onwards, you don't need to apply the `hipify_python.patch`. You can build the ROCm flash attention directly.
- If you fail to install `ROCmSoftwarePlatform/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`.
- ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention.
- You might need to downgrade the "ninja" version to 1.10, as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
2. Setup `xformers==0.0.23` without dependencies, and apply patches to adapt for ROCm flash attention
.. code-block:: console
$ pip install xformers==0.0.23 --no-deps
$ bash patch_xformers.rocm.sh
3. Build vLLM.
.. code-block:: console
$ cd vllm
$ pip install -U -r requirements-rocm.txt
$ python setup.py install # This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation
.. _build_from_source_docker_rocm:
Option 1: Build from source with docker (recommended)
Option 3: Build from source with docker
-----------------------------------------------------
You can build and install vLLM from source.
You can build and install vLLM from source:
First, build a docker image from `Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ and launch a docker container from the image.
Build a docker image from `Dockerfile.rocm`, and launch a docker container.
`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ uses ROCm 6.0 by default, but also supports ROCm 5.7.
It provides flexibility to customize the build of docker image using the following arguments:
The `Dockerfile.rocm` is designed to support both ROCm 5.7 and ROCm 6.0 and later versions. It provides flexibility to customize the build of docker image using the following arguments:
* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. We have tested ROCm 5.7 and ROCm 6.0. The default is `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`
* `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For `Radeon RX 7900 series (gfx1100) <https://rocm.docs.amd.com/projects/radeon/en/latest/index.html>`_, this should be set to 0 before flash-attention supports this target.
* `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
* `FA_BRANCH`: specifies the branch used to build the CK flash-attention in `ROCm's flash-attention repo <https://github.com/ROCmSoftwarePlatform/flash-attention>`_. The default is `ae7928c`
* `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1.
* `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
* `FA_BRANCH`: specifies the branch used to build the flash-attention in `ROCmSoftwarePlatform's flash-attention repo <https://github.com/ROCmSoftwarePlatform/flash-attention>`_. The default is `3d2b6f5`
* `BUILD_FA`: specifies whether to build flash-attention. For `Radeon RX 7900 series (gfx1100) <https://rocm.docs.amd.com/projects/radeon/en/latest/index.html>`_, this should be set to 0 before flash-attention supports this target.
Their values can be passed in when running ``docker build`` with ``--build-arg`` options.
To build vllm on ROCm 6.0 for MI200 and MI300 series, you can use the default:
.. code-block:: console
$ docker build -f Dockerfile.rocm -t vllm-rocm .
To build vllm on ROCm 6.0 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:
.. code-block:: console
$ docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
To build docker image for vllm on ROCm 5.7, you can specify ``BASE_IMAGE`` as below:
For example, to build docker image for vllm on ROCm 5.7, you can run:
.. code-block:: console
$ docker build --build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \
-f Dockerfile.rocm -t vllm-rocm .
To run the above docker image ``vllm-rocm``, use the below command:
To build vllm on ROCm 6.0, you can use the default:
.. code-block:: console
$ docker build -f Dockerfile.rocm -t vllm-rocm .
$ docker run -it \
--network=host \
--group-add=video \
@ -74,13 +133,7 @@ To run the above docker image ``vllm-rocm``, use the below command:
vllm-rocm \
bash
Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.
.. _build_from_source_rocm:
Option 2: Build from source
---------------------------
Alternatively, if you plan to install vLLM-ROCm on a local machine or start from a fresh docker image (e.g. rocm/pytorch), you can follow the steps below:
0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
@ -88,50 +141,32 @@ Option 2: Build from source
- `Pytorch <https://pytorch.org/>`_
- `hipBLAS <https://rocm.docs.amd.com/projects/hipBLAS/en/latest/install.html>`_
For installing PyTorch, you can start from a fresh docker image, e.g, `rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2`, `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`, `rocm/pytorch-nightly`.
1. Install `flash attention for ROCm <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm>`_
Alternatively, you can install PyTorch using PyTorch wheels. You can check the PyTorch installation guide in PyTorch `Getting Started <https://pytorch.org/get-started/locally/>`_
For rocm6.0:
.. code-block:: console
$ pip3 install torch --index-url https://download.pytorch.org/whl/rocm6.0
For rocm5.7:
.. code-block:: console
$ pip install torch --index-url https://download.pytorch.org/whl/rocm5.7
1. Install `Triton flash attention for ROCm <https://github.com/ROCm/triton>`_
Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from `ROCm/triton <https://github.com/ROCm/triton/blob/triton-mlir/README.md>`_
2. Optionally, if you choose to use CK flash attention, you can install `flash attention for ROCm <https://github.com/ROCm/flash-attention/tree/flash_attention_for_rocm>`_
Install ROCm's flash attention (v2.0.4) following the instructions from `ROCm/flash-attention <https://github.com/ROCm/flash-attention/tree/flash_attention_for_rocm#amd-gpurocm-support>`_
Install ROCm's flash attention (v2.0.4) following the instructions from `ROCmSoftwarePlatform/flash-attention <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm#amd-gpurocm-support>`_
.. note::
- If you are using rocm5.7 with pytorch 2.1.0 onwards, you don't need to apply the `hipify_python.patch`. You can build the ROCm flash attention directly.
- If you fail to install `ROCm/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`.
- If you fail to install `ROCmSoftwarePlatform/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`.
- ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention.
- You might need to downgrade the "ninja" version to 1.10, as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
2. Setup `xformers==0.0.23` without dependencies, and apply patches to adapt for ROCm flash attention
.. code-block:: console
$ pip install xformers==0.0.23 --no-deps
$ bash patch_xformers.rocm.sh
3. Build vLLM.
.. code-block:: console
.. code-block:: console
$ cd vllm
$ pip install -U -r requirements-rocm.txt
$ python setup.py install # This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation
$ cd vllm
$ pip install -U -r requirements-rocm.txt
$ python setup.py install # This may take 5-10 minutes.
.. tip::
.. note::
- You may need to turn on the ``--enforce-eager`` flag if you experience a process hang when running the `benchmark_throughput.py` script to test your installation.
- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
- To use CK flash-attention, please use this flag ``export VLLM_USE_FLASH_ATTN_TRITON=0`` to turn off triton flash attention.
- The ROCm version of pytorch, ideally, should match the ROCm driver version.

View File

@ -1,8 +0,0 @@
Examples
=================================
.. toctree::
:maxdepth: 1
:caption: Scripts
%EXAMPLE_DOCS%

View File

@ -53,7 +53,6 @@ You can also build and install vLLM from source:
$ git clone https://github.com/vllm-project/vllm.git
$ cd vllm
$ # export VLLM_INSTALL_PUNICA_KERNELS=1 # optionally build for multi-LoRA capability
$ pip install -e . # This may take 5-10 minutes.
.. tip::

View File

@ -65,7 +65,6 @@ Documentation
getting_started/neuron-installation
getting_started/cpu-installation
getting_started/quickstart
getting_started/examples/examples_index
.. toctree::
:maxdepth: 1
@ -75,7 +74,6 @@ Documentation
serving/deploying_with_docker
serving/distributed_serving
serving/metrics
serving/env_vars
serving/usage_stats
serving/integrations
@ -87,7 +85,6 @@ Documentation
models/adding_model
models/engine_args
models/lora
models/performance
.. toctree::
:maxdepth: 1
@ -104,7 +101,6 @@ Documentation
dev/sampling_params
dev/engine/engine_index
dev/kernel/paged_attention
dev/dockerfile/dockerfile
Indices and tables
==================

View File

@ -95,7 +95,7 @@ This method should load the weights from the HuggingFace's checkpoint file and a
5. Register your model
----------------------
Finally, register your :code:`*ForCausalLM` class to the :code:`_MODELS` in `vllm/model_executor/models/__init__.py <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/__init__.py>`_.
Finally, include your :code:`*ForCausalLM` class in `vllm/model_executor/models/__init__.py <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/__init__.py>`_ and register it to the :code:`_MODEL_REGISTRY` in `vllm/model_executor/model_loader.py <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/model_loader.py>`_.
6. Out-of-Tree Model Integration
--------------------------------------------

View File

@ -5,19 +5,133 @@ Engine Arguments
Below, you can find an explanation of every engine argument for vLLM:
.. argparse::
:module: vllm.engine.arg_utils
:func: _engine_args_parser
:prog: -m vllm.entrypoints.openai.api_server
:nodefaultconst:
.. option:: --model <model_name_or_path>
Name or path of the huggingface model to use.
.. option:: --tokenizer <tokenizer_name_or_path>
Name or path of the huggingface tokenizer to use.
.. option:: --revision <revision>
The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.
.. option:: --tokenizer-revision <revision>
The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.
.. option:: --tokenizer-mode {auto,slow}
The tokenizer mode.
* "auto" will use the fast tokenizer if available.
* "slow" will always use the slow tokenizer.
.. option:: --trust-remote-code
Trust remote code from huggingface.
.. option:: --download-dir <directory>
Directory to download and load the weights, default to the default cache dir of huggingface.
.. option:: --load-format {auto,pt,safetensors,npcache,dummy,tensorizer}
The format of the model weights to load.
* "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available.
* "pt" will load the weights in the pytorch bin format.
* "safetensors" will load the weights in the safetensors format.
* "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading.
* "dummy" will initialize the weights with random values, mainly for profiling.
* "tensorizer" will load serialized weights using `CoreWeave's Tensorizer model deserializer. <https://github.com/coreweave/tensorizer>`_ See `examples/tensorize_vllm_model.py <https://github.com/vllm-project/vllm/blob/main/examples/tensorize_vllm_model.py>`_ to serialize a vLLM model, and for more information.
.. option:: --dtype {auto,half,float16,bfloat16,float,float32}
Data type for model weights and activations.
* "auto" will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.
* "half" for FP16. Recommended for AWQ quantization.
* "float16" is the same as "half".
* "bfloat16" for a balance between precision and range.
* "float" is shorthand for FP32 precision.
* "float32" for FP32 precision.
.. option:: --max-model-len <length>
Model context length. If unspecified, will be automatically derived from the model config.
.. option:: --worker-use-ray
Use Ray for distributed serving; this will be set automatically when using more than 1 GPU.
.. option:: --pipeline-parallel-size (-pp) <size>
Number of pipeline stages.
.. option:: --tensor-parallel-size (-tp) <size>
Number of tensor parallel replicas.
.. option:: --max-parallel-loading-workers <workers>
Load model sequentially in multiple batches, to avoid RAM OOM when using tensor parallel and large models.
.. option:: --block-size {8,16,32}
Token block size for contiguous chunks of tokens.
.. option:: --enable-prefix-caching
Enables automatic prefix caching
.. option:: --seed <seed>
Random seed for operations.
.. option:: --swap-space <size>
CPU swap space size (GiB) per GPU.
.. option:: --gpu-memory-utilization <fraction>
The fraction of GPU memory to be used for the model executor, which can range from 0 to 1.
For example, a value of 0.5 would imply 50% GPU memory utilization.
If unspecified, will use the default value of 0.9.
.. option:: --max-num-batched-tokens <tokens>
Maximum number of batched tokens per iteration.
.. option:: --max-num-seqs <sequences>
Maximum number of sequences per iteration.
.. option:: --max-paddings <paddings>
Maximum number of paddings in a batch.
.. option:: --disable-log-stats
Disable logging statistics.
.. option:: --quantization (-q) {awq,squeezellm,None}
Method used to quantize the weights.
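Most of these options map directly onto keyword arguments of the :code:`LLM` class. A minimal sketch (the keyword names are assumed to match their CLI counterparts):

.. code-block:: python

    from vllm import LLM

    llm = LLM(
        model="facebook/opt-125m",       # --model
        dtype="auto",                    # --dtype
        max_model_len=2048,              # --max-model-len
        tensor_parallel_size=1,          # --tensor-parallel-size
        gpu_memory_utilization=0.9,      # --gpu-memory-utilization
        seed=0,                          # --seed
    )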
Async Engine Arguments
----------------------
Below are the additional arguments related to the asynchronous engine:
.. argparse::
:module: vllm.engine.arg_utils
:func: _async_engine_args_parser
:prog: -m vllm.entrypoints.openai.api_server
:nodefaultconst:
.. option:: --engine-use-ray
Use Ray to start the LLM engine in a separate process as the server process.
.. option:: --disable-log-requests
Disable logging requests.
.. option:: --max-log-len
Max number of prompt characters or prompt ID numbers being printed in log. Defaults to unlimited.

View File

@ -1,38 +0,0 @@
.. _performance:
Performance and Tuning
======================
Chunked Prefill
---------------
vLLM supports an experimental feature, chunked prefill. Chunked prefill allows large prefills to be split into smaller chunks and batched together with decode requests.
You can enable the feature by specifying
.. code-block:: python
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True)
# Set max_num_batched_tokens to tune performance.
# NOTE: 512 is the default max_num_batched_tokens for chunked prefill.
# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512)
By default, the vLLM scheduler prioritizes prefills and doesn't batch prefill and decode into the same batch. This policy optimizes the TTFT (time to the first token), but incurs slower ITL (inter-token latency) and inefficient GPU utilization.
Once chunked prefill is enabled, the policy changes to:
- Prioritize decode requests. All pending decode requests are batched before any prefill is scheduled.
- When there is available token budget (`max_num_batched_tokens`), pending prefills are scheduled. If the last pending prefill request cannot fit into `max_num_batched_tokens`, it is chunked.
This policy has two benefits.
- It improves ITL (inter-token latency) during generation because decode requests are prioritized.
- It helps achieve better GPU utilization by placing compute-bound (prefill) and memory-bound (decode) requests in the same batch.
You can tune the performance by changing `max_num_batched_tokens`.
By default, it is set to 512, which has the best ITL on A100 in the initial benchmark.
A smaller value achieves better ITL because fewer prefills interrupt decodes.
A higher value achieves better TTFT as more prefill tokens can be put into each batch.
If `max_num_batched_tokens` is the same as `max_model_len`, that's almost equivalent to the default scheduling policy (except that it still prioritizes decodes).
Note that the default value (512) is optimized for ITL, and it may yield lower throughput than the default scheduler. We recommend setting `max_num_batched_tokens > 2048` for throughput.
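For example, a throughput-oriented configuration could look like the following (a sketch; tune the value for your workload and hardware):

.. code-block:: python

    from vllm import LLM

    # Larger chunked-prefill batches favor throughput over ITL.
    llm = LLM(model="meta-llama/Llama-2-7b-hf",
              enable_chunked_prefill=True,
              max_num_batched_tokens=4096)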
See related papers for more details (https://arxiv.org/pdf/2401.08671 or https://arxiv.org/pdf/2308.16369).

View File

@ -80,8 +80,8 @@ Alongside each architecture, we include some popular models that use it.
- :code:`core42/jais-13b`, :code:`core42/jais-13b-chat`, :code:`core42/jais-30b-v3`, :code:`core42/jais-30b-chat-v3`, etc.
-
* - :code:`LlamaForCausalLM`
- LLaMA, Llama 2, Meta Llama 3, Vicuna, Alpaca, Yi
- :code:`meta-llama/Meta-Llama-3-8B-Instruct`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
- LLaMA, LLaMA-2, Vicuna, Alpaca, Yi
- :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
- ✅︎
* - :code:`MiniCPMForCausalLM`
- MiniCPM
@ -101,7 +101,7 @@ Alongside each architecture, we include some popular models that use it.
-
* - :code:`OLMoForCausalLM`
- OLMo
- :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc.
- :code:`allenai/OLMo-1B`, :code:`allenai/OLMo-7B`, etc.
-
* - :code:`OPTForCausalLM`
- OPT, OPT-IML
@ -115,10 +115,6 @@ Alongside each architecture, we include some popular models that use it.
- Phi
- :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc.
-
* - :code:`Phi3ForCausalLM`
- Phi-3
- :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, etc.
-
* - :code:`QWenLMHeadModel`
- Qwen
- :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.

View File

@ -49,6 +49,3 @@ To run vLLM:
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
vllm/vllm-openai <args...>
.. note::
The vLLM docker image is currently designed to be run as the root user (contributions welcome for changing this!). At runtime it will try to load a library under the root user's home directory, e.g. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` . If you are running the container as a different user, you may need to change the permissions of the library (and all the parent directories) to allow that user to access it. Then run vLLM with the environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` .

View File

@ -1,9 +0,0 @@
Environment Variables
========================
vLLM uses the following environment variables to configure the system:
.. literalinclude:: ../../../vllm/envs.py
:language: python
:start-after: begin-env-vars-definition
:end-before: end-env-vars-definition

View File

@ -4,7 +4,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://plat
You can start the server using Python, or using [Docker](deploying_with_docker.rst):
```bash
python -m vllm.entrypoints.openai.api_server --model NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
python -m vllm.entrypoints.openai.api_server --model mistralai/Mistral-7B-Instruct-v0.2 --dtype auto --api-key token-abc123
```
To call the server, you can use the official OpenAI Python client library, or any other HTTP client.
@ -16,7 +16,7 @@ client = OpenAI(
)
completion = client.chat.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct",
model="mistralai/Mistral-7B-Instruct-v0.2",
messages=[
{"role": "user", "content": "Hello!"}
]
@ -37,7 +37,7 @@ Or directly merge them into the JSON payload if you are using HTTP call directly
```python
completion = client.chat.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct",
model="mistralai/Mistral-7B-Instruct-v0.2",
messages=[
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
],
@ -87,7 +87,7 @@ In order for the language model to support chat protocol, vLLM requires the mode
a chat template in its tokenizer configuration. The chat template is a Jinja2 template that
specifies how roles, messages, and other chat-specific tokens are encoded in the input.
An example chat template for `NousResearch/Meta-Llama-3-8B-Instruct` can be found [here](https://github.com/meta-llama/llama3?tab=readme-ov-file#instruction-tuned-models)
An example chat template for `mistralai/Mistral-7B-Instruct-v0.2` can be found [here](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2#instruction-format)
Some models do not provide a chat template even though they are instruction/chat fine-tuned. For those models,
you can manually specify their chat template in the `--chat-template` parameter with the file path to the chat

View File

@ -1,7 +1,7 @@
.. _on_cloud:
Deploying and scaling up with SkyPilot
================================================
Running on clouds with SkyPilot
===============================
.. raw:: html
@ -9,75 +9,51 @@ Deploying and scaling up with SkyPilot
<img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/>
</p>
vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with `SkyPilot <https://github.com/skypilot-org/skypilot>`__, an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in `SkyPilot AI gallery <https://skypilot.readthedocs.io/en/latest/gallery/index.html>`__.
vLLM can be run on the cloud to scale to multiple GPUs with `SkyPilot <https://github.com/skypilot-org/skypilot>`__, an open-source framework for running LLMs on any cloud.
Prerequisites
-------------
- Go to the `HuggingFace model page <https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct>`__ and request access to the model :code:`meta-llama/Meta-Llama-3-8B-Instruct`.
- Check that you have installed SkyPilot (`docs <https://skypilot.readthedocs.io/en/latest/getting-started/installation.html>`__).
- Check that :code:`sky check` shows clouds or Kubernetes are enabled.
To install SkyPilot and setup your cloud credentials, run:
.. code-block:: console
pip install skypilot-nightly
sky check
Run on a single instance
------------------------
$ pip install skypilot
$ sky check
See the vLLM SkyPilot YAML for serving, `serving.yaml <https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml>`__.
.. code-block:: yaml
resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
use_spot: True
disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best
ports: 8081 # Expose to internet traffic.
accelerators: A100
envs:
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
MODEL_NAME: decapoda-research/llama-13b-hf
TOKENIZER: hf-internal-testing/llama-tokenizer
setup: |
conda create -n vllm python=3.10 -y
conda create -n vllm python=3.9 -y
conda activate vllm
pip install vllm==0.4.0.post1
# Install Gradio for web UI.
pip install gradio openai
pip install flash-attn==2.5.7
git clone https://github.com/vllm-project/vllm.git
cd vllm
pip install .
pip install gradio
run: |
conda activate vllm
echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log &
python -u -m vllm.entrypoints.api_server \
--model $MODEL_NAME \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
--tokenizer $TOKENIZER 2>&1 | tee api_server.log &
echo 'Waiting for vllm api server to start...'
while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \
--port 8811 \
--model-url http://localhost:8081/v1 \
--stop-token-ids 128009,128001
python vllm/examples/gradio_webserver.py
Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...):
Start serving the LLaMA-13B model on an A100 GPU:
.. code-block:: console
HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN
$ sky launch serving.yaml
Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model for text completion.
@ -85,226 +61,9 @@ Check the output of the command. There will be a shareable gradio link (like the
(task, pid=7431) Running on public URL: https://<gradio-hash>.gradio.live
**Optional**: Serve the 70B model instead of the default 8B and use more GPU:
**Optional**: Serve the 65B model instead of the default 13B and use more GPU:
.. code-block:: console
HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct
Scale up to multiple replicas
-----------------------------
SkyPilot can scale the service up to multiple replicas with built-in autoscaling, load-balancing, and fault-tolerance. You can do so by adding a `service` section to the YAML file.
.. code-block:: yaml
service:
replicas: 2
# An actual request for readiness probe.
readiness_probe:
path: /v1/chat/completions
post_data:
model: $MODEL_NAME
messages:
- role: user
content: Hello! What is your name?
max_tokens: 1
.. raw:: html
<details>
<summary>Click to see the full recipe YAML</summary>
.. code-block:: yaml
service:
replicas: 2
# An actual request for readiness probe.
readiness_probe:
path: /v1/chat/completions
post_data:
model: $MODEL_NAME
messages:
- role: user
content: Hello! What is your name?
max_tokens: 1
resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
use_spot: True
disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best
ports: 8081 # Expose to internet traffic.
envs:
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
setup: |
conda create -n vllm python=3.10 -y
conda activate vllm
pip install vllm==0.4.0.post1
# Install Gradio for web UI.
pip install gradio openai
pip install flash-attn==2.5.7
run: |
conda activate vllm
echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log &
echo 'Waiting for vllm api server to start...'
while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \
--port 8811 \
--model-url http://localhost:8081/v1 \
--stop-token-ids 128009,128001
.. raw:: html
</details>
Start serving the Llama-3 8B model on multiple replicas:
.. code-block:: console
HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN
Wait until the service is ready:
.. code-block:: console
watch -n10 sky serve status vllm
.. raw:: html
<details>
<summary>Example outputs:</summary>
.. code-block:: console
Services
NAME VERSION UPTIME STATUS REPLICAS ENDPOINT
vllm 1 35s READY 2/2 xx.yy.zz.100:30001
Service Replicas
SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION
vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP({'L4': 1}) READY us-east4
vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP({'L4': 1}) READY us-east4
.. raw:: html
</details>
After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:
.. code-block:: console
ENDPOINT=$(sky serve status --endpoint 8081 vllm)
curl -L http://$ENDPOINT/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "Who are you?"
}
],
"stop_token_ids": [128009, 128001]
}'
To enable autoscaling, you could specify additional configs in `services`:
.. code-block:: yaml
services:
replica_policy:
min_replicas: 0
max_replicas: 3
target_qps_per_replica: 2
This will scale the service up when the QPS exceeds 2 per replica.
**Optional**: Connect a GUI to the endpoint
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
It is also possible to access the Llama-3 service with a separate GUI frontend, so that user requests sent to the GUI are load-balanced across replicas.
.. raw:: html
<details>
<summary>Click to see the full GUI YAML</summary>
.. code-block:: yaml
envs:
MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct
ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm.
resources:
cpus: 2
setup: |
conda activate vllm
if [ $? -ne 0 ]; then
conda create -n vllm python=3.10 -y
conda activate vllm
fi
# Install Gradio for web UI.
pip install gradio openai
run: |
conda activate vllm
export PATH=$PATH:/sbin
WORKER_IP=$(hostname -I | cut -d' ' -f1)
CONTROLLER_PORT=21001
WORKER_PORT=21002
echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \
--port 8811 \
--model-url http://$ENDPOINT/v1 \
--stop-token-ids 128009,128001 | tee ~/gradio.log
.. raw:: html
</details>
1. Start the chat web UI:
.. code-block:: console
sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
2. Then, we can access the GUI at the returned gradio link:
.. code-block:: console
| INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live
sky launch -c vllm-serve-new -s serve.yaml --gpus A100:8 --env MODEL_NAME=decapoda-research/llama-65b-hf

@@ -1,46 +0,0 @@
import argparse
from vllm import LLM, SamplingParams
def main():
parser = argparse.ArgumentParser(description='AQLM examples')
parser.add_argument('--model',
'-m',
type=str,
default=None,
help='model path, as for HF')
parser.add_argument('--choice',
'-c',
type=int,
default=0,
help='known good models by index, [0-4]')
parser.add_argument('--tensor_parallel_size',
'-t',
type=int,
default=1,
help='tensor parallel size')
args = parser.parse_args()
models = [
"ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
"ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf",
"ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf",
"ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
"BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf",
]
model = LLM(args.model if args.model is not None else models[args.choice],
tensor_parallel_size=args.tensor_parallel_size)
sampling_params = SamplingParams(max_tokens=100, temperature=0)
outputs = model.generate("Hello my name is",
sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
if __name__ == '__main__':
main()

@@ -11,7 +11,7 @@ from safetensors.torch import safe_open
from vllm.model_executor.layers.quantization.schema import QuantParamSchema
# Adapted from vllm/model_executor/model_loader/weight_utils.py
# Adapted from vllm/model_executor/weight_utils.py
# The main differences are that we add the NPZ format and simplify
# its functionality drastically for our purposes (e.g. we assume that
# the quantized model exists locally and there is no need to download it)
@@ -71,7 +71,7 @@ def _prepare_hf_weights(
return hf_weights_files, use_safetensors
# Adapted from vllm/model_executor/model_loader/weight_utils.py
# Adapted from vllm/model_executor/weight_utils.py
def _hf_tensorfile_iterator(filename: str, load_format: str,
use_safetensors: bool):
if load_format == "npz":

@@ -1,178 +0,0 @@
# Logging Configuration
vLLM leverages Python's `logging.config.dictConfig` functionality to enable
robust and flexible configuration of the various loggers used by vLLM.
vLLM offers two environment variables that can be used to accommodate logging
configurations ranging from simple-and-inflexible to more-complex-and-more-flexible.
- No vLLM logging (simple and inflexible)
- Set `VLLM_CONFIGURE_LOGGING=0` (leaving `VLLM_LOGGING_CONFIG_PATH` unset)
- vLLM's default logging configuration (simple and inflexible)
- Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1`
- Fine-grained custom logging configuration (more complex, more flexible)
- Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1` and
set `VLLM_LOGGING_CONFIG_PATH=<path-to-logging-config.json>`
## Logging Configuration Environment Variables
### `VLLM_CONFIGURE_LOGGING`
`VLLM_CONFIGURE_LOGGING` controls whether or not vLLM takes any action to
configure the loggers used by vLLM. This functionality is enabled by default,
but can be disabled by setting `VLLM_CONFIGURE_LOGGING=0` when running vLLM.
If `VLLM_CONFIGURE_LOGGING` is enabled and no value is given for
`VLLM_LOGGING_CONFIG_PATH`, vLLM will use built-in default configuration to
configure the root vLLM logger. By default, no other vLLM loggers are
configured and, as such, all vLLM loggers defer to the root vLLM logger to make
all logging decisions.
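As an illustration of this deference, here is a minimal sketch (not taken from vLLM itself): only the parent `vllm` logger receives a handler, and a hypothetical child logger such as `vllm.engine` has no handlers of its own, so its records propagate up to `vllm`.
```python
import logging
import logging.config

# Configure only the parent "vllm" logger; child loggers propagate to it.
logging.config.dictConfig({
    "version": 1,
    "handlers": {
        "console": {"class": "logging.StreamHandler", "level": "INFO"},
    },
    "loggers": {
        "vllm": {"handlers": ["console"], "level": "INFO", "propagate": False},
    },
})

# "vllm.engine" is a hypothetical child logger name used only for this sketch;
# its record is emitted by the parent "vllm" logger's console handler.
logging.getLogger("vllm.engine").info("handled by the root vLLM logger")
```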
If `VLLM_CONFIGURE_LOGGING` is disabled and a value is given for
`VLLM_LOGGING_CONFIG_PATH`, an error will occur while starting vLLM.
### `VLLM_LOGGING_CONFIG_PATH`
`VLLM_LOGGING_CONFIG_PATH` allows users to specify a path to a JSON file of
alternative, custom logging configuration that will be used instead of vLLM's
built-in default logging configuration. The logging configuration should be
provided in JSON format following the schema specified by Python's [logging
configuration dictionary
schema](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details).
If `VLLM_LOGGING_CONFIG_PATH` is specified, but `VLLM_CONFIGURE_LOGGING` is
disabled, an error will occur while starting vLLM.
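Because the file must follow the `dictConfig` schema exactly, it can be useful to validate it before launching the server. A minimal sketch (the path is the same placeholder used in the examples below):
```python
import json
import logging.config

# Placeholder path; point this at the file passed via VLLM_LOGGING_CONFIG_PATH.
with open("/path/to/logging_config.json") as f:
    config = json.load(f)

# dictConfig() raises ValueError for configs that violate the schema,
# so malformed files fail fast here rather than at server startup.
logging.config.dictConfig(config)
print("logging configuration is valid")
```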
## Examples
### Example 1: Customize vLLM root logger
For this example, we will customize the vLLM root logger to use
[`python-json-logger`](https://github.com/madzak/python-json-logger) to log to
STDOUT of the console in JSON format with a log level of `INFO`.
To begin, create an appropriate JSON logging configuration file:
**/path/to/logging_config.json:**
```json
{
"formatters": {
"json": {
"class": "pythonjsonlogger.jsonlogger.JsonFormatter"
}
},
"handlers": {
"console": {
"class" : "logging.StreamHandler",
"formatter": "json",
"level": "INFO",
"stream": "ext://sys.stdout"
}
},
"loggers": {
"vllm": {
"handlers": ["console"],
"level": "INFO",
"propagate": false
}
},
"version": 1
}
```
Next, install the `python-json-logger` package if it's not already installed:
```bash
pip install python-json-logger
```
Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
to the path of the custom logging configuration JSON file:
```bash
VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
python3 -m vllm.entrypoints.openai.api_server \
--max-model-len 2048 \
--model mistralai/Mistral-7B-v0.1
```
### Example 2: Silence a particular vLLM logger
To silence a particular vLLM logger, it is necessary to provide custom logging
configuration for the target logger that stops it from propagating its log
messages to the root vLLM logger.
When custom configuration is provided for any logger, it is also necessary to
provide configuration for the root vLLM logger since any custom logger
configuration overrides the built-in default logging configuration used by vLLM.
First, create an appropriate JSON logging configuration file that includes
configuration for the root vLLM logger and for the logger you wish to silence:
**/path/to/logging_config.json:**
```json
{
"formatters": {
"vllm": {
"class": "vllm.logging.NewLineFormatter",
"datefmt": "%m-%d %H:%M:%S",
"format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
}
},
"handlers": {
"vllm": {
"class" : "logging.StreamHandler",
"formatter": "vllm",
"level": "INFO",
"stream": "ext://sys.stdout"
}
},
"loggers": {
"vllm": {
"handlers": ["vllm"],
"level": "DEBUG",
"propagage": false
},
"vllm.example_noisy_logger": {
"propagate": false
}
},
"version": 1
}
```
Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
to the path of the custom logging configuration JSON file:
```bash
VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
python3 -m vllm.entrypoints.openai.api_server \
--max-model-len 2048 \
--model mistralai/Mistral-7B-v0.1
```
### Example 3: Disable vLLM default logging configuration
To disable vLLM's default logging configuration and silence all vLLM loggers,
simply set `VLLM_CONFIGURE_LOGGING=0` when running vLLM. This will prevent vLLM
from configuring the root vLLM logger, which in turn silences all other vLLM
loggers.
```bash
VLLM_CONFIGURE_LOGGING=0 \
python3 -m vllm.entrypoints.openai.api_server \
--max-model-len 2048 \
--model mistralai/Mistral-7B-v0.1
```
## Additional resources
- [`logging.config` Dictionary Schema Details](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details)

@@ -873,289 +873,6 @@
],
"title": "Cache Utilization",
"type": "timeseries"
},
{
"type": "heatmap",
"title": "Request Prompt Length",
"description": "Heatmap of request prompt length",
"gridPos": {
"x": 0,
"y": 24,
"w": 12,
"h": 8
},
"datasource": {
"uid": "prometheus",
"type": "prometheus"
},
"id": 12,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"refId": "A",
"expr": "sum by(le) (increase(vllm:request_prompt_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))",
"range": true,
"instant": false,
"editorMode": "builder",
"legendFormat": "{{le}}",
"useBackend": false,
"disableTextWrap": false,
"fullMetaSearch": false,
"includeNullMetadata": true,
"format": "heatmap"
}
],
"options": {
"calculate": false,
"yAxis": {
"axisPlacement": "left",
"reverse": false,
"unit": "none",
"axisLabel": "Prompt Length"
},
"rowsFrame": {
"layout": "auto",
"value": "Request count"
},
"color": {
"mode": "scheme",
"fill": "dark-orange",
"scale": "exponential",
"exponent": 0.5,
"scheme": "Spectral",
"steps": 64,
"reverse": false,
"min": 0
},
"cellGap": 1,
"filterValues": {
"le": 1e-9
},
"tooltip": {
"show": true,
"yHistogram": true
},
"legend": {
"show": true
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"cellValues": {
"unit": "none"
}
},
"fieldConfig": {
"defaults": {
"custom": {
"scaleDistribution": {
"type": "linear"
},
"hideFrom": {
"tooltip": false,
"viz": false,
"legend": false
}
}
},
"overrides": []
},
"pluginVersion": "10.2.0"
},
{
"datasource": {
"uid": "prometheus",
"type": "prometheus"
},
"type": "heatmap",
"title": "Request Generation Length",
"description": "Heatmap of request generation length",
"gridPos": {
"x": 12,
"y": 24,
"w": 12,
"h": 8
},
"id": 13,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"refId": "A",
"expr": "sum by(le) (increase(vllm:request_generation_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))",
"range": true,
"instant": false,
"editorMode": "builder",
"legendFormat": "{{le}}",
"useBackend": false,
"disableTextWrap": false,
"fullMetaSearch": false,
"includeNullMetadata": true,
"format": "heatmap"
}
],
"options": {
"calculate": false,
"yAxis": {
"axisPlacement": "left",
"reverse": false,
"unit": "none",
"axisLabel": "Generation Length"
},
"rowsFrame": {
"layout": "auto",
"value": "Request count"
},
"color": {
"mode": "scheme",
"fill": "dark-orange",
"scale": "exponential",
"exponent": 0.5,
"scheme": "Spectral",
"steps": 64,
"reverse": false,
"min": 0
},
"cellGap": 1,
"filterValues": {
"le": 1e-9
},
"tooltip": {
"show": true,
"yHistogram": true
},
"legend": {
"show": true
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"cellValues": {
"unit": "none"
}
},
"fieldConfig": {
"defaults": {
"custom": {
"scaleDistribution": {
"type": "linear"
},
"hideFrom": {
"tooltip": false,
"viz": false,
"legend": false
}
}
},
"overrides": []
},
"pluginVersion": "10.2.0"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"barAlignment": 0,
"lineWidth": 1,
"fillOpacity": 0,
"gradientMode": "none",
"spanNulls": false,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 5,
"stacking": {
"mode": "none",
"group": "A"
},
"axisPlacement": "auto",
"axisLabel": "",
"axisColorMode": "text",
"axisBorderShow": false,
"scaleDistribution": {
"type": "linear"
},
"axisCenteredZero": false,
"hideFrom": {
"tooltip": false,
"viz": false,
"legend": false
},
"thresholdsStyle": {
"mode": "off"
}
},
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 32
},
"id": 11,
"options": {
"tooltip": {
"mode": "single",
"sort": "none"
},
"legend": {
"showLegend": true,
"displayMode": "list",
"placement": "bottom",
"calcs": []
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "sum by(finished_reason) (increase(vllm:request_success_total{model_name=\"$model_name\"}[$__rate_interval]))",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"interval": "",
"legendFormat": "__auto",
"range": true,
"refId": "A",
"useBackend": false
}
],
"title": "Finish Reason",
"description": "Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.",
"type": "timeseries"
}
],
"refresh": "",

@@ -16,8 +16,8 @@ from transformers import AutoConfig, PretrainedConfig
from vllm.distributed import initialize_model_parallel
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.model_executor.model_loader.tensorizer import TensorizerArgs
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.tensorizer_loader import TensorizerArgs
# yapf conflicts with isort for this docstring
# yapf: disable

@@ -94,21 +94,21 @@ echo 'vLLM yapf: Done'
# Run mypy
echo 'vLLM mypy:'
mypy vllm/attention --config-file pyproject.toml
mypy vllm/core --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml
mypy vllm/engine --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml
mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/attention/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/distributed/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/entrypoints/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/executor/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/usage/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/transformers_utils/*.py --follow-imports=skip --config-file pyproject.toml
# TODO(sang): Follow up
# mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
# mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
# mypy vllm/spec_decoding/*.py --follow-imports=skip --config-file pyproject.toml
# mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
# mypy vllm/lora/*.py --follow-imports=skip --config-file pyproject.toml
CODESPELL_EXCLUDES=(

patch_xformers.rocm.sh Normal file
@@ -0,0 +1,33 @@
#!/bin/bash
set -e
XFORMERS_VERSION="0.0.23"
export XFORMERS_INSTALLED_VERSION=$(python -c 'import xformers; print(xformers.__version__)')
if [ "$XFORMERS_INSTALLED_VERSION" != "$XFORMERS_VERSION" ]; then
echo "ERROR: xformers version must be ${XFORMERS_VERSION}. ${XFORMERS_INSTALLED_VERSION} is installed"
exit 1
fi
export XFORMERS_FMHA_FLASH_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.flash.__file__)')
export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.common.__file__)')
echo "XFORMERS_FMHA_FLASH_PATH = ${XFORMERS_FMHA_FLASH_PATH}"
echo "XFORMERS_FMHA_COMMON_PATH = ${XFORMERS_FMHA_COMMON_PATH}"
if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then
echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}"
patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch"
echo "Successfully patch ${XFORMERS_FMHA_FLASH_PATH}"
else
echo "${XFORMERS_FMHA_FLASH_PATH} was patched before"
fi
if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then
echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}"
patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch"
echo "Successfully patch ${XFORMERS_FMHA_COMMON_PATH}"
else
echo "${XFORMERS_FMHA_COMMON_PATH} was patched before"
fi

@@ -5,7 +5,7 @@ requires = [
"ninja",
"packaging",
"setuptools >= 49.4.0",
"torch == 2.3.0",
"torch == 2.2.1",
"wheel",
]
build-backend = "setuptools.build_meta"
@@ -32,7 +32,6 @@ select = [
"SIM",
# isort
# "I",
"G",
]
ignore = [
# star imports
@@ -47,17 +46,15 @@ ignore = [
python_version = "3.8"
ignore_missing_imports = true
check_untyped_defs = true
follow_imports = "skip"
check_untyped_defs = true
files = "vllm"
# TODO(woosuk): Include the code from Megatron and HuggingFace.
exclude = [
"vllm/model_executor/parallel_utils/|vllm/model_executor/models/",
# Ignore triton kernels in ops.
'vllm/attention/ops/.*\.py$'
]
[tool.codespell]
ignore-words-list = "dout, te, indicies"
skip = "./tests/prompts,./benchmarks/sonnet.txt"

@@ -3,5 +3,5 @@ cmake>=3.21
ninja
packaging
setuptools>=49.4.0
torch==2.3.0
torch==2.2.1
wheel

@@ -5,16 +5,13 @@ sentencepiece # Required for LLaMA tokenizer.
numpy
requests
py-cpuinfo
transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3.
tokenizers >= 0.19.1 # Required for Llama 3.
transformers >= 4.39.1 # Required for StarCoder2 & Llava.
fastapi
openai
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken == 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.9.8
lm-format-enforcer == 0.9.3
outlines == 0.0.34 # Requires torch >= 2.1.0
typing_extensions
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4

@@ -2,5 +2,5 @@
-r requirements-common.txt
# Dependencies for x86_64 CPUs
torch == 2.3.0+cpu
torch == 2.2.1+cpu
triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.

@@ -3,7 +3,7 @@
# Dependencies for NVIDIA GPUs
ray >= 2.9
nvidia-ml-py # for pynvml package
pynvml == 11.5.0
vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library
torch == 2.3.0
xformers == 0.0.26.post1 # Requires PyTorch 2.3.0
torch == 2.2.1
xformers == 0.0.25 # Requires PyTorch 2.2.1

@@ -14,17 +14,19 @@ types-setuptools
# testing
pytest
tensorizer==2.9.0
tensorizer==2.9.0a0
pytest-forked
pytest-asyncio
pytest-rerunfailures
pytest-shard
httpx
einops # required for MPT
openai
requests
ray
peft
awscli
ai2-olmo # required for OLMo
# Benchmarking
aiohttp

requirements-tpu.txt Normal file
@@ -0,0 +1,6 @@
# Common dependencies
-r requirements-common.txt
torch
jax[tpu] -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
flax >= 0.8

Some files were not shown because too many files have changed in this diff.