Compare commits
70 Commits
v1-blockta
...
correct-do
| Author | SHA1 | Date | |
|---|---|---|---|
| c1d1875ba3 | |||
| 973f5dc581 | |||
| c994223d56 | |||
| 869579a702 | |||
| c0efe92d8b | |||
| d9fa1c05ad | |||
| 2de197bdd4 | |||
| 869e829b85 | |||
| 8f37be38eb | |||
| 8082ad7950 | |||
| 1e4ce295ae | |||
| ce1917fcf2 | |||
| e512f76a89 | |||
| 898cdf033e | |||
| 0f3f3c86ec | |||
| b278557935 | |||
| 8ceffbf315 | |||
| d93d2d74fd | |||
| d0169e1b0f | |||
| 08fb75c72e | |||
| 91b361ae89 | |||
| e20c92bb61 | |||
| 32c9eff2ff | |||
| 4ca5d40adc | |||
| 9279b9f83d | |||
| ee77fdb5de | |||
| 996357e480 | |||
| 2a622d704a | |||
| 9c749713f6 | |||
| 022c5c6944 | |||
| f8fcca100b | |||
| 06bfb51963 | |||
| 408e560015 | |||
| 402d378360 | |||
| 9e764e7b10 | |||
| 33fc1e2e86 | |||
| eba17173d3 | |||
| 635b897246 | |||
| 4068f4b5b5 | |||
| 47831430cc | |||
| 65c08928c2 | |||
| ba214dffbe | |||
| eed11ebee9 | |||
| 300acb8347 | |||
| d91457d529 | |||
| fbf2564554 | |||
| d1d49397e7 | |||
| 9c93636d84 | |||
| e5d7ed0c53 | |||
| ad0d567e1c | |||
| bf0d97d786 | |||
| a655eb3025 | |||
| 1543914c04 | |||
| 61fed92c7e | |||
| 80c751e7f6 | |||
| e1a5c2f0a1 | |||
| fd3a62a122 | |||
| 07064cb1d4 | |||
| 2f1e8e8f54 | |||
| 68d37809b9 | |||
| 5dba257506 | |||
| 187e32997c | |||
| b55ed6ef8a | |||
| 2f385183f3 | |||
| 84c35c374a | |||
| 8c38ee7007 | |||
| b6087a6bee | |||
| 23c1b10a4c | |||
| a115ac46b5 | |||
| 73001445fb |
@ -1,5 +1,6 @@
|
||||
steps:
|
||||
- label: "Wait for container to be ready"
|
||||
key: wait-for-container-image
|
||||
agents:
|
||||
queue: A100
|
||||
plugins:
|
||||
@ -10,12 +11,11 @@ steps:
|
||||
command:
|
||||
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
|
||||
|
||||
- wait
|
||||
|
||||
- label: "A100"
|
||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||
agents:
|
||||
queue: A100
|
||||
depends_on: wait-for-container-image
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
@ -49,6 +49,7 @@ steps:
|
||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||
agents:
|
||||
queue: H200
|
||||
depends_on: wait-for-container-image
|
||||
plugins:
|
||||
- docker#v5.12.0:
|
||||
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
|
||||
@ -73,7 +74,7 @@ steps:
|
||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||
agents:
|
||||
queue: H100
|
||||
depends_on: ~
|
||||
depends_on: wait-for-container-image
|
||||
plugins:
|
||||
- docker#v5.12.0:
|
||||
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
|
||||
|
||||
@ -9,31 +9,31 @@ CORE_RANGE=${CORE_RANGE:-48-95}
|
||||
NUMA_NODE=${NUMA_NODE:-1}
|
||||
|
||||
# Try building the docker image
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
|
||||
remove_docker_container() { docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
|
||||
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
|
||||
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
|
||||
|
||||
function cpu_tests() {
|
||||
set -e
|
||||
export NUMA_NODE=$2
|
||||
|
||||
# offline inference
|
||||
docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference.py"
|
||||
|
||||
# Run basic model test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pip install pytest pytest-asyncio \
|
||||
decord einops librosa peft Pillow sentence-transformers soundfile \
|
||||
@ -46,26 +46,26 @@ function cpu_tests() {
|
||||
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
|
||||
|
||||
# Run compressed-tensor test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
|
||||
|
||||
# Run AWQ test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/quantization/test_ipex_quant.py"
|
||||
|
||||
# Run chunked-prefill and prefix-cache test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v -k cpu_model \
|
||||
tests/basic_correctness/test_chunked_prefill.py"
|
||||
|
||||
# online inference
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
export VLLM_CPU_KVCACHE_SPACE=10
|
||||
export VLLM_CPU_OMP_THREADS_BIND=$1
|
||||
|
||||
@ -3,6 +3,18 @@
|
||||
# This script build the Neuron docker image and run the API server inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -e
|
||||
set -v
|
||||
|
||||
image_name="neuron/vllm-ci"
|
||||
container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
|
||||
HF_CACHE="$(realpath ~)/huggingface"
|
||||
mkdir -p "${HF_CACHE}"
|
||||
HF_MOUNT="/root/.cache/huggingface"
|
||||
|
||||
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
|
||||
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
|
||||
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
|
||||
|
||||
# Try building the docker image
|
||||
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
|
||||
@ -13,41 +25,30 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
|
||||
last_build=$(cat /tmp/neuron-docker-build-timestamp)
|
||||
current_time=$(date +%s)
|
||||
if [ $((current_time - last_build)) -gt 86400 ]; then
|
||||
docker image prune -f
|
||||
docker system prune -f
|
||||
rm -rf "${HF_MOUNT:?}/*"
|
||||
rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
|
||||
echo "$current_time" > /tmp/neuron-docker-build-timestamp
|
||||
fi
|
||||
else
|
||||
date "+%s" > /tmp/neuron-docker-build-timestamp
|
||||
fi
|
||||
|
||||
docker build -t neuron -f Dockerfile.neuron .
|
||||
docker build -t "${image_name}" -f Dockerfile.neuron .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() { docker rm -f neuron || true; }
|
||||
remove_docker_container() {
|
||||
docker image rm -f "${image_name}" || true;
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image
|
||||
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
|
||||
--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
|
||||
|
||||
# Wait for the server to start
|
||||
wait_for_server_to_start() {
|
||||
timeout=300
|
||||
counter=0
|
||||
|
||||
while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
|
||||
sleep 1
|
||||
counter=$((counter + 1))
|
||||
if [ $counter -ge $timeout ]; then
|
||||
echo "Timeout after $timeout seconds"
|
||||
break
|
||||
fi
|
||||
done
|
||||
}
|
||||
wait_for_server_to_start
|
||||
|
||||
# Test a simple prompt
|
||||
curl -X POST -H "Content-Type: application/json" \
|
||||
localhost:8000/generate \
|
||||
-d '{"prompt": "San Francisco is a"}'
|
||||
docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||
--name "${container_name}" \
|
||||
${image_name} \
|
||||
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"
|
||||
|
||||
@ -242,7 +242,7 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
|
||||
parallelism: 4
|
||||
|
||||
- label: "PyTorch Fullgraph Smoke Test" # 9min
|
||||
@ -363,12 +363,14 @@ steps:
|
||||
- tests/models/decoder_only/audio_language
|
||||
- tests/models/decoder_only/vision_language
|
||||
- tests/models/embedding/vision_language
|
||||
- tests/models/encoder_decoder/audio_language
|
||||
- tests/models/encoder_decoder/vision_language
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
|
||||
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
|
||||
- pytest -v -s models/embedding/vision_language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/audio_language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/vision_language -m core_model
|
||||
|
||||
@ -533,6 +535,7 @@ steps:
|
||||
# requires multi-GPU testing for validation.
|
||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_minicpmv_tp.py
|
||||
|
||||
|
||||
- label: Weight Loading Multiple GPU Test # 33min
|
||||
|
||||
2
.github/ISSUE_TEMPLATE/600-new-model.yml
vendored
2
.github/ISSUE_TEMPLATE/600-new-model.yml
vendored
@ -9,7 +9,7 @@ body:
|
||||
value: >
|
||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||
|
||||
#### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
|
||||
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: The model to consider.
|
||||
|
||||
@ -550,7 +550,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn
|
||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||
GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb
|
||||
GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c
|
||||
GIT_PROGRESS TRUE
|
||||
# Don't share the vllm-flash-attn build between build types
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
|
||||
13
Dockerfile
13
Dockerfile
@ -234,8 +234,8 @@ RUN mv vllm test_docs/
|
||||
#################### TEST IMAGE ####################
|
||||
|
||||
#################### OPENAI API SERVER ####################
|
||||
# openai api server alternative
|
||||
FROM vllm-base AS vllm-openai
|
||||
# base openai image with additional requirements, for any subsequent openai-style images
|
||||
FROM vllm-base AS vllm-openai-base
|
||||
|
||||
# install additional dependencies for openai api server
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
@ -247,5 +247,14 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
|
||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||
|
||||
# define sagemaker first, so it is not default from `docker build`
|
||||
FROM vllm-openai-base AS vllm-sagemaker
|
||||
|
||||
COPY examples/sagemaker-entrypoint.sh .
|
||||
RUN chmod +x sagemaker-entrypoint.sh
|
||||
ENTRYPOINT ["./sagemaker-entrypoint.sh"]
|
||||
|
||||
FROM vllm-openai-base AS vllm-openai
|
||||
|
||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
#################### OPENAI API SERVER ####################
|
||||
|
||||
@ -15,8 +15,8 @@ RUN apt-get update && \
|
||||
ffmpeg libsm6 libxext6 libgl1
|
||||
|
||||
### Mount Point ###
|
||||
# When launching the container, mount the code directory to /app
|
||||
ARG APP_MOUNT=/app
|
||||
# When launching the container, mount the code directory to /workspace
|
||||
ARG APP_MOUNT=/workspace
|
||||
VOLUME [ ${APP_MOUNT} ]
|
||||
WORKDIR ${APP_MOUNT}/vllm
|
||||
|
||||
@ -25,6 +25,7 @@ RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
|
||||
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
|
||||
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
||||
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
||||
RUN python3 -m pip install pytest
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
@ -42,4 +43,7 @@ RUN --mount=type=bind,source=.git,target=.git \
|
||||
# install development dependencies (for testing)
|
||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
|
||||
# overwrite entrypoint to run bash script
|
||||
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@ -77,7 +77,7 @@ pip install vllm
|
||||
Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
|
||||
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
|
||||
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
|
||||
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
|
||||
- [List of Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
|
||||
|
||||
## Contributing
|
||||
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
|
||||
If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
|
||||
|
||||
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
|
||||
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/contributing/vulnerability_management/).
|
||||
|
||||
---
|
||||
|
||||
|
||||
@ -53,7 +53,7 @@ void paged_attention_v1_launcher(
|
||||
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
|
||||
const std::optional<torch::Tensor>& alibi_slopes, float k_scale,
|
||||
float v_scale, const int tp_rank, const int blocksparse_local_blocks,
|
||||
const int blocksparse_vert_stride, const int blocksparse_block_size,
|
||||
const int blocksparse_head_sliding_step) {
|
||||
@ -176,7 +176,7 @@ void paged_attention_v1(
|
||||
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
torch::Tensor& seq_lens, // [num_seqs]
|
||||
int64_t block_size, int64_t max_seq_len,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double k_scale, double v_scale,
|
||||
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
|
||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||
|
||||
@ -54,7 +54,7 @@ void paged_attention_v2_launcher(
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
|
||||
const std::optional<torch::Tensor>& alibi_slopes, float k_scale,
|
||||
float v_scale, const int tp_rank, const int blocksparse_local_blocks,
|
||||
const int blocksparse_vert_stride, const int blocksparse_block_size,
|
||||
const int blocksparse_head_sliding_step) {
|
||||
@ -187,7 +187,7 @@ void paged_attention_v2(
|
||||
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
torch::Tensor& seq_lens, // [num_seqs]
|
||||
int64_t block_size, int64_t max_seq_len,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double k_scale, double v_scale,
|
||||
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
|
||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||
|
||||
@ -386,7 +386,7 @@ void paged_attention_v1_impl_launcher(
|
||||
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes) {
|
||||
const std::optional<torch::Tensor>& alibi_slopes) {
|
||||
int num_seqs = query.size(0);
|
||||
int num_heads = query.size(1);
|
||||
int head_size = query.size(2);
|
||||
@ -459,7 +459,7 @@ void paged_attention_v1(
|
||||
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
|
||||
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double k_scale, double v_scale,
|
||||
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
|
||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||
@ -702,7 +702,7 @@ void paged_attention_v2_impl_launcher(
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
|
||||
int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes) {
|
||||
int max_seq_len, const std::optional<torch::Tensor>& alibi_slopes) {
|
||||
int num_seqs = query.size(0);
|
||||
int num_heads = query.size(1);
|
||||
int head_size = query.size(2);
|
||||
@ -781,7 +781,7 @@ void paged_attention_v2(
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
|
||||
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double k_scale, double v_scale,
|
||||
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
|
||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||
|
||||
@ -359,7 +359,7 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major
|
||||
const torch::Tensor& b, // [IC, OC], column-major
|
||||
const torch::Tensor& a_scales, // [1] or [M]
|
||||
const torch::Tensor& b_scales, // [1] or [OC]
|
||||
const c10::optional<torch::Tensor>& bias // [OC]
|
||||
const std::optional<torch::Tensor>& bias // [OC]
|
||||
) {
|
||||
CPU_KERNEL_GUARD_IN(cutlass_scaled_mm)
|
||||
// Checks for conformality
|
||||
@ -442,8 +442,8 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major
|
||||
const torch::Tensor& a_scales, // [1] or [M]
|
||||
const torch::Tensor& b_scales, // [1] or [OC]
|
||||
const torch::Tensor& azp_adj, // [OC]
|
||||
const c10::optional<torch::Tensor>& azp, // [1] or [M]
|
||||
const c10::optional<torch::Tensor>& bias // [OC]
|
||||
const std::optional<torch::Tensor>& azp, // [1] or [M]
|
||||
const std::optional<torch::Tensor>& bias // [OC]
|
||||
) {
|
||||
CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp)
|
||||
// Checks for conformality
|
||||
@ -561,7 +561,7 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major
|
||||
void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
|
||||
const torch::Tensor& input, // [..., hidden_size]
|
||||
const torch::Tensor& scale,
|
||||
c10::optional<torch::Tensor> const& azp) {
|
||||
std::optional<torch::Tensor> const& azp) {
|
||||
CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
|
||||
TORCH_CHECK(input.is_contiguous());
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
@ -590,7 +590,7 @@ void dynamic_scaled_int8_quant(
|
||||
torch::Tensor& out, // [..., hidden_size]
|
||||
const torch::Tensor& input, // [..., hidden_size]
|
||||
torch::Tensor& scale, // [..., 1]
|
||||
c10::optional<torch::Tensor> const& azp) {
|
||||
std::optional<torch::Tensor> const& azp) {
|
||||
CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
|
||||
TORCH_CHECK(input.is_contiguous());
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
|
||||
@ -9,14 +9,14 @@ std::string init_cpu_threads_env(const std::string& cpu_ids);
|
||||
void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
|
||||
const torch::Tensor& b, const torch::Tensor& a_scales,
|
||||
const torch::Tensor& b_scales,
|
||||
const c10::optional<torch::Tensor>& bias);
|
||||
const std::optional<torch::Tensor>& bias);
|
||||
|
||||
void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a,
|
||||
const torch::Tensor& b, const torch::Tensor& a_scales,
|
||||
const torch::Tensor& b_scales,
|
||||
const torch::Tensor& azp_adj,
|
||||
const c10::optional<torch::Tensor>& azp,
|
||||
const c10::optional<torch::Tensor>& bias);
|
||||
const std::optional<torch::Tensor>& azp,
|
||||
const std::optional<torch::Tensor>& bias);
|
||||
|
||||
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
// vLLM custom ops
|
||||
|
||||
@ -68,7 +68,7 @@ struct ScaledEpilogueBase {
|
||||
// This overload handles the case where there might not be a tensor, in which
|
||||
// case a nullptr is passed and a constant (0) is used.
|
||||
template <typename Descriptor, typename T>
|
||||
static auto args_from_tensor(c10::optional<torch::Tensor> const& tensor) {
|
||||
static auto args_from_tensor(std::optional<torch::Tensor> const& tensor) {
|
||||
static_assert(std::is_same_v<Descriptor, RowOrZeroLoad<T>>);
|
||||
using Arguments = typename Descriptor::Arguments;
|
||||
auto* data_ptr = tensor ? static_cast<T*>(tensor->data_ptr()) : nullptr;
|
||||
@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp
|
||||
static ArgumentType prepare_args(torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
|
||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
|
||||
@ -301,7 +301,7 @@ struct ScaledEpilogueBiasAzpToken
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
torch::Tensor const& azp,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
|
||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
|
||||
|
||||
@ -67,7 +67,7 @@ struct ScaledEpilogueBase {
|
||||
// This overload handles the case where there might not be a tensor, in which
|
||||
// case a nullptr is passed and a constant (0) is used.
|
||||
template <typename Descriptor, typename T>
|
||||
static auto args_from_tensor(c10::optional<torch::Tensor> const& tensor) {
|
||||
static auto args_from_tensor(std::optional<torch::Tensor> const& tensor) {
|
||||
using Arguments = typename Descriptor::Arguments;
|
||||
auto* data_ptr = tensor ? static_cast<T*>(tensor->data_ptr()) : nullptr;
|
||||
static_assert(std::is_same_v<Descriptor, ColLoad<T, true>> ||
|
||||
@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp
|
||||
static ArgumentType prepare_args(torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
|
||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
|
||||
@ -299,7 +299,7 @@ struct ScaledEpilogueBiasAzpToken
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
torch::Tensor const& azp,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
|
||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
|
||||
|
||||
@ -97,7 +97,7 @@ static inline auto make_cute_layout(torch::Tensor const& tensor,
|
||||
|
||||
template <typename Stride>
|
||||
static inline auto maybe_make_cute_layout(
|
||||
c10::optional<torch::Tensor> const& tensor,
|
||||
std::optional<torch::Tensor> const& tensor,
|
||||
std::string_view name = "tensor") {
|
||||
using Layout = decltype(make_cute_layout<Stride>(*tensor));
|
||||
|
||||
|
||||
@ -53,12 +53,12 @@ void set_conv_params_fwd(ConvParamsBase &params,
|
||||
const at::Tensor x,
|
||||
const at::Tensor weight,
|
||||
const at::Tensor out,
|
||||
const c10::optional<at::Tensor>& bias,
|
||||
const std::optional<at::Tensor>& bias,
|
||||
bool silu_activation,
|
||||
int64_t pad_slot_id,
|
||||
const c10::optional<at::Tensor>& query_start_loc = std::nullopt,
|
||||
const c10::optional<at::Tensor>& cache_indices = std::nullopt,
|
||||
const c10::optional<at::Tensor>& has_initial_state = std::nullopt) {
|
||||
const std::optional<at::Tensor>& query_start_loc = std::nullopt,
|
||||
const std::optional<at::Tensor>& cache_indices = std::nullopt,
|
||||
const std::optional<at::Tensor>& has_initial_state = std::nullopt) {
|
||||
|
||||
// Reset the parameters
|
||||
memset(&params, 0, sizeof(params));
|
||||
@ -93,11 +93,11 @@ void set_conv_params_fwd(ConvParamsBase &params,
|
||||
|
||||
|
||||
void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
|
||||
const c10::optional<at::Tensor> &bias_,
|
||||
const c10::optional<at::Tensor> &conv_states,
|
||||
const c10::optional<at::Tensor> &query_start_loc,
|
||||
const c10::optional<at::Tensor> &cache_indices,
|
||||
const c10::optional<at::Tensor> &has_initial_state,
|
||||
const std::optional<at::Tensor> &bias_,
|
||||
const std::optional<at::Tensor> &conv_states,
|
||||
const std::optional<at::Tensor> &query_start_loc,
|
||||
const std::optional<at::Tensor> &cache_indices,
|
||||
const std::optional<at::Tensor> &has_initial_state,
|
||||
bool silu_activation,
|
||||
// used to identify padding entries if cache_indices provided
|
||||
// in case of padding, the kernel will return early
|
||||
@ -194,10 +194,10 @@ void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
|
||||
void causal_conv1d_update(const at::Tensor &x,
|
||||
const at::Tensor &conv_state,
|
||||
const at::Tensor &weight,
|
||||
const c10::optional<at::Tensor> &bias_,
|
||||
const std::optional<at::Tensor> &bias_,
|
||||
bool silu_activation,
|
||||
const c10::optional<at::Tensor> &cache_seqlens_,
|
||||
const c10::optional<at::Tensor> &conv_state_indices_,
|
||||
const std::optional<at::Tensor> &cache_seqlens_,
|
||||
const std::optional<at::Tensor> &conv_state_indices_,
|
||||
// used to identify padding entries if cache_indices provided
|
||||
// in case of padding, the kernel will return early
|
||||
int64_t pad_slot_id) {
|
||||
|
||||
@ -402,14 +402,14 @@ void set_ssm_params_fwd(SSMParamsBase &params,
|
||||
const torch::Tensor out,
|
||||
const torch::Tensor z,
|
||||
const torch::Tensor out_z,
|
||||
const c10::optional<at::Tensor>& D,
|
||||
const c10::optional<at::Tensor>& delta_bias,
|
||||
const std::optional<at::Tensor>& D,
|
||||
const std::optional<at::Tensor>& delta_bias,
|
||||
const torch::Tensor ssm_states,
|
||||
bool has_z,
|
||||
bool delta_softplus,
|
||||
const c10::optional<at::Tensor>& query_start_loc,
|
||||
const c10::optional<at::Tensor>& cache_indices,
|
||||
const c10::optional<at::Tensor>& has_initial_state,
|
||||
const std::optional<at::Tensor>& query_start_loc,
|
||||
const std::optional<at::Tensor>& cache_indices,
|
||||
const std::optional<at::Tensor>& has_initial_state,
|
||||
bool varlen,
|
||||
int64_t pad_slot_id) {
|
||||
|
||||
@ -504,13 +504,13 @@ void set_ssm_params_fwd(SSMParamsBase &params,
|
||||
|
||||
void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
|
||||
const torch::Tensor &A, const torch::Tensor &B, const torch::Tensor &C,
|
||||
const c10::optional<torch::Tensor> &D_,
|
||||
const c10::optional<torch::Tensor> &z_,
|
||||
const c10::optional<torch::Tensor> &delta_bias_,
|
||||
const std::optional<torch::Tensor> &D_,
|
||||
const std::optional<torch::Tensor> &z_,
|
||||
const std::optional<torch::Tensor> &delta_bias_,
|
||||
bool delta_softplus,
|
||||
const c10::optional<torch::Tensor> &query_start_loc,
|
||||
const c10::optional<torch::Tensor> &cache_indices,
|
||||
const c10::optional<torch::Tensor> &has_initial_state,
|
||||
const std::optional<torch::Tensor> &query_start_loc,
|
||||
const std::optional<torch::Tensor> &cache_indices,
|
||||
const std::optional<torch::Tensor> &has_initial_state,
|
||||
const torch::Tensor &ssm_states,
|
||||
// used to identify padding entries if cache_indices provided
|
||||
// in case of padding, the kernel will return early
|
||||
|
||||
46
csrc/ops.h
46
csrc/ops.h
@ -33,7 +33,7 @@ void paged_attention_v1(
|
||||
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
|
||||
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double k_scale, double v_scale,
|
||||
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
|
||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||
@ -44,7 +44,7 @@ void paged_attention_v2(
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
|
||||
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double k_scale, double v_scale,
|
||||
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
|
||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||
@ -153,15 +153,15 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
|
||||
void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& b, torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
|
||||
void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
|
||||
bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability);
|
||||
|
||||
@ -169,7 +169,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& b, torch::Tensor const& e,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
|
||||
bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed,
|
||||
torch::Tensor& e, torch::Tensor const& a);
|
||||
@ -177,11 +177,11 @@ bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed,
|
||||
|
||||
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||
torch::Tensor const& scale,
|
||||
c10::optional<torch::Tensor> const& azp);
|
||||
std::optional<torch::Tensor> const& azp);
|
||||
|
||||
void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||
torch::Tensor& scales,
|
||||
c10::optional<torch::Tensor> const& azp);
|
||||
std::optional<torch::Tensor> const& azp);
|
||||
|
||||
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
|
||||
torch::Tensor b_gptq_qzeros,
|
||||
@ -198,34 +198,34 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||
|
||||
void dynamic_per_token_scaled_fp8_quant(
|
||||
torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
|
||||
c10::optional<torch::Tensor> const& scale_ub);
|
||||
std::optional<torch::Tensor> const& scale_ub);
|
||||
|
||||
void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
|
||||
const torch::Tensor& A, const torch::Tensor& B,
|
||||
const torch::Tensor& C,
|
||||
const c10::optional<torch::Tensor>& D_,
|
||||
const c10::optional<torch::Tensor>& z_,
|
||||
const c10::optional<torch::Tensor>& delta_bias_,
|
||||
const std::optional<torch::Tensor>& D_,
|
||||
const std::optional<torch::Tensor>& z_,
|
||||
const std::optional<torch::Tensor>& delta_bias_,
|
||||
bool delta_softplus,
|
||||
const c10::optional<torch::Tensor>& query_start_loc,
|
||||
const c10::optional<torch::Tensor>& cache_indices,
|
||||
const c10::optional<torch::Tensor>& has_initial_state,
|
||||
const std::optional<torch::Tensor>& query_start_loc,
|
||||
const std::optional<torch::Tensor>& cache_indices,
|
||||
const std::optional<torch::Tensor>& has_initial_state,
|
||||
const torch::Tensor& ssm_states, int64_t pad_slot_id);
|
||||
|
||||
void causal_conv1d_update(const at::Tensor& x, const at::Tensor& conv_state,
|
||||
const at::Tensor& weight,
|
||||
const c10::optional<at::Tensor>& bias_,
|
||||
const std::optional<at::Tensor>& bias_,
|
||||
bool silu_activation,
|
||||
const c10::optional<at::Tensor>& cache_seqlens_,
|
||||
const c10::optional<at::Tensor>& conv_state_indices_,
|
||||
const std::optional<at::Tensor>& cache_seqlens_,
|
||||
const std::optional<at::Tensor>& conv_state_indices_,
|
||||
int64_t pad_slot_id);
|
||||
|
||||
void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
|
||||
const c10::optional<at::Tensor>& bias_,
|
||||
const c10::optional<at::Tensor>& conv_states,
|
||||
const c10::optional<at::Tensor>& query_start_loc,
|
||||
const c10::optional<at::Tensor>& cache_indices,
|
||||
const c10::optional<at::Tensor>& has_initial_state,
|
||||
const std::optional<at::Tensor>& bias_,
|
||||
const std::optional<at::Tensor>& conv_states,
|
||||
const std::optional<at::Tensor>& query_start_loc,
|
||||
const std::optional<at::Tensor>& cache_indices,
|
||||
const std::optional<at::Tensor>& has_initial_state,
|
||||
bool silu_activation, int64_t pad_slot_id);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
|
||||
@ -226,7 +226,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
|
||||
void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
|
||||
torch::Tensor const& input, // [..., hidden_size]
|
||||
torch::Tensor const& scale,
|
||||
c10::optional<torch::Tensor> const& azp) {
|
||||
std::optional<torch::Tensor> const& azp) {
|
||||
TORCH_CHECK(input.is_contiguous());
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
TORCH_CHECK(scale.numel() == 1);
|
||||
@ -257,7 +257,7 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
|
||||
void dynamic_scaled_int8_quant(
|
||||
torch::Tensor& out, // [..., hidden_size]
|
||||
torch::Tensor const& input, // [..., hidden_size]
|
||||
torch::Tensor& scales, c10::optional<torch::Tensor> const& azp) {
|
||||
torch::Tensor& scales, std::optional<torch::Tensor> const& azp) {
|
||||
TORCH_CHECK(input.is_contiguous());
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
TORCH_CHECK(scales.is_contiguous());
|
||||
|
||||
@ -39,7 +39,7 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
if (bias) {
|
||||
@ -58,8 +58,8 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
|
||||
@ -94,7 +94,7 @@ void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
if (bias) {
|
||||
@ -113,8 +113,8 @@ void cutlass_scaled_mm_azp_sm80(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
|
||||
@ -165,7 +165,7 @@ void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
if (bias) {
|
||||
@ -184,8 +184,8 @@ void cutlass_scaled_mm_azp_sm89(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
|
||||
|
||||
@ -51,7 +51,7 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
if (bias) {
|
||||
@ -70,8 +70,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
|
||||
|
||||
@ -9,26 +9,26 @@ void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
|
||||
void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
|
||||
void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
|
||||
#if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
|
||||
void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
#endif
|
||||
|
||||
void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
|
||||
@ -36,24 +36,24 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
|
||||
void cutlass_scaled_mm_azp_sm80(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
|
||||
void cutlass_scaled_mm_azp_sm89(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
|
||||
#if defined CUDA_VERSION && CUDA_VERSION >= 12000
|
||||
void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a,
|
||||
@ -61,8 +61,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
#endif
|
||||
|
||||
bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
|
||||
@ -84,7 +84,7 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
|
||||
void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& b, torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
// Checks for conformality
|
||||
TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
|
||||
TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
|
||||
@ -148,8 +148,8 @@ void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
// Checks for conformality
|
||||
TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
|
||||
TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
|
||||
|
||||
@ -834,6 +834,7 @@ __global__ void Marlin(
|
||||
int4* sh_g_idx = sh_b + (stages * b_sh_stage);
|
||||
int4* sh_zp = sh_g_idx + (stages * g_idx_stage);
|
||||
int4* sh_s = sh_zp + (stages * zp_sh_stage);
|
||||
int4* sh_red = sh_s + (stages * s_sh_stage);
|
||||
|
||||
// Register storage for double buffer of shared memory reads.
|
||||
FragA frag_a[2][thread_m_blocks];
|
||||
@ -932,11 +933,11 @@ __global__ void Marlin(
|
||||
int4* sh_s_stage = sh_s + s_sh_stage * pipe;
|
||||
|
||||
if constexpr (group_blocks >= thread_k_blocks) {
|
||||
if (s_sh_wr_pred) {
|
||||
cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
|
||||
}
|
||||
// Only fetch scales if this tile starts a new group
|
||||
if (pipe % (group_blocks / thread_k_blocks) == 0) {
|
||||
if (s_sh_wr_pred) {
|
||||
cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
|
||||
}
|
||||
if ((pipe + 1) % (group_blocks / thread_k_blocks) == 0) {
|
||||
s_gl_rd += s_gl_rd_delta;
|
||||
}
|
||||
} else {
|
||||
@ -1038,9 +1039,7 @@ __global__ void Marlin(
|
||||
// No act-order case
|
||||
if constexpr (group_blocks != -1) {
|
||||
if constexpr (group_blocks >= thread_k_blocks) {
|
||||
int4* sh_s_stage =
|
||||
sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
|
||||
(pipe / (group_blocks / thread_k_blocks)));
|
||||
int4* sh_s_stage = sh_s + s_sh_stage * pipe;
|
||||
reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
|
||||
} else {
|
||||
int warp_id = threadIdx.x / 32;
|
||||
@ -1339,15 +1338,15 @@ __global__ void Marlin(
|
||||
int red_sh_wr =
|
||||
red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
|
||||
if (i < red_off) {
|
||||
float* c_rd =
|
||||
reinterpret_cast<float*>(&sh[red_sh_delta * j + red_sh_rd]);
|
||||
float* c_wr = reinterpret_cast<float*>(&sh[red_sh_wr]);
|
||||
float* c_rd = reinterpret_cast<float*>(
|
||||
&sh_red[red_sh_delta * j + red_sh_rd]);
|
||||
float* c_wr = reinterpret_cast<float*>(&sh_red[red_sh_wr]);
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 4; k++)
|
||||
reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] +=
|
||||
c_rd[k] + c_wr[k];
|
||||
}
|
||||
sh[red_sh_wr] =
|
||||
sh_red[red_sh_wr] =
|
||||
reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
|
||||
}
|
||||
}
|
||||
@ -1357,7 +1356,7 @@ __global__ void Marlin(
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4 * 2; i++) {
|
||||
float* c_rd =
|
||||
reinterpret_cast<float*>(&sh[red_sh_delta * i + red_sh_rd]);
|
||||
reinterpret_cast<float*>(&sh_red[red_sh_delta * i + red_sh_rd]);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 4; j++)
|
||||
reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] +=
|
||||
@ -1397,7 +1396,7 @@ __global__ void Marlin(
|
||||
#pragma unroll
|
||||
for (int i = 0; i < thread_m_blocks * 4; i++) {
|
||||
cp_async4_pred(
|
||||
&sh[c_sh_wr + c_sh_wr_delta * i],
|
||||
&sh_red[c_sh_wr + c_sh_wr_delta * i],
|
||||
&C[c_gl_wr + c_gl_wr_delta_o * (i / 2) +
|
||||
c_gl_wr_delta_i * (i % 2)],
|
||||
i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m);
|
||||
@ -1410,7 +1409,7 @@ __global__ void Marlin(
|
||||
for (int i = 0; i < thread_m_blocks * 4; i++) {
|
||||
if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) {
|
||||
if (!first) {
|
||||
int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
|
||||
int4 c_red = sh_red[c_sh_wr + i * c_sh_wr_delta];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 2 * 4; j++) {
|
||||
reinterpret_cast<float*>(
|
||||
@ -1461,10 +1460,10 @@ __global__ void Marlin(
|
||||
float* frag_c_ptr = reinterpret_cast<float*>(&frag_c);
|
||||
#pragma unroll
|
||||
for (int k = 0; k < th_size; k++) {
|
||||
sh[threadIdx.x] =
|
||||
sh_red[threadIdx.x] =
|
||||
C_tmp[c_cur_offset + active_threads * k + threadIdx.x];
|
||||
|
||||
float* sh_c_ptr = reinterpret_cast<float*>(&sh[threadIdx.x]);
|
||||
float* sh_c_ptr = reinterpret_cast<float*>(&sh_red[threadIdx.x]);
|
||||
#pragma unroll
|
||||
for (int f = 0; f < 4; f++) {
|
||||
frag_c_ptr[k * 4 + f] += sh_c_ptr[f];
|
||||
@ -1515,7 +1514,7 @@ __global__ void Marlin(
|
||||
res = __hmul2(res, s[0]);
|
||||
}
|
||||
|
||||
((scalar_t2*)sh)[idx] = res;
|
||||
((scalar_t2*)sh_red)[idx] = res;
|
||||
};
|
||||
|
||||
if (threadIdx.x / 32 < thread_n_blocks / 4) {
|
||||
@ -1543,7 +1542,7 @@ __global__ void Marlin(
|
||||
i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
|
||||
i++) {
|
||||
if (c_gl_wr < c_gl_wr_end) {
|
||||
C[c_gl_wr] = sh[c_sh_rd];
|
||||
C[c_gl_wr] = sh_red[c_sh_rd];
|
||||
c_gl_wr += c_gl_wr_delta;
|
||||
c_sh_rd += c_sh_rd_delta;
|
||||
}
|
||||
@ -1865,9 +1864,12 @@ bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks,
|
||||
|
||||
float pipe_size = (a_size + b_size) * pipe_stages;
|
||||
|
||||
float reduce_size = max(th_config.num_threads * 32 * 4,
|
||||
(tb_n / 64) * 32 * (tb_max_m / 16) * 4 * 2 * 4 * 2);
|
||||
|
||||
TORCH_CHECK(max_shared_mem / 2 > scales_cache_size); // Sanity
|
||||
|
||||
return pipe_size < 0.95f * (max_shared_mem - scales_cache_size);
|
||||
return pipe_size + reduce_size < 0.95f * (max_shared_mem - scales_cache_size);
|
||||
}
|
||||
|
||||
bool is_valid_config(thread_config_t const& th_config, int max_m_blocks,
|
||||
|
||||
@ -63,7 +63,7 @@ torch::Tensor mm_dispatch_{{type_sig}}(MMArgs args) {
|
||||
|
||||
|
||||
static inline std::optional<at::ScalarType> maybe_scalartype(
|
||||
c10::optional<at::Tensor> const& t) {
|
||||
std::optional<at::Tensor> const& t) {
|
||||
if (!t) {
|
||||
return std::nullopt;
|
||||
} else {
|
||||
|
||||
@ -183,11 +183,11 @@ struct MacheteKernelTemplate {
|
||||
torch::Tensor const& A, // MxK matrix
|
||||
torch::Tensor const& B, // KxN prepacked matrix
|
||||
torch::Tensor& D, // MxN matrix
|
||||
c10::optional<torch::Tensor> const& maybe_g_scales, // scale_KxN matrix
|
||||
c10::optional<torch::Tensor> const& maybe_g_zeros, // scale_KxN matrix
|
||||
c10::optional<int64_t> maybe_group_size,
|
||||
c10::optional<torch::Tensor> const& maybe_ch_scales, // len N vector
|
||||
c10::optional<torch::Tensor> const& maybe_tok_scales) // len M vector
|
||||
std::optional<torch::Tensor> const& maybe_g_scales, // scale_KxN matrix
|
||||
std::optional<torch::Tensor> const& maybe_g_zeros, // scale_KxN matrix
|
||||
std::optional<int64_t> maybe_group_size,
|
||||
std::optional<torch::Tensor> const& maybe_ch_scales, // len N vector
|
||||
std::optional<torch::Tensor> const& maybe_tok_scales) // len M vector
|
||||
{
|
||||
static_assert(!with_group_zeropoints || with_group_scales);
|
||||
|
||||
|
||||
@ -13,23 +13,23 @@ struct MMArgs {
|
||||
torch::Tensor const& A;
|
||||
torch::Tensor const& B;
|
||||
vllm::ScalarType const& b_type;
|
||||
c10::optional<at::ScalarType> const& maybe_out_type;
|
||||
c10::optional<torch::Tensor> const& maybe_group_scales;
|
||||
c10::optional<torch::Tensor> const& maybe_group_zeros;
|
||||
c10::optional<int64_t> maybe_group_size;
|
||||
c10::optional<torch::Tensor> const& maybe_channel_scales;
|
||||
c10::optional<torch::Tensor> const& maybe_token_scales;
|
||||
c10::optional<std::string> maybe_schedule;
|
||||
std::optional<at::ScalarType> const& maybe_out_type;
|
||||
std::optional<torch::Tensor> const& maybe_group_scales;
|
||||
std::optional<torch::Tensor> const& maybe_group_zeros;
|
||||
std::optional<int64_t> maybe_group_size;
|
||||
std::optional<torch::Tensor> const& maybe_channel_scales;
|
||||
std::optional<torch::Tensor> const& maybe_token_scales;
|
||||
std::optional<std::string> maybe_schedule;
|
||||
};
|
||||
|
||||
struct SupportedSchedulesArgs {
|
||||
at::ScalarType a_type;
|
||||
vllm::ScalarType b_type;
|
||||
c10::optional<at::ScalarType> maybe_group_scales_type;
|
||||
c10::optional<at::ScalarType> maybe_group_zeros_type;
|
||||
c10::optional<at::ScalarType> maybe_channel_scales_type;
|
||||
c10::optional<at::ScalarType> maybe_token_scales_type;
|
||||
c10::optional<at::ScalarType> maybe_out_type;
|
||||
std::optional<at::ScalarType> maybe_group_scales_type;
|
||||
std::optional<at::ScalarType> maybe_group_zeros_type;
|
||||
std::optional<at::ScalarType> maybe_channel_scales_type;
|
||||
std::optional<at::ScalarType> maybe_token_scales_type;
|
||||
std::optional<at::ScalarType> maybe_out_type;
|
||||
};
|
||||
|
||||
torch::Tensor mm_dispatch(MMArgs args);
|
||||
|
||||
@ -10,7 +10,7 @@ struct PrepackBArgs {
|
||||
torch::Tensor const& B;
|
||||
at::ScalarType a_type;
|
||||
vllm::ScalarType b_type;
|
||||
c10::optional<at::ScalarType> maybe_group_scales_type;
|
||||
std::optional<at::ScalarType> maybe_group_scales_type;
|
||||
};
|
||||
|
||||
template <typename PrepackedLayoutB>
|
||||
|
||||
@ -10,11 +10,11 @@ using namespace vllm;
|
||||
|
||||
std::vector<std::string> supported_schedules(
|
||||
at::ScalarType a_type, int64_t b_type_id,
|
||||
c10::optional<at::ScalarType> maybe_group_scales_type,
|
||||
c10::optional<at::ScalarType> maybe_group_zeros_type,
|
||||
c10::optional<at::ScalarType> maybe_channel_scales_type,
|
||||
c10::optional<at::ScalarType> maybe_token_scales_type,
|
||||
c10::optional<at::ScalarType> maybe_out_type) {
|
||||
std::optional<at::ScalarType> maybe_group_scales_type,
|
||||
std::optional<at::ScalarType> maybe_group_zeros_type,
|
||||
std::optional<at::ScalarType> maybe_channel_scales_type,
|
||||
std::optional<at::ScalarType> maybe_token_scales_type,
|
||||
std::optional<at::ScalarType> maybe_out_type) {
|
||||
ScalarType const b_type = ScalarType::from_id(b_type_id);
|
||||
return supported_schedules_dispatch({
|
||||
.a_type = a_type,
|
||||
@ -29,13 +29,13 @@ std::vector<std::string> supported_schedules(
|
||||
|
||||
torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B,
|
||||
int64_t b_type_id,
|
||||
c10::optional<at::ScalarType> const& maybe_out_type,
|
||||
c10::optional<torch::Tensor> const& maybe_group_scales,
|
||||
c10::optional<torch::Tensor> const& maybe_group_zeros,
|
||||
c10::optional<int64_t> maybe_group_size,
|
||||
c10::optional<torch::Tensor> const& maybe_channel_scales,
|
||||
c10::optional<torch::Tensor> const& maybe_token_scales,
|
||||
c10::optional<std::string> maybe_schedule) {
|
||||
std::optional<at::ScalarType> const& maybe_out_type,
|
||||
std::optional<torch::Tensor> const& maybe_group_scales,
|
||||
std::optional<torch::Tensor> const& maybe_group_zeros,
|
||||
std::optional<int64_t> maybe_group_size,
|
||||
std::optional<torch::Tensor> const& maybe_channel_scales,
|
||||
std::optional<torch::Tensor> const& maybe_token_scales,
|
||||
std::optional<std::string> maybe_schedule) {
|
||||
ScalarType const b_type = ScalarType::from_id(b_type_id);
|
||||
return mm_dispatch({.A = A,
|
||||
.B = B,
|
||||
@ -51,7 +51,7 @@ torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B,
|
||||
|
||||
torch::Tensor prepack_B(
|
||||
torch::Tensor const& B, at::ScalarType const& a_type, int64_t b_type_id,
|
||||
c10::optional<at::ScalarType> const& maybe_group_scales_type) {
|
||||
std::optional<at::ScalarType> const& maybe_group_scales_type) {
|
||||
ScalarType const b_type = ScalarType::from_id(b_type_id);
|
||||
return prepack_B_dispatch(
|
||||
{.B = B,
|
||||
|
||||
@ -928,7 +928,7 @@ void paged_attention_custom_launcher(
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, const int num_kv_heads, float scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& context_lens,
|
||||
int max_context_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
int max_context_len, const std::optional<torch::Tensor>& alibi_slopes,
|
||||
float k_scale, float v_scale) {
|
||||
int num_seqs = query.size(0);
|
||||
int num_heads = query.size(1);
|
||||
@ -1086,7 +1086,7 @@ void paged_attention(
|
||||
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
torch::Tensor& context_lens, // [num_seqs]
|
||||
int64_t block_size, int64_t max_context_len,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double k_scale, double v_scale) {
|
||||
const int head_size = query.size(2);
|
||||
if (kv_cache_dtype == "auto") {
|
||||
|
||||
@ -9,6 +9,6 @@ void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
|
||||
double scale, torch::Tensor& block_tables,
|
||||
torch::Tensor& context_lens, int64_t block_size,
|
||||
int64_t max_context_len,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double k_scale,
|
||||
double v_scale);
|
||||
|
||||
@ -286,7 +286,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& bt_meta,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
if (bias) {
|
||||
|
||||
@ -22,7 +22,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& e,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
#endif
|
||||
|
||||
void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
|
||||
@ -30,7 +30,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& bt_meta,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
// Checks for conformality
|
||||
TORCH_CHECK(a.dim() == 2 && bt_nzs.dim() == 2 && c.dim() == 2);
|
||||
TORCH_CHECK(c.size(1) == bt_nzs.size(0) && bt_nzs.size(1) * 2 == a.size(1) &&
|
||||
|
||||
|
Before Width: | Height: | Size: 968 KiB After Width: | Height: | Size: 968 KiB |
|
Before Width: | Height: | Size: 102 KiB After Width: | Height: | Size: 102 KiB |
|
Before Width: | Height: | Size: 173 KiB After Width: | Height: | Size: 173 KiB |
@ -1,7 +1,7 @@
|
||||
# Dockerfile
|
||||
|
||||
We provide a <gh-file:Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
|
||||
More information about deploying with Docker can be found [here](../../serving/deploying_with_docker.md).
|
||||
More information about deploying with Docker can be found [here](#deployment-docker).
|
||||
|
||||
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
|
||||
|
||||
|
||||
115
docs/source/contributing/model/basic.md
Normal file
115
docs/source/contributing/model/basic.md
Normal file
@ -0,0 +1,115 @@
|
||||
(new-model-basic)=
|
||||
|
||||
# Basic Implementation
|
||||
|
||||
This guide walks you through the steps to implement a basic vLLM model.
|
||||
|
||||
## 1. Bring your model code
|
||||
|
||||
First, clone the PyTorch model code from the source repository.
|
||||
For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
|
||||
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
|
||||
|
||||
```{warning}
|
||||
Make sure to review and adhere to the original code's copyright and licensing terms!
|
||||
```
|
||||
|
||||
## 2. Make your code compatible with vLLM
|
||||
|
||||
To ensure compatibility with vLLM, your model must meet the following requirements:
|
||||
|
||||
### Initialization Code
|
||||
|
||||
All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for:
|
||||
|
||||
- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts.
|
||||
- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode.
|
||||
|
||||
The initialization code should look like this:
|
||||
|
||||
```python
|
||||
from torch import nn
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.attention import Attention
|
||||
|
||||
class MyAttention(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||
super().__init__()
|
||||
self.attn = Attention(prefix=f"{prefix}.attn")
|
||||
|
||||
class MyDecoderLayer(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||
super().__init__()
|
||||
self.self_attn = MyAttention(prefix=f"{prefix}.self_attn")
|
||||
|
||||
class MyModel(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||
super().__init__()
|
||||
self.layers = nn.ModuleList(
|
||||
[MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
|
||||
)
|
||||
|
||||
class MyModelForCausalLM(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__()
|
||||
self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
|
||||
```
|
||||
|
||||
### Computation Code
|
||||
|
||||
Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
|
||||
|
||||
```python
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
kv_caches: List[torch.Tensor],
|
||||
attn_metadata: AttentionMetadata,
|
||||
) -> torch.Tensor:
|
||||
...
|
||||
```
|
||||
|
||||
```{note}
|
||||
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
|
||||
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
|
||||
```
|
||||
|
||||
For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
|
||||
|
||||
## 3. (Optional) Implement tensor parallelism and quantization support
|
||||
|
||||
If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
|
||||
To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
|
||||
For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with `VocabParallelEmbedding`. For the output LM head, you can use `ParallelLMHead`.
|
||||
When it comes to the linear layers, we provide the following options to parallelize them:
|
||||
|
||||
- `ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
|
||||
- `RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer.
|
||||
- `ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer.
|
||||
- `MergedColumnParallelLinear`: Column-parallel linear that merges multiple `ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices.
|
||||
- `QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When the number of key/value heads is less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices.
|
||||
|
||||
Note that all the linear layers above take `linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization.
|
||||
|
||||
## 4. Implement the weight loading logic
|
||||
|
||||
You now need to implement the `load_weights` method in your `*ForCausalLM` class.
|
||||
This method should load the weights from HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for `MergedColumnParallelLinear` and `QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately.
|
||||
|
||||
## 5. Register your model
|
||||
|
||||
See [this page](#new-model-registration) for instructions on how to register your new model to be used by vLLM.
|
||||
|
||||
## Frequently Asked Questions
|
||||
|
||||
### How to support models with interleaving sliding windows?
|
||||
|
||||
For models with interleaving sliding windows (e.g. `google/gemma-2-2b-it` and `mistralai/Ministral-8B-Instruct-2410`), the scheduler will treat the model as a full-attention model, i.e., kv-cache of all tokens will not be dropped. This is to make sure prefix caching works with these models. Sliding window only appears as a parameter to the attention kernel computation.
|
||||
|
||||
To support a model with interleaving sliding windows, we need to take care of the following details:
|
||||
|
||||
- Make sure [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/config.py#L308) evaluates `has_interleaved_attention` to `True` for this model, and set `self.hf_text_config.interleaved_sliding_window` to the format of interleaving sliding windows the model can understand. Then, `self.hf_text_config.sliding_window` will be deleted, and the model will be treated as a full-attention model.
|
||||
- In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171).
|
||||
|
||||
With these two steps, interleaving sliding windows should work with the model.
|
||||
26
docs/source/contributing/model/index.md
Normal file
26
docs/source/contributing/model/index.md
Normal file
@ -0,0 +1,26 @@
|
||||
(new-model)=
|
||||
|
||||
# Adding a New Model
|
||||
|
||||
This section provides more information on how to integrate a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM.
|
||||
|
||||
```{toctree}
|
||||
:caption: Contents
|
||||
:maxdepth: 1
|
||||
|
||||
basic
|
||||
registration
|
||||
multimodal
|
||||
```
|
||||
|
||||
```{note}
|
||||
The complexity of adding a new model depends heavily on the model's architecture.
|
||||
The process is fairly straightforward if the model shares a similar architecture with an existing model in vLLM.
|
||||
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
|
||||
```
|
||||
|
||||
```{tip}
|
||||
If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
|
||||
or ask on our [developer slack](https://slack.vllm.ai).
|
||||
We will be happy to help you out!
|
||||
```
|
||||
@ -2,15 +2,11 @@
|
||||
|
||||
# Enabling Multimodal Inputs
|
||||
|
||||
This document walks you through the steps to extend a vLLM model so that it accepts [multi-modal inputs](#multimodal-inputs).
|
||||
|
||||
```{seealso}
|
||||
[Adding a New Model](adding-a-new-model)
|
||||
```
|
||||
This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs).
|
||||
|
||||
## 1. Update the base vLLM model
|
||||
|
||||
It is assumed that you have already implemented the model in vLLM according to [these steps](#adding-a-new-model).
|
||||
It is assumed that you have already implemented the model in vLLM according to [these steps](#new-model-basic).
|
||||
Further update the model as follows:
|
||||
|
||||
- Implement the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
|
||||
56
docs/source/contributing/model/registration.md
Normal file
56
docs/source/contributing/model/registration.md
Normal file
@ -0,0 +1,56 @@
|
||||
(new-model-registration)=
|
||||
|
||||
# Model Registration
|
||||
|
||||
vLLM relies on a model registry to determine how to run each model.
|
||||
A list of pre-registered architectures can be found [here](#supported-models).
|
||||
|
||||
If your model is not on this list, you must register it to vLLM.
|
||||
This page provides detailed instructions on how to do so.
|
||||
|
||||
## Built-in models
|
||||
|
||||
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](#build-from-source).
|
||||
This gives you the ability to modify the codebase and test your model.
|
||||
|
||||
After you have implemented your model (see [tutorial](#new-model-basic)), put it into the <gh-dir:vllm/model_executor/models> directory.
|
||||
Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
|
||||
You should also include an example HuggingFace repository for this model in <gh-file:tests/models/registry.py> to run the unit tests.
|
||||
Finally, update our [list of supported models](#supported-models) to promote your model!
|
||||
|
||||
```{important}
|
||||
The list of models in each section should be maintained in alphabetical order.
|
||||
```
|
||||
|
||||
## Out-of-tree models
|
||||
|
||||
You can load an external model using a plugin without modifying the vLLM codebase.
|
||||
|
||||
```{seealso}
|
||||
[vLLM's Plugin System](#plugin-system)
|
||||
```
|
||||
|
||||
To register the model, use the following code:
|
||||
|
||||
```python
|
||||
from vllm import ModelRegistry
|
||||
from your_code import YourModelForCausalLM
|
||||
ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
|
||||
```
|
||||
|
||||
If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`:
|
||||
|
||||
```python
|
||||
from vllm import ModelRegistry
|
||||
|
||||
ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
|
||||
```
|
||||
|
||||
```{important}
|
||||
If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
|
||||
Read more about that [here](#enabling-multimodal-inputs).
|
||||
```
|
||||
|
||||
```{note}
|
||||
Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server.
|
||||
```
|
||||
43
docs/source/contributing/vulnerability_management.md
Normal file
43
docs/source/contributing/vulnerability_management.md
Normal file
@ -0,0 +1,43 @@
|
||||
# Vulnerability Management
|
||||
|
||||
## Reporting Vulnerabilities
|
||||
|
||||
As mentioned in the [security
|
||||
policy](https://github.com/vllm-project/vllm/tree/main/SECURITY.md), security
|
||||
vulnerabilities may be reported privately to the project via
|
||||
[GitHub](https://github.com/vllm-project/vllm/security/advisories/new).
|
||||
|
||||
## Vulnerability Management Team
|
||||
|
||||
Once a vulnerability has been reported to the project, the Vulnerability
|
||||
Management Team (VMT) is responsible for managing the vulnerability. The VMT is
|
||||
responsible for:
|
||||
|
||||
- Triaging the vulnerability.
|
||||
- Coordinating with reporters and project maintainers on vulnerability analysis
|
||||
and resolution.
|
||||
- Drafting of security advisories for confirmed vulnerabilities, as appropriate.
|
||||
- Coordination with project maintainers on a coordinated release of the fix and
|
||||
security advisory.
|
||||
|
||||
### Security Advisories
|
||||
|
||||
Advisories are published via GitHub through the same system used to report
|
||||
vulnerabilities. More information on the process can be found in the [GitHub
|
||||
documentation](https://docs.github.com/en/code-security/security-advisories/working-with-repository-security-advisories/about-repository-security-advisories).
|
||||
|
||||
### Team Members
|
||||
|
||||
We prefer to keep all vulnerability-related communication on the security report
|
||||
on GitHub. However, if you need to contact the VMT directly for an urgent issue,
|
||||
you may contact the following individuals:
|
||||
|
||||
- Simon Mo - simon.mo@hey.com
|
||||
- Russell Bryant - rbryant@redhat.com
|
||||
|
||||
## Slack Discussion
|
||||
|
||||
You may use the `#security` channel in the [vLLM Slack](https://slack.vllm.ai)
|
||||
to discuss security-related topics. However, please do not disclose any
|
||||
vulnerabilities in this channel. If you need to report a vulnerability, please
|
||||
use the GitHub security advisory system or contact a VMT member privately.
|
||||
@ -1,6 +1,6 @@
|
||||
(deploying-with-docker)=
|
||||
(deployment-docker)=
|
||||
|
||||
# Deploying with Docker
|
||||
# Using Docker
|
||||
|
||||
## Use vLLM's Official Docker Image
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
(deploying-with-bentoml)=
|
||||
(deployment-bentoml)=
|
||||
|
||||
# Deploying with BentoML
|
||||
# BentoML
|
||||
|
||||
[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes.
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
(deploying-with-cerebrium)=
|
||||
(deployment-cerebrium)=
|
||||
|
||||
# Deploying with Cerebrium
|
||||
# Cerebrium
|
||||
|
||||
```{raw} html
|
||||
<p align="center">
|
||||
@ -1,6 +1,6 @@
|
||||
(deploying-with-dstack)=
|
||||
(deployment-dstack)=
|
||||
|
||||
# Deploying with dstack
|
||||
# dstack
|
||||
|
||||
```{raw} html
|
||||
<p align="center">
|
||||
@ -1,6 +1,6 @@
|
||||
(deploying-with-helm)=
|
||||
(deployment-helm)=
|
||||
|
||||
# Deploying with Helm
|
||||
# Helm
|
||||
|
||||
A Helm chart to deploy vLLM for Kubernetes
|
||||
|
||||
@ -38,7 +38,7 @@ chart **including persistent volumes** and deletes the release.
|
||||
|
||||
## Architecture
|
||||
|
||||
```{image} architecture_helm_deployment.png
|
||||
```{image} /assets/deployment/architecture_helm_deployment.png
|
||||
```
|
||||
|
||||
## Values
|
||||
13
docs/source/deployment/frameworks/index.md
Normal file
13
docs/source/deployment/frameworks/index.md
Normal file
@ -0,0 +1,13 @@
|
||||
# Using other frameworks
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
bentoml
|
||||
cerebrium
|
||||
dstack
|
||||
helm
|
||||
lws
|
||||
skypilot
|
||||
triton
|
||||
```
|
||||
@ -1,6 +1,6 @@
|
||||
(deploying-with-lws)=
|
||||
(deployment-lws)=
|
||||
|
||||
# Deploying with LWS
|
||||
# LWS
|
||||
|
||||
LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads.
|
||||
A major use case is for multi-host/multi-node distributed inference.
|
||||
@ -1,6 +1,6 @@
|
||||
(on-cloud)=
|
||||
(deployment-skypilot)=
|
||||
|
||||
# Deploying and scaling up with SkyPilot
|
||||
# SkyPilot
|
||||
|
||||
```{raw} html
|
||||
<p align="center">
|
||||
@ -12,9 +12,9 @@ vLLM can be **run and scaled to multiple service replicas on clouds and Kubernet
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and request access to the model {code}`meta-llama/Meta-Llama-3-8B-Instruct`.
|
||||
- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and request access to the model `meta-llama/Meta-Llama-3-8B-Instruct`.
|
||||
- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)).
|
||||
- Check that {code}`sky check` shows clouds or Kubernetes are enabled.
|
||||
- Check that `sky check` shows clouds or Kubernetes are enabled.
|
||||
|
||||
```console
|
||||
pip install skypilot-nightly
|
||||
@ -1,5 +1,5 @@
|
||||
(deploying-with-triton)=
|
||||
(deployment-triton)=
|
||||
|
||||
# Deploying with NVIDIA Triton
|
||||
# NVIDIA Triton
|
||||
|
||||
The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details.
|
||||
9
docs/source/deployment/integrations/index.md
Normal file
9
docs/source/deployment/integrations/index.md
Normal file
@ -0,0 +1,9 @@
|
||||
# External Integrations
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
kserve
|
||||
kubeai
|
||||
llamastack
|
||||
```
|
||||
@ -1,6 +1,6 @@
|
||||
(deploying-with-kserve)=
|
||||
(deployment-kserve)=
|
||||
|
||||
# Deploying with KServe
|
||||
# KServe
|
||||
|
||||
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
(deploying-with-kubeai)=
|
||||
(deployment-kubeai)=
|
||||
|
||||
# Deploying with KubeAI
|
||||
# KubeAI
|
||||
|
||||
[KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies.
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
(run-on-llamastack)=
|
||||
(deployment-llamastack)=
|
||||
|
||||
# Serving with Llama Stack
|
||||
# Llama Stack
|
||||
|
||||
vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack).
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
(deploying-with-k8s)=
|
||||
(deployment-k8s)=
|
||||
|
||||
# Deploying with Kubernetes
|
||||
# Using Kubernetes
|
||||
|
||||
Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing.
|
||||
|
||||
@ -43,7 +43,7 @@ metadata:
|
||||
name: hf-token-secret
|
||||
namespace: default
|
||||
type: Opaque
|
||||
data:
|
||||
stringData:
|
||||
token: "REPLACE_WITH_TOKEN"
|
||||
```
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
(nginxloadbalancer)=
|
||||
|
||||
# Deploying with Nginx Loadbalancer
|
||||
# Using Nginx
|
||||
|
||||
This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.
|
||||
|
||||
@ -57,7 +57,7 @@ More API details can be found in the {doc}`Offline Inference
|
||||
|
||||
The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
|
||||
|
||||
### OpenAI-compatible API server
|
||||
### OpenAI-Compatible API Server
|
||||
|
||||
The second primary interface to vLLM is via its OpenAI-compatible API server.
|
||||
This server can be started using the `vllm serve` command.
|
||||
@ -77,8 +77,7 @@ python -m vllm.entrypoints.openai.api_server --model <model>
|
||||
|
||||
That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
|
||||
|
||||
More details on the API server can be found in the {doc}`OpenAI Compatible
|
||||
Server </serving/openai_compatible_server>` document.
|
||||
More details on the API server can be found in the [OpenAI-Compatible Server](#openai-compatible-server) document.
|
||||
|
||||
## LLM Engine
|
||||
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
# Implementation
|
||||
(design-automatic-prefix-caching)=
|
||||
|
||||
The core idea of PagedAttention is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.
|
||||
# Automatic Prefix Caching
|
||||
|
||||
The core idea of [PagedAttention](#design-paged-attention) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.
|
||||
|
||||
To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block.
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
(design-paged-attention)=
|
||||
|
||||
# vLLM Paged Attention
|
||||
|
||||
- Currently, vLLM utilizes its own implementation of a multi-head query
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
## Debugging
|
||||
|
||||
Please see the [Debugging Tips](#debugging-python-multiprocessing)
|
||||
Please see the [Troubleshooting](#troubleshooting-python-multiprocessing)
|
||||
page for information on known issues and how to solve them.
|
||||
|
||||
## Introduction
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# Offline Inference
|
||||
|
||||
```{toctree}
|
||||
:caption: Contents
|
||||
:maxdepth: 1
|
||||
|
||||
llm
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
(apc)=
|
||||
(automatic-prefix-caching)=
|
||||
|
||||
# Introduction
|
||||
# Automatic Prefix Caching
|
||||
|
||||
## What is Automatic Prefix Caching
|
||||
## Introduction
|
||||
|
||||
Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.
|
||||
|
||||
```{note}
|
||||
Technical details on how vLLM implements APC are in the next page.
|
||||
Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching).
|
||||
```
|
||||
|
||||
## Enabling APC in vLLM
|
||||
@ -32,7 +32,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar
|
||||
|
||||
* - Feature
|
||||
- [CP](#chunked-prefill)
|
||||
- [APC](#apc)
|
||||
- [APC](#automatic-prefix-caching)
|
||||
- [LoRA](#lora-adapter)
|
||||
- <abbr title="Prompt Adapter">prmpt adptr</abbr>
|
||||
- [SD](#spec_decode)
|
||||
@ -64,7 +64,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar
|
||||
-
|
||||
-
|
||||
-
|
||||
* - [APC](#apc)
|
||||
* - [APC](#automatic-prefix-caching)
|
||||
- ✅
|
||||
-
|
||||
-
|
||||
@ -345,7 +345,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
* - [APC](#apc)
|
||||
* - [APC](#automatic-prefix-caching)
|
||||
- [✗](gh-issue:3687)
|
||||
- ✅
|
||||
- ✅
|
||||
@ -1,8 +1,12 @@
|
||||
(disagg-prefill)=
|
||||
|
||||
# Disaggregated prefilling (experimental)
|
||||
# Disaggregated Prefilling (experimental)
|
||||
|
||||
This page introduces you the disaggregated prefilling feature in vLLM. This feature is experimental and subject to change.
|
||||
This page introduces you to the disaggregated prefilling feature in vLLM.
|
||||
|
||||
```{note}
|
||||
This feature is experimental and subject to change.
|
||||
```
|
||||
|
||||
## Why disaggregated prefilling?
|
||||
|
||||
@ -41,13 +45,13 @@ Key abstractions for disaggregated prefilling:
|
||||
|
||||
Here is a figure illustrating how the above 3 abstractions are organized:
|
||||
|
||||
```{image} /assets/usage/disagg_prefill/abstraction.jpg
|
||||
```{image} /assets/features/disagg_prefill/abstraction.jpg
|
||||
:alt: Disaggregated prefilling abstractions
|
||||
```
|
||||
|
||||
The workflow of disaggregated prefilling is as follows:
|
||||
|
||||
```{image} /assets/usage/disagg_prefill/overview.jpg
|
||||
```{image} /assets/features/disagg_prefill/overview.jpg
|
||||
:alt: Disaggregated prefilling workflow
|
||||
```
|
||||
|
||||
@ -37,3 +37,10 @@ model_id = "huggyllama/llama-7b"
|
||||
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
|
||||
quantization="bitsandbytes", load_format="bitsandbytes")
|
||||
```
|
||||
## OpenAI Compatible Server
|
||||
|
||||
Append the following to your 4bit model arguments:
|
||||
|
||||
```
|
||||
--quantization bitsandbytes --load-format bitsandbytes
|
||||
```
|
||||
19
docs/source/features/quantization/index.md
Normal file
19
docs/source/features/quantization/index.md
Normal file
@ -0,0 +1,19 @@
|
||||
(quantization-index)=
|
||||
|
||||
# Quantization
|
||||
|
||||
Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices.
|
||||
|
||||
```{toctree}
|
||||
:caption: Contents
|
||||
:maxdepth: 1
|
||||
|
||||
supported_hardware
|
||||
auto_awq
|
||||
bnb
|
||||
gguf
|
||||
int8
|
||||
fp8
|
||||
fp8_e5m2_kvcache
|
||||
fp8_e4m3_kvcache
|
||||
```
|
||||
@ -1,6 +1,6 @@
|
||||
(supported-hardware-for-quantization)=
|
||||
(quantization-supported-hardware)=
|
||||
|
||||
# Supported Hardware for Quantization Kernels
|
||||
# Supported Hardware
|
||||
|
||||
The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
|
||||
|
||||
@ -120,12 +120,12 @@ The table below shows the compatibility of various quantization implementations
|
||||
- ✗
|
||||
```
|
||||
|
||||
## Notes:
|
||||
|
||||
- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
|
||||
- "✅︎" indicates that the quantization method is supported on the specified hardware.
|
||||
- "✗" indicates that the quantization method is not supported on the specified hardware.
|
||||
|
||||
Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
|
||||
```{note}
|
||||
This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
|
||||
|
||||
For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
|
||||
```
|
||||
@ -1,6 +1,6 @@
|
||||
(spec-decode)=
|
||||
|
||||
# Speculative decoding
|
||||
# Speculative Decoding
|
||||
|
||||
```{warning}
|
||||
Please note that speculative decoding in vLLM is not yet optimized and does
|
||||
@ -159,6 +159,72 @@ A variety of speculative models of this type are available on HF hub:
|
||||
- [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator)
|
||||
- [granite-20b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator)
|
||||
|
||||
## Speculating using EAGLE based draft models
|
||||
|
||||
The following code configures vLLM to use speculative decoding where proposals are generated by
|
||||
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model.
|
||||
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
prompts = [
|
||||
"The future of AI is",
|
||||
]
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
llm = LLM(
|
||||
model="meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
tensor_parallel_size=4,
|
||||
speculative_model="path/to/modified/eagle/model",
|
||||
speculative_draft_tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
```
|
||||
|
||||
A few important things to consider when using the EAGLE based draft models:
|
||||
|
||||
1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) cannot be
|
||||
used directly with vLLM due to differences in the expected layer names and model definition.
|
||||
To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d)
|
||||
to convert them. Note that this script does not modify the model's weights.
|
||||
|
||||
In the above example, use the script to first convert
|
||||
the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model
|
||||
and then use the converted checkpoint as the draft model in vLLM.
|
||||
|
||||
2. The EAGLE based draft models need to be run without tensor parallelism
|
||||
(i.e. speculative_draft_tensor_parallel_size is set to 1), although
|
||||
it is possible to run the main model using tensor parallelism (see example above).
|
||||
|
||||
3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
|
||||
reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
|
||||
investigation and tracked here: [https://github.com/vllm-project/vllm/issues/9565](https://github.com/vllm-project/vllm/issues/9565).
|
||||
|
||||
|
||||
A variety of EAGLE draft models are available on the Hugging Face hub:
|
||||
|
||||
| Base Model | EAGLE on Hugging Face | # EAGLE Parameters |
|
||||
|---------------------------------------------------------------------|-------------------------------------------|--------------------|
|
||||
| Vicuna-7B-v1.3 | yuhuili/EAGLE-Vicuna-7B-v1.3 | 0.24B |
|
||||
| Vicuna-13B-v1.3 | yuhuili/EAGLE-Vicuna-13B-v1.3 | 0.37B |
|
||||
| Vicuna-33B-v1.3 | yuhuili/EAGLE-Vicuna-33B-v1.3 | 0.56B |
|
||||
| LLaMA2-Chat 7B | yuhuili/EAGLE-llama2-chat-7B | 0.24B |
|
||||
| LLaMA2-Chat 13B | yuhuili/EAGLE-llama2-chat-13B | 0.37B |
|
||||
| LLaMA2-Chat 70B | yuhuili/EAGLE-llama2-chat-70B | 0.99B |
|
||||
| Mixtral-8x7B-Instruct-v0.1 | yuhuili/EAGLE-mixtral-instruct-8x7B | 0.28B |
|
||||
| LLaMA3-Instruct 8B | yuhuili/EAGLE-LLaMA3-Instruct-8B | 0.25B |
|
||||
| LLaMA3-Instruct 70B | yuhuili/EAGLE-LLaMA3-Instruct-70B | 0.99B |
|
||||
| Qwen2-7B-Instruct | yuhuili/EAGLE-Qwen2-7B-Instruct | 0.26B |
|
||||
| Qwen2-72B-Instruct | yuhuili/EAGLE-Qwen2-72B-Instruct | 1.05B |
|
||||
|
||||
|
||||
## Lossless guarantees of Speculative Decoding
|
||||
|
||||
In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of
|
||||
@ -182,7 +248,7 @@ speculative decoding, breaking down the guarantees into three key areas:
|
||||
3. **vLLM Logprob Stability**
|
||||
\- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the
|
||||
same request across runs. For more details, see the FAQ section
|
||||
titled *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs <faq>`.
|
||||
titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq).
|
||||
|
||||
**Conclusion**
|
||||
|
||||
@ -195,7 +261,7 @@ can occur due to following factors:
|
||||
|
||||
**Mitigation Strategies**
|
||||
|
||||
For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs <faq>`.
|
||||
For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq).
|
||||
|
||||
## Resources for vLLM contributors
|
||||
|
||||
@ -18,7 +18,7 @@ The following parameters are supported, which must be added as extra parameters:
|
||||
- `guided_whitespace_pattern`: used to override the default whitespace pattern for guided json decoding.
|
||||
- `guided_decoding_backend`: used to select the guided decoding backend to use.
|
||||
|
||||
You can see the complete list of supported parameters on the [OpenAI Compatible Server](../serving/openai_compatible_server.md) page.
|
||||
You can see the complete list of supported parameters on the [OpenAI-Compatible Server](#openai-compatible-server) page.
|
||||
|
||||
Now let's see an example for each of the cases, starting with the `guided_choice`, as it's the easiest one:
|
||||
|
||||
@ -10,7 +10,7 @@ Start the server with tool calling enabled. This example uses Meta's Llama 3.1 8
|
||||
vllm serve meta-llama/Llama-3.1-8B-Instruct \
|
||||
--enable-auto-tool-choice \
|
||||
--tool-call-parser llama3_json \
|
||||
--chat-template examples/tool_chat_template_llama3_json.jinja
|
||||
--chat-template examples/tool_chat_template_llama3.1_json.jinja
|
||||
```
|
||||
|
||||
Next, make a request to the model that should result in it using the available tools:
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
# Installation for ARM CPUs
|
||||
|
||||
vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering:
|
||||
vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering:
|
||||
|
||||
- CPU backend inference capabilities
|
||||
- Relevant runtime environment variables
|
||||
@ -1,6 +1,6 @@
|
||||
(installation-cpu)=
|
||||
(installation-x86)=
|
||||
|
||||
# Installation with CPU
|
||||
# Installation for x86 CPUs
|
||||
|
||||
vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:
|
||||
|
||||
@ -151,4 +151,4 @@ $ python examples/offline_inference.py
|
||||
$ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
|
||||
```
|
||||
|
||||
- Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](../serving/deploying_with_nginx.md) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
|
||||
- Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is an example of how to set up scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
|
||||
@ -1,8 +1,8 @@
|
||||
(installation)=
|
||||
(installation-cuda)=
|
||||
|
||||
# Installation
|
||||
# Installation for CUDA
|
||||
|
||||
vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.
|
||||
vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.4) binaries.
|
||||
|
||||
## Requirements
|
||||
|
||||
@ -12,24 +12,43 @@ vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) bin
|
||||
|
||||
## Install released versions
|
||||
|
||||
You can install vLLM using pip:
|
||||
### Create a new Python environment
|
||||
|
||||
You can create a new Python environment using `conda`:
|
||||
|
||||
```console
|
||||
$ # (Recommended) Create a new conda environment.
|
||||
$ conda create -n myenv python=3.12 -y
|
||||
$ conda activate myenv
|
||||
|
||||
$ # Install vLLM with CUDA 12.1.
|
||||
$ pip install vllm
|
||||
```
|
||||
|
||||
```{note}
|
||||
Although we recommend using `conda` to create and manage Python environments, it is highly recommended to use `pip` to install vLLM. This is because `pip` can install `torch` with separate library packages like `NCCL`, while `conda` installs `torch` with statically linked `NCCL`. This can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
|
||||
[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create a Python environment rather than to install packages. In particular, the PyTorch installed via `conda` will statically link the `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
|
||||
```
|
||||
|
||||
````{note}
|
||||
As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default.
|
||||
We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
|
||||
Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command:
|
||||
|
||||
```console
|
||||
$ # (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment.
|
||||
$ uv venv myenv --python 3.12 --seed
|
||||
$ source myenv/bin/activate
|
||||
```
|
||||
|
||||
In order to be performant, vLLM has to compile many CUDA kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
|
||||
|
||||
Therefore, it is recommended to install vLLM with a **fresh new** environment. If you either have a different CUDA version or want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-from-source) for more details.
|
||||
|
||||
### Install vLLM
|
||||
|
||||
You can install vLLM using either `pip` or `uv pip`:
|
||||
|
||||
```console
|
||||
$ # Install vLLM with CUDA 12.4.
|
||||
$ pip install vllm # If you are using pip.
|
||||
$ uv pip install vllm # If you are using uv.
|
||||
```
|
||||
|
||||
As of now, vLLM's binaries are compiled with CUDA 12.4 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
|
||||
|
||||
```console
|
||||
$ # Install vLLM with CUDA 11.8.
|
||||
@ -38,29 +57,47 @@ $ export PYTHON_VERSION=310
|
||||
$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
|
||||
```
|
||||
|
||||
In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
|
||||
|
||||
Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions.
|
||||
````
|
||||
|
||||
(install-the-latest-code)=
|
||||
|
||||
## Install the latest code
|
||||
|
||||
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. You can download and install it with the following command:
|
||||
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`.
|
||||
|
||||
### Install the latest code using `pip`
|
||||
|
||||
```console
|
||||
$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
|
||||
$ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
|
||||
```
|
||||
|
||||
If you want to access the wheels for previous commits, you can specify the commit hash in the URL:
|
||||
`--pre` is required for `pip` to consider pre-released versions.
|
||||
|
||||
If you want to access the wheels for previous commits (e.g. to bisect a behavior change or performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL:
|
||||
|
||||
```console
|
||||
$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
|
||||
$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
|
||||
```
|
||||
|
||||
Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
|
||||
Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels; the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
|
||||
|
||||
### Install the latest code using `uv`
|
||||
|
||||
Another way to install the latest code is to use `uv`:
|
||||
|
||||
```console
|
||||
$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly
|
||||
```
|
||||
|
||||
If you want to access the wheels for previous commits (e.g. to bisect a behavior change or performance regression), you can specify the commit hash in the URL:
|
||||
|
||||
```console
|
||||
$ export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
|
||||
$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
|
||||
```
|
||||
|
||||
The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
|
||||
|
||||
### Install the latest code using `docker`
|
||||
|
||||
Another way to access the latest code is to use the docker images:
|
||||
|
||||
@ -89,7 +126,7 @@ $ cd vllm
|
||||
$ VLLM_USE_PRECOMPILED=1 pip install --editable .
|
||||
```
|
||||
|
||||
This will download the latest nightly wheel and use the compiled libraries from there in the install.
|
||||
This will download the latest nightly wheel from https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation.
|
||||
|
||||
The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPI wheel](https://pypi.org/project/vllm/#files):
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
(installation-rocm)=
|
||||
|
||||
# Installation with ROCm
|
||||
# Installation for ROCm
|
||||
|
||||
vLLM supports AMD GPUs with ROCm 6.2.
|
||||
|
||||
@ -148,7 +148,7 @@ $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
|
||||
$ python3 setup.py develop
|
||||
```
|
||||
|
||||
This may take 5-10 minutes. Currently, {code}`pip install .` does not work for ROCm installation.
|
||||
This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
|
||||
|
||||
```{tip}
|
||||
- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
|
||||
@ -1,4 +1,6 @@
|
||||
# Installation with Intel® Gaudi® AI Accelerators
|
||||
(installation-gaudi)=
|
||||
|
||||
# Installation for Intel® Gaudi®
|
||||
|
||||
This README provides instructions on running vLLM with Intel Gaudi devices.
|
||||
|
||||
@ -80,7 +82,7 @@ $ python setup.py develop
|
||||
|
||||
## Supported Features
|
||||
|
||||
- [Offline batched inference](#offline-batched-inference)
|
||||
- [Offline inference](#offline-inference)
|
||||
- Online inference via [OpenAI-Compatible Server](#openai-compatible-server)
|
||||
- HPU autodetection - no need to manually select device within vLLM
|
||||
- Paged KV cache with algorithms enabled for Intel Gaudi accelerators
|
||||
19
docs/source/getting_started/installation/index.md
Normal file
19
docs/source/getting_started/installation/index.md
Normal file
@ -0,0 +1,19 @@
|
||||
(installation-index)=
|
||||
|
||||
# Installation
|
||||
|
||||
vLLM supports the following hardware platforms:
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
gpu-cuda
|
||||
gpu-rocm
|
||||
cpu-x86
|
||||
cpu-arm
|
||||
hpu-gaudi
|
||||
tpu
|
||||
xpu
|
||||
openvino
|
||||
neuron
|
||||
```
|
||||
@ -1,6 +1,6 @@
|
||||
(installation-neuron)=
|
||||
|
||||
# Installation with Neuron
|
||||
# Installation for Neuron
|
||||
|
||||
vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching.
|
||||
Paged Attention and Chunked Prefill are currently in development and will be available soon.
|
||||
@ -1,8 +1,8 @@
|
||||
(installation-openvino)=
|
||||
|
||||
# Installation with OpenVINO
|
||||
# Installation for OpenVINO
|
||||
|
||||
vLLM powered by OpenVINO supports all LLM models from {doc}`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features:
|
||||
vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features:
|
||||
|
||||
- Prefix caching (`--enable-prefix-caching`)
|
||||
- Chunked prefill (`--enable-chunked-prefill`)
|
||||
@ -1,6 +1,6 @@
|
||||
(installation-tpu)=
|
||||
|
||||
# Installation with TPU
|
||||
# Installation for TPUs
|
||||
|
||||
Tensor Processing Units (TPUs) are Google's custom-developed application-specific
|
||||
integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs
|
||||
@ -1,6 +1,6 @@
|
||||
(installation-xpu)=
|
||||
|
||||
# Installation with XPU
|
||||
# Installation for XPUs
|
||||
|
||||
vLLM initially supports basic model inferencing and serving on Intel GPU platform.
|
||||
|
||||
@ -2,20 +2,20 @@
|
||||
|
||||
# Quickstart
|
||||
|
||||
This guide will help you quickly get started with vLLM to:
|
||||
This guide will help you quickly get started with vLLM to perform:
|
||||
|
||||
- [Run offline batched inference](#offline-batched-inference)
|
||||
- [Run OpenAI-compatible inference](#openai-compatible-server)
|
||||
- [Offline batched inference](#quickstart-offline)
|
||||
- [Online inference using OpenAI-compatible server](#quickstart-online)
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- OS: Linux
|
||||
- Python: 3.9 -- 3.12
|
||||
- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
|
||||
|
||||
## Installation
|
||||
|
||||
You can install vLLM using pip. It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
|
||||
If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/project/vllm/) directly.
|
||||
It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
|
||||
|
||||
```console
|
||||
$ conda create -n myenv python=3.10 -y
|
||||
@ -23,9 +23,11 @@ $ conda activate myenv
|
||||
$ pip install vllm
|
||||
```
|
||||
|
||||
Please refer to the {ref}`installation documentation <installation>` for more details on installing vLLM.
|
||||
```{note}
|
||||
For non-CUDA platforms, please refer to the [installation index](#installation-index) for specific instructions on how to install vLLM.
|
||||
```
|
||||
|
||||
(offline-batched-inference)=
|
||||
(quickstart-offline)=
|
||||
|
||||
## Offline Batched Inference
|
||||
|
||||
@ -73,7 +75,7 @@ for output in outputs:
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
```
|
||||
|
||||
(openai-compatible-server)=
|
||||
(quickstart-online)=
|
||||
|
||||
## OpenAI-Compatible Server
|
||||
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
(debugging)=
|
||||
(troubleshooting)=
|
||||
|
||||
# Debugging Tips
|
||||
# Troubleshooting
|
||||
|
||||
This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
|
||||
This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
|
||||
|
||||
```{note}
|
||||
Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated.
|
||||
@ -47,6 +47,7 @@ You might also need to set `export NCCL_SOCKET_IFNAME=<your_network_interface>`
|
||||
If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph.
|
||||
To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
|
||||
|
||||
(troubleshooting-incorrect-hardware-driver)=
|
||||
## Incorrect hardware/driver
|
||||
|
||||
If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
|
||||
@ -139,7 +140,7 @@ A multi-node environment is more complicated than a single-node one. If you see
|
||||
Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes.
|
||||
```
|
||||
|
||||
(debugging-python-multiprocessing)=
|
||||
(troubleshooting-python-multiprocessing)=
|
||||
## Python multiprocessing
|
||||
|
||||
### `RuntimeError` Exception
|
||||
@ -150,7 +151,7 @@ If you have seen a warning in your logs like this:
|
||||
WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
|
||||
initialized. We must use the `spawn` multiprocessing start method. Setting
|
||||
VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
|
||||
https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing
|
||||
https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#python-multiprocessing
|
||||
for more information.
|
||||
```
|
||||
|
||||
@ -50,7 +50,7 @@ For more information, check out the following:
|
||||
- [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention)
|
||||
- [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023)
|
||||
- [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al.
|
||||
- {ref}`vLLM Meetups <meetups>`.
|
||||
- [vLLM Meetups](#meetups)
|
||||
|
||||
## Documentation
|
||||
|
||||
@ -58,91 +58,68 @@ For more information, check out the following:
|
||||
:caption: Getting Started
|
||||
:maxdepth: 1
|
||||
|
||||
getting_started/installation
|
||||
getting_started/amd-installation
|
||||
getting_started/openvino-installation
|
||||
getting_started/cpu-installation
|
||||
getting_started/gaudi-installation
|
||||
getting_started/arm-installation
|
||||
getting_started/neuron-installation
|
||||
getting_started/tpu-installation
|
||||
getting_started/xpu-installation
|
||||
getting_started/installation/index
|
||||
getting_started/quickstart
|
||||
getting_started/debugging
|
||||
getting_started/examples/examples_index
|
||||
```
|
||||
|
||||
```{toctree}
|
||||
:caption: Serving
|
||||
:maxdepth: 1
|
||||
|
||||
serving/openai_compatible_server
|
||||
serving/deploying_with_docker
|
||||
serving/deploying_with_k8s
|
||||
serving/deploying_with_helm
|
||||
serving/deploying_with_nginx
|
||||
serving/distributed_serving
|
||||
serving/metrics
|
||||
serving/integrations
|
||||
serving/tensorizer
|
||||
serving/runai_model_streamer
|
||||
getting_started/troubleshooting
|
||||
getting_started/faq
|
||||
```
|
||||
|
||||
```{toctree}
|
||||
:caption: Models
|
||||
:maxdepth: 1
|
||||
|
||||
models/supported_models
|
||||
models/generative_models
|
||||
models/pooling_models
|
||||
models/adding_model
|
||||
models/enabling_multimodal_inputs
|
||||
models/supported_models
|
||||
models/extensions/index
|
||||
```
|
||||
|
||||
```{toctree}
|
||||
:caption: Usage
|
||||
:caption: Features
|
||||
:maxdepth: 1
|
||||
|
||||
usage/lora
|
||||
usage/multimodal_inputs
|
||||
usage/tool_calling
|
||||
usage/structured_outputs
|
||||
usage/spec_decode
|
||||
usage/compatibility_matrix
|
||||
usage/performance
|
||||
usage/faq
|
||||
usage/engine_args
|
||||
usage/env_vars
|
||||
usage/usage_stats
|
||||
usage/disagg_prefill
|
||||
features/quantization/index
|
||||
features/lora
|
||||
features/tool_calling
|
||||
features/structured_outputs
|
||||
features/automatic_prefix_caching
|
||||
features/disagg_prefill
|
||||
features/spec_decode
|
||||
features/compatibility_matrix
|
||||
```
|
||||
|
||||
```{toctree}
|
||||
:caption: Quantization
|
||||
:caption: Inference and Serving
|
||||
:maxdepth: 1
|
||||
|
||||
quantization/supported_hardware
|
||||
quantization/auto_awq
|
||||
quantization/bnb
|
||||
quantization/gguf
|
||||
quantization/int8
|
||||
quantization/fp8
|
||||
quantization/fp8_e5m2_kvcache
|
||||
quantization/fp8_e4m3_kvcache
|
||||
serving/offline_inference
|
||||
serving/openai_compatible_server
|
||||
serving/multimodal_inputs
|
||||
serving/distributed_serving
|
||||
serving/metrics
|
||||
serving/engine_args
|
||||
serving/env_vars
|
||||
serving/usage_stats
|
||||
serving/integrations/index
|
||||
```
|
||||
|
||||
```{toctree}
|
||||
:caption: Automatic Prefix Caching
|
||||
:caption: Deployment
|
||||
:maxdepth: 1
|
||||
|
||||
automatic_prefix_caching/apc
|
||||
automatic_prefix_caching/details
|
||||
deployment/docker
|
||||
deployment/k8s
|
||||
deployment/nginx
|
||||
deployment/frameworks/index
|
||||
deployment/integrations/index
|
||||
```
|
||||
|
||||
```{toctree}
|
||||
:caption: Performance
|
||||
:maxdepth: 1
|
||||
|
||||
performance/optimization
|
||||
performance/benchmarks
|
||||
```
|
||||
|
||||
@ -156,10 +133,8 @@ community/meetups
|
||||
community/sponsors
|
||||
```
|
||||
|
||||
% API Documentation: API reference aimed at vllm library usage
|
||||
|
||||
```{toctree}
|
||||
:caption: API Documentation
|
||||
:caption: API Reference
|
||||
:maxdepth: 2
|
||||
|
||||
dev/sampling_params
|
||||
@ -168,30 +143,33 @@ dev/offline_inference/offline_index
|
||||
dev/engine/engine_index
|
||||
```
|
||||
|
||||
% Design: docs about vLLM internals
|
||||
% Design Documents: Details about vLLM internals
|
||||
|
||||
```{toctree}
|
||||
:caption: Design
|
||||
:caption: Design Documents
|
||||
:maxdepth: 2
|
||||
|
||||
design/arch_overview
|
||||
design/huggingface_integration
|
||||
design/plugin_system
|
||||
design/input_processing/model_inputs_index
|
||||
design/kernel/paged_attention
|
||||
design/input_processing/model_inputs_index
|
||||
design/multimodal/multimodal_index
|
||||
design/automatic_prefix_caching
|
||||
design/multiprocessing
|
||||
```
|
||||
|
||||
% For Developers: contributing to the vLLM project
|
||||
% Developer Guide: How to contribute to the vLLM project
|
||||
|
||||
```{toctree}
|
||||
:caption: For Developers
|
||||
:caption: Developer Guide
|
||||
:maxdepth: 2
|
||||
|
||||
contributing/overview
|
||||
contributing/profiling/profiling_index
|
||||
contributing/dockerfile/dockerfile
|
||||
contributing/model/index
|
||||
contributing/vulnerability_management
|
||||
```
|
||||
|
||||
# Indices and tables
|
||||
|
||||
@ -1,155 +0,0 @@
|
||||
(adding-a-new-model)=
|
||||
|
||||
# Adding a New Model
|
||||
|
||||
This document provides a high-level guide on integrating a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM.
|
||||
|
||||
```{note}
|
||||
The complexity of adding a new model depends heavily on the model's architecture.
|
||||
The process is fairly straightforward if the model shares a similar architecture with an existing model in vLLM.
|
||||
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
|
||||
```
|
||||
|
||||
```{note}
|
||||
By default, vLLM models do not support multi-modal inputs. To enable multi-modal support,
|
||||
please follow [this guide](#enabling-multimodal-inputs) after implementing the model here.
|
||||
```
|
||||
|
||||
```{tip}
|
||||
If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our [GitHub](https://github.com/vllm-project/vllm/issues) repository.
|
||||
We will be happy to help you out!
|
||||
```
|
||||
|
||||
## 0. Fork the vLLM repository
|
||||
|
||||
Start by forking our [GitHub] repository and then [build it from source](#build-from-source).
|
||||
This gives you the ability to modify the codebase and test your model.
|
||||
|
||||
```{tip}
|
||||
If you don't want to fork the repository and modify vLLM's codebase, please refer to the "Out-of-Tree Model Integration" section below.
|
||||
```
|
||||
|
||||
## 1. Bring your model code
|
||||
|
||||
Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the <gh-dir:vllm/model_executor/models> directory.
|
||||
For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from the HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
|
||||
|
||||
```{warning}
|
||||
When copying the model code, make sure to review and adhere to the code's copyright and licensing terms.
|
||||
```
|
||||
|
||||
## 2. Make your code compatible with vLLM
|
||||
|
||||
To ensure compatibility with vLLM, your model must meet the following requirements:
|
||||
|
||||
### Initialization Code
|
||||
|
||||
All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for:
|
||||
|
||||
- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts.
|
||||
- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode.
|
||||
|
||||
The initialization code should look like this:
|
||||
|
||||
```python
|
||||
from torch import nn
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.attention import Attention
|
||||
|
||||
class MyAttention(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||
super().__init__()
|
||||
self.attn = Attention(prefix=f"{prefix}.attn")
|
||||
|
||||
class MyDecoderLayer(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||
super().__init__()
|
||||
self.self_attn = MyAttention(vllm_config, prefix=f"{prefix}.self_attn")
|
||||
|
||||
class MyModel(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||
super().__init__()
|
||||
self.layers = nn.ModuleList(
|
||||
[MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
|
||||
)
|
||||
|
||||
class MyModelForCausalLM(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__()
|
||||
self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
|
||||
```
|
||||
|
||||
### Computation Code
|
||||
|
||||
Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
|
||||
|
||||
```python
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
kv_caches: List[torch.Tensor],
|
||||
attn_metadata: AttentionMetadata,
|
||||
) -> torch.Tensor:
|
||||
...
|
||||
```
|
||||
|
||||
```{note}
|
||||
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
|
||||
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
|
||||
```
|
||||
|
||||
For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
|
||||
|
||||
## 3. (Optional) Implement tensor parallelism and quantization support
|
||||
|
||||
If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
|
||||
To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
|
||||
For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with {code}`VocabParallelEmbedding`. For the output LM head, you can use {code}`ParallelLMHead`.
|
||||
When it comes to the linear layers, we provide the following options to parallelize them:
|
||||
|
||||
- {code}`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
|
||||
- {code}`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer.
|
||||
- {code}`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer.
|
||||
- {code}`MergedColumnParallelLinear`: Column-parallel linear that merges multiple {code}`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices.
|
||||
- {code}`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When the number of key/value heads is less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices.
|
||||
|
||||
Note that all the linear layers above take {code}`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization.
|
||||
|
||||
## 4. Implement the weight loading logic
|
||||
|
||||
You now need to implement the {code}`load_weights` method in your {code}`*ForCausalLM` class.
|
||||
This method should load the weights from HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for {code}`MergedColumnParallelLinear` and {code}`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately.
|
||||
|
||||
## 5. Register your model
|
||||
|
||||
Finally, register your {code}`*ForCausalLM` class to the {code}`_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py>.
|
||||
|
||||
## 6. Out-of-Tree Model Integration
|
||||
|
||||
You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For a general introduction to the plugin system, see [plugin-system](#plugin-system).
|
||||
|
||||
To register the model, use the following code:
|
||||
|
||||
```python
|
||||
from vllm import ModelRegistry
|
||||
from your_code import YourModelForCausalLM
|
||||
ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
|
||||
```
|
||||
|
||||
If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like {code}`RuntimeError: Cannot re-initialize CUDA in forked subprocess`:
|
||||
|
||||
```python
|
||||
from vllm import ModelRegistry
|
||||
|
||||
ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
|
||||
```
|
||||
|
||||
```{important}
|
||||
If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
|
||||
Read more about that [here](#enabling-multimodal-inputs).
|
||||
```
|
||||
|
||||
```{note}
|
||||
Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server.
|
||||
```
|
||||
8
docs/source/models/extensions/index.md
Normal file
8
docs/source/models/extensions/index.md
Normal file
@ -0,0 +1,8 @@
|
||||
# Built-in Extensions
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
runai_model_streamer
|
||||
tensorizer
|
||||
```
|
||||
@ -1,6 +1,6 @@
|
||||
(runai-model-streamer)=
|
||||
|
||||
# Loading Models with Run:ai Model Streamer
|
||||
# Loading models with Run:ai Model Streamer
|
||||
|
||||
Run:ai Model Streamer is a library that reads tensors concurrently while streaming them to GPU memory.
|
||||
Further reading can be found in [Run:ai Model Streamer Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/README.md).
|
||||
@ -1,6 +1,6 @@
|
||||
(tensorizer)=
|
||||
|
||||
# Loading Models with CoreWeave's Tensorizer
|
||||
# Loading models with CoreWeave's Tensorizer
|
||||
|
||||
vLLM supports loading models with [CoreWeave's Tensorizer](https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer).
|
||||
vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized
|
||||
@ -120,7 +120,7 @@ outputs = llm.chat(conversation, chat_template=custom_template)
|
||||
|
||||
## Online Inference
|
||||
|
||||
Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
|
||||
Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs:
|
||||
|
||||
- [Completions API](#completions-api) is similar to `LLM.generate` but only accepts text.
|
||||
- [Chat API](#chat-api) is similar to `LLM.chat`, accepting both text and [multi-modal inputs](#multimodal-inputs) for models with a chat template.
|
||||
|
||||
@ -106,7 +106,7 @@ A code example can be found here: <gh-file:examples/offline_inference_scoring.py
|
||||
|
||||
## Online Inference
|
||||
|
||||
Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
|
||||
Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs:
|
||||
|
||||
- [Pooling API](#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
|
||||
- [Embeddings API](#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](#multimodal-inputs) for embedding models.
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user