Compare commits

..

4 Commits

SHA1 / Message / Author / Date

22bf5c5077  fix
            Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
            2025-10-11 11:38:33 -07:00
3a8990743e  add truncation
            Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
            2025-10-11 11:20:31 -07:00
fbc2cc8217  merge
            2025-10-11 11:09:22 -07:00
efd4bc967d  [Misc] Remove in ModelRunnerOutput
            Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
            2025-08-23 21:09:20 -07:00
1247 changed files with 18290 additions and 30001 deletions

@@ -5,11 +5,11 @@ import os
 import sys
 import zipfile

-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
 # Note that we have 800 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))


 def print_top_10_largest_files(zip_file):
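The hunk above only changes the default budget; for context, a minimal sketch of the size-gate pattern the script implements (the `check_wheel_size` helper and the wheel-path argument are hypothetical names, not the script's actual interface):

    import os
    import sys

    # Default mirrors the 450 MiB value on the "+" side of the hunk; CI can override it.
    VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))

    def check_wheel_size(wheel_path: str) -> int:
        # Return 0 if the wheel fits the budget, 1 otherwise (hypothetical helper).
        size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
        if size_mb > VLLM_MAX_SIZE_MB:
            print(f"{wheel_path} is {size_mb:.1f} MiB, above the {VLLM_MAX_SIZE_MB} MiB limit")
            return 1
        return 0

    if __name__ == "__main__":
        sys.exit(check_wheel_size(sys.argv[1]))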

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.419
- name: "exact_match,flexible-extract"
value: 0.416
limit: 1000
num_fewshot: 5

@@ -1,11 +0,0 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
value: 0.90
limit: 100
num_fewshot: 0

@@ -1,11 +0,0 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "mmlu_pro"
metrics:
- name: "exact_match,custom-extract"
value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5

@@ -1,5 +1,4 @@
-# For vllm script, with -t option (tensor parallel size)
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
value: 0.855
limit: 2500
num_fewshot: 0

@@ -1 +0,0 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml

@@ -1 +0,0 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml

@@ -1 +0,0 @@
Qwen2.5-VL-7B-Instruct.yaml

@@ -1,44 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.9
usage() {
echo``
echo "Runs lm eval harness on ChartQA using multimodal vllm."
echo "This pathway is intended to be used to create baselines for "
echo "our correctness tests in vllm's CI."
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:l:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm-vlm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
--tasks chartqa \
--batch_size auto \
--apply_chat_template \
--limit $LIMIT

@@ -1,50 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() {
echo``
echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
--tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size auto

@@ -19,27 +19,21 @@ RTOL = 0.08

 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
     max_model_len = eval_config.get("max_model_len", 4096)
-    batch_size = eval_config.get("batch_size", "auto")
-    backend = eval_config.get("backend", "vllm")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len},"
+        f"max_model_len={max_model_len}"
     )
     results = lm_eval.simple_evaluate(
-        model=backend,
+        model="vllm",
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
-        # text models. however, this is regressing measured strict-match for
-        # existing text models in CI, so only apply it for mm.
-        apply_chat_template=backend == "vllm-vlm",
-        batch_size=batch_size,
+        batch_size="auto",
     )
     return results
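The YAML files deleted earlier in this compare are inputs to this test: each task/metric entry carries an expected value, and the harness compares the measured score against it using the RTOL constant named in the hunk header. A minimal sketch of that comparison, assuming PyYAML is installed; the shape of the `measured` mapping is simplified relative to what lm_eval actually returns:

    import yaml

    RTOL = 0.08  # relative tolerance, as in the hunk header above

    def check_config(config_path: str, measured: dict[str, dict[str, float]]) -> bool:
        # Compare measured metric values against the expected values in a config YAML.
        with open(config_path) as f:
            eval_config = yaml.safe_load(f)
        ok = True
        for task in eval_config["tasks"]:
            for metric in task["metrics"]:
                expected = metric["value"]
                got = measured[task["name"]][metric["name"]]
                if abs(got - expected) > RTOL * expected:
                    print(f"{task['name']}/{metric['name']}: {got:.3f} vs expected {expected:.3f}")
                    ok = False
        return ok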

@@ -8,7 +8,7 @@ steps:
   commands:
     # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
    # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
     - "mkdir artifacts"
     - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
     - "bash .buildkite/scripts/upload-wheels.sh"
@@ -76,7 +76,7 @@ steps:
       queue: arm64_cpu_queue_postmerge
   commands:
     - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
     - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"

 # Add job to create multi-arch manifest

(File diff suppressed because it is too large.)

@@ -527,9 +527,8 @@ steps:
     # since torchao nightly is only compatible with torch nightly currently
     # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
     # we can only upgrade after this is resolved
-    # TODO(jerryzh168): resolve the above comment
-    - uv pip install --system torchao==0.13.0
-    - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
+    - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
+    - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/

 - label: LM Eval Small Models # 53min
   timeout_in_minutes: 75
@@ -734,16 +733,6 @@ steps:
     - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
     - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

-- label: Multi-Modal Accuracy Eval (Small Models) # 50min
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - vllm/multimodal/
-  - vllm/inputs/
-  - vllm/v1/core/
-  commands:
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-
 - label: Multi-Modal Models Test (Extended) 1
   mirror_hardwares: [amdexperimental]
   optional: true

@@ -1,10 +1,5 @@
 [run]
-# Track the installed vllm package (this is what actually gets imported during tests)
-# Use wildcard pattern to match the installed location
-source =
-    vllm
-    */dist-packages/vllm
-    */site-packages/vllm
+source = vllm
 omit =
     */tests/*
     */test_*
@@ -17,16 +12,6 @@ omit =
     */benchmarks/*
     */docs/*

-[paths]
-# Map all possible vllm locations to a canonical "vllm" path
-# This ensures coverage.combine properly merges data from different test runs
-source =
-    vllm
-    /vllm-workspace/src/vllm
-    /vllm-workspace/vllm
-    */site-packages/vllm
-    */dist-packages/vllm
-
 [report]
 exclude_lines =
     pragma: no cover

@@ -1,4 +0,0 @@
# Migrate from `yapf` & `isort` to `ruff`
d6953beb91da4e9c99be4c0a1304a2d24189535c
# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
8fcaaf6a165e661f63fc51be906bc05b0767332f

.github/CODEOWNERS (13 changes)
@@ -5,7 +5,9 @@
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/model_executor/layers/fused_moe @mgoin
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
@@ -24,6 +26,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

 # vLLM V1
+/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/attention @LucasWilkinson
 /vllm/v1/attention/backends/flashinfer.py @mgoin
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
@@ -57,7 +60,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/v1/offloading @ApostaC

 # Transformers backend
-/vllm/model_executor/models/transformers @hmellor
+/vllm/model_executor/models/transformers.py @hmellor
 /tests/models/test_transformers.py @hmellor

 # Docs
@@ -118,11 +121,3 @@ mkdocs.yaml @hmellor

 # KVConnector installation files
 /requirements/kv_connectors.txt @NickLucche
-
-# Pooling models
-/examples/*/pooling/ @noooop
-/tests/models/*/pooling* @noooop
-/tests/entrypoints/pooling @noooop
-/vllm/config/pooler.py @noooop
-/vllm/pooling_params.py @noooop
-/vllm/model_executor/layers/pooler.py @noooop

@@ -13,7 +13,6 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Label issues based on keywords
-        id: label-step
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
        with:
          script: |
@@ -43,6 +42,7 @@ jobs:
                searchIn: "body"
              },
            ],
            // Substring search - matches anywhere in text (partial matches)
            substrings: [
              {
@@ -89,12 +89,14 @@ jobs:
                term: "hip_",
                searchIn: "both"
              },
              // ROCm tools and libraries
              {
                term: "hipify",
                searchIn: "both"
              },
            ],
            // Regex patterns - for complex pattern matching
            regexPatterns: [
              {
@@ -105,17 +107,13 @@ jobs:
              }
            ],
          },
-          // Add more label configurations here as needed
-          // example: {
-          //   keywords: [...],
-          //   substrings: [...],
-          //   regexPatterns: [...]
-          // },
        };

        // Helper function to create regex based on search type
        function createSearchRegex(term, type) {
          // Escape special regex characters in the term
          const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
          switch (type) {
            case 'keyword':
              // Word boundary search - matches whole words only
@@ -127,13 +125,16 @@ jobs:
              throw new Error(`Unknown search type: ${type}`);
          }
        }

        // Helper function to find matching terms in text with line information
        function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
          const matches = [];
          const lines = text.split('\n');
          for (const termConfig of searchTerms) {
            let regex;
            let term, searchIn, pattern, description, flags;
            // Handle different input formats (string or object)
            if (typeof termConfig === 'string') {
              term = termConfig;
@@ -145,17 +146,21 @@ jobs:
              description = termConfig.description;
              flags = termConfig.flags;
            }
            // Skip if this term shouldn't be searched in the current location
            if (searchIn !== 'both' && searchIn !== searchLocation) {
              continue;
            }
            // Create appropriate regex
            if (searchType === 'regex') {
              regex = new RegExp(pattern, flags || "gi");
            } else {
              regex = createSearchRegex(term, searchType);
            }
            const termMatches = [];
            // Check each line for matches
            lines.forEach((line, lineIndex) => {
              const lineMatches = line.match(regex);
@@ -170,14 +175,15 @@ jobs:
                  originalTerm: term || pattern,
                  description: description,
                  // Show context around the match in the line
                  context: line.length > 100 ?
                    line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
                      line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
                    : line.trim()
                });
              });
            }
          });
          if (termMatches.length > 0) {
            matches.push({
              term: term || (description || pattern),
@@ -190,48 +196,64 @@ jobs:
            });
          }
        }
        return matches;
      }

        // Helper function to check if label should be added
        async function processLabel(labelName, config) {
          const body = context.payload.issue.body || "";
          const title = context.payload.issue.title || "";
          core.notice(`Processing label: ${labelName}`);
          core.notice(`Issue Title: "${title}"`);
          core.notice(`Issue Body length: ${body.length} characters`);
          let shouldAddLabel = false;
          let allMatches = [];
          let reason = '';
          const keywords = config.keywords || [];
          const substrings = config.substrings || [];
          const regexPatterns = config.regexPatterns || [];
          core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
          // Search in title
          if (title.trim()) {
            core.notice(`Searching in title: "${title}"`);
            const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
            const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
            const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
            allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
          }
          // Search in body
          if (body.trim()) {
            core.notice(`Searching in body (${body.length} characters)`);
            const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
            const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
            const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
            allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
          }
          if (allMatches.length > 0) {
            core.notice(`Found ${allMatches.length} matching term(s):`);
            for (const termMatch of allMatches) {
              const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
              const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
              if (termMatch.searchType === 'regex') {
                core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
              } else {
                core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
              }
              // Show details for each match
              termMatch.matches.forEach((match, index) => {
                core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@@ -244,6 +266,7 @@ jobs:
                }
              });
            }
            shouldAddLabel = true;
            const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
            const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@@ -251,10 +274,13 @@ jobs:
            const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
            const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
            const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
            reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
          }
          core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
          core.notice(`Reason: ${reason || 'No matching terms found'}`);
          if (shouldAddLabel) {
            const existingLabels = context.payload.issue.labels.map(l => l.name);
            if (!existingLabels.includes(labelName)) {
@@ -270,92 +296,14 @@ jobs:
              core.notice(`Label "${labelName}" already present.`);
              return false;
            }
          core.notice(`No matching terms found for label "${labelName}".`);
          return false;
        }

        // Process all configured labels
-        const labelsAddedResults = await Promise.all(
-          Object.entries(labelConfig).map(([labelName, config]) =>
-            processLabel(labelName, config).then(added => ({ labelName, added }))
-          )
-        );
-        const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
-        core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
-        // Return which labels were added for the next step
-        const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
-        core.setOutput('labels_added', JSON.stringify(addedLabels));
-        return addedLabels;
+        const processLabels = Object.entries(labelConfig)
+          .map(([labelName, config]) => processLabel(labelName, config));
+        const labelsAdded = await Promise.all(processLabels);
+        const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
+        core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
- name: CC users for labeled issues
if: steps.label-step.outputs.labels_added != '[]'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
// Configuration: Map labels to GitHub users to CC
// You can add multiple users per label, and multiple label configurations
const ccConfig = {
rocm: {
users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
},
// Add more label -> user mappings here
// Example:
// cuda: {
// users: ['user1', 'user2'],
// message: 'CC {users} for CUDA-related issue'
// },
// performance: {
// users: ['perfexpert'],
// message: 'CC {users} for performance issue'
// },
};
const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
core.notice(`Labels added: ${labelsAdded.join(', ')}`);
// Get existing comments to check for already mentioned users
const comments = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
});
const issueBody = context.payload.issue.body || '';
const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
// Process each label that was added
for (const label of labelsAdded) {
if (ccConfig[label]) {
const config = ccConfig[label];
const usersToMention = [];
// Check which users haven't been mentioned yet
for (const user of config.users) {
const mentionPattern = new RegExp(`@${user}\\b`, 'i');
if (!mentionPattern.test(allExistingText)) {
usersToMention.push(user);
} else {
core.notice(`@${user} already mentioned for label "${label}", skipping`);
}
}
// Post comment if there are users to mention
if (usersToMention.length > 0) {
const mentions = usersToMention.map(u => `@${u}`).join(' ');
const message = config.message.replace('{users}', mentions);
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: message
});
core.notice(`CC comment added for label "${label}": ${mentions}`);
} else {
core.notice(`All users for label "${label}" already mentioned, skipping comment`);
}
}
}
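The labeler script distinguishes whole-word keyword matches from plain substring matches in `createSearchRegex`. A rough Python re-expression of that distinction, only to illustrate the word-boundary versus substring behaviour (the JavaScript above is the actual implementation):

    import re

    def create_search_regex(term: str, search_type: str) -> re.Pattern:
        escaped = re.escape(term)  # same intent as the escapedTerm replace() above
        if search_type == "keyword":
            # Word-boundary search: matches whole words only.
            return re.compile(rf"\b{escaped}\b", re.IGNORECASE)
        if search_type == "substring":
            # Substring search: matches anywhere in the text.
            return re.compile(escaped, re.IGNORECASE)
        raise ValueError(f"Unknown search type: {search_type}")

    # "rocm" as a keyword does not match inside "rocml", but as a substring it does.
    assert not create_search_regex("rocm", "keyword").search("uses rocml build")
    assert create_search_regex("rocm", "substring").search("uses rocml build")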

@@ -16,7 +16,6 @@ repos:
     rev: v1.38.1
     hooks:
       - id: typos
-        args: [--force-exclude]
   - repo: https://github.com/pre-commit/mirrors-clang-format
     rev: v21.1.2
     hooks:

@@ -8,6 +8,7 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
+from typing import Optional, Union

 import aiohttp
 import huggingface_hub.constants
@@ -27,13 +28,13 @@ class RequestFuncInput:
     prompt_len: int
     output_len: int
     model: str
-    model_name: str | None = None
-    logprobs: int | None = None
-    extra_body: dict | None = None
-    multi_modal_content: dict | list[dict] | None = None
+    model_name: Optional[str] = None
+    logprobs: Optional[int] = None
+    extra_body: Optional[dict] = None
+    multi_modal_content: Optional[dict | list[dict]] = None
     ignore_eos: bool = False
-    language: str | None = None
-    request_id: str | None = None
+    language: Optional[str] = None
+    request_id: Optional[str] = None


 @dataclass
@@ -51,7 +52,7 @@ class RequestFuncOutput:

 async def async_request_tgi(
     request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
@@ -132,7 +133,7 @@ async def async_request_tgi(

 async def async_request_trt_llm(
     request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
@@ -203,7 +204,7 @@ async def async_request_trt_llm(

 async def async_request_deepspeed_mii(
     request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("completions", "profile")), (
@@ -266,7 +267,7 @@ async def async_request_deepspeed_mii(

 async def async_request_openai_completions(
     request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("completions", "profile")), (
@@ -366,7 +367,7 @@ async def async_request_openai_completions(

 async def async_request_openai_chat_completions(
     request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("chat/completions", "profile")), (
@@ -475,7 +476,7 @@ async def async_request_openai_chat_completions(

 async def async_request_openai_audio(
     request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
     # Lazy import without PlaceholderModule to avoid vllm dep.
     import soundfile
@@ -609,7 +610,7 @@ def get_tokenizer(
     tokenizer_mode: str = "auto",
     trust_remote_code: bool = False,
     **kwargs,
-) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
     if pretrained_model_name_or_path is not None and not os.path.exists(
         pretrained_model_name_or_path
     ):
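The typing churn in this file, and in most of the benchmark diffs below, is purely syntactic: `Optional[X]` and `X | None` (like `Union[A, B]` and `A | B`) describe the same type. The PEP 604 `|` spelling needs Python 3.10 at runtime unless annotation evaluation is deferred, which is why two files later in this compare add `from __future__ import annotations`. A small self-contained illustration:

    from __future__ import annotations  # lets `str | None` be used in annotations on Python < 3.10

    from typing import Optional

    def render(name: str | None = None) -> str:
        # Equivalent pre-PEP 604 spelling: def render(name: Optional[str] = None) -> str
        return name or "n/a"

    legacy_style: Optional[str] = None  # the same annotation, written the older way
    print(render(), render("vllm"))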

@@ -32,6 +32,7 @@ import dataclasses
 import json
 import random
 import time
+from typing import Optional

 from transformers import PreTrainedTokenizerBase

@@ -79,7 +80,7 @@ def sample_requests_from_dataset(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     input_length_range: tuple[int, int],
-    fixed_output_len: int | None,
+    fixed_output_len: Optional[int],
 ) -> list[Request]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
@@ -127,7 +128,7 @@ def sample_requests_from_random(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     input_length_range: tuple[int, int],
-    fixed_output_len: int | None,
+    fixed_output_len: Optional[int],
     prefix_len: int,
 ) -> list[Request]:
     requests = []

@@ -7,6 +7,7 @@ import dataclasses
 import json
 import random
 import time
+from typing import Optional

 from transformers import AutoTokenizer, PreTrainedTokenizerBase

@@ -23,7 +24,7 @@ def sample_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: int | None,
+    fixed_output_len: Optional[int],
 ) -> list[tuple[str, int, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")

@@ -31,8 +31,8 @@ import time
 import uuid
 import warnings
 from collections.abc import AsyncGenerator
-from contextlib import nullcontext
 from dataclasses import dataclass
+from typing import Optional

 import datasets
 import numpy as np
@@ -316,7 +316,7 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     selected_percentile_metrics: list[str],
     selected_percentiles: list[float],
-    goodput_config_dict: dict[str, float] | None = None,
+    goodput_config_dict: Optional[dict[str, float]] = None,
 ) -> tuple[BenchmarkMetrics, list[int]]:
     actual_output_lens: list[int] = []
     total_input = 0
@@ -436,9 +436,9 @@ async def benchmark(
     selected_percentile_metrics: list[str],
     selected_percentiles: list[str],
     ignore_eos: bool,
-    max_concurrency: int | None,
+    max_concurrency: Optional[int],
     structured_output_ratio: float,
-    goodput_config_dict: dict[str, float] | None = None,
+    goodput_config_dict: Optional[dict[str, float]] = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -502,9 +502,15 @@ async def benchmark(
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))

-    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
+    # This can be used once the minimum Python version is 3.10 or higher,
+    # and it will simplify the code in limited_request_func.
+    # semaphore = (asyncio.Semaphore(max_concurrency)
+    #              if max_concurrency else contextlib.nullcontext())
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

     async def limited_request_func(request_func_input, pbar):
+        if semaphore is None:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
         async with semaphore:
             return await request_func(request_func_input=request_func_input, pbar=pbar)
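Both variants in the hunk above implement the same optional concurrency cap. The one-line form relies on Python 3.10+, where `contextlib.nullcontext` also acts as an async context manager, which is what the restored comment refers to. A minimal runnable sketch of that form (`request_func` here is a stand-in for the benchmark's request coroutine):

    import asyncio
    from contextlib import nullcontext

    async def request_func(i: int) -> int:
        await asyncio.sleep(0.01)  # stand-in for one benchmark request
        return i

    async def run_all(n: int, max_concurrency: int | None = None) -> list[int]:
        # On Python 3.10+ nullcontext() is a no-op async context manager, so one
        # `async with` line covers both the limited and the unlimited case.
        semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

        async def limited(i: int) -> int:
            async with semaphore:
                return await request_func(i)

        return await asyncio.gather(*(limited(i) for i in range(n)))

    print(asyncio.run(run_all(8, max_concurrency=2)))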

@@ -6,7 +6,7 @@ import math
 import os
 import time
 from types import TracebackType
-from typing import Any
+from typing import Any, Optional, Union


 def convert_to_pytorch_benchmark_format(
@@ -92,7 +92,7 @@ class TimeCollector:
     def __init__(self, scale: int) -> None:
         self.cnt: int = 0
         self._sum: int = 0
-        self._max: int | None = None
+        self._max: Optional[int] = None
         self.scale = scale
         self.start_time: int = time.monotonic_ns()
@@ -104,13 +104,13 @@ class TimeCollector:
         else:
             self._max = max(self._max, v)

-    def avg(self) -> float | str:
+    def avg(self) -> Union[float, str]:
         return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"

-    def max(self) -> float | str:
+    def max(self) -> Union[float, str]:
         return self._max / self.scale if self._max else "N/A"

-    def dump_avg_max(self) -> list[float | str]:
+    def dump_avg_max(self) -> list[Union[float, str]]:
         return [self.avg(), self.max()]

     def __enter__(self) -> None:
@@ -118,8 +118,8 @@ class TimeCollector:
     def __exit__(
         self,
-        exc_type: type[BaseException] | None,
-        exc_value: BaseException | None,
-        exc_traceback: TracebackType | None,
+        exc_type: Optional[type[BaseException]],
+        exc_value: Optional[BaseException],
+        exc_traceback: Optional[TracebackType],
     ) -> None:
         self.collect(time.monotonic_ns() - self.start_time)

@@ -6,7 +6,8 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Callable, Iterable
+from collections.abc import Iterable
+from typing import Callable

 import torch
 import torch.utils.benchmark as TBenchmark

@@ -6,7 +6,8 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Callable, Iterable
+from collections.abc import Iterable
+from typing import Callable, Optional

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -52,7 +53,7 @@ def bench_int8(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: list[str] | None = None,
+    bench_kernels: Optional[list[str]] = None,
 ) -> Iterable[TMeasurement]:
     """Benchmark INT8-based kernels."""
     assert dtype == torch.int8
@@ -107,7 +108,7 @@ def bench_fp8(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: list[str] | None = None,
+    bench_kernels: Optional[list[str]] = None,
 ) -> Iterable[TMeasurement]:
     """Benchmark FP8-based kernels."""
     assert dtype == torch.float8_e4m3fn
@@ -182,7 +183,7 @@ def bench(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: list[str] | None = None,
+    bench_kernels: Optional[list[str]] = None,
 ) -> Iterable[TMeasurement]:
     if dtype == torch.int8:
         return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@@ -200,7 +201,7 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(
     dtype: torch.dtype,
     MKNs: Iterable[tuple[int, int, int]],
-    bench_kernels: list[str] | None = None,
+    bench_kernels: Optional[list[str]] = None,
 ) -> Iterable[TMeasurement]:
     results = []
     for m, k, n in MKNs:

@@ -3,9 +3,10 @@

 import pickle as pkl
 import time
-from collections.abc import Callable, Iterable
+from collections.abc import Iterable
 from dataclasses import dataclass
 from itertools import product
+from typing import Callable, Optional

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -50,7 +51,7 @@ def get_bench_params() -> list[bench_params_t]:
 def unfused_int8_impl(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
-    residual: torch.Tensor | None,
+    residual: Optional[torch.Tensor],
     quant_dtype: torch.dtype,
 ):
     # Norm
@@ -67,7 +68,7 @@ def unfused_int8_impl(
 def unfused_fp8_impl(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
-    residual: torch.Tensor | None,
+    residual: Optional[torch.Tensor],
     quant_dtype: torch.dtype,
 ):
     # Norm
@@ -84,7 +85,7 @@ def unfused_fp8_impl(
 def fused_impl(
     rms_norm_layer: RMSNorm,  # this stores the weights
     x: torch.Tensor,
-    residual: torch.Tensor | None,
+    residual: Optional[torch.Tensor],
     quant_dtype: torch.dtype,
 ):
     out, _ = ops.rms_norm_dynamic_per_token_quant(

@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-from collections.abc import Callable
+from typing import Callable
 from unittest.mock import patch

 import pandas as pd

@@ -22,8 +22,8 @@ Example:
 import json
 import os
 import time
-from collections.abc import Callable
 from contextlib import nullcontext
+from typing import Callable, Optional

 import torch
 import torch.distributed as dist
@@ -264,12 +264,12 @@ class CommunicatorBenchmark:
     def benchmark_allreduce_single(
         self,
         sequence_length: int,
-        allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
+        allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
         should_use_fn: Callable[[torch.Tensor], bool],
         context,
         num_warmup: int,
         num_trials: int,
-    ) -> float | None:
+    ) -> Optional[float]:
         """Benchmark method with CUDA graph optimization."""
         try:
             # Create test tensor (2D: sequence_length x hidden_size)

@@ -6,12 +6,11 @@ import copy
 import json
 import pickle
 import time
-from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum, auto
 from itertools import product
 from pathlib import Path
-from typing import Any
+from typing import Any, Callable, Optional

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -159,7 +158,7 @@ def ref_group_gemm(
     seq_lens_cpu: torch.Tensor,
     prompt_lora_mapping_cpu: torch.Tensor,
     scaling: float,
-    add_inputs: bool | None,
+    add_inputs: Optional[bool],
 ):
     """
     Torch group gemm reference implementation to test correctness of
@@ -317,8 +316,8 @@ class BenchmarkContext:
     lora_rank: int
     sort_by_lora_id: bool
     dtype: torch.dtype
-    seq_length: int | None = None
-    num_slices: int | None = None  # num_slices for slice based ops
+    seq_length: Optional[int] = None
+    num_slices: Optional[int] = None  # num_slices for slice based ops

     def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
         ctx = copy.copy(self)
@@ -562,7 +561,7 @@ class BenchmarkTensors:
         }

     def bench_fn_kwargs(
-        self, op_type: OpType, add_inputs: bool | None = None
+        self, op_type: OpType, add_inputs: Optional[bool] = None
     ) -> dict[str, Any]:
         if op_type.is_shrink_fn():
             assert add_inputs is None
@@ -576,7 +575,7 @@ class BenchmarkTensors:
             raise ValueError(f"Unrecognized optype {self}")

     def test_correctness(
-        self, op_type: OpType, expand_fn_add_inputs: bool | None
+        self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
     ) -> bool:
         """
         Test correctness of op_type implementation against a grouped gemm
@@ -612,8 +611,8 @@ def bench_optype(
     ctx: BenchmarkContext,
     arg_pool_size: int,
     op_type: OpType,
-    cuda_graph_nops: int | None = None,
-    expand_fn_add_inputs: bool | None = None,
+    cuda_graph_nops: Optional[int] = None,
+    expand_fn_add_inputs: Optional[bool] = None,
     test_correctness: bool = False,
 ) -> TMeasurement:
     assert arg_pool_size >= 1
@@ -680,7 +679,7 @@ def bench_torch_mm(
     ctx: BenchmarkContext,
     arg_pool_size: int,
     op_type: OpType,
-    cuda_graph_nops: int | None = None,
+    cuda_graph_nops: Optional[int] = None,
 ) -> TMeasurement:
     """
     Benchmark basic torch.mm as a roofline.
@@ -745,7 +744,7 @@ def use_cuda_graph_recommendation() -> str:
     """

-def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
+def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
     compare = TBenchmark.Compare(timers)
     compare.print()

@@ -8,9 +8,10 @@ import math
 import os
 import pickle as pkl
 import time
-from collections.abc import Callable, Iterable
+from collections.abc import Iterable
 from dataclasses import dataclass
 from itertools import product
+from typing import Callable, Optional

 import pandas as pd
 import torch
@@ -62,23 +63,23 @@ class BenchmarkTensors:
     a: torch.Tensor
     w_q: torch.Tensor
-    group_size: int | None
+    group_size: Optional[int]
     wtype: ScalarType
     w_g_s: torch.Tensor
-    w_g_zp: torch.Tensor | None
-    w_ch_s: torch.Tensor | None
-    w_tok_s: torch.Tensor | None
+    w_g_zp: Optional[torch.Tensor]
+    w_ch_s: Optional[torch.Tensor]
+    w_tok_s: Optional[torch.Tensor]


 @dataclass
 class TypeConfig:
     act_type: torch.dtype
     weight_type: ScalarType
-    output_type: torch.dtype | None
-    group_scale_type: torch.dtype | None
-    group_zero_type: torch.dtype | None
-    channel_scale_type: torch.dtype | None
-    token_scale_type: torch.dtype | None
+    output_type: Optional[torch.dtype]
+    group_scale_type: Optional[torch.dtype]
+    group_zero_type: Optional[torch.dtype]
+    channel_scale_type: Optional[torch.dtype]
+    token_scale_type: Optional[torch.dtype]


 def rand_data(shape, dtype=torch.float16, scale=1):
@@ -92,8 +93,8 @@ def quantize_and_pack(
     atype: torch.dtype,
     w: torch.Tensor,
     wtype: ScalarType,
-    stype: torch.dtype | None,
-    group_size: int | None,
+    stype: Optional[torch.dtype],
+    group_size: Optional[int],
     zero_points: bool = False,
 ):
     assert wtype.is_integer(), "TODO: support floating point weights"
@@ -112,7 +113,7 @@ def quantize_and_pack(

 def create_bench_tensors(
-    shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
+    shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
 ) -> list[BenchmarkTensors]:
     m, n, k = shape
@@ -330,8 +331,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
     return res

-_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
-_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None
+_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
+_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None


 def bench(

@@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
     else:
         ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     block_quant_shape = get_weight_block_size_safety(config)
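Both sides of this hunk read the checkpoint dtype off a Hugging Face config object; the two attribute names reflect a rename in recent transformers releases, with `torch_dtype` being the older spelling. A defensive sketch that tolerates either name; the model id is only an example and resolving it needs network access:

    import torch
    from transformers import AutoConfig

    config = AutoConfig.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")

    # Prefer the newer attribute, fall back to the older one, then to a default.
    dtype = getattr(config, "dtype", None) or getattr(config, "torch_dtype", None) or torch.float16
    print(dtype)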

@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
     topk = config.num_experts_per_tok
     hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     use_customized_permute = args.use_customized_permute

@@ -3,6 +3,7 @@

 import random
 import time
+from typing import Optional

 import torch

@@ -36,7 +37,7 @@ def main(
     seed: int,
     do_profile: bool,
     device: str = "cuda",
-    kv_cache_dtype: str | None = None,
+    kv_cache_dtype: Optional[str] = None,
 ) -> None:
     current_platform.seed_everything(seed)

@@ -3,8 +3,8 @@

 import argparse
 import math
-from collections.abc import Callable
 from contextlib import contextmanager
+from typing import Callable
 from unittest.mock import patch

 import torch

@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from __future__ import annotations
+
 import random
 import time

@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from __future__ import annotations
+
 import random
 import time

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools import itertools
from typing import Optional, Union
import torch import torch
from flashinfer.norm import fused_add_rmsnorm, rmsnorm from flashinfer.norm import fused_add_rmsnorm, rmsnorm
@ -20,8 +21,8 @@ class HuggingFaceRMSNorm(nn.Module):
def forward( def forward(
self, self,
x: torch.Tensor, x: torch.Tensor,
residual: torch.Tensor | None = None, residual: Optional[torch.Tensor] = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
orig_dtype = x.dtype orig_dtype = x.dtype
x = x.to(torch.float32) x = x.to(torch.float32)
if residual is not None: if residual is not None:
@ -40,7 +41,7 @@ class HuggingFaceRMSNorm(nn.Module):
def rmsnorm_naive( def rmsnorm_naive(
x: torch.Tensor, x: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
residual: torch.Tensor | None = None, residual: Optional[torch.Tensor] = None,
eps: float = 1e-6, eps: float = 1e-6,
): ):
naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps) naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
@ -64,7 +65,7 @@ def rmsnorm_naive(
def rmsnorm_flashinfer( def rmsnorm_flashinfer(
x: torch.Tensor, x: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
residual: torch.Tensor | None = None, residual: Optional[torch.Tensor] = None,
eps: float = 1e-6, eps: float = 1e-6,
): ):
orig_shape = x.shape orig_shape = x.shape
@ -88,7 +89,7 @@ def rmsnorm_flashinfer(
def rmsnorm_vllm( def rmsnorm_vllm(
x: torch.Tensor, x: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
residual: torch.Tensor | None = None, residual: Optional[torch.Tensor] = None,
eps: float = 1e-6, eps: float = 1e-6,
): ):
orig_shape = x.shape orig_shape = x.shape
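For readers comparing the naive, FlashInfer, and vLLM variants benchmarked in this file, here is a minimal reference of the computation itself (a sketch assuming the standard RMSNorm formulation; the benchmarked modules also return the updated residual when one is passed, which is omitted here):

```python
import torch


def rmsnorm_reference(
    x: torch.Tensor,
    weight: torch.Tensor,
    residual: torch.Tensor | None = None,
    eps: float = 1e-6,
) -> torch.Tensor:
    # Work in float32 for numerical stability, then cast back.
    orig_dtype = x.dtype
    x = x.to(torch.float32)
    if residual is not None:
        x = x + residual.to(torch.float32)
    # RMSNorm: scale by the reciprocal root-mean-square over the last dimension.
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    x = x * torch.rsqrt(variance + eps)
    return (weight.to(torch.float32) * x).to(orig_dtype)
```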

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from itertools import accumulate from itertools import accumulate
from typing import Optional
import nvtx import nvtx
import torch import torch
@ -17,7 +18,7 @@ def benchmark_rope_kernels_multi_lora(
seq_len: int, seq_len: int,
num_heads: int, num_heads: int,
head_size: int, head_size: int,
rotary_dim: int | None, rotary_dim: Optional[int],
dtype: torch.dtype, dtype: torch.dtype,
seed: int, seed: int,
device: str, device: str,

View File

@ -4,6 +4,7 @@
import csv import csv
import os import os
from datetime import datetime from datetime import datetime
from typing import Optional
import flashinfer import flashinfer
import torch import torch
@ -27,7 +28,9 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad() @torch.no_grad()
def benchmark_decode( def benchmark_decode(
dtype: torch.dtype, dtype: torch.dtype,
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], quant_dtypes: tuple[
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
],
batch_size: int, batch_size: int,
max_seq_len: int, max_seq_len: int,
num_heads: tuple[int, int] = (64, 8), num_heads: tuple[int, int] = (64, 8),

View File

@ -4,6 +4,7 @@
import csv import csv
import os import os
from datetime import datetime from datetime import datetime
from typing import Optional
import flashinfer import flashinfer
import torch import torch
@ -27,7 +28,9 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad() @torch.no_grad()
def benchmark_prefill( def benchmark_prefill(
dtype: torch.dtype, dtype: torch.dtype,
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], quant_dtypes: tuple[
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
],
batch_size: int, batch_size: int,
max_seq_len: int, max_seq_len: int,
num_heads: tuple[int, int] = (64, 8), num_heads: tuple[int, int] = (64, 8),

View File

@ -2,8 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses import dataclasses
from collections.abc import Callable, Iterable from collections.abc import Iterable
from typing import Any from typing import Any, Callable, Optional
import torch import torch
import torch.utils.benchmark as TBenchmark import torch.utils.benchmark as TBenchmark
@ -55,7 +55,7 @@ class Bench:
def __init__( def __init__(
self, self,
cuda_graph_params: CudaGraphBenchParams | None, cuda_graph_params: Optional[CudaGraphBenchParams],
label: str, label: str,
sub_label: str, sub_label: str,
description: str, description: str,

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from statistics import mean from statistics import mean
from typing import Any, NamedTuple from typing import Any, NamedTuple, Optional, Union
import numpy as np # type: ignore import numpy as np # type: ignore
import pandas as pd # type: ignore import pandas as pd # type: ignore
@ -35,8 +35,8 @@ class Distribution(ABC):
class UniformDistribution(Distribution): class UniformDistribution(Distribution):
def __init__( def __init__(
self, self,
min_val: int | float, min_val: Union[int, float],
max_val: int | float, max_val: Union[int, float],
is_integer: bool = True, is_integer: bool = True,
) -> None: ) -> None:
self.min_val = min_val self.min_val = min_val
@ -56,7 +56,7 @@ class UniformDistribution(Distribution):
class ConstantDistribution(Distribution): class ConstantDistribution(Distribution):
def __init__(self, value: int | float) -> None: def __init__(self, value: Union[int, float]) -> None:
self.value = value self.value = value
self.max_val = value self.max_val = value
@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):
class ZipfDistribution(Distribution): class ZipfDistribution(Distribution):
def __init__(self, alpha: float, max_val: int | None = None) -> None: def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
self.alpha = alpha self.alpha = alpha
self.max_val = max_val self.max_val = max_val
@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):
class PoissonDistribution(Distribution): class PoissonDistribution(Distribution):
def __init__(self, alpha: float, max_val: int | None = None) -> None: def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
self.alpha = alpha self.alpha = alpha
self.max_val = max_val self.max_val = max_val
@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
class LognormalDistribution(Distribution): class LognormalDistribution(Distribution):
def __init__( def __init__(
self, self,
mean: float | None = None, mean: Optional[float] = None,
sigma: float | None = None, sigma: Optional[float] = None,
average: int | None = None, average: Optional[int] = None,
median_ratio: float | None = None, median_ratio: Optional[float] = None,
max_val: int | None = None, max_val: Optional[int] = None,
) -> None: ) -> None:
self.average = average self.average = average
self.median_ratio = median_ratio self.median_ratio = median_ratio
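`LognormalDistribution` accepts either (`mean`, `sigma`) or a target `average` with a `median_ratio`. A hedged sketch of one consistent way to derive the underlying parameters from the latter pair, using the lognormal identities median = exp(mu) and mean = exp(mu + sigma^2 / 2); the actual class may derive them differently:

```python
import numpy as np


def lognormal_params_from_average(average: float, median_ratio: float) -> tuple[float, float]:
    # median / mean = exp(-sigma**2 / 2), so sigma follows from the ratio
    # (requires 0 < median_ratio <= 1), and mu is back-solved from the mean.
    sigma = float(np.sqrt(-2.0 * np.log(median_ratio)))
    mu = float(np.log(average)) - sigma**2 / 2.0
    return mu, sigma


def sample_lengths(average: float, median_ratio: float, size: int,
                   max_val: int | None = None) -> np.ndarray:
    mu, sigma = lognormal_params_from_average(average, median_ratio)
    samples = np.random.lognormal(mean=mu, sigma=sigma, size=size)
    if max_val is not None:
        samples = np.minimum(samples, max_val)
    return samples.astype(int)
```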

View File

@ -13,7 +13,7 @@ from datetime import datetime
from enum import Enum from enum import Enum
from http import HTTPStatus from http import HTTPStatus
from statistics import mean from statistics import mean
from typing import NamedTuple from typing import NamedTuple, Union
import aiohttp # type: ignore import aiohttp # type: ignore
import numpy as np # type: ignore import numpy as np # type: ignore
@ -169,7 +169,7 @@ class MovingAverage:
class DebugStats: class DebugStats:
def __init__(self, logger: logging.Logger, window_size: int) -> None: def __init__(self, logger: logging.Logger, window_size: int) -> None:
self.logger = logger self.logger = logger
self.metrics: dict[str, MovingAverage | MetricStats] = { self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
"moving_avg_ttft_ms": MovingAverage(window_size), "moving_avg_ttft_ms": MovingAverage(window_size),
"moving_avg_tpot_ms": MovingAverage(window_size), "moving_avg_tpot_ms": MovingAverage(window_size),
"ttft_ms": MetricStats(), "ttft_ms": MetricStats(),
@ -636,7 +636,7 @@ async def client_main(
if args.verbose: if args.verbose:
curr_time_sec: float = time.perf_counter() curr_time_sec: float = time.perf_counter()
time_since_last_turn: str | float = "N/A" time_since_last_turn: Union[str, float] = "N/A"
if conv_id in time_of_last_turn: if conv_id in time_of_last_turn:
time_since_last_turn = round( time_since_last_turn = round(
curr_time_sec - time_of_last_turn[conv_id], 3 curr_time_sec - time_of_last_turn[conv_id], 3
@ -928,13 +928,13 @@ async def main_mp(
f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501 f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
) )
rps: str | float = round(len(client_metrics) / runtime_sec, 3) rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
if len(client_metrics) < (5 * bench_args.num_clients): if len(client_metrics) < (5 * bench_args.num_clients):
# Do not estimate the RPS if the number of samples is very low # Do not estimate the RPS if the number of samples is very low
# (threshold can be tuned if needed) # (threshold can be tuned if needed)
rps = "N/A" rps = "N/A"
runtime_left_sec: str | float = round( runtime_left_sec: Union[str, float] = round(
(runtime_sec / finished_convs) * (total_convs - finished_convs), 3 (runtime_sec / finished_convs) * (total_convs - finished_convs), 3
) )
if percent < 0.05: if percent < 0.05:
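The two `round(...)` expressions above are a rate estimate that is suppressed when too few samples exist and a simple linear extrapolation of the remaining runtime. In plain form (a sketch; the function name is invented):

```python
def estimate_progress(
    runtime_sec: float,
    finished_convs: int,
    total_convs: int,
    num_measurements: int,
    num_clients: int,
) -> tuple[str | float, float]:
    # RPS is only reported once at least ~5 measurements per client exist.
    rps: str | float = round(num_measurements / runtime_sec, 3)
    if num_measurements < 5 * num_clients:
        rps = "N/A"
    # Remaining time: average seconds per finished conversation times the
    # number of conversations still outstanding.
    runtime_left_sec = round(
        (runtime_sec / finished_convs) * (total_convs - finished_convs), 3
    )
    return rps, runtime_left_sec
```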

View File

@ -13,7 +13,7 @@ import argparse
import json import json
import random import random
from statistics import mean from statistics import mean
from typing import Any from typing import Any, Optional
import pandas as pd # type: ignore import pandas as pd # type: ignore
import tqdm # type: ignore import tqdm # type: ignore
@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
def content_is_valid( def content_is_valid(
content: str, min_content_len: int | None, max_content_len: int | None content: str, min_content_len: Optional[int], max_content_len: Optional[int]
) -> bool: ) -> bool:
if min_content_len and len(content) < min_content_len: if min_content_len and len(content) < min_content_len:
return False return False
@ -37,7 +37,7 @@ def content_is_valid(
def print_stats( def print_stats(
conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
) -> None: ) -> None:
# Collect statistics # Collect statistics
stats = [] stats = []
@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
seed: int, seed: int,
input_file: str, input_file: str,
output_file: str, output_file: str,
max_items: int | None, max_items: Optional[int],
min_content_len: int | None = None, min_content_len: Optional[int] = None,
max_content_len: int | None = None, max_content_len: Optional[int] = None,
min_turns: int | None = None, min_turns: Optional[int] = None,
max_turns: int | None = None, max_turns: Optional[int] = None,
model: str | None = None, model: Optional[str] = None,
) -> None: ) -> None:
if min_turns and max_turns: if min_turns and max_turns:
assert min_turns <= max_turns assert min_turns <= max_turns
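`content_is_valid` filters conversation turns by length, but only the minimum-length branch is visible in this hunk. A completed sketch under the assumption that the maximum-length branch mirrors it:

```python
def content_is_valid_sketch(
    content: str,
    min_content_len: int | None,
    max_content_len: int | None,
) -> bool:
    # Reject turns shorter than the configured minimum...
    if min_content_len and len(content) < min_content_len:
        return False
    # ...and (assumed symmetric) turns longer than the configured maximum.
    if max_content_len and len(content) > max_content_len:
        return False
    return True
```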

View File

@ -22,10 +22,10 @@ else()
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
BUILD_COMMAND "" BUILD_COMMAND ""
) )
FetchContent_Populate(qutlass)
set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
endif() endif()
FetchContent_Populate(qutlass)
if(NOT qutlass_SOURCE_DIR) if(NOT qutlass_SOURCE_DIR)
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.") message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
endif() endif()

View File

@ -1,12 +0,0 @@
codecov:
require_ci_to_pass: false
fixes:
# Map source code paths to repository root paths
# Wildcards match any Python version (python3.*)
- "/vllm-workspace/src/vllm/::vllm/"
- "/vllm-workspace/vllm/::vllm/"
- "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
- "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/lib/python3.*/site-packages/vllm/::vllm/"

View File

@ -125,37 +125,32 @@ public:
} }
static void set_split_kv (KernelArguments& args) { static void set_split_kv (KernelArguments& args) {
// printf("set_split_kv start");
if (args.split_kv >= 1) return; if (args.split_kv >= 1) return;
auto [H, K, D, B] = args.problem_shape; auto [H, K, D, B] = args.problem_shape;
// std::cout << H << " " << K << " " << D << " " << B << "\n";
int sm_count = args.hw_info.sm_count; int sm_count = args.hw_info.sm_count;
float seq_length_k = static_cast<float>(K) / 1024.0f; // printf(" sm_count = %d\n", sm_count);
int max_splits = 1; int max_splits = ceil_div(K, 128);
max_splits = min(16, max_splits);
if (B <= 4 && seq_length_k >= 16) { // TODO: This avoids a hang when the batch size is larger than 1 and
max_splits = 16; // there is more than 1 kv_splits.
// Discuss with NVIDIA how this can be fixed.
if (B > 1) {
max_splits = min(1, max_splits);
} }
else if (B <= 8 && seq_length_k >= 4) {
max_splits = 8; // printf(" max_splits = %d\n", max_splits);
}
else if ((B <= 16 && seq_length_k >= 8) ||
(B == 48 && seq_length_k >= 32)) {
max_splits = 4;
}
else if ((B <= 32 && seq_length_k >= 16) ||
(B == 96 && seq_length_k >= 16)) {
max_splits = 2;
}
else {
max_splits = 1;
}
// Wave-aware scheduling: ensure integer number of waves in K dimension
int sms_per_batch = max(1, sm_count / B); int sms_per_batch = max(1, sm_count / B);
// printf(" sms_per_batch = %d\n", sms_per_batch);
int split_heur = min(max_splits, sms_per_batch); int split_heur = min(max_splits, sms_per_batch);
int waves = ceil_div(B * split_heur, sm_count); int waves = ceil_div(B * split_heur, sm_count);
int k_waves = ceil_div(max_splits, split_heur); int k_waves = ceil_div(max_splits, split_heur);
int split_wave_aware = ceil_div(max_splits, k_waves); int split_wave_aware = ceil_div(max_splits, k_waves);
args.split_kv = split_wave_aware; args.split_kv = split_wave_aware;
// printf(" args.split_kv = %d\n", args.split_kv);
} }
/// Determines whether the GEMM can execute the given problem. /// Determines whether the GEMM can execute the given problem.
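Both sides of the `set_split_kv` change share the same wave-aware tail once `max_splits` has been chosen; only the derivation of `max_splits` differs. A Python sketch of that shared tail (function names invented for illustration):

```python
def ceil_div(a: int, b: int) -> int:
    return (a + b - 1) // b


def wave_aware_split_kv(max_splits: int, batch_size: int, sm_count: int) -> int:
    # Cap the split count by the SMs available per batch element...
    sms_per_batch = max(1, sm_count // batch_size)
    split_heur = min(max_splits, sms_per_batch)
    # ...then round so the K dimension is covered by an integer number of waves.
    k_waves = ceil_div(max_splits, split_heur)
    return ceil_div(max_splits, k_waves)
```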

View File

@ -5,11 +5,11 @@
namespace vllm { namespace vllm {
// vllm_is_batch_invariant(); returns true // vllm_kernel_override_batch_invariant(); returns true
// if env VLLM_BATCH_INVARIANT=1 // if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
inline bool vllm_is_batch_invariant() { inline bool vllm_kernel_override_batch_invariant() {
static bool cached = []() { static bool cached = []() {
std::string env_key = "VLLM_BATCH_INVARIANT"; std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
const char* val = std::getenv(env_key.c_str()); const char* val = std::getenv(env_key.c_str());
return (val && std::atoi(val) != 0) ? 1 : 0; return (val && std::atoi(val) != 0) ? 1 : 0;
}(); }();
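The helper above caches the environment lookup in a function-local static. A rough Python equivalent of the same pattern; note that the environment-variable name differs between the two sides of this hunk:

```python
import functools
import os


@functools.lru_cache(maxsize=None)
def is_batch_invariant(env_key: str = "VLLM_BATCH_INVARIANT") -> bool:
    # Read the flag once and reuse the result; any non-zero integer enables it,
    # roughly mirroring the atoi-based check in the C++ helper.
    val = os.getenv(env_key)
    try:
        return val is not None and int(val) != 0
    except ValueError:
        return False
```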

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import enum import enum
from typing import Union
from cutlass_library import * from cutlass_library import *
@ -21,7 +22,7 @@ class MixedInputKernelScheduleType(enum.Enum):
TmaWarpSpecializedCooperative = enum_auto() TmaWarpSpecializedCooperative = enum_auto()
VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = { VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
**DataTypeNames, # type: ignore **DataTypeNames, # type: ignore
**{ **{
VLLMDataType.u4b8: "u4b8", VLLMDataType.u4b8: "u4b8",
@ -29,7 +30,7 @@ VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
}, },
} }
VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = { VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
**DataTypeTag, # type: ignore **DataTypeTag, # type: ignore
**{ **{
VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t", VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
@ -37,7 +38,7 @@ VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
}, },
} }
VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = { VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
**DataTypeSize, # type: ignore **DataTypeSize, # type: ignore
**{ **{
VLLMDataType.u4b8: 4, VLLMDataType.u4b8: 4,
@ -45,7 +46,7 @@ VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
}, },
} }
VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = { VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
VLLMDataType.u4b8: "vllm::kU4B8", VLLMDataType.u4b8: "vllm::kU4B8",
VLLMDataType.u8b128: "vllm::kU8B128", VLLMDataType.u8b128: "vllm::kU8B128",
DataType.u4: "vllm::kU4", DataType.u4: "vllm::kU4",
@ -56,7 +57,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
DataType.bf16: "vllm::kBfloat16", DataType.bf16: "vllm::kBfloat16",
} }
VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = { VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
DataType.u8: "at::ScalarType::Byte", DataType.u8: "at::ScalarType::Byte",
DataType.s8: "at::ScalarType::Char", DataType.s8: "at::ScalarType::Char",
DataType.e4m3: "at::ScalarType::Float8_e4m3fn", DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
@ -66,7 +67,9 @@ VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
DataType.f32: "at::ScalarType::Float", DataType.f32: "at::ScalarType::Float",
} }
VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = { VLLMKernelScheduleTag: dict[
Union[MixedInputKernelScheduleType, KernelScheduleType], str
] = {
**KernelScheduleTag, # type: ignore **KernelScheduleTag, # type: ignore
**{ **{
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501 MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501

View File

@ -2,7 +2,6 @@
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "cub_helpers.h" #include "cub_helpers.h"
#include "core/batch_invariant.hpp" #include "core/batch_invariant.hpp"
#include "quantization/vectorization_utils.cuh"
#include <torch/cuda.h> #include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
@ -19,22 +18,11 @@ __global__ void rms_norm_kernel(
const float epsilon, const int num_tokens, const int hidden_size) { const float epsilon, const int num_tokens, const int hidden_size) {
__shared__ float s_variance; __shared__ float s_variance;
float variance = 0.0f; float variance = 0.0f;
const scalar_t* input_row = input + blockIdx.x * input_stride;
constexpr int VEC_SIZE = 8; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) { const float x = (float)input[blockIdx.x * input_stride + idx];
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
float x = static_cast<float>(vec.val[i]);
variance += x * x;
}
};
auto scalar_op = [&variance](const scalar_t& val) {
float x = static_cast<float>(val);
variance += x * x; variance += x * x;
}; }
vllm::vectorize_read_with_alignment<VEC_SIZE>(
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
using BlockReduce = cub::BlockReduce<float, 1024>; using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore; __shared__ typename BlockReduce::TempStorage reduceStore;
@ -426,7 +414,7 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
wt_ptr % req_alignment_bytes == 0; wt_ptr % req_alignment_bytes == 0;
bool offsets_are_multiple_of_vector_width = bool offsets_are_multiple_of_vector_width =
hidden_size % vector_width == 0 && input_stride % vector_width == 0; hidden_size % vector_width == 0 && input_stride % vector_width == 0;
bool batch_invariant_launch = vllm::vllm_is_batch_invariant(); bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width && if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
!batch_invariant_launch) { !batch_invariant_launch) {
LAUNCH_FUSED_ADD_RMS_NORM(8); LAUNCH_FUSED_ADD_RMS_NORM(8);
@ -474,7 +462,7 @@ void poly_norm(torch::Tensor& out, // [..., hidden_size]
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr()); auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr()); auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0; bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
bool batch_invariant_launch = vllm::vllm_is_batch_invariant(); bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) { if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
LAUNCH_FUSED_POLY_NORM(8); LAUNCH_FUSED_POLY_NORM(8);
} else { } else {

View File

@ -10,7 +10,6 @@
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "cub_helpers.h" #include "cub_helpers.h"
#include "core/batch_invariant.hpp" #include "core/batch_invariant.hpp"
#include "quantization/vectorization_utils.cuh"
#include <torch/cuda.h> #include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
@ -29,22 +28,10 @@ __global__ void rms_norm_static_fp8_quant_kernel(
__shared__ float s_variance; __shared__ float s_variance;
float variance = 0.0f; float variance = 0.0f;
const scalar_t* input_row = input + blockIdx.x * input_stride; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
const float x = (float)input[blockIdx.x * input_stride + idx];
constexpr int VEC_SIZE = 8;
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
float x = static_cast<float>(vec.val[i]);
variance += x * x;
}
};
auto scalar_op = [&variance](const scalar_t& val) {
float x = static_cast<float>(val);
variance += x * x; variance += x * x;
}; }
vllm::vectorize_read_with_alignment<VEC_SIZE>(
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
using BlockReduce = cub::BlockReduce<float, 1024>; using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore; __shared__ typename BlockReduce::TempStorage reduceStore;
@ -254,7 +241,7 @@ void fused_add_rms_norm_static_fp8_quant(
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr()); auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
bool ptrs_are_aligned = bool ptrs_are_aligned =
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0; inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
bool batch_invariant_launch = vllm::vllm_is_batch_invariant(); bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 && if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
!batch_invariant_launch) { !batch_invariant_launch) {
LAUNCH_FUSED_ADD_RMS_NORM(8); LAUNCH_FUSED_ADD_RMS_NORM(8);

View File

@ -8,77 +8,12 @@
#include "../cuda_compat.h" #include "../cuda_compat.h"
#include "../dispatch_utils.h" #include "../dispatch_utils.h"
#include "core/math.hpp"
#define CEILDIV(x, y) (((x) + (y) - 1) / (y)) #define CEILDIV(x, y) (((x) + (y) - 1) / (y))
namespace vllm { namespace vllm {
namespace moe { namespace moe {
namespace batched_moe_align_block_size {
// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
static constexpr int32_t num_threads = 1024;
static constexpr int32_t num_blocks = 1;
__global__ void batched_moe_align_block_size_kernel(
int32_t const num_batches, int32_t const max_tokens_per_batch,
int32_t const block_size, int32_t const* __restrict__ batch_num_tokens,
int32_t* __restrict__ sorted_ids, int32_t* __restrict__ block_ids,
int32_t* __restrict__ num_tokens_post_pad) {
// TODO(varun): This is a naive implementation. Could be optimized.
size_t const batch_id = threadIdx.x;
size_t const stride = blockDim.x * gridDim.x;
int32_t const num_blocks_per_batch =
CEILDIV(max_tokens_per_batch, block_size);
int32_t const sorted_ids_size =
num_blocks_per_batch * num_batches * block_size;
int32_t const block_ids_size = sorted_ids_size / block_size;
int32_t const SENTINEL =
num_batches * max_tokens_per_batch; // To denote invalid entries.
// Initialize sorted_ids
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
sorted_ids[i] = SENTINEL;
}
// Initialize expert_ids with -1
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
block_ids[i] = -1;
}
int32_t b_num_tokens = 0;
if (batch_id < num_batches) {
b_num_tokens = batch_num_tokens[batch_id];
}
int32_t const ceil_b_num_tokens =
CEILDIV(b_num_tokens, block_size) * block_size;
// Compute prefix sum over token counts per expert
using BlockScan = cub::BlockScan<int32_t, 1024>;
__shared__ typename BlockScan::TempStorage temp_storage;
int cumsum_val;
BlockScan(temp_storage).ExclusiveSum(ceil_b_num_tokens, cumsum_val);
__syncthreads();
bool const is_last_batch = batch_id == (num_batches - 1);
if (is_last_batch) {
*num_tokens_post_pad = cumsum_val + ceil_b_num_tokens;
}
if (batch_id < num_batches) {
int32_t const batch_offset = batch_id * max_tokens_per_batch;
for (size_t i = 0; i < b_num_tokens; ++i) {
sorted_ids[cumsum_val + i] = batch_offset + i;
}
int32_t const block_start = cumsum_val / block_size;
int32_t const num_blocks = ceil_b_num_tokens / block_size;
for (size_t i = 0; i < num_blocks; ++i) {
block_ids[block_start + i] = batch_id;
}
}
}
} // namespace batched_moe_align_block_size
template <typename scalar_t> template <typename scalar_t>
__global__ void moe_align_block_size_kernel( __global__ void moe_align_block_size_kernel(
const scalar_t* __restrict__ topk_ids, const scalar_t* __restrict__ topk_ids,
@ -345,33 +280,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
}); });
} }
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
int64_t block_size,
torch::Tensor const& batch_num_tokens,
torch::Tensor sorted_ids,
torch::Tensor batch_ids,
torch::Tensor num_tokens_post_pad) {
namespace batched_kernel = vllm::moe::batched_moe_align_block_size;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int32_t const B = batch_num_tokens.size(0);
int32_t const num_blocks_per_batch =
round_to_next_multiple_of(max_tokens_per_batch, block_size) / block_size;
int32_t const num_blocks = num_blocks_per_batch * B;
int64_t const sorted_ids_size = num_blocks * block_size;
TORCH_CHECK(sorted_ids.size(0) == sorted_ids_size);
TORCH_CHECK(batch_ids.size(0) == sorted_ids_size / block_size);
TORCH_CHECK(num_tokens_post_pad.size(0) == 1);
TORCH_CHECK(B <= batched_kernel::num_threads);
batched_kernel::batched_moe_align_block_size_kernel<<<
batched_kernel::num_blocks, batched_kernel::num_threads, 0, stream>>>(
B, max_tokens_per_batch, block_size, batch_num_tokens.data_ptr<int32_t>(),
sorted_ids.data_ptr<int32_t>(), batch_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>());
}
void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size] void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
torch::Tensor& output) // [num_tokens, hidden_size] torch::Tensor& output) // [num_tokens, hidden_size]
{ {
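For reference, the removed `batched_moe_align_block_size_kernel` can be described in a few lines of plain Python (a sketch mirroring the deleted code, not a drop-in replacement):

```python
import math


def batched_moe_align_block_size_sketch(
    max_tokens_per_batch: int,
    block_size: int,
    batch_num_tokens: list[int],
) -> tuple[list[int], list[int], int]:
    """Pad each batch's token count up to a multiple of block_size, lay token
    indices out contiguously, and record which batch owns each block."""
    num_batches = len(batch_num_tokens)
    blocks_per_batch = math.ceil(max_tokens_per_batch / block_size)
    sentinel = num_batches * max_tokens_per_batch  # marks unused slots
    sorted_ids = [sentinel] * (blocks_per_batch * num_batches * block_size)
    block_ids = [-1] * (len(sorted_ids) // block_size)

    cumsum = 0  # exclusive prefix sum of padded per-batch token counts
    for batch_id, n_tokens in enumerate(batch_num_tokens):
        ceil_n = math.ceil(n_tokens / block_size) * block_size
        for i in range(n_tokens):
            sorted_ids[cumsum + i] = batch_id * max_tokens_per_batch + i
        block_start = cumsum // block_size
        for i in range(ceil_n // block_size):
            block_ids[block_start + i] = batch_id
        cumsum += ceil_n

    return sorted_ids, block_ids, cumsum  # cumsum == num_tokens_post_pad
```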

View File

@ -12,14 +12,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids, int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids, torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad); torch::Tensor num_tokens_post_pad);
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
int64_t block_size,
torch::Tensor const& expert_num_tokens,
torch::Tensor sorted_ids,
torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad);
#ifndef USE_ROCM #ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
torch::Tensor b_qweight, torch::Tensor b_scales, torch::Tensor b_qweight, torch::Tensor b_scales,

View File

@ -21,6 +21,7 @@
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include "../cuda_compat.h" #include "../cuda_compat.h"
#include "../cub_helpers.h" #include "../cub_helpers.h"
#include "../core/batch_invariant.hpp"
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -405,7 +406,8 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>; using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
static constexpr int VPT = Constants::VPT; static constexpr int VPT = Constants::VPT;
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP; static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP; const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB; const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB); dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
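The only behavioral change in this hunk is the warp count used when batch-invariant mode is enabled. A sketch of the launch-shape arithmetic (function names invented; pinning the warp count presumably keeps the launch shape, and hence the reduction order, independent of batch size):

```python
def ceil_div(a: int, b: int) -> int:
    return (a + b - 1) // b


def gating_softmax_num_blocks(
    num_rows: int, rows_per_warp: int, warps_per_tb: int, batch_invariant: bool
) -> int:
    # Normally the warp count scales with the number of rows; in batch-invariant
    # mode it is pinned to a fixed value instead.
    num_warps = 32 if batch_invariant else ceil_div(num_rows, rows_per_warp)
    return ceil_div(num_warps, warps_per_tb)
```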

View File

@ -22,17 +22,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor! num_tokens_post_pad) -> ()"); " Tensor! num_tokens_post_pad) -> ()");
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
// Aligning the number of tokens to be processed by each expert such
// that it is divisible by the block size, but for the batched case.
m.def(
"batched_moe_align_block_size(int max_tokens_per_batch,"
" int block_size, Tensor expert_num_tokens,"
" Tensor! sorted_token_ids,"
" Tensor! experts_ids,"
" Tensor! num_tokens_post_pad) -> ()");
m.impl("batched_moe_align_block_size", torch::kCUDA,
&batched_moe_align_block_size);
#ifndef USE_ROCM #ifndef USE_ROCM
m.def( m.def(
"moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, " "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "

View File

@ -9,6 +9,7 @@ from collections.abc import Iterable
from copy import deepcopy from copy import deepcopy
from dataclasses import dataclass, fields from dataclasses import dataclass, fields
from functools import reduce from functools import reduce
from typing import Optional, Union
import jinja2 import jinja2
from vllm_cutlass_library_extension import ( from vllm_cutlass_library_extension import (
@ -258,7 +259,7 @@ class ScheduleConfig:
@dataclass(frozen=True) @dataclass(frozen=True)
class TypeConfig: class TypeConfig:
a: DataType a: DataType
b: DataType | VLLMDataType b: Union[DataType, VLLMDataType]
b_group_scale: DataType b_group_scale: DataType
b_group_zeropoint: DataType b_group_zeropoint: DataType
b_channel_scale: DataType b_channel_scale: DataType
@ -279,7 +280,7 @@ class PrepackTypeConfig:
class ImplConfig: class ImplConfig:
types: TypeConfig types: TypeConfig
schedules: list[ScheduleConfig] schedules: list[ScheduleConfig]
heuristic: list[tuple[str | None, ScheduleConfig]] heuristic: list[tuple[Optional[str], ScheduleConfig]]
def generate_sch_sig(schedule_config: ScheduleConfig) -> str: def generate_sch_sig(schedule_config: ScheduleConfig) -> str:

View File

@ -22,14 +22,13 @@ template <typename AllReduceKernel, typename T>
__global__ __quickreduce_launch_bounds_two_shot__ static void __global__ __quickreduce_launch_bounds_two_shot__ static void
allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks, allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
int rank, uint8_t** dbuffer_list, int rank, uint8_t** dbuffer_list,
uint32_t data_offset, uint32_t flag_color, uint32_t data_offset, uint32_t flag_color) {
int64_t data_size_per_phase) {
int block = blockIdx.x; int block = blockIdx.x;
int grid = gridDim.x; int grid = gridDim.x;
while (block < num_blocks) { while (block < num_blocks) {
AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset, AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset,
flag_color, data_size_per_phase); flag_color);
block += grid; block += grid;
flag_color++; flag_color++;
} }
@ -42,21 +41,21 @@ allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \ hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \ dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \ num_blocks, rank, dbuffer_list, data_offset, \
flag_color, this->kMaxProblemSize); \ flag_color); \
} else if (world_size == 4) { \ } else if (world_size == 4) { \
using LineCodec = __codec<T, 4>; \ using LineCodec = __codec<T, 4>; \
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \ using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \ hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \ dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \ num_blocks, rank, dbuffer_list, data_offset, \
flag_color, this->kMaxProblemSize); \ flag_color); \
} else if (world_size == 8) { \ } else if (world_size == 8) { \
using LineCodec = __codec<T, 8>; \ using LineCodec = __codec<T, 8>; \
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \ using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \ hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \ dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \ num_blocks, rank, dbuffer_list, data_offset, \
flag_color, this->kMaxProblemSize); \ flag_color); \
} }
enum QuickReduceQuantLevel { enum QuickReduceQuantLevel {

View File

@ -553,12 +553,13 @@ struct AllReduceTwoshot {
int const rank, // rank index int const rank, // rank index
uint8_t** __restrict__ buffer_list, // communication buffers uint8_t** __restrict__ buffer_list, // communication buffers
uint32_t const data_offset, // offset to start of the data buffer uint32_t const data_offset, // offset to start of the data buffer
uint32_t flag_color, int64_t data_size_per_phase) { uint32_t flag_color) {
// Topology // Topology
int thread = threadIdx.x + threadIdx.y * kWavefront; int thread = threadIdx.x + threadIdx.y * kWavefront;
uint8_t* rank_buffer = buffer_list[rank]; uint8_t* rank_buffer = buffer_list[rank];
Codec codec(thread, rank); Codec codec(thread, rank);
int block_id = blockIdx.x; int block_id = blockIdx.x;
int grid_size = gridDim.x;
// -------------------------------------------------------- // --------------------------------------------------------
// Read input into registers // Read input into registers
int32x4_t tA[kAtoms]; int32x4_t tA[kAtoms];
@ -587,10 +588,12 @@ struct AllReduceTwoshot {
// rank responsible for this segment. // rank responsible for this segment.
uint32_t comm_data0_offset = uint32_t comm_data0_offset =
data_offset + block_id * Codec::kTransmittedTileSize; data_offset + block_id * Codec::kTransmittedTileSize;
uint32_t comm_data1_offset = data_size_per_phase + comm_data0_offset; uint32_t comm_data1_offset =
grid_size * Codec::kTransmittedTileSize + comm_data0_offset;
uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t)); uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t));
uint32_t comm_flags1_offset = (data_offset / 2) + comm_flags0_offset; uint32_t comm_flags1_offset =
grid_size * (kWorldSize * sizeof(uint32_t)) + comm_flags0_offset;
for (int r = 0; r < kWorldSize; r++) { for (int r = 0; r < kWorldSize; r++) {
int32x4_t* send_buffer = int32x4_t* send_buffer =

View File

@ -229,7 +229,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Check the size of the wheel if RUN_WHEEL_CHECK is true # Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=500 ARG VLLM_MAX_SIZE_MB=450
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
@ -359,8 +359,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
# Install FlashInfer pre-compiled kernel cache and binaries # Install FlashInfer pre-compiled kernel cache and binaries
# https://docs.flashinfer.ai/installation.html # https://docs.flashinfer.ai/installation.html
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system flashinfer-cubin==0.4.1 \ uv pip install --system flashinfer-cubin==0.4.0 \
&& uv pip install --system flashinfer-jit-cache==0.4.1 \ && uv pip install --system flashinfer-jit-cache==0.4.0 \
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
&& flashinfer show-config && flashinfer show-config

View File

@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
# build flashinfer for torch nightly from source around 10 mins # build flashinfer for torch nightly from source around 10 mins
# release version: v0.4.1 # release version: v0.4.0
# todo(elainewy): cache flashinfer build result for faster build # todo(elainewy): cache flashinfer build result for faster build
ENV CCACHE_DIR=/root/.cache/ccache ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \ RUN --mount=type=cache,target=/root/.cache/ccache \
@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
echo "git clone flashinfer..." \ echo "git clone flashinfer..." \
&& git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \ && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
&& cd flashinfer \ && cd flashinfer \
&& git checkout v0.4.1\ && git checkout v0.4.0 \
&& git submodule update --init --recursive \ && git submodule update --init --recursive \
&& echo "finish git clone flashinfer..." \ && echo "finish git clone flashinfer..." \
&& rm -rf build \ && rm -rf build \

View File

@ -12,7 +12,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
RUN apt-get update -q -y && apt-get install -q -y \ RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
apt-transport-https ca-certificates wget curl apt-transport-https ca-certificates wget curl
# Remove sccache # Remove sccache
RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
ARG COMMON_WORKDIR ARG COMMON_WORKDIR

View File

@ -11,7 +11,8 @@ The following code splits the model across 2 GPUs.
```python ```python
from vllm import LLM from vllm import LLM
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
tensor_parallel_size=2)
``` ```
!!! warning !!! warning
@ -23,7 +24,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
!!! note !!! note
With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
You can convert the model checkpoint to a sharded checkpoint using [examples/offline_inference/save_sharded_state.py](../../examples/offline_inference/save_sharded_state.py). The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
## Quantization ## Quantization
@ -42,7 +43,9 @@ and the maximum batch size (`max_num_seqs` option).
```python ```python
from vllm import LLM from vllm import LLM
llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2) llm = LLM(model="adept/fuyu-8b",
max_model_len=2048,
max_num_seqs=2)
``` ```
## Reduce CUDA Graphs ## Reduce CUDA Graphs
@ -58,12 +61,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc
```python ```python
from vllm import LLM from vllm import LLM
from vllm.config import CompilationConfig, CompilationMode from vllm.config import CompilationConfig, CompilationLevel
llm = LLM( llm = LLM(
model="meta-llama/Llama-3.1-8B-Instruct", model="meta-llama/Llama-3.1-8B-Instruct",
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE, level=CompilationLevel.PIECEWISE,
# By default, it goes up to max_num_seqs # By default, it goes up to max_num_seqs
cudagraph_capture_sizes=[1, 2, 4, 8, 16], cudagraph_capture_sizes=[1, 2, 4, 8, 16],
), ),
@ -75,7 +78,8 @@ You can disable graph capturing completely via the `enforce_eager` flag:
```python ```python
from vllm import LLM from vllm import LLM
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True) llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
enforce_eager=True)
``` ```
## Adjust cache size ## Adjust cache size
@ -93,10 +97,8 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem
from vllm import LLM from vllm import LLM
# Accept up to 3 images and 1 video per prompt # Accept up to 3 images and 1 video per prompt
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", limit_mm_per_prompt={"image": 3, "video": 1})
limit_mm_per_prompt={"image": 3, "video": 1},
)
``` ```
You can go a step further and disable unused modalities completely by setting its limit to zero. You can go a step further and disable unused modalities completely by setting its limit to zero.
@ -106,10 +108,8 @@ For example, if your application only accepts image input, there is no need to a
from vllm import LLM from vllm import LLM
# Accept any number of images but no videos # Accept any number of images but no videos
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", limit_mm_per_prompt={"video": 0})
limit_mm_per_prompt={"video": 0},
)
``` ```
You can even run a multi-modal model for text-only inference: You can even run a multi-modal model for text-only inference:
@ -118,10 +118,8 @@ You can even run a multi-modal model for text-only inference:
from vllm import LLM from vllm import LLM
# Don't accept images. Just text. # Don't accept images. Just text.
llm = LLM( llm = LLM(model="google/gemma-3-27b-it",
model="google/gemma-3-27b-it", limit_mm_per_prompt={"image": 0})
limit_mm_per_prompt={"image": 0},
)
``` ```
### Configurable options ### Configurable options
@ -175,14 +173,14 @@ Here are some examples:
from vllm import LLM from vllm import LLM
# Available for Qwen2-VL series models # Available for Qwen2-VL series models
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", mm_processor_kwargs={
mm_processor_kwargs={"max_pixels": 768 * 768}, # Default is 1280 * 28 * 28 "max_pixels": 768 * 768, # Default is 1280 * 28 * 28
) })
# Available for InternVL series models # Available for InternVL series models
llm = LLM( llm = LLM(model="OpenGVLab/InternVL2-2B",
model="OpenGVLab/InternVL2-2B", mm_processor_kwargs={
mm_processor_kwargs={"max_dynamic_patch": 4}, # Default is 12 "max_dynamic_patch": 4, # Default is 12
) })
``` ```

View File

@ -100,7 +100,7 @@ from vllm import LLM
llm = LLM( llm = LLM(
model="meta-llama/Llama-3.3-70B-Instruct, model="meta-llama/Llama-3.3-70B-Instruct,
tensor_parallel_size=4, tensor_parallel_size=4,
pipeline_parallel_size=2, pipeline_parallel_size=2
) )
``` ```
@ -174,14 +174,14 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
Known supported models (with corresponding benchmarks): Known supported models (with corresponding benchmarks):
- dots_ocr (<https://github.com/vllm-project/vllm/pull/25466>) - dots_ocr (<gh-pr:25466>)
- GLM-4.1V or above (<https://github.com/vllm-project/vllm/pull/23168>) - GLM-4.1V or above (<gh-pr:23168>)
- InternVL (<https://github.com/vllm-project/vllm/pull/23909>) - InternVL (<gh-pr:23909>)
- Kimi-VL (<https://github.com/vllm-project/vllm/pull/23817>) - Kimi-VL (<gh-pr:23817>)
- Llama4 (<https://github.com/vllm-project/vllm/pull/18368>) - Llama4 (<gh-pr:18368>)
- MiniCPM-V-2.5 or above (<https://github.com/vllm-project/vllm/pull/23327>, <https://github.com/vllm-project/vllm/pull/23948>) - MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
- Qwen2-VL or above (<https://github.com/vllm-project/vllm/pull/22742>, <https://github.com/vllm-project/vllm/pull/24955>, <https://github.com/vllm-project/vllm/pull/25445>) - Qwen2-VL or above (<gh-pr:22742>, <gh-pr:24955>, <gh-pr:25445>)
- Step3 (<https://github.com/vllm-project/vllm/pull/22697>) - Step3 (<gh-pr:22697>)
## Input Processing ## Input Processing
@ -257,24 +257,18 @@ Examples:
```python ```python
# Use a larger cache # Use a larger cache
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", mm_processor_cache_gb=8)
mm_processor_cache_gb=8,
)
# Use a shared-memory based IPC cache # Use a shared-memory based IPC cache
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", tensor_parallel_size=2,
tensor_parallel_size=2, mm_processor_cache_type="shm",
mm_processor_cache_type="shm", mm_processor_cache_gb=8)
mm_processor_cache_gb=8,
)
# Disable the cache # Disable the cache
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", mm_processor_cache_gb=0)
mm_processor_cache_gb=0,
)
``` ```
### Cache Placement ### Cache Placement

View File

@ -96,7 +96,7 @@ Although it's common to do this with GPUs, don't try to fragment 2 or 8 differ
### Tune your workloads ### Tune your workloads
Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case. Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](gh-file:benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
### Future Topics We'll Cover ### Future Topics We'll Cover

View File

@ -22,7 +22,7 @@ Unsure on where to start? Check out the following links for tasks to work on:
## License ## License
See [LICENSE](../../LICENSE). See <gh-file:LICENSE>.
## Developing ## Developing
@ -54,7 +54,7 @@ For more details about installing from source and installing for other hardware,
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations. For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
!!! tip !!! tip
vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](../../docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12. vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment. Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
@ -88,7 +88,7 @@ vLLM's `pre-commit` hooks will now run automatically every time you commit.
### Documentation ### Documentation
MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, [mkdocs.yaml](../../mkdocs.yaml). MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, <gh-file:mkdocs.yaml>.
Get started with: Get started with:
@ -152,7 +152,7 @@ pytest -s -v tests/test_logger.py
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
!!! important !!! important
If you discover a security vulnerability, please follow the instructions [here](../../SECURITY.md). If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
## Pull Requests & Code Reviews ## Pull Requests & Code Reviews
@ -162,7 +162,7 @@ code quality and improve the efficiency of the review process.
### DCO and Signed-off-by ### DCO and Signed-off-by
When contributing changes to this project, you must agree to the [DCO](../../DCO). When contributing changes to this project, you must agree to the <gh-file:DCO>.
Commits must include a `Signed-off-by:` header which certifies agreement with Commits must include a `Signed-off-by:` header which certifies agreement with
the terms of the DCO. the terms of the DCO.

View File

@ -35,7 +35,6 @@ th {
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` | | Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
| Random | ✅ | ✅ | `synthetic` | | Random | ✅ | ✅ | `synthetic` |
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` | | RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
| RandomForReranking | ✅ | ✅ | `synthetic` |
| Prefix Repetition | ✅ | ✅ | `synthetic` | | Prefix Repetition | ✅ | ✅ | `synthetic` |
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` | | HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
| HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` | | HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` |
@ -822,7 +821,7 @@ you should set `--endpoint /v1/embeddings` to use the Embeddings API. The backen
- CLIP: `--backend openai-embeddings-clip` - CLIP: `--backend openai-embeddings-clip`
- VLM2Vec: `--backend openai-embeddings-vlm2vec` - VLM2Vec: `--backend openai-embeddings-vlm2vec`
For other models, please add your own implementation inside [vllm/benchmarks/lib/endpoint_request_func.py](../../vllm/benchmarks/lib/endpoint_request_func.py) to match the expected instruction format. For other models, please add your own implementation inside <gh-file:vllm/benchmarks/lib/endpoint_request_func.py> to match the expected instruction format.
You can use any text or multi-modal dataset to benchmark the model, as long as the model supports it. You can use any text or multi-modal dataset to benchmark the model, as long as the model supports it.
For example, you can use ShareGPT and VisionArena to benchmark vision-language embeddings. For example, you can use ShareGPT and VisionArena to benchmark vision-language embeddings.
@ -879,51 +878,6 @@ vllm bench serve \
</details> </details>
#### Reranker Benchmark
Benchmark the performance of rerank requests in vLLM.
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
Unlike generative models which use Completions API or Chat Completions API,
you should set `--backend vllm-rerank` and `--endpoint /v1/rerank` to use the Reranker API.
For reranking, the only supported dataset is `--dataset-name random-rerank`.
Start the server:
```bash
vllm serve BAAI/bge-reranker-v2-m3
```
Run the benchmark:
```bash
vllm bench serve \
--model BAAI/bge-reranker-v2-m3 \
--backend vllm-rerank \
--endpoint /v1/rerank \
--dataset-name random-rerank \
--tokenizer BAAI/bge-reranker-v2-m3 \
--random-input-len 512 \
--num-prompts 10 \
--random-batch-size 5
```
For reranker models, this will create `num_prompts / random_batch_size` requests with
`random_batch_size` "documents" where each one has close to `random_input_len` tokens.
In the example above, this results in 2 rerank requests with 5 "documents" each where
each document has close to 512 tokens.
Please note that `/v1/rerank` is also supported by embedding models, so if you're running
with an embedding model, also set `--no_reranker`. In this case the query is treated as an
individual prompt by the server, so we send `random_batch_size - 1` documents to account for
the extra prompt, which is the query. The token accounting used to report throughput
numbers is adjusted accordingly.
</details>
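As a worked check of the arithmetic described in the removed section (a sketch; the helper name is invented here):

```python
def rerank_request_plan(
    num_prompts: int, random_batch_size: int, is_reranker: bool = True
) -> tuple[int, int]:
    # The benchmark issues num_prompts / random_batch_size rerank requests.
    num_requests = num_prompts // random_batch_size
    # With an embedding model (--no_reranker), one slot per request is the
    # query itself, so one fewer document is sent.
    docs_per_request = random_batch_size if is_reranker else random_batch_size - 1
    return num_requests, docs_per_request


# Example from the text: 10 prompts with batch size 5 -> 2 requests, 5 docs each.
assert rerank_request_plan(10, 5) == (2, 5)
```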
[](){ #performance-benchmarks } [](){ #performance-benchmarks }
## Performance Benchmarks ## Performance Benchmarks
@ -962,7 +916,7 @@ For more results visualization, check the [visualizing the results](https://gith
The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm). The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
### Continuous Benchmarking ### Continuous Benchmarking
@ -996,4 +950,4 @@ These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lm
The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html). The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html).
More information on the nightly benchmarks and their parameters can be found [here](../../.buildkite/nightly-benchmarks/nightly-descriptions.md). More information on the nightly benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/nightly-descriptions.md).

View File

@ -64,7 +64,7 @@ Download the full log file from Buildkite locally.
Strip timestamps and colorization: Strip timestamps and colorization:
[.buildkite/scripts/ci-clean-log.sh](../../../.buildkite/scripts/ci-clean-log.sh) <gh-file:.buildkite/scripts/ci-clean-log.sh>
```bash ```bash
./ci-clean-log.sh ci.log ./ci-clean-log.sh ci.log
@ -87,7 +87,7 @@ tail -525 ci_build.log | wl-copy
CI test failures may be flaky. Use a bash loop to run repeatedly: CI test failures may be flaky. Use a bash loop to run repeatedly:
[.buildkite/scripts/rerun-test.sh](../../../.buildkite/scripts/rerun-test.sh) <gh-file:.buildkite/scripts/rerun-test.sh>
```bash ```bash
./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp] ./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]

View File

@ -5,7 +5,7 @@ release in CI/CD. It is standard practice to submit a PR to update the
PyTorch version as early as possible when a new [PyTorch stable PyTorch version as early as possible when a new [PyTorch stable
release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available. release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available.
This process is non-trivial due to the gap between PyTorch This process is non-trivial due to the gap between PyTorch
releases. Using <https://github.com/vllm-project/vllm/pull/16859> as an example, this document outlines common steps to achieve this releases. Using <gh-pr:16859> as an example, this document outlines common steps to achieve this
update along with a list of potential issues and how to address them. update along with a list of potential issues and how to address them.
## Test PyTorch release candidates (RCs) ## Test PyTorch release candidates (RCs)
@ -85,7 +85,7 @@ and timeout. Additionally, since vLLM's fastcheck pipeline runs in read-only mod
it doesn't populate the cache, so re-running it to warm up the cache it doesn't populate the cache, so re-running it to warm up the cache
is ineffective. is ineffective.
While ongoing efforts like <https://github.com/vllm-project/vllm/issues/17419> While ongoing efforts like [#17419](gh-issue:17419)
address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH` address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`) to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
when manually triggering a build on Buildkite. This branch accomplishes two things: when manually triggering a build on Buildkite. This branch accomplishes two things:
@ -138,5 +138,5 @@ to handle some platforms separately. The separation of requirements and Dockerfi
for different platforms in vLLM CI/CD allows us to selectively choose for different platforms in vLLM CI/CD allows us to selectively choose
which platforms to update. For instance, updating XPU requires the corresponding which platforms to update. For instance, updating XPU requires the corresponding
release from [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch) by Intel. release from [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch) by Intel.
While <https://github.com/vllm-project/vllm/pull/16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm, While <gh-pr:16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
<https://github.com/vllm-project/vllm/pull/17444> completed the update for XPU. <gh-pr:17444> completed the update for XPU.

View File

@ -1,6 +1,6 @@
# Dockerfile # Dockerfile
We provide a [docker/Dockerfile](../../../docker/Dockerfile) to construct the image for running an OpenAI compatible server with vLLM. We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
More information about deploying with Docker can be found [here](../../deployment/docker.md). More information about deploying with Docker can be found [here](../../deployment/docker.md).
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:

View File

@ -5,7 +5,7 @@ This guide walks you through the steps to implement a basic vLLM model.
## 1. Bring your model code ## 1. Bring your model code
First, clone the PyTorch model code from the source repository. First, clone the PyTorch model code from the source repository.
For instance, vLLM's [OPT model](../../../vllm/model_executor/models/opt.py) was adapted from For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
!!! warning !!! warning
@ -73,8 +73,8 @@ def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: Optional[torch.Tensor] = None,
) -> torch.Tensor: ) -> torch.Tensor:
... ...
``` ```
@ -83,7 +83,7 @@ def forward(
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
For reference, check out our [Llama implementation](../../../vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out [vllm/model_executor/models](../../../vllm/model_executor/models) for more examples. For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
## 3. (Optional) Implement tensor parallelism and quantization support ## 3. (Optional) Implement tensor parallelism and quantization support
@ -130,22 +130,22 @@ We consider 3 different scenarios:
2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers. 2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers.
3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers. 3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers.
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](../../../vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](../../../vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference. For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config. The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
For the mamba layers themselves, please use the [`MambaMixer`](../../../vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](../../../vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes. For the mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations. Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
V0-only classes and code will be removed in the very near future. V0-only classes and code will be removed in the very near future.
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in [vllm/model_executor/models/config.py](../../../vllm/model_executor/models/config.py) to ensure that the runtime defaults are optimized. The model should also be added to the `MODELS_CONFIG_MAP` dictionary in <gh-file:vllm/model_executor/models/config.py> to ensure that the runtime defaults are optimized.
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](../../../vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](../../../vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together). For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol). These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).
For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](../../../vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](../../../vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively. For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](gh-file:vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](gh-file:vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
Please follow the same guidelines as case (2) for implementing these models. Please follow the same guidelines as case (2) for implementing these models.
We use "mamba-like" to refer to layers that posses a state that is updated in-place, rather than being appended-to (like KV cache for attention). We use "mamba-like" to refer to layers that posses a state that is updated in-place, rather than being appended-to (like KV cache for attention).
For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`. For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers. It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this. Please see [`LinearAttentionMetadata`](gh-file:vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](gh-file:v1/attention/backends/short_conv_attn.py) for examples of this.
Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it. Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it.
Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this. Please see the calls to `direct_register_custom_op` in <gh-file:vllm/model_executor/models/minimax_text_01.py> or <gh-file:vllm/model_executor/layers/mamba/short_conv.py> for examples of this.
The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs works as intended. The new custom op should then be added to the list `_attention_ops` in <gh-file:vllm/config/compilation.py> to ensure that piecewise CUDA graphs works as intended.
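To make the shape of such a layer concrete, here is a minimal sketch of the skeleton described above. It is not taken from the vLLM source: the import paths, return types, and backend class are assumptions to be checked against the current code base, and the actual forward pass and custom-op registration are omitted.

```python
# Minimal sketch of a custom "mamba-like" layer, assuming MambaBase lives at
# the path below and that get_attn_backend returns a backend class.
import torch
from torch import nn

from vllm.model_executor.layers.mamba.abstract import MambaBase  # assumed path


class MyLinearStateMixer(MambaBase, nn.Module):
    """A mamba-like layer whose per-request state is updated in-place."""

    def __init__(self, hidden_size: int, state_size: int):
        super().__init__()
        self.hidden_size = hidden_size
        self.state_size = state_size

    def get_state_dtype(self) -> tuple[torch.dtype, ...]:
        # Data type(s) of the recurrent state, computed at runtime.
        return (torch.float32,)

    def get_state_shape(self) -> tuple[tuple[int, ...], ...]:
        # Shape(s) of the recurrent state, computed at runtime.
        return ((self.hidden_size, self.state_size),)

    @property
    def mamba_type(self) -> str:
        # Identifier used to pick the matching attention metadata builder.
        return "my_linear_state"

    def get_attn_backend(self):
        # Assumed backend class; see the linear attention backend for a real one.
        from vllm.v1.attention.backends.linear_attn import LinearAttentionBackend
        return LinearAttentionBackend
```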

View File

@ -16,7 +16,7 @@ Further update the model as follows:
... ...
@classmethod @classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None: def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"): if modality.startswith("image"):
return "<image>" return "<image>"
@ -45,14 +45,14 @@ Further update the model as follows:
... ...
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
assert self.vision_encoder is not None assert self.vision_encoder is not None
image_features = self.vision_encoder(image_input) image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features) return self.multi_modal_projector(image_features)
def get_multimodal_embeddings( def get_multimodal_embeddings(
self, self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
**kwargs: object,
) -> MultiModalEmbeddings | None:
# Validate the multimodal input keyword arguments # Validate the multimodal input keyword arguments
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None: if image_input is None:
@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m
For example, if the model supports any number of images but only one video per prompt: For example, if the model supports any number of images but only one video per prompt:
```python ```python
def get_supported_mm_limits(self) -> Mapping[str, int | None]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None, "video": 1} return {"image": None, "video": 1}
``` ```
@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
@ -421,10 +421,8 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
```python ```python
def get_image_size_with_most_features(self) -> ImageSize: def get_image_size_with_most_features(self) -> ImageSize:
image_processor = self.get_image_processor() image_processor = self.get_image_processor()
return ImageSize( return ImageSize(width=image_processor.size["width"],
width=image_processor.size["width"], height=image_processor.size["height"])
height=image_processor.size["height"],
)
``` ```
Fuyu does not expect image placeholders in the inputs to HF processor, so Fuyu does not expect image placeholders in the inputs to HF processor, so
@ -454,12 +452,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
return { return {
"image": "image":
self._get_dummy_images( self._get_dummy_images(width=target_width,
width=target_width, height=target_height,
height=target_height, num_images=num_images,
num_images=num_images, overrides=image_overrides)
overrides=image_overrides,
)
} }
``` ```
@ -507,7 +503,7 @@ return a schema of the tensors outputted by the HF processor that are related to
``` ```
!!! note !!! note
Our [actual code](../../../vllm/model_executor/models/llava.py) additionally supports Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument. pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
=== "With postprocessing: Fuyu" === "With postprocessing: Fuyu"
@ -569,7 +565,7 @@ return a schema of the tensors outputted by the HF processor that are related to
``` ```
!!! note !!! note
Our [actual code](../../../vllm/model_executor/models/fuyu.py) has special handling Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
for text-only inputs to prevent unnecessary warnings from HF processor. for text-only inputs to prevent unnecessary warnings from HF processor.
!!! note !!! note
@ -748,7 +744,8 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
image_width=image_size.width, image_width=image_size.width,
image_height=image_size.height, image_height=image_size.height,
) )
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id( return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id], image_tokens + [bos_token_id],
@ -784,7 +781,8 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
image_width=image_size.width, image_width=image_size.width,
image_height=image_size.height, image_height=image_size.height,
) )
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id( return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id], image_tokens + [bos_token_id],
@ -812,11 +810,9 @@ to register them to the multi-modal registry:
from vllm.model_executor.models.interfaces import SupportsMultiModal from vllm.model_executor.models.interfaces import SupportsMultiModal
+ from vllm.multimodal import MULTIMODAL_REGISTRY + from vllm.multimodal import MULTIMODAL_REGISTRY
+ @MULTIMODAL_REGISTRY.register_processor( + @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
+ YourMultiModalProcessor, + info=YourProcessingInfo,
+ info=YourProcessingInfo, + dummy_inputs=YourDummyInputsBuilder)
+ dummy_inputs=YourDummyInputsBuilder,
+ )
class YourModelForImage2Seq(nn.Module, SupportsMultiModal): class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
``` ```
@ -828,8 +824,8 @@ Some HF processors directly insert feature tokens without replacing anything in
Examples: Examples:
- BLIP-2 (insert at start of prompt): [vllm/model_executor/models/blip2.py](../../../vllm/model_executor/models/blip2.py) - BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
- Molmo (insert after `<|endoftext|>` token): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py) - Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
### Handling prompt updates unrelated to multi-modal data ### Handling prompt updates unrelated to multi-modal data
@ -837,9 +833,9 @@ Examples:
Examples: Examples:
- Chameleon (appends `sep_token`): [vllm/model_executor/models/chameleon.py](../../../vllm/model_executor/models/chameleon.py) - Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py>
- Fuyu (appends `boa_token`): [vllm/model_executor/models/fuyu.py](../../../vllm/model_executor/models/fuyu.py) - Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py>
- Molmo (applies chat template which is not defined elsewhere): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py) - Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py>
### Custom HF processor ### Custom HF processor
@ -847,6 +843,6 @@ Some models don't define an HF processor class on HF Hub. In that case, you can
Examples: Examples:
- DeepSeek-VL2: [vllm/model_executor/models/deepseek_vl2.py](../../../vllm/model_executor/models/deepseek_vl2.py) - DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py>
- InternVL: [vllm/model_executor/models/internvl.py](../../../vllm/model_executor/models/internvl.py) - InternVL: <gh-file:vllm/model_executor/models/internvl.py>
- Qwen-VL: [vllm/model_executor/models/qwen_vl.py](../../../vllm/model_executor/models/qwen_vl.py) - Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py>

View File

@ -11,8 +11,8 @@ This page provides detailed instructions on how to do so.
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source]. To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source].
This gives you the ability to modify the codebase and test your model. This gives you the ability to modify the codebase and test your model.
After you have implemented your model (see [tutorial](basic.md)), put it into the [vllm/model_executor/models](../../../vllm/model_executor/models) directory. After you have implemented your model (see [tutorial](basic.md)), put it into the <gh-dir:vllm/model_executor/models> directory.
Then, add your model class to `_VLLM_MODELS` in [vllm/model_executor/models/registry.py](../../../vllm/model_executor/models/registry.py) so that it is automatically registered upon importing vLLM. Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
Finally, update our [list of supported models](../../models/supported_models.md) to promote your model! Finally, update our [list of supported models](../../models/supported_models.md) to promote your model!
!!! important !!! important
@ -42,7 +42,7 @@ def register():
ModelRegistry.register_model( ModelRegistry.register_model(
"YourModelForCausalLM", "YourModelForCausalLM",
"your_code:YourModelForCausalLM", "your_code:YourModelForCausalLM"
) )
``` ```

View File

@ -9,7 +9,7 @@ Without them, the CI for your PR will fail.
### Model loading ### Model loading
Include an example HuggingFace repository for your model in [tests/models/registry.py](../../../tests/models/registry.py). Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>.
This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM. This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
!!! important !!! important
@ -26,18 +26,18 @@ Passing these tests provides more confidence that your implementation is correct
### Model correctness ### Model correctness
These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of [tests/models](../../../tests/models). These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of <gh-dir:tests/models>.
#### Generative models #### Generative models
For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in [tests/models/utils.py](../../../tests/models/utils.py): For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>:
- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF. - Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa. - Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa.
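To make the second criterion concrete, here is a simplified, self-contained illustration of the logprobs-closeness idea; it is not the actual `check_logprobs_close` helper, and its signature is invented for this sketch.

```python
# Simplified illustration: a position passes if the two models emit the same
# token, or if each model's token appears in the other's top-k logprobs there.
def logprobs_close(vllm_token_ids: list[int],
                   hf_token_ids: list[int],
                   vllm_top_k: list[dict[int, float]],
                   hf_top_k: list[dict[int, float]]) -> bool:
    for pos, (v_tok, h_tok) in enumerate(zip(vllm_token_ids, hf_token_ids)):
        if v_tok == h_tok:
            continue  # exact match at this position
        if v_tok not in hf_top_k[pos] or h_tok not in vllm_top_k[pos]:
            return False  # tokens diverge beyond each other's top-k
    return True
```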
#### Pooling models #### Pooling models
For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in [tests/models/utils.py](../../../tests/models/utils.py). For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in <gh-file:tests/models/utils.py>.
[](){ #mm-processing-tests } [](){ #mm-processing-tests }
@ -45,7 +45,7 @@ For [pooling models](../../models/pooling_models.md), we simply check the cosine
#### Common tests #### Common tests
Adding your model to [tests/models/multimodal/processing/test_common.py](../../../tests/models/multimodal/processing/test_common.py) verifies that the following input combinations result in the same outputs: Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py> verifies that the following input combinations result in the same outputs:
- Text + multi-modal data - Text + multi-modal data
- Tokens + multi-modal data - Tokens + multi-modal data
@ -54,6 +54,6 @@ Adding your model to [tests/models/multimodal/processing/test_common.py](../../.
#### Model-specific tests #### Model-specific tests
You can add a new file under [tests/models/multimodal/processing](../../../tests/models/multimodal/processing) to run tests that only apply to your model. You can add a new file under <gh-dir:tests/models/multimodal/processing> to run tests that only apply to your model.
For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in [tests/models/multimodal/processing/test_phi3v.py](../../../tests/models/multimodal/processing/test_phi3v.py). For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in <gh-file:tests/models/multimodal/processing/test_phi3v.py>.

View File

@ -15,9 +15,8 @@ Declare supported languages and capabilities:
- Set `supports_transcription_only=True` if the model should not serve text generation (e.g. Whisper). - Set `supports_transcription_only=True` if the model should not serve text generation (e.g. Whisper).
??? code "supported_languages and supports_transcription_only" ??? code "supported_languages and supports_transcription_only"
```python ```python
from typing import ClassVar, Mapping, Literal from typing import ClassVar, Mapping, Optional, Literal
import numpy as np import numpy as np
import torch import torch
from torch import nn from torch import nn
@ -44,7 +43,6 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor
This is for controlling general behavior of the API when serving your model: This is for controlling general behavior of the API when serving your model:
??? code "get_speech_to_text_config()" ??? code "get_speech_to_text_config()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
@ -73,7 +71,6 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo
Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`: Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:
??? code "get_generation_prompt()" ??? code "get_generation_prompt()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
@ -84,10 +81,10 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
audio: np.ndarray, audio: np.ndarray,
stt_config: SpeechToTextConfig, stt_config: SpeechToTextConfig,
model_config: ModelConfig, model_config: ModelConfig,
language: str | None, language: Optional[str],
task_type: Literal["transcribe", "translate"], task_type: Literal["transcribe", "translate"],
request_prompt: str, request_prompt: str,
to_language: str | None, to_language: Optional[str],
) -> PromptType: ) -> PromptType:
# Example with a free-form instruction prompt # Example with a free-form instruction prompt
task_word = "Transcribe" if task_type == "transcribe" else "Translate" task_word = "Transcribe" if task_type == "transcribe" else "Translate"
@ -110,7 +107,6 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
??? code "get_generation_prompt()" ??? code "get_generation_prompt()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
@ -121,10 +117,10 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
audio: np.ndarray, audio: np.ndarray,
stt_config: SpeechToTextConfig, stt_config: SpeechToTextConfig,
model_config: ModelConfig, model_config: ModelConfig,
language: str | None, language: Optional[str],
task_type: Literal["transcribe", "translate"], task_type: Literal["transcribe", "translate"],
request_prompt: str, request_prompt: str,
to_language: str | None, to_language: Optional[str],
) -> PromptType: ) -> PromptType:
if language is None: if language is None:
raise ValueError("Language must be specified") raise ValueError("Language must be specified")
@ -152,16 +148,12 @@ Language validation via [validate_language][vllm.model_executor.models.interface
If your model requires a language and you want a default, override this method (see Whisper): If your model requires a language and you want a default, override this method (see Whisper):
??? code "validate_language()" ??? code "validate_language()"
```python ```python
@classmethod @classmethod
def validate_language(cls, language: str | None) -> str | None: def validate_language(cls, language: Optional[str]) -> Optional[str]:
if language is None: if language is None:
logger.warning( logger.warning(
"Defaulting to language='en'. If you wish to transcribe " "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
"audio in a different language, pass the `language` field "
"in the TranscriptionRequest."
)
language = "en" language = "en"
return super().validate_language(language) return super().validate_language(language)
``` ```
@ -173,7 +165,6 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo
Provide a fast duration→token estimate to improve streaming usage statistics: Provide a fast duration→token estimate to improve streaming usage statistics:
??? code "get_num_audio_tokens()" ??? code "get_num_audio_tokens()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
@ -184,7 +175,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
audio_duration_s: float, audio_duration_s: float,
stt_config: SpeechToTextConfig, stt_config: SpeechToTextConfig,
model_config: ModelConfig, model_config: ModelConfig,
) -> int | None: ) -> Optional[int]:
# Return None if unknown; otherwise return an estimate. # Return None if unknown; otherwise return an estimate.
return int(audio_duration_s * stt_config.sample_rate // 320) # example return int(audio_duration_s * stt_config.sample_rate // 320) # example
``` ```
@ -200,7 +191,6 @@ The API server takes care of basic audio I/O and optional chunking before buildi
Relevant server logic: Relevant server logic:
??? code "_preprocess_speech_to_text()" ??? code "_preprocess_speech_to_text()"
```python ```python
# vllm/entrypoints/openai/speech_to_text.py # vllm/entrypoints/openai/speech_to_text.py
async def _preprocess_speech_to_text(...): async def _preprocess_speech_to_text(...):
@ -248,9 +238,9 @@ No extra registration is required beyond having your model class available via t
## Examples in-tree ## Examples in-tree
- Whisper encoderdecoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py) - Whisper encoderdecoder (audio-only): <gh-file:vllm/model_executor/models/whisper.py>
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py) - Voxtral decoder-only (audio embeddings + LLM): <gh-file:vllm/model_executor/models/voxtral.py>
- Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py) - Gemma3n decoder-only with fixed instruction prompt: <gh-file:vllm/model_executor/models/gemma3n_mm.py>
## Test with the API ## Test with the API
@ -278,7 +268,7 @@ Once your model implements `SupportsTranscription`, you can test the endpoints (
http://localhost:8000/v1/audio/translations http://localhost:8000/v1/audio/translations
``` ```
Or check out more examples in [examples/online_serving](../../../examples/online_serving). Or check out more examples in <gh-file:examples/online_serving>.
!!! note !!! note
- If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking. - If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.

View File

@ -33,7 +33,7 @@ Traces can be visualized using <https://ui.perfetto.dev/>.
#### Offline Inference #### Offline Inference
Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline_inference/simple_profiling.py) for an example. Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.
#### OpenAI Server #### OpenAI Server

View File

@ -10,7 +10,7 @@ The image can be used to run OpenAI compatible server and is available on Docker
```bash ```bash
docker run --runtime nvidia --gpus all \ docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \ -v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=$HF_TOKEN" \ --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-p 8000:8000 \ -p 8000:8000 \
--ipc=host \ --ipc=host \
vllm/vllm-openai:latest \ vllm/vllm-openai:latest \
@ -22,7 +22,7 @@ This image can also be used with other container engines such as [Podman](https:
```bash ```bash
podman run --device nvidia.com/gpu=all \ podman run --device nvidia.com/gpu=all \
-v ~/.cache/huggingface:/root/.cache/huggingface \ -v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=$HF_TOKEN" \ --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-p 8000:8000 \ -p 8000:8000 \
--ipc=host \ --ipc=host \
docker.io/vllm/vllm-openai:latest \ docker.io/vllm/vllm-openai:latest \
@ -37,7 +37,7 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
memory to share data between processes under the hood, particularly for tensor parallel inference. memory to share data between processes under the hood, particularly for tensor parallel inference.
!!! note !!! note
Optional dependencies are not included in order to avoid licensing issues (e.g. <https://github.com/vllm-project/vllm/issues/8030>). Optional dependencies are not included in order to avoid licensing issues (e.g. <gh-issue:8030>).
If you need to use those dependencies (having accepted the license terms), If you need to use those dependencies (having accepted the license terms),
create a custom Dockerfile on top of the base image with an extra layer that installs them: create a custom Dockerfile on top of the base image with an extra layer that installs them:
@ -66,7 +66,7 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
## Building vLLM's Docker Image from Source ## Building vLLM's Docker Image from Source
You can build and run vLLM from source via the provided [docker/Dockerfile](../../docker/Dockerfile). To build vLLM: You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:
```bash ```bash
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
@ -128,7 +128,7 @@ To run vLLM with the custom-built Docker image:
docker run --runtime nvidia --gpus all \ docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \ -v ~/.cache/huggingface:/root/.cache/huggingface \
-p 8000:8000 \ -p 8000:8000 \
--env "HF_TOKEN=<secret>" \ --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
vllm/vllm-openai <args...> vllm/vllm-openai <args...>
``` ```

View File

@ -5,7 +5,7 @@
[Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray. [Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.
Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like [examples/online_serving/run_cluster.sh](../../../examples/online_serving/run_cluster.sh). without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like <gh-file:examples/online_serving/run_cluster.sh>.
When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm). When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm).

View File

@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference
??? console "Command" ??? console "Command"
```bash ```python
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
-H 'Content-Type: application/json' \ -H 'Content-Type: application/json' \
-H 'Authorization: <JWT TOKEN>' \ -H 'Authorization: <JWT TOKEN>' \
@ -81,7 +81,7 @@ You should get a response like:
??? console "Response" ??? console "Response"
```json ```python
{ {
"run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
"result": { "result": {

View File

@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
client = OpenAI( client = OpenAI(
base_url="https://gateway.<gateway domain>", base_url="https://gateway.<gateway domain>",
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>", api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
) )
completion = client.chat.completions.create( completion = client.chat.completions.create(
@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
"role": "user", "role": "user",
"content": "Compose a poem that explains the concept of recursion in programming.", "content": "Compose a poem that explains the concept of recursion in programming.",
} }
], ]
) )
print(completion.choices[0].message.content) print(completion.choices[0].message.content)

View File

@ -34,7 +34,7 @@ pip install vllm haystack-ai
api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"), api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
model="mistralai/Mistral-7B-Instruct-v0.1", model="mistralai/Mistral-7B-Instruct-v0.1",
api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1", api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
generation_kwargs={"max_tokens": 512}, generation_kwargs = {"max_tokens": 512}
) )
response = generator.run( response = generator.run(

View File

@ -32,28 +32,28 @@ This is the easiest way to get started with vLLM on Hugging Face Inference Endpo
import os import os
client = OpenAI( client = OpenAI(
base_url=DEPLOYMENT_URL, base_url = DEPLOYMENT_URL,
api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
) )
chat_completion = client.chat.completions.create( chat_completion = client.chat.completions.create(
model="HuggingFaceTB/SmolLM3-3B", model = "HuggingFaceTB/SmolLM3-3B",
messages=[ messages = [
{ {
"role": "user", "role": "user",
"content": [ "content": [
{ {
"type": "text", "type": "text",
"text": "Give me a brief explanation of gravity in simple terms.", "text": "Give me a brief explanation of gravity in simple terms."
} }
], ]
} }
], ],
stream=True, stream = True
) )
for message in chat_completion: for message in chat_completion:
print(message.choices[0].delta.content, end="") print(message.choices[0].delta.content, end = "")
``` ```
!!! note !!! note
@ -86,34 +86,34 @@ This method applies to models with the [`transformers` library tag](https://hugg
import os import os
client = OpenAI( client = OpenAI(
base_url=DEPLOYMENT_URL, base_url = DEPLOYMENT_URL,
api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
) )
chat_completion = client.chat.completions.create( chat_completion = client.chat.completions.create(
model="ibm-granite/granite-docling-258M", model = "ibm-granite/granite-docling-258M",
messages=[ messages = [
{ {
"role": "user", "role": "user",
"content": [ "content": [
{ {
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {
"url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png", "url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png"
}, }
}, },
{ {
"type": "text", "type": "text",
"text": "Convert this page to docling.", "text": "Convert this page to docling."
}, }
] ]
} }
], ],
stream=True, stream = True
) )
for message in chat_completion: for message in chat_completion:
print(message.choices[0].delta.content, end="") print(message.choices[0].delta.content, end = "")
``` ```
!!! note !!! note

View File

@ -36,16 +36,15 @@ pip install vllm litellm
```python ```python
import litellm import litellm
messages = [{"content": "Hello, how are you?", "role": "user"}] messages = [{ "content": "Hello, how are you?","role": "user"}]
# hosted_vllm is prefix key word and necessary # hosted_vllm is prefix key word and necessary
response = litellm.completion( response = litellm.completion(
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
messages=messages, messages=messages,
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1", api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
temperature=0.2, temperature=0.2,
max_tokens=80, max_tokens=80)
)
print(response) print(response)
``` ```

View File

@ -35,7 +35,7 @@ Deploy the following yaml file `lws.yaml`
- name: vllm-leader - name: vllm-leader
image: docker.io/vllm/vllm-openai:latest image: docker.io/vllm/vllm-openai:latest
env: env:
- name: HF_TOKEN - name: HUGGING_FACE_HUB_TOKEN
value: <your-hf-token> value: <your-hf-token>
command: command:
- sh - sh
@ -83,7 +83,7 @@ Deploy the following yaml file `lws.yaml`
ephemeral-storage: 800Gi ephemeral-storage: 800Gi
cpu: 125 cpu: 125
env: env:
- name: HF_TOKEN - name: HUGGING_FACE_HUB_TOKEN
value: <your-hf-token> value: <your-hf-token>
volumeMounts: volumeMounts:
- mountPath: /dev/shm - mountPath: /dev/shm

View File

@ -36,11 +36,11 @@ pip install -U vllm \
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
``` ```
1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_langchain.py](../../../examples/online_serving/retrieval_augmented_generation_with_langchain.py) 1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_langchain.py>
1. Run the script 1. Run the script
```bash ```python
python retrieval_augmented_generation_with_langchain.py python retrieval_augmented_generation_with_langchain.py
``` ```
@ -74,10 +74,10 @@ pip install vllm \
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
``` ```
1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_llamaindex.py](../../../examples/online_serving/retrieval_augmented_generation_with_llamaindex.py) 1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_llamaindex.py>
1. Run the script: 1. Run the script:
```bash ```python
python retrieval_augmented_generation_with_llamaindex.py python retrieval_augmented_generation_with_llamaindex.py
``` ```

View File

@ -20,7 +20,7 @@ pip install vllm streamlit openai
vllm serve Qwen/Qwen1.5-0.5B-Chat vllm serve Qwen/Qwen1.5-0.5B-Chat
``` ```
1. Use the script: [examples/online_serving/streamlit_openai_chatbot_webserver.py](../../../examples/online_serving/streamlit_openai_chatbot_webserver.py) 1. Use the script: <gh-file:examples/online_serving/streamlit_openai_chatbot_webserver.py>
1. Start the streamlit web UI and start to chat: 1. Start the streamlit web UI and start to chat:

View File

@ -82,7 +82,7 @@ Next, start the vLLM server as a Kubernetes Deployment and Service:
"vllm serve meta-llama/Llama-3.2-1B-Instruct" "vllm serve meta-llama/Llama-3.2-1B-Instruct"
] ]
env: env:
- name: HF_TOKEN - name: HUGGING_FACE_HUB_TOKEN
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
name: hf-token-secret name: hf-token-secret
@ -209,7 +209,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
"vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
] ]
env: env:
- name: HF_TOKEN - name: HUGGING_FACE_HUB_TOKEN
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
name: hf-token-secret name: hf-token-secret
@ -298,7 +298,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
"vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
] ]
env: env:
- name: HF_TOKEN - name: HUGGING_FACE_HUB_TOKEN
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
name: hf-token-secret name: hf-token-secret

View File

@ -49,7 +49,7 @@ Here is a sample of `LLM` class usage:
More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs. More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
The code for the `LLM` class can be found in [vllm/entrypoints/llm.py](../../vllm/entrypoints/llm.py). The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
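For orientation, a minimal offline-inference snippet along those lines looks like the following; the model name is only an example.

```python
# Minimal offline inference with the LLM class (model name is illustrative).
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
params = SamplingParams(temperature=0.8, max_tokens=64)
outputs = llm.generate(["The capital of France is"], params)
for output in outputs:
    print(output.outputs[0].text)
```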
### OpenAI-Compatible API Server ### OpenAI-Compatible API Server
@ -60,7 +60,7 @@ This server can be started using the `vllm serve` command.
vllm serve <model> vllm serve <model>
``` ```
The code for the `vllm` CLI can be found in [vllm/entrypoints/cli/main.py](../../vllm/entrypoints/cli/main.py). The code for the `vllm` CLI can be found in <gh-file:vllm/entrypoints/cli/main.py>.
Sometimes you may see the API server entrypoint used directly instead of via the Sometimes you may see the API server entrypoint used directly instead of via the
`vllm` CLI command. For example: `vllm` CLI command. For example:
@ -74,7 +74,7 @@ python -m vllm.entrypoints.openai.api_server --model <model>
`python -m vllm.entrypoints.openai.api_server` is deprecated `python -m vllm.entrypoints.openai.api_server` is deprecated
and may become unsupported in a future release. and may become unsupported in a future release.
That code can be found in [vllm/entrypoints/openai/api_server.py](../../vllm/entrypoints/openai/api_server.py). That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document. More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.
@ -101,7 +101,7 @@ processing.
- **Output Processing**: Processes the outputs generated by the model, decoding the - **Output Processing**: Processes the outputs generated by the model, decoding the
token IDs from a language model into human-readable text. token IDs from a language model into human-readable text.
The code for `LLMEngine` can be found in [vllm/engine/llm_engine.py](../../vllm/engine/llm_engine.py). The code for `LLMEngine` can be found in <gh-file:vllm/engine/llm_engine.py>.
### AsyncLLMEngine ### AsyncLLMEngine
@ -111,9 +111,9 @@ incoming requests. The `AsyncLLMEngine` is designed for online serving, where it
can handle multiple concurrent requests and stream outputs to clients. can handle multiple concurrent requests and stream outputs to clients.
The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo
API server that serves as a simpler example in [vllm/entrypoints/api_server.py](../../vllm/entrypoints/api_server.py). API server that serves as a simpler example in <gh-file:vllm/entrypoints/api_server.py>.
The code for `AsyncLLMEngine` can be found in [vllm/engine/async_llm_engine.py](../../vllm/engine/async_llm_engine.py). The code for `AsyncLLMEngine` can be found in <gh-file:vllm/engine/async_llm_engine.py>.
## Worker ## Worker

View File

@ -17,7 +17,7 @@ In this document we will discuss the:
In this document, we refer to pure decode (`max_query_len=1`) or speculative decode (`max_query_len=1+num_spec_tokens`) as **uniform decode** batches, and the opposite would be **non-uniform** batches (i.e., prefill or mixed prefill-decode batches). In this document, we refer to pure decode (`max_query_len=1`) or speculative decode (`max_query_len=1+num_spec_tokens`) as **uniform decode** batches, and the opposite would be **non-uniform** batches (i.e., prefill or mixed prefill-decode batches).
!!! note !!! note
The following contents are mostly based on the last commit of <https://github.com/vllm-project/vllm/pull/20059>. The following contents are mostly based on the last commit of <gh-pr:20059>.
## Motivation ## Motivation
@ -92,7 +92,7 @@ where `num_tokens` can be the padded token length, and `uniform_decode` is deter
The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item. We are safe to exclude items like `uniform_query_len` because it is a constant at runtime for a certain setup currently. For example, it should be either `1` for a commonly pure decode or `1+num_spec_tokens` for a validation phase of speculative decode. The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item. We are safe to exclude items like `uniform_query_len` because it is a constant at runtime for a certain setup currently. For example, it should be either `1` for a commonly pure decode or `1+num_spec_tokens` for a validation phase of speculative decode.
!!! note !!! note
The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<https://github.com/vllm-project/vllm/pull/23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs). The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<gh-pr:23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs).
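Conceptually the descriptor is a small value type keyed on those two items. A rough sketch of its shape is below; the field names come from the dispatch snippet that follows, while everything else (the `NamedTuple` base, the default value) is an assumption rather than the real definition.

```python
# Rough sketch of the descriptor's shape; the real class lives in the vLLM
# source and may carry additional fields or helpers.
from typing import NamedTuple


class BatchDescriptor(NamedTuple):
    num_tokens: int               # padded token count of the batch
    uniform_decode: bool = False  # True for pure / speculative decode batches
```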
### `CudagraphDispatcher` ### `CudagraphDispatcher`
@ -106,11 +106,9 @@ The dispatch code looks like:
batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...) batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...)
runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor) runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor)
# execution # execution
with set_forward_context( with set_forward_context(...,
..., cudagraph_runtime_mode=runtime_mode,
cudagraph_runtime_mode=runtime_mode, batch_descriptor=batch_descriptor):
batch_descriptor=batch_descriptor,
):
output = self.model(...) output = self.model(...)
``` ```
@ -167,7 +165,7 @@ class AttentionCGSupport(enum.Enum):
"""NO CUDA Graphs support""" """NO CUDA Graphs support"""
``` ```
Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture]. Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
The following table lists backends that support full CUDA Graphs at the time of writing. The following table lists backends that support full CUDA Graphs at the time of writing.
@ -202,12 +200,12 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG")
import vllm
from vllm.config import CUDAGraphMode

compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
model = vllm.LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    dtype="auto",
    compilation_config=compilation_config,
)
sampling_params = vllm.SamplingParams(
    temperature=0,  # greedy decoding
    max_tokens=1024,

View File

@ -34,10 +34,10 @@ To enable the DBO system pass in the `--enable-dbo` argument to your vllm serve
* `--dbo-decode-token-threshold` the minimum number of tokens in a decode-only batch required to enable DBO for that batch
* `--dbo-prefill-token-threshold` the minimum number of tokens in a batch containing at least one prefill required to enable DBO for that batch

Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `--all2all-backend` argument must be set to `deepep_low_latency` if your workload is primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests.

Below is a command that will spin up a two DP rank server with expert parallelism and DBO enabled.

EX: `vllm serve deepseek-ai/DeepSeek-V2-Lite --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo --all2all-backend deepep_low_latency`

Note that there must be at least two GPUs visible in `CUDA_VISIBLE_DEVICES`.

View File

@ -2,7 +2,7 @@
## Introduction

FusedMoEModularKernel is implemented [here](../../vllm/model_executor/layers/fused_moe/modular_kernel.py).

Based on the format of the input activations, FusedMoE implementations are broadly classified into 2 types.
@ -44,7 +44,7 @@ FusedMoEModularKernel splits the FusedMoE operation into 3 parts,
The TopK Weight Application and Reduction components happen right after the Unpermute operation and before the All2All Combine. Note that the `FusedMoEPermuteExpertsUnpermute` is responsible for the Unpermute and `FusedMoEPrepareAndFinalize` is responsible for the All2All Combine. There is value in doing the TopK Weight Application and Reduction in the `FusedMoEPermuteExpertsUnpermute`, but some implementations choose to do it in `FusedMoEPrepareAndFinalize`. In order to enable this flexibility, we have a `TopKWeightAndReduce` abstract class.

Please find the implementations of `TopKWeightAndReduce` [here](../../vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).

The `FusedMoEPrepareAndFinalize::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method.

The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExpertsUnpermute` and `FusedMoEPrepareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
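
To make the contract concrete, here is an illustrative sketch (simplified names and shapes, not the exact vLLM signatures); one implementation performs the weighted reduction, while a no-op variant is used when the experts kernel has already done it:

```python
from abc import ABC, abstractmethod

import torch

class TopKWeightAndReduceSketch(ABC):
    @abstractmethod
    def apply(self, fused_out: torch.Tensor, topk_weights: torch.Tensor) -> torch.Tensor:
        """fused_out: (num_tokens, topk, hidden), topk_weights: (num_tokens, topk)."""

class WeightAndReduceContiguous(TopKWeightAndReduceSketch):
    # Apply the TopK weights and sum over the topk dimension here.
    def apply(self, fused_out, topk_weights):
        return (fused_out * topk_weights.unsqueeze(-1)).sum(dim=1)

class WeightAndReduceNoOp(TopKWeightAndReduceSketch):
    # The experts implementation already weighted and reduced its output.
    def apply(self, fused_out, topk_weights):
        return fused_out
```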
@ -138,7 +138,7 @@ Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & C
#### Step 1: Add an All2All manager

The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](../../vllm/distributed/device_communicators/all2all.py).

#### Step 2: Add a FusedMoEPrepareAndFinalize Type
@ -213,29 +213,29 @@ Please take a look at [init_prepare_finalize](https://github.com/vllm-project/vl
### How To Unit Test

We have `FusedMoEModularKernel` unit tests at [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py).
The unit test iterates through all combinations of `FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types and, if they are
compatible, runs some correctness tests.

If you are adding some `FusedMoEPrepareAndFinalize` / `FusedMoEPermuteExpertsUnpermute` implementations:

1. Add the implementation type to `MK_ALL_PREPARE_FINALIZE_TYPES` and `MK_FUSED_EXPERT_TYPES` in [mk_objects.py](../../tests/kernels/moe/modular_kernel_tools/mk_objects.py) respectively (see the sketch below).
2. Update the `Config::is_batched_prepare_finalize()`, `Config::is_batched_fused_experts()`, `Config::is_standard_fused_experts()`,
   `Config::is_fe_16bit_supported()`, `Config::is_fe_fp8_supported()`, `Config::is_fe_block_fp8_supported()`, and
   `Config::is_fe_supports_chunking()` methods in [/tests/kernels/moe/modular_kernel_tools/common.py](../../tests/kernels/moe/modular_kernel_tools/common.py).

Doing this will add the new implementation to the test suite.
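
For step 1, registering a hypothetical `MyPrepareAndFinalize` / `MyExperts` pair would roughly amount to appending them to the existing lists (illustrative sketch; check the current layout of `mk_objects.py` before editing):

```python
# tests/kernels/moe/modular_kernel_tools/mk_objects.py (sketch)
from my_backend import MyExperts, MyPrepareAndFinalize  # hypothetical imports

MK_ALL_PREPARE_FINALIZE_TYPES = [
    # ... existing FusedMoEPrepareAndFinalize types ...
    MyPrepareAndFinalize,
]

MK_FUSED_EXPERT_TYPES = [
    # ... existing FusedMoEPermuteExpertsUnpermute types ...
    MyExperts,
]
```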
### How To Check `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` Compatibility

The unit test file [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py) can also be executed as a standalone script.

Example: `python3 -m tests.kernels.moe.test_modular_kernel_combinations --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`

As a side effect, this script can be used to test `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` compatibility. When invoked
with incompatible types, the script will error.
### How To Profile

Please take a look at [profile_modular_kernel.py](../../tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py).
The script can be used to generate Torch traces for a single `FusedMoEModularKernel::forward()` call for any compatible
`FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types.

Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`

View File

@ -6,11 +6,11 @@ When performing an inference with IO Processor plugins, the prompt type is defin
## Writing an IO Processor Plugin

IO Processor plugins implement the [`IOProcessor`][vllm.plugins.io_processors.interface.IOProcessor] interface:

```python
IOProcessorInput = TypeVar("IOProcessorInput")
IOProcessorOutput = TypeVar("IOProcessorOutput")

class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
@ -21,32 +21,30 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
    def pre_process(
        self,
        prompt: IOProcessorInput,
        request_id: str | None = None,
        **kwargs,
    ) -> PromptType | Sequence[PromptType]:
        raise NotImplementedError

    async def pre_process_async(
        self,
        prompt: IOProcessorInput,
        request_id: str | None = None,
        **kwargs,
    ) -> PromptType | Sequence[PromptType]:
        return self.pre_process(prompt, request_id, **kwargs)
    @abstractmethod
    def post_process(
        self,
        model_output: Sequence[PoolingRequestOutput],
        request_id: str | None = None,
        **kwargs,
    ) -> IOProcessorOutput:
        raise NotImplementedError

    async def post_process_async(
        self,
        model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
        request_id: str | None = None,
        **kwargs,
    ) -> IOProcessorOutput:
        collected_output = [item async for i, item in model_output]
@ -58,8 +56,7 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
    @abstractmethod
    def output_to_response(
        self, plugin_output: IOProcessorOutput
    ) -> IOProcessorResponse:
        raise NotImplementedError
```
@ -67,9 +64,9 @@ The `parse_request` method is used for validating the user prompt and converting
The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.

The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.

The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available in [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/openai/serving_pooling.py).

An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please also refer to our online ([examples/online_serving/prithvi_geospatial_mae.py](../../examples/online_serving/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py)) inference examples.
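
As a minimal, illustrative skeleton (the plugin input/output types, prompt construction, and response fields below are assumptions for demonstration, and the names come from the interface shown above):

```python
class MySummaryOutput:
    """Hypothetical plugin output type."""

    def __init__(self, values: list[float]):
        self.values = values

class MyIOProcessor(IOProcessor[dict, MySummaryOutput]):
    def parse_request(self, request) -> dict:
        # Validate the raw request and convert it into the plugin input type.
        return dict(request)

    def pre_process(self, prompt, request_id=None, **kwargs):
        # Turn the plugin input into one or more vLLM prompts.
        return {"prompt_token_ids": prompt["token_ids"]}

    def post_process(self, model_output, request_id=None, **kwargs):
        # Reduce the pooling outputs into the plugin's own output type.
        return MySummaryOutput([float(out.outputs.data.mean()) for out in model_output])

    def output_to_response(self, plugin_output):
        # Wrap the plugin output for the API server; the response shape is simplified here.
        return IOProcessorResponse(data=plugin_output.values)
```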
## Using an IO Processor plugin

View File

@ -174,7 +174,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
from collections.abc import Sequence
from dataclasses import dataclass
from enum import Enum, auto
from typing import TYPE_CHECKING

import torch
@ -244,7 +244,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
    @abstractmethod
    def update_state(
        self,
        batch_update: "BatchUpdate" | None,
    ) -> None:
        """Called when there are new output tokens, prior
        to each forward pass.
@ -274,7 +274,7 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum)
* Return `True` if the logits processor is argmax invariant (never changes which token ID has the highest logit value for a given request), `False` if the logits processor may modify argmax
* `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling
* `update_state(self, batch_update: "BatchUpdate" | None) -> None` (see the sketch after this list):
    * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
    * Use the `BatchUpdate` members to update logits processor internal state
    * **Note:** the batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.
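
Putting the pieces together, a minimal illustrative processor that always suppresses one token ID could look like the sketch below; the constructor and the `apply()` hook shown here are assumptions that simplify the real interface:

```python
import torch

class BanTokenLogitsProcessor(LogitsProcessor):
    """Illustrative sketch: mask out a single token ID for every request."""

    def __init__(self, banned_token_id: int = 0):
        # Real logits processors receive engine configuration here; omitted for brevity.
        self.banned_token_id = banned_token_id

    def is_argmax_invariant(self) -> bool:
        # Masking a token can change which token has the highest logit.
        return False

    def update_state(self, batch_update: "BatchUpdate | None") -> None:
        # Stateless example: nothing to track when requests are added, removed, or moved.
        pass

    def apply(self, logits: torch.Tensor) -> torch.Tensor:
        logits[:, self.banned_token_id] = float("-inf")
        return logits
```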

View File

@ -80,13 +80,13 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
- `vllm:request_decode_time_seconds` - Request decode time.
- `vllm:request_max_num_generation_tokens` - Max generation tokens in a sequence group.

See [the PR which added this Dashboard](https://github.com/vllm-project/vllm/pull/2316) for interesting and useful background on the choices made here.

### Prometheus Client Library

Prometheus support was initially added [using the aioprometheus library](https://github.com/vllm-project/vllm/pull/1890), but a switch was made quickly to [prometheus_client](https://github.com/vllm-project/vllm/pull/2730). The rationale is discussed in both linked PRs.

With the switch away from `aioprometheus`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](https://github.com/vllm-project/vllm/pull/15657):
```bash
$ curl http://0.0.0.0:8000/metrics 2>/dev/null | grep -P '^http_(?!.*(_bucket|_created|_sum)).*'
@ -99,7 +99,7 @@ http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201
### Multi-process Mode

In v0, metrics are collected in the engine core process and we use multiprocess mode to make them available in the API server process. See <https://github.com/vllm-project/vllm/pull/7279>.

### Built-in Python/Process Metrics
@ -125,32 +125,32 @@ vLLM instance.
For background, these are some of the relevant PRs which added the v0 metrics:

- <https://github.com/vllm-project/vllm/pull/1890>
- <https://github.com/vllm-project/vllm/pull/2316>
- <https://github.com/vllm-project/vllm/pull/2730>
- <https://github.com/vllm-project/vllm/pull/4464>
- <https://github.com/vllm-project/vllm/pull/7279>

Also note the ["Even Better Observability"](https://github.com/vllm-project/vllm/issues/3616) feature where e.g. [a detailed roadmap was laid out](https://github.com/vllm-project/vllm/issues/3616#issuecomment-2030858781).
## v1 Design

### v1 PRs

For background, here are the relevant v1 PRs relating to the v1
metrics issue <https://github.com/vllm-project/vllm/issues/10582>:

- <https://github.com/vllm-project/vllm/pull/11962>
- <https://github.com/vllm-project/vllm/pull/11973>
- <https://github.com/vllm-project/vllm/pull/10907>
- <https://github.com/vllm-project/vllm/pull/12416>
- <https://github.com/vllm-project/vllm/pull/12478>
- <https://github.com/vllm-project/vllm/pull/12516>
- <https://github.com/vllm-project/vllm/pull/12530>
- <https://github.com/vllm-project/vllm/pull/12561>
- <https://github.com/vllm-project/vllm/pull/12579>
- <https://github.com/vllm-project/vllm/pull/12592>
- <https://github.com/vllm-project/vllm/pull/12644>
### Metrics Collection
@ -369,7 +369,7 @@ vllm:cache_config_info{block_size="16",cache_dtype="auto",calculate_kv_scales="F
However, `prometheus_client` has
[never supported Info metrics in multiprocessing mode](https://github.com/prometheus/client_python/pull/300) -
for [unclear reasons](https://github.com/vllm-project/vllm/pull/7279#discussion_r1710417152). We
simply use a `Gauge` metric set to 1 and
`multiprocess_mode="mostrecent"` instead.
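
A sketch of that workaround with `prometheus_client` (label names trimmed for brevity):

```python
from prometheus_client import Gauge

# Emulate an Info metric: a Gauge pinned to 1, with the config exposed as labels.
# "mostrecent" keeps the value from the most recently updated process.
cache_config_info = Gauge(
    "vllm:cache_config_info",
    "Information of the LLMEngine CacheConfig",
    labelnames=["block_size", "cache_dtype"],
    multiprocess_mode="mostrecent",
)
cache_config_info.labels(block_size="16", cache_dtype="auto").set(1)
```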
@ -394,7 +394,7 @@ distinguish between per-adapter counts. This should be revisited.
Note that `multiprocess_mode="livemostrecent"` is used - the most
recent metric is used, but only from currently running processes.
This was added in <https://github.com/vllm-project/vllm/pull/9477> and there is
[at least one known user](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/54).
If we revisit this design and deprecate the old metric, we should reduce
the need for a significant deprecation period by making the change in
@ -402,7 +402,7 @@ v0 also and asking this project to move to the new metric.
### Prefix Cache metrics

The discussion in <https://github.com/vllm-project/vllm/issues/10582> about adding prefix cache metrics yielded
some interesting points which may be relevant to how we approach
future metrics.
@ -439,8 +439,8 @@ suddenly (from their perspective) when it is removed, even if there is
an equivalent metric for them to use.

As an example, see how `vllm:avg_prompt_throughput_toks_per_s` was
[deprecated](https://github.com/vllm-project/vllm/pull/2764) (with a comment in the code),
[removed](https://github.com/vllm-project/vllm/pull/12383), and then [noticed by a user](https://github.com/vllm-project/vllm/issues/13218).

In general:
@ -460,35 +460,33 @@ the project-wide deprecation policy.
### Unimplemented - `vllm:tokens_total`

Added by <https://github.com/vllm-project/vllm/pull/4464>, but apparently never implemented. This can just be
removed.

### Duplicated - Queue Time

The `vllm:time_in_queue_requests` Histogram metric was added by
<https://github.com/vllm-project/vllm/pull/9659> and its calculation is:
```python
self.metrics.first_scheduled_time = now
self.metrics.time_in_queue = now - self.metrics.arrival_time
```
Two weeks later, <https://github.com/vllm-project/vllm/pull/4464> added `vllm:request_queue_time_seconds` leaving
us with:
```python
if seq_group.is_finished():
    if (
        seq_group.metrics.first_scheduled_time is not None
        and seq_group.metrics.first_token_time is not None
    ):
        time_queue_requests.append(
            seq_group.metrics.first_scheduled_time -
            seq_group.metrics.arrival_time
        )
    ...
    if seq_group.metrics.time_in_queue is not None:
        time_in_queue_requests.append(seq_group.metrics.time_in_queue)
```
This seems duplicative, and one of them should be removed. The latter
@ -513,7 +511,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU
memory. This is also known as "KV cache offloading" and is configured
with `--swap-space` and `--preemption-mode`.

In v0, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
SequenceGroup encapsulated the idea of N Sequences which
all shared the same prompt kv blocks. This enabled KV cache block
sharing between requests, and copy-on-write to do branching. CPU
@ -526,7 +524,7 @@ and the part of the prompt that was evicted can be recomputed.
SequenceGroup was removed in V1, although a replacement will be
required for "parallel sampling" (`n>1`).

[Beam search was moved out of the core (in V0)](https://github.com/vllm-project/vllm/issues/8306). There was a
lot of complex code for a very uncommon feature.

In V1, with prefix caching being better (zero overhead) and therefore
@ -541,7 +539,7 @@ Some v0 metrics are only relevant in the context of "parallel
sampling". This is where the `n` parameter in a request is used to sampling". This is where the `n` parameter in a request is used to
request multiple completions from the same prompt. request multiple completions from the same prompt.
As part of adding parallel sampling support in <https://github.com/vllm-project/vllm/pull/10980>, we should As part of adding parallel sampling support in <gh-pr:10980>, we should
also add these metrics. also add these metrics.
- `vllm:request_params_n` (Histogram) - `vllm:request_params_n` (Histogram)
@ -566,7 +564,7 @@ model and then validate those tokens with the larger model.
- `vllm:spec_decode_num_draft_tokens_total` (Counter)
- `vllm:spec_decode_num_emitted_tokens_total` (Counter)

There is a PR under review (<https://github.com/vllm-project/vllm/pull/12193>) to add "prompt lookup (ngram)"
speculative decoding to v1. Other techniques will follow. We should
revisit the v0 metrics in this context.
@ -587,7 +585,7 @@ see:
- [Standardizing Large Model Server Metrics in Kubernetes](https://docs.google.com/document/d/1SpSp1E6moa4HSrJnS4x3NpLuj88sMXr2tbofKlzTZpk)
- [Benchmarking LLM Workloads for Performance Evaluation and Autoscaling in Kubernetes](https://docs.google.com/document/d/1k4Q4X14hW4vftElIuYGDu5KDe2LtV1XammoG-Xi3bbQ)
- [Inference Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf)
- <https://github.com/vllm-project/vllm/issues/5041> and <https://github.com/vllm-project/vllm/pull/12726>.

This is a non-trivial topic. Consider this comment from Rob:
@ -654,7 +652,7 @@ fall under the more general heading of "Observability".
v0 has support for OpenTelemetry tracing:

- Added by <https://github.com/vllm-project/vllm/pull/4687>
- Configured with `--otlp-traces-endpoint` and `--collect-detailed-traces`
- [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/)
- [User-facing docs](../examples/online_serving/opentelemetry.md)
@ -685,7 +683,7 @@ documentation for this option states:
> use of possibly costly and or blocking operations and hence might
> have a performance impact.

The metrics were added by <https://github.com/vllm-project/vllm/pull/7089> and show up in an OpenTelemetry trace
as:

```text

View File

@ -60,7 +60,7 @@ With the help of dummy text and automatic prompt updating, our multi-modal proce
## Processor Output Caching

Some HF processors, such as the one for Qwen2-VL, are [very slow](https://github.com/vllm-project/vllm/issues/9238). To alleviate this problem, we cache the multi-modal outputs of the HF processor to avoid processing the same multi-modal input (e.g. image) again.

When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
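
A simplified sketch of that lookup pattern (the hashing helper and cache type are placeholders, not the actual vLLM code):

```python
def process_with_cache(hf_processor, mm_items, cache, hash_mm_item):
    # Key every incoming item, then split into cache hits and misses.
    keyed = [(hash_mm_item(item), item) for item in mm_items]
    missing = [(key, item) for key, item in keyed if key not in cache]

    if missing:
        # Process all misses in one HF processor call and store the results.
        outputs = hf_processor([item for _, item in missing])
        for (key, _), out in zip(missing, outputs):
            cache[key] = out

    # Merge cached and freshly processed outputs, preserving the input order.
    return [cache[key] for key, _ in keyed]
```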

View File

@ -92,8 +92,8 @@ To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
| marlin | standard | <sup>3</sup> | <sup>3</sup> | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
| marlin experts | standard,</br>batched | N/A | N/A | silu,</br>swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
@ -115,6 +115,6 @@ The following table shows "families" of modular kernels that are intended to wor
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|---|---|---|
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`,</br>`MarlinExperts` |
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
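
In code, choosing a family boils down to wiring one prepare/finalize object together with one experts object from the same row; the helper below is only a sketch, and the real `FusedMoEModularKernel` constructor may take additional arguments:

```python
def build_modular_kernel(prepare_finalize, fused_experts):
    # prepare_finalize: a FusedMoEPrepareAndFinalize (e.g. DeepEPHTPrepareAndFinalize)
    # fused_experts: a compatible FusedMoEPermuteExpertsUnpermute (e.g. DeepGemmExperts)
    return FusedMoEModularKernel(prepare_finalize, fused_experts)
```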

View File

@ -82,7 +82,7 @@ There are other miscellaneous places hard-coding the use of `spawn`:
Related PRs:

- <https://github.com/vllm-project/vllm/pull/8823>

## Prior State in v1

View File

@ -112,8 +112,8 @@ class KVCacheBlock:
    ref_cnt: int

    # The pointers to form a doubly linked list for the free queue.
    prev_free_block: "KVCacheBlock | None" = None
    next_free_block: "KVCacheBlock | None" = None
```

There are two design points to highlight:

Some files were not shown because too many files have changed in this diff.