Compare commits

..

4 Commits

SHA1        Message                              Date
22bf5c5077  fix                                  2025-10-11 11:38:33 -07:00
            Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
3a8990743e  add truncation                       2025-10-11 11:20:31 -07:00
            Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
fbc2cc8217  merge                                2025-10-11 11:09:22 -07:00
efd4bc967d  [Misc] Remove in ModelRunnerOutput   2025-08-23 21:09:20 -07:00
            Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
1150 changed files with 14829 additions and 23579 deletions


@@ -5,11 +5,11 @@ import os
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
 # Note that we have 800 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
 
 def print_top_10_largest_files(zip_file):
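For orientation, a minimal sketch of the kind of guard this limit feeds (the wheel path below is a placeholder; the real logic lives in the script being diffed):

import os
import sys

# Hypothetical standalone check mirroring the limit above.
limit_mb = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
wheel_path = "dist/vllm-0.0.0-cp38-abi3-manylinux1_x86_64.whl"  # placeholder name
wheel_mb = os.path.getsize(wheel_path) / (1024 * 1024)
if wheel_mb > limit_mb:
    print(f"Wheel {wheel_path} is {wheel_mb:.1f} MiB, exceeding the {limit_mb} MiB limit")
    sys.exit(1)
print(f"Wheel size {wheel_mb:.1f} MiB is within the {limit_mb} MiB limit")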


@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.419
- name: "exact_match,flexible-extract"
value: 0.416
limit: 1000
num_fewshot: 5


@@ -1,11 +0,0 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
value: 0.90
limit: 100
num_fewshot: 0


@@ -1,11 +0,0 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "mmlu_pro"
metrics:
- name: "exact_match,custom-extract"
value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5


@@ -1,5 +1,4 @@
-# For vllm script, with -t option (tensor parallel size)
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"


@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
value: 0.855
limit: 2500
num_fewshot: 0


@@ -1 +0,0 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml


@@ -1 +0,0 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml


@@ -1 +0,0 @@
Qwen2.5-VL-7B-Instruct.yaml


@@ -1,44 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.9
usage() {
echo``
echo "Runs lm eval harness on ChartQA using multimodal vllm."
echo "This pathway is intended to be used to create baselines for "
echo "our correctness tests in vllm's CI."
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:l:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm-vlm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
--tasks chartqa \
--batch_size auto \
--apply_chat_template \
--limit $LIMIT


@@ -1,50 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() {
echo``
echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
--tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size auto


@@ -19,27 +19,21 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
     max_model_len = eval_config.get("max_model_len", 4096)
-    batch_size = eval_config.get("batch_size", "auto")
-    backend = eval_config.get("backend", "vllm")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len},"
+        f"max_model_len={max_model_len}"
     )
     results = lm_eval.simple_evaluate(
-        model=backend,
+        model="vllm",
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
-        # text models. however, this is regressing measured strict-match for
-        # existing text models in CI, so only apply it for mm.
-        apply_chat_template=backend == "vllm-vlm",
-        batch_size=batch_size,
+        batch_size="auto",
     )
     return results
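For reference, the YAML baselines earlier in this compare are consumed by a check of roughly this shape (a sketch, not this test file's exact code; it assumes lm-eval's standard results-dict layout and the RTOL = 0.08 visible in the hunk header above):

import numpy as np

RTOL = 0.08  # from the file header shown in the hunk above

def check_results(eval_config: dict, results: dict) -> None:
    # Compare each expected metric value from the YAML config against the measured value.
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured = results["results"][task["name"]][metric["name"]]
            assert np.isclose(ground_truth, measured, rtol=RTOL), (
                f"{task['name']}/{metric['name']}: expected {ground_truth}, got {measured}"
            )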


@@ -8,7 +8,7 @@ steps:
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
       # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
@@ -76,7 +76,7 @@ steps:
       queue: arm64_cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
       # Add job to create multi-arch manifest

File diff suppressed because it is too large.

@@ -527,8 +527,7 @@ steps:
     # since torchao nightly is only compatible with torch nightly currently
     # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
     # we can only upgrade after this is resolved
-    # TODO(jerryzh168): resolve the above comment
-    - uv pip install --system torchao==0.13.0
+    - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
     - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
 
 - label: LM Eval Small Models # 53min
@@ -734,16 +733,6 @@ steps:
     - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
     - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
 
-- label: Multi-Modal Accuracy Eval (Small Models) # 50min
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - vllm/multimodal/
-  - vllm/inputs/
-  - vllm/v1/core/
-  commands:
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-
 - label: Multi-Modal Models Test (Extended) 1
   mirror_hardwares: [amdexperimental]
   optional: true


@@ -1,10 +1,5 @@
 [run]
-# Track the installed vllm package (this is what actually gets imported during tests)
-# Use wildcard pattern to match the installed location
-source =
-    vllm
-    */dist-packages/vllm
-    */site-packages/vllm
+source = vllm
 omit =
     */tests/*
     */test_*
@@ -17,16 +12,6 @@ omit =
     */benchmarks/*
     */docs/*
 
-[paths]
-# Map all possible vllm locations to a canonical "vllm" path
-# This ensures coverage.combine properly merges data from different test runs
-source =
-    vllm
-    /vllm-workspace/src/vllm
-    /vllm-workspace/vllm
-    */site-packages/vllm
-    */dist-packages/vllm
-
 [report]
 exclude_lines =
     pragma: no cover
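As background on the removed [paths] block, a small sketch of the combine step it was added for (using coverage.py's Python API; the data-file locations are assumptions, not taken from this CI setup):

import coverage

# Merge .coverage.* data files produced by separate test runs. Without a [paths]
# mapping, hits recorded under */site-packages/vllm/... and under a source checkout
# count as different files, so the combined report under-reports coverage.
cov = coverage.Coverage(config_file=".coveragerc")
cov.combine()   # looks for .coverage.* data files next to the config
cov.save()
cov.report()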


@@ -1,4 +0,0 @@
# Migrate from `yapf` & `isort` to `ruff`
d6953beb91da4e9c99be4c0a1304a2d24189535c
# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
8fcaaf6a165e661f63fc51be906bc05b0767332f

.github/CODEOWNERS (vendored, 11 lines changed)

@@ -5,7 +5,9 @@
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/model_executor/layers/fused_moe @mgoin
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
@@ -24,6 +26,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
 
 # vLLM V1
+/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/attention @LucasWilkinson
 /vllm/v1/attention/backends/flashinfer.py @mgoin
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
@@ -118,11 +121,3 @@ mkdocs.yaml @hmellor
 # KVConnector installation files
 /requirements/kv_connectors.txt @NickLucche
 
-# Pooling models
-/examples/*/pooling/ @noooop
-/tests/models/*/pooling* @noooop
-/tests/entrypoints/pooling @noooop
-/vllm/config/pooler.py @noooop
-/vllm/pooling_params.py @noooop
-/vllm/model_executor/layers/pooler.py @noooop


@ -13,7 +13,6 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Label issues based on keywords - name: Label issues based on keywords
id: label-step
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with: with:
script: | script: |
@ -43,6 +42,7 @@ jobs:
searchIn: "body" searchIn: "body"
}, },
], ],
// Substring search - matches anywhere in text (partial matches) // Substring search - matches anywhere in text (partial matches)
substrings: [ substrings: [
{ {
@ -89,12 +89,14 @@ jobs:
term: "hip_", term: "hip_",
searchIn: "both" searchIn: "both"
}, },
// ROCm tools and libraries // ROCm tools and libraries
{ {
term: "hipify", term: "hipify",
searchIn: "both" searchIn: "both"
}, },
], ],
// Regex patterns - for complex pattern matching // Regex patterns - for complex pattern matching
regexPatterns: [ regexPatterns: [
{ {
@ -105,17 +107,13 @@ jobs:
} }
], ],
}, },
// Add more label configurations here as needed
// example: {
// keywords: [...],
// substrings: [...],
// regexPatterns: [...]
// },
}; };
// Helper function to create regex based on search type // Helper function to create regex based on search type
function createSearchRegex(term, type) { function createSearchRegex(term, type) {
// Escape special regex characters in the term // Escape special regex characters in the term
const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
switch (type) { switch (type) {
case 'keyword': case 'keyword':
// Word boundary search - matches whole words only // Word boundary search - matches whole words only
@ -127,13 +125,16 @@ jobs:
throw new Error(`Unknown search type: ${type}`); throw new Error(`Unknown search type: ${type}`);
} }
} }
// Helper function to find matching terms in text with line information // Helper function to find matching terms in text with line information
function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') { function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
const matches = []; const matches = [];
const lines = text.split('\n'); const lines = text.split('\n');
for (const termConfig of searchTerms) { for (const termConfig of searchTerms) {
let regex; let regex;
let term, searchIn, pattern, description, flags; let term, searchIn, pattern, description, flags;
// Handle different input formats (string or object) // Handle different input formats (string or object)
if (typeof termConfig === 'string') { if (typeof termConfig === 'string') {
term = termConfig; term = termConfig;
@ -145,17 +146,21 @@ jobs:
description = termConfig.description; description = termConfig.description;
flags = termConfig.flags; flags = termConfig.flags;
} }
// Skip if this term shouldn't be searched in the current location // Skip if this term shouldn't be searched in the current location
if (searchIn !== 'both' && searchIn !== searchLocation) { if (searchIn !== 'both' && searchIn !== searchLocation) {
continue; continue;
} }
// Create appropriate regex // Create appropriate regex
if (searchType === 'regex') { if (searchType === 'regex') {
regex = new RegExp(pattern, flags || "gi"); regex = new RegExp(pattern, flags || "gi");
} else { } else {
regex = createSearchRegex(term, searchType); regex = createSearchRegex(term, searchType);
} }
const termMatches = []; const termMatches = [];
// Check each line for matches // Check each line for matches
lines.forEach((line, lineIndex) => { lines.forEach((line, lineIndex) => {
const lineMatches = line.match(regex); const lineMatches = line.match(regex);
@ -170,14 +175,15 @@ jobs:
originalTerm: term || pattern, originalTerm: term || pattern,
description: description, description: description,
// Show context around the match in the line // Show context around the match in the line
context: line.length > 100 ? context: line.length > 100 ?
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
: line.trim() : line.trim()
}); });
}); });
} }
}); });
if (termMatches.length > 0) { if (termMatches.length > 0) {
matches.push({ matches.push({
term: term || (description || pattern), term: term || (description || pattern),
@ -190,48 +196,64 @@ jobs:
}); });
} }
} }
return matches; return matches;
} }
// Helper function to check if label should be added // Helper function to check if label should be added
async function processLabel(labelName, config) { async function processLabel(labelName, config) {
const body = context.payload.issue.body || ""; const body = context.payload.issue.body || "";
const title = context.payload.issue.title || ""; const title = context.payload.issue.title || "";
core.notice(`Processing label: ${labelName}`); core.notice(`Processing label: ${labelName}`);
core.notice(`Issue Title: "${title}"`); core.notice(`Issue Title: "${title}"`);
core.notice(`Issue Body length: ${body.length} characters`); core.notice(`Issue Body length: ${body.length} characters`);
let shouldAddLabel = false; let shouldAddLabel = false;
let allMatches = []; let allMatches = [];
let reason = ''; let reason = '';
const keywords = config.keywords || []; const keywords = config.keywords || [];
const substrings = config.substrings || []; const substrings = config.substrings || [];
const regexPatterns = config.regexPatterns || []; const regexPatterns = config.regexPatterns || [];
core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`); core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
// Search in title // Search in title
if (title.trim()) { if (title.trim()) {
core.notice(`Searching in title: "${title}"`); core.notice(`Searching in title: "${title}"`);
const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title'); const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title'); const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title'); const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches); allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
} }
// Search in body // Search in body
if (body.trim()) { if (body.trim()) {
core.notice(`Searching in body (${body.length} characters)`); core.notice(`Searching in body (${body.length} characters)`);
const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body'); const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body'); const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body'); const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches); allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
} }
if (allMatches.length > 0) { if (allMatches.length > 0) {
core.notice(`Found ${allMatches.length} matching term(s):`); core.notice(`Found ${allMatches.length} matching term(s):`);
for (const termMatch of allMatches) { for (const termMatch of allMatches) {
const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body'; const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn; const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
if (termMatch.searchType === 'regex') { if (termMatch.searchType === 'regex') {
core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
} else { } else {
core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
} }
// Show details for each match // Show details for each match
termMatch.matches.forEach((match, index) => { termMatch.matches.forEach((match, index) => {
core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`); core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@ -244,6 +266,7 @@ jobs:
} }
}); });
} }
shouldAddLabel = true; shouldAddLabel = true;
const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0); const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0); const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@ -251,10 +274,13 @@ jobs:
const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0); const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0); const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0); const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`; reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
} }
core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`); core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
core.notice(`Reason: ${reason || 'No matching terms found'}`); core.notice(`Reason: ${reason || 'No matching terms found'}`);
if (shouldAddLabel) { if (shouldAddLabel) {
const existingLabels = context.payload.issue.labels.map(l => l.name); const existingLabels = context.payload.issue.labels.map(l => l.name);
if (!existingLabels.includes(labelName)) { if (!existingLabels.includes(labelName)) {
@ -270,92 +296,14 @@ jobs:
core.notice(`Label "${labelName}" already present.`); core.notice(`Label "${labelName}" already present.`);
return false; return false;
} }
core.notice(`No matching terms found for label "${labelName}".`); core.notice(`No matching terms found for label "${labelName}".`);
return false; return false;
} }
// Process all configured labels // Process all configured labels
const labelsAddedResults = await Promise.all( const processLabels = Object.entries(labelConfig)
Object.entries(labelConfig).map(([labelName, config]) => .map(([labelName, config]) => processLabel(labelName, config));
processLabel(labelName, config).then(added => ({ labelName, added })) const labelsAdded = await Promise.all(processLabels);
) const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
); core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
// Return which labels were added for the next step
const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
core.setOutput('labels_added', JSON.stringify(addedLabels));
return addedLabels;
- name: CC users for labeled issues
if: steps.label-step.outputs.labels_added != '[]'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
// Configuration: Map labels to GitHub users to CC
// You can add multiple users per label, and multiple label configurations
const ccConfig = {
rocm: {
users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
},
// Add more label -> user mappings here
// Example:
// cuda: {
// users: ['user1', 'user2'],
// message: 'CC {users} for CUDA-related issue'
// },
// performance: {
// users: ['perfexpert'],
// message: 'CC {users} for performance issue'
// },
};
const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
core.notice(`Labels added: ${labelsAdded.join(', ')}`);
// Get existing comments to check for already mentioned users
const comments = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
});
const issueBody = context.payload.issue.body || '';
const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
// Process each label that was added
for (const label of labelsAdded) {
if (ccConfig[label]) {
const config = ccConfig[label];
const usersToMention = [];
// Check which users haven't been mentioned yet
for (const user of config.users) {
const mentionPattern = new RegExp(`@${user}\\b`, 'i');
if (!mentionPattern.test(allExistingText)) {
usersToMention.push(user);
} else {
core.notice(`@${user} already mentioned for label "${label}", skipping`);
}
}
// Post comment if there are users to mention
if (usersToMention.length > 0) {
const mentions = usersToMention.map(u => `@${u}`).join(' ');
const message = config.message.replace('{users}', mentions);
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: message
});
core.notice(`CC comment added for label "${label}": ${mentions}`);
} else {
core.notice(`All users for label "${label}" already mentioned, skipping comment`);
}
}
}
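The keyword / substring / regex distinction in the workflow above comes down to how the search pattern is built; a rough Python equivalent of the `createSearchRegex` helper (illustrative only, the workflow itself is JavaScript):

import re

def create_search_regex(term: str, search_type: str) -> re.Pattern:
    escaped = re.escape(term)  # escape special regex characters, as the JS helper does
    if search_type == "keyword":
        # Word-boundary search: matches whole words only.
        return re.compile(rf"\b{escaped}\b", re.IGNORECASE)
    if search_type == "substring":
        # Substring search: matches anywhere in the text.
        return re.compile(escaped, re.IGNORECASE)
    raise ValueError(f"Unknown search type: {search_type}")

# Example: "rocm" as a substring matches inside "rocmsoftwareplatform",
# but as a keyword (whole word) it does not.
print(bool(create_search_regex("rocm", "substring").search("see rocmsoftwareplatform docs")))  # True
print(bool(create_search_regex("rocm", "keyword").search("see rocmsoftwareplatform docs")))    # False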


@@ -16,7 +16,6 @@ repos:
   rev: v1.38.1
   hooks:
   - id: typos
-    args: [--force-exclude]
 - repo: https://github.com/pre-commit/mirrors-clang-format
   rev: v21.1.2
   hooks:


@ -8,6 +8,7 @@ import sys
import time import time
import traceback import traceback
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional, Union
import aiohttp import aiohttp
import huggingface_hub.constants import huggingface_hub.constants
@ -27,13 +28,13 @@ class RequestFuncInput:
prompt_len: int prompt_len: int
output_len: int output_len: int
model: str model: str
model_name: str | None = None model_name: Optional[str] = None
logprobs: int | None = None logprobs: Optional[int] = None
extra_body: dict | None = None extra_body: Optional[dict] = None
multi_modal_content: dict | list[dict] | None = None multi_modal_content: Optional[dict | list[dict]] = None
ignore_eos: bool = False ignore_eos: bool = False
language: str | None = None language: Optional[str] = None
request_id: str | None = None request_id: Optional[str] = None
@dataclass @dataclass
@ -51,7 +52,7 @@ class RequestFuncOutput:
async def async_request_tgi( async def async_request_tgi(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: tqdm | None = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith("generate_stream") assert api_url.endswith("generate_stream")
@ -132,7 +133,7 @@ async def async_request_tgi(
async def async_request_trt_llm( async def async_request_trt_llm(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: tqdm | None = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith("generate_stream") assert api_url.endswith("generate_stream")
@ -203,7 +204,7 @@ async def async_request_trt_llm(
async def async_request_deepspeed_mii( async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: tqdm | None = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), ( assert api_url.endswith(("completions", "profile")), (
@ -266,7 +267,7 @@ async def async_request_deepspeed_mii(
async def async_request_openai_completions( async def async_request_openai_completions(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: tqdm | None = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), ( assert api_url.endswith(("completions", "profile")), (
@ -366,7 +367,7 @@ async def async_request_openai_completions(
async def async_request_openai_chat_completions( async def async_request_openai_chat_completions(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: tqdm | None = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith(("chat/completions", "profile")), ( assert api_url.endswith(("chat/completions", "profile")), (
@ -475,7 +476,7 @@ async def async_request_openai_chat_completions(
async def async_request_openai_audio( async def async_request_openai_audio(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: tqdm | None = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
# Lazy import without PlaceholderModule to avoid vllm dep. # Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile import soundfile
@ -609,7 +610,7 @@ def get_tokenizer(
tokenizer_mode: str = "auto", tokenizer_mode: str = "auto",
trust_remote_code: bool = False, trust_remote_code: bool = False,
**kwargs, **kwargs,
) -> PreTrainedTokenizer | PreTrainedTokenizerFast: ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
if pretrained_model_name_or_path is not None and not os.path.exists( if pretrained_model_name_or_path is not None and not os.path.exists(
pretrained_model_name_or_path pretrained_model_name_or_path
): ):
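The typing changes in this and the following benchmark files all follow the same pattern; as a reminder (illustrative snippet, not taken from the diff), the two spellings are interchangeable for type checkers, and the `X | Y` form also works at runtime on Python 3.10+ (or in annotations on older versions when `from __future__ import annotations` is in effect, which is exactly what some later hunks add):

from typing import Optional, Union

# Old-style typing aliases.
def g(name: Optional[str] = None) -> Union[int, float]:
    return 0

# Equivalent PEP 604 union syntax.
def f(name: str | None = None) -> int | float:
    return 0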


@ -32,6 +32,7 @@ import dataclasses
import json import json
import random import random
import time import time
from typing import Optional
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
@ -79,7 +80,7 @@ def sample_requests_from_dataset(
num_requests: int, num_requests: int,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int], input_length_range: tuple[int, int],
fixed_output_len: int | None, fixed_output_len: Optional[int],
) -> list[Request]: ) -> list[Request]:
if fixed_output_len is not None and fixed_output_len < 4: if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small") raise ValueError("output_len too small")
@ -127,7 +128,7 @@ def sample_requests_from_random(
num_requests: int, num_requests: int,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int], input_length_range: tuple[int, int],
fixed_output_len: int | None, fixed_output_len: Optional[int],
prefix_len: int, prefix_len: int,
) -> list[Request]: ) -> list[Request]:
requests = [] requests = []


@ -7,6 +7,7 @@ import dataclasses
import json import json
import random import random
import time import time
from typing import Optional
from transformers import AutoTokenizer, PreTrainedTokenizerBase from transformers import AutoTokenizer, PreTrainedTokenizerBase
@ -23,7 +24,7 @@ def sample_requests(
dataset_path: str, dataset_path: str,
num_requests: int, num_requests: int,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
fixed_output_len: int | None, fixed_output_len: Optional[int],
) -> list[tuple[str, int, int, int]]: ) -> list[tuple[str, int, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4: if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small") raise ValueError("output_len too small")


@ -32,6 +32,7 @@ import uuid
import warnings import warnings
from collections.abc import AsyncGenerator from collections.abc import AsyncGenerator
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional
import datasets import datasets
import numpy as np import numpy as np
@ -315,7 +316,7 @@ def calculate_metrics(
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: list[str], selected_percentile_metrics: list[str],
selected_percentiles: list[float], selected_percentiles: list[float],
goodput_config_dict: dict[str, float] | None = None, goodput_config_dict: Optional[dict[str, float]] = None,
) -> tuple[BenchmarkMetrics, list[int]]: ) -> tuple[BenchmarkMetrics, list[int]]:
actual_output_lens: list[int] = [] actual_output_lens: list[int] = []
total_input = 0 total_input = 0
@ -435,9 +436,9 @@ async def benchmark(
selected_percentile_metrics: list[str], selected_percentile_metrics: list[str],
selected_percentiles: list[str], selected_percentiles: list[str],
ignore_eos: bool, ignore_eos: bool,
max_concurrency: int | None, max_concurrency: Optional[int],
structured_output_ratio: float, structured_output_ratio: float,
goodput_config_dict: dict[str, float] | None = None, goodput_config_dict: Optional[dict[str, float]] = None,
): ):
if backend in ASYNC_REQUEST_FUNCS: if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend] request_func = ASYNC_REQUEST_FUNCS[backend]


@ -6,7 +6,7 @@ import math
import os import os
import time import time
from types import TracebackType from types import TracebackType
from typing import Any from typing import Any, Optional, Union
def convert_to_pytorch_benchmark_format( def convert_to_pytorch_benchmark_format(
@ -92,7 +92,7 @@ class TimeCollector:
def __init__(self, scale: int) -> None: def __init__(self, scale: int) -> None:
self.cnt: int = 0 self.cnt: int = 0
self._sum: int = 0 self._sum: int = 0
self._max: int | None = None self._max: Optional[int] = None
self.scale = scale self.scale = scale
self.start_time: int = time.monotonic_ns() self.start_time: int = time.monotonic_ns()
@ -104,13 +104,13 @@ class TimeCollector:
else: else:
self._max = max(self._max, v) self._max = max(self._max, v)
def avg(self) -> float | str: def avg(self) -> Union[float, str]:
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A" return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
def max(self) -> float | str: def max(self) -> Union[float, str]:
return self._max / self.scale if self._max else "N/A" return self._max / self.scale if self._max else "N/A"
def dump_avg_max(self) -> list[float | str]: def dump_avg_max(self) -> list[Union[float, str]]:
return [self.avg(), self.max()] return [self.avg(), self.max()]
def __enter__(self) -> None: def __enter__(self) -> None:
@ -118,8 +118,8 @@ class TimeCollector:
def __exit__( def __exit__(
self, self,
exc_type: type[BaseException] | None, exc_type: Optional[type[BaseException]],
exc_value: BaseException | None, exc_value: Optional[BaseException],
exc_traceback: TracebackType | None, exc_traceback: Optional[TracebackType],
) -> None: ) -> None:
self.collect(time.monotonic_ns() - self.start_time) self.collect(time.monotonic_ns() - self.start_time)


@ -6,7 +6,8 @@ import copy
import itertools import itertools
import pickle as pkl import pickle as pkl
import time import time
from collections.abc import Callable, Iterable from collections.abc import Iterable
from typing import Callable
import torch import torch
import torch.utils.benchmark as TBenchmark import torch.utils.benchmark as TBenchmark


@ -6,7 +6,8 @@ import copy
import itertools import itertools
import pickle as pkl import pickle as pkl
import time import time
from collections.abc import Callable, Iterable from collections.abc import Iterable
from typing import Callable, Optional
import torch import torch
import torch.utils.benchmark as TBenchmark import torch.utils.benchmark as TBenchmark
@ -52,7 +53,7 @@ def bench_int8(
n: int, n: int,
label: str, label: str,
sub_label: str, sub_label: str,
bench_kernels: list[str] | None = None, bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]: ) -> Iterable[TMeasurement]:
"""Benchmark INT8-based kernels.""" """Benchmark INT8-based kernels."""
assert dtype == torch.int8 assert dtype == torch.int8
@ -107,7 +108,7 @@ def bench_fp8(
n: int, n: int,
label: str, label: str,
sub_label: str, sub_label: str,
bench_kernels: list[str] | None = None, bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]: ) -> Iterable[TMeasurement]:
"""Benchmark FP8-based kernels.""" """Benchmark FP8-based kernels."""
assert dtype == torch.float8_e4m3fn assert dtype == torch.float8_e4m3fn
@ -182,7 +183,7 @@ def bench(
n: int, n: int,
label: str, label: str,
sub_label: str, sub_label: str,
bench_kernels: list[str] | None = None, bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]: ) -> Iterable[TMeasurement]:
if dtype == torch.int8: if dtype == torch.int8:
return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@ -200,7 +201,7 @@ def print_timers(timers: Iterable[TMeasurement]):
def run( def run(
dtype: torch.dtype, dtype: torch.dtype,
MKNs: Iterable[tuple[int, int, int]], MKNs: Iterable[tuple[int, int, int]],
bench_kernels: list[str] | None = None, bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]: ) -> Iterable[TMeasurement]:
results = [] results = []
for m, k, n in MKNs: for m, k, n in MKNs:


@ -3,9 +3,10 @@
import pickle as pkl import pickle as pkl
import time import time
from collections.abc import Callable, Iterable from collections.abc import Iterable
from dataclasses import dataclass from dataclasses import dataclass
from itertools import product from itertools import product
from typing import Callable, Optional
import torch import torch
import torch.utils.benchmark as TBenchmark import torch.utils.benchmark as TBenchmark
@ -50,7 +51,7 @@ def get_bench_params() -> list[bench_params_t]:
def unfused_int8_impl( def unfused_int8_impl(
rms_norm_layer: RMSNorm, rms_norm_layer: RMSNorm,
x: torch.Tensor, x: torch.Tensor,
residual: torch.Tensor | None, residual: Optional[torch.Tensor],
quant_dtype: torch.dtype, quant_dtype: torch.dtype,
): ):
# Norm # Norm
@ -67,7 +68,7 @@ def unfused_int8_impl(
def unfused_fp8_impl( def unfused_fp8_impl(
rms_norm_layer: RMSNorm, rms_norm_layer: RMSNorm,
x: torch.Tensor, x: torch.Tensor,
residual: torch.Tensor | None, residual: Optional[torch.Tensor],
quant_dtype: torch.dtype, quant_dtype: torch.dtype,
): ):
# Norm # Norm
@ -84,7 +85,7 @@ def unfused_fp8_impl(
def fused_impl( def fused_impl(
rms_norm_layer: RMSNorm, # this stores the weights rms_norm_layer: RMSNorm, # this stores the weights
x: torch.Tensor, x: torch.Tensor,
residual: torch.Tensor | None, residual: Optional[torch.Tensor],
quant_dtype: torch.dtype, quant_dtype: torch.dtype,
): ):
out, _ = ops.rms_norm_dynamic_per_token_quant( out, _ = ops.rms_norm_dynamic_per_token_quant(


@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools import itertools
from collections.abc import Callable from typing import Callable
from unittest.mock import patch from unittest.mock import patch
import pandas as pd import pandas as pd


@ -22,8 +22,8 @@ Example:
import json import json
import os import os
import time import time
from collections.abc import Callable
from contextlib import nullcontext from contextlib import nullcontext
from typing import Callable, Optional
import torch import torch
import torch.distributed as dist import torch.distributed as dist
@ -264,12 +264,12 @@ class CommunicatorBenchmark:
def benchmark_allreduce_single( def benchmark_allreduce_single(
self, self,
sequence_length: int, sequence_length: int,
allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None], allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
should_use_fn: Callable[[torch.Tensor], bool], should_use_fn: Callable[[torch.Tensor], bool],
context, context,
num_warmup: int, num_warmup: int,
num_trials: int, num_trials: int,
) -> float | None: ) -> Optional[float]:
"""Benchmark method with CUDA graph optimization.""" """Benchmark method with CUDA graph optimization."""
try: try:
# Create test tensor (2D: sequence_length x hidden_size) # Create test tensor (2D: sequence_length x hidden_size)


@ -6,12 +6,11 @@ import copy
import json import json
import pickle import pickle
import time import time
from collections.abc import Callable
from dataclasses import dataclass from dataclasses import dataclass
from enum import Enum, auto from enum import Enum, auto
from itertools import product from itertools import product
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any, Callable, Optional
import torch import torch
import torch.utils.benchmark as TBenchmark import torch.utils.benchmark as TBenchmark
@ -159,7 +158,7 @@ def ref_group_gemm(
seq_lens_cpu: torch.Tensor, seq_lens_cpu: torch.Tensor,
prompt_lora_mapping_cpu: torch.Tensor, prompt_lora_mapping_cpu: torch.Tensor,
scaling: float, scaling: float,
add_inputs: bool | None, add_inputs: Optional[bool],
): ):
""" """
Torch group gemm reference implementation to test correctness of Torch group gemm reference implementation to test correctness of
@ -317,8 +316,8 @@ class BenchmarkContext:
lora_rank: int lora_rank: int
sort_by_lora_id: bool sort_by_lora_id: bool
dtype: torch.dtype dtype: torch.dtype
seq_length: int | None = None seq_length: Optional[int] = None
num_slices: int | None = None # num_slices for slice based ops num_slices: Optional[int] = None # num_slices for slice based ops
def with_seq_length(self, seq_length: int) -> "BenchmarkContext": def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
ctx = copy.copy(self) ctx = copy.copy(self)
@ -562,7 +561,7 @@ class BenchmarkTensors:
} }
def bench_fn_kwargs( def bench_fn_kwargs(
self, op_type: OpType, add_inputs: bool | None = None self, op_type: OpType, add_inputs: Optional[bool] = None
) -> dict[str, Any]: ) -> dict[str, Any]:
if op_type.is_shrink_fn(): if op_type.is_shrink_fn():
assert add_inputs is None assert add_inputs is None
@ -576,7 +575,7 @@ class BenchmarkTensors:
raise ValueError(f"Unrecognized optype {self}") raise ValueError(f"Unrecognized optype {self}")
def test_correctness( def test_correctness(
self, op_type: OpType, expand_fn_add_inputs: bool | None self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
) -> bool: ) -> bool:
""" """
Test correctness of op_type implementation against a grouped gemm Test correctness of op_type implementation against a grouped gemm
@ -612,8 +611,8 @@ def bench_optype(
ctx: BenchmarkContext, ctx: BenchmarkContext,
arg_pool_size: int, arg_pool_size: int,
op_type: OpType, op_type: OpType,
cuda_graph_nops: int | None = None, cuda_graph_nops: Optional[int] = None,
expand_fn_add_inputs: bool | None = None, expand_fn_add_inputs: Optional[bool] = None,
test_correctness: bool = False, test_correctness: bool = False,
) -> TMeasurement: ) -> TMeasurement:
assert arg_pool_size >= 1 assert arg_pool_size >= 1
@ -680,7 +679,7 @@ def bench_torch_mm(
ctx: BenchmarkContext, ctx: BenchmarkContext,
arg_pool_size: int, arg_pool_size: int,
op_type: OpType, op_type: OpType,
cuda_graph_nops: int | None = None, cuda_graph_nops: Optional[int] = None,
) -> TMeasurement: ) -> TMeasurement:
""" """
Benchmark basic torch.mm as a roofline. Benchmark basic torch.mm as a roofline.
@ -745,7 +744,7 @@ def use_cuda_graph_recommendation() -> str:
""" """
def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None): def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
compare = TBenchmark.Compare(timers) compare = TBenchmark.Compare(timers)
compare.print() compare.print()


@ -8,9 +8,10 @@ import math
import os import os
import pickle as pkl import pickle as pkl
import time import time
from collections.abc import Callable, Iterable from collections.abc import Iterable
from dataclasses import dataclass from dataclasses import dataclass
from itertools import product from itertools import product
from typing import Callable, Optional
import pandas as pd import pandas as pd
import torch import torch
@ -62,23 +63,23 @@ class BenchmarkTensors:
a: torch.Tensor a: torch.Tensor
w_q: torch.Tensor w_q: torch.Tensor
group_size: int | None group_size: Optional[int]
wtype: ScalarType wtype: ScalarType
w_g_s: torch.Tensor w_g_s: torch.Tensor
w_g_zp: torch.Tensor | None w_g_zp: Optional[torch.Tensor]
w_ch_s: torch.Tensor | None w_ch_s: Optional[torch.Tensor]
w_tok_s: torch.Tensor | None w_tok_s: Optional[torch.Tensor]
@dataclass @dataclass
class TypeConfig: class TypeConfig:
act_type: torch.dtype act_type: torch.dtype
weight_type: ScalarType weight_type: ScalarType
output_type: torch.dtype | None output_type: Optional[torch.dtype]
group_scale_type: torch.dtype | None group_scale_type: Optional[torch.dtype]
group_zero_type: torch.dtype | None group_zero_type: Optional[torch.dtype]
channel_scale_type: torch.dtype | None channel_scale_type: Optional[torch.dtype]
token_scale_type: torch.dtype | None token_scale_type: Optional[torch.dtype]
def rand_data(shape, dtype=torch.float16, scale=1): def rand_data(shape, dtype=torch.float16, scale=1):
@ -92,8 +93,8 @@ def quantize_and_pack(
atype: torch.dtype, atype: torch.dtype,
w: torch.Tensor, w: torch.Tensor,
wtype: ScalarType, wtype: ScalarType,
stype: torch.dtype | None, stype: Optional[torch.dtype],
group_size: int | None, group_size: Optional[int],
zero_points: bool = False, zero_points: bool = False,
): ):
assert wtype.is_integer(), "TODO: support floating point weights" assert wtype.is_integer(), "TODO: support floating point weights"
@ -112,7 +113,7 @@ def quantize_and_pack(
def create_bench_tensors( def create_bench_tensors(
shape: tuple[int, int, int], types: TypeConfig, group_size: int | None shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
) -> list[BenchmarkTensors]: ) -> list[BenchmarkTensors]:
m, n, k = shape m, n, k = shape
@ -330,8 +331,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
return res return res
_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None _SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None _SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
def bench( def bench(


@@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
     else:
         ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     block_quant_shape = get_weight_block_size_safety(config)
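For context (illustrative, not part of the diff): the two spellings here track the Transformers rename of the config attribute `torch_dtype` to `dtype`. A minimal snippet that reads whichever one the installed Transformers version exposes (the model id is arbitrary):

import torch
from transformers import AutoConfig

# Arbitrary public model id, used only for illustration.
config = AutoConfig.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
# Newer Transformers expose the checkpoint dtype as `dtype`, older ones as `torch_dtype`;
# either may be None, in which case we fall back to float16 here.
dtype = getattr(config, "dtype", None) or getattr(config, "torch_dtype", None) or torch.float16
print(dtype)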


@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
     topk = config.num_experts_per_tok
     hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     use_customized_permute = args.use_customized_permute


@ -3,6 +3,7 @@
import random import random
import time import time
from typing import Optional
import torch import torch
@ -36,7 +37,7 @@ def main(
seed: int, seed: int,
do_profile: bool, do_profile: bool,
device: str = "cuda", device: str = "cuda",
kv_cache_dtype: str | None = None, kv_cache_dtype: Optional[str] = None,
) -> None: ) -> None:
current_platform.seed_everything(seed) current_platform.seed_everything(seed)


@ -3,8 +3,8 @@
import argparse import argparse
import math import math
from collections.abc import Callable
from contextlib import contextmanager from contextlib import contextmanager
from typing import Callable
from unittest.mock import patch from unittest.mock import patch
import torch import torch


@ -1,5 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random import random
import time import time


@ -1,5 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random import random
import time import time


@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools import itertools
from typing import Optional, Union
import torch import torch
from flashinfer.norm import fused_add_rmsnorm, rmsnorm from flashinfer.norm import fused_add_rmsnorm, rmsnorm
@ -20,8 +21,8 @@ class HuggingFaceRMSNorm(nn.Module):
def forward( def forward(
self, self,
x: torch.Tensor, x: torch.Tensor,
residual: torch.Tensor | None = None, residual: Optional[torch.Tensor] = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
orig_dtype = x.dtype orig_dtype = x.dtype
x = x.to(torch.float32) x = x.to(torch.float32)
if residual is not None: if residual is not None:
@ -40,7 +41,7 @@ class HuggingFaceRMSNorm(nn.Module):
def rmsnorm_naive( def rmsnorm_naive(
x: torch.Tensor, x: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
residual: torch.Tensor | None = None, residual: Optional[torch.Tensor] = None,
eps: float = 1e-6, eps: float = 1e-6,
): ):
naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps) naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
@ -64,7 +65,7 @@ def rmsnorm_naive(
def rmsnorm_flashinfer( def rmsnorm_flashinfer(
x: torch.Tensor, x: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
residual: torch.Tensor | None = None, residual: Optional[torch.Tensor] = None,
eps: float = 1e-6, eps: float = 1e-6,
): ):
orig_shape = x.shape orig_shape = x.shape
@ -88,7 +89,7 @@ def rmsnorm_flashinfer(
def rmsnorm_vllm( def rmsnorm_vllm(
x: torch.Tensor, x: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
residual: torch.Tensor | None = None, residual: Optional[torch.Tensor] = None,
eps: float = 1e-6, eps: float = 1e-6,
): ):
orig_shape = x.shape orig_shape = x.shape

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from itertools import accumulate from itertools import accumulate
from typing import Optional
import nvtx import nvtx
import torch import torch
@ -17,7 +18,7 @@ def benchmark_rope_kernels_multi_lora(
seq_len: int, seq_len: int,
num_heads: int, num_heads: int,
head_size: int, head_size: int,
rotary_dim: int | None, rotary_dim: Optional[int],
dtype: torch.dtype, dtype: torch.dtype,
seed: int, seed: int,
device: str, device: str,

View File

@ -4,6 +4,7 @@
import csv import csv
import os import os
from datetime import datetime from datetime import datetime
from typing import Optional
import flashinfer import flashinfer
import torch import torch
@ -27,7 +28,9 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad() @torch.no_grad()
def benchmark_decode( def benchmark_decode(
dtype: torch.dtype, dtype: torch.dtype,
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], quant_dtypes: tuple[
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
],
batch_size: int, batch_size: int,
max_seq_len: int, max_seq_len: int,
num_heads: tuple[int, int] = (64, 8), num_heads: tuple[int, int] = (64, 8),

View File

@ -4,6 +4,7 @@
import csv import csv
import os import os
from datetime import datetime from datetime import datetime
from typing import Optional
import flashinfer import flashinfer
import torch import torch
@ -27,7 +28,9 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad() @torch.no_grad()
def benchmark_prefill( def benchmark_prefill(
dtype: torch.dtype, dtype: torch.dtype,
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], quant_dtypes: tuple[
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
],
batch_size: int, batch_size: int,
max_seq_len: int, max_seq_len: int,
num_heads: tuple[int, int] = (64, 8), num_heads: tuple[int, int] = (64, 8),

View File

@ -2,8 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses import dataclasses
from collections.abc import Callable, Iterable from collections.abc import Iterable
from typing import Any from typing import Any, Callable, Optional
import torch import torch
import torch.utils.benchmark as TBenchmark import torch.utils.benchmark as TBenchmark
@ -55,7 +55,7 @@ class Bench:
def __init__( def __init__(
self, self,
cuda_graph_params: CudaGraphBenchParams | None, cuda_graph_params: Optional[CudaGraphBenchParams],
label: str, label: str,
sub_label: str, sub_label: str,
description: str, description: str,

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from statistics import mean from statistics import mean
from typing import Any, NamedTuple from typing import Any, NamedTuple, Optional, Union
import numpy as np # type: ignore import numpy as np # type: ignore
import pandas as pd # type: ignore import pandas as pd # type: ignore
@ -35,8 +35,8 @@ class Distribution(ABC):
class UniformDistribution(Distribution): class UniformDistribution(Distribution):
def __init__( def __init__(
self, self,
min_val: int | float, min_val: Union[int, float],
max_val: int | float, max_val: Union[int, float],
is_integer: bool = True, is_integer: bool = True,
) -> None: ) -> None:
self.min_val = min_val self.min_val = min_val
@ -56,7 +56,7 @@ class UniformDistribution(Distribution):
class ConstantDistribution(Distribution): class ConstantDistribution(Distribution):
def __init__(self, value: int | float) -> None: def __init__(self, value: Union[int, float]) -> None:
self.value = value self.value = value
self.max_val = value self.max_val = value
@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):
class ZipfDistribution(Distribution): class ZipfDistribution(Distribution):
def __init__(self, alpha: float, max_val: int | None = None) -> None: def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
self.alpha = alpha self.alpha = alpha
self.max_val = max_val self.max_val = max_val
@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):
class PoissonDistribution(Distribution): class PoissonDistribution(Distribution):
def __init__(self, alpha: float, max_val: int | None = None) -> None: def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
self.alpha = alpha self.alpha = alpha
self.max_val = max_val self.max_val = max_val
@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
class LognormalDistribution(Distribution): class LognormalDistribution(Distribution):
def __init__( def __init__(
self, self,
mean: float | None = None, mean: Optional[float] = None,
sigma: float | None = None, sigma: Optional[float] = None,
average: int | None = None, average: Optional[int] = None,
median_ratio: float | None = None, median_ratio: Optional[float] = None,
max_val: int | None = None, max_val: Optional[int] = None,
) -> None: ) -> None:
self.average = average self.average = average
self.median_ratio = median_ratio self.median_ratio = median_ratio

View File

@ -13,7 +13,7 @@ from datetime import datetime
from enum import Enum from enum import Enum
from http import HTTPStatus from http import HTTPStatus
from statistics import mean from statistics import mean
from typing import NamedTuple from typing import NamedTuple, Union
import aiohttp # type: ignore import aiohttp # type: ignore
import numpy as np # type: ignore import numpy as np # type: ignore
@ -169,7 +169,7 @@ class MovingAverage:
class DebugStats: class DebugStats:
def __init__(self, logger: logging.Logger, window_size: int) -> None: def __init__(self, logger: logging.Logger, window_size: int) -> None:
self.logger = logger self.logger = logger
self.metrics: dict[str, MovingAverage | MetricStats] = { self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
"moving_avg_ttft_ms": MovingAverage(window_size), "moving_avg_ttft_ms": MovingAverage(window_size),
"moving_avg_tpot_ms": MovingAverage(window_size), "moving_avg_tpot_ms": MovingAverage(window_size),
"ttft_ms": MetricStats(), "ttft_ms": MetricStats(),
@ -636,7 +636,7 @@ async def client_main(
if args.verbose: if args.verbose:
curr_time_sec: float = time.perf_counter() curr_time_sec: float = time.perf_counter()
time_since_last_turn: str | float = "N/A" time_since_last_turn: Union[str, float] = "N/A"
if conv_id in time_of_last_turn: if conv_id in time_of_last_turn:
time_since_last_turn = round( time_since_last_turn = round(
curr_time_sec - time_of_last_turn[conv_id], 3 curr_time_sec - time_of_last_turn[conv_id], 3
@ -928,13 +928,13 @@ async def main_mp(
f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501 f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
) )
rps: str | float = round(len(client_metrics) / runtime_sec, 3) rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
if len(client_metrics) < (5 * bench_args.num_clients): if len(client_metrics) < (5 * bench_args.num_clients):
# Do not estimate the RPS if the number of samples is very low # Do not estimate the RPS if the number of samples is very low
# (threshold can be tuned if needed) # (threshold can be tuned if needed)
rps = "N/A" rps = "N/A"
runtime_left_sec: str | float = round( runtime_left_sec: Union[str, float] = round(
(runtime_sec / finished_convs) * (total_convs - finished_convs), 3 (runtime_sec / finished_convs) * (total_convs - finished_convs), 3
) )
if percent < 0.05: if percent < 0.05:

View File

@ -13,7 +13,7 @@ import argparse
import json import json
import random import random
from statistics import mean from statistics import mean
from typing import Any from typing import Any, Optional
import pandas as pd # type: ignore import pandas as pd # type: ignore
import tqdm # type: ignore import tqdm # type: ignore
@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
def content_is_valid( def content_is_valid(
content: str, min_content_len: int | None, max_content_len: int | None content: str, min_content_len: Optional[int], max_content_len: Optional[int]
) -> bool: ) -> bool:
if min_content_len and len(content) < min_content_len: if min_content_len and len(content) < min_content_len:
return False return False
@ -37,7 +37,7 @@ def content_is_valid(
def print_stats( def print_stats(
conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
) -> None: ) -> None:
# Collect statistics # Collect statistics
stats = [] stats = []
@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
seed: int, seed: int,
input_file: str, input_file: str,
output_file: str, output_file: str,
max_items: int | None, max_items: Optional[int],
min_content_len: int | None = None, min_content_len: Optional[int] = None,
max_content_len: int | None = None, max_content_len: Optional[int] = None,
min_turns: int | None = None, min_turns: Optional[int] = None,
max_turns: int | None = None, max_turns: Optional[int] = None,
model: str | None = None, model: Optional[str] = None,
) -> None: ) -> None:
if min_turns and max_turns: if min_turns and max_turns:
assert min_turns <= max_turns assert min_turns <= max_turns

View File

@ -22,10 +22,10 @@ else()
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
BUILD_COMMAND "" BUILD_COMMAND ""
) )
FetchContent_Populate(qutlass)
set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
endif() endif()
FetchContent_Populate(qutlass)
if(NOT qutlass_SOURCE_DIR) if(NOT qutlass_SOURCE_DIR)
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.") message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
endif() endif()

View File

@ -1,12 +0,0 @@
codecov:
require_ci_to_pass: false
fixes:
# Map source code paths to repository root paths
# Wildcards match any Python version (python3.*)
- "/vllm-workspace/src/vllm/::vllm/"
- "/vllm-workspace/vllm/::vllm/"
- "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
- "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/lib/python3.*/site-packages/vllm/::vllm/"

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import enum import enum
from typing import Union
from cutlass_library import * from cutlass_library import *
@ -21,7 +22,7 @@ class MixedInputKernelScheduleType(enum.Enum):
TmaWarpSpecializedCooperative = enum_auto() TmaWarpSpecializedCooperative = enum_auto()
VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = { VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
**DataTypeNames, # type: ignore **DataTypeNames, # type: ignore
**{ **{
VLLMDataType.u4b8: "u4b8", VLLMDataType.u4b8: "u4b8",
@ -29,7 +30,7 @@ VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
}, },
} }
VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = { VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
**DataTypeTag, # type: ignore **DataTypeTag, # type: ignore
**{ **{
VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t", VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
@ -37,7 +38,7 @@ VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
}, },
} }
VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = { VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
**DataTypeSize, # type: ignore **DataTypeSize, # type: ignore
**{ **{
VLLMDataType.u4b8: 4, VLLMDataType.u4b8: 4,
@ -45,7 +46,7 @@ VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
}, },
} }
VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = { VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
VLLMDataType.u4b8: "vllm::kU4B8", VLLMDataType.u4b8: "vllm::kU4B8",
VLLMDataType.u8b128: "vllm::kU8B128", VLLMDataType.u8b128: "vllm::kU8B128",
DataType.u4: "vllm::kU4", DataType.u4: "vllm::kU4",
@ -56,7 +57,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
DataType.bf16: "vllm::kBfloat16", DataType.bf16: "vllm::kBfloat16",
} }
VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = { VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
DataType.u8: "at::ScalarType::Byte", DataType.u8: "at::ScalarType::Byte",
DataType.s8: "at::ScalarType::Char", DataType.s8: "at::ScalarType::Char",
DataType.e4m3: "at::ScalarType::Float8_e4m3fn", DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
@ -66,7 +67,9 @@ VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
DataType.f32: "at::ScalarType::Float", DataType.f32: "at::ScalarType::Float",
} }
VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = { VLLMKernelScheduleTag: dict[
Union[MixedInputKernelScheduleType, KernelScheduleType], str
] = {
**KernelScheduleTag, # type: ignore **KernelScheduleTag, # type: ignore
**{ **{
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501 MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501

View File

@ -2,7 +2,6 @@
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "cub_helpers.h" #include "cub_helpers.h"
#include "core/batch_invariant.hpp" #include "core/batch_invariant.hpp"
#include "quantization/vectorization_utils.cuh"
#include <torch/cuda.h> #include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
@ -19,22 +18,11 @@ __global__ void rms_norm_kernel(
const float epsilon, const int num_tokens, const int hidden_size) { const float epsilon, const int num_tokens, const int hidden_size) {
__shared__ float s_variance; __shared__ float s_variance;
float variance = 0.0f; float variance = 0.0f;
const scalar_t* input_row = input + blockIdx.x * input_stride;
constexpr int VEC_SIZE = 8; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) { const float x = (float)input[blockIdx.x * input_stride + idx];
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
float x = static_cast<float>(vec.val[i]);
variance += x * x;
}
};
auto scalar_op = [&variance](const scalar_t& val) {
float x = static_cast<float>(val);
variance += x * x; variance += x * x;
}; }
vllm::vectorize_read_with_alignment<VEC_SIZE>(
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
using BlockReduce = cub::BlockReduce<float, 1024>; using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore; __shared__ typename BlockReduce::TempStorage reduceStore;

View File

@ -10,7 +10,6 @@
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "cub_helpers.h" #include "cub_helpers.h"
#include "core/batch_invariant.hpp" #include "core/batch_invariant.hpp"
#include "quantization/vectorization_utils.cuh"
#include <torch/cuda.h> #include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
@ -29,22 +28,10 @@ __global__ void rms_norm_static_fp8_quant_kernel(
__shared__ float s_variance; __shared__ float s_variance;
float variance = 0.0f; float variance = 0.0f;
const scalar_t* input_row = input + blockIdx.x * input_stride; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
const float x = (float)input[blockIdx.x * input_stride + idx];
constexpr int VEC_SIZE = 8;
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
float x = static_cast<float>(vec.val[i]);
variance += x * x;
}
};
auto scalar_op = [&variance](const scalar_t& val) {
float x = static_cast<float>(val);
variance += x * x; variance += x * x;
}; }
vllm::vectorize_read_with_alignment<VEC_SIZE>(
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
using BlockReduce = cub::BlockReduce<float, 1024>; using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore; __shared__ typename BlockReduce::TempStorage reduceStore;

View File

@ -21,6 +21,7 @@
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include "../cuda_compat.h" #include "../cuda_compat.h"
#include "../cub_helpers.h" #include "../cub_helpers.h"
#include "../core/batch_invariant.hpp"
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -405,7 +406,8 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>; using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
static constexpr int VPT = Constants::VPT; static constexpr int VPT = Constants::VPT;
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP; static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP; const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB; const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB); dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);

View File

@ -9,6 +9,7 @@ from collections.abc import Iterable
from copy import deepcopy from copy import deepcopy
from dataclasses import dataclass, fields from dataclasses import dataclass, fields
from functools import reduce from functools import reduce
from typing import Optional, Union
import jinja2 import jinja2
from vllm_cutlass_library_extension import ( from vllm_cutlass_library_extension import (
@ -258,7 +259,7 @@ class ScheduleConfig:
@dataclass(frozen=True) @dataclass(frozen=True)
class TypeConfig: class TypeConfig:
a: DataType a: DataType
b: DataType | VLLMDataType b: Union[DataType, VLLMDataType]
b_group_scale: DataType b_group_scale: DataType
b_group_zeropoint: DataType b_group_zeropoint: DataType
b_channel_scale: DataType b_channel_scale: DataType
@ -279,7 +280,7 @@ class PrepackTypeConfig:
class ImplConfig: class ImplConfig:
types: TypeConfig types: TypeConfig
schedules: list[ScheduleConfig] schedules: list[ScheduleConfig]
heuristic: list[tuple[str | None, ScheduleConfig]] heuristic: list[tuple[Optional[str], ScheduleConfig]]
def generate_sch_sig(schedule_config: ScheduleConfig) -> str: def generate_sch_sig(schedule_config: ScheduleConfig) -> str:

View File

@ -22,14 +22,13 @@ template <typename AllReduceKernel, typename T>
__global__ __quickreduce_launch_bounds_two_shot__ static void __global__ __quickreduce_launch_bounds_two_shot__ static void
allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks, allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
int rank, uint8_t** dbuffer_list, int rank, uint8_t** dbuffer_list,
uint32_t data_offset, uint32_t flag_color, uint32_t data_offset, uint32_t flag_color) {
int64_t data_size_per_phase) {
int block = blockIdx.x; int block = blockIdx.x;
int grid = gridDim.x; int grid = gridDim.x;
while (block < num_blocks) { while (block < num_blocks) {
AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset, AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset,
flag_color, data_size_per_phase); flag_color);
block += grid; block += grid;
flag_color++; flag_color++;
} }
@ -42,21 +41,21 @@ allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \ hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \ dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \ num_blocks, rank, dbuffer_list, data_offset, \
flag_color, this->kMaxProblemSize); \ flag_color); \
} else if (world_size == 4) { \ } else if (world_size == 4) { \
using LineCodec = __codec<T, 4>; \ using LineCodec = __codec<T, 4>; \
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \ using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \ hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \ dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \ num_blocks, rank, dbuffer_list, data_offset, \
flag_color, this->kMaxProblemSize); \ flag_color); \
} else if (world_size == 8) { \ } else if (world_size == 8) { \
using LineCodec = __codec<T, 8>; \ using LineCodec = __codec<T, 8>; \
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \ using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \ hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \ dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \ num_blocks, rank, dbuffer_list, data_offset, \
flag_color, this->kMaxProblemSize); \ flag_color); \
} }
enum QuickReduceQuantLevel { enum QuickReduceQuantLevel {

View File

@ -553,12 +553,13 @@ struct AllReduceTwoshot {
int const rank, // rank index int const rank, // rank index
uint8_t** __restrict__ buffer_list, // communication buffers uint8_t** __restrict__ buffer_list, // communication buffers
uint32_t const data_offset, // offset to start of the data buffer uint32_t const data_offset, // offset to start of the data buffer
uint32_t flag_color, int64_t data_size_per_phase) { uint32_t flag_color) {
// Topology // Topology
int thread = threadIdx.x + threadIdx.y * kWavefront; int thread = threadIdx.x + threadIdx.y * kWavefront;
uint8_t* rank_buffer = buffer_list[rank]; uint8_t* rank_buffer = buffer_list[rank];
Codec codec(thread, rank); Codec codec(thread, rank);
int block_id = blockIdx.x; int block_id = blockIdx.x;
int grid_size = gridDim.x;
// -------------------------------------------------------- // --------------------------------------------------------
// Read input into registers // Read input into registers
int32x4_t tA[kAtoms]; int32x4_t tA[kAtoms];
@ -587,10 +588,12 @@ struct AllReduceTwoshot {
// rank responsible for this segment. // rank responsible for this segment.
uint32_t comm_data0_offset = uint32_t comm_data0_offset =
data_offset + block_id * Codec::kTransmittedTileSize; data_offset + block_id * Codec::kTransmittedTileSize;
uint32_t comm_data1_offset = data_size_per_phase + comm_data0_offset; uint32_t comm_data1_offset =
grid_size * Codec::kTransmittedTileSize + comm_data0_offset;
uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t)); uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t));
uint32_t comm_flags1_offset = (data_offset / 2) + comm_flags0_offset; uint32_t comm_flags1_offset =
grid_size * (kWorldSize * sizeof(uint32_t)) + comm_flags0_offset;
for (int r = 0; r < kWorldSize; r++) { for (int r = 0; r < kWorldSize; r++) {
int32x4_t* send_buffer = int32x4_t* send_buffer =

View File

@ -229,7 +229,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Check the size of the wheel if RUN_WHEEL_CHECK is true # Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=500 ARG VLLM_MAX_SIZE_MB=450
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \

View File

@ -11,7 +11,8 @@ The following code splits the model across 2 GPUs.
```python ```python
from vllm import LLM from vllm import LLM
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
tensor_parallel_size=2)
``` ```
!!! warning !!! warning
@ -42,7 +43,9 @@ and the maximum batch size (`max_num_seqs` option).
```python ```python
from vllm import LLM from vllm import LLM
llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2) llm = LLM(model="adept/fuyu-8b",
max_model_len=2048,
max_num_seqs=2)
``` ```
## Reduce CUDA Graphs ## Reduce CUDA Graphs
@ -58,12 +61,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc
```python ```python
from vllm import LLM from vllm import LLM
from vllm.config import CompilationConfig, CompilationMode from vllm.config import CompilationConfig, CompilationLevel
llm = LLM( llm = LLM(
model="meta-llama/Llama-3.1-8B-Instruct", model="meta-llama/Llama-3.1-8B-Instruct",
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE, level=CompilationLevel.PIECEWISE,
# By default, it goes up to max_num_seqs # By default, it goes up to max_num_seqs
cudagraph_capture_sizes=[1, 2, 4, 8, 16], cudagraph_capture_sizes=[1, 2, 4, 8, 16],
), ),
@ -75,7 +78,8 @@ You can disable graph capturing completely via the `enforce_eager` flag:
```python ```python
from vllm import LLM from vllm import LLM
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True) llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
enforce_eager=True)
``` ```
## Adjust cache size ## Adjust cache size
@ -93,10 +97,8 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem
from vllm import LLM from vllm import LLM
# Accept up to 3 images and 1 video per prompt # Accept up to 3 images and 1 video per prompt
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", limit_mm_per_prompt={"image": 3, "video": 1})
limit_mm_per_prompt={"image": 3, "video": 1},
)
``` ```
You can go a step further and disable unused modalities completely by setting its limit to zero. You can go a step further and disable unused modalities completely by setting its limit to zero.
@ -106,10 +108,8 @@ For example, if your application only accepts image input, there is no need to a
from vllm import LLM from vllm import LLM
# Accept any number of images but no videos # Accept any number of images but no videos
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", limit_mm_per_prompt={"video": 0})
limit_mm_per_prompt={"video": 0},
)
``` ```
You can even run a multi-modal model for text-only inference: You can even run a multi-modal model for text-only inference:
@ -118,10 +118,8 @@ You can even run a multi-modal model for text-only inference:
from vllm import LLM from vllm import LLM
# Don't accept images. Just text. # Don't accept images. Just text.
llm = LLM( llm = LLM(model="google/gemma-3-27b-it",
model="google/gemma-3-27b-it", limit_mm_per_prompt={"image": 0})
limit_mm_per_prompt={"image": 0},
)
``` ```
### Configurable options ### Configurable options
@ -175,14 +173,14 @@ Here are some examples:
from vllm import LLM from vllm import LLM
# Available for Qwen2-VL series models # Available for Qwen2-VL series models
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", mm_processor_kwargs={
mm_processor_kwargs={"max_pixels": 768 * 768}, # Default is 1280 * 28 * 28 "max_pixels": 768 * 768, # Default is 1280 * 28 * 28
) })
# Available for InternVL series models # Available for InternVL series models
llm = LLM( llm = LLM(model="OpenGVLab/InternVL2-2B",
model="OpenGVLab/InternVL2-2B", mm_processor_kwargs={
mm_processor_kwargs={"max_dynamic_patch": 4}, # Default is 12 "max_dynamic_patch": 4, # Default is 12
) })
``` ```

View File

@ -100,7 +100,7 @@ from vllm import LLM
llm = LLM( llm = LLM(
model="meta-llama/Llama-3.3-70B-Instruct, model="meta-llama/Llama-3.3-70B-Instruct,
tensor_parallel_size=4, tensor_parallel_size=4,
pipeline_parallel_size=2, pipeline_parallel_size=2
) )
``` ```
@ -257,24 +257,18 @@ Examples:
```python ```python
# Use a larger cache # Use a larger cache
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", mm_processor_cache_gb=8)
mm_processor_cache_gb=8,
)
# Use a shared-memory based IPC cache # Use a shared-memory based IPC cache
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", tensor_parallel_size=2,
tensor_parallel_size=2, mm_processor_cache_type="shm",
mm_processor_cache_type="shm", mm_processor_cache_gb=8)
mm_processor_cache_gb=8,
)
# Disable the cache # Disable the cache
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", mm_processor_cache_gb=0)
mm_processor_cache_gb=0,
)
``` ```
### Cache Placement ### Cache Placement

View File

@ -35,7 +35,6 @@ th {
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` | | Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
| Random | ✅ | ✅ | `synthetic` | | Random | ✅ | ✅ | `synthetic` |
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` | | RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
| RandomForReranking | ✅ | ✅ | `synthetic` |
| Prefix Repetition | ✅ | ✅ | `synthetic` | | Prefix Repetition | ✅ | ✅ | `synthetic` |
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` | | HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
| HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` | | HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` |
@ -879,51 +878,6 @@ vllm bench serve \
</details> </details>
#### Reranker Benchmark
Benchmark the performance of rerank requests in vLLM.
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
Unlike generative models, which use the Completions API or Chat Completions API,
reranker benchmarks require setting `--backend vllm-rerank` and `--endpoint /v1/rerank` to use the Reranker API.
For reranking, the only supported dataset is `--dataset-name random-rerank`.
Start the server:
```bash
vllm serve BAAI/bge-reranker-v2-m3
```
Run the benchmark:
```bash
vllm bench serve \
--model BAAI/bge-reranker-v2-m3 \
--backend vllm-rerank \
--endpoint /v1/rerank \
--dataset-name random-rerank \
--tokenizer BAAI/bge-reranker-v2-m3 \
--random-input-len 512 \
--num-prompts 10 \
--random-batch-size 5
```
For reranker models, this will create `num_prompts / random_batch_size` requests with
`random_batch_size` "documents" where each one has close to `random_input_len` tokens.
In the example above, this results in 2 rerank requests with 5 "documents" each where
each document has close to 512 tokens.
Please note that `/v1/rerank` is also supported by embedding models, so if you're running
with an embedding model, also set `--no_reranker`. In that case the query is treated as an
individual prompt by the server, so `random_batch_size - 1` documents are sent to account
for the extra prompt that carries the query, and the token accounting used to report
throughput numbers is adjusted accordingly.
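For reference, each request this benchmark issues is an ordinary Reranker API call. A hand-written sketch of a single request is shown below; the request field names and the default `localhost:8000` address are assumptions for illustration, not taken from this page:

```bash
# Sketch of one rerank request: a single query scored against a batch of documents.
# Field names and port are assumptions; adjust to your deployment.
curl -X POST http://localhost:8000/v1/rerank \
  -H 'Content-Type: application/json' \
  -d '{
        "model": "BAAI/bge-reranker-v2-m3",
        "query": "What is the capital of France?",
        "documents": [
          "Paris is the capital of France.",
          "Berlin is the capital of Germany."
        ]
      }'
```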
</details>
[](){ #performance-benchmarks } [](){ #performance-benchmarks }
## Performance Benchmarks ## Performance Benchmarks

View File

@ -73,8 +73,8 @@ def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: Optional[torch.Tensor] = None,
) -> torch.Tensor: ) -> torch.Tensor:
... ...
``` ```

View File

@ -16,7 +16,7 @@ Further update the model as follows:
... ...
@classmethod @classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None: def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"): if modality.startswith("image"):
return "<image>" return "<image>"
@ -45,14 +45,14 @@ Further update the model as follows:
... ...
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
assert self.vision_encoder is not None assert self.vision_encoder is not None
image_features = self.vision_encoder(image_input) image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features) return self.multi_modal_projector(image_features)
def get_multimodal_embeddings( def get_multimodal_embeddings(
self, self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
**kwargs: object,
) -> MultiModalEmbeddings | None:
# Validate the multimodal input keyword arguments # Validate the multimodal input keyword arguments
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None: if image_input is None:
@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m
For example, if the model supports any number of images but only one video per prompt: For example, if the model supports any number of images but only one video per prompt:
```python ```python
def get_supported_mm_limits(self) -> Mapping[str, int | None]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None, "video": 1} return {"image": None, "video": 1}
``` ```
@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
@ -421,10 +421,8 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
```python ```python
def get_image_size_with_most_features(self) -> ImageSize: def get_image_size_with_most_features(self) -> ImageSize:
image_processor = self.get_image_processor() image_processor = self.get_image_processor()
return ImageSize( return ImageSize(width=image_processor.size["width"],
width=image_processor.size["width"], height=image_processor.size["height"])
height=image_processor.size["height"],
)
``` ```
Fuyu does not expect image placeholders in the inputs to HF processor, so Fuyu does not expect image placeholders in the inputs to HF processor, so
@ -454,12 +452,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
return { return {
"image": "image":
self._get_dummy_images( self._get_dummy_images(width=target_width,
width=target_width, height=target_height,
height=target_height, num_images=num_images,
num_images=num_images, overrides=image_overrides)
overrides=image_overrides,
)
} }
``` ```
@ -748,7 +744,8 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
image_width=image_size.width, image_width=image_size.width,
image_height=image_size.height, image_height=image_size.height,
) )
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id( return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id], image_tokens + [bos_token_id],
@ -784,7 +781,8 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
image_width=image_size.width, image_width=image_size.width,
image_height=image_size.height, image_height=image_size.height,
) )
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id( return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id], image_tokens + [bos_token_id],
@ -812,11 +810,9 @@ to register them to the multi-modal registry:
from vllm.model_executor.models.interfaces import SupportsMultiModal from vllm.model_executor.models.interfaces import SupportsMultiModal
+ from vllm.multimodal import MULTIMODAL_REGISTRY + from vllm.multimodal import MULTIMODAL_REGISTRY
+ @MULTIMODAL_REGISTRY.register_processor( + @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
+ YourMultiModalProcessor, + info=YourProcessingInfo,
+ info=YourProcessingInfo, + dummy_inputs=YourDummyInputsBuilder)
+ dummy_inputs=YourDummyInputsBuilder,
+ )
class YourModelForImage2Seq(nn.Module, SupportsMultiModal): class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
``` ```

View File

@ -42,7 +42,7 @@ def register():
ModelRegistry.register_model( ModelRegistry.register_model(
"YourModelForCausalLM", "YourModelForCausalLM",
"your_code:YourModelForCausalLM", "your_code:YourModelForCausalLM"
) )
``` ```

View File

@ -15,9 +15,8 @@ Declare supported languages and capabilities:
- Set `supports_transcription_only=True` if the model should not serve text generation (e.g. Whisper). - Set `supports_transcription_only=True` if the model should not serve text generation (e.g. Whisper).
??? code "supported_languages and supports_transcription_only" ??? code "supported_languages and supports_transcription_only"
```python ```python
from typing import ClassVar, Mapping, Literal from typing import ClassVar, Mapping, Optional, Literal
import numpy as np import numpy as np
import torch import torch
from torch import nn from torch import nn
@ -44,7 +43,6 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor
This is for controlling general behavior of the API when serving your model: This is for controlling general behavior of the API when serving your model:
??? code "get_speech_to_text_config()" ??? code "get_speech_to_text_config()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
@ -73,7 +71,6 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo
Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`: Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:
??? code "get_generation_prompt()" ??? code "get_generation_prompt()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
@ -84,10 +81,10 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
audio: np.ndarray, audio: np.ndarray,
stt_config: SpeechToTextConfig, stt_config: SpeechToTextConfig,
model_config: ModelConfig, model_config: ModelConfig,
language: str | None, language: Optional[str],
task_type: Literal["transcribe", "translate"], task_type: Literal["transcribe", "translate"],
request_prompt: str, request_prompt: str,
to_language: str | None, to_language: Optional[str],
) -> PromptType: ) -> PromptType:
# Example with a free-form instruction prompt # Example with a free-form instruction prompt
task_word = "Transcribe" if task_type == "transcribe" else "Translate" task_word = "Transcribe" if task_type == "transcribe" else "Translate"
@ -110,7 +107,6 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
??? code "get_generation_prompt()" ??? code "get_generation_prompt()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
@ -121,10 +117,10 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
audio: np.ndarray, audio: np.ndarray,
stt_config: SpeechToTextConfig, stt_config: SpeechToTextConfig,
model_config: ModelConfig, model_config: ModelConfig,
language: str | None, language: Optional[str],
task_type: Literal["transcribe", "translate"], task_type: Literal["transcribe", "translate"],
request_prompt: str, request_prompt: str,
to_language: str | None, to_language: Optional[str],
) -> PromptType: ) -> PromptType:
if language is None: if language is None:
raise ValueError("Language must be specified") raise ValueError("Language must be specified")
@ -152,16 +148,12 @@ Language validation via [validate_language][vllm.model_executor.models.interface
If your model requires a language and you want a default, override this method (see Whisper): If your model requires a language and you want a default, override this method (see Whisper):
??? code "validate_language()" ??? code "validate_language()"
```python ```python
@classmethod @classmethod
def validate_language(cls, language: str | None) -> str | None: def validate_language(cls, language: Optional[str]) -> Optional[str]:
if language is None: if language is None:
logger.warning( logger.warning(
"Defaulting to language='en'. If you wish to transcribe " "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
"audio in a different language, pass the `language` field "
"in the TranscriptionRequest."
)
language = "en" language = "en"
return super().validate_language(language) return super().validate_language(language)
``` ```
@ -173,7 +165,6 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo
Provide a fast duration→token estimate to improve streaming usage statistics: Provide a fast duration→token estimate to improve streaming usage statistics:
??? code "get_num_audio_tokens()" ??? code "get_num_audio_tokens()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
@ -184,7 +175,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
audio_duration_s: float, audio_duration_s: float,
stt_config: SpeechToTextConfig, stt_config: SpeechToTextConfig,
model_config: ModelConfig, model_config: ModelConfig,
) -> int | None: ) -> Optional[int]:
# Return None if unknown; otherwise return an estimate. # Return None if unknown; otherwise return an estimate.
return int(audio_duration_s * stt_config.sample_rate // 320) # example return int(audio_duration_s * stt_config.sample_rate // 320) # example
``` ```
@ -200,7 +191,6 @@ The API server takes care of basic audio I/O and optional chunking before buildi
Relevant server logic: Relevant server logic:
??? code "_preprocess_speech_to_text()" ??? code "_preprocess_speech_to_text()"
```python ```python
# vllm/entrypoints/openai/speech_to_text.py # vllm/entrypoints/openai/speech_to_text.py
async def _preprocess_speech_to_text(...): async def _preprocess_speech_to_text(...):

View File

@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference
??? console "Command" ??? console "Command"
```bash ```python
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
-H 'Content-Type: application/json' \ -H 'Content-Type: application/json' \
-H 'Authorization: <JWT TOKEN>' \ -H 'Authorization: <JWT TOKEN>' \
@ -81,7 +81,7 @@ You should get a response like:
??? console "Response" ??? console "Response"
```json ```python
{ {
"run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
"result": { "result": {

View File

@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
client = OpenAI( client = OpenAI(
base_url="https://gateway.<gateway domain>", base_url="https://gateway.<gateway domain>",
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>", api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
) )
completion = client.chat.completions.create( completion = client.chat.completions.create(
@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
"role": "user", "role": "user",
"content": "Compose a poem that explains the concept of recursion in programming.", "content": "Compose a poem that explains the concept of recursion in programming.",
} }
], ]
) )
print(completion.choices[0].message.content) print(completion.choices[0].message.content)

View File

@ -34,7 +34,7 @@ pip install vllm haystack-ai
api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"), api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
model="mistralai/Mistral-7B-Instruct-v0.1", model="mistralai/Mistral-7B-Instruct-v0.1",
api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1", api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
generation_kwargs={"max_tokens": 512}, generation_kwargs = {"max_tokens": 512}
) )
response = generator.run( response = generator.run(

View File

@ -32,28 +32,28 @@ This is the easiest way to get started with vLLM on Hugging Face Inference Endpo
import os import os
client = OpenAI( client = OpenAI(
base_url=DEPLOYMENT_URL, base_url = DEPLOYMENT_URL,
api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
) )
chat_completion = client.chat.completions.create( chat_completion = client.chat.completions.create(
model="HuggingFaceTB/SmolLM3-3B", model = "HuggingFaceTB/SmolLM3-3B",
messages=[ messages = [
{ {
"role": "user", "role": "user",
"content": [ "content": [
{ {
"type": "text", "type": "text",
"text": "Give me a brief explanation of gravity in simple terms.", "text": "Give me a brief explanation of gravity in simple terms."
} }
], ]
} }
], ],
stream=True, stream = True
) )
for message in chat_completion: for message in chat_completion:
print(message.choices[0].delta.content, end="") print(message.choices[0].delta.content, end = "")
``` ```
!!! note !!! note
@ -86,34 +86,34 @@ This method applies to models with the [`transformers` library tag](https://hugg
import os import os
client = OpenAI( client = OpenAI(
base_url=DEPLOYMENT_URL, base_url = DEPLOYMENT_URL,
api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
) )
chat_completion = client.chat.completions.create( chat_completion = client.chat.completions.create(
model="ibm-granite/granite-docling-258M", model = "ibm-granite/granite-docling-258M",
messages=[ messages = [
{ {
"role": "user", "role": "user",
"content": [ "content": [
{ {
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {
"url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png", "url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png"
}, }
}, },
{ {
"type": "text", "type": "text",
"text": "Convert this page to docling.", "text": "Convert this page to docling."
}, }
] ]
} }
], ],
stream=True, stream = True
) )
for message in chat_completion: for message in chat_completion:
print(message.choices[0].delta.content, end="") print(message.choices[0].delta.content, end = "")
``` ```
!!! note !!! note

View File

@ -36,16 +36,15 @@ pip install vllm litellm
```python ```python
import litellm import litellm
messages = [{"content": "Hello, how are you?", "role": "user"}] messages = [{ "content": "Hello, how are you?","role": "user"}]
# hosted_vllm is prefix key word and necessary # hosted_vllm is prefix key word and necessary
response = litellm.completion( response = litellm.completion(
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
messages=messages, messages=messages,
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1", api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
temperature=0.2, temperature=0.2,
max_tokens=80, max_tokens=80)
)
print(response) print(response)
``` ```

View File

@ -40,7 +40,7 @@ pip install -U vllm \
1. Run the script 1. Run the script
```bash ```python
python retrieval_augmented_generation_with_langchain.py python retrieval_augmented_generation_with_langchain.py
``` ```
@ -78,6 +78,6 @@ pip install vllm \
1. Run the script: 1. Run the script:
```bash ```python
python retrieval_augmented_generation_with_llamaindex.py python retrieval_augmented_generation_with_llamaindex.py
``` ```

View File

@ -106,11 +106,9 @@ The dispatch code looks like:
batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...) batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...)
runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor) runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor)
# execution # execution
with set_forward_context( with set_forward_context(...,
..., cudagraph_runtime_mode=runtime_mode,
cudagraph_runtime_mode=runtime_mode, batch_descriptor=batch_descriptor):
batch_descriptor=batch_descriptor,
):
output = self.model(...) output = self.model(...)
``` ```
@ -167,7 +165,7 @@ class AttentionCGSupport(enum.Enum):
"""NO CUDA Graphs support""" """NO CUDA Graphs support"""
``` ```
Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture]. Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
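As a rough illustration of this fallback, the sketch below mimics the downgrade policy in plain Python. It is hand-written for this explanation: the capability ordering and any names other than `UNIFORM_BATCH`/`NEVER` are assumptions, and the authoritative logic lives in `initialize_cudagraph_capture`.

```python
# Hand-written sketch of the mode-downgrade policy described above; not the real
# implementation. Capability names other than UNIFORM_BATCH/NEVER are assumed.
def resolve_cudagraph_mode(requested: str, backend_capabilities: list[str]) -> str:
    order = ["NEVER", "UNIFORM_BATCH", "ALWAYS"]  # assumed weakest-to-strongest
    weakest = min(backend_capabilities, key=order.index)
    if requested == "FULL" and weakest == "UNIFORM_BATCH":
        return "FULL_AND_PIECEWISE"
    if requested == "FULL" and weakest == "NEVER":
        return "PIECEWISE"
    return requested


# A hybrid model whose weakest backend only supports uniform batches:
print(resolve_cudagraph_mode("FULL", ["ALWAYS", "UNIFORM_BATCH"]))  # FULL_AND_PIECEWISE
```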
The following table lists backends that support full CUDA Graphs at the time of writing. The following table lists backends that support full CUDA Graphs at the time of writing.
@ -202,12 +200,12 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG")
import vllm import vllm
from vllm.config import CUDAGraphMode from vllm.config import CUDAGraphMode
compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
model = vllm.LLM( model = vllm.LLM(
model="meta-llama/Llama-3.1-8B-Instruct", model="meta-llama/Llama-3.1-8B-Instruct",
dtype="auto", dtype='auto',
compilation_config=compilation_config, compilation_config = compilation_config,
) )
sampling_params = vllm.SamplingParams( sampling_params = vllm.SamplingParams(
temperature=0, # greedy decoding temperature=0, # greedy decoding
max_tokens=1024, max_tokens=1024,

View File

@ -34,10 +34,10 @@ To enable the DBO system pass in the `--enable-dbo` argument to your vllm serve
* `--dbo-decode-token-threshold` the minimum number of tokens in a decode-only batch required to enable DBO for that batch * `--dbo-decode-token-threshold` the minimum number of tokens in a decode-only batch required to enable DBO for that batch
* `--dbo-prefill-token-threshold` the minimum number of tokens in a batch containing at least one prefill required to enable DBO for that batch * `--dbo-prefill-token-threshold` the minimum number of tokens in a batch containing at least one prefill required to enable DBO for that batch
Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `--all2all-backend` argument must be set to `deepep_low_latency` if your workload is primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests. Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `VLLM_ALL2ALL_BACKEND` environment variable must be set to `deepep_low_latency` if your workload is primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests.
Below is a command that will spin up a two DP rank server with expert parallelism and DBO enabled. Below is a command that will spin up a two DP rank server with expert parallelism and DBO enabled.
EX: `vllm serve deepseek-ai/DeepSeek-V2-Lite --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo --all2all-backend deepep_low_latency` EX: `VLLM_ALL2ALL_BACKEND=deepep_low_latency vllm serve --model="deepseek-ai/DeepSeek-V2-Lite" --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo`
Note that there must be at least two GPUs visible in `CUDA_VISIBLE_DEVICES` Note that there must be at least two GPUs visible in `CUDA_VISIBLE_DEVICES`
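For a prefill-heavy workload, the same flags apply with the high-throughput backend; a sketch (the threshold value below is an arbitrary example, not a recommendation):

```bash
# Sketch: two DP ranks with expert parallelism and DBO tuned for prefill-heavy traffic.
vllm serve deepseek-ai/DeepSeek-V2-Lite \
  --trust-remote-code \
  --data-parallel-size 2 \
  --enable-expert-parallel \
  --enable-dbo \
  --dbo-prefill-token-threshold 512 \
  --all2all-backend deepep_high_throughput
```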

View File

@ -9,8 +9,8 @@ When performing an inference with IO Processor plugins, the prompt type is defin
IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>): IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>):
```python ```python
IOProcessorInput = TypeVar("IOProcessorInput") IOProcessorInput = TypeVar('IOProcessorInput')
IOProcessorOutput = TypeVar("IOProcessorOutput") IOProcessorOutput = TypeVar('IOProcessorOutput')
class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
@ -21,32 +21,30 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
def pre_process( def pre_process(
self, self,
prompt: IOProcessorInput, prompt: IOProcessorInput,
request_id: str | None = None, request_id: Optional[str] = None,
**kwargs, **kwargs,
) -> PromptType | Sequence[PromptType]: ) -> Union[PromptType, Sequence[PromptType]]:
raise NotImplementedError raise NotImplementedError
async def pre_process_async( async def pre_process_async(
self, self,
prompt: IOProcessorInput, prompt: IOProcessorInput,
request_id: str | None = None, request_id: Optional[str] = None,
**kwargs, **kwargs,
) -> PromptType | Sequence[PromptType]: ) -> Union[PromptType, Sequence[PromptType]]:
return self.pre_process(prompt, request_id, **kwargs) return self.pre_process(prompt, request_id, **kwargs)
@abstractmethod @abstractmethod
def post_process( def post_process(self,
self, model_output: Sequence[PoolingRequestOutput],
model_output: Sequence[PoolingRequestOutput], request_id: Optional[str] = None,
request_id: str | None = None, **kwargs) -> IOProcessorOutput:
**kwargs,
) -> IOProcessorOutput:
raise NotImplementedError raise NotImplementedError
async def post_process_async( async def post_process_async(
self, self,
model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]], model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
request_id: str | None = None, request_id: Optional[str] = None,
**kwargs, **kwargs,
) -> IOProcessorOutput: ) -> IOProcessorOutput:
collected_output = [item async for i, item in model_output] collected_output = [item async for i, item in model_output]
@ -58,8 +56,7 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
@abstractmethod @abstractmethod
def output_to_response( def output_to_response(
self, plugin_output: IOProcessorOutput self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
) -> IOProcessorResponse:
raise NotImplementedError raise NotImplementedError
``` ```
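To make the interface concrete, here is a hand-written sketch of a minimal plugin. Only the three interface methods come from the code above; the import paths, the `PoolingRequestOutput` attribute layout, and the request/response shapes are assumptions for illustration.

```python
# Hand-written sketch of a minimal IO Processor plugin; not a real vLLM plugin.
from collections.abc import Sequence

from vllm.outputs import PoolingRequestOutput  # import path assumed
from vllm.plugins.io_processors.interface import IOProcessor  # path from the doc above


class PromptEchoProcessor(IOProcessor[dict, list[list[float]]]):
    """Accepts {"texts": [...]} and returns the raw pooled vectors."""

    def pre_process(self, prompt: dict, request_id: str | None = None, **kwargs):
        # Fan the plugin-specific input out into one engine prompt per text.
        return [{"prompt": text} for text in prompt["texts"]]

    def post_process(
        self,
        model_output: Sequence[PoolingRequestOutput],
        request_id: str | None = None,
        **kwargs,
    ) -> list[list[float]]:
        # Collect one pooled vector per prompt (attribute layout assumed).
        return [out.outputs.data.tolist() for out in model_output]

    def output_to_response(self, plugin_output: list[list[float]]):
        # Wrapping into an IOProcessorResponse is elided in this sketch.
        raise NotImplementedError
```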

View File

@ -174,7 +174,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
from collections.abc import Sequence
from dataclasses import dataclass
from enum import Enum, auto
from typing import TYPE_CHECKING

import torch
@ -244,7 +244,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
    @abstractmethod
    def update_state(
        self,
        batch_update: "BatchUpdate" | None,
    ) -> None:
        """Called when there are new output tokens, prior
        to each forward pass.
@ -274,7 +274,7 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum)
    * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
    * `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling
* `update_state(self, batch_update: "BatchUpdate" | None) -> None`:
    * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
    * Use the `BatchUpdate` members to update logits processor internal state
    * **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.

View File

@ -478,17 +478,15 @@ us with:
```python
if seq_group.is_finished():
    if (
        seq_group.metrics.first_scheduled_time is not None
        and seq_group.metrics.first_token_time is not None
    ):
        time_queue_requests.append(
            seq_group.metrics.first_scheduled_time -
            seq_group.metrics.arrival_time
        )
    ...
    if seq_group.metrics.time_in_queue is not None:
        time_in_queue_requests.append(seq_group.metrics.time_in_queue)
```

This seems duplicative, and one of them should be removed. The latter

View File

@ -112,8 +112,8 @@ class KVCacheBlock:
    ref_cnt: int

    # The pointers to form a doubly linked list for the free queue.
    prev_free_block: "KVCacheBlock | None" = None
    next_free_block: "KVCacheBlock | None" = None
```

There are two design points to highlight:

View File

@ -93,6 +93,7 @@ The contrived example below implements a custom logits processor which consumes
??? code "Example custom logits processor definition"

    ``` python
    import torch
    from vllm.config import VllmConfig
    from vllm.sampling_params import SamplingParams
@ -111,7 +112,7 @@ The contrived example below implements a custom logits processor which consumes
"""Never impacts greedy sampling""" """Never impacts greedy sampling"""
return False return False
def update_state(self, batch_update: BatchUpdate | None): def update_state(self, batch_update: Optional[BatchUpdate]):
if not batch_update: if not batch_update:
return return

View File

@ -32,7 +32,7 @@ the third parameter is the path to the LoRA adapter.
sampling_params = SamplingParams(
    temperature=0,
    max_tokens=256,
    stop=["[/assistant]"],
)

prompts = [
@ -43,7 +43,7 @@ the third parameter is the path to the LoRA adapter.
outputs = llm.generate(
    prompts,
    sampling_params,
    lora_request=LoRARequest("sql_adapter", 1, sql_lora_path),
)
```
@ -197,7 +197,7 @@ Alternatively, follow these example steps to implement your own plugin:
        lora_request = LoRARequest(
            lora_name=lora_name,
            lora_path=local_path,
            lora_int_id=abs(hash(lora_name)),
        )
        return lora_request
    ```
@ -296,7 +296,10 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
    if has_audio:
        question = f"<|audio|>{question}"
    chat = [
        {"role": "user", "content": question},
    ]
    return tokenizer.apply_chat_template(chat, tokenize=False)

View File

@ -154,7 +154,9 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": [image1, image2]},
})

for o in outputs:
@ -181,24 +183,21 @@ conversation = [
{"role": "assistant", "content": "Hello! How can I assist you today?"}, {"role": "assistant", "content": "Hello! How can I assist you today?"},
{ {
"role": "user", "role": "user",
"content": [ "content": [{
{ "type": "image_url",
"type": "image_url", "image_url": {
"image_url": {"url": image_url}, "url": image_url
}, }
{ },{
"type": "image_pil", "type": "image_pil",
"image_pil": image_pil, "image_pil": image_pil
}, }, {
{ "type": "image_embeds",
"type": "image_embeds", "image_embeds": image_embeds
"image_embeds": image_embeds, }, {
}, "type": "text",
{ "text": "What's in these images?"
"type": "text", }],
"text": "What's in these images?",
},
],
}, },
] ]
@ -225,10 +224,7 @@ Multi-image input can be extended to perform video captioning. We show this with
message = {
    "role": "user",
    "content": [
        {
            "type": "text",
            "text": "Describe this set of frames. Consider the frames to be a part of the same video.",
        },
    ],
}

for i in range(len(video_frames)):
@ -259,13 +255,13 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f
# Custom black background for dark theme
llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}},
)

# Custom brand color background (e.g., blue)
llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}},
)
```
@ -298,23 +294,20 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown
limit_mm_per_prompt={"video": 1}, limit_mm_per_prompt={"video": 1},
) )
sampling_params = SamplingParams(max_tokens=1024) sampling_params = SamplingParams(
max_tokens=1024,
)
video_messages = [ video_messages = [
{ {"role": "system", "content": "You are a helpful assistant."},
"role": "system", {"role": "user", "content": [
"content": "You are a helpful assistant.",
},
{
"role": "user",
"content": [
{"type": "text", "text": "describe this video."}, {"type": "text", "text": "describe this video."},
{ {
"type": "video", "type": "video",
"video": video_path, "video": video_path,
"total_pixels": 20480 * 28 * 28, "total_pixels": 20480 * 28 * 28,
"min_pixels": 16 * 28 * 28, "min_pixels": 16 * 28 * 28
}, }
] ]
}, },
] ]
@ -472,24 +465,21 @@ Then, you can use the OpenAI client as follows:
chat_response = client.chat.completions.create(
    model="microsoft/Phi-3.5-vision-instruct",
    messages=[
        {
            "role": "user",
            "content": [
                # NOTE: The prompt formatting with the image token `<image>` is not needed
                # since the prompt will be processed automatically by the API server.
                {
                    "type": "text",
                    "text": "What's in this image?",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": image_url},
                    "uuid": image_url,  # Optional
                },
            ],
        }
    ],
)

print("Chat completion output:", chat_response.choices[0].message.content)
@ -499,27 +489,26 @@ Then, you can use the OpenAI client as follows:
chat_response = client.chat.completions.create(
    model="microsoft/Phi-3.5-vision-instruct",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What are the animals in these images?",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": image_url_duck},
                    "uuid": image_url_duck,  # Optional
                },
                {
                    "type": "image_url",
                    "image_url": {"url": image_url_lion},
                    "uuid": image_url_lion,  # Optional
                },
            ],
        }
    ],
)

print("Chat completion output:", chat_response.choices[0].message.content)
```
@ -571,22 +560,23 @@ Then, you can use the OpenAI client as follows:
## Use video url in the payload
chat_completion_from_url = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this video?",
                },
                {
                    "type": "video_url",
                    "video_url": {"url": video_url},
                    "uuid": video_url,  # Optional
                },
            ],
        }
    ],
    model=model,
    max_completion_tokens=64,
)
@ -662,25 +652,23 @@ Then, you can use the OpenAI client as follows:
audio_base64 = encode_base64_content_from_url(audio_url)
chat_completion_from_base64 = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?",
                },
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": audio_base64,
                        "format": "wav",
                    },
                    "uuid": audio_url,  # Optional
                },
            ],
        },
    ],
    model=model,
    max_completion_tokens=64,
)
@ -695,22 +683,22 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
```python
chat_completion_from_url = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?",
                },
                {
                    "type": "audio_url",
                    "audio_url": {"url": audio_url},
                    "uuid": audio_url,  # Optional
                },
            ],
        }
    ],
    model=model,
    max_completion_tokens=64,
)
@ -759,48 +747,43 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
# Basic usage - this is equivalent to the LLaVA example for offline inference
model = "llava-hf/llava-1.5-7b-hf"
embeds = {
    "type": "image_embeds",
    "image_embeds": f"{base64_image_embedding}",
    "uuid": image_url,  # Optional
}

# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
model = "Qwen/Qwen2-VL-2B-Instruct"
embeds = {
    "type": "image_embeds",
    "image_embeds": {
        "image_embeds": f"{base64_image_embedding}",  # Required
        "image_grid_thw": f"{base64_image_grid_thw}",  # Required by Qwen/Qwen2-VL-2B-Instruct
    },
    "uuid": image_url,  # Optional
}

model = "openbmb/MiniCPM-V-2_6"
embeds = {
    "type": "image_embeds",
    "image_embeds": {
        "image_embeds": f"{base64_image_embedding}",  # Required
        "image_sizes": f"{base64_image_sizes}",  # Required by openbmb/MiniCPM-V-2_6
    },
    "uuid": image_url,  # Optional
}

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant.",
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this image?",
                },
                embeds,
            ],
        },
    ],
    model=model,
)
```
@ -819,22 +802,22 @@ For Online Serving, you can also skip sending media if you expect cache hits wit
    {
        "type": "image_embeds",
        "image_embeds": None,
        "uuid": image_uuid,
    },
    # input_audio:
    {
        "type": "input_audio",
        "input_audio": None,
        "uuid": audio_uuid,
    },
    # PIL Image:
    {
        "type": "image_pil",
        "image_pil": None,
        "uuid": image_uuid,
    },
```

View File

@ -156,16 +156,6 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
NixlConnector currently does not distinguish `kv_role`; the actual prefiller/decoder roles are determined by the upper-level proxy (e.g., `toy_proxy_server.py` using `--prefiller-hosts` and `--decoder-hosts`).
Therefore, `kv_role` in `--kv-transfer-config` is effectively a placeholder and does not affect NixlConnector's behavior.
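
As a concrete illustration, the JSON payload passed to `--kv-transfer-config` for NixlConnector might look like the sketch below; the `kv_role` value shown is an assumption and, per the note above, is effectively ignored by this connector.

```python
# Illustrative payload for --kv-transfer-config (values are assumptions).
kv_transfer_config = {
    "kv_connector": "NixlConnector",
    "kv_role": "kv_both",  # placeholder; NixlConnector does not act on it
}
```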
## Experimental Feature
### Heterogeneous KV Layout support
Supported use case: prefill with the 'HND' KV layout and decode with the 'NHD' layout, enabled with the experimental configuration:
```bash
--kv-transfer-config '{..., "enable_permute_local_kv":"True"}'
```
## Example Scripts/Code

Refer to these example scripts in the vLLM repository:

View File

@ -1,9 +1,5 @@
# AutoAWQ

> ⚠️ **Warning:**
> The `AutoAWQ` library is deprecated. This functionality has been adopted by the vLLM project in [`llm-compressor`](https://github.com/vllm-project/llm-compressor/tree/main/examples/awq).
> For the recommended quantization workflow, please see the AWQ examples in [`llm-compressor`](https://github.com/vllm-project/llm-compressor/tree/main/examples/awq). For more details on the deprecation, refer to the original [AutoAWQ repository](https://github.com/casper-hansen/AutoAWQ).

To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint.
The main benefits are lower latency and memory usage.
@ -22,15 +18,13 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "mistralai/Mistral-7B-Instruct-v0.2"
quant_path = "mistral-instruct-v0.2-awq"
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

# Load model
model = AutoAWQForCausalLM.from_pretrained(
    model_path,
    low_cpu_mem_usage=True,
    use_cache=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

View File

@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "Qwen/Qwen3-0.6B"
model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

bits, group_size, sym = 4, 128, True

View File

@ -34,7 +34,7 @@ llm = LLM(
    model=model_id,
    dtype=torch.bfloat16,
    trust_remote_code=True,
    quantization="bitblas",
)
```
@ -53,6 +53,6 @@ llm = LLM(
    dtype=torch.float16,
    trust_remote_code=True,
    quantization="bitblas",
    max_model_len=1024,
)
```

View File

@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit"
llm = LLM(
    model=model_id,
    dtype=torch.bfloat16,
    trust_remote_code=True,
)
```
@ -43,7 +43,7 @@ llm = LLM(
    model=model_id,
    dtype=torch.bfloat16,
    trust_remote_code=True,
    quantization="bitsandbytes",
)
```

View File

@ -41,9 +41,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, MODEL_ID, device_map="auto", torch_dtype="auto",
device_map="auto",
dtype="auto",
) )
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
``` ```
@ -65,10 +63,7 @@ Since simple RTN does not require data for weight quantization and the activatio
# Configure the simple PTQ quantization
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=["lm_head"],
)

# Apply the quantization algorithm.
oneshot(model=model, recipe=recipe)

View File

@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint:
conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant",
    },
    {
        "role": "user",
        "content": "Hello",
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?",
    },
    {
        "role": "user",
@ -67,10 +67,8 @@ You can also use the GGUF model directly through the LLM entrypoint:
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(
    model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.chat(conversation, sampling_params)

View File

@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
calibration_dataset = load_dataset(
    "allenai/c4",
    data_files="en/c4-train.00001-of-01024.json.gz",
    split="train",
).select(range(1024))["text"]

quant_config = QuantizeConfig(bits=4, group_size=128)

View File

@ -39,9 +39,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
@ -168,7 +166,7 @@ The following is an example of an expanded quantization recipe you can tune to y
    },
    ignore=["lm_head"],
    update_size=NUM_CALIBRATION_SAMPLES,
    dampening_frac=0.01,
)
```

View File

@ -44,9 +44,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```

View File

@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
from vllm import LLM, SamplingParams

def main():
    model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
    # Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
    llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)

    sampling_params = SamplingParams(temperature=0.8, top_p=0.9)

View File

@ -41,11 +41,9 @@ Here is an example of how to enable FP8 quantization:
from vllm import LLM, SamplingParams

sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
llm = LLM(
    model="meta-llama/Llama-2-7b-chat-hf",
    kv_cache_dtype="fp8",
    calculate_kv_scales=True,
)
prompt = "London is the capital of"
out = llm.generate(prompt, sampling_params)[0].outputs[0].text
print(out)
@ -82,7 +80,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models
# Select model and load it
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset

View File

@ -48,9 +48,7 @@ to fetch model and tokenizer.
MAX_SEQ_LEN = 512

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    dtype="auto",
)
model.eval()
@ -77,18 +75,10 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
text_data = dataset["text"][:NUM_CALIBRATION_DATA] text_data = dataset["text"][:NUM_CALIBRATION_DATA]
tokenized_outputs = tokenizer( tokenized_outputs = tokenizer(text_data, return_tensors="pt",
text_data, padding=True, truncation=True, max_length=MAX_SEQ_LEN)
return_tensors="pt", calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
padding=True, batch_size=BATCH_SIZE, drop_last=True)
truncation=True,
max_length=MAX_SEQ_LEN,
)
calib_dataloader = DataLoader(
tokenized_outputs['input_ids'],
batch_size=BATCH_SIZE,
drop_last=True,
)
``` ```
### 3. Set the Quantization Configuration ### 3. Set the Quantization Configuration
@ -113,32 +103,26 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
    load_quant_algo_config_from_file)

# Define fp8/per-tensor/static spec.
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
    observer_method="min_max",
    is_dynamic=False,
).to_quantization_spec()

# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
global_quant_config = QuantizationConfig(
    input_tensors=FP8_PER_TENSOR_SPEC,
    weight=FP8_PER_TENSOR_SPEC,
)

# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
kv_cache_quant_config = {
    name: QuantizationConfig(
        input_tensors=global_quant_config.input_tensors,
        weight=global_quant_config.weight,
        output_tensors=KV_CACHE_SPEC,
    )
    for name in kv_cache_layer_names_for_llama
}
layer_quant_config = kv_cache_quant_config.copy()

# Define algorithm config by config file.
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json"
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)

EXCLUDE_LAYERS = ["lm_head"]
@ -147,8 +131,7 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
    layer_quant_config=layer_quant_config,
    kv_cache_quant_config=kv_cache_quant_config,
    exclude=EXCLUDE_LAYERS,
    algo_config=algo_config,
)
```

### 4. Quantize the Model and Export
@ -182,11 +165,8 @@ for more exporting format details.
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
with torch.no_grad():
    exporter.export_safetensors_model(
        freezed_model,
        quant_config=quant_config,
        tokenizer=tokenizer,
    )
```

### 5. Evaluation in vLLM
@ -209,11 +189,8 @@ Now, you can load and run the Quark quantized model directly through the LLM ent
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(
    model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
    kv_cache_dtype="fp8",
    quantization="quark",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

View File

@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype="auto",
    device_map="auto",
    quantization_config=quantization_config
)

View File

@ -11,9 +11,6 @@ vLLM currently supports the following reasoning models:
| Model Series | Parser Name | Structured Output Support | Tool Calling |
|--------------|-------------|------------------|-------------|
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
| [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ |
| [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
| [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ |
@ -21,9 +18,8 @@ vLLM currently supports the following reasoning models:
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |

!!! note
    IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
    The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
    DeepSeek-V3.1 tool calling is supported in non-thinking mode.

## Quickstart
@ -119,11 +115,9 @@ OpenAI Python client library does not officially support `reasoning_content` att
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
stream = client.chat.completions.create(
    model=model,
    messages=messages,
    stream=True,
)

print("client: Start streaming chat completions...")
printed_reasoning_content = False
@ -163,29 +157,27 @@ The reasoning content is also available when both tool calling and the reasoning
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location", "unit"],
            },
        },
    }
]

response = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
    tools=tools,
    tool_choice="auto",
)

print(response)
@ -231,7 +223,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Instance method that should be implemented for extracting reasoning
        from an incomplete response; for use when handling reasoning calls and
@ -241,10 +233,8 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
""" """
def extract_reasoning_content( def extract_reasoning_content(
self, self, model_output: str, request: ChatCompletionRequest
model_output: str, ) -> tuple[Optional[str], Optional[str]]:
request: ChatCompletionRequest | ResponsesRequest,
) -> tuple[str | None, str | None]:
""" """
Extract reasoning content from a complete model-generated string. Extract reasoning content from a complete model-generated string.
@ -282,10 +272,10 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner
    @classmethod
    def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
        return cls(
            start_token_id=tokenizer.encode("<think>", add_special_tokens=False)[0],
            end_token_id=tokenizer.encode("</think>", add_special_tokens=False)[0],
        )

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        return self.end_token_id in input_ids

View File

@ -27,29 +27,27 @@ Next, make a request that triggers the model to use the available tools:
return f"Getting the weather for {location} in {unit}..." return f"Getting the weather for {location} in {unit}..."
tool_functions = {"get_weather": get_weather} tool_functions = {"get_weather": get_weather}
tools = [ tools = [{
{ "type": "function",
"type": "function", "function": {
"function": { "name": "get_weather",
"name": "get_weather", "description": "Get the current weather in a given location",
"description": "Get the current weather in a given location", "parameters": {
"parameters": { "type": "object",
"type": "object", "properties": {
"properties": { "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
},
"required": ["location", "unit"],
}, },
}, "required": ["location", "unit"]
}, }
] }
}]
response = client.chat.completions.create( response = client.chat.completions.create(
model=client.models.list().data[0].id, model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools, tools=tools,
tool_choice="auto", tool_choice="auto"
) )
tool_call = response.choices[0].message.tool_calls[0].function tool_call = response.choices[0].message.tool_calls[0].function
@ -352,16 +350,6 @@ Supported models:
Flags: `--tool-call-parser qwen3_xml`
### Olmo 3 Models (`olmo3`)
Olmo 3 models output tool calls in a format that is very similar to the one expected by the `pythonic` parser (see below), with a few differences. Each tool call is a pythonic string, but the parallel tool calls are newline-delimited, and the calls are wrapped within XML tags as `<function_calls>..</function_calls>`. In addition, the parser also allows JSON boolean and null literals (`true`, `false`, and `null`) in addition to the pythonic ones (`True`, `False`, and `None`).
Supported models:
* TODO (will be updated after Olmo 3 release)
Flags: `--tool-call-parser olmo3`
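
For illustration only (Olmo 3 details are still pending the model release), raw output in this format might look like the hedged sketch below; the function name and arguments are hypothetical.

```python
# Hypothetical raw model output in the Olmo 3 tool-call format:
# newline-delimited pythonic calls wrapped in <function_calls> tags,
# with JSON literals (true/false/null) also accepted by the parser.
raw_olmo3_output = (
    "<function_calls>\n"
    'get_weather(city="San Francisco", unit=null)\n'
    'get_weather(city="New York", unit="fahrenheit")\n'
    "</function_calls>"
)
print(raw_olmo3_output)
```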
### Models with Pythonic Tool Calls (`pythonic`)

A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
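
As a rough sketch (the function names and arguments here are hypothetical, not tied to any particular model), raw output in the pythonic style is simply a Python-style list of calls:

```python
# Hypothetical raw model output for the `pythonic` parser: a single list
# literal containing one entry per (possibly parallel) tool call.
raw_pythonic_output = (
    '[get_weather(city="San Francisco", unit="celsius"), '
    'get_weather(city="New York", unit="fahrenheit")]'
)
print(raw_pythonic_output)
```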
@ -414,7 +402,8 @@ Here is a summary of a plugin file:
    # adjust request. e.g.: set skip special tokens
    # to False for tool call output.
    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        return request

    # implement the tool call parse for stream call
@ -427,7 +416,7 @@ Here is a summary of a plugin file:
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        return delta

    # implement the tool parse for non-stream call

View File

@ -23,46 +23,7 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]

First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:
```bash
sudo apt-get update -y
sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
```
Second, clone the vLLM project:
```bash
git clone https://github.com/vllm-project/vllm.git vllm_source
cd vllm_source
```
Third, install required dependencies:
```bash
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
uv pip install -r requirements/cpu.txt --torch-backend cpu
```
??? console "pip"
```bash
pip install --upgrade pip
pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
```
Finally, build and install vLLM:
```bash
VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
```
If you want to develop vLLM, install it in editable mode instead.
```bash
VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
```
Testing has been conducted on AWS Graviton3 instances for compatibility.

View File

@ -0,0 +1,45 @@
First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:
```bash
sudo apt-get update -y
sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
```
Second, clone the vLLM project:
```bash
git clone https://github.com/vllm-project/vllm.git vllm_source
cd vllm_source
```
Third, install required dependencies:
```bash
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
uv pip install -r requirements/cpu.txt --torch-backend cpu
```
??? console "pip"
```bash
pip install --upgrade pip
pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
```
Finally, build and install vLLM:
```bash
VLLM_TARGET_DEVICE=cpu python setup.py install
```
If you want to develop vLLM, install it in editable mode instead.
```bash
VLLM_TARGET_DEVICE=cpu python setup.py develop
```
!!! note
If you are building vLLM from source and not using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM.
# --8<-- [end:extra-information]

View File

@ -194,10 +194,8 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
    api_key=openai_api_key,
    base_url=openai_api_base,
)
completion = client.completions.create(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    prompt="San Francisco is a",
)

print("Completion result:", completion)
```
@ -241,7 +239,7 @@ Alternatively, you can use the `openai` Python package:
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Tell me a joke."},
    ],
)
print("Chat response:", chat_response)
```

View File

@ -22,11 +22,6 @@ sys.modules["vllm._C"] = MagicMock()
class PydanticMagicMock(MagicMock):
    """`MagicMock` that's able to generate pydantic-core schemas."""

    def __init__(self, *args, **kwargs):
        name = kwargs.pop("name", None)
        super().__init__(*args, **kwargs)
        self.__spec__ = importlib.machinery.ModuleSpec(name, None)

    def __get_pydantic_core_schema__(self, source_type, handler):
        return core_schema.any_schema()
@ -47,9 +42,7 @@ def auto_mock(module, attr, max_mocks=50):
            raise e
        except ModuleNotFoundError as e:
            logger.info("Mocking %s for argparse doc generation", e.name)
            sys.modules[e.name] = PydanticMagicMock(name=e.name)
        except Exception as e:
            logger.warning("Failed to import %s.%s: %s", module, attr, e)

    raise ImportError(
        f"Failed to import {module}.{attr} after mocking {max_mocks} imports"

View File

@ -60,7 +60,7 @@ from vllm import LLM
llm = LLM(
    "s3://my-bucket/vllm/facebook/opt-125m/v1",
    load_format="tensorizer",
    enable_lora=True,
)
```
@ -97,6 +97,6 @@ llm = LLM(
"s3://my-bucket/vllm/facebook/opt-125m/v1", "s3://my-bucket/vllm/facebook/opt-125m/v1",
load_format="tensorizer", load_format="tensorizer",
enable_lora=True, enable_lora=True,
model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}}, model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}}
) )
``` ```

Some files were not shown because too many files have changed in this diff.