Compare commits
12 Commits
v0.11.1rc1
...
zhuohan/mo
| Author | SHA1 | Date | |
|---|---|---|---|
| 99e2379b16 | |||
| da26dce7b2 | |||
| 48dcc72d7e | |||
| e3e2bb3865 | |||
| 4e2abe99b7 | |||
| 177f5d757f | |||
| dcf059ab84 | |||
| 98e71a4954 | |||
| 1f4472ba5f | |||
| 850876a183 | |||
| a608dfab45 | |||
| 2d4215b9a2 |
@ -5,11 +5,11 @@ import os
|
||||
import sys
|
||||
import zipfile
|
||||
|
||||
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
|
||||
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
|
||||
# Note that we have an 800 MiB quota; please use it wisely.
|
||||
# See https://github.com/pypi/support/issues/6326 .
|
||||
# Please also sync the value with the one in Dockerfile.
|
||||
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
|
||||
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
|
||||
|
||||
|
||||
def print_top_10_largest_files(zip_file):
|
||||
|
||||
@ -8,7 +8,7 @@ steps:
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
@ -76,7 +76,7 @@ steps:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
|
||||
# Add job to create multi-arch manifest
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
17
.coveragerc
17
.coveragerc
@ -1,10 +1,5 @@
|
||||
[run]
|
||||
# Track the installed vllm package (this is what actually gets imported during tests)
|
||||
# Use wildcard pattern to match the installed location
|
||||
source =
|
||||
vllm
|
||||
*/dist-packages/vllm
|
||||
*/site-packages/vllm
|
||||
source = vllm
|
||||
omit =
|
||||
*/tests/*
|
||||
*/test_*
|
||||
@ -17,16 +12,6 @@ omit =
|
||||
*/benchmarks/*
|
||||
*/docs/*
|
||||
|
||||
[paths]
|
||||
# Map all possible vllm locations to a canonical "vllm" path
|
||||
# This ensures coverage.combine properly merges data from different test runs
|
||||
source =
|
||||
vllm
|
||||
/vllm-workspace/src/vllm
|
||||
/vllm-workspace/vllm
|
||||
*/site-packages/vllm
|
||||
*/dist-packages/vllm
|
||||
|
||||
[report]
|
||||
exclude_lines =
|
||||
pragma: no cover
|
||||
|
||||
@ -1,4 +0,0 @@
|
||||
# Migrate from `yapf` & `isort` to `ruff`
|
||||
d6953beb91da4e9c99be4c0a1304a2d24189535c
|
||||
# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
|
||||
8fcaaf6a165e661f63fc51be906bc05b0767332f
|
||||
138
.github/workflows/issue_autolabel.yml
vendored
138
.github/workflows/issue_autolabel.yml
vendored
@ -13,7 +13,6 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Label issues based on keywords
|
||||
id: label-step
|
||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
script: |
|
||||
@ -43,6 +42,7 @@ jobs:
|
||||
searchIn: "body"
|
||||
},
|
||||
],
|
||||
|
||||
// Substring search - matches anywhere in text (partial matches)
|
||||
substrings: [
|
||||
{
|
||||
@ -89,12 +89,14 @@ jobs:
|
||||
term: "hip_",
|
||||
searchIn: "both"
|
||||
},
|
||||
|
||||
// ROCm tools and libraries
|
||||
{
|
||||
term: "hipify",
|
||||
searchIn: "both"
|
||||
},
|
||||
],
|
||||
|
||||
// Regex patterns - for complex pattern matching
|
||||
regexPatterns: [
|
||||
{
|
||||
@ -105,17 +107,13 @@ jobs:
|
||||
}
|
||||
],
|
||||
},
|
||||
// Add more label configurations here as needed
|
||||
// example: {
|
||||
// keywords: [...],
|
||||
// substrings: [...],
|
||||
// regexPatterns: [...]
|
||||
// },
|
||||
};
|
||||
|
||||
// Helper function to create regex based on search type
|
||||
function createSearchRegex(term, type) {
|
||||
// Escape special regex characters in the term
|
||||
const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
||||
|
||||
switch (type) {
|
||||
case 'keyword':
|
||||
// Word boundary search - matches whole words only
|
||||
@ -127,13 +125,16 @@ jobs:
|
||||
throw new Error(`Unknown search type: ${type}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to find matching terms in text with line information
|
||||
function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
|
||||
const matches = [];
|
||||
const lines = text.split('\n');
|
||||
|
||||
for (const termConfig of searchTerms) {
|
||||
let regex;
|
||||
let term, searchIn, pattern, description, flags;
|
||||
|
||||
// Handle different input formats (string or object)
|
||||
if (typeof termConfig === 'string') {
|
||||
term = termConfig;
|
||||
@ -145,17 +146,21 @@ jobs:
|
||||
description = termConfig.description;
|
||||
flags = termConfig.flags;
|
||||
}
|
||||
|
||||
// Skip if this term shouldn't be searched in the current location
|
||||
if (searchIn !== 'both' && searchIn !== searchLocation) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Create appropriate regex
|
||||
if (searchType === 'regex') {
|
||||
regex = new RegExp(pattern, flags || "gi");
|
||||
} else {
|
||||
regex = createSearchRegex(term, searchType);
|
||||
}
|
||||
|
||||
const termMatches = [];
|
||||
|
||||
// Check each line for matches
|
||||
lines.forEach((line, lineIndex) => {
|
||||
const lineMatches = line.match(regex);
|
||||
@ -170,14 +175,15 @@ jobs:
|
||||
originalTerm: term || pattern,
|
||||
description: description,
|
||||
// Show context around the match in the line
|
||||
context: line.length > 100 ?
|
||||
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
|
||||
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
|
||||
context: line.length > 100 ?
|
||||
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
|
||||
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
|
||||
: line.trim()
|
||||
});
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
if (termMatches.length > 0) {
|
||||
matches.push({
|
||||
term: term || (description || pattern),
|
||||
@ -190,48 +196,64 @@ jobs:
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return matches;
|
||||
}
|
||||
|
||||
// Helper function to check if label should be added
|
||||
async function processLabel(labelName, config) {
|
||||
const body = context.payload.issue.body || "";
|
||||
const title = context.payload.issue.title || "";
|
||||
|
||||
core.notice(`Processing label: ${labelName}`);
|
||||
core.notice(`Issue Title: "${title}"`);
|
||||
core.notice(`Issue Body length: ${body.length} characters`);
|
||||
|
||||
let shouldAddLabel = false;
|
||||
let allMatches = [];
|
||||
let reason = '';
|
||||
|
||||
const keywords = config.keywords || [];
|
||||
const substrings = config.substrings || [];
|
||||
const regexPatterns = config.regexPatterns || [];
|
||||
|
||||
core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
|
||||
|
||||
// Search in title
|
||||
if (title.trim()) {
|
||||
core.notice(`Searching in title: "${title}"`);
|
||||
|
||||
const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
|
||||
const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
|
||||
const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
|
||||
|
||||
allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
|
||||
}
|
||||
|
||||
// Search in body
|
||||
if (body.trim()) {
|
||||
core.notice(`Searching in body (${body.length} characters)`);
|
||||
|
||||
const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
|
||||
const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
|
||||
const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
|
||||
|
||||
allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
|
||||
}
|
||||
|
||||
if (allMatches.length > 0) {
|
||||
core.notice(`Found ${allMatches.length} matching term(s):`);
|
||||
|
||||
for (const termMatch of allMatches) {
|
||||
const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
|
||||
const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
|
||||
|
||||
if (termMatch.searchType === 'regex') {
|
||||
core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
|
||||
} else {
|
||||
core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
|
||||
}
|
||||
|
||||
// Show details for each match
|
||||
termMatch.matches.forEach((match, index) => {
|
||||
core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
|
||||
@ -244,6 +266,7 @@ jobs:
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
shouldAddLabel = true;
|
||||
const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
|
||||
const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
|
||||
@ -251,10 +274,13 @@ jobs:
|
||||
const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
|
||||
const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
|
||||
const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
|
||||
|
||||
reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
|
||||
}
|
||||
|
||||
core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
|
||||
core.notice(`Reason: ${reason || 'No matching terms found'}`);
|
||||
|
||||
if (shouldAddLabel) {
|
||||
const existingLabels = context.payload.issue.labels.map(l => l.name);
|
||||
if (!existingLabels.includes(labelName)) {
|
||||
@ -270,92 +296,14 @@ jobs:
|
||||
core.notice(`Label "${labelName}" already present.`);
|
||||
return false;
|
||||
}
|
||||
|
||||
core.notice(`No matching terms found for label "${labelName}".`);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Process all configured labels
|
||||
const labelsAddedResults = await Promise.all(
|
||||
Object.entries(labelConfig).map(([labelName, config]) =>
|
||||
processLabel(labelName, config).then(added => ({ labelName, added }))
|
||||
)
|
||||
);
|
||||
|
||||
const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
|
||||
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
|
||||
|
||||
// Return which labels were added for the next step
|
||||
const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
|
||||
core.setOutput('labels_added', JSON.stringify(addedLabels));
|
||||
return addedLabels;
|
||||
|
||||
- name: CC users for labeled issues
|
||||
if: steps.label-step.outputs.labels_added != '[]'
|
||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
script: |
|
||||
// Configuration: Map labels to GitHub users to CC
|
||||
// You can add multiple users per label, and multiple label configurations
|
||||
const ccConfig = {
|
||||
rocm: {
|
||||
users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
|
||||
message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
|
||||
},
|
||||
// Add more label -> user mappings here
|
||||
// Example:
|
||||
// cuda: {
|
||||
// users: ['user1', 'user2'],
|
||||
// message: 'CC {users} for CUDA-related issue'
|
||||
// },
|
||||
// performance: {
|
||||
// users: ['perfexpert'],
|
||||
// message: 'CC {users} for performance issue'
|
||||
// },
|
||||
};
|
||||
|
||||
const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
|
||||
core.notice(`Labels added: ${labelsAdded.join(', ')}`);
|
||||
|
||||
// Get existing comments to check for already mentioned users
|
||||
const comments = await github.rest.issues.listComments({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: context.issue.number,
|
||||
});
|
||||
|
||||
const issueBody = context.payload.issue.body || '';
|
||||
const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
|
||||
|
||||
// Process each label that was added
|
||||
for (const label of labelsAdded) {
|
||||
if (ccConfig[label]) {
|
||||
const config = ccConfig[label];
|
||||
const usersToMention = [];
|
||||
|
||||
// Check which users haven't been mentioned yet
|
||||
for (const user of config.users) {
|
||||
const mentionPattern = new RegExp(`@${user}\\b`, 'i');
|
||||
if (!mentionPattern.test(allExistingText)) {
|
||||
usersToMention.push(user);
|
||||
} else {
|
||||
core.notice(`@${user} already mentioned for label "${label}", skipping`);
|
||||
}
|
||||
}
|
||||
|
||||
// Post comment if there are users to mention
|
||||
if (usersToMention.length > 0) {
|
||||
const mentions = usersToMention.map(u => `@${u}`).join(' ');
|
||||
const message = config.message.replace('{users}', mentions);
|
||||
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: context.issue.number,
|
||||
body: message
|
||||
});
|
||||
|
||||
core.notice(`CC comment added for label "${label}": ${mentions}`);
|
||||
} else {
|
||||
core.notice(`All users for label "${label}" already mentioned, skipping comment`);
|
||||
}
|
||||
}
|
||||
}
|
||||
const processLabels = Object.entries(labelConfig)
|
||||
.map(([labelName, config]) => processLabel(labelName, config));
|
||||
const labelsAdded = await Promise.all(processLabels);
|
||||
const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
|
||||
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
|
||||
@ -16,7 +16,6 @@ repos:
|
||||
rev: v1.38.1
|
||||
hooks:
|
||||
- id: typos
|
||||
args: [--force-exclude]
|
||||
- repo: https://github.com/pre-commit/mirrors-clang-format
|
||||
rev: v21.1.2
|
||||
hooks:
|
||||
|
||||
12
codecov.yml
12
codecov.yml
@ -1,12 +0,0 @@
|
||||
codecov:
|
||||
require_ci_to_pass: false
|
||||
|
||||
fixes:
|
||||
# Map source code paths to repository root paths
|
||||
# Wildcards match any Python version (python3.*)
|
||||
- "/vllm-workspace/src/vllm/::vllm/"
|
||||
- "/vllm-workspace/vllm/::vllm/"
|
||||
- "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
|
||||
- "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
|
||||
- "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
|
||||
- "/usr/lib/python3.*/site-packages/vllm/::vllm/"
|
||||
@ -21,6 +21,7 @@
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include "../cuda_compat.h"
|
||||
#include "../cub_helpers.h"
|
||||
#include "../core/batch_invariant.hpp"
|
||||
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
@ -405,7 +406,8 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
|
||||
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
|
||||
static constexpr int VPT = Constants::VPT;
|
||||
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
|
||||
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
||||
const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
||||
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
|
||||
|
||||
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
|
||||
|
||||
@ -22,14 +22,13 @@ template <typename AllReduceKernel, typename T>
|
||||
__global__ __quickreduce_launch_bounds_two_shot__ static void
|
||||
allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
|
||||
int rank, uint8_t** dbuffer_list,
|
||||
uint32_t data_offset, uint32_t flag_color,
|
||||
int64_t data_size_per_phase) {
|
||||
uint32_t data_offset, uint32_t flag_color) {
|
||||
int block = blockIdx.x;
|
||||
int grid = gridDim.x;
|
||||
|
||||
while (block < num_blocks) {
|
||||
AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset,
|
||||
flag_color, data_size_per_phase);
|
||||
flag_color);
|
||||
block += grid;
|
||||
flag_color++;
|
||||
}
|
||||
@ -42,21 +41,21 @@ allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
|
||||
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
|
||||
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
|
||||
num_blocks, rank, dbuffer_list, data_offset, \
|
||||
flag_color, this->kMaxProblemSize); \
|
||||
flag_color); \
|
||||
} else if (world_size == 4) { \
|
||||
using LineCodec = __codec<T, 4>; \
|
||||
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
|
||||
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
|
||||
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
|
||||
num_blocks, rank, dbuffer_list, data_offset, \
|
||||
flag_color, this->kMaxProblemSize); \
|
||||
flag_color); \
|
||||
} else if (world_size == 8) { \
|
||||
using LineCodec = __codec<T, 8>; \
|
||||
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
|
||||
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
|
||||
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
|
||||
num_blocks, rank, dbuffer_list, data_offset, \
|
||||
flag_color, this->kMaxProblemSize); \
|
||||
flag_color); \
|
||||
}
|
||||
|
||||
enum QuickReduceQuantLevel {
|
||||
|
||||
@ -553,12 +553,13 @@ struct AllReduceTwoshot {
|
||||
int const rank, // rank index
|
||||
uint8_t** __restrict__ buffer_list, // communication buffers
|
||||
uint32_t const data_offset, // offset to start of the data buffer
|
||||
uint32_t flag_color, int64_t data_size_per_phase) {
|
||||
uint32_t flag_color) {
|
||||
// Topology
|
||||
int thread = threadIdx.x + threadIdx.y * kWavefront;
|
||||
uint8_t* rank_buffer = buffer_list[rank];
|
||||
Codec codec(thread, rank);
|
||||
int block_id = blockIdx.x;
|
||||
int grid_size = gridDim.x;
|
||||
// --------------------------------------------------------
|
||||
// Read input into registers
|
||||
int32x4_t tA[kAtoms];
|
||||
@ -587,10 +588,12 @@ struct AllReduceTwoshot {
|
||||
// rank responsible for this segment.
|
||||
uint32_t comm_data0_offset =
|
||||
data_offset + block_id * Codec::kTransmittedTileSize;
|
||||
uint32_t comm_data1_offset = data_size_per_phase + comm_data0_offset;
|
||||
uint32_t comm_data1_offset =
|
||||
grid_size * Codec::kTransmittedTileSize + comm_data0_offset;
|
||||
|
||||
uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t));
|
||||
uint32_t comm_flags1_offset = (data_offset / 2) + comm_flags0_offset;
|
||||
uint32_t comm_flags1_offset =
|
||||
grid_size * (kWorldSize * sizeof(uint32_t)) + comm_flags0_offset;
|
||||
|
||||
for (int r = 0; r < kWorldSize; r++) {
|
||||
int32x4_t* send_buffer =
|
||||
|
||||
@ -229,7 +229,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
# Check the size of the wheel if RUN_WHEEL_CHECK is true
|
||||
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
||||
# sync the default value with .buildkite/check-wheel-size.py
|
||||
ARG VLLM_MAX_SIZE_MB=500
|
||||
ARG VLLM_MAX_SIZE_MB=450
|
||||
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
|
||||
ARG RUN_WHEEL_CHECK=true
|
||||
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
|
||||
|
||||
@ -11,7 +11,8 @@ The following code splits the model across 2 GPUs.
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
|
||||
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
|
||||
tensor_parallel_size=2)
|
||||
```
|
||||
|
||||
!!! warning
|
||||
@ -42,7 +43,9 @@ and the maximum batch size (`max_num_seqs` option).
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
|
||||
llm = LLM(model="adept/fuyu-8b",
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2)
|
||||
```
|
||||
|
||||
## Reduce CUDA Graphs
|
||||
@ -75,7 +78,8 @@ You can disable graph capturing completely via the `enforce_eager` flag:
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True)
|
||||
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
|
||||
enforce_eager=True)
|
||||
```
|
||||
|
||||
## Adjust cache size
|
||||
@ -93,10 +97,8 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem
|
||||
from vllm import LLM
|
||||
|
||||
# Accept up to 3 images and 1 video per prompt
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
limit_mm_per_prompt={"image": 3, "video": 1},
|
||||
)
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
limit_mm_per_prompt={"image": 3, "video": 1})
|
||||
```
|
||||
|
||||
You can go a step further and disable unused modalities completely by setting their limits to zero.
|
||||
@ -106,10 +108,8 @@ For example, if your application only accepts image input, there is no need to a
|
||||
from vllm import LLM
|
||||
|
||||
# Accept any number of images but no videos
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
limit_mm_per_prompt={"video": 0},
|
||||
)
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
limit_mm_per_prompt={"video": 0})
|
||||
```
|
||||
|
||||
You can even run a multi-modal model for text-only inference:
|
||||
@ -118,10 +118,8 @@ You can even run a multi-modal model for text-only inference:
|
||||
from vllm import LLM
|
||||
|
||||
# Don't accept images. Just text.
|
||||
llm = LLM(
|
||||
model="google/gemma-3-27b-it",
|
||||
limit_mm_per_prompt={"image": 0},
|
||||
)
|
||||
llm = LLM(model="google/gemma-3-27b-it",
|
||||
limit_mm_per_prompt={"image": 0})
|
||||
```
|
||||
|
||||
### Configurable options
|
||||
@ -175,14 +173,14 @@ Here are some examples:
|
||||
from vllm import LLM
|
||||
|
||||
# Available for Qwen2-VL series models
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_kwargs={"max_pixels": 768 * 768}, # Default is 1280 * 28 * 28
|
||||
)
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_kwargs={
|
||||
"max_pixels": 768 * 768, # Default is 1280 * 28 * 28
|
||||
})
|
||||
|
||||
# Available for InternVL series models
|
||||
llm = LLM(
|
||||
model="OpenGVLab/InternVL2-2B",
|
||||
mm_processor_kwargs={"max_dynamic_patch": 4}, # Default is 12
|
||||
)
|
||||
llm = LLM(model="OpenGVLab/InternVL2-2B",
|
||||
mm_processor_kwargs={
|
||||
"max_dynamic_patch": 4, # Default is 12
|
||||
})
|
||||
```
|
||||
|
||||
@ -100,7 +100,7 @@ from vllm import LLM
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-3.3-70B-Instruct",
|
||||
tensor_parallel_size=4,
|
||||
pipeline_parallel_size=2,
|
||||
pipeline_parallel_size=2
|
||||
)
|
||||
```
|
||||
|
||||
@ -257,24 +257,18 @@ Examples:
|
||||
|
||||
```python
|
||||
# Use a larger cache
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_cache_gb=8,
|
||||
)
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_cache_gb=8)
|
||||
|
||||
# Use a shared-memory based IPC cache
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
tensor_parallel_size=2,
|
||||
mm_processor_cache_type="shm",
|
||||
mm_processor_cache_gb=8,
|
||||
)
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
tensor_parallel_size=2,
|
||||
mm_processor_cache_type="shm",
|
||||
mm_processor_cache_gb=8)
|
||||
|
||||
# Disable the cache
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_cache_gb=0,
|
||||
)
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_cache_gb=0)
|
||||
```
|
||||
|
||||
### Cache Placement
|
||||
|
||||
@ -35,7 +35,6 @@ th {
|
||||
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
|
||||
| Random | ✅ | ✅ | `synthetic` |
|
||||
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
|
||||
| RandomForReranking | ✅ | ✅ | `synthetic` |
|
||||
| Prefix Repetition | ✅ | ✅ | `synthetic` |
|
||||
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
|
||||
| HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` |
|
||||
@ -879,51 +878,6 @@ vllm bench serve \
|
||||
|
||||
</details>
|
||||
|
||||
#### Reranker Benchmark
|
||||
|
||||
Benchmark the performance of rerank requests in vLLM.
|
||||
|
||||
<details class="admonition abstract" markdown="1">
|
||||
<summary>Show more</summary>
|
||||
|
||||
Unlike generative models, which use the Completions API or Chat Completions API,
|
||||
you should set `--backend vllm-rerank` and `--endpoint /v1/rerank` to use the Reranker API.
|
||||
|
||||
For reranking, the only supported dataset is `--dataset-name random-rerank`.
|
||||
|
||||
Start the server:
|
||||
|
||||
```bash
|
||||
vllm serve BAAI/bge-reranker-v2-m3
|
||||
```
|
||||
|
||||
Run the benchmark:
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--model BAAI/bge-reranker-v2-m3 \
|
||||
--backend vllm-rerank \
|
||||
--endpoint /v1/rerank \
|
||||
--dataset-name random-rerank \
|
||||
--tokenizer BAAI/bge-reranker-v2-m3 \
|
||||
--random-input-len 512 \
|
||||
--num-prompts 10 \
|
||||
--random-batch-size 5
|
||||
```
|
||||
|
||||
For reranker models, this will create `num_prompts / random_batch_size` requests with
|
||||
`random_batch_size` "documents" where each one has close to `random_input_len` tokens.
|
||||
In the example above, this results in 2 rerank requests with 5 "documents" each where
|
||||
each document has close to 512 tokens.
|
||||
|
||||
Please note that the `/v1/rerank` endpoint is also supported by embedding models. So if you're running
|
||||
with an embedding model, also set `--no_reranker`. Because in this case the query is
|
||||
treated as an individual prompt by the server, here we send `random_batch_size - 1` documents
|
||||
to account for the extra prompt which is the query. The token accounting to report the
|
||||
throughput numbers correctly is also adjusted.
|
||||
|
||||
</details>
|
||||
|
||||
[](){ #performance-benchmarks }
|
||||
|
||||
## Performance Benchmarks
|
||||
|
||||
@ -73,8 +73,8 @@ def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
intermediate_tensors: IntermediateTensors | None = None,
|
||||
inputs_embeds: torch.Tensor | None = None,
|
||||
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||
inputs_embeds: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
...
|
||||
```
|
||||
|
||||
@ -16,7 +16,7 @@ Further update the model as follows:
|
||||
...
|
||||
|
||||
@classmethod
|
||||
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
|
||||
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
|
||||
if modality.startswith("image"):
|
||||
return "<image>"
|
||||
|
||||
@ -45,14 +45,14 @@ Further update the model as follows:
|
||||
...
|
||||
|
||||
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
|
||||
|
||||
assert self.vision_encoder is not None
|
||||
image_features = self.vision_encoder(image_input)
|
||||
return self.multi_modal_projector(image_features)
|
||||
|
||||
def get_multimodal_embeddings(
|
||||
self,
|
||||
**kwargs: object,
|
||||
) -> MultiModalEmbeddings | None:
|
||||
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
|
||||
|
||||
# Validate the multimodal input keyword arguments
|
||||
image_input = self._parse_and_validate_image_input(**kwargs)
|
||||
if image_input is None:
|
||||
@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m
|
||||
For example, if the model supports any number of images but only one video per prompt:
|
||||
|
||||
```python
|
||||
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None, "video": 1}
|
||||
```
|
||||
|
||||
@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
@ -421,10 +421,8 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
||||
```python
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
|
||||
image_processor = self.get_image_processor()
|
||||
return ImageSize(
|
||||
width=image_processor.size["width"],
|
||||
height=image_processor.size["height"],
|
||||
)
|
||||
return ImageSize(width=image_processor.size["width"],
|
||||
height=image_processor.size["height"])
|
||||
```
|
||||
|
||||
Fuyu does not expect image placeholders in the inputs to HF processor, so
|
||||
@ -454,12 +452,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
||||
|
||||
return {
|
||||
"image":
|
||||
self._get_dummy_images(
|
||||
width=target_width,
|
||||
height=target_height,
|
||||
num_images=num_images,
|
||||
overrides=image_overrides,
|
||||
)
|
||||
self._get_dummy_images(width=target_width,
|
||||
height=target_height,
|
||||
num_images=num_images,
|
||||
overrides=image_overrides)
|
||||
}
|
||||
```
|
||||
|
||||
@ -748,7 +744,8 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
||||
[_NEWLINE_TOKEN_ID]) * nrows
|
||||
|
||||
return PromptUpdateDetails.select_token_id(
|
||||
image_tokens + [bos_token_id],
|
||||
@ -784,7 +781,8 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
||||
[_NEWLINE_TOKEN_ID]) * nrows
|
||||
|
||||
return PromptUpdateDetails.select_token_id(
|
||||
image_tokens + [bos_token_id],
|
||||
@ -812,11 +810,9 @@ to register them to the multi-modal registry:
|
||||
from vllm.model_executor.models.interfaces import SupportsMultiModal
|
||||
+ from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
+ @MULTIMODAL_REGISTRY.register_processor(
|
||||
+ YourMultiModalProcessor,
|
||||
+ info=YourProcessingInfo,
|
||||
+ dummy_inputs=YourDummyInputsBuilder,
|
||||
+ )
|
||||
+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
|
||||
+ info=YourProcessingInfo,
|
||||
+ dummy_inputs=YourDummyInputsBuilder)
|
||||
class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
|
||||
```
|
||||
|
||||
|
||||
@ -42,7 +42,7 @@ def register():
|
||||
|
||||
ModelRegistry.register_model(
|
||||
"YourModelForCausalLM",
|
||||
"your_code:YourModelForCausalLM",
|
||||
"your_code:YourModelForCausalLM"
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
@ -15,7 +15,6 @@ Declare supported languages and capabilities:
|
||||
- Set `supports_transcription_only=True` if the model should not serve text generation (e.g., Whisper).
|
||||
|
||||
??? code "supported_languages and supports_transcription_only"
|
||||
|
||||
```python
|
||||
from typing import ClassVar, Mapping, Literal
|
||||
import numpy as np
|
||||
@ -44,7 +43,6 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor
|
||||
This is for controlling general behavior of the API when serving your model:
|
||||
|
||||
??? code "get_speech_to_text_config()"
|
||||
|
||||
```python
|
||||
class YourASRModel(nn.Module, SupportsTranscription):
|
||||
...
|
||||
@ -73,7 +71,6 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo
|
||||
Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:
|
||||
|
||||
??? code "get_generation_prompt()"
|
||||
|
||||
```python
|
||||
class YourASRModel(nn.Module, SupportsTranscription):
|
||||
...
|
||||
@ -110,7 +107,6 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
|
||||
Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
|
||||
|
||||
??? code "get_generation_prompt()"
|
||||
|
||||
```python
|
||||
class YourASRModel(nn.Module, SupportsTranscription):
|
||||
...
|
||||
@ -152,16 +148,12 @@ Language validation via [validate_language][vllm.model_executor.models.interface
|
||||
If your model requires a language and you want a default, override this method (see Whisper):
|
||||
|
||||
??? code "validate_language()"
|
||||
|
||||
```python
|
||||
@classmethod
|
||||
def validate_language(cls, language: str | None) -> str | None:
|
||||
if language is None:
|
||||
logger.warning(
|
||||
"Defaulting to language='en'. If you wish to transcribe "
|
||||
"audio in a different language, pass the `language` field "
|
||||
"in the TranscriptionRequest."
|
||||
)
|
||||
"Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
|
||||
language = "en"
|
||||
return super().validate_language(language)
|
||||
```
|
||||
@ -173,7 +165,6 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo
|
||||
Provide a fast duration→token estimate to improve streaming usage statistics:
|
||||
|
||||
??? code "get_num_audio_tokens()"
|
||||
|
||||
```python
|
||||
class YourASRModel(nn.Module, SupportsTranscription):
|
||||
...
|
||||
@ -200,7 +191,6 @@ The API server takes care of basic audio I/O and optional chunking before buildi
|
||||
Relevant server logic:
|
||||
|
||||
??? code "_preprocess_speech_to_text()"
|
||||
|
||||
```python
|
||||
# vllm/entrypoints/openai/speech_to_text.py
|
||||
async def _preprocess_speech_to_text(...):
|
||||
|
||||
@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference
|
||||
|
||||
??? console "Command"
|
||||
|
||||
```bash
|
||||
```python
|
||||
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: <JWT TOKEN>' \
|
||||
@ -81,7 +81,7 @@ You should get a response like:
|
||||
|
||||
??? console "Response"
|
||||
|
||||
```json
|
||||
```python
|
||||
{
|
||||
"run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
|
||||
"result": {
|
||||
|
||||
@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
|
||||
|
||||
client = OpenAI(
|
||||
base_url="https://gateway.<gateway domain>",
|
||||
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>",
|
||||
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
|
||||
)
|
||||
|
||||
completion = client.chat.completions.create(
|
||||
@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
|
||||
"role": "user",
|
||||
"content": "Compose a poem that explains the concept of recursion in programming.",
|
||||
}
|
||||
],
|
||||
]
|
||||
)
|
||||
|
||||
print(completion.choices[0].message.content)
|
||||
|
||||
@ -34,7 +34,7 @@ pip install vllm haystack-ai
|
||||
api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
|
||||
model="mistralai/Mistral-7B-Instruct-v0.1",
|
||||
api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
|
||||
generation_kwargs={"max_tokens": 512},
|
||||
generation_kwargs = {"max_tokens": 512}
|
||||
)
|
||||
|
||||
response = generator.run(
|
||||
|
||||
@ -32,28 +32,28 @@ This is the easiest way to get started with vLLM on Hugging Face Inference Endpo
|
||||
import os
|
||||
|
||||
client = OpenAI(
|
||||
base_url=DEPLOYMENT_URL,
|
||||
api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens
|
||||
base_url = DEPLOYMENT_URL,
|
||||
api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
|
||||
)
|
||||
|
||||
chat_completion = client.chat.completions.create(
|
||||
model="HuggingFaceTB/SmolLM3-3B",
|
||||
messages=[
|
||||
model = "HuggingFaceTB/SmolLM3-3B",
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Give me a brief explanation of gravity in simple terms.",
|
||||
"text": "Give me a brief explanation of gravity in simple terms."
|
||||
}
|
||||
],
|
||||
]
|
||||
}
|
||||
],
|
||||
stream=True,
|
||||
stream = True
|
||||
)
|
||||
|
||||
for message in chat_completion:
|
||||
print(message.choices[0].delta.content, end="")
|
||||
print(message.choices[0].delta.content, end = "")
|
||||
```
|
||||
|
||||
!!! note
|
||||
@ -86,34 +86,34 @@ This method applies to models with the [`transformers` library tag](https://hugg
|
||||
import os
|
||||
|
||||
client = OpenAI(
|
||||
base_url=DEPLOYMENT_URL,
|
||||
api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens
|
||||
base_url = DEPLOYMENT_URL,
|
||||
api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
|
||||
)
|
||||
|
||||
chat_completion = client.chat.completions.create(
|
||||
model="ibm-granite/granite-docling-258M",
|
||||
messages=[
|
||||
model = "ibm-granite/granite-docling-258M",
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png",
|
||||
},
|
||||
"url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png"
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Convert this page to docling.",
|
||||
},
|
||||
"text": "Convert this page to docling."
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
stream=True,
|
||||
stream = True
|
||||
)
|
||||
|
||||
for message in chat_completion:
|
||||
print(message.choices[0].delta.content, end="")
|
||||
print(message.choices[0].delta.content, end = "")
|
||||
```
|
||||
|
||||
!!! note
|
||||
|
||||
@ -36,16 +36,15 @@ pip install vllm litellm
|
||||
```python
|
||||
import litellm
|
||||
|
||||
messages = [{"content": "Hello, how are you?", "role": "user"}]
|
||||
messages = [{ "content": "Hello, how are you?","role": "user"}]
|
||||
|
||||
# hosted_vllm is prefix key word and necessary
|
||||
response = litellm.completion(
|
||||
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
|
||||
messages=messages,
|
||||
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
|
||||
temperature=0.2,
|
||||
max_tokens=80,
|
||||
)
|
||||
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
|
||||
messages=messages,
|
||||
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
|
||||
temperature=0.2,
|
||||
max_tokens=80)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
@ -40,7 +40,7 @@ pip install -U vllm \
|
||||
|
||||
1. Run the script
|
||||
|
||||
```bash
|
||||
```python
|
||||
python retrieval_augmented_generation_with_langchain.py
|
||||
```
|
||||
|
||||
@ -78,6 +78,6 @@ pip install vllm \
|
||||
|
||||
1. Run the script:
|
||||
|
||||
```bash
|
||||
```python
|
||||
python retrieval_augmented_generation_with_llamaindex.py
|
||||
```
|
||||
|
||||
@ -106,11 +106,9 @@ The dispatch code looks like:
|
||||
batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...)
|
||||
runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor)
|
||||
# execution
|
||||
with set_forward_context(
|
||||
...,
|
||||
cudagraph_runtime_mode=runtime_mode,
|
||||
batch_descriptor=batch_descriptor,
|
||||
):
|
||||
with set_forward_context(...,
|
||||
cudagraph_runtime_mode=runtime_mode,
|
||||
batch_descriptor=batch_descriptor):
|
||||
output = self.model(...)
|
||||
```
|
||||
|
||||
@ -204,10 +202,10 @@ from vllm.config import CUDAGraphMode
|
||||
|
||||
compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
|
||||
model = vllm.LLM(
|
||||
model="meta-llama/Llama-3.1-8B-Instruct",
|
||||
dtype="auto",
|
||||
compilation_config=compilation_config,
|
||||
)
|
||||
model="meta-llama/Llama-3.1-8B-Instruct",
|
||||
dtype='auto',
|
||||
compilation_config = compilation_config,
|
||||
)
|
||||
sampling_params = vllm.SamplingParams(
|
||||
temperature=0, # greedy decoding
|
||||
max_tokens=1024,
|
||||
|
||||
@ -34,10 +34,10 @@ To enable the DBO system pass in the `--enable-dbo` argument to your vllm serve
|
||||
* `--dbo-decode-token-threshold` the minimum number of tokens in a decode-only batch required to enable DBO for that batch
|
||||
* `--dbo-prefill-token-threshold` the minimum number of tokens in a batch containing at least one prefill required to enable DBO for that batch
|
||||
|
||||
Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `--all2all-backend` argument must be set to `deepep_low_latency` if your workload is primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests.
|
||||
Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `VLLM_ALL2ALL_BACKEND` environment variable must be set to `deepep_low_latency` if your workload is primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests.
|
||||
|
||||
Below is a command that will spin up a two DP rank server with expert parallelism and DBO enabled.
|
||||
EX: `vllm serve deepseek-ai/DeepSeek-V2-Lite --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo --all2all-backend deepep_low_latency`
|
||||
EX: `VLLM_ALL2ALL_BACKEND=deepep_low_latency vllm serve --model="deepseek-ai/DeepSeek-V2-Lite" --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo`
|
||||
|
||||
Note that there must be at least two GPUs visible in `CUDA_VISIBLE_DEVICES`
|
||||
|
||||
|
||||
@ -9,8 +9,8 @@ When performing an inference with IO Processor plugins, the prompt type is defin
|
||||
IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>):
|
||||
|
||||
```python
|
||||
IOProcessorInput = TypeVar("IOProcessorInput")
|
||||
IOProcessorOutput = TypeVar("IOProcessorOutput")
|
||||
IOProcessorInput = TypeVar('IOProcessorInput')
|
||||
IOProcessorOutput = TypeVar('IOProcessorOutput')
|
||||
|
||||
class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
|
||||
|
||||
@ -21,32 +21,30 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
|
||||
def pre_process(
|
||||
self,
|
||||
prompt: IOProcessorInput,
|
||||
request_id: str | None = None,
|
||||
request_id: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> PromptType | Sequence[PromptType]:
|
||||
) -> Union[PromptType, Sequence[PromptType]]:
|
||||
raise NotImplementedError
|
||||
|
||||
async def pre_process_async(
|
||||
self,
|
||||
prompt: IOProcessorInput,
|
||||
request_id: str | None = None,
|
||||
request_id: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> PromptType | Sequence[PromptType]:
|
||||
) -> Union[PromptType, Sequence[PromptType]]:
|
||||
return self.pre_process(prompt, request_id, **kwargs)
|
||||
|
||||
@abstractmethod
|
||||
def post_process(
|
||||
self,
|
||||
model_output: Sequence[PoolingRequestOutput],
|
||||
request_id: str | None = None,
|
||||
**kwargs,
|
||||
) -> IOProcessorOutput:
|
||||
def post_process(self,
|
||||
model_output: Sequence[PoolingRequestOutput],
|
||||
request_id: Optional[str] = None,
|
||||
**kwargs) -> IOProcessorOutput:
|
||||
raise NotImplementedError
|
||||
|
||||
async def post_process_async(
|
||||
self,
|
||||
model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
|
||||
request_id: str | None = None,
|
||||
request_id: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> IOProcessorOutput:
|
||||
collected_output = [item async for i, item in model_output]
|
||||
@ -58,8 +56,7 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
|
||||
|
||||
@abstractmethod
|
||||
def output_to_response(
|
||||
self, plugin_output: IOProcessorOutput
|
||||
) -> IOProcessorResponse:
|
||||
self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
|
||||
raise NotImplementedError
|
||||
```
|
||||
|
||||
|
||||
@ -478,17 +478,15 @@ us with:
|
||||
|
||||
```python
|
||||
if seq_group.is_finished():
|
||||
if (
|
||||
seq_group.metrics.first_scheduled_time is not None
|
||||
and seq_group.metrics.first_token_time is not None
|
||||
):
|
||||
if (seq_group.metrics.first_scheduled_time is not None and
|
||||
seq_group.metrics.first_token_time is not None):
|
||||
time_queue_requests.append(
|
||||
seq_group.metrics.first_scheduled_time -
|
||||
seq_group.metrics.arrival_time
|
||||
)
|
||||
seq_group.metrics.arrival_time)
|
||||
...
|
||||
if seq_group.metrics.time_in_queue is not None:
|
||||
time_in_queue_requests.append(seq_group.metrics.time_in_queue)
|
||||
time_in_queue_requests.append(
|
||||
seq_group.metrics.time_in_queue)
|
||||
```
|
||||
|
||||
This seems duplicative, and one of them should be removed. The latter
|
||||
|
||||
@ -112,8 +112,8 @@ class KVCacheBlock:
|
||||
ref_cnt: int
|
||||
|
||||
# The pointers to form a doubly linked list for the free queue.
|
||||
prev_free_block: "KVCacheBlock | None" = None
|
||||
next_free_block: "KVCacheBlock | None" = None
|
||||
prev_free_block: Optional["KVCacheBlock"] = None
|
||||
next_free_block: Optional["KVCacheBlock"] = None
|
||||
```
|
||||
|
||||
There are two design points to highlight:
|
||||
|
||||
@ -32,7 +32,7 @@ the third parameter is the path to the LoRA adapter.
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
max_tokens=256,
|
||||
stop=["[/assistant]"],
|
||||
stop=["[/assistant]"]
|
||||
)
|
||||
|
||||
prompts = [
|
||||
@ -43,7 +43,7 @@ the third parameter is the path to the LoRA adapter.
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest("sql_adapter", 1, sql_lora_path),
|
||||
lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
|
||||
)
|
||||
```
|
||||
|
||||
@ -197,7 +197,7 @@ Alternatively, follow these example steps to implement your own plugin:
|
||||
lora_request = LoRARequest(
|
||||
lora_name=lora_name,
|
||||
lora_path=local_path,
|
||||
lora_int_id=abs(hash(lora_name)),
|
||||
lora_int_id=abs(hash(lora_name))
|
||||
)
|
||||
return lora_request
|
||||
```
|
||||
@ -296,7 +296,10 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
|
||||
if has_audio:
|
||||
question = f"<|audio|>{question}"
|
||||
chat = [
|
||||
{"role": "user", "content": question},
|
||||
{
|
||||
"role": "user",
|
||||
"content": question
|
||||
}
|
||||
]
|
||||
return tokenizer.apply_chat_template(chat, tokenize=False)
|
||||
|
||||
|
||||
@ -154,7 +154,9 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
|
||||
|
||||
outputs = llm.generate({
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {"image": [image1, image2]},
|
||||
"multi_modal_data": {
|
||||
"image": [image1, image2]
|
||||
},
|
||||
})
|
||||
|
||||
for o in outputs:
|
||||
@ -181,24 +183,21 @@ conversation = [
|
||||
{"role": "assistant", "content": "Hello! How can I assist you today?"},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url},
|
||||
},
|
||||
{
|
||||
"type": "image_pil",
|
||||
"image_pil": image_pil,
|
||||
},
|
||||
{
|
||||
"type": "image_embeds",
|
||||
"image_embeds": image_embeds,
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in these images?",
|
||||
},
|
||||
],
|
||||
"content": [{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
}
|
||||
},{
|
||||
"type": "image_pil",
|
||||
"image_pil": image_pil
|
||||
}, {
|
||||
"type": "image_embeds",
|
||||
"image_embeds": image_embeds
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": "What's in these images?"
|
||||
}],
|
||||
},
|
||||
]
|
||||
|
||||
@ -225,10 +224,7 @@ Multi-image input can be extended to perform video captioning. We show this with
|
||||
message = {
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Describe this set of frames. Consider the frames to be a part of the same video.",
|
||||
},
|
||||
{"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
|
||||
],
|
||||
}
|
||||
for i in range(len(video_frames)):
|
||||
@ -259,13 +255,13 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f
|
||||
# Custom black background for dark theme
|
||||
llm = LLM(
|
||||
model="llava-hf/llava-1.5-7b-hf",
|
||||
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}},
|
||||
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
|
||||
)
|
||||
|
||||
# Custom brand color background (e.g., blue)
|
||||
llm = LLM(
|
||||
model="llava-hf/llava-1.5-7b-hf",
|
||||
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}},
|
||||
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
|
||||
)
|
||||
```
|
||||
|
||||
@ -298,23 +294,20 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown
|
||||
limit_mm_per_prompt={"video": 1},
|
||||
)
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=1024)
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=1024,
|
||||
)
|
||||
|
||||
video_messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant.",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": [
|
||||
{"type": "text", "text": "describe this video."},
|
||||
{
|
||||
"type": "video",
|
||||
"video": video_path,
|
||||
"total_pixels": 20480 * 28 * 28,
|
||||
"min_pixels": 16 * 28 * 28,
|
||||
},
|
||||
"min_pixels": 16 * 28 * 28
|
||||
}
|
||||
]
|
||||
},
|
||||
]
|
||||
@ -472,24 +465,21 @@ Then, you can use the OpenAI client as follows:
|
||||
|
||||
chat_response = client.chat.completions.create(
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
# NOTE: The prompt formatting with the image token `<image>` is not needed
|
||||
# since the prompt will be processed automatically by the API server.
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What’s in this image?",
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": [
|
||||
# NOTE: The prompt formatting with the image token `<image>` is not needed
|
||||
# since the prompt will be processed automatically by the API server.
|
||||
{"type": "text", "text": "What’s in this image?"},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url},
|
||||
"uuid": image_url, # Optional
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
"uuid": image_url # Optional
|
||||
},
|
||||
],
|
||||
}],
|
||||
)
|
||||
print("Chat completion output:", chat_response.choices[0].message.content)
|
||||
|
||||
@ -499,27 +489,26 @@ Then, you can use the OpenAI client as follows:
|
||||
|
||||
chat_response = client.chat.completions.create(
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What are the animals in these images?",
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "What are the animals in these images?"},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url_duck
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url_duck},
|
||||
"uuid": image_url_duck, # Optional
|
||||
"uuid": image_url_duck # Optional
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url_lion
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url_lion},
|
||||
"uuid": image_url_lion, # Optional
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
"uuid": image_url_lion # Optional
|
||||
},
|
||||
],
|
||||
}],
|
||||
)
|
||||
print("Chat completion output:", chat_response.choices[0].message.content)
|
||||
```
|
||||
@ -571,22 +560,23 @@ Then, you can use the OpenAI client as follows:
|
||||
|
||||
## Use video url in the payload
|
||||
chat_completion_from_url = client.chat.completions.create(
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this video?",
|
||||
messages=[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this video?"
|
||||
},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {
|
||||
"url": video_url
|
||||
},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {"url": video_url},
|
||||
"uuid": video_url, # Optional
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
"uuid": video_url # Optional
|
||||
},
|
||||
],
|
||||
}],
|
||||
model=model,
|
||||
max_completion_tokens=64,
|
||||
)
|
||||
@ -662,25 +652,23 @@ Then, you can use the OpenAI client as follows:
|
||||
audio_base64 = encode_base64_content_from_url(audio_url)
|
||||
|
||||
chat_completion_from_base64 = client.chat.completions.create(
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this audio?",
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this audio?"
|
||||
},
|
||||
{
|
||||
"type": "input_audio",
|
||||
"input_audio": {
|
||||
"data": audio_base64,
|
||||
"format": "wav"
|
||||
},
|
||||
{
|
||||
"type": "input_audio",
|
||||
"input_audio": {
|
||||
"data": audio_base64,
|
||||
"format": "wav",
|
||||
},
|
||||
"uuid": audio_url, # Optional
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
"uuid": audio_url # Optional
|
||||
},
|
||||
],
|
||||
}],
|
||||
model=model,
|
||||
max_completion_tokens=64,
|
||||
)
|
||||
@ -695,22 +683,22 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
|
||||
|
||||
```python
|
||||
chat_completion_from_url = client.chat.completions.create(
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this audio?",
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this audio?"
|
||||
},
|
||||
{
|
||||
"type": "audio_url",
|
||||
"audio_url": {
|
||||
"url": audio_url
|
||||
},
|
||||
{
|
||||
"type": "audio_url",
|
||||
"audio_url": {"url": audio_url},
|
||||
"uuid": audio_url, # Optional
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
"uuid": audio_url # Optional
|
||||
},
|
||||
],
|
||||
}],
|
||||
model=model,
|
||||
max_completion_tokens=64,
|
||||
)
|
||||
@ -759,48 +747,43 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
|
||||
|
||||
# Basic usage - this is equivalent to the LLaVA example for offline inference
|
||||
model = "llava-hf/llava-1.5-7b-hf"
|
||||
embeds = {
|
||||
embeds = {
|
||||
"type": "image_embeds",
|
||||
"image_embeds": f"{base64_image_embedding}",
|
||||
"uuid": image_url, # Optional
|
||||
"uuid": image_url # Optional
|
||||
}
|
||||
|
||||
# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
|
||||
model = "Qwen/Qwen2-VL-2B-Instruct"
|
||||
embeds = {
|
||||
embeds = {
|
||||
"type": "image_embeds",
|
||||
"image_embeds": {
|
||||
"image_embeds": f"{base64_image_embedding}", # Required
|
||||
"image_grid_thw": f"{base64_image_grid_thw}", # Required by Qwen/Qwen2-VL-2B-Instruct
|
||||
"image_embeds": f"{base64_image_embedding}" , # Required
|
||||
"image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct
|
||||
},
|
||||
"uuid": image_url, # Optional
|
||||
"uuid": image_url # Optional
|
||||
}
|
||||
model = "openbmb/MiniCPM-V-2_6"
|
||||
embeds = {
|
||||
embeds = {
|
||||
"type": "image_embeds",
|
||||
"image_embeds": {
|
||||
"image_embeds": f"{base64_image_embedding}", # Required
|
||||
"image_sizes": f"{base64_image_sizes}", # Required by openbmb/MiniCPM-V-2_6
|
||||
"image_embeds": f"{base64_image_embedding}" , # Required
|
||||
"image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6
|
||||
},
|
||||
"uuid": image_url, # Optional
|
||||
"uuid": image_url # Optional
|
||||
}
|
||||
chat_completion = client.chat.completions.create(
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant.",
|
||||
"type": "text",
|
||||
"text": "What's in this image?",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this image?",
|
||||
},
|
||||
embeds,
|
||||
],
|
||||
},
|
||||
],
|
||||
embeds,
|
||||
],
|
||||
},
|
||||
],
|
||||
model=model,
|
||||
)
|
||||
```
|
||||
@ -819,22 +802,22 @@ For Online Serving, you can also skip sending media if you expect cache hits wit
|
||||
{
|
||||
"type": "image_embeds",
|
||||
"image_embeds": None,
|
||||
"uuid": image_uuid,
|
||||
"uuid": image_uuid
|
||||
},
|
||||
|
||||
# input_audio:
|
||||
{
|
||||
"type": "input_audio",
|
||||
"input_audio": None,
|
||||
"uuid": audio_uuid,
|
||||
"uuid": audio_uuid
|
||||
},
|
||||
|
||||
# PIL Image:
|
||||
{
|
||||
"type": "image_pil",
|
||||
"image_pil": None,
|
||||
"uuid": image_uuid,
|
||||
},
|
||||
"image_pil": None,
|
||||
"uuid": image_uuid
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
|
||||
@ -156,16 +156,6 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
|
||||
NixlConnector currently does not distinguish `kv_role`; the actual prefiller/decoder roles are determined by the upper-level proxy (e.g., `toy_proxy_server.py` using `--prefiller-hosts` and `--decoder-hosts`).
|
||||
Therefore, `kv_role` in `--kv-transfer-config` is effectively a placeholder and does not affect NixlConnector's behavior.
|
||||
|
||||
## Experimental Feature
|
||||
|
||||
### Heterogeneous KV Layout support
|
||||
|
||||
Supported use case: prefill with 'HND' and decode with 'NHD', enabled with the following experimental configuration:
|
||||
|
||||
```bash
|
||||
--kv-transfer-config '{..., "enable_permute_local_kv":"True"}'
|
||||
```
|
||||
|
||||
## Example Scripts/Code
|
||||
|
||||
Refer to these example scripts in the vLLM repository:
|
||||
|
||||
@ -11,8 +11,6 @@ vLLM currently supports the following reasoning models:
|
||||
| Model Series | Parser Name | Structured Output Support | Tool Calling |
|
||||
|--------------|-------------|------------------|-------------|
|
||||
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
|
||||
| [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
|
||||
| [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
|
||||
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
|
||||
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
|
||||
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ |
|
||||
@ -117,11 +115,9 @@ OpenAI Python client library does not officially support `reasoning_content` att
|
||||
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
|
||||
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
|
||||
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
|
||||
stream = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=messages,
|
||||
stream=True,
|
||||
)
|
||||
stream = client.chat.completions.create(model=model,
|
||||
messages=messages,
|
||||
stream=True)
|
||||
|
||||
print("client: Start streaming chat completions...")
|
||||
printed_reasoning_content = False
|
||||
@ -161,29 +157,27 @@ The reasoning content is also available when both tool calling and the reasoning
|
||||
|
||||
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
|
||||
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
|
||||
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
|
||||
},
|
||||
"required": ["location", "unit"],
|
||||
}
|
||||
},
|
||||
tools = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
|
||||
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
|
||||
},
|
||||
"required": ["location", "unit"]
|
||||
}
|
||||
}
|
||||
]
|
||||
}]
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model=client.models.list().data[0].id,
|
||||
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
|
||||
tools=tools,
|
||||
tool_choice="auto",
|
||||
tool_choice="auto"
|
||||
)
|
||||
|
||||
print(response)
|
||||
@ -229,7 +223,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
|
||||
previous_token_ids: Sequence[int],
|
||||
current_token_ids: Sequence[int],
|
||||
delta_token_ids: Sequence[int],
|
||||
) -> DeltaMessage | None:
|
||||
) -> Union[DeltaMessage, None]:
|
||||
"""
|
||||
Instance method that should be implemented for extracting reasoning
|
||||
from an incomplete response; for use when handling reasoning calls and
|
||||
@ -239,10 +233,8 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
|
||||
"""
|
||||
|
||||
def extract_reasoning_content(
|
||||
self,
|
||||
model_output: str,
|
||||
request: ChatCompletionRequest | ResponsesRequest,
|
||||
) -> tuple[str | None, str | None]:
|
||||
self, model_output: str, request: ChatCompletionRequest
|
||||
) -> tuple[Optional[str], Optional[str]]:
|
||||
"""
|
||||
Extract reasoning content from a complete model-generated string.
|
||||
|
||||
@ -280,10 +272,10 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner
|
||||
|
||||
@classmethod
|
||||
def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
|
||||
return cls(
|
||||
start_token_id=tokenizer.encode("<think>", add_special_tokens=False)[0],
|
||||
end_token_id=tokenizer.encode("</think>", add_special_tokens=False)[0],
|
||||
)
|
||||
return cls(start_token_id=tokenizer.encode(
|
||||
"<think>", add_special_tokens=False)[0],
|
||||
end_token_id=tokenizer.encode("</think>",
|
||||
add_special_tokens=False)[0])
|
||||
|
||||
def is_reasoning_end(self, input_ids: list[int]) -> bool:
|
||||
return self.end_token_id in input_ids
|
||||
|
||||
@ -27,29 +27,27 @@ Next, make a request that triggers the model to use the available tools:
|
||||
return f"Getting the weather for {location} in {unit}..."
|
||||
tool_functions = {"get_weather": get_weather}
|
||||
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
|
||||
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
|
||||
},
|
||||
"required": ["location", "unit"],
|
||||
tools = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
|
||||
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
"required": ["location", "unit"]
|
||||
}
|
||||
}
|
||||
}]
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model=client.models.list().data[0].id,
|
||||
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
|
||||
tools=tools,
|
||||
tool_choice="auto",
|
||||
tool_choice="auto"
|
||||
)
|
||||
|
||||
tool_call = response.choices[0].message.tool_calls[0].function
|
||||
@ -404,7 +402,8 @@ Here is a summary of a plugin file:
|
||||
|
||||
# adjust request. e.g.: set skip special tokens
|
||||
# to False for tool call output.
|
||||
def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
|
||||
def adjust_request(
|
||||
self, request: ChatCompletionRequest) -> ChatCompletionRequest:
|
||||
return request
|
||||
|
||||
# implement the tool call parse for stream call
|
||||
@ -417,7 +416,7 @@ Here is a summary of a plugin file:
|
||||
current_token_ids: Sequence[int],
|
||||
delta_token_ids: Sequence[int],
|
||||
request: ChatCompletionRequest,
|
||||
) -> DeltaMessage | None:
|
||||
) -> Union[DeltaMessage, None]:
|
||||
return delta
|
||||
|
||||
# implement the tool parse for non-stream call
|
||||
|
||||
@ -23,7 +23,7 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
|
||||
# --8<-- [end:pre-built-wheels]
|
||||
# --8<-- [start:build-wheel-from-source]
|
||||
|
||||
--8<-- "docs/getting_started/installation/cpu/build.inc.md:extra-information"
|
||||
--8<-- "docs/getting_started/installation/cpu/build.inc.md"
|
||||
|
||||
Testing has been conducted on AWS Graviton3 instances for compatibility.
|
||||
|
||||
|
||||
@ -1,5 +1,3 @@
|
||||
# --8<-- [start:extra-information]
|
||||
|
||||
First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:
|
||||
|
||||
```bash
|
||||
@ -41,4 +39,7 @@ If you want to develop vLLM, install it in editable mode instead.
|
||||
VLLM_TARGET_DEVICE=cpu python setup.py develop
|
||||
```
|
||||
|
||||
!!! note
|
||||
If you are building vLLM from source and not using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM.
|
||||
|
||||
# --8<-- [end:extra-information]
|
||||
|
||||
@ -335,108 +335,108 @@ th {
|
||||
}
|
||||
</style>
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|
|
||||
| `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ |
|
||||
| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ |
|
||||
| `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ |
|
||||
| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ |
|
||||
| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ |
|
||||
| `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | ✅︎ | ✅︎ |
|
||||
| `BailingMoeV2ForCausalLM` | Ling | `inclusionAI/Ling-mini-2.0`, etc. | ✅︎ | ✅︎ |
|
||||
| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ |
|
||||
| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ |
|
||||
| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ |
|
||||
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ |
|
||||
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ |
|
||||
| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ |
|
||||
| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | ✅︎ | ✅︎ |
|
||||
| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ |
|
||||
| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ |
|
||||
| `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ |
|
||||
| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ |
|
||||
| `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ |
|
||||
| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ |
|
||||
| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `Exaone4ForCausalLM` | EXAONE-4 | `LGAI-EXAONE/EXAONE-4.0-32B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ |
|
||||
| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ |
|
||||
| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ |
|
||||
| `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `FlexOlmoForCausalLM` | FlexOlmo | `allenai/FlexOlmo-7x7B-1T`, `allenai/FlexOlmo-7x7B-1T-RT`, etc. | | ✅︎ |
|
||||
| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ |
|
||||
| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ |
|
||||
| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ |
|
||||
| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
|
||||
| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ |
|
||||
| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ |
|
||||
| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ |
|
||||
| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ |
|
||||
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ |
|
||||
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ |
|
||||
| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ |
|
||||
| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | | ✅︎ |
|
||||
| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ |
|
||||
| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ |
|
||||
| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ |
|
||||
| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ |
|
||||
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
|
||||
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ |
|
||||
| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | ✅︎ |
|
||||
| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ |
|
||||
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | |
|
||||
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ |
|
||||
| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ |
|
||||
| `Lfm2ForCausalLM` | LFM2 | `LiquidAI/LFM2-1.2B`, `LiquidAI/LFM2-700M`, `LiquidAI/LFM2-350M`, etc. | ✅︎ | ✅︎ |
|
||||
| `Lfm2MoeForCausalLM` | LFM2MoE | `LiquidAI/LFM2-8B-A1B-preview`, etc. | ✅︎ | ✅︎ |
|
||||
| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ |
|
||||
| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ |
|
||||
| `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ |
|
||||
| `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
|
||||
| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
|
||||
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ |
|
||||
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ |
|
||||
| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ |
|
||||
| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ |
|
||||
| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ |
|
||||
| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ |
|
||||
| `OLMo3ForCausalLM` | OLMo3 | TBA | ✅︎ | ✅︎ |
|
||||
| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ |
|
||||
| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ |
|
||||
| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ |
|
||||
| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ |
|
||||
| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ |
|
||||
| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ |
|
||||
| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | |
|
||||
| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ |
|
||||
| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ |
|
||||
| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ |
|
||||
| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | |
|
||||
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | |
|
||||
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | |
|
||||
| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ |
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
|
||||
| `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `BailingMoeV2ForCausalLM` | Ling | `inclusionAI/Ling-mini-2.0`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | ✅︎ |
|
||||
| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ |
|
||||
| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ | ✅︎ |
|
||||
| `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ |
|
||||
| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Exaone4ForCausalLM` | EXAONE-4 | `LGAI-EXAONE/EXAONE-4.0-32B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `FlexOlmoForCausalLM` | FlexOlmo | `allenai/FlexOlmo-7x7B-1T`, `allenai/FlexOlmo-7x7B-1T-RT`, etc. | | ✅︎ | ✅︎ |
|
||||
| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
|
||||
| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | ✅︎ |
|
||||
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ |
|
||||
| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | | ✅︎ | ✅︎ |
|
||||
| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ |
|
||||
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | ✅︎ |
|
||||
| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Lfm2ForCausalLM` | LFM2 | `LiquidAI/LFM2-1.2B`, `LiquidAI/LFM2-700M`, `LiquidAI/LFM2-350M`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Lfm2MoeForCausalLM` | LFM2MoE | `LiquidAI/LFM2-8B-A1B-preview`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ | ✅︎ |
|
||||
| `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `OLMo3ForCausalLM` | OLMo3 | TBA | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ |
|
||||
| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | ✅︎ |
|
||||
| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | ✅︎ |
|
||||
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | ✅︎ |
|
||||
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | ✅︎ |
|
||||
| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ |✅︎ | ✅︎ |
|
||||
|
||||
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [open an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|
|
||||
| `SmolLM3ForCausalLM` | SmolLM3 | `HuggingFaceTB/SmolLM3-3B` | ✅︎ | ✅︎ |
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
|
||||
| `SmolLM3ForCausalLM` | SmolLM3 | `HuggingFaceTB/SmolLM3-3B` | ✅︎ | ✅︎ | ✅︎ |
|
||||
|
||||
!!! note
|
||||
Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.
|
||||
@ -453,21 +453,21 @@ See [this page](./pooling_models.md) for more information on how to use pooling
|
||||
|
||||
These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) API.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|
|
||||
| `BertModel`<sup>C</sup> | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | |
|
||||
| `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ |
|
||||
| `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ |
|
||||
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
|
||||
| `GteModel`<sup>C</sup> | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | |
|
||||
| `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | |
|
||||
| `ModernBertModel`<sup>C</sup> | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | |
|
||||
| `NomicBertModel`<sup>C</sup> | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | |
|
||||
| `LlamaModel`<sup>C</sup>, `LlamaForCausalLM`<sup>C</sup>, `MistralModel`<sup>C</sup>, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2Model`<sup>C</sup>, `Qwen2ForCausalLM`<sup>C</sup> | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3Model`<sup>C</sup>, `Qwen3ForCausalLM`<sup>C</sup> | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ |
|
||||
| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
|
||||
| `BertModel`<sup>C</sup> | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | ✅︎ |
|
||||
| `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GteModel`<sup>C</sup> | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | ✅︎ |
|
||||
| `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | ✅︎ |
|
||||
| `ModernBertModel`<sup>C</sup> | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | ✅︎ |
|
||||
| `NomicBertModel`<sup>C</sup> | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | ✅︎ |
|
||||
| `LlamaModel`<sup>C</sup>, `LlamaForCausalLM`<sup>C</sup>, `MistralModel`<sup>C</sup>, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2Model`<sup>C</sup>, `Qwen2ForCausalLM`<sup>C</sup> | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen3Model`<sup>C</sup>, `Qwen3ForCausalLM`<sup>C</sup> | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | ✅︎ |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
@ -494,11 +494,11 @@ of the whole prompt are extracted from the normalized hidden state corresponding
|
||||
|
||||
These models primarily support the [`LLM.classify`](./pooling_models.md#llmclassify) API.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|
|
||||
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ |
|
||||
| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
|
||||
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
@ -511,16 +511,16 @@ If your model is not in the above list, we will try to automatically convert the
|
||||
Cross-encoder and reranker models are a subset of classification models that accept two prompts as input.
|
||||
These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|
|
||||
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | |
|
||||
| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ |
|
||||
| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | | |
|
||||
| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ |
|
||||
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | |
|
||||
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
|
||||
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | ✅︎ |
|
||||
| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | | | ✅︎ |
|
||||
| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | ✅︎ |
|
||||
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | ✅︎ |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
@ -553,13 +553,13 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
|
||||
|
||||
These models primarily support the [`LLM.reward`](./pooling_models.md#llmreward) API.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|
|
||||
| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ |
|
||||
| `LlamaForCausalLM`<sup>C</sup> | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
|
||||
| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `LlamaForCausalLM`<sup>C</sup> | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into a reward model via `--convert reward`. ([details](./pooling_models.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
@ -575,13 +575,13 @@ If your model is not in the above list, we will try to automatically convert the
|
||||
|
||||
These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) API.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|-------------------|-----------------------------|-----------------------------------------|
|
||||
| `BertForTokenClassification` | BERT-based | `boltuix/NeuroBERT-NER` (see note), etc. | | |
|
||||
| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | |
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------|
|
||||
| `BertForTokenClassification` | BERT-based | `boltuix/NeuroBERT-NER` (see note), etc. | | | ✅︎ |
|
||||
| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | | ✅︎ |
|
||||
|
||||
!!! note
|
||||
For Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py> and <gh-file:examples/online_serving/pooling/ner_client.py>.
|
||||
For Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py> and <gh-file:examples/online_serving/pooling/ner.py>.
|
||||
|
||||
[](){ #supported-mm-models }
|
||||
|
||||
@ -604,6 +604,29 @@ On the other hand, modalities separated by `/` are mutually exclusive.
|
||||
|
||||
See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inputs to the model.
|
||||
|
||||
!!! important
|
||||
**To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference)
|
||||
or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt:
|
||||
|
||||
Offline inference:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2-VL-7B-Instruct",
|
||||
limit_mm_per_prompt={"image": 4},
|
||||
)
|
||||
```
|
||||
|
||||
Online serving:
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt '{"image":4}'
|
||||
```
|
||||
|
||||
**This is no longer required if you are using vLLM V1.**
|
||||
|
||||
!!! tip
|
||||
For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g., `--limit-mm-per-prompt '{"image":0}'`) so that their multimodal modules are not loaded, freeing up more GPU memory for the KV cache.
|
||||
|
||||
@ -640,70 +663,70 @@ See [this page](generative_models.md) for more information on how to use generat
|
||||
|
||||
These models primarily support the [`LLM.generate`](./generative_models.md#llmgenerate) API. Chat/Instruct models additionally support the [`LLM.chat`](./generative_models.md#llmchat) API.
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|--------|-------------------|----------------------|---------------------------|
|
||||
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
|
||||
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ |
|
||||
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ |
|
||||
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ |
|
||||
| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ |
|
||||
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ |
|
||||
| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ |
|
||||
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ |
|
||||
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
|
||||
| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
|
||||
| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ |
|
||||
| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ |
|
||||
| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ |
|
||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
|
||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
|
||||
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
|
||||
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ |
|
||||
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ |
|
||||
| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ |
|
||||
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ |
|
||||
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ |
|
||||
| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ |
|
||||
| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ |
|
||||
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ |
|
||||
| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ |
|
||||
| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ |
|
||||
| `MiDashengLMModel` | MiDashengLM | T + A<sup>+</sup> | `mispeech/midashenglm-7b` | | ✅︎ |
|
||||
| `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | |
|
||||
| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ |
|
||||
| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ |
|
||||
| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ |
|
||||
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
|
||||
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
|
||||
| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
|
||||
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ |
|
||||
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
|
||||
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ |
|
||||
| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ |
|
||||
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ |
|
||||
| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ |
|
||||
| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, `Qwen/Qwen3-Omni-30B-A3B-Thinking` | ✅︎ | ✅︎ |
|
||||
| `RForConditionalGeneration` | R-VL-4B | T + I<sup>E+</sup> | `YannQi/R-4B` | | ✅︎ |
|
||||
| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ |
|
||||
| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `HuggingFaceTB/SmolVLM2-2.2B-Instruct` | ✅︎ | |
|
||||
| `Step3VLForConditionalGeneration` | Step3-VL | T + I<sup>+</sup> | `stepfun-ai/step3` | | ✅︎ |
|
||||
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-research/Tarsier-7b`, `omni-research/Tarsier-34b` | | ✅︎ |
|
||||
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ |
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------|
|
||||
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | | ✅︎ |
|
||||
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ |
|
||||
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | ✅︎ |
|
||||
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
|
||||
| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
|
||||
| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
|
||||
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | ✅︎ |
|
||||
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ |
|
||||
| `MiDashengLMModel` | MiDashengLM | T + A<sup>+</sup> | `mispeech/midashenglm-7b` | | ✅︎ | ✅︎ |
|
||||
| `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ |
|
||||
| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | | ✅︎ |
|
||||
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
|
||||
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ |
|
||||
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
|
||||
| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, `Qwen/Qwen3-Omni-30B-A3B-Thinking` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `RForConditionalGeneration` | R-VL-4B | T + I<sup>E+</sup> | `YannQi/R-4B` | | ✅︎ | ✅︎ |
|
||||
| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
|
||||
| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `HuggingFaceTB/SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
|
||||
| `Step3VLForConditionalGeneration` | Step3-VL | T + I<sup>+</sup> | `stepfun-ai/step3` | | ✅︎ | ✅︎ |
|
||||
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-research/Tarsier-7b`, `omni-research/Tarsier-34b` | | ✅︎ | ✅︎ |
|
||||
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ |
|
||||
|
||||
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|
|
||||
| `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ |
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------|
|
||||
| `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ | ✅︎ |
|
||||
|
||||
<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
|
||||
• For example, to use DeepSeek-VL2 series models:
|
||||
@ -788,11 +811,11 @@ Some models are supported only via the [Transformers backend](#transformers). Th
|
||||
|
||||
Speech2Text models trained specifically for Automatic Speech Recognition.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|
|
||||
| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | |
|
||||
| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ |
|
||||
| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
|
||||
| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | ✅︎ |
|
||||
| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
|
||||
|
||||
### Pooling Models
|
||||
|
||||
@ -807,12 +830,12 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
|
||||
|
||||
The following table lists those that are tested in vLLM.
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|--------|-------------------|----------------------|---------------------------|
|
||||
| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
|
||||
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
|
||||
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
|
||||
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* |
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------|
|
||||
| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | | ✅︎ |
|
||||
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ | ✅︎ |
|
||||
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ | ✅︎ |
|
||||
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
@ -824,9 +847,9 @@ The following table lists those that are tested in vLLM.
|
||||
Cross-encoder and reranker models are a subset of classification models that accept two prompts as input.
|
||||
These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API.
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|--------|-------------------|----------------------|---------------------------|
|
||||
| `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ |
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------|
|
||||
| `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | | | ✅︎ |
|
||||
|
||||
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
|
||||
@ -14,16 +14,13 @@ Before using EP, you need to install the necessary dependencies. We are actively
|
||||
|
||||
### Backend Selection Guide
|
||||
|
||||
vLLM provides multiple communication backends for EP. Use `--all2all-backend` to select one:
|
||||
vLLM provides three communication backends for EP:
|
||||
|
||||
| Backend | Use Case | Features | Best For |
|
||||
|---------|----------|----------|----------|
|
||||
| `allgather_reducescatter` | Default backend | Standard all2all using allgather/reducescatter primitives | General purpose, works with any EP+DP configuration |
|
||||
| `pplx` | Single node | Chunked prefill support, efficient intra-node communication | Single-node deployments, development |
|
||||
| `deepep_high_throughput` | Multi-node prefill | Grouped GEMM with continuous layout, optimized for prefill | Prefill-dominated workloads, high-throughput scenarios |
|
||||
| `deepep_low_latency` | Multi-node decode | CUDA graph support, masked layout, optimized for decode | Decode-dominated workloads, low-latency scenarios |
|
||||
| `flashinfer_all2allv` | MNNVL systems | FlashInfer alltoallv kernels for multi-node NVLink | Systems with NVLink across nodes |
|
||||
| `naive` | Testing/debugging | Simple broadcast-based implementation | Debugging, not recommended for production |
|
||||
| `pplx` | Single node | Chunked prefill support | Development, best for intra-node deployments |
|
||||
| `deepep_high_throughput` | Multi-node prefill | Grouped GEMM with continuous layout | High-throughput scenarios, prefill-dominated workloads |
|
||||
| `deepep_low_latency` | Multi-node decode | CUDA graph support, masked layout | Low-latency scenarios, decode-dominated workloads |
|
||||
|
||||
## Single Node Deployment
|
||||
|
||||
@ -50,11 +47,11 @@ The following command serves a `DeepSeek-V3-0324` model with 1-way tensor parall
|
||||
|
||||
```bash
|
||||
# Single node EP deployment with pplx backend
|
||||
vllm serve deepseek-ai/DeepSeek-V3-0324 \
|
||||
--tensor-parallel-size 1 \ # Tensor parallelism across 1 GPU
|
||||
VLLM_ALL2ALL_BACKEND=pplx VLLM_USE_DEEP_GEMM=1 \
|
||||
vllm serve deepseek-ai/DeepSeek-V3-0324 \
|
||||
--tensor-parallel-size 1 \ # Tensor parallelism across 1 GPU
|
||||
--data-parallel-size 8 \ # Data parallelism across 8 processes
|
||||
--enable-expert-parallel \ # Enable expert parallelism
|
||||
--all2all-backend pplx # Use pplx communication backend
|
||||
--enable-expert-parallel # Enable expert parallelism
|
||||
```
|
||||
|
||||
## Multi-Node Deployment
|
||||
@ -73,8 +70,8 @@ The following example deploys `DeepSeek-V3-0324` across 2 nodes using `deepep_lo
|
||||
|
||||
```bash
|
||||
# Node 1 (Primary - handles incoming requests)
|
||||
vllm serve deepseek-ai/DeepSeek-V3-0324 \
|
||||
--all2all-backend deepep_low_latency \
|
||||
VLLM_ALL2ALL_BACKEND=deepep_low_latency VLLM_USE_DEEP_GEMM=1 \
|
||||
vllm serve deepseek-ai/DeepSeek-V3-0324 \
|
||||
--tensor-parallel-size 1 \ # TP size per node
|
||||
--enable-expert-parallel \ # Enable EP
|
||||
--data-parallel-size 16 \ # Total DP size across all nodes
|
||||
@ -84,8 +81,8 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \
|
||||
--api-server-count=8 # Number of API servers for load handling (scaling this out to the total number of ranks is recommended)
|
||||
|
||||
# Node 2 (Secondary - headless mode, no API server)
|
||||
vllm serve deepseek-ai/DeepSeek-V3-0324 \
|
||||
--all2all-backend deepep_low_latency \
|
||||
VLLM_ALL2ALL_BACKEND=deepep_low_latency VLLM_USE_DEEP_GEMM=1 \
|
||||
vllm serve deepseek-ai/DeepSeek-V3-0324 \
|
||||
--tensor-parallel-size 1 \ # TP size per node
|
||||
--enable-expert-parallel \ # Enable EP
|
||||
--data-parallel-size 16 \ # Total DP size across all nodes
|
||||
@ -172,12 +169,11 @@ Single node deployment with EPLB enabled:
|
||||
|
||||
```bash
|
||||
# Single node with EPLB load balancing
|
||||
vllm serve deepseek-ai/DeepSeek-V3-0324 \
|
||||
--tensor-parallel-size 1 \ # Tensor parallelism
|
||||
--data-parallel-size 8 \ # Data parallelism
|
||||
--enable-expert-parallel \ # Enable EP
|
||||
--all2all-backend pplx \ # Use pplx communication backend
|
||||
--enable-eplb \ # Enable load balancer
|
||||
VLLM_ALL2ALL_BACKEND=pplx VLLM_USE_DEEP_GEMM=1 vllm serve deepseek-ai/DeepSeek-V3-0324 \
|
||||
--tensor-parallel-size 1 \ # Tensor parallelism
|
||||
--data-parallel-size 8 \ # Data parallelism
|
||||
--enable-expert-parallel \ # Enable EP
|
||||
--enable-eplb \ # Enable load balancer
|
||||
--eplb-config '{"window_size":1000,"step_interval":3000,"num_redundant_experts":2,"log_balancedness":true}'
|
||||
```
|
||||
|
||||
|
||||
@ -88,6 +88,12 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
|
||||
| **Mamba Models** | <nobr>🟢 (Mamba-2), 🟢 (Mamba-1)</nobr> |
|
||||
| **Multimodal Models** | <nobr>🟢 Functional</nobr> |
|
||||
|
||||
vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol.
|
||||
|
||||
!!! tip
|
||||
|
||||
This corresponds to the V1 column in our [list of supported models](../models/supported_models.md).
|
||||
|
||||
See below for the status of models that are not yet supported or have more features planned in V1.
|
||||
|
||||
#### Embedding Models
|
||||
|
||||
@ -1,13 +1,11 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import asyncio
|
||||
import threading
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.metrics.loggers import AggregatedLoggingStatLogger
|
||||
|
||||
"""
|
||||
To run this example, run the following commands simultaneously with
|
||||
@ -23,64 +21,37 @@ send a request to the instance with DP rank 1.
|
||||
"""
|
||||
|
||||
|
||||
def _do_background_logging(engine, interval, stop_event):
|
||||
try:
|
||||
while not stop_event.is_set():
|
||||
asyncio.run(engine.do_log_stats())
|
||||
stop_event.wait(interval)
|
||||
except Exception as e:
|
||||
print(f"vLLM background logging shutdown: {e}")
|
||||
pass
|
||||
|
||||
|
||||
async def main():
|
||||
engine_args = AsyncEngineArgs(
|
||||
model="ibm-research/PowerMoE-3b",
|
||||
data_parallel_size=2,
|
||||
tensor_parallel_size=1,
|
||||
dtype="auto",
|
||||
max_model_len=2048,
|
||||
data_parallel_address="127.0.0.1",
|
||||
data_parallel_rpc_port=62300,
|
||||
data_parallel_size_local=1,
|
||||
enforce_eager=True,
|
||||
enable_log_requests=True,
|
||||
disable_custom_all_reduce=True,
|
||||
)
|
||||
|
||||
engine_client = AsyncLLMEngine.from_engine_args(
|
||||
engine_args,
|
||||
# Example: Using aggregated logger
|
||||
stat_loggers=[AggregatedLoggingStatLogger],
|
||||
)
|
||||
stop_logging_event = threading.Event()
|
||||
logging_thread = threading.Thread(
|
||||
target=_do_background_logging,
|
||||
args=(engine_client, 5, stop_logging_event),
|
||||
daemon=True,
|
||||
)
|
||||
logging_thread.start()
|
||||
engine_client = AsyncLLMEngine.from_engine_args(engine_args)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.7,
|
||||
top_p=0.9,
|
||||
max_tokens=100,
|
||||
)
|
||||
num_prompts = 10
|
||||
for i in range(num_prompts):
|
||||
prompt = "Who won the 2004 World Series?"
|
||||
final_output: RequestOutput | None = None
|
||||
async for output in engine_client.generate(
|
||||
prompt=prompt,
|
||||
sampling_params=sampling_params,
|
||||
request_id=f"abcdef-{i}",
|
||||
data_parallel_rank=1,
|
||||
):
|
||||
final_output = output
|
||||
if final_output:
|
||||
print(final_output.outputs[0].text)
|
||||
|
||||
stop_logging_event.set()
|
||||
logging_thread.join()
|
||||
prompt = "Who won the 2004 World Series?"
|
||||
final_output: RequestOutput | None = None
|
||||
async for output in engine_client.generate(
|
||||
prompt=prompt,
|
||||
sampling_params=sampling_params,
|
||||
request_id="abcdef",
|
||||
data_parallel_rank=1,
|
||||
):
|
||||
final_output = output
|
||||
if final_output:
|
||||
print(final_output.outputs[0].text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -6,12 +6,6 @@
|
||||
python examples/online_serving/pooling/cohere_rerank_client.py
|
||||
```
|
||||
|
||||
## Embedding embed_dtype usage
|
||||
|
||||
```bash
|
||||
python examples/online_serving/pooling/embedding_embed_dtype_client.py
|
||||
```
|
||||
|
||||
## Jinaai rerank usage
|
||||
|
||||
```bash
|
||||
@ -21,7 +15,7 @@ python examples/online_serving/pooling/jinaai_rerank_client.py
|
||||
## Named Entity Recognition (NER) usage
|
||||
|
||||
```bash
|
||||
python examples/online_serving/pooling/ner_client.py
|
||||
python examples/online_serving/pooling/ner.py
|
||||
```
|
||||
|
||||
## Openai chat embedding for multimodal usage
|
||||
|
||||
@ -1,59 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Example Python client for embedding API using vLLM API server
|
||||
NOTE:
|
||||
start a supported embeddings model server with `vllm serve`, e.g.
|
||||
vllm serve intfloat/e5-small
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
|
||||
import requests
|
||||
import torch
|
||||
|
||||
from vllm.entrypoints.openai.protocol import EMBED_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
|
||||
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
    """POST ``prompt`` as a JSON body to ``api_url``.

    Args:
        prompt: Request payload; serialized by ``requests`` as JSON.
        api_url: Full URL of the endpoint to call.

    Returns:
        The ``requests.Response`` exactly as returned by ``requests.post``;
        no status checking is performed here.
    """
    return requests.post(
        api_url,
        headers={"User-Agent": "Test Client"},
        json=prompt,
    )
|
||||
|
||||
|
||||
def parse_args():
    """Parse the client's command-line options from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with ``host`` (str, default
        ``"localhost"``), ``port`` (int, default ``8000``) and ``model``
        (str, default ``"intfloat/e5-small"``).
    """
    parser = argparse.ArgumentParser()
    # (flag, type, default) for every supported option, in declaration order.
    options = (
        ("--host", str, "localhost"),
        ("--port", int, 8000),
        ("--model", str, "intfloat/e5-small"),
    )
    for flag, kind, default in options:
        parser.add_argument(flag, type=kind, default=default)
    return parser.parse_args()
|
||||
|
||||
|
||||
def main(args):
    """Request one embedding per supported ``embed_dtype`` and print its shape.

    For every ``(embed_dtype, torch_dtype)`` pair in
    ``EMBED_DTYPE_TO_TORCH_DTYPE`` this posts a base64-encoded embedding
    request to ``/v1/embeddings``, decodes each returned item into a tensor
    of ``torch_dtype``, converts it to ``float32``, concatenates the pieces
    and prints the dtype name together with the resulting shape.

    Args:
        args: Namespace providing ``host``, ``port`` and ``model``.
    """
    api_url = f"http://{args.host}:{args.port}/v1/embeddings"
    model_name = args.model

    for embed_dtype, torch_dtype in EMBED_DTYPE_TO_TORCH_DTYPE.items():
        payload = {
            "model": model_name,
            "input": "vLLM is great!",
            "encoding_format": "base64",
            "embed_dtype": embed_dtype,
        }
        response = post_http_request(prompt=payload, api_url=api_url)

        # Each item carries its embedding as base64 of the raw buffer.
        pieces = [
            torch.frombuffer(
                base64.b64decode(item["embedding"]), dtype=torch_dtype
            ).to(torch.float32)
            for item in response.json()["data"]
        ]
        embedding = torch.cat(pieces)
        print(embed_dtype, embedding.shape)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
@ -21,7 +21,7 @@ If you want to run this script standalone with `uv`, you can use the following:
|
||||
|
||||
```bash
|
||||
uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \
|
||||
structured-outputs
|
||||
structured-output
|
||||
```
|
||||
|
||||
See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.
|
||||
|
||||
@ -107,7 +107,6 @@ markers = [
|
||||
"distributed: run this test only in distributed GPU tests",
|
||||
"skip_v1: do not run this test with v1",
|
||||
"optional: optional tests that are automatically skipped, include --optional to run them",
|
||||
"extra_server_args: extra arguments to pass to the server fixture",
|
||||
]
|
||||
|
||||
[tool.ty.src]
|
||||
|
||||
@ -38,8 +38,8 @@ pyyaml
|
||||
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
|
||||
setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
|
||||
einops # Required for Qwen2-VL.
|
||||
compressed-tensors == 0.12.2 # required for compressed-tensors
|
||||
depyf==0.20.0 # required for profiling and debugging with compilation config
|
||||
compressed-tensors == 0.11.0 # required for compressed-tensors
|
||||
depyf==0.19.0 # required for profiling and debugging with compilation config
|
||||
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
|
||||
watchfiles # required for http server to monitor the updates of TLS files
|
||||
python-json-logger # Used by logging as per examples/others/logging_configuration.md
|
||||
|
||||
@ -8,8 +8,6 @@ import os
|
||||
from collections.abc import Callable
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from vllm.envs import maybe_convert_bool
|
||||
|
||||
if TYPE_CHECKING:
|
||||
VLLM_CI_NO_SKIP: bool = False
|
||||
VLLM_CI_DTYPE: str | None = None
|
||||
@ -27,10 +25,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_CI_HEAD_DTYPE": lambda: os.getenv("VLLM_CI_HEAD_DTYPE", None),
|
||||
# Allow changing the head dtype used by transformers in tests
|
||||
"VLLM_CI_HF_DTYPE": lambda: os.getenv("VLLM_CI_HF_DTYPE", None),
|
||||
# Allow control over whether tests use enforce_eager
|
||||
"VLLM_CI_ENFORCE_EAGER": lambda: maybe_convert_bool(
|
||||
os.getenv("VLLM_CI_ENFORCE_EAGER", None)
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -258,13 +258,13 @@ def tractable_computation(
|
||||
|
||||
@torch.inference_mode
|
||||
def run_model(
|
||||
llama_config, use_compile: bool, backend: str, split_attn: bool = False
|
||||
llama_config, use_compile: bool, use_inductor: bool, split_attn: bool = False
|
||||
) -> torch.Tensor:
|
||||
if use_compile:
|
||||
compilation_config = CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
use_cudagraph=True,
|
||||
backend=backend,
|
||||
use_inductor=use_inductor,
|
||||
cudagraph_capture_sizes=[1, 2],
|
||||
)
|
||||
if split_attn:
|
||||
@ -338,8 +338,8 @@ def run_model(
|
||||
return output.cpu()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("backend", ["inductor", "eager"])
|
||||
def test_toy_llama(backend: str):
|
||||
@pytest.mark.parametrize("use_inductor", [True, False])
|
||||
def test_toy_llama(use_inductor: bool):
|
||||
# compare output with and without piecewise compilation
|
||||
|
||||
llama_config = LlamaConfig(
|
||||
@ -358,10 +358,10 @@ def test_toy_llama(backend: str):
|
||||
num_backend_compilations=0,
|
||||
num_cudagraph_captured=0,
|
||||
):
|
||||
outputs.append(run_model(llama_config, backend="eager", use_compile=False))
|
||||
run_model(tractable_config, backend="eager", use_compile=False)
|
||||
outputs.append(run_model(llama_config, use_inductor=False, use_compile=False))
|
||||
run_model(tractable_config, use_inductor=False, use_compile=False)
|
||||
|
||||
if backend == "inductor":
|
||||
if use_inductor:
|
||||
kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0}
|
||||
else:
|
||||
kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
|
||||
@ -377,8 +377,10 @@ def test_toy_llama(backend: str):
|
||||
num_cudagraph_captured=2,
|
||||
**kwargs,
|
||||
):
|
||||
outputs.append(run_model(llama_config, backend=backend, use_compile=True))
|
||||
run_model(tractable_config, backend=backend, use_compile=True)
|
||||
outputs.append(
|
||||
run_model(llama_config, use_inductor=use_inductor, use_compile=True)
|
||||
)
|
||||
run_model(tractable_config, use_inductor=use_inductor, use_compile=True)
|
||||
|
||||
with compilation_counter.expect(
|
||||
num_graphs_seen=1, # one graph for the model
|
||||
@ -393,9 +395,16 @@ def test_toy_llama(backend: str):
|
||||
), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||
):
|
||||
outputs.append(
|
||||
run_model(llama_config, backend=backend, use_compile=True, split_attn=True)
|
||||
run_model(
|
||||
llama_config,
|
||||
use_inductor=use_inductor,
|
||||
use_compile=True,
|
||||
split_attn=True,
|
||||
)
|
||||
)
|
||||
run_model(tractable_config, backend=backend, use_compile=True, split_attn=True)
|
||||
run_model(
|
||||
tractable_config, use_inductor=use_inductor, use_compile=True, split_attn=True
|
||||
)
|
||||
|
||||
for i in range(1, len(outputs)):
|
||||
assert torch.allclose(outputs[0], outputs[i])
|
||||
|
||||
@ -332,7 +332,7 @@ def async_tp_pass_on_test_model(
|
||||
|
||||
# this is a fake model name to construct the model config
|
||||
# in the vllm_config, it's not really used.
|
||||
model_name = "RedHatAI/Llama-3.2-1B-Instruct-FP8"
|
||||
model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
|
||||
vllm_config.model_config = ModelConfig(
|
||||
model=model_name, trust_remote_code=True, dtype=dtype, seed=42
|
||||
)
|
||||
|
||||
@ -77,15 +77,14 @@ class TestSetting:
|
||||
method="encode",
|
||||
),
|
||||
# vision language model
|
||||
# See https://github.com/vllm-project/vllm/issues/26716.
|
||||
# TestSetting(
|
||||
# model="microsoft/Phi-3.5-vision-instruct",
|
||||
# model_args=["--trust-remote-code", "--max-model-len", "2048"],
|
||||
# pp_size=2,
|
||||
# tp_size=1,
|
||||
# attn_backend="FLASH_ATTN",
|
||||
# method="generate_with_image",
|
||||
# ),
|
||||
TestSetting(
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
model_args=["--trust-remote-code", "--max-model-len", "2048"],
|
||||
pp_size=2,
|
||||
tp_size=1,
|
||||
attn_backend="FLASH_ATTN",
|
||||
method="generate_with_image",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_compile_correctness(
|
||||
@ -110,46 +109,41 @@ def test_compile_correctness(
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
|
||||
final_args = [
|
||||
"--enforce-eager",
|
||||
*model_args,
|
||||
"-pp",
|
||||
str(pp_size),
|
||||
"-tp",
|
||||
str(tp_size),
|
||||
"-O.cudagraph_mode=none",
|
||||
]
|
||||
|
||||
all_args: list[list[str]] = []
|
||||
all_envs: list[dict[str, str] | None] = []
|
||||
|
||||
for comp_level in [
|
||||
CompilationLevel.DYNAMO_AS_IS,
|
||||
CompilationLevel.DYNAMO_ONCE,
|
||||
for level in [
|
||||
CompilationLevel.NO_COMPILATION,
|
||||
CompilationLevel.PIECEWISE,
|
||||
]:
|
||||
for level in [CompilationLevel.NO_COMPILATION, comp_level]:
|
||||
all_args.append(
|
||||
final_args + [f"-O.level={level}", "-O.backend=inductor"]
|
||||
)
|
||||
all_args.append(final_args + [f"-O{level}"])
|
||||
all_envs.append({})
|
||||
|
||||
# inductor will change the output, so we only compare if the output
|
||||
# is close, not exactly the same.
|
||||
compare_all_settings(
|
||||
model,
|
||||
all_args,
|
||||
all_envs,
|
||||
method=method if method != "generate" else "generate_close",
|
||||
)
|
||||
all_envs.clear()
|
||||
all_args.clear()
|
||||
# inductor will change the output, so we only compare if the output
|
||||
# is close, not exactly the same.
|
||||
compare_all_settings(
|
||||
model,
|
||||
all_args,
|
||||
all_envs,
|
||||
method=method if method != "generate" else "generate_close",
|
||||
)
|
||||
all_envs.clear()
|
||||
all_args.clear()
|
||||
|
||||
for level in [
|
||||
CompilationLevel.NO_COMPILATION,
|
||||
CompilationLevel.DYNAMO_AS_IS,
|
||||
CompilationLevel.DYNAMO_ONCE,
|
||||
CompilationLevel.PIECEWISE,
|
||||
]:
|
||||
all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"])
|
||||
all_envs.append({})
|
||||
all_args.append(final_args + [f"-O{level}"])
|
||||
all_envs.append({})
|
||||
|
||||
compare_all_settings(model, all_args * 3, all_envs, method=method)
|
||||
|
||||
@ -229,7 +229,7 @@ def all_reduce_fusion_pass_on_test_model(
|
||||
|
||||
# this is a fake model name to construct the model config
|
||||
# in the vllm_config, it's not really used.
|
||||
model_name = "RedHatAI/Llama-3.2-1B-Instruct-FP8"
|
||||
model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
|
||||
vllm_config.model_config = ModelConfig(
|
||||
model=model_name, trust_remote_code=True, dtype=dtype, seed=42
|
||||
)
|
||||
|
||||
@ -278,7 +278,7 @@ def sequence_parallelism_pass_on_test_model(
|
||||
|
||||
# this is a fake model name to construct the model config
|
||||
# in the vllm_config, it's not really used.
|
||||
model_name = "RedHatAI/Llama-3.2-1B-Instruct-FP8"
|
||||
model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
|
||||
vllm_config.model_config = ModelConfig(
|
||||
model=model_name, trust_remote_code=True, dtype=dtype, seed=42
|
||||
)
|
||||
|
||||
@ -204,21 +204,17 @@ def _compare_cp_with_tp(
|
||||
|
||||
|
||||
CP_TEXT_GENERATION_MODELS = {
|
||||
# [MLA attention only]
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": [
|
||||
CPTestSettings.detailed(),
|
||||
CPTestSettings.detailed(tp_base=2),
|
||||
],
|
||||
"bigcode/gpt_bigcode-santacoder": [
|
||||
CPTestSettings.detailed(),
|
||||
CPTestSettings.detailed(tp_base=2),
|
||||
],
|
||||
}
|
||||
|
||||
CP_TEST_MODELS = [
|
||||
# TODO support other models
|
||||
# [LANGUAGE GENERATION]
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat",
|
||||
"bigcode/gpt_bigcode-santacoder",
|
||||
]
|
||||
|
||||
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import multiprocessing
|
||||
import random
|
||||
|
||||
import pytest
|
||||
@ -9,7 +8,6 @@ import ray
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce # noqa
|
||||
from vllm.distributed.parallel_state import get_tp_group, graph_capture
|
||||
from vllm.platforms import current_platform
|
||||
@ -136,88 +134,3 @@ def test_custom_quick_allreduce(
|
||||
monkeypatch.setenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", quant_mode)
|
||||
|
||||
multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target)
|
||||
|
||||
|
||||
def qr_variable_input(rank, world_size):
    """
    When the tensor parallelism is set to 4 or 8, frequent changes
    in the input shape can cause QuickReduce to hang (this issue
    has been observed with the gpt_oss model).

    Worker body for one rank: sets up QuickReduce and an NCCL process
    group, then alternates all-reduces over a (1024, 1024) tensor of zeros
    and a (1024, 2048) tensor of ones, checking each result.

    Args:
        rank: Index of this worker; also selects the CUDA device.
        world_size: Number of participating workers.

    Raises:
        AssertionError: If an all-reduce result does not match the
            expected value (0 for zeros, ``world_size`` for ones).
    """
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    qr_max_size = None  # MB
    _ptr = ops.init_custom_qr(rank, world_size, qr_max_size)
    ranks = list(range(world_size))
    dist.init_process_group(
        backend="nccl",
        init_method="tcp://127.0.0.1:29500",
        rank=rank,
        world_size=world_size,
    )
    cpu_group = torch.distributed.new_group(ranks, backend="nccl")

    # Exchange the QuickReduce handles so every rank can open its peers'.
    handle = ops.qr_get_handle(_ptr)
    world_size = dist.get_world_size(group=cpu_group)
    handles = [None] * world_size
    dist.all_gather_object(handles, handle, group=cpu_group)
    ops.qr_open_handles(_ptr, handles)

    s1 = 1024
    dtype = torch.float16
    # 50000 is sufficient to identify issues.
    for num in range(1, 50000):
        # Flip the input shape every iteration to provoke the hang.
        if num % 2 == 0:
            s2 = 1024
            inp1 = torch.zeros(
                (s1, s2), dtype=dtype, device=torch.cuda.current_device()
            )
        else:
            s2 = 2048
            inp1 = torch.ones((s1, s2), dtype=dtype, device=torch.cuda.current_device())
        result = torch.empty_like(inp1)
        # FP = 0 INT8 = 1 INT6 = 2 INT4 = 3 NONE = 4
        ops.qr_all_reduce(_ptr, inp1, result, 3, cast_bf2half=True)
        try:
            if inp1[0, 0] == 0:
                assert torch.all(result == 0)
            else:
                assert torch.all(result == world_size)
        except AssertionError:
            print("Assertion failed! Allreduce results are incorrect.")
            raise
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not current_platform.is_rocm(), reason="only test quick allreduce for rocm"
)
@pytest.mark.parametrize("tp_size", [4, 8])
@pytest.mark.parametrize("pipeline_parallel_size", [1])
def test_custom_quick_allreduce_variable_input(tp_size, pipeline_parallel_size):
    """Run ``qr_variable_input`` on ``tp_size`` ranks and fail on hang or error.

    One spawned process is started per rank. A worker that is still alive
    after the join timeout is treated as a QuickReduce hang: all remaining
    workers are terminated and ``RuntimeError`` is raised. A worker that
    finishes with a non-zero exit code (for example because its result
    assertion failed) also fails the test.

    Args:
        tp_size: Number of worker processes / ranks to launch.
        pipeline_parallel_size: Only used to compute the required GPU count.
    """
    world_size = tp_size * pipeline_parallel_size
    if world_size > torch.cuda.device_count():
        pytest.skip("Not enough GPUs to run the test.")

    multiprocessing.set_start_method("spawn", force=True)
    # 60s is enough
    timeout = 60
    processes = []
    for rank in range(tp_size):
        p = multiprocessing.Process(target=qr_variable_input, args=(rank, tp_size))
        p.start()
        processes.append((rank, p))
    for rank, p in processes:
        p.join(timeout=timeout)
        if p.is_alive():
            # One stuck rank blocks the collective for all; stop every worker.
            for _, proc in processes:
                if proc.is_alive():
                    proc.terminate()
                    proc.join()
            raise RuntimeError(f"QuickReduce hang detected after {timeout} seconds!")

    # Exceptions raised in a child never reach the parent; the exit code is
    # the only signal that a worker's correctness assertion failed.
    failed = [(rank, p.exitcode) for rank, p in processes if p.exitcode != 0]
    assert not failed, f"QuickReduce workers failed (rank, exitcode): {failed}"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_custom_quick_allreduce_variable_input(tp_size=4, pipeline_parallel_size=1)
|
||||
|
||||
@ -247,10 +247,10 @@ async def test_tool_id_kimi_k2(
|
||||
)
|
||||
assert chat_completion.choices[0].message.tool_calls is not None
|
||||
assert len(chat_completion.choices[0].message.tool_calls) > 0
|
||||
assert chat_completion.choices[0].message.tool_calls[0].id in [
|
||||
"functions.get_current_weather:0",
|
||||
"functions.get_forecast:1",
|
||||
]
|
||||
assert (
|
||||
chat_completion.choices[0].message.tool_calls[0].id
|
||||
== "functions.get_current_weather:0"
|
||||
)
|
||||
else:
|
||||
# Streaming test
|
||||
output_stream = await k2_client.chat.completions.create(
|
||||
@ -266,10 +266,7 @@ async def test_tool_id_kimi_k2(
|
||||
if chunk.choices and chunk.choices[0].delta.tool_calls:
|
||||
output.extend(chunk.choices[0].delta.tool_calls)
|
||||
for o in output:
|
||||
assert o.id is None or o.id in [
|
||||
"functions.get_current_weather:0",
|
||||
"functions.get_forecast:1",
|
||||
]
|
||||
assert o.id is None or o.id == "functions.get_current_weather:0"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@ -1,126 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def chat_server_with_force_include_usage(request):  # noqa: F811
    """Module-scoped ``Qwen/Qwen3-0.6B`` server with forced usage reporting.

    Starts a ``RemoteOpenAIServer`` on the fixed port 55857
    (``auto_port=False``) with ``--enable-force-include-usage`` and yields
    it for the duration of the module.
    """
    # use half precision for speed and memory savings in CI environment
    args = ["--dtype", "bfloat16"]
    args += ["--max-model-len", "128"]
    args += ["--enforce-eager"]
    args += ["--max-num-seqs", "1"]
    args += ["--enable-force-include-usage"]
    args += ["--port", "55857"]
    args += ["--gpu-memory-utilization", "0.2"]

    with RemoteOpenAIServer("Qwen/Qwen3-0.6B", args, auto_port=False) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def chat_client_with_force_include_usage(chat_server_with_force_include_usage):
    """Yield an async client obtained from the chat server fixture.

    The client comes from the server's ``get_async_client()`` and is
    closed by the context manager once the test is done.
    """
    server = chat_server_with_force_include_usage
    async with server.get_async_client() as client:
        yield client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_chat_with_enable_force_include_usage(
    chat_client_with_force_include_usage: openai.AsyncOpenAI,
):
    """Check usage reporting on a streamed chat completion.

    Chunks that carry choices must have no usage attached. Chunks without
    choices must carry usage whose prompt token count is non-negative,
    whose completion token count never decreases from one usage chunk to
    the next, and whose total equals prompt plus completion tokens.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ]

    stream = await chat_client_with_force_include_usage.chat.completions.create(
        model="Qwen/Qwen3-0.6B",
        messages=messages,
        max_completion_tokens=10,
        extra_body=dict(min_tokens=10),
        temperature=0.0,
        stream=True,
    )
    last_completion_tokens = 0
    async for chunk in stream:
        if chunk.choices:
            assert chunk.usage is None
            continue

        usage = chunk.usage
        assert usage.prompt_tokens >= 0
        # Equal counts are allowed here because this branch only handles
        # chunks without choices, matching the original condition.
        assert (
            last_completion_tokens == 0
            or usage.completion_tokens >= last_completion_tokens
        )
        assert usage.total_tokens == (
            usage.prompt_tokens + usage.completion_tokens
        )
        # Remember the count so the next usage chunk is actually compared;
        # without this the check above can never fail.
        last_completion_tokens = usage.completion_tokens
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def transcription_server_with_force_include_usage():
|
||||
args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-num-seqs",
|
||||
"1",
|
||||
"--enforce-eager",
|
||||
"--enable-force-include-usage",
|
||||
"--gpu-memory-utilization",
|
||||
"0.2",
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def transcription_client_with_force_include_usage(
|
||||
transcription_server_with_force_include_usage,
|
||||
):
|
||||
async with (
|
||||
transcription_server_with_force_include_usage.get_async_client() as async_client
|
||||
):
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_transcription_with_enable_force_include_usage(
|
||||
transcription_client_with_force_include_usage, winning_call
|
||||
):
|
||||
res = (
|
||||
await transcription_client_with_force_include_usage.audio.transcriptions.create(
|
||||
model="openai/whisper-large-v3-turbo",
|
||||
file=winning_call,
|
||||
language="en",
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
timeout=30,
|
||||
)
|
||||
)
|
||||
|
||||
async for chunk in res:
|
||||
if not len(chunk.choices):
|
||||
# final usage sent
|
||||
usage = chunk.usage
|
||||
assert isinstance(usage, dict)
|
||||
assert usage["prompt_tokens"] > 0
|
||||
assert usage["completion_tokens"] > 0
|
||||
assert usage["total_tokens"] > 0
|
||||
else:
|
||||
assert not hasattr(chunk, "usage")
|
||||
@ -1,36 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from openai_harmony import (
|
||||
Message,
|
||||
)
|
||||
|
||||
from vllm.entrypoints.openai.protocol import serialize_message, serialize_messages
|
||||
|
||||
|
||||
def test_serialize_message() -> None:
|
||||
dict_value = {"a": 1, "b": "2"}
|
||||
assert serialize_message(dict_value) == dict_value
|
||||
|
||||
msg_value = {
|
||||
"role": "assistant",
|
||||
"name": None,
|
||||
"content": [{"type": "text", "text": "Test 1"}],
|
||||
"channel": "analysis",
|
||||
}
|
||||
msg = Message.from_dict(msg_value)
|
||||
assert serialize_message(msg) == msg_value
|
||||
|
||||
|
||||
def test_serialize_messages() -> None:
|
||||
assert serialize_messages(None) is None
|
||||
assert serialize_messages([]) is None
|
||||
|
||||
dict_value = {"a": 3, "b": "4"}
|
||||
msg_value = {
|
||||
"role": "assistant",
|
||||
"name": None,
|
||||
"content": [{"type": "text", "text": "Test 2"}],
|
||||
"channel": "analysis",
|
||||
}
|
||||
msg = Message.from_dict(msg_value)
|
||||
assert serialize_messages([msg, dict_value]) == [msg_value, dict_value]
|
||||
@ -16,22 +16,6 @@ from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "openai/gpt-oss-20b"
|
||||
|
||||
GET_WEATHER_SCHEMA = {
|
||||
"type": "function",
|
||||
"name": "get_weather",
|
||||
"description": "Get current temperature for provided coordinates in celsius.", # noqa
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"latitude": {"type": "number"},
|
||||
"longitude": {"type": "number"},
|
||||
},
|
||||
"required": ["latitude", "longitude"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"strict": True,
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
@ -321,54 +305,6 @@ async def test_streaming_types(client: OpenAI, model_name: str):
|
||||
assert len(stack_of_event_types) == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_function_calling_with_streaming_types(client: OpenAI, model_name: str):
|
||||
# this links the "done" type with the "start" type
|
||||
# so every "done" type should have a corresponding "start" type
|
||||
# and every open block should be closed by the end of the stream
|
||||
pairs_of_event_types = {
|
||||
"response.completed": "response.created",
|
||||
"response.output_item.done": "response.output_item.added",
|
||||
"response.output_text.done": "response.output_text.delta",
|
||||
"response.reasoning_text.done": "response.reasoning_text.delta",
|
||||
"response.reasoning_part.done": "response.reasoning_part.added",
|
||||
"response.function_call_arguments.done": "response.function_call_arguments.delta", # noqa
|
||||
}
|
||||
|
||||
tools = [GET_WEATHER_SCHEMA]
|
||||
input_list = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What's the weather like in Paris today?",
|
||||
}
|
||||
]
|
||||
stream_response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=input_list,
|
||||
tools=tools,
|
||||
stream=True,
|
||||
)
|
||||
|
||||
stack_of_event_types = []
|
||||
async for event in stream_response:
|
||||
if event.type == "response.created":
|
||||
stack_of_event_types.append(event.type)
|
||||
elif event.type == "response.completed":
|
||||
assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
|
||||
stack_of_event_types.pop()
|
||||
if event.type.endswith("added"):
|
||||
stack_of_event_types.append(event.type)
|
||||
elif event.type.endswith("delta"):
|
||||
if stack_of_event_types[-1] == event.type:
|
||||
continue
|
||||
stack_of_event_types.append(event.type)
|
||||
elif event.type.endswith("done"):
|
||||
assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
|
||||
stack_of_event_types.pop()
|
||||
assert len(stack_of_event_types) == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("background", [True, False])
|
||||
@ -547,7 +483,23 @@ def call_function(name, args):
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_function_calling(client: OpenAI, model_name: str):
|
||||
tools = [GET_WEATHER_SCHEMA]
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"name": "get_weather",
|
||||
"description": "Get current temperature for provided coordinates in celsius.", # noqa
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"latitude": {"type": "number"},
|
||||
"longitude": {"type": "number"},
|
||||
},
|
||||
"required": ["latitude", "longitude"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"strict": True,
|
||||
}
|
||||
]
|
||||
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
@ -613,7 +565,21 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
|
||||
},
|
||||
"strict": True,
|
||||
},
|
||||
GET_WEATHER_SCHEMA,
|
||||
{
|
||||
"type": "function",
|
||||
"name": "get_weather",
|
||||
"description": "Get current temperature for provided coordinates in celsius.", # noqa
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"latitude": {"type": "number"},
|
||||
"longitude": {"type": "number"},
|
||||
},
|
||||
"required": ["latitude", "longitude"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"strict": True,
|
||||
},
|
||||
]
|
||||
|
||||
response = await client.responses.create(
|
||||
@ -677,7 +643,23 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_function_calling_required(client: OpenAI, model_name: str):
|
||||
tools = [GET_WEATHER_SCHEMA]
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"name": "get_weather",
|
||||
"description": "Get current temperature for provided coordinates in celsius.", # noqa
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"latitude": {"type": "number"},
|
||||
"longitude": {"type": "number"},
|
||||
},
|
||||
"required": ["latitude", "longitude"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"strict": True,
|
||||
}
|
||||
]
|
||||
|
||||
with pytest.raises(BadRequestError):
|
||||
await client.responses.create(
|
||||
@ -707,7 +689,23 @@ async def test_system_message_with_tools(client: OpenAI, model_name: str):
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_function_calling_full_history(client: OpenAI, model_name: str):
|
||||
tools = [GET_WEATHER_SCHEMA]
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"name": "get_weather",
|
||||
"description": "Get current temperature for provided coordinates in celsius.", # noqa
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"latitude": {"type": "number"},
|
||||
"longitude": {"type": "number"},
|
||||
},
|
||||
"required": ["latitude", "longitude"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"strict": True,
|
||||
}
|
||||
]
|
||||
|
||||
input_messages = [
|
||||
{"role": "user", "content": "What's the weather like in Paris today?"}
|
||||
@ -747,74 +745,6 @@ async def test_function_calling_full_history(client: OpenAI, model_name: str):
|
||||
assert response_2.output_text is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_function_calling_with_stream(client: OpenAI, model_name: str):
|
||||
tools = [GET_WEATHER_SCHEMA]
|
||||
input_list = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What's the weather like in Paris today?",
|
||||
}
|
||||
]
|
||||
stream_response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=input_list,
|
||||
tools=tools,
|
||||
stream=True,
|
||||
)
|
||||
assert stream_response is not None
|
||||
final_tool_calls = {}
|
||||
final_tool_calls_named = {}
|
||||
async for event in stream_response:
|
||||
if event.type == "response.output_item.added":
|
||||
if event.item.type != "function_call":
|
||||
continue
|
||||
final_tool_calls[event.output_index] = event.item
|
||||
final_tool_calls_named[event.item.name] = event.item
|
||||
elif event.type == "response.function_call_arguments.delta":
|
||||
index = event.output_index
|
||||
tool_call = final_tool_calls[index]
|
||||
if tool_call:
|
||||
tool_call.arguments += event.delta
|
||||
final_tool_calls_named[tool_call.name] = tool_call
|
||||
elif event.type == "response.function_call_arguments.done":
|
||||
assert event.arguments == final_tool_calls_named[event.name].arguments
|
||||
for tool_call in final_tool_calls.values():
|
||||
if (
|
||||
tool_call
|
||||
and tool_call.type == "function_call"
|
||||
and tool_call.name == "get_weather"
|
||||
):
|
||||
args = json.loads(tool_call.arguments)
|
||||
result = call_function(tool_call.name, args)
|
||||
input_list += [tool_call]
|
||||
break
|
||||
assert result is not None
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=input_list
|
||||
+ [
|
||||
{
|
||||
"type": "function_call_output",
|
||||
"call_id": tool_call.call_id,
|
||||
"output": str(result),
|
||||
}
|
||||
],
|
||||
tools=tools,
|
||||
stream=True,
|
||||
)
|
||||
assert response is not None
|
||||
async for event in response:
|
||||
# check that no function call events in the stream
|
||||
assert event.type != "response.function_call_arguments.delta"
|
||||
assert event.type != "response.function_call_arguments.done"
|
||||
# check that the response contains output text
|
||||
if event.type == "response.completed":
|
||||
assert len(event.response.output) > 0
|
||||
assert event.response.output_text is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
|
||||
|
||||
@ -58,9 +58,7 @@ def test_pooling_params(llm: LLM):
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_encode_api(llm: LLM):
|
||||
# chunked prefill does not support all pooling
|
||||
err_msg = "pooling_task must be one of.+"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
llm.encode(prompts, use_tqdm=False)
|
||||
|
||||
@ -35,6 +35,7 @@ def llm():
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_pooling_params(llm: LLM):
|
||||
def get_outputs(normalize):
|
||||
outputs = llm.embed(
|
||||
|
||||
@ -74,6 +74,7 @@ def test_multiple_pooling_params(llm: LLM):
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_right_side_truncation(llm: LLM):
|
||||
# Embeddings models should truncate the end of the prompt
|
||||
tokenizer = llm.get_tokenizer()
|
||||
|
||||
@ -33,6 +33,7 @@ def llm():
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_pooling_params(llm: LLM):
|
||||
def get_outputs(activation):
|
||||
text_1 = "What is the capital of France?"
|
||||
|
||||
@ -14,10 +14,7 @@ import torch.nn.functional as F
|
||||
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
|
||||
from tests.models.utils import check_embeddings_close
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
EMBED_DTYPE_TO_TORCH_DTYPE,
|
||||
EmbeddingResponse,
|
||||
)
|
||||
from vllm.entrypoints.openai.protocol import EmbeddingResponse
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
@ -247,75 +244,6 @@ async def test_batch_base64_embedding(
|
||||
run_embedding_correctness_test(hf_model, input_texts, default_data)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_base64_embed_dtype(
|
||||
hf_model, server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
|
||||
):
|
||||
input_texts = [
|
||||
"The best thing about vLLM is that it supports many different models",
|
||||
]
|
||||
|
||||
responses_float = await client.embeddings.create(
|
||||
input=input_texts, model=model_name, encoding_format="float"
|
||||
)
|
||||
float_data = [d.embedding for d in responses_float.data]
|
||||
|
||||
for embed_dtype, torch_dtype in EMBED_DTYPE_TO_TORCH_DTYPE.items():
|
||||
responses_base64 = requests.post(
|
||||
server.url_for("/v1/embeddings"),
|
||||
json={
|
||||
"model": model_name,
|
||||
"input": input_texts,
|
||||
"encoding_format": "base64",
|
||||
"embed_dtype": embed_dtype,
|
||||
},
|
||||
)
|
||||
|
||||
base64_data = []
|
||||
for data in responses_base64.json()["data"]:
|
||||
base64_data.append(
|
||||
torch.frombuffer(base64.b64decode(data["embedding"]), dtype=torch_dtype)
|
||||
.to(torch.float32)
|
||||
.tolist()
|
||||
)
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=float_data,
|
||||
embeddings_1_lst=base64_data,
|
||||
name_0="float_data",
|
||||
name_1="base64_data",
|
||||
tol=1e-2,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_base64_embed_dtype_not_supported(
|
||||
hf_model, server: RemoteOpenAIServer, model_name: str
|
||||
):
|
||||
input_texts = [
|
||||
"The best thing about vLLM is that it supports many different models",
|
||||
]
|
||||
|
||||
bad_embed_dtype = "bad_embed_dtype"
|
||||
|
||||
responses_base64 = requests.post(
|
||||
server.url_for("/v1/embeddings"),
|
||||
json={
|
||||
"model": model_name,
|
||||
"input": input_texts,
|
||||
"encoding_format": "base64",
|
||||
"embed_dtype": bad_embed_dtype,
|
||||
},
|
||||
)
|
||||
|
||||
assert responses_base64.status_code == 400
|
||||
assert responses_base64.json()["error"]["message"].startswith(
|
||||
f"embed_dtype={bad_embed_dtype!r} is not supported."
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_single_embedding_truncation(client: openai.AsyncOpenAI, model_name: str):
|
||||
|
||||
@ -6,11 +6,10 @@ import base64
|
||||
import numpy as np
|
||||
import pytest
|
||||
import requests
|
||||
import torch
|
||||
|
||||
from tests.models.utils import check_embeddings_close
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.openai.protocol import EMBED_DTYPE_TO_TORCH_DTYPE, PoolingResponse
|
||||
from vllm.entrypoints.openai.protocol import PoolingResponse
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
MODEL_NAME = "internlm/internlm2-1_8b-reward"
|
||||
@ -249,80 +248,6 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer, model_name: str)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_base64_embed_dtype(server: RemoteOpenAIServer, model_name: str):
|
||||
input_texts = [
|
||||
"The best thing about vLLM is that it supports many different models",
|
||||
]
|
||||
|
||||
url = server.url_for("pooling")
|
||||
float_response = requests.post(
|
||||
url,
|
||||
json={
|
||||
"model": model_name,
|
||||
"input": input_texts,
|
||||
"encoding_format": "float",
|
||||
},
|
||||
)
|
||||
responses_float = PoolingResponse.model_validate(float_response.json())
|
||||
float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data]
|
||||
|
||||
for embed_dtype, torch_dtype in EMBED_DTYPE_TO_TORCH_DTYPE.items():
|
||||
responses_base64 = requests.post(
|
||||
url,
|
||||
json={
|
||||
"model": model_name,
|
||||
"input": input_texts,
|
||||
"encoding_format": "base64",
|
||||
"embed_dtype": embed_dtype,
|
||||
},
|
||||
)
|
||||
|
||||
base64_data = []
|
||||
for data in responses_base64.json()["data"]:
|
||||
base64_data.append(
|
||||
torch.frombuffer(base64.b64decode(data["data"]), dtype=torch_dtype)
|
||||
.to(torch.float32)
|
||||
.tolist()
|
||||
)
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=float_data,
|
||||
embeddings_1_lst=base64_data,
|
||||
name_0="float_data",
|
||||
name_1="base64_data",
|
||||
tol=1e-2,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_base64_embed_dtype_not_supported(
|
||||
server: RemoteOpenAIServer, model_name: str
|
||||
):
|
||||
input_texts = [
|
||||
"The best thing about vLLM is that it supports many different models",
|
||||
]
|
||||
|
||||
bad_embed_dtype = "bad_embed_dtype"
|
||||
|
||||
responses_base64 = requests.post(
|
||||
server.url_for("pooling"),
|
||||
json={
|
||||
"model": model_name,
|
||||
"input": input_texts,
|
||||
"encoding_format": "base64",
|
||||
"embed_dtype": bad_embed_dtype,
|
||||
},
|
||||
)
|
||||
|
||||
assert responses_base64.status_code == 400
|
||||
assert responses_base64.json()["error"]["message"].startswith(
|
||||
f"embed_dtype={bad_embed_dtype!r} is not supported."
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_invocations(server: RemoteOpenAIServer):
|
||||
input_texts = [
|
||||
|
||||
@ -15,6 +15,7 @@ VEC_HIDDEN_SIZES = range(1024, 1030)
|
||||
# Avoid combinatorial explosion with full Cartesian product
|
||||
NUM_TOKENS_HIDDEN_SIZES = [
|
||||
*[(1, i) for i in [1, 64, *VEC_HIDDEN_SIZES, 5120, 5137]],
|
||||
*[(83, i) for i in [1, 1033, 2048, 5120]],
|
||||
*[(2048, i) for i in [1, 64, *VEC_HIDDEN_SIZES, 5137]],
|
||||
*[(4096, i) for i in [1, 64, 5137]],
|
||||
]
|
||||
|
||||
@ -11,7 +11,19 @@ from vllm.platforms import current_platform
|
||||
|
||||
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
||||
NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing
|
||||
HIDDEN_SIZES = [8, 768, 769, 5120, 5125, 8192] # Arbitrary values for testing
|
||||
HIDDEN_SIZES = [
|
||||
8,
|
||||
768,
|
||||
769,
|
||||
770,
|
||||
771,
|
||||
5120,
|
||||
5124,
|
||||
5125,
|
||||
5126,
|
||||
8192,
|
||||
8199,
|
||||
] # Arbitrary values for testing
|
||||
ADD_RESIDUAL = [False, True]
|
||||
SEEDS = [0]
|
||||
CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
|
||||
@ -106,7 +118,7 @@ def test_poly_norm(
|
||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||
@pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("quant_scale", [0.01, 1.0, 10.0])
|
||||
@pytest.mark.parametrize("quant_scale", [1.0, 0.01, 10.0])
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@pytest.mark.parametrize("strided_input", [False, True])
|
||||
|
||||
@ -9,7 +9,7 @@ from vllm._custom_ops import permute_cols
|
||||
|
||||
|
||||
@pytest.mark.parametrize("shape", [(1, 512), (544, 4096), (67, 8192)])
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
|
||||
def test_permute_cols(shape, dtype):
|
||||
x = torch.randn(shape, dtype=dtype).cuda()
|
||||
perm = torch.randperm(x.shape[1]).to(torch.int).cuda()
|
||||
|
||||
@ -12,8 +12,8 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
IS_NEOX_STYLE = [True, False]
|
||||
DTYPES = [torch.bfloat16, torch.float]
|
||||
HEAD_SIZES = [64, 80, 120, 256]
|
||||
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
||||
HEAD_SIZES = [64, 80, 112, 120, 256]
|
||||
ROTARY_DIMS = [None, 32] # None means rotary dim == head size
|
||||
NUM_HEADS = [17] # Arbitrary values for testing
|
||||
BATCH_SIZES = [5] # Arbitrary values for testing
|
||||
|
||||
@ -183,7 +183,7 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, ity
|
||||
assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16])
|
||||
@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("silu_activation", [False, True])
|
||||
@pytest.mark.parametrize("has_bias", [False, True])
|
||||
@pytest.mark.parametrize("seqlen", [1, 3])
|
||||
@ -265,7 +265,7 @@ def test_causal_conv1d_update_with_batch_gather(
|
||||
@pytest.mark.parametrize("silu_activation", [True])
|
||||
@pytest.mark.parametrize("has_bias", [True])
|
||||
@pytest.mark.parametrize("width", [4])
|
||||
@pytest.mark.parametrize("seqlen", [8, 249, 4096])
|
||||
@pytest.mark.parametrize("seqlen", [8, 30, 249, 2049, 4096])
|
||||
@pytest.mark.parametrize("dim", [64, 4096])
|
||||
@pytest.mark.parametrize("with_padding", [True, False])
|
||||
@pytest.mark.parametrize("batch", [4, 10])
|
||||
|
||||
@ -25,6 +25,7 @@ from vllm.utils import update_environment_variables
|
||||
(64, 1),
|
||||
(64, 2),
|
||||
(64, 4), # hidden_size be divisible by num_gpus
|
||||
(100, 5), # and n_groups must divide hidden_size
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [torch.float16])
|
||||
|
||||
@ -229,8 +229,8 @@ def selective_scan_opcheck_fn(
|
||||
|
||||
|
||||
@pytest.mark.parametrize("wtype", [torch.float32])
|
||||
@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16])
|
||||
@pytest.mark.parametrize("seqlen", [128, 1024, 4096])
|
||||
@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("seqlen", [128, 256, 512, 1024, 2048, 4096])
|
||||
@pytest.mark.parametrize("has_delta_bias", [True])
|
||||
@pytest.mark.parametrize("delta_softplus", [True])
|
||||
@pytest.mark.parametrize("has_z", [True])
|
||||
@ -238,7 +238,7 @@ def selective_scan_opcheck_fn(
|
||||
@pytest.mark.parametrize("varBC_groups", [1, 2])
|
||||
@pytest.mark.parametrize("is_variable_C", [True])
|
||||
@pytest.mark.parametrize("is_variable_B", [True])
|
||||
@pytest.mark.parametrize("scan_chunks", [1, 3])
|
||||
@pytest.mark.parametrize("scan_chunks", [1, 2, 3])
|
||||
def test_selective_scan(
|
||||
is_variable_B,
|
||||
is_variable_C,
|
||||
@ -375,9 +375,9 @@ def test_selective_scan(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16])
|
||||
@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("has_z", [False, True])
|
||||
@pytest.mark.parametrize("dstate", [16, 64])
|
||||
@pytest.mark.parametrize("dstate", [16, 32, 64])
|
||||
@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
|
||||
def test_selective_state_update(dim, dstate, has_z, itype):
|
||||
device = "cuda"
|
||||
@ -413,7 +413,7 @@ def test_selective_state_update(dim, dstate, has_z, itype):
|
||||
|
||||
@pytest.mark.parametrize("wtype", [torch.float32])
|
||||
@pytest.mark.parametrize("itype", [torch.float32])
|
||||
@pytest.mark.parametrize("seqlen", [1, 256, 1024, 4096])
|
||||
@pytest.mark.parametrize("seqlen", [1, 128, 129, 256, 512, 1024, 2048, 4096])
|
||||
@pytest.mark.parametrize("return_last_state", [True])
|
||||
@pytest.mark.parametrize("has_delta_bias", [True])
|
||||
@pytest.mark.parametrize("delta_softplus", [True])
|
||||
@ -589,9 +589,9 @@ def test_selective_scan_varlen(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16])
|
||||
@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("has_z", [True])
|
||||
@pytest.mark.parametrize("dstate", [16, 64])
|
||||
@pytest.mark.parametrize("dstate", [16, 32, 64])
|
||||
@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
|
||||
# tests correctness in case subset of the sequences are padded
|
||||
@pytest.mark.parametrize("with_padding", [True, False])
|
||||
@ -679,11 +679,11 @@ def test_selective_state_update_with_batch_indices(
|
||||
assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16])
|
||||
@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("has_z", [False, True])
|
||||
@pytest.mark.parametrize("tie_hdim", [False, True])
|
||||
@pytest.mark.parametrize("ngroups", [1, 4])
|
||||
@pytest.mark.parametrize("dstate", [16, 64])
|
||||
@pytest.mark.parametrize("ngroups", [1, 2, 4])
|
||||
@pytest.mark.parametrize("dstate", [16, 32, 64])
|
||||
@pytest.mark.parametrize("dim", [2048, 4096])
|
||||
def test_selective_state_update_with_heads_with_batch_indices(
|
||||
dim, dstate, ngroups, has_z, tie_hdim, itype
|
||||
|
||||
@ -188,9 +188,9 @@ def generate_continuous_batched_examples(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16])
|
||||
@pytest.mark.parametrize("n_heads", [4, 16, 32])
|
||||
@pytest.mark.parametrize("d_head", [5, 8, 32, 128])
|
||||
@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("n_heads", [3, 4, 11, 16, 32])
|
||||
@pytest.mark.parametrize("d_head", [5, 8, 19, 32, 128])
|
||||
@pytest.mark.parametrize("seq_len_chunk_size", [(112, 16), (128, 32)])
|
||||
def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, itype):
|
||||
# this tests the kernels on a single example (bs=1)
|
||||
@ -254,14 +254,15 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, it
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype", [torch.float32])
|
||||
@pytest.mark.parametrize("n_heads", [4, 8])
|
||||
@pytest.mark.parametrize("d_head", [5, 16, 32])
|
||||
@pytest.mark.parametrize("itype", [torch.float32, torch.float16])
|
||||
@pytest.mark.parametrize("n_heads", [4, 8, 13])
|
||||
@pytest.mark.parametrize("d_head", [5, 16, 21, 32])
|
||||
@pytest.mark.parametrize(
|
||||
"seq_len_chunk_size_cases",
|
||||
[
|
||||
# small-ish chunk_size (8)
|
||||
(64, 8, 2, [(64, 32), (64, 32)]),
|
||||
(64, 8, 2, [(32, 32), (32, 32), (32, 32)]),
|
||||
(64, 8, 2, [(8, 8), (8, 8), (8, 8)]), # chunk size boundary
|
||||
(
|
||||
64,
|
||||
@ -269,7 +270,16 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, it
|
||||
2,
|
||||
[(4, 4), (4, 4), (4, 4), (4, 4)],
|
||||
), # chunk_size larger than cont batches
|
||||
(64, 8, 5, [(64, 32, 16, 8, 8)]),
|
||||
(
|
||||
64,
|
||||
8,
|
||||
5,
|
||||
[
|
||||
(64, 32, 16, 8, 8),
|
||||
(8, 16, 32, 16, 8),
|
||||
(8, 8, 16, 32, 16),
|
||||
],
|
||||
), # mode examples with varied lengths
|
||||
# large-ish chunk_size (256)
|
||||
(64, 256, 1, [(5,), (1,), (1,), (1,)]), # irregular sizes with small sequences
|
||||
(
|
||||
@ -349,7 +359,11 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
|
||||
@pytest.mark.parametrize("chunk_size", [8, 256])
|
||||
@pytest.mark.parametrize(
|
||||
"seqlens",
|
||||
[(16, 20), (270, 88, 212, 203)],
|
||||
[
|
||||
(16, 2, 8, 13),
|
||||
(270, 88, 212, 203),
|
||||
(16, 20),
|
||||
],
|
||||
)
|
||||
def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
# This test verifies the correctness of the chunked prefill implementation
|
||||
|
||||
@ -26,7 +26,6 @@ from vllm.model_executor.layers.fused_moe.config import (
|
||||
int4_w4a16_moe_quant_config,
|
||||
int8_w8a16_moe_quant_config,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import (
|
||||
fused_topk,
|
||||
modular_triton_fused_moe,
|
||||
@ -725,7 +724,7 @@ def test_fused_marlin_moe(
|
||||
with set_current_vllm_config(vllm_config):
|
||||
torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, expert_map=e_map)
|
||||
|
||||
marlin_output = fused_marlin_moe(
|
||||
marlin_output = torch.ops.vllm.fused_marlin_moe(
|
||||
a,
|
||||
qweight1,
|
||||
qweight2,
|
||||
@ -838,7 +837,7 @@ def test_fused_marlin_moe_with_bias(m):
|
||||
with set_current_vllm_config(vllm_config):
|
||||
torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, b_bias1, b_bias2)
|
||||
|
||||
marlin_output = fused_marlin_moe(
|
||||
marlin_output = torch.ops.vllm.fused_marlin_moe(
|
||||
a,
|
||||
qweight1,
|
||||
qweight2,
|
||||
|
||||
@ -58,6 +58,7 @@ def test_chatglm3_lora(chatglm3_lora_files):
|
||||
max_loras=4,
|
||||
max_lora_rank=64,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
|
||||
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
|
||||
@ -69,6 +70,7 @@ def test_chatglm3_lora(chatglm3_lora_files):
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_chatglm3_lora_tp4(chatglm3_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
@ -79,6 +81,7 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True,
|
||||
fully_sharded_loras=False,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
|
||||
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
|
||||
@ -90,6 +93,7 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
|
||||
# https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
|
||||
# gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
|
||||
@ -103,6 +107,7 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True,
|
||||
fully_sharded_loras=True,
|
||||
enable_chunked_prefill=True,
|
||||
gpu_memory_utilization=0.85,
|
||||
)
|
||||
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
|
||||
|
||||
@ -113,6 +113,7 @@ def test_llama_lora(sql_lora_files):
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora_tp4(sql_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
@ -126,6 +127,7 @@ def test_llama_lora_tp4(sql_lora_files):
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
@ -140,6 +142,7 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@create_new_process_for_each_test()
|
||||
def test_tp2_serialize_and_deserialize_lora(
|
||||
tmp_path, sql_lora_files, sql_lora_huggingface_id
|
||||
):
|
||||
|
||||
@ -8,7 +8,7 @@ from vllm.assets.image import ImageAsset
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
from ..utils import create_new_process_for_each_test
|
||||
|
||||
MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
|
||||
|
||||
@ -88,7 +88,7 @@ def test_minicpmv_lora(minicpmv_lora_files):
|
||||
current_platform.is_rocm(),
|
||||
reason="MiniCPM-V dependency xformers incompatible with ROCm",
|
||||
)
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
@ -112,7 +112,7 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
|
||||
current_platform.is_rocm(),
|
||||
reason="MiniCPM-V dependency xformers incompatible with ROCm",
|
||||
)
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
|
||||
@ -36,56 +36,55 @@ class Relu3(ReLUSquaredActivation):
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"env, torch_level, backend, ops_enabled, default_on",
|
||||
"env, torch_level, use_inductor, ops_enabled, default_on",
|
||||
[
|
||||
# Default values based on compile level
|
||||
# - All by default (no Inductor compilation)
|
||||
(None, 0, "eager", [True] * 4, True),
|
||||
(None, 1, "eager", [True] * 4, True),
|
||||
(None, 2, "eager", [True] * 4, True),
|
||||
(None, 3, "eager", [True] * 4, True),
|
||||
(None, 0, False, [True] * 4, True),
|
||||
(None, 1, True, [True] * 4, True),
|
||||
(None, 2, False, [True] * 4, True),
|
||||
# - None by default (with Inductor)
|
||||
(None, 0, "inductor", [True] * 4, True),
|
||||
# - None by default (with Inductor)
|
||||
(None, 1, "inductor", [False] * 4, False),
|
||||
(None, 2, "inductor", [False] * 4, False),
|
||||
(None, 3, "inductor", [False] * 4, False),
|
||||
(None, 3, True, [False] * 4, False),
|
||||
(None, 4, True, [False] * 4, False),
|
||||
# - All by default (without Inductor)
|
||||
(None, 3, False, [True] * 4, True),
|
||||
(None, 4, False, [True] * 4, True),
|
||||
# Explicitly enabling/disabling
|
||||
#
|
||||
# Default: all
|
||||
#
|
||||
# All but SiluAndMul
|
||||
("+rms_norm,-silu_and_mul", 0, "inductor", [1, 0, 1, 1], True),
|
||||
("+rms_norm,-silu_and_mul", 0, True, [1, 0, 1, 1], True),
|
||||
# Only ReLU3
|
||||
("none,-rms_norm,+relu3", 1, "eager", [0, 0, 0, 1], False),
|
||||
("none,-rms_norm,+relu3", 1, False, [0, 0, 0, 1], False),
|
||||
# All but SiluAndMul
|
||||
("all,-silu_and_mul", 2, "inductor", [1, 0, 1, 1], True),
|
||||
("all,-silu_and_mul", 2, True, [1, 0, 1, 1], True),
|
||||
# All but ReLU3 (even if ReLU2 is on)
|
||||
("-relu3,+relu2", 3, "eager", [1, 1, 1, 0], True),
|
||||
("-relu3,+relu2", 3, False, [1, 1, 1, 0], True),
|
||||
# RMSNorm and SiluAndMul
|
||||
("none,-relu3,+rms_norm,+silu_and_mul", 3, "eager", [1, 1, 0, 0], False),
|
||||
("none,-relu3,+rms_norm,+silu_and_mul", 4, False, [1, 1, 0, 0], False),
|
||||
# All but RMSNorm
|
||||
("-rms_norm", 3, "eager", [0, 1, 1, 1], True),
|
||||
("-rms_norm", 3, False, [0, 1, 1, 1], True),
|
||||
#
|
||||
# Default: none
|
||||
#
|
||||
# Only ReLU3
|
||||
("none,+relu3", 3, "inductor", [0, 0, 0, 1], False),
|
||||
("-silu_and_mul,+relu3", 3, True, [0, 0, 0, 1], False),
|
||||
# All but RMSNorm
|
||||
("all,-rms_norm", 3, "inductor", [0, 1, 1, 1], True),
|
||||
("all,-rms_norm", 4, True, [0, 1, 1, 1], True),
|
||||
],
|
||||
)
|
||||
def test_enabled_ops(
|
||||
env: str | None,
|
||||
torch_level: int,
|
||||
backend: str,
|
||||
use_inductor: bool,
|
||||
ops_enabled: list[int],
|
||||
default_on: bool,
|
||||
):
|
||||
custom_ops = env.split(",") if env else []
|
||||
vllm_config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
backend=backend, level=torch_level, custom_ops=custom_ops
|
||||
use_inductor=bool(use_inductor), level=torch_level, custom_ops=custom_ops
|
||||
)
|
||||
)
|
||||
with set_current_vllm_config(vllm_config):
|
||||
|
||||
@ -3,15 +3,12 @@
|
||||
# Adapted from https://huggingface.co/docs/transformers/perplexity
|
||||
from typing import cast
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
|
||||
import tests.ci_envs as ci_envs
|
||||
from tests.models.utils import (
|
||||
GenerateModelInfo,
|
||||
TokensTextLogprobsPromptLogprobs,
|
||||
get_vllm_extra_kwargs,
|
||||
)
|
||||
from tests.models.utils import GenerateModelInfo, TokensTextLogprobsPromptLogprobs
|
||||
from vllm.logprobs import Logprob
|
||||
|
||||
# See #24485
|
||||
@ -28,10 +25,27 @@ def wikitext_ppl_test(
|
||||
vllm_extra_kwargs=None,
|
||||
atol=PPL_TOL,
|
||||
):
|
||||
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
|
||||
# A model family has many models with the same architecture,
|
||||
# and we don't need to test each one.
|
||||
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
|
||||
pytest.skip("Skipping test.")
|
||||
|
||||
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
|
||||
|
||||
# Allow vllm to test using the given dtype, such as float32
|
||||
vllm_extra_kwargs = vllm_extra_kwargs or {}
|
||||
vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype
|
||||
|
||||
# Allow vllm to test using hf_overrides
|
||||
if model_info.hf_overrides is not None:
|
||||
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
|
||||
|
||||
# Allow changing the head dtype used by vllm in tests
|
||||
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
|
||||
if "hf_overrides" not in vllm_extra_kwargs:
|
||||
vllm_extra_kwargs["hf_overrides"] = {}
|
||||
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
|
||||
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
gpu_memory_utilization=0.7,
|
||||
|
||||
@ -1,47 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
["nie3e/sentiment-polish-gpt2-small"],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_classify_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.classify(example_prompts)
|
||||
|
||||
for head_dtype_str in ["float32", "model"]:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
hf_overrides={"head_dtype": head_dtype_str},
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
model_dtype = model_config.dtype
|
||||
head_dtype = model_config.head_dtype
|
||||
|
||||
if head_dtype_str == "float32":
|
||||
assert head_dtype == torch.float32
|
||||
elif head_dtype_str == "model":
|
||||
assert head_dtype == model_dtype
|
||||
|
||||
vllm_outputs = vllm_model.classify(example_prompts)
|
||||
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
hf_output = torch.tensor(hf_output).float()
|
||||
vllm_output = torch.tensor(vllm_output).float()
|
||||
|
||||
assert torch.allclose(hf_output, vllm_output, atol=1e-2)
|
||||
@ -3,6 +3,7 @@
|
||||
|
||||
import types
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -13,12 +14,11 @@ from vllm.model_executor.models.bert import (
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# Functional test: SPLADE formula correctness (no HF download needed)
|
||||
# 1) Functional test: SPLADE formula correctness (no HF download needed)
|
||||
# ---------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize("B,T,H,V", [(2, 3, 5, 7)])
|
||||
@torch.inference_mode
|
||||
def test_splade_pooler_matches_reference_formula(B, T, H, V):
|
||||
"""Ensure SPLADESparsePooler forward() matches the mathematical formula:
|
||||
log1p(relu(logits)) -> max over sequence length (after masking)."""
|
||||
@ -26,11 +26,9 @@ def test_splade_pooler_matches_reference_formula(B, T, H, V):
|
||||
|
||||
# Prepare [B] sequences of shape [T, H]
|
||||
hs_list = [torch.randn(T, H) for _ in range(B)]
|
||||
hs_tenser = torch.cat(hs_list)
|
||||
|
||||
# Simulate PoolingMetadata (only required fields)
|
||||
prompt_lens = [T, T - 1]
|
||||
prompt_lens_tenser = torch.tensor(prompt_lens, dtype=torch.int32)
|
||||
token_ids = torch.tensor(
|
||||
[
|
||||
[101, 5, 102], # Batch 0: [CLS], token, [SEP]
|
||||
@ -38,9 +36,7 @@ def test_splade_pooler_matches_reference_formula(B, T, H, V):
|
||||
],
|
||||
dtype=torch.long,
|
||||
)
|
||||
meta = types.SimpleNamespace(
|
||||
prompt_lens=prompt_lens_tenser, prompt_token_ids=token_ids
|
||||
)
|
||||
meta = types.SimpleNamespace(prompt_lens=prompt_lens, prompt_token_ids=token_ids)
|
||||
|
||||
# MLM head (prefer BertMLMHead, fallback to Linear if unavailable)
|
||||
try:
|
||||
@ -50,10 +46,10 @@ def test_splade_pooler_matches_reference_formula(B, T, H, V):
|
||||
|
||||
# Forward pass through SPLADE pooler
|
||||
pooler = SPLADESparsePooler(mlm_head=mlm_head, pooling="max", remove_cls_sep=True)
|
||||
pooled = pooler(hidden_states=hs_tenser, pooling_metadata=meta) # list of [V]
|
||||
pooled = pooler(hidden_states=hs_list, pooling_metadata=meta) # list of [V]
|
||||
|
||||
# Basic output checks
|
||||
assert isinstance(pooled, torch.Tensor) and len(pooled) == B
|
||||
assert isinstance(pooled, list) and len(pooled) == B
|
||||
for vec in pooled:
|
||||
assert vec.shape == (V,)
|
||||
assert torch.isfinite(vec).all()
|
||||
@ -87,3 +83,40 @@ def test_splade_pooler_matches_reference_formula(B, T, H, V):
|
||||
rtol=1e-4,
|
||||
atol=1e-4,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# 2) Integration smoke test: end-to-end embedding path wiring
|
||||
# ---------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.cpu_model
|
||||
def test_bert_splade_sparse_embed_smoke(vllm_runner, monkeypatch):
|
||||
"""Ensure BertSpladeSparseEmbeddingModel loads and produces sparse embeddings."""
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
MODEL_ID = "hf-internal-testing/tiny-random-bert"
|
||||
hf_overrides = {"architectures": ["BertSpladeSparseEmbeddingModel"]}
|
||||
|
||||
# Enforce CPU-only execution (optional)
|
||||
monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "")
|
||||
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
|
||||
|
||||
tok = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
vocab_size = tok.vocab_size
|
||||
|
||||
# The embed path should route through SPLADESparsePooler
|
||||
with vllm_runner(
|
||||
MODEL_ID,
|
||||
runner="pooling",
|
||||
max_model_len=64,
|
||||
hf_overrides=hf_overrides,
|
||||
) as vm:
|
||||
outs = vm.embed(["hello world", "splade sparse test"])
|
||||
|
||||
# Basic sanity checks
|
||||
assert len(outs) == 2
|
||||
assert outs[0].shape[0] == vocab_size
|
||||
assert outs[1].shape[0] == vocab_size
|
||||
assert np.isfinite(outs[0]).all() and (outs[0] >= 0).all()
|
||||
assert np.isfinite(outs[1]).all() and (outs[1] >= 0).all()
|
||||
|
||||
@ -6,16 +6,12 @@ from collections.abc import Sequence
|
||||
|
||||
import mteb
|
||||
import numpy as np
|
||||
import pytest
|
||||
import requests
|
||||
import torch
|
||||
|
||||
import tests.ci_envs as ci_envs
|
||||
from tests.models.utils import (
|
||||
EmbedModelInfo,
|
||||
RerankModelInfo,
|
||||
check_embeddings_close,
|
||||
get_vllm_extra_kwargs,
|
||||
)
|
||||
from tests.models.utils import EmbedModelInfo, RerankModelInfo, check_embeddings_close
|
||||
|
||||
# Most embedding models on the STS12 task (See #17175):
|
||||
# - Model implementation and minor changes in tensor dtype
|
||||
@ -169,11 +165,28 @@ def mteb_test_embed_models(
|
||||
hf_model_callback=None,
|
||||
atol=MTEB_EMBED_TOL,
|
||||
):
|
||||
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
|
||||
# A model family has many models with the same architecture,
|
||||
# and we don't need to test each one.
|
||||
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
|
||||
pytest.skip("Skipping test.")
|
||||
|
||||
# Test embed_dims, isnan and whether to use normalize
|
||||
example_prompts = ["The chef prepared a delicious meal." * 1000]
|
||||
|
||||
# Allow vllm to test using the given dtype, such as float32
|
||||
vllm_extra_kwargs = vllm_extra_kwargs or {}
|
||||
vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype
|
||||
|
||||
# Allow vllm to test using hf_overrides
|
||||
if model_info.hf_overrides is not None:
|
||||
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
|
||||
|
||||
# Allow changing the head dtype used by vllm in tests
|
||||
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
|
||||
if "hf_overrides" not in vllm_extra_kwargs:
|
||||
vllm_extra_kwargs["hf_overrides"] = {}
|
||||
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
|
||||
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
runner="pooling",
|
||||
@ -199,12 +212,9 @@ def mteb_test_embed_models(
|
||||
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
|
||||
head_dtype = model_config.head_dtype
|
||||
|
||||
# Test embedding_size, isnan and whether to use normalize
|
||||
# Test embed_dims, isnan and whether to use normalize
|
||||
vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
|
||||
outputs_tensor = torch.tensor(vllm_outputs)
|
||||
assert not torch.any(torch.isnan(outputs_tensor))
|
||||
embedding_size = model_config.embedding_size
|
||||
assert torch.tensor(vllm_outputs).shape[-1] == embedding_size
|
||||
assert not torch.any(torch.isnan(torch.tensor(vllm_outputs)))
|
||||
|
||||
# Accelerate mteb test by setting
|
||||
# SentenceTransformers mteb score to a constant
|
||||
@ -221,7 +231,7 @@ def mteb_test_embed_models(
|
||||
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
|
||||
st_dtype = next(hf_model.model.parameters()).dtype
|
||||
|
||||
# Check embeddings close to hf outputs
|
||||
# Test embed_dims and whether to use normalize
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
@ -313,7 +323,24 @@ def mteb_test_rerank_models(
|
||||
vllm_mteb_encoder=VllmMtebEncoder,
|
||||
atol=MTEB_RERANK_TOL,
|
||||
):
|
||||
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
|
||||
# A model family has many models with the same architecture,
|
||||
# and we don't need to test each one.
|
||||
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
|
||||
pytest.skip("Skipping test.")
|
||||
|
||||
# Allow vllm to test using the given dtype, such as float32
|
||||
vllm_extra_kwargs = vllm_extra_kwargs or {}
|
||||
vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype
|
||||
|
||||
# Allow vllm to test using hf_overrides
|
||||
if model_info.hf_overrides is not None:
|
||||
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
|
||||
|
||||
# Allow changing the head dtype used by vllm in tests
|
||||
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
|
||||
if "hf_overrides" not in vllm_extra_kwargs:
|
||||
vllm_extra_kwargs["hf_overrides"] = {}
|
||||
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
|
||||
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
|
||||
@ -25,6 +25,10 @@ EMBEDDING_MODELS = [
|
||||
mteb_score=0.824413164,
|
||||
architecture="XLMRobertaModel",
|
||||
is_matryoshka=True,
|
||||
# The default max length of the model is 8194, which will crash
|
||||
# CUDAGraph due to odd length for Gemm. We set it to 8192 to avoid
|
||||
# avoid this issue.
|
||||
max_model_len=8192,
|
||||
dtype="float32",
|
||||
)
|
||||
]
|
||||
|
||||
@ -707,6 +707,8 @@ VLM_TEST_SETTINGS = {
|
||||
max_num_seqs=2,
|
||||
vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
|
||||
prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
|
||||
# FIXME: https://github.com/huggingface/transformers/issues/38358
|
||||
marks=[pytest.mark.skip("Model initialization fails")],
|
||||
),
|
||||
"qwen2_vl": VLMTestInfo(
|
||||
models=["Qwen/Qwen2-VL-2B-Instruct"],
|
||||
|
||||
@ -76,6 +76,9 @@ class _HfExamplesInfo:
|
||||
trust_remote_code: bool = False
|
||||
"""The ``trust_remote_code`` level required to load the model."""
|
||||
|
||||
v0_only: bool = False
|
||||
"""The model is only available with the vLLM V0 engine."""
|
||||
|
||||
hf_overrides: dict[str, Any] = field(default_factory=dict)
|
||||
"""The ``hf_overrides`` required to load the model."""
|
||||
|
||||
@ -262,10 +265,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}),
|
||||
"GPTBigCodeForCausalLM": _HfExamplesInfo(
|
||||
"bigcode/starcoder",
|
||||
extras={
|
||||
"tiny": "bigcode/tiny_starcoder_py",
|
||||
"santacoder": "bigcode/gpt_bigcode-santacoder",
|
||||
},
|
||||
extras={"tiny": "bigcode/tiny_starcoder_py"},
|
||||
min_transformers_version="4.55.1",
|
||||
transformers_version_reason="HF model broken in 4.55.0",
|
||||
),
|
||||
@ -694,6 +694,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo(
|
||||
"MiniMaxAI/MiniMax-VL-01",
|
||||
trust_remote_code=True,
|
||||
v0_only=True,
|
||||
),
|
||||
"Mistral3ForConditionalGeneration": _HfExamplesInfo(
|
||||
"mistralai/Mistral-Small-3.1-24B-Instruct-2503",
|
||||
@ -751,8 +752,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"Qwen/Qwen-VL",
|
||||
extras={"chat": "Qwen/Qwen-VL-Chat"},
|
||||
trust_remote_code=True,
|
||||
max_transformers_version="4.53.3",
|
||||
transformers_version_reason="Use of deprecated imports which have been removed.", # noqa: E501
|
||||
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
|
||||
),
|
||||
"Qwen2AudioForConditionalGeneration": _HfExamplesInfo(
|
||||
|
||||
@ -88,15 +88,13 @@ def can_initialize(
|
||||
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
|
||||
return 1, 0, scheduler_kv_cache_config
|
||||
|
||||
if model_arch == "MiniMaxVL01ForConditionalGeneration":
|
||||
pytest.skip(
|
||||
"pickle error when loading `transformers.models.auto.CONFIG_MAPPING`"
|
||||
)
|
||||
|
||||
with (
|
||||
patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1),
|
||||
monkeypatch.context() as m,
|
||||
):
|
||||
if model_info.v0_only:
|
||||
# NOTE(woosuk): skip the test for V0-only models
|
||||
return
|
||||
if model_arch == "GptOssForCausalLM":
|
||||
# FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
|
||||
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
|
||||
@ -134,6 +132,8 @@ def can_initialize(
|
||||
@pytest.mark.parametrize("model_arch", MINIMAL_MODEL_ARCH_LIST)
|
||||
def test_can_initialize_small_subset(model_arch: str, monkeypatch: pytest.MonkeyPatch):
|
||||
"""Test initializing small subset of supported models"""
|
||||
if model_arch == "Lfm2ForCausalLM":
|
||||
pytest.skip("Skipping until test supports V1-only models")
|
||||
can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)
|
||||
|
||||
|
||||
@ -144,6 +144,8 @@ def test_can_initialize_large_subset(model_arch: str, monkeypatch: pytest.Monkey
|
||||
This test covers the complement of the tests covered in the "small subset"
|
||||
test.
|
||||
"""
|
||||
if model_arch == "Lfm2ForCausalLM":
|
||||
pytest.skip("Skipping until test supports V1-only models")
|
||||
can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)
|
||||
|
||||
|
||||
|
||||
@ -15,7 +15,6 @@ from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
|
||||
from vllm.multimodal.processing import InputProcessingContext
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
|
||||
from .. import ci_envs
|
||||
from .registry import HF_EXAMPLE_MODELS
|
||||
|
||||
TokensText = tuple[list[int], str]
|
||||
@ -415,35 +414,6 @@ class GenerateModelInfo(ModelInfo):
|
||||
hf_ppl: float | None = None
|
||||
|
||||
|
||||
def get_vllm_extra_kwargs(model_info: ModelInfo, vllm_extra_kwargs):
|
||||
# A model family has many models with the same architecture,
|
||||
# and we don't need to test each one.
|
||||
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
|
||||
import pytest
|
||||
|
||||
pytest.skip("Skipping test.")
|
||||
|
||||
# Allow vllm to test using the given dtype, such as float32
|
||||
vllm_extra_kwargs = vllm_extra_kwargs or {}
|
||||
vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype
|
||||
|
||||
# Allow vllm to test using hf_overrides
|
||||
if model_info.hf_overrides is not None:
|
||||
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
|
||||
|
||||
# Allow changing the head dtype used by vllm in tests
|
||||
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
|
||||
if "hf_overrides" not in vllm_extra_kwargs:
|
||||
vllm_extra_kwargs["hf_overrides"] = {}
|
||||
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
|
||||
|
||||
# Allow control over whether tests use enforce_eager
|
||||
if ci_envs.VLLM_CI_ENFORCE_EAGER is not None:
|
||||
vllm_extra_kwargs["enforce_eager"] = ci_envs.VLLM_CI_ENFORCE_EAGER
|
||||
|
||||
return vllm_extra_kwargs
|
||||
|
||||
|
||||
def dummy_hf_overrides(
|
||||
hf_config: PretrainedConfig,
|
||||
*,
|
||||
|
||||
@ -1,124 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from tests.reasoning.utils import run_reasoning_extraction
|
||||
from vllm.reasoning import ReasoningParser, ReasoningParserManager
|
||||
|
||||
parser_name = "ernie45"
|
||||
|
||||
REASONING_MODEL_NAME = "baidu/ERNIE-4.5-21B-A3B-Thinking"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def ernie45_tokenizer():
|
||||
return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
|
||||
|
||||
|
||||
# 带 </think>,非stream
|
||||
WITH_THINK = {
|
||||
"output": "abc</think>def",
|
||||
"reasoning_content": "abc",
|
||||
"content": "def",
|
||||
}
|
||||
# 带 </think>,stream
|
||||
WITH_THINK_STREAM = {
|
||||
"output": "abc</think>def",
|
||||
"reasoning_content": "abc",
|
||||
"content": "def",
|
||||
}
|
||||
# without </think>, all is reasoning_content
|
||||
WITHOUT_THINK = {
|
||||
"output": "abc",
|
||||
"reasoning_content": "abc",
|
||||
"content": None,
|
||||
}
|
||||
# without </think>, all is reasoning_content
|
||||
WITHOUT_THINK_STREAM = {
|
||||
"output": "abc",
|
||||
"reasoning_content": "abc",
|
||||
"content": None,
|
||||
}
|
||||
|
||||
COMPLETE_REASONING = {
|
||||
"output": "abc</think>",
|
||||
"reasoning_content": "abc",
|
||||
"content": None,
|
||||
}
|
||||
MULTILINE_REASONING = {
|
||||
"output": "abc\nABC</think>def\nDEF",
|
||||
"reasoning_content": "abc\nABC",
|
||||
"content": "def\nDEF",
|
||||
}
|
||||
|
||||
TEST_CASES = [
|
||||
pytest.param(
|
||||
False,
|
||||
WITH_THINK,
|
||||
id="with_think",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
WITH_THINK_STREAM,
|
||||
id="with_think_stream",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
WITHOUT_THINK,
|
||||
id="without_think",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
WITHOUT_THINK_STREAM,
|
||||
id="without_think_stream",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
COMPLETE_REASONING,
|
||||
id="complete_reasoning",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
COMPLETE_REASONING,
|
||||
id="complete_reasoning_stream",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
MULTILINE_REASONING,
|
||||
id="multiline_reasoning",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
MULTILINE_REASONING,
|
||||
id="multiline_reasoning_stream",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
|
||||
def test_reasoning(
|
||||
streaming: bool,
|
||||
param_dict: dict,
|
||||
ernie45_tokenizer,
|
||||
):
|
||||
output = ernie45_tokenizer.tokenize(param_dict["output"])
|
||||
output_tokens: list[str] = []
|
||||
for token in output:
|
||||
one_token = ernie45_tokenizer.convert_tokens_to_string([token])
|
||||
if one_token:
|
||||
output_tokens.append(one_token)
|
||||
|
||||
parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
|
||||
ernie45_tokenizer
|
||||
)
|
||||
|
||||
reasoning, content = run_reasoning_extraction(
|
||||
parser, output_tokens, streaming=streaming
|
||||
)
|
||||
|
||||
print()
|
||||
|
||||
assert reasoning == param_dict["reasoning_content"]
|
||||
assert content == param_dict["content"]
|
||||
@ -1,359 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# ruff: noqa: E501
|
||||
|
||||
import json
|
||||
from collections.abc import Generator
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
DeltaMessage,
|
||||
FunctionCall,
|
||||
ToolCall,
|
||||
)
|
||||
from vllm.entrypoints.openai.tool_parsers import Ernie45ToolParser
|
||||
from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
|
||||
|
||||
# Use a common model that is likely to be available
|
||||
MODEL = "baidu/ERNIE-4.5-21B-A3B-Thinking"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def ernie45_tokenizer():
|
||||
return get_tokenizer(tokenizer_name=MODEL, trust_remote_code=True)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ernie45_tool_parser(ernie45_tokenizer):
|
||||
return Ernie45ToolParser(ernie45_tokenizer)
|
||||
|
||||
|
||||
def assert_tool_calls(
|
||||
actual_tool_calls: list[ToolCall], expected_tool_calls: list[ToolCall]
|
||||
):
|
||||
assert len(actual_tool_calls) == len(expected_tool_calls)
|
||||
|
||||
for actual_tool_call, expected_tool_call in zip(
|
||||
actual_tool_calls, expected_tool_calls
|
||||
):
|
||||
assert isinstance(actual_tool_call.id, str)
|
||||
assert len(actual_tool_call.id) > 0
|
||||
|
||||
assert actual_tool_call.type == "function"
|
||||
assert actual_tool_call.function.name == expected_tool_call.function.name
|
||||
# Compare arguments as JSON objects to handle formatting differences
|
||||
actual_args = json.loads(actual_tool_call.function.arguments)
|
||||
expected_args = json.loads(expected_tool_call.function.arguments)
|
||||
assert actual_args == expected_args
|
||||
|
||||
|
||||
def test_extract_tool_calls_no_tools(ernie45_tool_parser):
|
||||
model_output = "This is a test"
|
||||
extracted_tool_calls = ernie45_tool_parser.extract_tool_calls(
|
||||
model_output, request=None
|
||||
) # type: ignore[arg-type]
|
||||
assert not extracted_tool_calls.tools_called
|
||||
assert extracted_tool_calls.tool_calls == []
|
||||
assert extracted_tool_calls.content == model_output
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
ids=[
|
||||
"single_tool_call",
|
||||
"multiple_tool_calls",
|
||||
"tool_call_with_content_before",
|
||||
],
|
||||
argnames=["model_output", "expected_tool_calls", "expected_content"],
|
||||
argvalues=[
|
||||
(
|
||||
"""<tool_call>
|
||||
{"name": "get_current_temperature", "arguments": {"location": "Beijing"}}
|
||||
</tool_call>
|
||||
""",
|
||||
[
|
||||
ToolCall(
|
||||
function=FunctionCall(
|
||||
name="get_current_temperature",
|
||||
arguments=json.dumps(
|
||||
{
|
||||
"location": "Beijing",
|
||||
}
|
||||
),
|
||||
)
|
||||
)
|
||||
],
|
||||
None,
|
||||
),
|
||||
(
|
||||
"""<tool_call>
|
||||
{"name": "get_current_temperature", "arguments": {"location": "Beijing"}}
|
||||
</tool_call>
|
||||
<tool_call>
|
||||
{"name": "get_temperature_unit", "arguments": {"location": "Guangzhou", "unit": "c"}}
|
||||
</tool_call>
|
||||
""",
|
||||
[
|
||||
ToolCall(
|
||||
function=FunctionCall(
|
||||
name="get_current_temperature",
|
||||
arguments=json.dumps(
|
||||
{
|
||||
"location": "Beijing",
|
||||
}
|
||||
),
|
||||
)
|
||||
),
|
||||
ToolCall(
|
||||
function=FunctionCall(
|
||||
name="get_temperature_unit",
|
||||
arguments=json.dumps(
|
||||
{
|
||||
"location": "Guangzhou",
|
||||
"unit": "c",
|
||||
}
|
||||
),
|
||||
)
|
||||
),
|
||||
],
|
||||
None,
|
||||
),
|
||||
(
|
||||
"""I need to call two tools to handle these two issues separately.
|
||||
</think>
|
||||
|
||||
<tool_call>
|
||||
{"name": "get_current_temperature", "arguments": {"location": "Beijing"}}
|
||||
</tool_call>
|
||||
<tool_call>
|
||||
{"name": "get_temperature_unit", "arguments": {"location": "Guangzhou", "unit": "c"}}
|
||||
</tool_call>
|
||||
""",
|
||||
[
|
||||
ToolCall(
|
||||
function=FunctionCall(
|
||||
name="get_current_temperature",
|
||||
arguments=json.dumps(
|
||||
{
|
||||
"location": "Beijing",
|
||||
}
|
||||
),
|
||||
)
|
||||
),
|
||||
ToolCall(
|
||||
function=FunctionCall(
|
||||
name="get_temperature_unit",
|
||||
arguments=json.dumps(
|
||||
{
|
||||
"location": "Guangzhou",
|
||||
"unit": "c",
|
||||
}
|
||||
),
|
||||
)
|
||||
),
|
||||
],
|
||||
"I need to call two tools to handle these two issues separately.\n</think>",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_extract_tool_calls(
|
||||
ernie45_tool_parser, model_output, expected_tool_calls, expected_content
|
||||
):
|
||||
extracted_tool_calls = ernie45_tool_parser.extract_tool_calls(
|
||||
model_output, request=None
|
||||
) # type: ignore[arg-type]
|
||||
assert extracted_tool_calls.tools_called
|
||||
|
||||
assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
|
||||
|
||||
assert extracted_tool_calls.content == expected_content
|
||||
|
||||
|
||||
def stream_delta_message_generator(
    ernie45_tool_parser: Ernie45ToolParser,
    ernie45_tokenizer: AnyTokenizer,
    model_output: str,
    request: ChatCompletionRequest | None = None,
) -> Generator[DeltaMessage, None, None]:
    """Feed ``model_output`` to the streaming parser one token at a time.

    The text is encoded without special tokens, then each token is
    incrementally detokenized and handed to
    ``extract_tool_calls_streaming`` together with the text / token ids seen
    so far. Every truthy delta message returned by the parser is yielded.
    """
    token_ids = ernie45_tokenizer.encode(model_output, add_special_tokens=False)

    # Incremental detokenization state carried from one step to the next.
    text_so_far = ""
    decoded_tokens = None
    prefix_offset = 0
    read_offset = 0

    for end, token_id in enumerate(token_ids, start=1):
        ids_before = token_ids[: end - 1]
        ids_through = token_ids[:end]

        fresh_tokens, delta_text, next_prefix_offset, next_read_offset = (
            detokenize_incrementally(
                tokenizer=ernie45_tokenizer,
                all_input_ids=ids_through,
                prev_tokens=decoded_tokens,
                prefix_offset=prefix_offset,
                read_offset=read_offset,
                skip_special_tokens=False,
                spaces_between_special_tokens=True,
            )
        )
        text_with_delta = text_so_far + delta_text

        delta_message = ernie45_tool_parser.extract_tool_calls_streaming(
            text_so_far,
            text_with_delta,
            delta_text,
            ids_before,
            ids_through,
            [token_id],
            request=request,
        )
        if delta_message:
            yield delta_message

        # Advance the state for the next token.
        text_so_far = text_with_delta
        if decoded_tokens:
            decoded_tokens = decoded_tokens + fresh_tokens
        else:
            decoded_tokens = fresh_tokens
        prefix_offset, read_offset = next_prefix_offset, next_read_offset
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
ids=[
|
||||
"single_tool_call",
|
||||
"multiple_tool_calls",
|
||||
"tool_call_with_content_before",
|
||||
],
|
||||
argnames=["model_output", "expected_tool_calls", "expected_content"],
|
||||
argvalues=[
|
||||
(
|
||||
"""<tool_call>
|
||||
{"name": "get_current_temperature", "arguments": {"location": "Beijing"}}
|
||||
</tool_call>
|
||||
""",
|
||||
[
|
||||
ToolCall(
|
||||
function=FunctionCall(
|
||||
name="get_current_temperature",
|
||||
arguments=json.dumps(
|
||||
{
|
||||
"location": "Beijing",
|
||||
}
|
||||
),
|
||||
)
|
||||
)
|
||||
],
|
||||
None,
|
||||
),
|
||||
(
|
||||
"""<tool_call>
|
||||
{"name": "get_current_temperature", "arguments": {"location": "Beijing"}}
|
||||
</tool_call>
|
||||
<tool_call>
|
||||
{"name": "get_temperature_unit", "arguments": {"location": "Guangzhou", "unit": "c"}}
|
||||
</tool_call>
|
||||
""",
|
||||
[
|
||||
ToolCall(
|
||||
function=FunctionCall(
|
||||
name="get_current_temperature",
|
||||
arguments=json.dumps(
|
||||
{
|
||||
"location": "Beijing",
|
||||
}
|
||||
),
|
||||
)
|
||||
),
|
||||
ToolCall(
|
||||
function=FunctionCall(
|
||||
name="get_temperature_unit",
|
||||
arguments=json.dumps(
|
||||
{
|
||||
"location": "Guangzhou",
|
||||
"unit": "c",
|
||||
}
|
||||
),
|
||||
)
|
||||
),
|
||||
],
|
||||
None,
|
||||
),
|
||||
(
|
||||
"""I need to call two tools to handle these two issues separately.
|
||||
</think>
|
||||
|
||||
<tool_call>
|
||||
{"name": "get_current_temperature", "arguments": {"location": "Beijing"}}
|
||||
</tool_call>
|
||||
<tool_call>
|
||||
{"name": "get_temperature_unit", "arguments": {"location": "Guangzhou", "unit": "c"}}
|
||||
</tool_call>
|
||||
""",
|
||||
[
|
||||
ToolCall(
|
||||
function=FunctionCall(
|
||||
name="get_current_temperature",
|
||||
arguments=json.dumps(
|
||||
{
|
||||
"location": "Beijing",
|
||||
}
|
||||
),
|
||||
)
|
||||
),
|
||||
ToolCall(
|
||||
function=FunctionCall(
|
||||
name="get_temperature_unit",
|
||||
arguments=json.dumps(
|
||||
{
|
||||
"location": "Guangzhou",
|
||||
"unit": "c",
|
||||
}
|
||||
),
|
||||
)
|
||||
),
|
||||
],
|
||||
"I need to call two tools to handle these two issues separately.\n</think>",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_extract_tool_calls_streaming_incremental(
    ernie45_tool_parser,
    ernie45_tokenizer,
    model_output,
    expected_tool_calls,
    expected_content,
):
    """Stream ``model_output`` token by token and check the merged tool calls.

    Tool-call deltas are merged per ``index`` by concatenating their argument
    fragments; the merged calls are then compared with
    ``expected_tool_calls``. ``expected_content`` is part of the
    parametrization but is not asserted in this test.
    """
    request = ChatCompletionRequest(model=MODEL, messages=[], tools=[])
    deltas = stream_delta_message_generator(
        ernie45_tool_parser, ernie45_tokenizer, model_output, request
    )

    merged_by_index = {}
    for delta in deltas:
        carries_data = (
            delta.role is not None
            or delta.content is not None
            or delta.reasoning_content is not None
            or len(delta.tool_calls) != 0
        )
        if not carries_data:
            continue

        for chunk in delta.tool_calls:
            if chunk.index in merged_by_index:
                # Later fragment of a known call: append its arguments.
                merged_by_index[chunk.index].function.arguments += (
                    chunk.function.arguments
                )
                continue
            # First fragment of a call: normalize missing arguments to "" so
            # later fragments can be concatenated onto it.
            if chunk.function.arguments is None:
                chunk.function.arguments = ""
            merged_by_index[chunk.index] = chunk

    actual_tool_calls = list(merged_by_index.values())

    assert len(actual_tool_calls) > 0
    # check tool call format
    assert_tool_calls(actual_tool_calls, expected_tool_calls)
|
||||
@ -157,7 +157,7 @@ class RemoteOpenAIServer:
|
||||
self.host = None
|
||||
self.port = None
|
||||
else:
|
||||
self.host = str(args.host or "127.0.0.1")
|
||||
self.host = str(args.host or "localhost")
|
||||
self.port = int(args.port)
|
||||
|
||||
self.show_hidden_metrics = args.show_hidden_metrics_for_version is not None
|
||||
|
||||
@ -1,167 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch.nn as nn
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
prompt = """
|
||||
Generals gathered in their masses
|
||||
Just like witches at black masses
|
||||
Evil minds that plot destruction
|
||||
Sorcerer of death's construction
|
||||
In the fields, the bodies burning
|
||||
As the war machine keeps turning
|
||||
Death and hatred to mankind
|
||||
Poisoning their brainwashed minds
|
||||
Oh, Lord, yeah
|
||||
|
||||
Politicians hide themselves away
|
||||
They only started the war
|
||||
Why should they go out to fight?
|
||||
They leave that all to the poor, yeah
|
||||
Time will tell on their power minds
|
||||
Making war just for fun
|
||||
Treating people just like pawns in chess
|
||||
Wait till their judgment day comes, yeah
|
||||
|
||||
Now, in darkness, world stops turning
|
||||
Ashes where their bodies burning
|
||||
No more war pigs have the power
|
||||
Hand of God has struck the hour
|
||||
Day of Judgment, God is calling
|
||||
On their knees, the war pigs crawling
|
||||
Begging mercies for their sins
|
||||
Satan, laughing, spreads his wings
|
||||
Oh, Lord, yeah
|
||||
"""
|
||||
|
||||
|
||||
class WrapperPooler(nn.Module):
    """Pass-through wrapper around a pooler that records its input sizes.

    Each ``forward`` call appends ``hidden_states.shape[0]`` to
    ``self.chunks`` and then delegates to the wrapped pooler.
    """

    def __init__(self, pooler):
        super().__init__()
        # First-dimension size of every ``hidden_states`` seen by ``forward``.
        self.chunks = []
        self.pooler = pooler

    def get_pooling_updates(self, task):
        """Delegate to the wrapped pooler unchanged."""
        return self.pooler.get_pooling_updates(task)

    def forward(self, hidden_states, pooling_metadata):
        """Record the leading dimension of ``hidden_states``, then delegate."""
        leading_dim = hidden_states.shape[0]
        self.chunks.append(leading_dim)
        return self.pooler(hidden_states, pooling_metadata)
|
||||
|
||||
|
||||
def inject_pooler(self):
    """Swap the model's pooler for a ``WrapperPooler`` wrapping the original.

    ``self`` is the object this function is executed on; it only needs to
    provide ``get_model()``.
    """
    model = self.get_model()
    model.pooler = WrapperPooler(model.pooler)
|
||||
|
||||
|
||||
def retrieve_chunks(self):
    """Return the chunk sizes recorded on the model's pooler and reset them.

    ``self`` only needs to provide ``get_model()``; the returned model's
    ``pooler`` must expose a ``chunks`` attribute.
    """
    pooler = self.get_model().pooler
    recorded, pooler.chunks = pooler.chunks, []
    return recorded
|
||||
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available")
def test_pooling_chunked_prefill(vllm_runner, monkeypatch):
    """Check the chunk sizes seen by the pooler with and without chunked prefill.

    With ``long_prefill_token_threshold=chunk_size`` and
    ``enable_chunked_prefill=True`` the recorded sizes must equal the prompt
    length split into ``chunk_size`` pieces (plus a shorter remainder, if
    any). Without those two options the pooler must be called exactly once
    with the whole prompt.
    """
    model_id = "Qwen/Qwen3-Embedding-0.6B"
    chunk_size = 10
    # Runner options shared by both engine configurations below.
    shared_kwargs = {
        "runner": "pooling",
        "tensor_parallel_size": 1,
        "enforce_eager": True,
    }

    with monkeypatch.context() as mp:
        # NOTE(review): presumably needed so the helper functions can be sent
        # through collective_rpc — confirm.
        mp.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")

        # Force chunked prefill with a small per-step token threshold.
        with vllm_runner(
            model_id,
            long_prefill_token_threshold=chunk_size,
            enable_chunked_prefill=True,
            **shared_kwargs,
        ) as llm:
            llm.get_llm().llm_engine.collective_rpc(inject_pooler)

            tokenizer = llm.get_llm().get_tokenizer()
            prompt_len = len(tokenizer(prompt)["input_ids"])
            full_chunks, last_chunk = divmod(prompt_len, chunk_size)
            expected_chunks = [chunk_size] * full_chunks + (
                [last_chunk] if last_chunk else []
            )

            llm.embed([prompt])
            chunks = llm.get_llm().llm_engine.collective_rpc(retrieve_chunks)[0]

            # WrapperPooler must have been called once per chunk.
            assert len(chunks) > 1
            assert chunks == expected_chunks

        # Same model without the chunked-prefill options.
        with vllm_runner(model_id, **shared_kwargs) as llm:
            llm.get_llm().llm_engine.collective_rpc(inject_pooler)
            llm.embed([prompt])
            chunks = llm.get_llm().llm_engine.collective_rpc(retrieve_chunks)[0]

            # WrapperPooler must have been called once with the full prompt.
            assert len(chunks) == 1
            assert chunks[0] == prompt_len
|
||||
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available")
def test_pooling_prefix_cache(vllm_runner, monkeypatch):
    """Test prefix caching for pooling models.

    Two prompts sharing the same first verse are embedded one after the
    other with ``enable_prefix_caching=True``. The first prompt must reach
    the pooler in full; for the second, fewer tokens than its length must
    reach the pooler, and the number of skipped tokens must be a multiple of
    the cache block size.
    """
    verses = prompt.split("\n\n")

    with monkeypatch.context() as m:
        m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
        model_id = "Qwen/Qwen3-Embedding-0.6B"

        with vllm_runner(
            model_id,
            runner="pooling",
            enable_prefix_caching=True,
            tensor_parallel_size=1,
            enforce_eager=True,
        ) as llm:
            llm.get_llm().llm_engine.collective_rpc(inject_pooler)
            tokenizer = llm.get_llm().get_tokenizer()

            # Both prompts start with the same verse and differ afterwards.
            prompt1 = "\n\n".join([verses[0], verses[1]])
            prompt2 = "\n\n".join([verses[0], verses[2]])
            prompt1_len = len(tokenizer(prompt1)["input_ids"])
            prompt2_len = len(tokenizer(prompt2)["input_ids"])

            # First prompt: the pooler sees every token.
            llm.embed([prompt1])
            chunks = llm.get_llm().llm_engine.collective_rpc(retrieve_chunks)[0]

            assert len(chunks) == 1
            assert chunks[0] == prompt1_len

            # Second prompt: part of it is expected to be skipped.
            llm.embed([prompt2])
            chunks = llm.get_llm().llm_engine.collective_rpc(retrieve_chunks)[0]

            assert len(chunks) == 1
            assert chunks[0] <= prompt1_len
            assert chunks[0] < prompt2_len

            cache_config = llm.get_llm().llm_engine.cache_config
            # Prefixes are cached in blocks, so the skipped token count must
            # be a whole number of blocks. The config is reported only on
            # failure instead of being printed unconditionally.
            assert (prompt2_len - chunks[0]) % cache_config.block_size == 0, (
                f"{cache_config=}"
            )
|
||||
@ -17,12 +17,7 @@ from vllm.platforms import current_platform
|
||||
from vllm.sampling_params import RequestOutputKind
|
||||
from vllm.utils import set_default_torch_num_threads
|
||||
from vllm.v1.engine.async_llm import AsyncLLM
|
||||
from vllm.v1.metrics.loggers import (
|
||||
AggregatedLoggingStatLogger,
|
||||
LoggingStatLogger,
|
||||
PerEngineStatLoggerAdapter,
|
||||
PrometheusStatLogger,
|
||||
)
|
||||
from vllm.v1.metrics.loggers import LoggingStatLogger
|
||||
|
||||
if not current_platform.is_cuda():
|
||||
pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True)
|
||||
@ -389,12 +384,6 @@ class MockLoggingStatLogger(LoggingStatLogger):
|
||||
self.log = MagicMock()
|
||||
|
||||
|
||||
class MockAggregatedStatLogger(AggregatedLoggingStatLogger):
    """``AggregatedLoggingStatLogger`` whose ``log`` is replaced by a mock."""

    def __init__(self, vllm_config: VllmConfig, engine_indexes: list[int]):
        super().__init__(vllm_config, engine_indexes)
        # Replace ``log`` so tests can assert on how it was called.
        self.log = MagicMock()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_customize_loggers(monkeypatch):
|
||||
"""Test that we can customize the loggers.
|
||||
@ -412,45 +401,10 @@ async def test_customize_loggers(monkeypatch):
|
||||
|
||||
await engine.do_log_stats()
|
||||
|
||||
stat_loggers = engine.logger_manager.stat_loggers
|
||||
assert (
|
||||
len(stat_loggers) == 3
|
||||
) # MockLoggingStatLogger + LoggingStatLogger + Prometheus Logger
|
||||
print(f"{stat_loggers=}")
|
||||
stat_loggers[0].per_engine_stat_loggers[0].log.assert_called_once()
|
||||
assert isinstance(stat_loggers[1], PerEngineStatLoggerAdapter)
|
||||
assert isinstance(stat_loggers[1].per_engine_stat_loggers[0], LoggingStatLogger)
|
||||
assert isinstance(stat_loggers[2], PrometheusStatLogger)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_customize_aggregated_loggers(monkeypatch):
    """Test that custom aggregated loggers are added to the default ones.

    Loggers passed via ``stat_loggers`` at construction time must appear in
    the logger manager in front of the default loggers, and each custom
    logger's ``log`` must have been called once after ``do_log_stats()``.
    """

    with monkeypatch.context() as mp, ExitStack() as cleanup:
        mp.setenv("VLLM_USE_V1", "1")

        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(
                TEXT_ENGINE_ARGS,
                stat_loggers=[MockLoggingStatLogger, MockAggregatedStatLogger],
            )
        cleanup.callback(engine.shutdown)

        await engine.do_log_stats()

        stat_loggers = engine.logger_manager.stat_loggers
        # MockLoggingStatLogger + MockAggregatedStatLogger
        # + LoggingStatLogger + PrometheusStatLogger
        assert len(stat_loggers) == 4
        custom_per_engine, custom_aggregated, default_logging, prometheus = (
            stat_loggers
        )

        custom_per_engine.per_engine_stat_loggers[0].log.assert_called_once()
        custom_aggregated.log.assert_called_once()
        assert isinstance(default_logging, PerEngineStatLoggerAdapter)
        assert isinstance(
            default_logging.per_engine_stat_loggers[0], LoggingStatLogger
        )
        assert isinstance(prometheus, PrometheusStatLogger)
|
||||
stat_loggers = engine.logger_manager.per_engine_logger_dict
|
||||
assert len(stat_loggers) == 1
|
||||
assert len(stat_loggers[0]) == 2 # LoggingStatLogger + MockLoggingStatLogger
|
||||
stat_loggers[0][0].log.assert_called_once()
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
|
||||
@ -76,21 +76,18 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle():
|
||||
seed.
|
||||
- Keep max_tokens and max_model_len bounded for speed and memory use.
|
||||
"""
|
||||
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
|
||||
random.seed(seed)
|
||||
random.seed(12345)
|
||||
|
||||
# Allow overrides from environment (useful for CI tuning)
|
||||
# "facebook/opt-125m" is too small, doesn't reliably test determinism
|
||||
model = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
|
||||
num_trials = int(os.getenv("VLLM_NEEDLE_TRIALS", "5"))
|
||||
max_batch_size = int(os.getenv("VLLM_NEEDLE_BATCH_SIZE", "128"))
|
||||
min_random_prompt = int(os.getenv("VLLM_MIN_PROMPT", "1024"))
|
||||
max_random_prompt = int(os.getenv("VLLM_MAX_PROMPT", "2048"))
|
||||
assert max_batch_size >= 2, "Batch size should be >= 2 to mix needle."
|
||||
batch_size = int(os.getenv("VLLM_NEEDLE_BATCH_SIZE", "64"))
|
||||
assert batch_size >= 2, "Batch size should be >= 2 to mix needle."
|
||||
|
||||
# Keep GPU memory usage low to avoid startup allocation failures.
|
||||
gpu_mem_util = float(os.getenv("VLLM_GPU_MEMORY_UTILIZATION", "0.4"))
|
||||
max_model_len = int(os.getenv("VLLM_MAX_MODEL_LEN", "5120"))
|
||||
gpu_mem_util = float(os.getenv("VLLM_GPU_MEMORY_UTILIZATION", "0.3"))
|
||||
max_model_len = int(os.getenv("VLLM_MAX_MODEL_LEN", "4096"))
|
||||
swap_space_gb = int(os.getenv("VLLM_SWAP_SPACE_GB", "4"))
|
||||
|
||||
# Sampling parameters: longer outputs with a more random-sounding
|
||||
@ -114,7 +111,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle():
|
||||
# Engine with bs=1 behavior
|
||||
llm_bs1 = LLM_with_max_seqs(
|
||||
model=model,
|
||||
max_num_seqs=max_batch_size,
|
||||
max_num_seqs=1,
|
||||
gpu_memory_utilization=gpu_mem_util,
|
||||
max_model_len=max_model_len,
|
||||
swap_space=swap_space_gb,
|
||||
@ -129,7 +126,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle():
|
||||
# Engine with larger batch limit (e.g., 64)
|
||||
llm_bsN = LLM_with_max_seqs(
|
||||
model=model,
|
||||
max_num_seqs=max_batch_size,
|
||||
max_num_seqs=batch_size,
|
||||
gpu_memory_utilization=gpu_mem_util,
|
||||
max_model_len=max_model_len,
|
||||
swap_space=swap_space_gb,
|
||||
@ -138,16 +135,15 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle():
|
||||
mismatches = 0
|
||||
|
||||
for trial in range(num_trials):
|
||||
# Create a batch of size `max_batch_size` and insert the needle at
|
||||
# Create a batch of size `batch_size` and insert the needle at
|
||||
# a random index
|
||||
prompts: list[str] = []
|
||||
batch_size = random.randint(max_batch_size // 2, max_batch_size)
|
||||
needle_pos = random.randint(0, batch_size - 1)
|
||||
for i in range(batch_size):
|
||||
if i == needle_pos:
|
||||
prompts.append(needle_prompt)
|
||||
else:
|
||||
prompts.append(_random_prompt(min_random_prompt, max_random_prompt))
|
||||
prompts.append(_random_prompt())
|
||||
|
||||
# Generate with the larger-batch engine
|
||||
outputs = llm_bsN.generate(prompts, sampling)
|
||||
@ -158,20 +154,19 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle():
|
||||
text = needle_output.outputs[0].text
|
||||
|
||||
if text != baseline_text:
|
||||
print(f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n")
|
||||
mismatches += 1
|
||||
|
||||
passes = num_trials - mismatches
|
||||
# Dump how many passed vs failed
|
||||
print(
|
||||
f"[determinism] total={num_trials}, passed={passes}, "
|
||||
f"failed={mismatches}, max_batch_size={max_batch_size}"
|
||||
f"failed={mismatches}, batch_size={batch_size}"
|
||||
)
|
||||
|
||||
if mismatches > 0:
|
||||
pytest.fail(
|
||||
f"Nondeterministic outputs detected: {mismatches} failed out "
|
||||
f"of {num_trials} trials (max_batch_size={max_batch_size})."
|
||||
f"of {num_trials} trials (batch_size={batch_size})."
|
||||
)
|
||||
|
||||
finally:
|
||||
@ -204,13 +199,8 @@ def _extract_step_logprobs(request_output):
|
||||
not torch.cuda.is_available(),
|
||||
reason="Requires CUDA to match production inference path.",
|
||||
)
|
||||
@pytest.mark.parametrize("backend", ["FLEX_ATTENTION", "FLASHINFER"])
|
||||
def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(backend):
|
||||
backend = os.getenv("VLLM_ATTENTION_BACKEND", backend)
|
||||
os.environ["VLLM_ATTENTION_BACKEND"] = backend
|
||||
|
||||
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
|
||||
random.seed(seed)
|
||||
def test_logprobs_bitwise_batch_invariance_bs1_vs_bs2():
|
||||
# model_name = os.getenv("VLLM_TEST_MODEL", "facebook/opt-125m")
|
||||
model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
|
||||
tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1"))
|
||||
|
||||
@ -218,14 +208,16 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(backend):
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
tensor_parallel_size=tp_size,
|
||||
enforce_eager=True,
|
||||
enable_prefix_caching=False,
|
||||
enforce_eager=True, # helps reduce nondeterminism from some backends
|
||||
)
|
||||
|
||||
prompts = [_random_prompt(10, 1024) for i in range(100)]
|
||||
prompts = [
|
||||
"The capital of France is",
|
||||
"The capital of Germany is",
|
||||
]
|
||||
|
||||
sp = SamplingParams(
|
||||
temperature=0.6,
|
||||
temperature=0.0,
|
||||
top_p=1.0,
|
||||
max_tokens=8,
|
||||
# Seed shouldn't matter at temperature=0, but keeping it stable anyway.
|
||||
@ -246,11 +238,11 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(backend):
|
||||
)
|
||||
bs1_logprobs_per_prompt.append(step_logprobs)
|
||||
|
||||
# BS=N: run prompts in a batch and collect logprobs per step for each
|
||||
# BS=2: run prompts in a batch and collect logprobs per step for each
|
||||
# prompt.
|
||||
outs_batched = llm.generate(prompts, sp, use_tqdm=False)
|
||||
assert len(outs_batched) == len(prompts)
|
||||
bsN_logprobs_per_prompt = []
|
||||
bs2_logprobs_per_prompt = []
|
||||
for o in outs_batched:
|
||||
step_logprobs = _extract_step_logprobs(o)
|
||||
if step_logprobs is None:
|
||||
@ -258,17 +250,17 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(backend):
|
||||
"Logits are not available on RequestOutput; "
|
||||
"enable logprobs return to run this test."
|
||||
)
|
||||
bsN_logprobs_per_prompt.append(step_logprobs)
|
||||
bs2_logprobs_per_prompt.append(step_logprobs)
|
||||
|
||||
# Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs.
|
||||
for i, (logprobs_bs1, logprobs_bsN) in enumerate(
|
||||
zip(bs1_logprobs_per_prompt, bsN_logprobs_per_prompt)
|
||||
# Compare step-by-step logprobs for each prompt between BS=1 and BS=2 runs.
|
||||
for i, (logprobs_bs1, logprobs_bs2) in enumerate(
|
||||
zip(bs1_logprobs_per_prompt, bs2_logprobs_per_prompt)
|
||||
):
|
||||
assert len(logprobs_bs1) == len(logprobs_bsN), (
|
||||
assert len(logprobs_bs1) == len(logprobs_bs2), (
|
||||
f"Different number of generation steps for prompt index {i}: "
|
||||
f"{len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)"
|
||||
f"{len(logprobs_bs1)} (BS=1) vs {len(logprobs_bs2)} (BS=2)"
|
||||
)
|
||||
for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)):
|
||||
for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bs2)):
|
||||
assert a.shape == b.shape, (
|
||||
f"Logits shape mismatch at prompt {i}, step {t}: {a.shape} vs {b.shape}"
|
||||
)
|
||||
@ -305,7 +297,6 @@ def LLM_with_max_seqs(
|
||||
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
|
||||
trust_remote_code=os.getenv("VLLM_TRUST_REMOTE_CODE", "0") == "1",
|
||||
enable_prefix_caching=False,
|
||||
enforce_eager=True,
|
||||
# Enable for MOE models
|
||||
# enable_expert_parallel=True,
|
||||
)
|
||||
|
||||
@ -19,18 +19,11 @@ done
|
||||
|
||||
echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE"
|
||||
|
||||
DECODER_KV_LAYOUT=${DECODER_KV_LAYOUT:-"HND"} # Default to HND, optional NHD
|
||||
if [[ "$DECODER_KV_LAYOUT" == "NHD" ]]; then
|
||||
KV_CONFIG_HETERO_LAYOUT=',"enable_permute_local_kv":"True"'
|
||||
else
|
||||
KV_CONFIG_HETERO_LAYOUT=''
|
||||
fi
|
||||
|
||||
# Build the kv-transfer-config once
|
||||
if [[ "$KV_BUFFER_DEVICE" == "cuda" ]]; then
|
||||
KV_CONFIG='{"kv_connector":"NixlConnector","kv_role":"kv_both"'${KV_CONFIG_HETERO_LAYOUT}'}'
|
||||
KV_CONFIG='{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
|
||||
else
|
||||
KV_CONFIG="{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"$KV_BUFFER_DEVICE\""${KV_CONFIG_HETERO_LAYOUT}"}"
|
||||
KV_CONFIG="{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"$KV_BUFFER_DEVICE\"}"
|
||||
fi
|
||||
|
||||
# Models to run
|
||||
@ -124,7 +117,6 @@ run_tests_for_model() {
|
||||
|
||||
# Build the command with or without model-specific args
|
||||
BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID \
|
||||
VLLM_KV_CACHE_LAYOUT='HND' \
|
||||
UCX_NET_DEVICES=all \
|
||||
VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT \
|
||||
vllm serve $model_name \
|
||||
@ -165,7 +157,6 @@ run_tests_for_model() {
|
||||
|
||||
# Build the command with or without model-specific args
|
||||
BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID \
|
||||
VLLM_KV_CACHE_LAYOUT=$DECODER_KV_LAYOUT \
|
||||
UCX_NET_DEVICES=all \
|
||||
VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT \
|
||||
vllm serve $model_name \
|
||||
|
||||
@ -190,6 +190,7 @@ def _make_fake_nixl_pkg():
|
||||
# Copy of FakeNixlWrapper implementation for Ray workers
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from typing import Optional
|
||||
|
||||
{fake_nixl_source}
|
||||
|
||||
@ -286,12 +287,9 @@ def test_prompt_less_than_block_size():
|
||||
class FakeNixlConnectorWorker(NixlConnectorWorker):
|
||||
REMOTE_ENGINE_ID = "remote_engine"
|
||||
|
||||
def __init__(
|
||||
self, *args, hand_shake_latency: float = 1.8, kv_cache_layout="HND", **kwargs
|
||||
):
|
||||
def __init__(self, *args, hand_shake_latency: float = 1.8, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._hand_shake_latency = hand_shake_latency
|
||||
self.kv_cache_layout = kv_cache_layout
|
||||
|
||||
def _nixl_handshake(
|
||||
self, host: str, port: int, remote_tp_size: int, expected_engine_id: str
|
||||
@ -567,63 +565,10 @@ class TestNixlHandshake:
|
||||
|
||||
# We don't check layout for homogeneous TP and MLA for now, as the
|
||||
# whole block is moved.
|
||||
with pytest.raises(RuntimeError):
|
||||
# mismatched layout is expected to fail
|
||||
worker.add_remote_agent(meta, remote_tp_size=2)
|
||||
worker.add_remote_agent(meta, remote_tp_size=2)
|
||||
with pytest.raises(AssertionError):
|
||||
worker.add_remote_agent(meta, remote_tp_size=1)
|
||||
|
||||
@patch(
    "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
    FakeNixlWrapper,
)
def test_handshake_succeed_on_kv_cache_layout_mismatch_with_experimental(
    self, dist_init
):
    """
    Verify that adding a remote agent does not raise when kv_cache_layout
    differs and the config is created with enable_permute_local_kv=True.
    The local worker uses "NHD", the remote metadata advertises "HND".
    """
    vllm_config = create_vllm_config(enable_permute_local_kv=True)

    # Mock TP world size to 2 to force heterogeneous TP when
    # remote_tp_size=1
    with patch(
        "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.get_tensor_model_parallel_world_size",  # noqa: E501
        return_value=2,
    ):
        # Build the connector and give it a fake worker using the NHD layout.
        connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
        connector.connector_worker = FakeNixlConnectorWorker(
            vllm_config,
            connector.engine_id,
            hand_shake_latency=0,
            kv_cache_layout="NHD",
        )
        worker = connector.connector_worker

        # Minimal local registration params used by add_remote_agent
        worker.slot_size_per_layer = [2048]
        worker.block_len_per_layer = [2048 * worker.block_size]
        worker.num_blocks = 1
        worker.dst_num_blocks[worker.engine_id] = worker.num_blocks

        # prefill TP=1, decode TP=2, remote block_lens is double to local
        remote_block_lens = [
            local_len * 2 for local_len in worker.block_len_per_layer
        ]

        # Remote metadata advertising a different layout than the local worker.
        remote_meta = NixlAgentMetadata(
            engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
            agent_metadata=FakeNixlWrapper.AGENT_METADATA,
            kv_caches_base_addr=[0],
            num_blocks=1,
            block_lens=remote_block_lens,
            attn_backend_name=worker.backend_name,
            kv_cache_layout="HND",
        )

        # Must complete without raising despite the layout mismatch.
        worker.add_remote_agent(remote_meta, remote_tp_size=1)
|
||||
|
||||
|
||||
# NOTE: resource cleanup in mp backend is a bit finicky, so the order in which
|
||||
# we put here is important. First run ray, it will clean up the resources, then
|
||||
@ -839,75 +784,6 @@ def test_multi_kv_connector_stats_aggregation():
|
||||
assert kv_connector_stats["FooConnector"].data["num_foo_transfers"] == 6
|
||||
|
||||
|
||||
@patch(
    "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
    FakeNixlWrapper,
)
def test_scheduler_kv_connector_stats_aggregation():
    """Test that scheduler-side and worker-side KV connector stats are merged.

    The worker reports one recorded transfer, the scheduler reports one dummy
    transfer plus a ``remote_tokens`` entry. After ``update_from_output`` the
    resulting NixlConnector stats must count two transfers and keep the
    scheduler's ``remote_tokens``.
    """
    from vllm.v1.core.sched.output import SchedulerOutput

    scheduler = create_scheduler(create_vllm_config())

    # Worker side: one real telemetry record, no remote tokens.
    worker_stats = NixlKVConnectorStats()
    worker_stats.record_transfer(get_default_xfer_telemetry())
    worker_stats.data["remote_tokens"] = []

    # Scheduler side: custom metric plus a dummy transfer, used only to
    # bypass the is_empty() check so the stats are not skipped.
    scheduler_stats = NixlKVConnectorStats()
    dummy_transfer = {
        "transfer_duration": [0],
        "post_duration": [0],
        "bytes_transferred": [0],
        "num_descriptors": [0],
        "remote_tokens": [128],
    }
    scheduler_stats.data.update(dummy_transfer)

    def _scheduler_side_stats():
        # Stand-in for the scheduler connector's stats method.
        return MultiKVConnectorStats(data={"NixlConnector": scheduler_stats})

    scheduler.connector.get_kv_connector_stats = _scheduler_side_stats

    worker_connector_output = KVConnectorOutput(
        kv_connector_stats=MultiKVConnectorStats(
            data={"NixlConnector": worker_stats}
        )
    )
    model_output = ModelRunnerOutput(
        req_ids=["req_0"],
        req_id_to_index={"req_0": 0},
        sampled_token_ids=[[123]],
        logprobs=None,
        prompt_logprobs_dict={},
        pooler_output=[None],
        kv_connector_output=worker_connector_output,
    )
    scheduler_output = SchedulerOutput(
        scheduled_new_reqs=[],
        scheduled_cached_reqs=None,
        num_scheduled_tokens={"req_0": 1},
        total_num_scheduled_tokens=1,
        scheduled_spec_decode_tokens={},
        scheduled_encoder_inputs={},
        num_common_prefix_blocks=[0],
        finished_req_ids=set(),
        free_encoder_mm_hashes=set(),
        structured_output_request_ids={},
        grammar_bitmask=None,
    )

    engine_core_outputs = scheduler.update_from_output(scheduler_output, model_output)

    first_output = next(iter(engine_core_outputs.values()))
    nixl_stats = first_output.scheduler_stats.kv_connector_stats["NixlConnector"]
    assert nixl_stats.num_successful_transfers == 2
    assert nixl_stats.data["remote_tokens"] == [128]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", None])
|
||||
@patch(
|
||||
"vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
|
||||
@ -1267,145 +1143,3 @@ def test_aborted_request_removed_from_worker_in_batch(dist_init):
|
||||
# After abort, the worker should not keep tracking it as "in-batch"
|
||||
assert req.request_id not in connector.connector_worker._reqs_to_process
|
||||
#### Model Runner end ####
|
||||
|
||||
|
||||
class FailingNixlWrapper(FakeNixlWrapper):
    """Mock NixlWrapper that fails on specific operations.

    Three boolean switches, all off by default, select which operation
    raises; every other call is forwarded to ``FakeNixlWrapper``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Failure switches, toggled by individual tests.
        self.fail_handshake = False
        self.fail_transfer_setup = False
        self.fail_send_notif = False

    def add_remote_agent(self, agent_metadata: bytes) -> str:
        """Raise ``zmq.error.Again`` when ``fail_handshake`` is set."""
        if not self.fail_handshake:
            return super().add_remote_agent(agent_metadata)

        from zmq.error import Again

        raise Again("Simulated timeout failure")

    def make_prepped_xfer(
        self,
        xfer_type: str,
        local_xfer_side_handle: int,
        local_block_descs_ids: list[int],
        remote_xfer_side_handle: int,
        remote_block_descs_ids: list[int],
        notif_msg: bytes | None = None,
    ) -> int:
        """Raise ``RuntimeError`` when ``fail_transfer_setup`` is set."""
        if not self.fail_transfer_setup:
            return super().make_prepped_xfer(
                xfer_type,
                local_xfer_side_handle,
                local_block_descs_ids,
                remote_xfer_side_handle,
                remote_block_descs_ids,
                notif_msg,
            )
        # classic RuntimeError to simulate failure
        raise RuntimeError("BAD STATUS")

    def send_notif(self, agent_name: str, notif_msg: bytes) -> None:
        """Raise ``RuntimeError`` when ``fail_send_notif`` is set."""
        if not self.fail_send_notif:
            return super().send_notif(agent_name, notif_msg)
        raise RuntimeError("Simulated send_notif failure")
|
||||
|
||||
|
||||
@patch(
|
||||
"vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
|
||||
FailingNixlWrapper,
|
||||
)
|
||||
def test_handshake_failure_returns_finished(dist_init):
|
||||
"""Test that handshake failures mark blocks invalid and return via get_finished."""
|
||||
vllm_config = create_vllm_config()
|
||||
|
||||
connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
|
||||
connector.connector_worker = FakeNixlConnectorWorker(
|
||||
vllm_config, connector.engine_id, hand_shake_latency=0.1
|
||||
)
|
||||
connector.connector_worker.nixl_wrapper.fail_handshake = True
|
||||
|
||||
request_id = "test_handshake_fail"
|
||||
metadata = NixlConnectorMetadata()
|
||||
metadata.add_new_req(
|
||||
request_id=request_id,
|
||||
local_block_ids=[1, 2, 3],
|
||||
kv_transfer_params={
|
||||
"remote_block_ids": [4, 5, 6],
|
||||
"remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
|
||||
"remote_host": "localhost",
|
||||
"remote_port": 1234,
|
||||
"remote_tp_size": 1,
|
||||
},
|
||||
)
|
||||
connector.bind_connector_metadata(metadata)
|
||||
|
||||
dummy_ctx = ForwardContext(
|
||||
no_compile_layers={},
|
||||
attn_metadata={},
|
||||
virtual_engine=0,
|
||||
)
|
||||
connector.start_load_kv(dummy_ctx)
|
||||
|
||||
# Wait for handshake to fail
|
||||
time.sleep(0.3)
|
||||
|
||||
# Check that blocks were marked invalid
|
||||
invalid_blocks = connector.get_block_ids_with_load_errors()
|
||||
assert invalid_blocks == {1, 2, 3}
|
||||
|
||||
# Check that request appears in get_finished
|
||||
_, done_recving = connector.get_finished(finished_req_ids=set())
|
||||
assert request_id in done_recving
|
||||
|
||||
|
||||
@patch(
|
||||
"vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
|
||||
FailingNixlWrapper,
|
||||
)
|
||||
def test_transfer_setup_failure_returns_finished(dist_init):
|
||||
"""Test that transfer setup failures mark blocks invalid
|
||||
and return via get_finished."""
|
||||
vllm_config = create_vllm_config()
|
||||
|
||||
connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
|
||||
connector.connector_worker = FakeNixlConnectorWorker(
|
||||
vllm_config, connector.engine_id, hand_shake_latency=0
|
||||
)
|
||||
connector.connector_worker.nixl_wrapper.fail_transfer_setup = True
|
||||
|
||||
request_id = "test_transfer_fail"
|
||||
metadata = NixlConnectorMetadata()
|
||||
metadata.add_new_req(
|
||||
request_id=request_id,
|
||||
local_block_ids=[7, 8, 9],
|
||||
kv_transfer_params={
|
||||
"remote_block_ids": [10, 11, 12],
|
||||
"remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
|
||||
"remote_host": "localhost",
|
||||
"remote_port": 1234,
|
||||
"remote_tp_size": 1,
|
||||
},
|
||||
)
|
||||
connector.bind_connector_metadata(metadata)
|
||||
|
||||
dummy_ctx = ForwardContext(
|
||||
no_compile_layers={},
|
||||
attn_metadata={},
|
||||
virtual_engine=0,
|
||||
)
|
||||
connector.start_load_kv(dummy_ctx)
|
||||
|
||||
# Wait for handshake to complete and process ready_requests
|
||||
connector.bind_connector_metadata(NixlConnectorMetadata())
|
||||
time.sleep(0.1)
|
||||
connector.start_load_kv(dummy_ctx)
|
||||
|
||||
# check that blocks were marked invalid
|
||||
invalid_blocks = connector.get_block_ids_with_load_errors()
|
||||
assert invalid_blocks == {7, 8, 9}
|
||||
|
||||
# ensure request appears in get_finished
|
||||
_, done_recving = connector.get_finished(finished_req_ids=set())
|
||||
assert request_id in done_recving
|
||||
|
||||
@ -83,7 +83,6 @@ def create_vllm_config(
|
||||
block_size: int = 16,
|
||||
max_model_len: int = 10000,
|
||||
enable_chunked_prefill: bool = True,
|
||||
enable_permute_local_kv: bool = False,
|
||||
) -> VllmConfig:
|
||||
"""Initialize VllmConfig For Testing."""
|
||||
scheduler_config = SchedulerConfig(
|
||||
@ -109,7 +108,6 @@ def create_vllm_config(
|
||||
kv_transfer_config = KVTransferConfig(
|
||||
kv_connector="NixlConnector",
|
||||
kv_role="kv_both",
|
||||
enable_permute_local_kv=enable_permute_local_kv,
|
||||
)
|
||||
return VllmConfig(
|
||||
scheduler_config=scheduler_config,
|
||||
|
||||
@ -54,7 +54,7 @@ async def test_async_llm_replace_default_loggers(log_stats_enabled_engine_args):
|
||||
engine = AsyncLLM.from_engine_args(
|
||||
log_stats_enabled_engine_args, stat_loggers=[RayPrometheusStatLogger]
|
||||
)
|
||||
assert isinstance(engine.logger_manager.stat_loggers[0], RayPrometheusStatLogger)
|
||||
assert isinstance(engine.logger_manager.prometheus_logger, RayPrometheusStatLogger)
|
||||
engine.shutdown()
|
||||
|
||||
|
||||
@ -73,11 +73,9 @@ async def test_async_llm_add_to_default_loggers(log_stats_enabled_engine_args):
|
||||
disabled_log_engine_args, stat_loggers=[DummyStatLogger]
|
||||
)
|
||||
|
||||
assert len(engine.logger_manager.stat_loggers) == 2
|
||||
assert len(engine.logger_manager.stat_loggers[0].per_engine_stat_loggers) == 1
|
||||
assert len(engine.logger_manager.per_engine_logger_dict[0]) == 1
|
||||
assert isinstance(
|
||||
engine.logger_manager.stat_loggers[0].per_engine_stat_loggers[0],
|
||||
DummyStatLogger,
|
||||
engine.logger_manager.per_engine_logger_dict[0][0], DummyStatLogger
|
||||
)
|
||||
|
||||
# log_stats is still True, since custom stat loggers are used
|
||||
|
||||
@ -26,7 +26,6 @@ import regex as re
|
||||
FILES = [
|
||||
"vllm/*.py",
|
||||
"vllm/assets",
|
||||
"vllm/distributed",
|
||||
"vllm/entrypoints",
|
||||
"vllm/inputs",
|
||||
"vllm/logging_utils",
|
||||
@ -43,6 +42,7 @@ SEPARATE_GROUPS = [
|
||||
"tests",
|
||||
"vllm/attention",
|
||||
"vllm/compilation",
|
||||
"vllm/distributed",
|
||||
"vllm/engine",
|
||||
"vllm/executor",
|
||||
"vllm/inputs",
|
||||
|
||||
@ -173,7 +173,6 @@ def cp_lse_ag_out_rs(
|
||||
cp_attn_lse: torch.Tensor,
|
||||
cp_group: GroupCoordinator,
|
||||
ctx: CPTritonContext = None,
|
||||
return_lse=False,
|
||||
):
|
||||
"""
|
||||
cp_attn_out: [ B, H, D ]
|
||||
@ -193,15 +192,8 @@ def cp_lse_ag_out_rs(
|
||||
|
||||
cp_attn_lse = cp_attn_lse.contiguous()
|
||||
lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses)
|
||||
out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx)
|
||||
assert out.is_contiguous()
|
||||
out, _ = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx)
|
||||
out = cp_group.reduce_scatter(out, dim=1)
|
||||
|
||||
if return_lse:
|
||||
cp_num_heads = lse.shape[1] // cp_group.world_size
|
||||
cp_rank = cp_group.rank_in_group
|
||||
lse = lse[:, cp_num_heads * cp_rank : cp_num_heads * (cp_rank + 1)]
|
||||
return out, lse
|
||||
return out
|
||||
|
||||
|
||||
|
||||
@ -572,7 +572,6 @@ class RandomDataset(BenchmarkDataset):
|
||||
# Ensure the lower bound for output length is at least 1 to
|
||||
# prevent sampling 0 tokens.
|
||||
output_low = max(output_low, 1)
|
||||
output_high = max(output_high, 1)
|
||||
|
||||
if input_low > input_high:
|
||||
raise ValueError(
|
||||
@ -639,112 +638,6 @@ class RandomDataset(BenchmarkDataset):
|
||||
return prompt, total_input_len, token_mismatch
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Random Dataset Implementation (Synthetic Data)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class RandomDatasetForReranking(RandomDataset):
|
||||
"""
|
||||
Random dataset specialized for the needs of scoring:
|
||||
- Batches of inputs
|
||||
- Inputs composed of pairs
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs) -> None:
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def sample(
|
||||
self,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
num_requests: int,
|
||||
request_id_prefix: str = "",
|
||||
range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO,
|
||||
input_len: int = RandomDataset.DEFAULT_INPUT_LEN,
|
||||
batchsize: int = 1,
|
||||
is_reranker: bool = True,
|
||||
**kwargs,
|
||||
) -> list[SampleRequest]:
|
||||
n_sep_tokens = int(is_reranker)
|
||||
|
||||
query_len_param = (input_len // 2) - n_sep_tokens if is_reranker else input_len
|
||||
|
||||
query_lens, _, query_offsets = self.get_sampling_params(
|
||||
1, range_ratio, query_len_param, 0, tokenizer
|
||||
)
|
||||
|
||||
query_len = int(query_lens[0])
|
||||
|
||||
if not is_reranker:
|
||||
assert num_requests > 1 and batchsize > 1
|
||||
num_requests -= 1
|
||||
batchsize -= 1
|
||||
doc_len_param = input_len
|
||||
else:
|
||||
doc_len_param = input_len - query_len - n_sep_tokens
|
||||
|
||||
doc_lens, _, doc_offsets = self.get_sampling_params(
|
||||
num_requests, range_ratio, doc_len_param, 0, tokenizer
|
||||
)
|
||||
vocab_size = tokenizer.vocab_size
|
||||
|
||||
query_prompt, query_input_len, token_mismatch_total = (
|
||||
self.generate_token_sequence(
|
||||
tokenizer=tokenizer,
|
||||
prefix_token_ids=[],
|
||||
prefix_len=0,
|
||||
vocab_size=vocab_size,
|
||||
input_len=query_len,
|
||||
offset=int(query_offsets[0]),
|
||||
index=0,
|
||||
)
|
||||
)
|
||||
|
||||
requests = []
|
||||
for i in range(num_requests):
|
||||
prompt, total_input_len, token_mismatch = self.generate_token_sequence( # noqa: E501
|
||||
tokenizer=tokenizer,
|
||||
prefix_token_ids=[],
|
||||
prefix_len=0,
|
||||
vocab_size=vocab_size,
|
||||
input_len=int(doc_lens[i]),
|
||||
offset=int(doc_offsets[i]),
|
||||
index=i + 1,
|
||||
)
|
||||
token_mismatch_total += token_mismatch
|
||||
requests.append((prompt, total_input_len))
|
||||
|
||||
batch_requests = []
|
||||
# Create batched requests
|
||||
for i in range(0, num_requests, batchsize):
|
||||
batch = requests[i : i + batchsize]
|
||||
query_contrib = (
|
||||
(query_input_len + n_sep_tokens) * len(batch)
|
||||
if is_reranker
|
||||
else query_input_len
|
||||
)
|
||||
batch_requests.append(
|
||||
SampleRequest(
|
||||
prompt=[query_prompt] + [req[0] for req in batch],
|
||||
prompt_len=query_contrib + sum(req[1] for req in batch),
|
||||
expected_output_len=0,
|
||||
request_id=request_id_prefix + str(i // batchsize),
|
||||
)
|
||||
)
|
||||
|
||||
if token_mismatch_total != 0:
|
||||
logger.warning(
|
||||
"Across all generated prompts, there were %d %s tokens "
|
||||
"than expected after decoding and re-encoding. This is "
|
||||
"expected due to the imperfect nature of the sampling "
|
||||
"procedure.",
|
||||
abs(token_mismatch_total),
|
||||
"more" if token_mismatch_total > 0 else "fewer",
|
||||
)
|
||||
|
||||
return batch_requests
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# MultiModalDataset Implementation
|
||||
# -----------------------------------------------------------------------------
|
||||
@ -1256,7 +1149,6 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
|
||||
"sonnet",
|
||||
"random",
|
||||
"random-mm",
|
||||
"random-rerank",
|
||||
"hf",
|
||||
"custom",
|
||||
"prefix_repetition",
|
||||
@ -1400,14 +1292,6 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
|
||||
default=1,
|
||||
help=("Batch size for random sampling. Only used for embeddings benchmark."),
|
||||
)
|
||||
random_group.add_argument(
|
||||
"--no-reranker",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether the model supports reranking natively."
|
||||
" Only used for reranker benchmark."
|
||||
),
|
||||
)
|
||||
|
||||
# random multimodal dataset options
|
||||
random_mm_group = parser.add_argument_group(
|
||||
@ -1794,19 +1678,6 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
|
||||
request_id_prefix=args.request_id_prefix,
|
||||
no_oversample=args.no_oversample,
|
||||
),
|
||||
"random-rerank": lambda: RandomDatasetForReranking(
|
||||
random_seed=args.seed,
|
||||
dataset_path=args.dataset_path,
|
||||
disable_shuffle=args.disable_shuffle,
|
||||
).sample(
|
||||
tokenizer=tokenizer,
|
||||
num_requests=args.num_prompts,
|
||||
input_len=args.random_input_len,
|
||||
range_ratio=args.random_range_ratio,
|
||||
request_id_prefix=args.request_id_prefix,
|
||||
batchsize=args.random_batch_size,
|
||||
is_reranker=not args.no_reranker,
|
||||
),
|
||||
"prefix_repetition": lambda: PrefixRepetitionRandomDataset(
|
||||
random_seed=args.seed,
|
||||
dataset_path=args.dataset_path,
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user