Compare commits
2 Commits
woosuk/rm-
...
zhuohan/re
| Author | SHA1 | Date | |
|---|---|---|---|
| a2599dca0f | |||
| 3fd66b1e73 |
@ -1,12 +1,11 @@
|
||||
# For hf script, without -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8
|
||||
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
|
||||
backend: "vllm-vlm"
|
||||
tasks:
|
||||
- name: "chartqa"
|
||||
metrics:
|
||||
- name: "relaxed_accuracy,none"
|
||||
# TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
|
||||
value: 0.80
|
||||
value: 0.90
|
||||
limit: 100
|
||||
num_fewshot: 0
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# For hf script, without -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5
|
||||
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
|
||||
backend: "vllm-vlm"
|
||||
tasks:
|
||||
- name: "mmlu_pro"
|
||||
metrics:
|
||||
|
||||
@ -70,7 +70,7 @@ function cpu_tests() {
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -x -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
|
||||
|
||||
# Note: disable it until supports V1
|
||||
# Run AWQ test
|
||||
|
||||
@ -416,8 +416,8 @@ steps:
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
- pytest -v -s compile/piecewise/
|
||||
|
||||
- label: PyTorch Fullgraph Test # 22min
|
||||
timeout_in_minutes: 35
|
||||
- label: PyTorch Fullgraph Test # 20min
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
@ -425,7 +425,6 @@ steps:
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_full_graph.py
|
||||
- pytest -v -s compile/test_fusions_e2e.py
|
||||
|
||||
- label: Kernels Core Operation Test # 48min
|
||||
timeout_in_minutes: 75
|
||||
@ -808,8 +807,8 @@ steps:
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
|
||||
- label: Blackwell Test # 21 min
|
||||
timeout_in_minutes: 30
|
||||
- label: Blackwell Test # 38 min
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
# optional: true
|
||||
@ -822,6 +821,8 @@ steps:
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/fusion.py
|
||||
- vllm/compilation/fusion_attn.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
@ -838,32 +839,15 @@ steps:
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
||||
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
|
||||
- label: Blackwell Fusion Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
# Fusion
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s tests/compile/test_fusions_e2e.py
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
||||
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
||||
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
@ -1020,11 +1004,6 @@ steps:
|
||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||
- pip uninstall prithvi_io_processor_plugin -y
|
||||
# end io_processor plugins test
|
||||
# begin stat_logger plugins test
|
||||
- pip install -e ./plugins/vllm_add_dummy_stat_logger
|
||||
- pytest -v -s plugins_tests/test_stats_logger_plugins.py
|
||||
- pip uninstall dummy_stat_logger -y
|
||||
# end stat_logger plugins test
|
||||
# other tests continue here:
|
||||
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
@ -1089,17 +1068,6 @@ steps:
|
||||
- tests/weight_loading
|
||||
commands:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||
|
||||
- label: NixlConnector PD accuracy tests (Distributed) # 30min
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
|
||||
|
||||
|
||||
##### multi gpus test #####
|
||||
@ -1132,7 +1100,7 @@ steps:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
##### H200 test #####
|
||||
- label: Distributed Tests (H200) # optional
|
||||
- label: Distrubted Tests (H200) # optional
|
||||
gpu: h200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
@ -1140,8 +1108,6 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s tests/compile/test_async_tp.py
|
||||
- pytest -v -s tests/compile/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
|
||||
|
||||
@ -4,6 +4,7 @@ MD013: false
|
||||
MD024:
|
||||
siblings_only: true
|
||||
MD033: false
|
||||
MD042: false
|
||||
MD045: false
|
||||
MD046: false
|
||||
MD051: false
|
||||
|
||||
@ -10,8 +10,7 @@ import torch
|
||||
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||
|
||||
|
||||
def with_triton_mode(fn):
|
||||
|
||||
@ -10,8 +10,7 @@ import vllm.model_executor.layers.activation # noqa F401
|
||||
from vllm.model_executor.custom_op import CustomOp
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||
|
||||
batch_size_range = [1, 16, 32, 64, 128]
|
||||
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
|
||||
|
||||
@ -7,8 +7,7 @@ import torch
|
||||
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
|
||||
@ -9,9 +9,9 @@ import torch
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import (
|
||||
from vllm.utils import (
|
||||
STR_DTYPE_TO_TORCH_DTYPE,
|
||||
FlexibleArgumentParser,
|
||||
create_kv_caches_with_random,
|
||||
)
|
||||
|
||||
|
||||
155
benchmarks/kernels/benchmark_polynorm.py
Normal file
155
benchmarks/kernels/benchmark_polynorm.py
Normal file
@ -0,0 +1,155 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import itertools
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as vllm_ops
|
||||
from vllm.triton_utils import triton
|
||||
|
||||
|
||||
def polynorm_naive(
|
||||
x: torch.Tensor,
|
||||
weight: torch.Tensor,
|
||||
bias: torch.Tensor,
|
||||
eps: float = 1e-6,
|
||||
):
|
||||
orig_shape = x.shape
|
||||
x = x.view(-1, x.shape[-1])
|
||||
|
||||
def norm(x, eps: float):
|
||||
return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
|
||||
|
||||
x = x.float()
|
||||
return (
|
||||
(
|
||||
weight[0] * norm(x**3, eps)
|
||||
+ weight[1] * norm(x**2, eps)
|
||||
+ weight[2] * norm(x, eps)
|
||||
+ bias
|
||||
)
|
||||
.to(weight.dtype)
|
||||
.view(orig_shape)
|
||||
)
|
||||
|
||||
|
||||
def polynorm_vllm(
|
||||
x: torch.Tensor,
|
||||
weight: torch.Tensor,
|
||||
bias: torch.Tensor,
|
||||
eps: float = 1e-6,
|
||||
):
|
||||
orig_shape = x.shape
|
||||
x = x.view(-1, x.shape[-1])
|
||||
|
||||
out = torch.empty_like(x)
|
||||
vllm_ops.poly_norm(out, x, weight, bias, eps)
|
||||
output = out
|
||||
|
||||
output = output.view(orig_shape)
|
||||
return output
|
||||
|
||||
|
||||
def calculate_diff(batch_size, seq_len, hidden_dim):
|
||||
dtype = torch.bfloat16
|
||||
x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
|
||||
weight = torch.ones(3, dtype=dtype, device="cuda")
|
||||
bias = torch.ones(1, dtype=dtype, device="cuda")
|
||||
|
||||
output_naive = polynorm_naive(x, weight, bias)
|
||||
output_vllm = polynorm_vllm(x, weight, bias)
|
||||
|
||||
if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
|
||||
print("✅ All implementations match")
|
||||
else:
|
||||
print("❌ Implementations differ")
|
||||
|
||||
|
||||
batch_size_range = [2**i for i in range(0, 7, 2)]
|
||||
seq_length_range = [2**i for i in range(6, 11, 1)]
|
||||
dim_range = [2048, 4096]
|
||||
configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
|
||||
|
||||
|
||||
def get_benchmark():
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["dim", "batch_size", "seq_len"],
|
||||
x_vals=[list(_) for _ in configs],
|
||||
line_arg="provider",
|
||||
line_vals=["naive", "vllm"],
|
||||
line_names=["Naive", "vLLM"],
|
||||
styles=[("blue", "-"), ("red", "-")],
|
||||
ylabel="us",
|
||||
plot_name="polynorm-perf",
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(dim, batch_size, seq_len, provider):
|
||||
dtype = torch.bfloat16
|
||||
hidden_dim = dim * 4
|
||||
|
||||
x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
|
||||
weight = torch.ones(3, dtype=dtype, device="cuda")
|
||||
bias = torch.ones(1, dtype=dtype, device="cuda")
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "naive":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: polynorm_naive(x, weight, bias),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
else:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: polynorm_vllm(x, weight, bias),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
|
||||
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
|
||||
|
||||
return benchmark
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--batch-size",
|
||||
type=int,
|
||||
default=4,
|
||||
help="Batch size",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--seq-len",
|
||||
type=int,
|
||||
default=128,
|
||||
help="Sequence length",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hidden-dim",
|
||||
type=int,
|
||||
default=8192,
|
||||
help="Intermediate size of MLP",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--save-path",
|
||||
type=str,
|
||||
default="./configs/polnorm/",
|
||||
help="Path to save polnorm benchmark results",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Run correctness test
|
||||
calculate_diff(
|
||||
batch_size=args.batch_size,
|
||||
seq_len=args.seq_len,
|
||||
hidden_dim=args.hidden_dim,
|
||||
)
|
||||
|
||||
benchmark = get_benchmark()
|
||||
# Run performance benchmark
|
||||
benchmark.run(print_data=True, save_path=args.save_path)
|
||||
@ -7,8 +7,7 @@ import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
|
||||
@ -9,9 +9,9 @@ from tabulate import tabulate
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import (
|
||||
from vllm.utils import (
|
||||
STR_DTYPE_TO_TORCH_DTYPE,
|
||||
FlexibleArgumentParser,
|
||||
create_kv_caches_with_random,
|
||||
)
|
||||
|
||||
|
||||
@ -12,9 +12,9 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import (
|
||||
from vllm.utils import (
|
||||
STR_DTYPE_TO_TORCH_DTYPE,
|
||||
FlexibleArgumentParser,
|
||||
create_kv_caches_with_random_flash,
|
||||
)
|
||||
|
||||
|
||||
@ -1251,7 +1251,7 @@ async def main() -> None:
|
||||
default=None,
|
||||
help="The model name used in the API. "
|
||||
"If not specified, the model name will be the "
|
||||
"same as the `--model` argument. ",
|
||||
"same as the ``--model`` argument. ",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
|
||||
@ -38,7 +38,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn
|
||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||
GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
|
||||
GIT_TAG 8f468e7da54a8e2f98abfa7c38636aac91c0cba1
|
||||
GIT_PROGRESS TRUE
|
||||
# Don't share the vllm-flash-attn build between build types
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
|
||||
@ -148,6 +148,211 @@ fused_add_rms_norm_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
/* Function specialization in the case of FP16/BF16 tensors.
|
||||
Additional optimizations we can make in this case are
|
||||
packed and vectorized operations, which help with the
|
||||
memory latency bottleneck.
|
||||
|
||||
_f16VecPN struct extends _f16Vec to add operations specifically required for
|
||||
polynomial normalization (poly norm).
|
||||
The original _f16Vec does not include the sum-of-powers computation or
|
||||
in-place polynomial normalization logic. */
|
||||
template <typename scalar_t, int width>
|
||||
struct alignas(16) _f16VecPN : _f16Vec<scalar_t, width> {
|
||||
using Base = _f16Vec<scalar_t, width>;
|
||||
using Converter = typename Base::Converter;
|
||||
using T1 = typename Base::T1;
|
||||
using T2 = typename Base::T2;
|
||||
using Base::data;
|
||||
|
||||
__device__ auto sum_pows() const {
|
||||
float s2 = 0.0f, s4 = 0.0f, s6 = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < width; i += 2) {
|
||||
float2 z = Converter::convert(T2{data[i], data[i + 1]});
|
||||
float x2 = z.x * z.x;
|
||||
float x4 = x2 * x2;
|
||||
float x6 = x4 * x2;
|
||||
|
||||
float y2 = z.y * z.y;
|
||||
float y4 = y2 * y2;
|
||||
float y6 = y4 * y2;
|
||||
|
||||
s2 += x2 + y2;
|
||||
s4 += x4 + y4;
|
||||
s6 += x6 + y6;
|
||||
}
|
||||
return std::make_tuple(s2, s4, s6);
|
||||
}
|
||||
|
||||
__device__ void poly_norm_inplace(const float w2_inv_std,
|
||||
const float w1_inv_std2,
|
||||
const float w0_inv_std3, const float bias) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < width; i += 2) {
|
||||
float2 z = Converter::convert(T2{data[i], data[i + 1]});
|
||||
|
||||
float x2 = z.x * z.x;
|
||||
float x3 = x2 * z.x;
|
||||
z.x = w2_inv_std * z.x + w1_inv_std2 * x2 + w0_inv_std3 * x3 + bias;
|
||||
|
||||
float y2 = z.y * z.y;
|
||||
float y3 = y2 * z.y;
|
||||
z.y = w2_inv_std * z.y + w1_inv_std2 * y2 + w0_inv_std3 * y3 + bias;
|
||||
|
||||
auto out = Converter::convert(z);
|
||||
data[i] = out.x;
|
||||
data[i + 1] = out.y;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename scalar_t, int width>
|
||||
__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
|
||||
poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size]
|
||||
const scalar_t* __restrict__ input, // [..., hidden_size]
|
||||
const scalar_t* __restrict__ weight, // [3]
|
||||
const scalar_t* __restrict__ bias, // [1]
|
||||
const float epsilon, const int hidden_size) {
|
||||
// Sanity checks on our vector struct and type-punned pointer arithmetic
|
||||
static_assert(std::is_pod_v<_f16VecPN<scalar_t, width>>);
|
||||
static_assert(sizeof(_f16VecPN<scalar_t, width>) == sizeof(scalar_t) * width);
|
||||
|
||||
/* These and the argument pointers are all declared `restrict` as they are
|
||||
not aliased in practice. Argument pointers should not be dereferenced
|
||||
in this kernel as that would be undefined behavior */
|
||||
auto* __restrict__ input_v =
|
||||
reinterpret_cast<const _f16VecPN<scalar_t, width>*>(input);
|
||||
const int vec_hidden_size = hidden_size / width;
|
||||
float variance = 0.0f;
|
||||
float variance2 = 0.0f;
|
||||
float variance3 = 0.0f;
|
||||
|
||||
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
|
||||
int id = blockIdx.x * vec_hidden_size + idx;
|
||||
_f16VecPN<scalar_t, width> temp = input_v[id];
|
||||
auto [x2, x4, x6] = temp.sum_pows();
|
||||
|
||||
variance += x2;
|
||||
variance2 += x4;
|
||||
variance3 += x6;
|
||||
}
|
||||
|
||||
float3 thread_variances = make_float3(variance, variance2, variance3);
|
||||
|
||||
struct SumOp {
|
||||
__device__ float3 operator()(const float3& a, const float3& b) const {
|
||||
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
|
||||
}
|
||||
};
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float3, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
float3 block_variances =
|
||||
BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
|
||||
|
||||
variance = block_variances.x;
|
||||
variance2 = block_variances.y;
|
||||
variance3 = block_variances.z;
|
||||
|
||||
__shared__ float s_w2_inv_std;
|
||||
__shared__ float s_w1_inv_std2;
|
||||
__shared__ float s_w0_inv_std3;
|
||||
__shared__ float s_bias;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
float w0 = (float)weight[0];
|
||||
float w1 = (float)weight[1];
|
||||
float w2 = (float)weight[2];
|
||||
s_bias = (float)bias[0];
|
||||
|
||||
s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
|
||||
s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
|
||||
s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
auto* __restrict__ out_v = reinterpret_cast<_f16VecPN<scalar_t, width>*>(out);
|
||||
|
||||
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
|
||||
int id = blockIdx.x * vec_hidden_size + idx;
|
||||
_f16VecPN<scalar_t, width> temp = input_v[id];
|
||||
temp.poly_norm_inplace(s_w2_inv_std, s_w1_inv_std2, s_w0_inv_std3, s_bias);
|
||||
out_v[id] = temp;
|
||||
}
|
||||
}
|
||||
|
||||
/* Generic poly_norm_kernel
|
||||
The width field is not used here but necessary for other specializations.
|
||||
*/
|
||||
template <typename scalar_t, int width>
|
||||
__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
|
||||
poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size]
|
||||
const scalar_t* __restrict__ input, // [..., hidden_size]
|
||||
const scalar_t* __restrict__ weight, // [3]
|
||||
const scalar_t* __restrict__ bias, // [1]
|
||||
const float epsilon, const int hidden_size) {
|
||||
float variance = 0.0f;
|
||||
float variance2 = 0.0f;
|
||||
float variance3 = 0.0f;
|
||||
|
||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||
float x = (float)input[blockIdx.x * hidden_size + idx];
|
||||
float x2 = x * x;
|
||||
float x4 = x2 * x2;
|
||||
float x6 = x4 * x2;
|
||||
|
||||
variance += x2;
|
||||
variance2 += x4;
|
||||
variance3 += x6;
|
||||
}
|
||||
|
||||
float3 thread_variances = make_float3(variance, variance2, variance3);
|
||||
|
||||
struct SumOp {
|
||||
__device__ float3 operator()(const float3& a, const float3& b) const {
|
||||
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
|
||||
}
|
||||
};
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float3, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
float3 block_variances =
|
||||
BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
|
||||
|
||||
variance = block_variances.x;
|
||||
variance2 = block_variances.y;
|
||||
variance3 = block_variances.z;
|
||||
|
||||
__shared__ float s_w2_inv_std;
|
||||
__shared__ float s_w1_inv_std2;
|
||||
__shared__ float s_w0_inv_std3;
|
||||
__shared__ float s_bias;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
float w0 = (float)weight[0];
|
||||
float w1 = (float)weight[1];
|
||||
float w2 = (float)weight[2];
|
||||
s_bias = (float)bias[0];
|
||||
|
||||
s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
|
||||
s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
|
||||
s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||
float x = (float)input[blockIdx.x * hidden_size + idx];
|
||||
float x2 = x * x;
|
||||
float x3 = x2 * x;
|
||||
|
||||
out[blockIdx.x * hidden_size + idx] =
|
||||
(scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 +
|
||||
s_bias);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
void rms_norm(torch::Tensor& out, // [..., hidden_size]
|
||||
@ -159,26 +364,18 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size]
|
||||
TORCH_CHECK(weight.is_contiguous());
|
||||
|
||||
int hidden_size = input.size(-1);
|
||||
|
||||
// We cannot just use `input.stride(-2)` if the tensor is not row-major.
|
||||
// Instead, we use a 2d view to get the second-innermost stride.
|
||||
// That way the dimensions (except the last one) can be arbitrarily permuted.
|
||||
torch::Tensor input_view = input.view({-1, hidden_size});
|
||||
|
||||
int num_tokens = input_view.numel() / hidden_size;
|
||||
int64_t input_stride = input_view.stride(-2);
|
||||
int num_tokens = input.numel() / hidden_size;
|
||||
int64_t input_stride = input.stride(-2);
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(std::min(hidden_size, 1024));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
input_view.scalar_type(), "rms_norm_kernel", [&] {
|
||||
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
||||
out.data_ptr<scalar_t>(), input_view.data_ptr<scalar_t>(),
|
||||
input_stride, weight.data_ptr<scalar_t>(), epsilon, num_tokens,
|
||||
hidden_size);
|
||||
});
|
||||
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
|
||||
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), input_stride,
|
||||
weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
|
||||
});
|
||||
}
|
||||
|
||||
#define LAUNCH_FUSED_ADD_RMS_NORM(width) \
|
||||
@ -195,8 +392,6 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
||||
torch::Tensor& residual, // [..., hidden_size]
|
||||
torch::Tensor& weight, // [hidden_size]
|
||||
double epsilon) {
|
||||
TORCH_CHECK(weight.scalar_type() == input.scalar_type());
|
||||
TORCH_CHECK(input.scalar_type() == residual.scalar_type());
|
||||
TORCH_CHECK(residual.is_contiguous());
|
||||
TORCH_CHECK(weight.is_contiguous());
|
||||
int hidden_size = input.size(-1);
|
||||
@ -239,3 +434,50 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
||||
LAUNCH_FUSED_ADD_RMS_NORM(0);
|
||||
}
|
||||
}
|
||||
|
||||
#define LAUNCH_FUSED_POLY_NORM(width) \
|
||||
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \
|
||||
vllm::poly_norm_kernel<scalar_t, width><<<grid, block, 0, stream>>>( \
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), \
|
||||
weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), epsilon, \
|
||||
hidden_size); \
|
||||
});
|
||||
|
||||
void poly_norm(torch::Tensor& out, // [..., hidden_size]
|
||||
torch::Tensor& input, // [..., hidden_size]
|
||||
torch::Tensor& weight, // [3]
|
||||
torch::Tensor& bias, // [1]
|
||||
double epsilon) {
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
TORCH_CHECK(input.is_contiguous());
|
||||
TORCH_CHECK(out.data_ptr() != input.data_ptr());
|
||||
|
||||
int hidden_size = input.size(-1);
|
||||
int num_tokens = input.numel() / hidden_size;
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
/* This kernel is memory-latency bound in many scenarios.
|
||||
When num_tokens is large, a smaller block size allows
|
||||
for increased block occupancy on CUs and better latency
|
||||
hiding on global mem ops. */
|
||||
const int max_block_size = (num_tokens < 256) ? 1024 : 256;
|
||||
dim3 block(std::min(hidden_size, max_block_size));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
/*If the tensor types are FP16/BF16, try to use the optimized kernel
|
||||
with packed + vectorized ops.
|
||||
Max optimization is achieved with a width-8 vector of FP16/BF16s
|
||||
since we can load at most 128 bits at once in a global memory op.
|
||||
However, this requires each tensor's data to be aligned to 16
|
||||
bytes.
|
||||
*/
|
||||
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
|
||||
auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
|
||||
bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
|
||||
bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
|
||||
if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
|
||||
LAUNCH_FUSED_POLY_NORM(8);
|
||||
} else {
|
||||
LAUNCH_FUSED_POLY_NORM(0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -229,8 +229,6 @@ void fused_add_rms_norm_static_fp8_quant(
|
||||
double epsilon) {
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
TORCH_CHECK(residual.is_contiguous());
|
||||
TORCH_CHECK(residual.scalar_type() == input.scalar_type());
|
||||
TORCH_CHECK(weight.scalar_type() == input.scalar_type());
|
||||
int hidden_size = input.size(-1);
|
||||
int input_stride = input.stride(-2);
|
||||
int num_tokens = input.numel() / hidden_size;
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
|
||||
void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
|
||||
torch::Tensor& token_expert_indices,
|
||||
torch::Tensor& gating_output, bool renormalize);
|
||||
torch::Tensor& gating_output);
|
||||
|
||||
void moe_sum(torch::Tensor& input, torch::Tensor& output);
|
||||
|
||||
|
||||
@ -16,23 +16,12 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <type_traits>
|
||||
#include <torch/all.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include "../cuda_compat.h"
|
||||
#include "../cub_helpers.h"
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
#else
|
||||
#include <hip/hip_bf16.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
typedef __hip_bfloat16 __nv_bfloat16;
|
||||
typedef __hip_bfloat162 __nv_bfloat162;
|
||||
#endif
|
||||
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
|
||||
@ -47,27 +36,16 @@ template <
|
||||
/// Alignment requirement in bytes
|
||||
int Alignment = sizeof(T) * N
|
||||
>
|
||||
struct alignas(Alignment) AlignedArray {
|
||||
T data[N];
|
||||
class alignas(Alignment) AlignedArray {
|
||||
float data[N];
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ float toFloat(T value) {
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
return value;
|
||||
} else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
|
||||
return __bfloat162float(value);
|
||||
} else if constexpr (std::is_same_v<T, __half>) {
|
||||
return __half2float(value);
|
||||
}
|
||||
}
|
||||
|
||||
// ====================== Softmax things ===============================
|
||||
// We have our own implementation of softmax here so we can support transposing the output
|
||||
// in the softmax kernel when we extend this module to support expert-choice routing.
|
||||
template <int TPB, typename InputType>
|
||||
template <int TPB>
|
||||
__launch_bounds__(TPB) __global__
|
||||
void moeSoftmax(const InputType* input, const bool* finished, float* output, const int num_cols)
|
||||
void moeSoftmax(const float* input, const bool* finished, float* output, const int num_cols)
|
||||
{
|
||||
using BlockReduce = cub::BlockReduce<float, TPB>;
|
||||
__shared__ typename BlockReduce::TempStorage tmpStorage;
|
||||
@ -88,8 +66,7 @@ __launch_bounds__(TPB) __global__
|
||||
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
|
||||
{
|
||||
const int idx = thread_row_offset + ii;
|
||||
const float val = toFloat(input[idx]);
|
||||
threadData = max(val, threadData);
|
||||
threadData = max(static_cast<float>(input[idx]), threadData);
|
||||
}
|
||||
|
||||
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp());
|
||||
@ -104,8 +81,7 @@ __launch_bounds__(TPB) __global__
|
||||
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
|
||||
{
|
||||
const int idx = thread_row_offset + ii;
|
||||
const float val = toFloat(input[idx]);
|
||||
threadData += expf(val - float_max);
|
||||
threadData += exp((static_cast<float>(input[idx]) - float_max));
|
||||
}
|
||||
|
||||
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp());
|
||||
@ -119,9 +95,8 @@ __launch_bounds__(TPB) __global__
|
||||
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
|
||||
{
|
||||
const int idx = thread_row_offset + ii;
|
||||
const float val = toFloat(input[idx]);
|
||||
const float softmax_val = expf(val - float_max) * normalizing_factor;
|
||||
output[idx] = softmax_val;
|
||||
const float val = exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
|
||||
output[idx] = val;
|
||||
}
|
||||
}
|
||||
|
||||
@ -135,8 +110,7 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
const int num_experts,
|
||||
const int k,
|
||||
const int start_expert,
|
||||
const int end_expert,
|
||||
const bool renormalize)
|
||||
const int end_expert)
|
||||
{
|
||||
|
||||
using cub_kvp = cub::KeyValuePair<int, float>;
|
||||
@ -151,7 +125,6 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
|
||||
const bool row_is_active = finished ? !finished[block_row] : true;
|
||||
const int thread_read_offset = blockIdx.x * num_experts;
|
||||
float selected_sum = 0.f;
|
||||
for (int k_idx = 0; k_idx < k; ++k_idx)
|
||||
{
|
||||
thread_kvp.key = 0;
|
||||
@ -190,23 +163,9 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
|
||||
assert(indices[idx] >= 0);
|
||||
source_rows[idx] = k_idx * num_rows + block_row;
|
||||
if (renormalize) {
|
||||
selected_sum += result_kvp.value;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// Renormalize the k weights for this row to sum to 1, if requested.
|
||||
if (renormalize) {
|
||||
if (threadIdx.x == 0) {
|
||||
const float denom = selected_sum > 0.f ? selected_sum : 1.f;
|
||||
for (int k_idx = 0; k_idx < k; ++k_idx) {
|
||||
const int idx = k * block_row + k_idx;
|
||||
output[idx] = output[idx] / denom;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ====================== TopK softmax things ===============================
|
||||
@ -225,30 +184,21 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
2) This implementation assumes k is small, but will work for any k.
|
||||
*/
|
||||
|
||||
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, typename InputType = float>
|
||||
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType>
|
||||
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
void topkGatingSoftmax(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
|
||||
int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize)
|
||||
void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
|
||||
int* source_rows, const int k, const int start_expert, const int end_expert)
|
||||
{
|
||||
static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> ||
|
||||
std::is_same_v<InputType, __half>,
|
||||
"InputType must be float, __nv_bfloat16, or __half");
|
||||
|
||||
// We begin by enforcing compile time assertions and setting up compile time constants.
|
||||
static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
|
||||
static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");
|
||||
|
||||
// Number of bytes each thread pulls in per load
|
||||
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
|
||||
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
|
||||
static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
|
||||
static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
|
||||
static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;
|
||||
|
||||
if constexpr (std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) {
|
||||
static_assert(ELTS_PER_LDG == 1 || ELTS_PER_LDG % 2 == 0,
|
||||
"ELTS_PER_LDG must be 1 or even for 16-bit conversion");
|
||||
}
|
||||
|
||||
// Restrictions based on previous section.
|
||||
static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
|
||||
static_assert(WARP_SIZE_PARAM % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
|
||||
@ -286,71 +236,27 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
|
||||
// We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the
|
||||
// row it will read.
|
||||
const InputType* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
|
||||
const float* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
|
||||
|
||||
// Now, we compute the group each thread belong to in order to determine the first column to start loads.
|
||||
const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
|
||||
const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
|
||||
const InputType* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
|
||||
const float* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
|
||||
|
||||
// Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory,
|
||||
// this can support all powers of 2 up to 16.
|
||||
// NOTE(woosuk): The original implementation uses CUTLASS aligned array here.
|
||||
// We defined our own aligned array and use it here to avoid the dependency on CUTLASS.
|
||||
using AccessType = AlignedArray<float, ELTS_PER_LDG>;
|
||||
|
||||
// Finally, we pull in the data from global mem
|
||||
float row_chunk[VPT];
|
||||
|
||||
// NOTE(zhuhaoran): dispatch different input types loading, BF16/FP16 convert to float
|
||||
if constexpr (std::is_same_v<InputType, float>) {
|
||||
using VecType = AlignedArray<float, ELTS_PER_LDG>;
|
||||
VecType* row_chunk_vec_ptr = reinterpret_cast<VecType*>(&row_chunk);
|
||||
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
|
||||
AccessType* row_chunk_vec_ptr = reinterpret_cast<AccessType*>(&row_chunk);
|
||||
const AccessType* vec_thread_read_ptr = reinterpret_cast<const AccessType*>(thread_read_ptr);
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
|
||||
row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
|
||||
}
|
||||
} else if constexpr (std::is_same_v<InputType, __nv_bfloat16>) {
|
||||
if constexpr (ELTS_PER_LDG >= 2) {
|
||||
using VecType = AlignedArray<__nv_bfloat16, ELTS_PER_LDG>;
|
||||
float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
|
||||
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
|
||||
VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
|
||||
int base_idx_f2 = ii * ELTS_PER_LDG / 2;
|
||||
#pragma unroll
|
||||
for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
|
||||
row_chunk_f2[base_idx_f2 + jj] = __bfloat1622float2(
|
||||
*reinterpret_cast<const __nv_bfloat162*>(vec.data + jj * 2)
|
||||
);
|
||||
}
|
||||
}
|
||||
} else { // ELTS_PER_LDG == 1
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
|
||||
const __nv_bfloat16* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
|
||||
row_chunk[ii] = __bfloat162float(*scalar_ptr);
|
||||
}
|
||||
}
|
||||
} else if constexpr (std::is_same_v<InputType, __half>) {
|
||||
if constexpr (ELTS_PER_LDG >= 2) {
|
||||
using VecType = AlignedArray<__half, ELTS_PER_LDG>;
|
||||
float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
|
||||
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
|
||||
VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
|
||||
int base_idx_f2 = ii * ELTS_PER_LDG / 2;
|
||||
#pragma unroll
|
||||
for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
|
||||
row_chunk_f2[base_idx_f2 + jj] = __half22float2(
|
||||
*reinterpret_cast<const __half2*>(vec.data + jj * 2)
|
||||
);
|
||||
}
|
||||
}
|
||||
} else { // ELTS_PER_LDG == 1
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
|
||||
const __half* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
|
||||
row_chunk[ii] = __half2float(*scalar_ptr);
|
||||
}
|
||||
}
|
||||
for (int ii = 0; ii < LDG_PER_THREAD; ++ii)
|
||||
{
|
||||
row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
|
||||
}
|
||||
|
||||
// First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
|
||||
@ -404,7 +310,6 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
int start_col = first_elt_read_by_thread;
|
||||
static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
|
||||
|
||||
float selected_sum = 0.f;
|
||||
for (int k_idx = 0; k_idx < k; ++k_idx)
|
||||
{
|
||||
// First, each thread does the local argmax
|
||||
@ -458,9 +363,6 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
output[idx] = max_val;
|
||||
indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS;
|
||||
source_rows[idx] = k_idx * num_rows + thread_row;
|
||||
if (renormalize) {
|
||||
selected_sum += max_val;
|
||||
}
|
||||
}
|
||||
|
||||
// Finally, we clear the value in the thread with the current max if there is another iteration to run.
|
||||
@ -478,28 +380,15 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Renormalize the k weights for this row to sum to 1, if requested.
|
||||
if (renormalize) {
|
||||
if (thread_group_idx == 0)
|
||||
{
|
||||
const float denom = selected_sum > 0.f ? selected_sum : 1.f;
|
||||
for (int k_idx = 0; k_idx < k; ++k_idx)
|
||||
{
|
||||
const int idx = k * thread_row + k_idx;
|
||||
output[idx] = output[idx] / denom;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace detail
|
||||
{
|
||||
// Constructs some constants needed to partition the work across threads at compile time.
|
||||
template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename InputType>
|
||||
template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM>
|
||||
struct TopkConstants
|
||||
{
|
||||
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
|
||||
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
|
||||
static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0, "");
|
||||
static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM));
|
||||
static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
|
||||
@ -508,21 +397,20 @@ struct TopkConstants
|
||||
};
|
||||
} // namespace detail
|
||||
|
||||
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType>
|
||||
void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
|
||||
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
|
||||
cudaStream_t stream)
|
||||
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType>
|
||||
void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
|
||||
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
|
||||
{
|
||||
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS);
|
||||
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>;
|
||||
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
|
||||
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
|
||||
static constexpr int VPT = Constants::VPT;
|
||||
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
|
||||
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
||||
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
|
||||
|
||||
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
|
||||
topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType><<<num_blocks, block_dim, 0, stream>>>(
|
||||
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize);
|
||||
topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM><<<num_blocks, block_dim, 0, stream>>>(
|
||||
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
|
||||
}
|
||||
|
||||
#ifndef USE_ROCM
|
||||
@ -530,26 +418,26 @@ void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finishe
|
||||
static_assert(WARP_SIZE == 32, \
|
||||
"Unsupported warp size. Only 32 is supported for CUDA"); \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
|
||||
num_tokens, topk, 0, num_experts, renormalize, stream);
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, stream);
|
||||
#else
|
||||
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
|
||||
if (WARP_SIZE == 64) { \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
|
||||
num_tokens, topk, 0, num_experts, renormalize, stream); \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
|
||||
} else if (WARP_SIZE == 32) { \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
|
||||
num_tokens, topk, 0, num_experts, renormalize, stream); \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
|
||||
} else { \
|
||||
assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename IndType, typename InputType>
|
||||
template <typename IndType>
|
||||
void topkGatingSoftmaxKernelLauncher(
|
||||
const InputType* gating_output,
|
||||
const float* gating_output,
|
||||
float* topk_weights,
|
||||
IndType* topk_indices,
|
||||
int* token_expert_indices,
|
||||
@ -557,15 +445,11 @@ void topkGatingSoftmaxKernelLauncher(
|
||||
const int num_tokens,
|
||||
const int num_experts,
|
||||
const int topk,
|
||||
const bool renormalize,
|
||||
cudaStream_t stream) {
|
||||
static constexpr int WARPS_PER_TB = 4;
|
||||
static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
|
||||
#ifndef USE_ROCM
|
||||
// for bfloat16 dtype, we need 4 bytes loading to make sure num_experts
|
||||
// elements can be loaded by a warp
|
||||
static constexpr int BYTES_PER_LDG_MULTIPLE_64 =
|
||||
(std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) ? 4 : 8;
|
||||
static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
|
||||
#endif
|
||||
switch (num_experts) {
|
||||
case 1:
|
||||
@ -622,11 +506,11 @@ void topkGatingSoftmaxKernelLauncher(
|
||||
TORCH_CHECK(softmax_workspace != nullptr,
|
||||
"softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
|
||||
static constexpr int TPB = 256;
|
||||
moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
|
||||
moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
|
||||
gating_output, nullptr, softmax_workspace, num_experts);
|
||||
moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
|
||||
softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
|
||||
num_experts, topk, 0, num_experts, renormalize);
|
||||
num_experts, topk, 0, num_experts);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -634,50 +518,11 @@ void topkGatingSoftmaxKernelLauncher(
|
||||
} // namespace moe
|
||||
} // namespace vllm
|
||||
|
||||
|
||||
template<typename ComputeType>
|
||||
void dispatch_topk_softmax_launch(
|
||||
torch::Tensor& gating_output,
|
||||
torch::Tensor& topk_weights,
|
||||
torch::Tensor& topk_indices,
|
||||
torch::Tensor& token_expert_indices,
|
||||
torch::Tensor& softmax_workspace,
|
||||
int num_tokens, int num_experts, int topk, bool renormalize, cudaStream_t stream)
|
||||
{
|
||||
if (topk_indices.scalar_type() == at::ScalarType::Int) {
|
||||
vllm::moe::topkGatingSoftmaxKernelLauncher<int, ComputeType>(
|
||||
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
|
||||
topk_weights.data_ptr<float>(),
|
||||
topk_indices.data_ptr<int>(),
|
||||
token_expert_indices.data_ptr<int>(),
|
||||
softmax_workspace.data_ptr<float>(),
|
||||
num_tokens, num_experts, topk, renormalize, stream);
|
||||
} else if (topk_indices.scalar_type() == at::ScalarType::UInt32) {
|
||||
vllm::moe::topkGatingSoftmaxKernelLauncher<uint32_t, ComputeType>(
|
||||
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
|
||||
topk_weights.data_ptr<float>(),
|
||||
topk_indices.data_ptr<uint32_t>(),
|
||||
token_expert_indices.data_ptr<int>(),
|
||||
softmax_workspace.data_ptr<float>(),
|
||||
num_tokens, num_experts, topk, renormalize, stream);
|
||||
} else {
|
||||
TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
|
||||
vllm::moe::topkGatingSoftmaxKernelLauncher<int64_t, ComputeType>(
|
||||
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
|
||||
topk_weights.data_ptr<float>(),
|
||||
topk_indices.data_ptr<int64_t>(),
|
||||
token_expert_indices.data_ptr<int>(),
|
||||
softmax_workspace.data_ptr<float>(),
|
||||
num_tokens, num_experts, topk, renormalize, stream);
|
||||
}
|
||||
}
|
||||
|
||||
void topk_softmax(
|
||||
torch::Tensor& topk_weights, // [num_tokens, topk]
|
||||
torch::Tensor& topk_indices, // [num_tokens, topk]
|
||||
torch::Tensor& token_expert_indices, // [num_tokens, topk]
|
||||
torch::Tensor& gating_output, // [num_tokens, num_experts]
|
||||
bool renormalize)
|
||||
torch::Tensor& gating_output) // [num_tokens, num_experts]
|
||||
{
|
||||
const int num_experts = gating_output.size(-1);
|
||||
const auto num_tokens = gating_output.numel() / num_experts;
|
||||
@ -689,19 +534,45 @@ void topk_softmax(
|
||||
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
const auto workspace_options = gating_output.options().dtype(at::ScalarType::Float);
|
||||
torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options);
|
||||
torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
|
||||
|
||||
if (gating_output.scalar_type() == at::ScalarType::Float) {
|
||||
dispatch_topk_softmax_launch<float>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
|
||||
} else if (gating_output.scalar_type() == at::ScalarType::Half) {
|
||||
dispatch_topk_softmax_launch<__half>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
|
||||
} else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
|
||||
dispatch_topk_softmax_launch<__nv_bfloat16>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
|
||||
if(topk_indices.scalar_type() == at::ScalarType::Int)
|
||||
{
|
||||
vllm::moe::topkGatingSoftmaxKernelLauncher(
|
||||
gating_output.data_ptr<float>(),
|
||||
topk_weights.data_ptr<float>(),
|
||||
topk_indices.data_ptr<int>(),
|
||||
token_expert_indices.data_ptr<int>(),
|
||||
softmax_workspace.data_ptr<float>(),
|
||||
num_tokens,
|
||||
num_experts,
|
||||
topk,
|
||||
stream);
|
||||
}
|
||||
else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
|
||||
{
|
||||
vllm::moe::topkGatingSoftmaxKernelLauncher(
|
||||
gating_output.data_ptr<float>(),
|
||||
topk_weights.data_ptr<float>(),
|
||||
topk_indices.data_ptr<uint32_t>(),
|
||||
token_expert_indices.data_ptr<int>(),
|
||||
softmax_workspace.data_ptr<float>(),
|
||||
num_tokens,
|
||||
num_experts,
|
||||
topk,
|
||||
stream);
|
||||
}
|
||||
else {
|
||||
TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
|
||||
vllm::moe::topkGatingSoftmaxKernelLauncher(
|
||||
gating_output.data_ptr<float>(),
|
||||
topk_weights.data_ptr<float>(),
|
||||
topk_indices.data_ptr<int64_t>(),
|
||||
token_expert_indices.data_ptr<int>(),
|
||||
softmax_workspace.data_ptr<float>(),
|
||||
num_tokens,
|
||||
num_experts,
|
||||
topk,
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
@ -5,7 +5,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
||||
// Apply topk softmax to the gating outputs.
|
||||
m.def(
|
||||
"topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
|
||||
"token_expert_indices, Tensor gating_output, bool renormalize) -> ()");
|
||||
"token_expert_indices, Tensor gating_output) -> ()");
|
||||
m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
|
||||
|
||||
// Calculate the result of moe by summing up the partial results
|
||||
|
||||
@ -92,6 +92,9 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
|
||||
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
|
||||
torch::Tensor& weight, double epsilon);
|
||||
|
||||
void poly_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
|
||||
torch::Tensor& bias, double epsilon);
|
||||
|
||||
void apply_repetition_penalties_(torch::Tensor& logits,
|
||||
const torch::Tensor& prompt_mask,
|
||||
const torch::Tensor& output_mask,
|
||||
|
||||
@ -145,11 +145,7 @@ void rms_norm_dynamic_per_token_quant(
|
||||
if (scale_ub.has_value()) {
|
||||
TORCH_CHECK(out.dtype() == kFp8Type);
|
||||
}
|
||||
TORCH_CHECK(weight.dtype() == input.dtype());
|
||||
TORCH_CHECK(scales.dtype() == torch::kFloat32);
|
||||
if (residual) {
|
||||
TORCH_CHECK(residual->scalar_type() == input.scalar_type());
|
||||
}
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
input.scalar_type(), "rms_norm_dynamic_per_token_quant_dispatch", [&] {
|
||||
|
||||
@ -175,6 +175,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
"float epsilon) -> ()");
|
||||
ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);
|
||||
|
||||
// Polynomial Normalization.
|
||||
ops.def(
|
||||
"poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, float "
|
||||
"epsilon) -> ()");
|
||||
ops.impl("poly_norm", torch::kCUDA, &poly_norm);
|
||||
|
||||
// Apply repetition penalties to logits in-place
|
||||
ops.def(
|
||||
"apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, "
|
||||
|
||||
@ -20,6 +20,8 @@ API documentation for vLLM's configuration classes.
|
||||
- [vllm.config.CompilationConfig][]
|
||||
- [vllm.config.VllmConfig][]
|
||||
|
||||
[](){ #offline-inference-api }
|
||||
|
||||
## Offline Inference
|
||||
|
||||
LLM Class.
|
||||
@ -43,14 +45,18 @@ Engine classes for offline and online inference.
|
||||
|
||||
Inference parameters for vLLM APIs.
|
||||
|
||||
[](){ #sampling-params }
|
||||
|
||||
- [vllm.SamplingParams][]
|
||||
- [vllm.PoolingParams][]
|
||||
|
||||
[](){ #multi-modality }
|
||||
|
||||
## Multi-Modality
|
||||
|
||||
vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package.
|
||||
|
||||
Multi-modal inputs can be passed alongside text and token prompts to [supported models](../models/supported_models.md#list-of-multimodal-language-models)
|
||||
Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models]
|
||||
via the `multi_modal_data` field in [vllm.inputs.PromptType][].
|
||||
|
||||
Looking to add your own multi-modal model? Please follow the instructions listed [here](../contributing/model/multimodal.md).
|
||||
|
||||
@ -4,6 +4,6 @@ This section lists the most common options for running vLLM.
|
||||
|
||||
There are three main levels of configuration, from highest priority to lowest priority:
|
||||
|
||||
- [Request parameters](../serving/openai_compatible_server.md#completions-api) and [input arguments](../api/README.md#inference-parameters)
|
||||
- [Request parameters][completions-api] and [input arguments][sampling-params]
|
||||
- [Engine arguments](./engine_args.md)
|
||||
- [Environment variables](./env_vars.md)
|
||||
|
||||
@ -27,6 +27,8 @@ You can monitor the number of preemption requests through Prometheus metrics exp
|
||||
|
||||
In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as recomputation has lower overhead in the V1 architecture.
|
||||
|
||||
[](){ #chunked-prefill }
|
||||
|
||||
## Chunked Prefill
|
||||
|
||||
Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations.
|
||||
|
||||
@ -7,8 +7,8 @@ toc_depth: 4
|
||||
vLLM provides comprehensive benchmarking tools for performance testing and evaluation:
|
||||
|
||||
- **[Benchmark CLI]**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
|
||||
- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development
|
||||
- **[Nightly benchmarks](#nightly-benchmarks)**: Comparative benchmarks against alternatives
|
||||
- **[Performance benchmarks][performance-benchmarks]**: Automated CI benchmarks for development
|
||||
- **[Nightly benchmarks][nightly-benchmarks]**: Comparative benchmarks against alternatives
|
||||
|
||||
[Benchmark CLI]: #benchmark-cli
|
||||
|
||||
@ -924,6 +924,8 @@ throughput numbers correctly is also adjusted.
|
||||
|
||||
</details>
|
||||
|
||||
[](){ #performance-benchmarks }
|
||||
|
||||
## Performance Benchmarks
|
||||
|
||||
The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.
|
||||
@ -986,6 +988,8 @@ The benchmarking currently runs on a predefined set of models configured in the
|
||||
|
||||
All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
||||
|
||||
[](){ #nightly-benchmarks }
|
||||
|
||||
## Nightly Benchmarks
|
||||
|
||||
These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels.
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# Summary
|
||||
|
||||
!!! important
|
||||
Many decoder language models can now be automatically loaded using the [Transformers backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first!
|
||||
Many decoder language models can now be automatically loaded using the [Transformers backend][transformers-backend] without having to implement them in vLLM. See if `vllm serve <model>` works first!
|
||||
|
||||
vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance.
|
||||
|
||||
|
||||
@ -8,7 +8,7 @@ This page provides detailed instructions on how to do so.
|
||||
|
||||
## Built-in models
|
||||
|
||||
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](../../getting_started/installation/gpu.md#build-wheel-from-source).
|
||||
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source].
|
||||
This gives you the ability to modify the codebase and test your model.
|
||||
|
||||
After you have implemented your model (see [tutorial](basic.md)), put it into the [vllm/model_executor/models](../../../vllm/model_executor/models) directory.
|
||||
|
||||
@ -39,6 +39,8 @@ For [generative models](../../models/generative_models.md), there are two levels
|
||||
|
||||
For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in [tests/models/utils.py](../../../tests/models/utils.py).
|
||||
|
||||
[](){ #mm-processing-tests }
|
||||
|
||||
### Multi-modal processing
|
||||
|
||||
#### Common tests
|
||||
|
||||
@ -180,13 +180,9 @@ The profiling traces generated by the continuous profiling workflow are publicly
|
||||
The Python standard library includes
|
||||
[cProfile](https://docs.python.org/3/library/profile.html) for profiling Python
|
||||
code. vLLM includes a couple of helpers that make it easy to apply it to a section of vLLM.
|
||||
Both the `vllm.utils.profiling.cprofile` and `vllm.utils.profiling.cprofile_context` functions can be
|
||||
Both the `vllm.utils.cprofile` and `vllm.utils.cprofile_context` functions can be
|
||||
used to profile a section of code.
|
||||
|
||||
!!! note
|
||||
The legacy import paths `vllm.utils.cprofile` and `vllm.utils.cprofile_context` are deprecated.
|
||||
Please use `vllm.utils.profiling.cprofile` and `vllm.utils.profiling.cprofile_context` instead.
|
||||
|
||||
### Example usage - decorator
|
||||
|
||||
The first helper is a Python decorator that can be used to profile a function.
|
||||
@ -194,9 +190,9 @@ If a filename is specified, the profile will be saved to that file. If no filena
|
||||
specified, profile data will be printed to stdout.
|
||||
|
||||
```python
|
||||
from vllm.utils.profiling import cprofile
|
||||
import vllm.utils
|
||||
|
||||
@cprofile("expensive_function.prof")
|
||||
@vllm.utils.cprofile("expensive_function.prof")
|
||||
def expensive_function():
|
||||
# some expensive code
|
||||
pass
|
||||
@ -208,13 +204,13 @@ The second helper is a context manager that can be used to profile a block of
|
||||
code. Similar to the decorator, the filename is optional.
|
||||
|
||||
```python
|
||||
from vllm.utils.profiling import cprofile_context
|
||||
import vllm.utils
|
||||
|
||||
def another_function():
|
||||
# more expensive code
|
||||
pass
|
||||
|
||||
with cprofile_context("another_function.prof"):
|
||||
with vllm.utils.cprofile_context("another_function.prof"):
|
||||
another_function()
|
||||
```
|
||||
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
# Using Docker
|
||||
|
||||
[](){ #deployment-docker-pre-built-image }
|
||||
|
||||
## Use vLLM's Official Docker Image
|
||||
|
||||
vLLM offers an official Docker image for deployment.
|
||||
@ -60,6 +62,8 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
|
||||
RUN uv pip install --system git+https://github.com/huggingface/transformers.git
|
||||
```
|
||||
|
||||
[](){ #deployment-docker-build-image-from-source }
|
||||
|
||||
## Building vLLM's Docker Image from Source
|
||||
|
||||
You can build and run vLLM from source via the provided [docker/Dockerfile](../../docker/Dockerfile). To build vLLM:
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
# Anyscale
|
||||
|
||||
[](){ #deployment-anyscale }
|
||||
|
||||
[Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.
|
||||
|
||||
Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
|
||||
|
||||
@ -2,6 +2,8 @@
|
||||
|
||||
This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.
|
||||
|
||||
[](){ #nginxloadbalancer-nginx-build }
|
||||
|
||||
## Build Nginx Container
|
||||
|
||||
This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory.
|
||||
@ -25,6 +27,8 @@ Build the container:
|
||||
docker build . -f Dockerfile.nginx --tag nginx-lb
|
||||
```
|
||||
|
||||
[](){ #nginxloadbalancer-nginx-conf }
|
||||
|
||||
## Create Simple Nginx Config file
|
||||
|
||||
Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`.
|
||||
@ -49,6 +53,8 @@ Create a file named `nginx_conf/nginx.conf`. Note that you can add as many serve
|
||||
}
|
||||
```
|
||||
|
||||
[](){ #nginxloadbalancer-nginx-vllm-container }
|
||||
|
||||
## Build vLLM Container
|
||||
|
||||
```bash
|
||||
@ -67,12 +73,16 @@ docker build \
|
||||
--build-arg https_proxy=$https_proxy
|
||||
```
|
||||
|
||||
[](){ #nginxloadbalancer-nginx-docker-network }
|
||||
|
||||
## Create Docker Network
|
||||
|
||||
```bash
|
||||
docker network create vllm_nginx
|
||||
```
|
||||
|
||||
[](){ #nginxloadbalancer-nginx-launch-container }
|
||||
|
||||
## Launch vLLM Containers
|
||||
|
||||
Notes:
|
||||
@ -112,6 +122,8 @@ Notes:
|
||||
!!! note
|
||||
If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
|
||||
|
||||
[](){ #nginxloadbalancer-nginx-launch-nginx }
|
||||
|
||||
## Launch Nginx
|
||||
|
||||
```bash
|
||||
@ -123,6 +135,8 @@ docker run \
|
||||
--name nginx-lb nginx-lb:latest
|
||||
```
|
||||
|
||||
[](){ #nginxloadbalancer-nginx-verify-nginx }
|
||||
|
||||
## Verify That vLLM Servers Are Ready
|
||||
|
||||
```bash
|
||||
|
||||
@ -47,7 +47,7 @@ Here is a sample of `LLM` class usage:
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
```
|
||||
|
||||
More API details can be found in the [Offline Inference](../api/README.md#offline-inference) section of the API docs.
|
||||
More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
|
||||
|
||||
The code for the `LLM` class can be found in [vllm/entrypoints/llm.py](../../vllm/entrypoints/llm.py).
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# Multi-Modal Data Processing
|
||||
|
||||
To enable various optimizations in vLLM such as [chunked prefill](../configuration/optimization.md#chunked-prefill) and [prefix caching](../features/automatic_prefix_caching.md), we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
|
||||
To enable various optimizations in vLLM such as [chunked prefill][chunked-prefill] and [prefix caching](../features/automatic_prefix_caching.md), we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
|
||||
|
||||
Here are the main features of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor]:
|
||||
|
||||
@ -41,10 +41,14 @@ While HF processors support text + multi-modal inputs natively, this is not so f
|
||||
|
||||
Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other.
|
||||
|
||||
[](){ #mm-dummy-text }
|
||||
|
||||
### Dummy text
|
||||
|
||||
We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text]. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
|
||||
|
||||
[](){ #mm-automatic-prompt-updating }
|
||||
|
||||
### Automatic prompt updating
|
||||
|
||||
We address the second issue by implementing model-agnostic code in
|
||||
@ -60,4 +64,4 @@ Some HF processors, such as the one for Qwen2-VL, are [very slow](https://github
|
||||
|
||||
When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
|
||||
|
||||
Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#dummy-text) to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating](#automatic-prompt-updating) afterwards to keep the output tokens and multi-modal data consistent with each other.
|
||||
Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text][mm-dummy-text] to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating][mm-automatic-prompt-updating] afterwards to keep the output tokens and multi-modal data consistent with each other.
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
## Debugging
|
||||
|
||||
Please see the [Troubleshooting](../usage/troubleshooting.md#python-multiprocessing)
|
||||
Please see the [Troubleshooting][troubleshooting-python-multiprocessing]
|
||||
page for information on known issues and how to solve them.
|
||||
|
||||
## Introduction
|
||||
|
||||
@ -41,7 +41,7 @@ Every plugin has three parts:
|
||||
|
||||
1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group `vllm.general_plugins` to register general plugins. This is the key of `entry_points` in the `setup.py` file. Always use `vllm.general_plugins` for vLLM's general plugins.
|
||||
2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name.
|
||||
3. **Plugin value**: The fully qualified name of the function or module to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module.
|
||||
3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module.
|
||||
|
||||
## Types of supported plugins
|
||||
|
||||
@ -51,8 +51,6 @@ Every plugin has three parts:
|
||||
|
||||
- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the IOProcessor's class fully qualified name.
|
||||
|
||||
- **Stat logger plugins** (with group name `vllm.stat_logger_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree loggers into vLLM. The entry point should be a class that subclasses StatLoggerBase.
|
||||
|
||||
## Guidelines for Writing Plugins
|
||||
|
||||
- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes.
|
||||
|
||||
@ -36,9 +36,9 @@ th:not(:first-child) {
|
||||
}
|
||||
</style>
|
||||
|
||||
| Feature | [CP](../configuration/optimization.md#chunked-prefill) | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|
||||
| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|
||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
||||
| [CP](../configuration/optimization.md#chunked-prefill) | ✅ | | | | | | | | | | | | | | |
|
||||
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
|
||||
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
|
||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
|
||||
| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | |
|
||||
@ -57,14 +57,16 @@ th:not(:first-child) {
|
||||
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
|
||||
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
|
||||
|
||||
[](){ #feature-x-hardware }
|
||||
|
||||
### Feature x Hardware
|
||||
|
||||
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU | Intel GPU |
|
||||
|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----| ------------|
|
||||
| [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| [CP][chunked-prefill] | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26963) |
|
||||
| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26963) |
|
||||
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [❌](https://github.com/vllm-project/vllm/issues/26970) |
|
||||
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ |
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# Multimodal Inputs
|
||||
|
||||
This page teaches you how to pass multi-modal inputs to [multi-modal models](../models/supported_models.md#list-of-multimodal-language-models) in vLLM.
|
||||
This page teaches you how to pass multi-modal inputs to [multi-modal models][supported-mm-models] in vLLM.
|
||||
|
||||
!!! note
|
||||
We are actively iterating on multi-modal support. See [this RFC](https://github.com/vllm-project/vllm/issues/4194) for upcoming changes,
|
||||
|
||||
@ -153,7 +153,7 @@ VLLM_TARGET_DEVICE="tpu" python -m pip install -e .
|
||||
|
||||
### Pre-built images
|
||||
|
||||
See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`.
|
||||
See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`.
|
||||
|
||||
### Build image from source
|
||||
|
||||
|
||||
@ -15,7 +15,7 @@ vLLM contains pre-compiled C++ and CUDA (12.8) binaries.
|
||||
|
||||
In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
|
||||
|
||||
Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-wheel-from-source) for more details.
|
||||
Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below][build-from-source] for more details.
|
||||
|
||||
# --8<-- [end:set-up-using-python]
|
||||
# --8<-- [start:pre-built-wheels]
|
||||
@ -44,6 +44,8 @@ export CUDA_VERSION=118 # or 126
|
||||
uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
|
||||
```
|
||||
|
||||
[](){ #install-the-latest-code }
|
||||
|
||||
#### Install the latest code
|
||||
|
||||
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`.
|
||||
@ -126,11 +128,11 @@ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vll
|
||||
uv pip install --editable .
|
||||
```
|
||||
|
||||
You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code).
|
||||
You can find more information about vLLM's wheels in [install-the-latest-code][install-the-latest-code].
|
||||
|
||||
!!! note
|
||||
There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
|
||||
It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel.
|
||||
It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [install-the-latest-code][install-the-latest-code] for instructions on how to install a specified wheel.
|
||||
|
||||
#### Full build (with compilation)
|
||||
|
||||
@ -248,7 +250,7 @@ uv pip install -e .
|
||||
# --8<-- [end:build-wheel-from-source]
|
||||
# --8<-- [start:pre-built-images]
|
||||
|
||||
See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image.
|
||||
See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image.
|
||||
|
||||
Another way to access the latest code is to use the docker images:
|
||||
|
||||
@ -264,11 +266,11 @@ The latest code can contain bugs and may not be stable. Please use it with cauti
|
||||
# --8<-- [end:pre-built-images]
|
||||
# --8<-- [start:build-image-from-source]
|
||||
|
||||
See [Building vLLM's Docker Image from Source](../../deployment/docker.md#building-vllms-docker-image-from-source) for instructions on building the Docker image.
|
||||
See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image.
|
||||
|
||||
# --8<-- [end:build-image-from-source]
|
||||
# --8<-- [start:supported-features]
|
||||
|
||||
See [Feature x Hardware](../../features/README.md#feature-x-hardware) compatibility matrix for feature support information.
|
||||
See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information.
|
||||
|
||||
# --8<-- [end:supported-features]
|
||||
|
||||
@ -66,6 +66,8 @@ vLLM is a Python library that supports the following GPU variants. Select your G
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu.xpu.inc.md:pre-built-wheels"
|
||||
|
||||
[](){ #build-from-source }
|
||||
|
||||
### Build wheel from source
|
||||
|
||||
=== "NVIDIA CUDA"
|
||||
|
||||
@ -217,6 +217,6 @@ Where the `<path/to/model>` is the location where the model is stored, for examp
|
||||
# --8<-- [end:build-image-from-source]
|
||||
# --8<-- [start:supported-features]
|
||||
|
||||
See [Feature x Hardware](../../features/README.md#feature-x-hardware) compatibility matrix for feature support information.
|
||||
See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information.
|
||||
|
||||
# --8<-- [end:supported-features]
|
||||
|
||||
@ -2,8 +2,8 @@
|
||||
|
||||
This guide will help you quickly get started with vLLM to perform:
|
||||
|
||||
- [Offline batched inference](#offline-batched-inference)
|
||||
- [Online serving using OpenAI-compatible server](#openai-compatible-server)
|
||||
- [Offline batched inference][quickstart-offline]
|
||||
- [Online serving using OpenAI-compatible server][quickstart-online]
|
||||
|
||||
## Prerequisites
|
||||
|
||||
@ -42,6 +42,8 @@ uv pip install vllm --torch-backend=auto
|
||||
!!! note
|
||||
For more detail and non-CUDA platforms, please refer [here](installation/README.md) for specific instructions on how to install vLLM.
|
||||
|
||||
[](){ #quickstart-offline }
|
||||
|
||||
## Offline Batched Inference
|
||||
|
||||
With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py)
|
||||
@ -55,7 +57,7 @@ The first line of this example imports the classes [LLM][vllm.LLM] and [Sampling
|
||||
from vllm import LLM, SamplingParams
|
||||
```
|
||||
|
||||
The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](../api/README.md#inference-parameters).
|
||||
The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here][sampling-params].
|
||||
|
||||
!!! important
|
||||
By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified.
|
||||
@ -133,6 +135,8 @@ for output in outputs:
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
```
|
||||
|
||||
[](){ #quickstart-online }
|
||||
|
||||
## OpenAI-Compatible Server
|
||||
|
||||
vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API.
|
||||
@ -146,7 +150,7 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct
|
||||
|
||||
!!! note
|
||||
By default, the server uses a predefined chat template stored in the tokenizer.
|
||||
You can learn about overriding it [here](../serving/openai_compatible_server.md#chat-template).
|
||||
You can learn about overriding it [here][chat-template].
|
||||
!!! important
|
||||
By default, the server applies `generation_config.json` from the huggingface model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
|
||||
|
||||
|
||||
@ -3,4 +3,4 @@ Loading Model weights with fastsafetensors
|
||||
|
||||
Using fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details.
|
||||
|
||||
To enable this feature, use the `--load-format fastsafetensors` command-line argument
|
||||
To enable this feature, use the ``--load-format fastsafetensors`` command-line argument
|
||||
|
||||
@ -140,5 +140,5 @@ outputs = llm.chat(conversation, chat_template=custom_template)
|
||||
|
||||
Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
|
||||
|
||||
- [Completions API](../serving/openai_compatible_server.md#completions-api) is similar to `LLM.generate` but only accepts text.
|
||||
- [Chat API](../serving/openai_compatible_server.md#chat-api) is similar to `LLM.chat`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for models with a chat template.
|
||||
- [Completions API][completions-api] is similar to `LLM.generate` but only accepts text.
|
||||
- [Chat API][chat-api] is similar to `LLM.chat`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for models with a chat template.
|
||||
|
||||
@ -16,8 +16,8 @@
|
||||
| meta-llama/Llama-4-* | Llama4ForConditionalGeneration | ❌ |
|
||||
| microsoft/Phi-3-mini-128k-instruct | Phi3ForCausalLM | 🟨 |
|
||||
| microsoft/phi-4 | Phi3ForCausalLM | ❌ |
|
||||
| google/gemma-3-27b-it | TransformersForMultimodalLM | 🟨 |
|
||||
| google/gemma-3-4b-it | TransformersForMultimodalLM | ❌ |
|
||||
| google/gemma-3-27b-it | Gemma3ForConditionalGeneration | 🟨 |
|
||||
| google/gemma-3-4b-it | Gemma3ForConditionalGeneration | ❌ |
|
||||
| deepseek-ai/DeepSeek-R1 | DeepseekV3ForCausalLM | ❌ |
|
||||
| deepseek-ai/DeepSeek-V3 | DeepseekV3ForCausalLM | ❌ |
|
||||
| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | LlamaForCausalLM | ✅ |
|
||||
|
||||
@ -185,10 +185,10 @@ print(f"Data: {data!r}")
|
||||
|
||||
Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
|
||||
|
||||
- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
|
||||
- [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for embedding models.
|
||||
- [Classification API](../serving/openai_compatible_server.md#classification-api) is similar to `LLM.classify` and is applicable to sequence classification models.
|
||||
- [Score API](../serving/openai_compatible_server.md#score-api) is similar to `LLM.score` for cross-encoder models.
|
||||
- [Pooling API][pooling-api] is similar to `LLM.encode`, being applicable to all types of pooling models.
|
||||
- [Embeddings API][embeddings-api] is similar to `LLM.embed`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for embedding models.
|
||||
- [Classification API][classification-api] is similar to `LLM.classify` and is applicable to sequence classification models.
|
||||
- [Score API][score-api] is similar to `LLM.score` for cross-encoder models.
|
||||
|
||||
## Matryoshka Embeddings
|
||||
|
||||
|
||||
@ -11,7 +11,9 @@ Alongside each architecture, we include some popular models that use it.
|
||||
|
||||
If vLLM natively supports a model, its implementation can be found in [vllm/model_executor/models](../../vllm/model_executor/models).
|
||||
|
||||
These models are what we list in [supported text models](#list-of-text-only-language-models) and [supported multimodal models](#list-of-multimodal-language-models).
|
||||
These models are what we list in [supported-text-models][supported-text-models] and [supported-mm-models][supported-mm-models].
|
||||
|
||||
[](){ #transformers-backend }
|
||||
|
||||
### Transformers
|
||||
|
||||
@ -58,7 +60,7 @@ For a model to be compatible with the Transformers backend for vLLM it must:
|
||||
- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)):
|
||||
- The model directory must have the correct structure (e.g. `config.json` is present).
|
||||
- `config.json` must contain `auto_map.AutoModel`.
|
||||
- be a Transformers backend for vLLM compatible model (see [Writing custom models](#writing-custom-models)):
|
||||
- be a Transformers backend for vLLM compatible model (see [writing-custom-models][writing-custom-models]):
|
||||
- Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`).
|
||||
|
||||
If the compatible model is:
|
||||
@ -68,6 +70,8 @@ If the compatible model is:
|
||||
|
||||
This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
|
||||
|
||||
[](){ #writing-custom-models }
|
||||
|
||||
#### Writing custom models
|
||||
|
||||
This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)).
|
||||
@ -112,7 +116,7 @@ Here is what happens in the background when this model is loaded:
|
||||
|
||||
1. The config is loaded.
|
||||
2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
|
||||
3. `MyModel` is loaded into one of the Transformers backend classes in [vllm/model_executor/models/transformers](../../vllm/model_executor/models/transformers) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
|
||||
3. `MyModel` is loaded into one of the Transformers backend classes in [vllm/model_executor/models/transformers.py](../../vllm/model_executor/models/transformers.py) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
|
||||
|
||||
That's it!
|
||||
|
||||
@ -160,7 +164,7 @@ To determine whether a given model is natively supported, you can check the `con
|
||||
If the `"architectures"` field contains a model architecture listed below, then it should be natively supported.
|
||||
|
||||
Models do not _need_ to be natively supported to be used in vLLM.
|
||||
The [Transformers backend](#transformers) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
|
||||
The [Transformers backend][transformers-backend] enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
|
||||
|
||||
!!! tip
|
||||
The easiest way to check if your model is really supported at runtime is to run the program below:
|
||||
@ -302,6 +306,8 @@ output = llm.encode("Hello, my name is")
|
||||
print(output)
|
||||
```
|
||||
|
||||
[](){ #feature-status-legend }
|
||||
|
||||
## Feature Status Legend
|
||||
|
||||
- ✅︎ indicates that the feature is supported for the model.
|
||||
@ -310,6 +316,8 @@ print(output)
|
||||
|
||||
- ⚠️ indicates that the feature is available but may have known issues or limitations.
|
||||
|
||||
[](){ #supported-text-models }
|
||||
|
||||
## List of Text-only Language Models
|
||||
|
||||
### Generative Models
|
||||
@ -575,6 +583,8 @@ These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode)
|
||||
!!! note
|
||||
Named Entity Recognition (NER) usage, please refer to [examples/offline_inference/pooling/ner.py](../../examples/offline_inference/pooling/ner.py), [examples/online_serving/pooling/ner_client.py](../../examples/online_serving/pooling/ner_client.py).
|
||||
|
||||
[](){ #supported-mm-models }
|
||||
|
||||
## List of Multimodal Language Models
|
||||
|
||||
The following modalities are supported depending on the model:
|
||||
@ -640,6 +650,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ |
|
||||
| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ |
|
||||
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ |
|
||||
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
|
||||
| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
|
||||
| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ |
|
||||
| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ |
|
||||
@ -653,7 +664,6 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ |
|
||||
| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ |
|
||||
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ |
|
||||
| `LightOnOCRForConditionalGeneration` | LightOnOCR-1B | T + I<sup>+</sup> | `lightonai/LightOnOCR-1B`, etc | ✅︎ | ✅︎ |
|
||||
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ |
|
||||
| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ |
|
||||
| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ |
|
||||
@ -669,6 +679,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
|
||||
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
|
||||
| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
|
||||
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ |
|
||||
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
|
||||
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ |
|
||||
@ -693,8 +704,6 @@ Some models are supported only via the [Transformers backend](#transformers). Th
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|
|
||||
| `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ |
|
||||
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
|
||||
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | ✅︎ | ✅︎ |
|
||||
|
||||
<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
|
||||
• For example, to use DeepSeek-VL2 series models:
|
||||
@ -703,7 +712,21 @@ Some models are supported only via the [Transformers backend](#transformers). Th
|
||||
<sup>+</sup> Multiple items can be inputted per text prompt for this modality.
|
||||
|
||||
!!! warning
|
||||
For `Gemma3ForConditionalGeneration`, `{"do_pan_and_scan": true}` is not supported in Transformers backend yet.
|
||||
Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs.
|
||||
However, there are differences in how they handle text + image inputs:
|
||||
|
||||
V0 correctly implements the model's attention pattern:
|
||||
- Uses bidirectional attention between the image tokens corresponding to the same image
|
||||
- Uses causal attention for other tokens
|
||||
- Implemented via (naive) PyTorch SDPA with masking tensors
|
||||
- Note: May use significant memory for long prompts with image
|
||||
|
||||
V1 currently uses a simplified attention pattern:
|
||||
- Uses causal attention for all tokens, including image tokens
|
||||
- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}`
|
||||
- Will be updated in the future to support the correct behavior
|
||||
|
||||
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
|
||||
|
||||
!!! note
|
||||
`Gemma3nForConditionalGeneration` is only supported on V1 due to shared KV caching and it depends on `timm>=1.0.17` to make use of its
|
||||
@ -755,6 +778,9 @@ Some models are supported only via the [Transformers backend](#transformers). Th
|
||||
The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
|
||||
For more details, please see: <https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630>
|
||||
|
||||
!!! warning
|
||||
Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.
|
||||
|
||||
!!! note
|
||||
For Qwen2.5-Omni and Qwen3-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently work in progress and not yet supported.
|
||||
|
||||
|
||||
@ -69,7 +69,6 @@ There are several notable differences when using Ray:
|
||||
- A single launch command (on any node) is needed to start all local and remote DP ranks, therefore it is more convenient compared to launching on each node
|
||||
- There is no need to specify `--data-parallel-address`, and the node where the command is run is used as `--data-parallel-address`
|
||||
- There is no need to specify `--data-parallel-rpc-port`
|
||||
- When a single DP group requires multiple nodes, *e.g.* in case a single model replica needs to run on at least two nodes, make sure to set `VLLM_RAY_DP_PACK_STRATEGY="span"` in which case `--data-parallel-size-local` is ignored and will be automatically determined
|
||||
- Remote DP ranks will be allocated based on node resources of the Ray cluster
|
||||
|
||||
Currently, the internal DP load balancing is done within the API server process(es) and is based on the running and waiting queues in each of the engines. This could be made more sophisticated in future by incorporating KV cache aware logic.
|
||||
|
||||
@ -4,7 +4,7 @@ For general troubleshooting, see [Troubleshooting](../usage/troubleshooting.md).
|
||||
|
||||
## Verify inter-node GPU communication
|
||||
|
||||
After you start the Ray cluster, verify GPU-to-GPU communication across nodes. Proper configuration can be non-trivial. For more information, see [troubleshooting script](../usage/troubleshooting.md#incorrect-hardwaredriver). If you need additional environment variables for communication configuration, append them to [examples/online_serving/run_cluster.sh](../../examples/online_serving/run_cluster.sh), for example `-e NCCL_SOCKET_IFNAME=eth0`. Setting environment variables during cluster creation is recommended because the variables propagate to all nodes. In contrast, setting environment variables in the shell affects only the local node. For more information, see <https://github.com/vllm-project/vllm/issues/6803>.
|
||||
After you start the Ray cluster, verify GPU-to-GPU communication across nodes. Proper configuration can be non-trivial. For more information, see [troubleshooting script][troubleshooting-incorrect-hardware-driver]. If you need additional environment variables for communication configuration, append them to [examples/online_serving/run_cluster.sh](../../examples/online_serving/run_cluster.sh), for example `-e NCCL_SOCKET_IFNAME=eth0`. Setting environment variables during cluster creation is recommended because the variables propagate to all nodes. In contrast, setting environment variables in the shell affects only the local node. For more information, see <https://github.com/vllm-project/vllm/issues/6803>.
|
||||
|
||||
## No available node types can fulfill resource request
|
||||
|
||||
|
||||
@ -19,7 +19,7 @@ The available APIs depend on the model type:
|
||||
- [Pooling models](../models/pooling_models.md) output their hidden states directly.
|
||||
|
||||
!!! info
|
||||
[API Reference](../api/README.md#offline-inference)
|
||||
[API Reference][offline-inference-api]
|
||||
|
||||
## Ray Data LLM API
|
||||
|
||||
|
||||
@ -44,35 +44,37 @@ To call the server, in your preferred text editor, create a script that uses an
|
||||
|
||||
We currently support the following OpenAI APIs:
|
||||
|
||||
- [Completions API](#completions-api) (`/v1/completions`)
|
||||
- [Completions API][completions-api] (`/v1/completions`)
|
||||
- Only applicable to [text generation models](../models/generative_models.md).
|
||||
- *Note: `suffix` parameter is not supported.*
|
||||
- [Chat Completions API](#chat-api) (`/v1/chat/completions`)
|
||||
- Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template).
|
||||
- [Chat Completions API][chat-api] (`/v1/chat/completions`)
|
||||
- Only applicable to [text generation models](../models/generative_models.md) with a [chat template][chat-template].
|
||||
- *Note: `parallel_tool_calls` and `user` parameters are ignored.*
|
||||
- [Embeddings API](#embeddings-api) (`/v1/embeddings`)
|
||||
- [Embeddings API][embeddings-api] (`/v1/embeddings`)
|
||||
- Only applicable to [embedding models](../models/pooling_models.md).
|
||||
- [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`)
|
||||
- [Transcriptions API][transcriptions-api] (`/v1/audio/transcriptions`)
|
||||
- Only applicable to [Automatic Speech Recognition (ASR) models](../models/supported_models.md#transcription).
|
||||
- [Translation API](#translations-api) (`/v1/audio/translations`)
|
||||
- [Translation API][translations-api] (`/v1/audio/translations`)
|
||||
- Only applicable to [Automatic Speech Recognition (ASR) models](../models/supported_models.md#transcription).
|
||||
|
||||
In addition, we have the following custom APIs:
|
||||
|
||||
- [Tokenizer API](#tokenizer-api) (`/tokenize`, `/detokenize`)
|
||||
- [Tokenizer API][tokenizer-api] (`/tokenize`, `/detokenize`)
|
||||
- Applicable to any model with a tokenizer.
|
||||
- [Pooling API](#pooling-api) (`/pooling`)
|
||||
- [Pooling API][pooling-api] (`/pooling`)
|
||||
- Applicable to all [pooling models](../models/pooling_models.md).
|
||||
- [Classification API](#classification-api) (`/classify`)
|
||||
- [Classification API][classification-api] (`/classify`)
|
||||
- Only applicable to [classification models](../models/pooling_models.md).
|
||||
- [Score API](#score-api) (`/score`)
|
||||
- [Score API][score-api] (`/score`)
|
||||
- Applicable to [embedding models and cross-encoder models](../models/pooling_models.md).
|
||||
- [Re-rank API](#re-rank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
|
||||
- [Re-rank API][rerank-api] (`/rerank`, `/v1/rerank`, `/v2/rerank`)
|
||||
- Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/)
|
||||
- Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank)
|
||||
- Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response.
|
||||
- Only applicable to [cross-encoder models](../models/pooling_models.md).
|
||||
|
||||
[](){ #chat-template }
|
||||
|
||||
## Chat Template
|
||||
|
||||
In order for the language model to support chat protocol, vLLM requires the model to include
|
||||
@ -172,6 +174,8 @@ with `--enable-request-id-headers`.
|
||||
|
||||
## API Reference
|
||||
|
||||
[](){ #completions-api }
|
||||
|
||||
### Completions API
|
||||
|
||||
Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions);
|
||||
@ -181,7 +185,7 @@ Code example: [examples/online_serving/openai_completion_client.py](../../exampl
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
The following [sampling parameters](../api/README.md#inference-parameters) are supported.
|
||||
The following [sampling parameters][sampling-params] are supported.
|
||||
|
||||
??? code
|
||||
|
||||
@ -197,6 +201,8 @@ The following extra parameters are supported:
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params"
|
||||
```
|
||||
|
||||
[](){ #chat-api }
|
||||
|
||||
### Chat API
|
||||
|
||||
Our Chat API is compatible with [OpenAI's Chat Completions API](https://platform.openai.com/docs/api-reference/chat);
|
||||
@ -212,7 +218,7 @@ Code example: [examples/online_serving/openai_chat_completion_client.py](../../e
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
The following [sampling parameters](../api/README.md#inference-parameters) are supported.
|
||||
The following [sampling parameters][sampling-params] are supported.
|
||||
|
||||
??? code
|
||||
|
||||
@ -228,6 +234,8 @@ The following extra parameters are supported:
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params"
|
||||
```
|
||||
|
||||
[](){ #embeddings-api }
|
||||
|
||||
### Embeddings API
|
||||
|
||||
Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
|
||||
@ -235,7 +243,7 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
|
||||
|
||||
Code example: [examples/online_serving/pooling/openai_embedding_client.py](../../examples/online_serving/pooling/openai_embedding_client.py)
|
||||
|
||||
If the model has a [chat template](../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api))
|
||||
If the model has a [chat template][chat-template], you can replace `inputs` with a list of `messages` (same schema as [Chat API][chat-api])
|
||||
which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations:
|
||||
|
||||
??? code
|
||||
@ -361,6 +369,8 @@ For chat-like input (i.e. if `messages` is passed), these extra parameters are s
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params"
|
||||
```
|
||||
|
||||
[](){ #transcriptions-api }
|
||||
|
||||
### Transcriptions API
|
||||
|
||||
Our Transcriptions API is compatible with [OpenAI's Transcriptions API](https://platform.openai.com/docs/api-reference/audio/createTranscription);
|
||||
@ -458,7 +468,7 @@ For `verbose_json` response format:
|
||||
|
||||
#### Extra Parameters
|
||||
|
||||
The following [sampling parameters](../api/README.md#inference-parameters) are supported.
|
||||
The following [sampling parameters][sampling-params] are supported.
|
||||
|
||||
??? code
|
||||
|
||||
@ -474,6 +484,8 @@ The following extra parameters are supported:
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
|
||||
```
|
||||
|
||||
[](){ #translations-api }
|
||||
|
||||
### Translations API
|
||||
|
||||
Our Translation API is compatible with [OpenAI's Translations API](https://platform.openai.com/docs/api-reference/audio/createTranslation);
|
||||
@ -488,7 +500,7 @@ Code example: [examples/online_serving/openai_translation_client.py](../../examp
|
||||
|
||||
#### Extra Parameters
|
||||
|
||||
The following [sampling parameters](../api/README.md#inference-parameters) are supported.
|
||||
The following [sampling parameters][sampling-params] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:translation-sampling-params"
|
||||
@ -500,6 +512,8 @@ The following extra parameters are supported:
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:translation-extra-params"
|
||||
```
|
||||
|
||||
[](){ #tokenizer-api }
|
||||
|
||||
### Tokenizer API
|
||||
|
||||
Our Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer).
|
||||
@ -508,14 +522,18 @@ It consists of two endpoints:
|
||||
- `/tokenize` corresponds to calling `tokenizer.encode()`.
|
||||
- `/detokenize` corresponds to calling `tokenizer.decode()`.
|
||||
|
||||
[](){ #pooling-api }
|
||||
|
||||
### Pooling API
|
||||
|
||||
Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states.
|
||||
|
||||
The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
|
||||
The input format is the same as [Embeddings API][embeddings-api], but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
|
||||
|
||||
Code example: [examples/online_serving/pooling/openai_pooling_client.py](../../examples/online_serving/pooling/openai_pooling_client.py)
|
||||
|
||||
[](){ #classification-api }
|
||||
|
||||
### Classification API
|
||||
|
||||
Our Classification API directly supports Hugging Face sequence-classification models such as [ai21labs/Jamba-tiny-reward-dev](https://huggingface.co/ai21labs/Jamba-tiny-reward-dev) and [jason9693/Qwen2.5-1.5B-apeach](https://huggingface.co/jason9693/Qwen2.5-1.5B-apeach).
|
||||
@ -631,6 +649,8 @@ The following extra parameters are supported:
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:classification-extra-params"
|
||||
```
|
||||
|
||||
[](){ #score-api }
|
||||
|
||||
### Score API
|
||||
|
||||
Our Score API can apply a cross-encoder model or an embedding model to predict scores for sentence or multimodal pairs. When using an embedding model the score corresponds to the cosine similarity between each embedding pair.
|
||||
@ -836,6 +856,8 @@ The following extra parameters are supported:
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:score-extra-params"
|
||||
```
|
||||
|
||||
[](){ #rerank-api }
|
||||
|
||||
### Re-rank API
|
||||
|
||||
Our Re-rank API can apply an embedding model or a cross-encoder model to predict relevant scores between a single query, and
|
||||
|
||||
@ -38,7 +38,7 @@ If other strategies don't solve the problem, it's likely that the vLLM instance
|
||||
- `export VLLM_LOG_STATS_INTERVAL=1.` to get log statistics more frequently for tracking running queue, waiting queue and cache hit states.
|
||||
- `export CUDA_LAUNCH_BLOCKING=1` to identify which CUDA kernel is causing the problem.
|
||||
- `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL.
|
||||
- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. (WARNING: This flag will slow down the token generation by **over 100x**. Do not use unless absolutely needed.)
|
||||
- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. Do not use this flag unless absolutely needed for debugging, it will cause significant delays in startup time.
|
||||
|
||||
## Breakpoints
|
||||
|
||||
@ -80,6 +80,8 @@ You might also need to set `export NCCL_SOCKET_IFNAME=<your_network_interface>`
|
||||
If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph.
|
||||
To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the [LLM][vllm.LLM] class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
|
||||
|
||||
[](){ #troubleshooting-incorrect-hardware-driver }
|
||||
|
||||
## Incorrect hardware/driver
|
||||
|
||||
If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
|
||||
@ -176,6 +178,8 @@ If the test script hangs or crashes, usually it means the hardware/drivers are b
|
||||
|
||||
Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes.
|
||||
|
||||
[](){ #troubleshooting-python-multiprocessing }
|
||||
|
||||
## Python multiprocessing
|
||||
|
||||
### `RuntimeError` Exception
|
||||
|
||||
@ -248,8 +248,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model=model_name,
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
# TODO: Support this in transformers backend
|
||||
# mm_processor_kwargs={"do_pan_and_scan": True},
|
||||
mm_processor_kwargs={"do_pan_and_scan": True},
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
@ -734,26 +733,6 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# LightOnOCR
|
||||
def run_lightonocr(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
prompts = [
|
||||
"<|im_start|>system<|im_end|>\n<|im_start|>user\n<|image_pad|><|im_end|>\n<|im_start|>assistant\n"
|
||||
for _ in questions
|
||||
]
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="lightonai/LightOnOCR-1B",
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
@ -1729,7 +1708,6 @@ model_example_map = {
|
||||
"keye_vl": run_keye_vl,
|
||||
"keye_vl1_5": run_keye_vl1_5,
|
||||
"kimi_vl": run_kimi_vl,
|
||||
"lightonocr": run_lightonocr,
|
||||
"llama4": run_llama4,
|
||||
"llava": run_llava,
|
||||
"llava-next": run_llava_next,
|
||||
|
||||
@ -6,7 +6,7 @@ import torch
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.device_allocator.cumem import CuMemAllocator
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils import GiB_bytes
|
||||
|
||||
from ..utils import create_new_process_for_each_test
|
||||
|
||||
|
||||
@ -3,22 +3,16 @@
|
||||
|
||||
import weakref
|
||||
from collections.abc import Callable, Sequence
|
||||
from contextlib import nullcontext
|
||||
from copy import deepcopy
|
||||
|
||||
import depyf
|
||||
from torch import fx
|
||||
from torch._ops import OpOverload
|
||||
from torch.fx._utils import lazy_format_graph_code
|
||||
|
||||
from vllm.compilation.fx_utils import find_op_nodes
|
||||
from vllm.compilation.inductor_pass import InductorPass
|
||||
from vllm.compilation.pass_manager import with_pattern_match_debug
|
||||
from vllm.compilation.vllm_inductor_pass import VllmInductorPass
|
||||
from vllm.config import VllmConfig, get_current_vllm_config
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger("vllm.tests.compile.backend")
|
||||
|
||||
|
||||
class LazyInitPass(InductorPass):
|
||||
@ -51,32 +45,20 @@ class TestBackend:
|
||||
|
||||
def __init__(self, *passes: InductorPass | Callable[[fx.Graph], None]):
|
||||
self.custom_passes = list(passes)
|
||||
vllm_config = get_current_vllm_config()
|
||||
compile_config = vllm_config.compilation_config
|
||||
# Deepcopy to allow multiple TestBackend instances to use the same VllmConfig
|
||||
self.inductor_config = deepcopy(compile_config.inductor_compile_config)
|
||||
compile_config = get_current_vllm_config().compilation_config
|
||||
self.inductor_config = compile_config.inductor_compile_config
|
||||
self.inductor_config["force_disable_caches"] = True
|
||||
self.inductor_config["post_grad_custom_post_pass"] = self.post_pass
|
||||
|
||||
if debug_dump_path := vllm_config.compile_debug_dump_path():
|
||||
logger.debug("Dumping depyf output to %s", debug_dump_path)
|
||||
self.debug_ctx = depyf.prepare_debug(debug_dump_path.as_posix())
|
||||
else:
|
||||
self.debug_ctx = nullcontext()
|
||||
|
||||
def __call__(self, graph: fx.GraphModule, example_inputs):
|
||||
self.graph_pre_compile = deepcopy(graph)
|
||||
from torch._inductor.compile_fx import compile_fx
|
||||
|
||||
with self.debug_ctx:
|
||||
return compile_fx(
|
||||
graph, example_inputs, config_patches=self.inductor_config
|
||||
)
|
||||
return compile_fx(graph, example_inputs, config_patches=self.inductor_config)
|
||||
|
||||
@with_pattern_match_debug
|
||||
def post_pass(self, graph: fx.Graph):
|
||||
self.graph_pre_pass = deepcopy(graph)
|
||||
lazy_format_graph_code("graph_pre_pass", graph.owning_module)
|
||||
|
||||
VllmInductorPass.dump_prefix = 0
|
||||
for pass_ in self.custom_passes:
|
||||
@ -86,7 +68,6 @@ class TestBackend:
|
||||
VllmInductorPass.dump_prefix = None
|
||||
|
||||
self.graph_post_pass = deepcopy(graph)
|
||||
lazy_format_graph_code("graph_post_pass", graph.owning_module)
|
||||
# assign by reference, will reflect the final state of the graph
|
||||
self.final_graph = graph
|
||||
|
||||
|
||||
@ -11,7 +11,7 @@ from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import CompilationConfig
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
|
||||
@ -20,7 +20,7 @@ from vllm.config import (
|
||||
set_current_vllm_config,
|
||||
)
|
||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
|
||||
# This import automatically registers `torch.ops.silly.attention`
|
||||
from .. import silly_attention # noqa: F401
|
||||
|
||||
@ -19,7 +19,7 @@ from vllm.config import (
|
||||
set_current_vllm_config,
|
||||
)
|
||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
|
||||
# This import automatically registers `torch.ops.silly.attention`
|
||||
from ..silly_attention import get_global_counter, reset_global_counter
|
||||
|
||||
@ -27,7 +27,7 @@ from vllm.config import (
|
||||
set_current_vllm_config,
|
||||
)
|
||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
|
||||
# This import automatically registers `torch.ops.silly.attention`
|
||||
from .. import silly_attention # noqa: F401
|
||||
|
||||
@ -8,7 +8,7 @@ Centralizes custom operation definitions to avoid duplicate registrations.
|
||||
import torch
|
||||
from torch.library import Library
|
||||
|
||||
from vllm.utils.torch_utils import direct_register_custom_op
|
||||
from vllm.utils import direct_register_custom_op
|
||||
|
||||
# Shared library for all compilation test operations
|
||||
# Using "silly" namespace to match existing test expectations
|
||||
|
||||
@ -15,7 +15,7 @@ from vllm.config import (
|
||||
set_current_vllm_config,
|
||||
)
|
||||
from vllm.forward_context import set_forward_context
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
|
||||
|
||||
def reference_fn(x: torch.Tensor):
|
||||
|
||||
@ -5,7 +5,7 @@ import dataclasses
|
||||
import pytest
|
||||
|
||||
from vllm.config import CompilationMode
|
||||
from vllm.utils.torch_utils import cuda_device_count_stateless
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
|
||||
from ..utils import compare_all_settings
|
||||
|
||||
|
||||
@ -8,7 +8,7 @@ from vllm.compilation.counter import compilation_counter
|
||||
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
|
||||
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
|
||||
from vllm.config.compilation import CompilationMode
|
||||
from vllm.utils.torch_utils import _is_torch_equal_or_newer, is_torch_equal_or_newer
|
||||
from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer
|
||||
|
||||
|
||||
def test_version():
|
||||
|
||||
@ -15,7 +15,7 @@ from vllm.config import (
|
||||
set_current_vllm_config,
|
||||
)
|
||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
|
||||
# This import automatically registers `torch.ops.silly.attention`
|
||||
from . import silly_attention # noqa: F401
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import logging
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
@ -10,9 +10,11 @@ import torch
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.attention.backends.registry import _Backend
|
||||
from vllm.attention.selector import global_force_attn_backend_context_manager
|
||||
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
|
||||
from ..utils import create_new_process_for_each_test
|
||||
|
||||
@ -21,23 +23,22 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
|
||||
TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
|
||||
("facebook/opt-125m", {}),
|
||||
(
|
||||
"neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic",
|
||||
{"dtype": torch.float16},
|
||||
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
|
||||
{
|
||||
"dtype": torch.float16,
|
||||
},
|
||||
),
|
||||
(
|
||||
"neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic",
|
||||
{
|
||||
"dtype": torch.float16,
|
||||
},
|
||||
),
|
||||
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}),
|
||||
("meta-llama/Llama-3.2-1B-Instruct", {}),
|
||||
]
|
||||
|
||||
if all:
|
||||
TEST_MODELS.extend(
|
||||
[
|
||||
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}),
|
||||
(
|
||||
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
|
||||
{"dtype": torch.float16},
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
# TODO: figure out why this fails.
|
||||
if False and is_quant_method_supported("gguf"): # noqa: SIM223
|
||||
TEST_MODELS.append(
|
||||
@ -82,38 +83,31 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
|
||||
"compilation_mode",
|
||||
[CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE],
|
||||
)
|
||||
@pytest.mark.parametrize("model, model_kwargs", models_list(all=True))
|
||||
@pytest.mark.parametrize("model_info", models_list(all=True))
|
||||
@create_new_process_for_each_test()
|
||||
def test_full_graph(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
model: str,
|
||||
model_kwargs: dict[str, Any],
|
||||
model_info: tuple[str, dict[str, Any]],
|
||||
compilation_mode: int,
|
||||
):
|
||||
if (
|
||||
"w8a8" in model
|
||||
or "w8w8" in model
|
||||
and current_platform.has_device_capability((10, 0))
|
||||
):
|
||||
# int8 removed on Blackwell:
|
||||
pytest.skip("int8 support removed on Blackwell")
|
||||
model, model_kwargs = model_info
|
||||
|
||||
with monkeypatch.context():
|
||||
print(f"MODEL={model}")
|
||||
|
||||
run_model(compilation_mode, model, **model_kwargs)
|
||||
run_model(compilation_mode, model, model_kwargs)
|
||||
|
||||
|
||||
# TODO(luka) add other supported compilation config scenarios here
|
||||
@pytest.mark.parametrize(
|
||||
"compilation_config, model, model_kwargs",
|
||||
"compilation_config, model_info",
|
||||
[
|
||||
# additional compile sizes, only some of the models
|
||||
(
|
||||
CompilationConfig(mode=CompilationMode.VLLM_COMPILE, compile_sizes=[1, 2]),
|
||||
*model_info,
|
||||
model,
|
||||
)
|
||||
for model_info in models_list(all=False)
|
||||
for model in models_list(all=False)
|
||||
]
|
||||
+ [
|
||||
# RMSNorm + quant fusion, only 8-bit quant models
|
||||
@ -123,19 +117,18 @@ def test_full_graph(
|
||||
custom_ops=["+rms_norm"],
|
||||
pass_config=PassConfig(enable_fusion=True, enable_noop=True),
|
||||
),
|
||||
*model_info,
|
||||
model,
|
||||
)
|
||||
for model_info in models_list(keywords=["FP8-dynamic", "quantized.w8a8"])
|
||||
for model in models_list(keywords=["FP8-dynamic", "quantized.w8a8"])
|
||||
]
|
||||
+ [
|
||||
# Test depyf integration works
|
||||
(
|
||||
CompilationConfig(
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
debug_dump_path=Path(tempfile.gettempdir()),
|
||||
debug_dump_path=tempfile.gettempdir(),
|
||||
),
|
||||
"facebook/opt-125m",
|
||||
{},
|
||||
("facebook/opt-125m", {}),
|
||||
),
|
||||
]
|
||||
+ [
|
||||
@ -149,9 +142,9 @@ def test_full_graph(
|
||||
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||
compile_sizes=[1, 2],
|
||||
),
|
||||
*model_info,
|
||||
model,
|
||||
)
|
||||
for model_info in models_list(all=False)
|
||||
for model in models_list(all=False)
|
||||
if is_torch_equal_or_newer("2.9.0.dev")
|
||||
],
|
||||
)
|
||||
@ -159,24 +152,16 @@ def test_full_graph(
|
||||
@create_new_process_for_each_test()
|
||||
def test_custom_compile_config(
|
||||
compilation_config: CompilationConfig,
|
||||
model: str,
|
||||
model_kwargs: dict[str, Any],
|
||||
model_info: tuple[str, dict[str, Any]],
|
||||
):
|
||||
if (
|
||||
"w8a8" in model
|
||||
or "w8w8" in model
|
||||
and current_platform.has_device_capability((10, 0))
|
||||
):
|
||||
# int8 removed on Blackwell:
|
||||
pytest.skip("int8 support removed on Blackwell")
|
||||
|
||||
if compilation_config.use_inductor_graph_partition and not is_torch_equal_or_newer(
|
||||
"2.9.0.dev"
|
||||
):
|
||||
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
||||
|
||||
model, model_kwargs = model_info
|
||||
print(f"MODEL={model}")
|
||||
run_model(compilation_config, model, **model_kwargs)
|
||||
run_model(compilation_config, model, model_kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -191,16 +176,50 @@ def test_fp8_kv_scale_compile(compilation_mode: int):
|
||||
"calculate_kv_scales": True,
|
||||
"max_model_len": 512,
|
||||
}
|
||||
run_model(compilation_mode, model, **model_kwargs)
|
||||
run_model(compilation_mode, model, model_kwargs)
|
||||
|
||||
|
||||
def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
|
||||
compilation_config = (
|
||||
compile_config
|
||||
if isinstance(compile_config, CompilationConfig)
|
||||
else CompilationConfig(level=compile_config)
|
||||
def test_inductor_graph_partition_attn_fusion(caplog_vllm):
|
||||
if not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
||||
|
||||
model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
|
||||
compilation_config = CompilationConfig(
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
use_inductor_graph_partition=True,
|
||||
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||
custom_ops=["+quant_fp8"],
|
||||
pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True),
|
||||
)
|
||||
model_kwargs = {
|
||||
"kv_cache_dtype": "fp8",
|
||||
"max_model_len": 1024,
|
||||
}
|
||||
with (
|
||||
caplog_vllm.at_level(logging.DEBUG),
|
||||
global_force_attn_backend_context_manager(_Backend.FLASHINFER),
|
||||
):
|
||||
run_model(compilation_config, model, model_kwargs)
|
||||
|
||||
try:
|
||||
assert "Fused quantization onto 48 attention nodes" in caplog_vllm.text, (
|
||||
caplog_vllm.text
|
||||
)
|
||||
except AssertionError:
|
||||
# Note: this message is only triggered when the compilation goes
|
||||
# through the custom pass. Due to multiple layers of cache on
|
||||
# PyTorch side, the compilation of a graph may be cached such
|
||||
# that custom pass directly goes through cache. In this case,
|
||||
# we go through this branch and assert that the pass is not
|
||||
# triggered.
|
||||
assert "Fused quantization" not in caplog_vllm.text
|
||||
|
||||
|
||||
def run_model(
|
||||
compile_config: int | CompilationConfig,
|
||||
model: str,
|
||||
model_kwargs: dict[str, Any],
|
||||
):
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
@ -208,17 +227,12 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg
|
||||
"The future of AI is",
|
||||
]
|
||||
sampling_params = SamplingParams(temperature=0)
|
||||
# Allow override from model_kwargs
|
||||
model_kwargs = {"tensor_parallel_size": 1, **model_kwargs}
|
||||
model_kwargs = {"disable_custom_all_reduce": True, **model_kwargs}
|
||||
|
||||
# No cudagraphs by default
|
||||
if compilation_config.cudagraph_mode is None:
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||
|
||||
llm = LLM(
|
||||
model=model,
|
||||
compilation_config=compilation_config,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=1,
|
||||
disable_custom_all_reduce=True,
|
||||
compilation_config=compile_config,
|
||||
**model_kwargs,
|
||||
)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
@ -11,13 +11,7 @@ from vllm.compilation.fusion import RMSNormQuantFusionPass
|
||||
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||
from vllm.config import (
|
||||
CompilationConfig,
|
||||
ModelConfig,
|
||||
PassConfig,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
from vllm.config import CompilationConfig, PassConfig, VllmConfig
|
||||
from vllm.model_executor.layers.activation import SiluAndMul
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
|
||||
@ -54,7 +48,8 @@ class TestSiluMul(torch.nn.Module):
|
||||
return y
|
||||
|
||||
def example_inputs(self, num_tokens=32, hidden_size=128):
|
||||
return (torch.rand(num_tokens, hidden_size * 2),)
|
||||
dtype = torch.float16 if TEST_FP8 else torch.float32
|
||||
return (torch.rand(num_tokens, hidden_size * 2, dtype=dtype),)
|
||||
|
||||
def ops_in_model(self, do_fusion):
|
||||
if TEST_FP8 and do_fusion:
|
||||
@ -72,11 +67,15 @@ class TestFusedAddRMSNorm(torch.nn.Module):
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
|
||||
dtype = torch.float16 if TEST_FP8 else torch.float32
|
||||
|
||||
self.gate_proj = torch.nn.Parameter(
|
||||
torch.empty((intermediate_size, hidden_size))
|
||||
torch.empty((intermediate_size, hidden_size), dtype=dtype)
|
||||
)
|
||||
self.norm = RMSNorm(intermediate_size, 1e-05)
|
||||
self.norm.weight = torch.nn.Parameter(torch.ones(intermediate_size))
|
||||
self.norm.weight = torch.nn.Parameter(
|
||||
torch.ones(intermediate_size, dtype=dtype)
|
||||
)
|
||||
|
||||
torch.nn.init.normal_(self.gate_proj, std=0.02)
|
||||
|
||||
@ -113,8 +112,9 @@ class TestFusedAddRMSNorm(torch.nn.Module):
|
||||
return norm_output, residual_output
|
||||
|
||||
def example_inputs(self, batch_size=8, hidden_size=16, seq_len=16):
|
||||
hidden_states = torch.randn((batch_size * seq_len, hidden_size))
|
||||
residual = torch.randn((batch_size * seq_len, hidden_size))
|
||||
dtype = torch.float16 if TEST_FP8 else torch.float32
|
||||
hidden_states = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
|
||||
residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
|
||||
return (hidden_states, residual)
|
||||
|
||||
def ops_in_model(self, do_fusion):
|
||||
@ -145,9 +145,10 @@ class TestRotaryEmbedding(torch.nn.Module):
|
||||
return q_rotated, k_rotated
|
||||
|
||||
def example_inputs(self, num_tokens=32, head_dim=64):
|
||||
dtype = torch.float16
|
||||
positions = torch.arange(num_tokens, dtype=torch.long)
|
||||
q = torch.randn(num_tokens, head_dim)
|
||||
k = torch.randn(num_tokens, head_dim)
|
||||
q = torch.randn(num_tokens, head_dim, dtype=dtype)
|
||||
k = torch.randn(num_tokens, head_dim, dtype=dtype)
|
||||
return (positions, q, k)
|
||||
|
||||
def ops_in_model(self, do_fusion):
|
||||
@ -165,7 +166,7 @@ class TestRotaryEmbeddingSliceScatter(torch.nn.Module):
|
||||
self.hidden_size = head_dim * num_heads
|
||||
|
||||
self.qkv_proj = torch.nn.Linear(
|
||||
self.hidden_size, self.hidden_size * 3, bias=False
|
||||
self.hidden_size, self.hidden_size * 3, bias=False, dtype=torch.float16
|
||||
)
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
@ -189,9 +190,10 @@ class TestRotaryEmbeddingSliceScatter(torch.nn.Module):
|
||||
return qkv_updated
|
||||
|
||||
def example_inputs(self, num_tokens=32, head_dim=64, num_heads=4):
|
||||
dtype = torch.float16
|
||||
hidden_size = head_dim * num_heads
|
||||
positions = torch.arange(num_tokens, dtype=torch.long)
|
||||
hidden_states = torch.randn(num_tokens, hidden_size)
|
||||
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||
return (positions, hidden_states)
|
||||
|
||||
def ops_in_model(self, do_fusion):
|
||||
@ -209,58 +211,48 @@ MODELS = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("model_class", MODELS)
|
||||
@pytest.mark.parametrize("do_fusion", [True, False])
|
||||
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda", reason="Only test on CUDA")
|
||||
def test_fix_functionalization(
|
||||
model_class: torch.nn.Module, do_fusion: bool, dtype: torch.dtype
|
||||
):
|
||||
def test_fix_functionalization(model_class: torch.nn.Module, do_fusion: bool):
|
||||
torch.set_default_device("cuda")
|
||||
torch.set_default_dtype(dtype)
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
model_config=ModelConfig(dtype=dtype),
|
||||
compilation_config=CompilationConfig(
|
||||
custom_ops=["all"],
|
||||
pass_config=PassConfig(enable_fusion=do_fusion, enable_noop=True),
|
||||
),
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.compilation_config = CompilationConfig(
|
||||
pass_config=PassConfig(enable_fusion=do_fusion, enable_noop=True)
|
||||
)
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
fusion_pass = RMSNormQuantFusionPass(vllm_config)
|
||||
cleanup_pass = PostCleanupPass(vllm_config)
|
||||
act_quant_fusion_pass = ActivationQuantFusionPass(vllm_config)
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
assert RMSNorm.enabled()
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
fusion_pass = RMSNormQuantFusionPass(vllm_config)
|
||||
cleanup_pass = PostCleanupPass(vllm_config)
|
||||
act_quant_fusion_pass = ActivationQuantFusionPass(vllm_config)
|
||||
passes = (
|
||||
[noop_pass, fusion_pass, act_quant_fusion_pass, cleanup_pass]
|
||||
if do_fusion
|
||||
else [noop_pass, cleanup_pass]
|
||||
)
|
||||
func_pass = FixFunctionalizationPass(vllm_config)
|
||||
|
||||
passes = (
|
||||
[noop_pass, fusion_pass, act_quant_fusion_pass, cleanup_pass]
|
||||
if do_fusion
|
||||
else [noop_pass, cleanup_pass]
|
||||
)
|
||||
func_pass = FixFunctionalizationPass(vllm_config)
|
||||
backend_func = TestBackend(*passes, func_pass)
|
||||
backend_no_func = TestBackend(*passes)
|
||||
|
||||
backend_func = TestBackend(*passes, func_pass)
|
||||
backend_no_func = TestBackend(*passes)
|
||||
model = model_class()
|
||||
torch.compile(model, backend=backend_func)(*model.example_inputs())
|
||||
torch.compile(model, backend=backend_no_func)(*model.example_inputs())
|
||||
|
||||
model = model_class()
|
||||
torch.compile(model, backend=backend_func)(*model.example_inputs())
|
||||
torch.compile(model, backend=backend_no_func)(*model.example_inputs())
|
||||
# check if the functionalization pass is applied
|
||||
for op in model.ops_in_model(do_fusion):
|
||||
find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
|
||||
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None
|
||||
|
||||
# check if the functionalization pass is applied
|
||||
# make sure the ops were all de-functionalized
|
||||
found = dict()
|
||||
for node in backend_func.graph_post_pass.nodes:
|
||||
for op in model.ops_in_model(do_fusion):
|
||||
find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
|
||||
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None
|
||||
|
||||
# make sure the ops were all de-functionalized
|
||||
found = dict()
|
||||
for node in backend_func.graph_post_pass.nodes:
|
||||
for op in model.ops_in_model(do_fusion):
|
||||
if is_func(node, op):
|
||||
found[op] = True
|
||||
for op in model.ops_not_in_model():
|
||||
if is_func(node, op):
|
||||
found[op] = True
|
||||
assert all(found[op] for op in model.ops_in_model(do_fusion))
|
||||
assert all(not found.get(op) for op in model.ops_not_in_model())
|
||||
if is_func(node, op):
|
||||
found[op] = True
|
||||
for op in model.ops_not_in_model():
|
||||
if is_func(node, op):
|
||||
found[op] = True
|
||||
assert all(found[op] for op in model.ops_in_model(do_fusion))
|
||||
assert all(not found.get(op) for op in model.ops_not_in_model())
|
||||
|
||||
@ -5,18 +5,15 @@ import pytest
|
||||
import torch
|
||||
|
||||
import vllm.plugins
|
||||
from vllm.compilation.fusion import FUSED_OPS, FusedRMSQuantKey, RMSNormQuantFusionPass
|
||||
from vllm.compilation.fx_utils import find_op_nodes
|
||||
from vllm.compilation.matcher_utils import QUANT_OPS
|
||||
from vllm.compilation.fusion import (
|
||||
FUSED_OPS,
|
||||
QUANT_OPS,
|
||||
FusedRMSQuantKey,
|
||||
RMSNormQuantFusionPass,
|
||||
)
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||
from vllm.config import (
|
||||
CompilationConfig,
|
||||
CompilationMode,
|
||||
ModelConfig,
|
||||
PassConfig,
|
||||
VllmConfig,
|
||||
)
|
||||
from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
GroupShape,
|
||||
@ -35,9 +32,6 @@ from .backend import TestBackend
|
||||
|
||||
FP8_DTYPE = current_platform.fp8_dtype()
|
||||
|
||||
RMS_OP = torch.ops._C.rms_norm.default
|
||||
RMS_ADD_OP = torch.ops._C.fused_add_rms_norm.default
|
||||
|
||||
|
||||
class TestModel(torch.nn.Module):
|
||||
def __init__(
|
||||
@ -51,18 +45,18 @@ class TestModel(torch.nn.Module):
|
||||
):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.cuda_force_torch = cuda_force_torch
|
||||
self.norm = [RMSNorm(hidden_size, eps) for _ in range(4)]
|
||||
self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
|
||||
self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
|
||||
self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
|
||||
group_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN
|
||||
quant_scale = ScaleDesc(torch.float32, static, group_shape)
|
||||
self.quant_key = QuantKey(dtype=FP8_DTYPE, scale=quant_scale, symmetric=True)
|
||||
self.key = QuantKey(dtype=FP8_DTYPE, scale=quant_scale, symmetric=True)
|
||||
if static:
|
||||
self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
|
||||
self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
|
||||
else:
|
||||
self.scale = [None for _ in range(3)]
|
||||
self.scale = [None for _ in range(2)]
|
||||
self.w = [
|
||||
torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
|
||||
for _ in range(3)
|
||||
for _ in range(2)
|
||||
]
|
||||
|
||||
with override_cutlass_fp8_supported(not cuda_force_torch):
|
||||
@ -71,12 +65,8 @@ class TestModel(torch.nn.Module):
|
||||
act_quant_group_shape=group_shape,
|
||||
)
|
||||
|
||||
self.enable_rms_norm_custom_op = self.norm[0].enabled()
|
||||
self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled()
|
||||
|
||||
def forward(self, x):
|
||||
# avoid having graph input be an arg to a pattern directly
|
||||
x = resid = torch.relu(x)
|
||||
resid = torch.sqrt(x)
|
||||
y = self.norm[0](x)
|
||||
|
||||
x2 = self.fp8_linear.apply(
|
||||
@ -88,44 +78,24 @@ class TestModel(torch.nn.Module):
|
||||
x3 = self.fp8_linear.apply(
|
||||
y2, self.w[1], self.wscale[1], input_scale=self.scale[1]
|
||||
)
|
||||
|
||||
y3, resid = self.norm[2](x3, resid) # use resid here
|
||||
return y3
|
||||
|
||||
x4 = self.fp8_linear.apply(
|
||||
y3, self.w[2], self.wscale[2], input_scale=self.scale[2]
|
||||
)
|
||||
|
||||
y4, resid = self.norm[3](x4, resid) # use resid here
|
||||
return y4
|
||||
def ops_in_model_before(self):
|
||||
return [QUANT_OPS[self.key]]
|
||||
|
||||
def ops_in_model_after(self):
|
||||
return [
|
||||
FUSED_OPS[FusedRMSQuantKey(self.quant_key, True)],
|
||||
FUSED_OPS[FusedRMSQuantKey(self.quant_key, False)],
|
||||
FUSED_OPS[FusedRMSQuantKey(self.key, False)],
|
||||
FUSED_OPS[FusedRMSQuantKey(self.key, True)],
|
||||
]
|
||||
|
||||
def ops_in_model_before(self):
|
||||
return (
|
||||
[QUANT_OPS[self.quant_key]]
|
||||
if self.enable_quant_fp8_custom_op
|
||||
else [torch.ops.aten.reciprocal]
|
||||
)
|
||||
|
||||
def ops_in_model_before_partial(self):
|
||||
return (
|
||||
[RMS_OP, RMS_ADD_OP]
|
||||
if self.enable_rms_norm_custom_op
|
||||
else [torch.ops.aten.rsqrt]
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("hidden_size", [64])
|
||||
@pytest.mark.parametrize("num_tokens", [257])
|
||||
@pytest.mark.parametrize("eps", [1e-5, 1e-6])
|
||||
@pytest.mark.parametrize("static", [True, False])
|
||||
@pytest.mark.parametrize("enable_rms_norm_custom_op", [True, False])
|
||||
@pytest.mark.parametrize("enable_quant_fp8_custom_op", [True, False])
|
||||
# cuda_force_torch used to test torch code path on platforms that
|
||||
# cutlass_fp8_supported() == True.
|
||||
@pytest.mark.parametrize(
|
||||
@ -135,32 +105,19 @@ class TestModel(torch.nn.Module):
|
||||
not current_platform.is_cuda_alike(), reason="Only test on CUDA and ROCm"
|
||||
)
|
||||
def test_fusion_rmsnorm_quant(
|
||||
dtype,
|
||||
hidden_size,
|
||||
num_tokens,
|
||||
eps,
|
||||
static,
|
||||
enable_rms_norm_custom_op,
|
||||
enable_quant_fp8_custom_op,
|
||||
cuda_force_torch,
|
||||
dtype, hidden_size, num_tokens, eps, static, cuda_force_torch
|
||||
):
|
||||
torch.set_default_device("cuda")
|
||||
torch.set_default_dtype(dtype)
|
||||
torch.manual_seed(1)
|
||||
maybe_create_device_identity() # needed for certain non-cutlass fp8 paths
|
||||
|
||||
custom_ops = []
|
||||
if enable_rms_norm_custom_op:
|
||||
custom_ops.append("+rms_norm")
|
||||
if enable_quant_fp8_custom_op:
|
||||
custom_ops.append("+quant_fp8")
|
||||
vllm_config = VllmConfig(
|
||||
model_config=ModelConfig(dtype=dtype),
|
||||
compilation_config=CompilationConfig(
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
custom_ops=custom_ops,
|
||||
custom_ops=["+rms_norm", "+quant_fp8"],
|
||||
pass_config=PassConfig(enable_fusion=True, enable_noop=True),
|
||||
),
|
||||
)
|
||||
)
|
||||
with vllm.config.set_current_vllm_config(vllm_config):
|
||||
# Reshape pass is needed for the fusion pass to work
|
||||
@ -169,39 +126,31 @@ def test_fusion_rmsnorm_quant(
|
||||
cleanup_pass = PostCleanupPass(vllm_config)
|
||||
|
||||
backend = TestBackend(noop_pass, fusion_pass, cleanup_pass)
|
||||
backend2 = TestBackend(noop_pass, cleanup_pass)
|
||||
model = TestModel(hidden_size, eps, static, cuda_force_torch)
|
||||
|
||||
# First dimension dynamic
|
||||
x = torch.rand(num_tokens, hidden_size)
|
||||
torch._dynamo.mark_dynamic(x, 0)
|
||||
|
||||
model_fused = torch.compile(model, backend=backend)
|
||||
result_fused = model_fused(x)
|
||||
result = model(x)
|
||||
|
||||
model_unfused = torch.compile(model, backend=backend2)
|
||||
result_unfused = model_unfused(x)
|
||||
model2 = torch.compile(model, backend=backend)
|
||||
result2 = model2(x)
|
||||
|
||||
if dtype == torch.float16:
|
||||
# Higher tol for dynamic, even higher for bfloat16
|
||||
if static:
|
||||
ATOL, RTOL = (1e-3, 1e-3)
|
||||
elif dtype == torch.float16:
|
||||
ATOL, RTOL = (2e-3, 2e-3)
|
||||
else:
|
||||
ATOL, RTOL = (1e-2, 1e-2)
|
||||
|
||||
torch.testing.assert_close(result_fused, result_unfused, atol=ATOL, rtol=RTOL)
|
||||
torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL)
|
||||
|
||||
assert fusion_pass.matched_count == 3
|
||||
assert fusion_pass.matched_count == 2
|
||||
|
||||
# In pre-nodes, fp8 quant should be there and fused kernels should not
|
||||
backend.check_before_ops(model.ops_in_model_before())
|
||||
backend.check_before_ops(
|
||||
model.ops_in_model_before_partial(), fully_replaced=False
|
||||
)
|
||||
backend.check_after_ops(model.ops_in_model_after())
|
||||
|
||||
# If RMSNorm custom op is disabled (native/torch impl used),
|
||||
# there's a risk that the fused add doesn't get included in the
|
||||
# replacement and only the rms part gets fused with quant.
|
||||
# Hence, we check only 2 add nodes are left (final fused rmsnorm add).
|
||||
if not enable_rms_norm_custom_op:
|
||||
n_add_nodes = lambda g: sum(1 for _ in find_op_nodes(torch.ops.aten.add, g))
|
||||
# 7 = 1 (RMS) + 3x2 (3xRMS_ADD, 2 each)
|
||||
assert n_add_nodes(backend.graph_pre_pass) == 7
|
||||
assert n_add_nodes(backend.graph_post_pass) == 2
|
||||
# In post-nodes, fused kernels should be there and fp8 quant should not
|
||||
backend.check_after_ops(model.ops_in_model_after())
|
||||
|
||||
@ -6,7 +6,6 @@ import pytest
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
|
||||
from vllm.compilation.collective_fusion import AllReduceFusionPass
|
||||
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
@ -18,7 +17,6 @@ from vllm.config import (
|
||||
ModelConfig,
|
||||
PassConfig,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
from vllm.distributed import tensor_model_parallel_all_reduce
|
||||
from vllm.distributed.parallel_state import (
|
||||
@ -27,8 +25,8 @@ from vllm.distributed.parallel_state import (
|
||||
)
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||
Fp8LinearOp,
|
||||
GroupShape,
|
||||
QuantFP8,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import update_environment_variables
|
||||
@ -42,30 +40,13 @@ class TestAllReduceRMSNormModel(torch.nn.Module):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.eps = eps
|
||||
self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
|
||||
self.w = [torch.rand(hidden_size, hidden_size) for _ in range(3)]
|
||||
self.norm = RMSNorm(hidden_size, eps)
|
||||
|
||||
def forward(self, x):
|
||||
# avoid having graph input be an arg to a pattern directly
|
||||
z = torch.relu(x)
|
||||
x = resid = tensor_model_parallel_all_reduce(z)
|
||||
y = self.norm[0](x)
|
||||
|
||||
z2 = torch.mm(y, self.w[0])
|
||||
x2 = tensor_model_parallel_all_reduce(z2)
|
||||
|
||||
y2, resid = self.norm[1](x2, resid)
|
||||
|
||||
z3 = torch.mm(y2, self.w[1])
|
||||
x3 = tensor_model_parallel_all_reduce(z3)
|
||||
|
||||
y3, resid = self.norm[2](x3, resid)
|
||||
|
||||
z4 = torch.mm(y3, self.w[2])
|
||||
x4 = tensor_model_parallel_all_reduce(z4)
|
||||
|
||||
y4, resid = self.norm[3](x4, resid)
|
||||
return y4
|
||||
def forward(self, hidden_states, residual):
|
||||
view = hidden_states.reshape(-1, self.hidden_size)
|
||||
all_reduce = tensor_model_parallel_all_reduce(view)
|
||||
norm = self.norm(all_reduce)
|
||||
return norm
|
||||
|
||||
def ops_in_model_before(self):
|
||||
return [torch.ops.vllm.all_reduce.default]
|
||||
@ -74,53 +55,44 @@ class TestAllReduceRMSNormModel(torch.nn.Module):
|
||||
return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default]
|
||||
|
||||
|
||||
class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
|
||||
class TestAllReduceFusedAddRMSNormModel(torch.nn.Module):
|
||||
def __init__(self, hidden_size=16, token_num=16, eps=1e-6):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.eps = eps
|
||||
self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
|
||||
self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
|
||||
self.w = [
|
||||
torch.rand(hidden_size, hidden_size)
|
||||
.to(dtype=current_platform.fp8_dtype())
|
||||
.t()
|
||||
for _ in range(3)
|
||||
]
|
||||
self.norm = RMSNorm(hidden_size, eps)
|
||||
|
||||
self.fp8_linear = Fp8LinearOp(
|
||||
act_quant_static=True,
|
||||
act_quant_group_shape=GroupShape.PER_TENSOR,
|
||||
def forward(self, hidden_states, residual):
|
||||
view = hidden_states.reshape(-1, self.hidden_size)
|
||||
all_reduce = tensor_model_parallel_all_reduce(view)
|
||||
norm, _ = self.norm(all_reduce, residual)
|
||||
return norm
|
||||
|
||||
def ops_in_model_before(self):
|
||||
return [torch.ops.vllm.all_reduce.default]
|
||||
|
||||
def ops_in_model_after(self):
|
||||
return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default]
|
||||
|
||||
|
||||
class TestAllReduceFusedAddRMSNormStaticQuantFP8Model(torch.nn.Module):
|
||||
def __init__(self, hidden_size=16, token_num=16, eps=1e-6):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.eps = eps
|
||||
self.norm = RMSNorm(hidden_size, eps)
|
||||
self.quant_fp8 = QuantFP8(static=True, group_shape=GroupShape.PER_TENSOR)
|
||||
self.scale = torch.rand(1, dtype=torch.float32)
|
||||
self.output = torch.empty((token_num, hidden_size), dtype=torch.float32)
|
||||
|
||||
def forward(self, hidden_states, residual):
|
||||
view = hidden_states.reshape(-1, self.hidden_size)
|
||||
all_reduce = tensor_model_parallel_all_reduce(view)
|
||||
norm_output, residual_output = self.norm(all_reduce, residual)
|
||||
torch.ops._C.static_scaled_fp8_quant(
|
||||
self.output, norm_output.contiguous(), self.scale
|
||||
)
|
||||
|
||||
self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
|
||||
|
||||
def forward(self, hidden_states):
|
||||
# avoid having graph input be an arg to a pattern directly
|
||||
z = torch.relu(hidden_states)
|
||||
x = resid = tensor_model_parallel_all_reduce(z)
|
||||
y = self.norm[0](x)
|
||||
|
||||
z2 = self.fp8_linear.apply(
|
||||
y, self.w[0], self.wscale[0], input_scale=self.scale[0]
|
||||
)
|
||||
|
||||
x2 = tensor_model_parallel_all_reduce(z2)
|
||||
y2, resid = self.norm[1](x2, resid)
|
||||
|
||||
z3 = self.fp8_linear.apply(
|
||||
y2, self.w[1], self.wscale[1], input_scale=self.scale[1]
|
||||
)
|
||||
|
||||
x3 = tensor_model_parallel_all_reduce(z3)
|
||||
y3, resid = self.norm[2](x3, resid) # use resid here
|
||||
|
||||
z4 = self.fp8_linear.apply(
|
||||
y3, self.w[2], self.wscale[2], input_scale=self.scale[2]
|
||||
)
|
||||
x4 = tensor_model_parallel_all_reduce(z4)
|
||||
y4, resid = self.norm[3](x4, resid) # use resid here
|
||||
return y4
|
||||
return self.output, residual_output
|
||||
|
||||
def ops_in_model_after(self):
|
||||
return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default]
|
||||
@ -128,9 +100,7 @@ class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
|
||||
def ops_in_model_before(self):
|
||||
return [
|
||||
torch.ops.vllm.all_reduce.default,
|
||||
torch.ops._C.static_scaled_fp8_quant.default
|
||||
if self.fp8_linear.quant_fp8.enabled()
|
||||
else torch.ops.aten.reciprocal.default,
|
||||
torch.ops._C.static_scaled_fp8_quant.default,
|
||||
]
|
||||
|
||||
|
||||
@ -139,48 +109,25 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.eps = eps
|
||||
self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
|
||||
self.norm = RMSNorm(hidden_size, eps)
|
||||
self.scale = torch.rand(1, dtype=torch.float32)
|
||||
self.output = torch.empty((token_num, hidden_size), dtype=torch.float32)
|
||||
|
||||
self.w = [torch.rand(hidden_size, hidden_size) for _ in range(3)]
|
||||
self.agscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
|
||||
wgscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
|
||||
self.alpha = [1 / (w * a) for w, a in zip(wgscale, self.agscale)]
|
||||
round_up = lambda x, y: (x + y - 1) // y * y
|
||||
rounded_m = round_up(token_num, 128)
|
||||
scale_n = hidden_size // 16
|
||||
rounded_n = round_up(scale_n, 4)
|
||||
self.output_scale = torch.empty((rounded_m, rounded_n // 4), dtype=torch.int32)
|
||||
|
||||
wq_gen, wscale_gen = zip(
|
||||
*(scaled_fp4_quant(w, wg) for w, wg in zip(self.w, wgscale))
|
||||
def forward(self, hidden_states, residual):
|
||||
view = hidden_states.reshape(-1, self.hidden_size)
|
||||
all_reduce = tensor_model_parallel_all_reduce(view)
|
||||
norm_output, residual_output = self.norm(all_reduce, residual)
|
||||
norm_output = norm_output.reshape(-1, norm_output.shape[-1])
|
||||
torch.ops._C.scaled_fp4_quant(
|
||||
self.output, norm_output, self.output_scale, self.scale
|
||||
)
|
||||
self.wq, self.wscale = list(wq_gen), list(wscale_gen)
|
||||
print(f"{self.wq=}, {self.wscale=}")
|
||||
|
||||
def forward(self, hidden_states):
|
||||
# avoid having graph input be an arg to a pattern directly
|
||||
z = torch.relu(hidden_states)
|
||||
x = resid = tensor_model_parallel_all_reduce(z)
|
||||
y = self.norm[0](x)
|
||||
|
||||
yq, y_scale = scaled_fp4_quant(y, self.agscale[0])
|
||||
z2 = cutlass_scaled_fp4_mm(
|
||||
yq, self.wq[0], y_scale, self.wscale[0], self.alpha[0], out_dtype=y.dtype
|
||||
)
|
||||
|
||||
x2 = tensor_model_parallel_all_reduce(z2)
|
||||
y2, resid = self.norm[1](x2, resid)
|
||||
|
||||
yq2, y_scale2 = scaled_fp4_quant(y2, self.agscale[1])
|
||||
z3 = cutlass_scaled_fp4_mm(
|
||||
yq2, self.wq[1], y_scale2, self.wscale[1], self.alpha[1], out_dtype=y2.dtype
|
||||
)
|
||||
|
||||
x3 = tensor_model_parallel_all_reduce(z3)
|
||||
y3, resid = self.norm[2](x3, resid) # use resid here
|
||||
|
||||
yq3, y_scale3 = scaled_fp4_quant(y3, self.agscale[2])
|
||||
z4 = cutlass_scaled_fp4_mm(
|
||||
yq3, self.wq[2], y_scale3, self.wscale[2], self.alpha[2], out_dtype=y3.dtype
|
||||
)
|
||||
x4 = tensor_model_parallel_all_reduce(z4)
|
||||
y4, resid = self.norm[3](x4, resid) # use resid here
|
||||
return y4
|
||||
return self.output, residual_output, self.output_scale
|
||||
|
||||
def ops_in_model_after(self):
|
||||
return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default]
|
||||
@ -194,19 +141,19 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module):
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize(
|
||||
"test_model, enable_quant_fp8_custom_op",
|
||||
"test_model",
|
||||
[
|
||||
(TestAllReduceRMSNormModel, False),
|
||||
(TestAllReduceRMSNormStaticQuantFP8Model, True),
|
||||
(TestAllReduceRMSNormStaticQuantFP8Model, False),
|
||||
(TestAllReduceFusedAddRMSNormStaticQuantFP4Model, False),
|
||||
TestAllReduceRMSNormModel,
|
||||
TestAllReduceFusedAddRMSNormModel,
|
||||
TestAllReduceFusedAddRMSNormStaticQuantFP8Model,
|
||||
# TODO: Enable with torch==2.8.0
|
||||
# TestAllReduceFusedAddRMSNormStaticQuantFP4Model,
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("batch_size", [8])
|
||||
@pytest.mark.parametrize("seq_len", [8])
|
||||
@pytest.mark.parametrize("hidden_size", [64])
|
||||
@pytest.mark.parametrize("hidden_size", [16])
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("enable_rms_norm_custom_op", [True, False])
|
||||
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
|
||||
@pytest.mark.skipif(
|
||||
not find_spec("flashinfer")
|
||||
@ -220,8 +167,6 @@ def test_all_reduce_fusion_pass_replace(
|
||||
seq_len: int,
|
||||
hidden_size: int,
|
||||
dtype: torch.dtype,
|
||||
enable_rms_norm_custom_op,
|
||||
enable_quant_fp8_custom_op,
|
||||
):
|
||||
num_processes = 2
|
||||
if (
|
||||
@ -236,16 +181,7 @@ def test_all_reduce_fusion_pass_replace(
|
||||
def run_torch_spawn(fn, nprocs):
|
||||
torch.multiprocessing.spawn(
|
||||
fn,
|
||||
args=(
|
||||
num_processes,
|
||||
test_model,
|
||||
batch_size,
|
||||
seq_len,
|
||||
hidden_size,
|
||||
dtype,
|
||||
enable_rms_norm_custom_op,
|
||||
enable_quant_fp8_custom_op,
|
||||
),
|
||||
args=(num_processes, test_model, batch_size, seq_len, hidden_size, dtype),
|
||||
nprocs=nprocs,
|
||||
)
|
||||
|
||||
@ -260,8 +196,6 @@ def all_reduce_fusion_pass_on_test_model(
|
||||
seq_len: int,
|
||||
hidden_size: int,
|
||||
dtype: torch.dtype,
|
||||
enable_rms_norm_custom_op,
|
||||
enable_quant_fp8_custom_op,
|
||||
):
|
||||
current_platform.seed_everything(0)
|
||||
|
||||
@ -283,22 +217,15 @@ def all_reduce_fusion_pass_on_test_model(
|
||||
init_distributed_environment()
|
||||
initialize_model_parallel(tensor_model_parallel_size=world_size)
|
||||
|
||||
custom_ops = []
|
||||
if enable_rms_norm_custom_op:
|
||||
custom_ops.append("+rms_norm")
|
||||
if enable_quant_fp8_custom_op:
|
||||
custom_ops.append("+quant_fp8")
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
mode=CompilationMode.VLLM_COMPILE, custom_ops=custom_ops
|
||||
mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"]
|
||||
)
|
||||
)
|
||||
vllm_config.compilation_config.pass_config = PassConfig(
|
||||
enable_fi_allreduce_fusion=True, enable_noop=True
|
||||
)
|
||||
vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))
|
||||
vllm_config.parallel_config.rank = local_rank # Setup rank for debug path
|
||||
|
||||
# this is a fake model name to construct the model config
|
||||
# in the vllm_config, it's not really used.
|
||||
@ -306,27 +233,24 @@ def all_reduce_fusion_pass_on_test_model(
|
||||
vllm_config.model_config = ModelConfig(
|
||||
model=model_name, trust_remote_code=True, dtype=dtype, seed=42
|
||||
)
|
||||
with set_current_vllm_config(vllm_config):
|
||||
all_reduce_fusion_pass = AllReduceFusionPass(vllm_config)
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
func_pass = FixFunctionalizationPass(vllm_config)
|
||||
cleanup_pass = PostCleanupPass(vllm_config)
|
||||
|
||||
backend = TestBackend(
|
||||
noop_pass, all_reduce_fusion_pass, func_pass, cleanup_pass
|
||||
)
|
||||
all_reduce_fusion_pass = AllReduceFusionPass(vllm_config)
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
func_pass = FixFunctionalizationPass(vllm_config)
|
||||
cleanup_pass = PostCleanupPass(vllm_config)
|
||||
|
||||
token_num = batch_size * seq_len
|
||||
model = test_model_cls(hidden_size, token_num)
|
||||
backend = TestBackend(all_reduce_fusion_pass, noop_pass, func_pass, cleanup_pass)
|
||||
|
||||
hidden_states = torch.randn((token_num, hidden_size), requires_grad=False)
|
||||
token_num = batch_size * seq_len
|
||||
model = test_model_cls(hidden_size, token_num)
|
||||
|
||||
compiled_model = torch.compile(model, backend=backend)
|
||||
compiled_model(hidden_states)
|
||||
hidden_states = torch.randn((token_num, hidden_size), requires_grad=False)
|
||||
residual = torch.randn((token_num, hidden_size), requires_grad=False)
|
||||
|
||||
assert all_reduce_fusion_pass.matched_count == 4, (
|
||||
f"{all_reduce_fusion_pass.matched_count=}"
|
||||
)
|
||||
backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
|
||||
backend.check_after_ops(model.ops_in_model_after())
|
||||
del all_reduce_fusion_pass
|
||||
compiled_model = torch.compile(model, backend=backend)
|
||||
compiled_model(hidden_states, residual)
|
||||
|
||||
assert all_reduce_fusion_pass.matched_count == 1
|
||||
backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
|
||||
backend.check_after_ops(model.ops_in_model_after())
|
||||
del all_reduce_fusion_pass
|
||||
|
||||
@ -6,15 +6,14 @@ import pytest
|
||||
import torch._dynamo
|
||||
|
||||
from tests.compile.backend import LazyInitPass, TestBackend
|
||||
from tests.utils import flat_product
|
||||
from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
|
||||
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
|
||||
from vllm.attention import Attention, AttentionMetadata
|
||||
from vllm.attention.backends.registry import _Backend
|
||||
from vllm.attention.selector import global_force_attn_backend_context_manager
|
||||
from vllm.compilation.fusion import QUANT_OPS
|
||||
from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
|
||||
from vllm.compilation.fx_utils import find_op_nodes
|
||||
from vllm.compilation.matcher_utils import QUANT_OPS
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||
from vllm.config import (
|
||||
@ -29,18 +28,21 @@ from vllm.config import (
|
||||
)
|
||||
from vllm.forward_context import get_forward_context, set_forward_context
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
QuantKey,
|
||||
kFp8StaticTensorSym,
|
||||
kNvfp4Quant,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.flashinfer import has_flashinfer
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
from vllm.v1.kv_cache_interface import AttentionSpec
|
||||
|
||||
FP8_DTYPE = current_platform.fp8_dtype()
|
||||
FP4_DTYPE = torch.uint8
|
||||
|
||||
# globals needed for string-import custom Dynamo backend field
|
||||
backend: TestBackend | None = None
|
||||
backend_unfused: TestBackend | None = None
|
||||
|
||||
|
||||
class AttentionQuantPatternModel(torch.nn.Module):
|
||||
"""Base model for AttentionQuantPattern fusion."""
|
||||
@ -102,7 +104,6 @@ class AttentionQuantPatternModel(torch.nn.Module):
|
||||
num_blocks = batch_size * max_blocks
|
||||
backend = self.attn.backend
|
||||
|
||||
# TODO(luka) use get_kv_cache_stride_order
|
||||
# Create dummy KV cache for the selected backend
|
||||
if backend == _Backend.ROCM_ATTN:
|
||||
# k/v as 1st dimention
|
||||
@ -240,40 +241,26 @@ class TestAttentionNvfp4QuantPatternModel(AttentionQuantPatternModel):
|
||||
)
|
||||
|
||||
|
||||
MODELS_FP8: list[tuple[str, type]] = []
|
||||
MODELS_FP4: list[tuple[str, type]] = []
|
||||
HEADS: list[tuple[int, int]] = []
|
||||
SPLIT_ATTENTION: list[bool] = []
|
||||
BACKENDS_FP8: list[_Backend] = []
|
||||
BACKENDS_FP4: list[_Backend] = []
|
||||
|
||||
if current_platform.is_cuda():
|
||||
HEADS = [(64, 8), (40, 8)]
|
||||
MODELS_FP8 = [
|
||||
MODELS = [
|
||||
(
|
||||
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
|
||||
TestAttentionFp8StaticQuantPatternModel,
|
||||
)
|
||||
]
|
||||
MODELS_FP4 = [
|
||||
),
|
||||
(
|
||||
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
|
||||
TestAttentionNvfp4QuantPatternModel,
|
||||
)
|
||||
),
|
||||
]
|
||||
BACKENDS_FP8 = [_Backend.TRITON_ATTN, _Backend.FLASHINFER]
|
||||
BACKENDS_FP4 = [_Backend.FLASHINFER]
|
||||
|
||||
HEADS = [(64, 8), (40, 8)]
|
||||
elif current_platform.is_rocm():
|
||||
HEADS = [(32, 8), (40, 8)]
|
||||
MODELS_FP8 = [
|
||||
MODELS = [
|
||||
("amd/Llama-3.1-8B-Instruct-FP8-KV", TestAttentionFp8StaticQuantPatternModel)
|
||||
]
|
||||
BACKENDS = [
|
||||
_Backend.ROCM_AITER_UNIFIED_ATTN,
|
||||
_Backend.ROCM_ATTN,
|
||||
_Backend.TRITON_ATTN,
|
||||
]
|
||||
HEADS = [(32, 8), (40, 8)]
|
||||
else:
|
||||
MODELS = []
|
||||
HEADS = []
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_qo_heads, num_kv_heads", HEADS)
|
||||
@ -282,36 +269,46 @@ elif current_platform.is_rocm():
|
||||
"batch_size", [7, 256, 533] if current_platform.is_cuda() else [8]
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
|
||||
@pytest.mark.parametrize("model_name, model_class", MODELS)
|
||||
@pytest.mark.parametrize(
|
||||
"backend, model_name, model_class, custom_ops",
|
||||
# Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8
|
||||
list(flat_product(BACKENDS_FP8, MODELS_FP8, ["+quant_fp8", "-quant_fp8"]))
|
||||
# quant_fp4 only has the custom impl
|
||||
+ list(flat_product(BACKENDS_FP4, MODELS_FP4, [""])),
|
||||
"backend",
|
||||
[_Backend.FLASHINFER]
|
||||
if current_platform.is_cuda()
|
||||
else [_Backend.ROCM_AITER_UNIFIED_ATTN, _Backend.ROCM_ATTN, _Backend.TRITON_ATTN],
|
||||
)
|
||||
# TODO(boyuan): test inductor graph partition on rocm
|
||||
@pytest.mark.parametrize(
|
||||
"use_inductor_graph_partition",
|
||||
[False] if current_platform.is_rocm() else [False, True],
|
||||
)
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda_alike(), reason="Only test ROCm or CUDA"
|
||||
)
|
||||
@pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_cuda() and not current_platform.is_device_capability((10, 0)),
|
||||
reason="On CUDA only test on SM100(Blackwell)",
|
||||
)
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda_alike(), reason="Only test ROCm or CUDA"
|
||||
)
|
||||
def test_attention_quant_pattern(
|
||||
num_qo_heads: int,
|
||||
num_kv_heads: int,
|
||||
head_size: int,
|
||||
batch_size: int,
|
||||
dtype: torch.dtype,
|
||||
custom_ops: str,
|
||||
model_name: str,
|
||||
model_class: type[AttentionQuantPatternModel],
|
||||
backend: _Backend,
|
||||
use_inductor_graph_partition: bool,
|
||||
dist_init,
|
||||
caplog_vllm,
|
||||
):
|
||||
"""Test AttentionStaticQuantPattern fusion pass"""
|
||||
if backend == _Backend.FLASHINFER and (
|
||||
not current_platform.is_device_capability((10, 0)) or not has_flashinfer()
|
||||
):
|
||||
pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
|
||||
|
||||
custom_ops_list = custom_ops.split(",") if custom_ops else []
|
||||
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
||||
|
||||
device = torch.device("cuda:0")
|
||||
torch.manual_seed(42)
|
||||
@ -325,7 +322,8 @@ def test_attention_quant_pattern(
|
||||
scheduler_config=SchedulerConfig(max_num_seqs=1024),
|
||||
compilation_config=CompilationConfig(
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
custom_ops=custom_ops_list,
|
||||
custom_ops=["+quant_fp8"],
|
||||
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||
),
|
||||
cache_config=CacheConfig(cache_dtype="fp8"),
|
||||
)
|
||||
@ -360,9 +358,8 @@ def test_attention_quant_pattern(
|
||||
forward_ctx = get_forward_context()
|
||||
forward_ctx.attn_metadata = model_unfused.build_attn_metadata(batch_size)
|
||||
|
||||
# Run model directly without fusion
|
||||
# Still compile so query QuantFP8 has closer numerics
|
||||
result_unfused = torch.compile(model_unfused, fullgraph=True)(q, k, v)
|
||||
# Run model directly without compilation and fusion
|
||||
result_unfused = model_unfused(q, k, v)
|
||||
|
||||
# Run model with attn fusion enabled
|
||||
vllm_config.compilation_config.pass_config = PassConfig(
|
||||
@ -417,25 +414,16 @@ def test_attention_quant_pattern(
|
||||
)
|
||||
|
||||
# Check attn fusion support
|
||||
quant_key: QuantKey = model_class.quant_key
|
||||
quant_key = model_class.quant_key
|
||||
attn_fusion_supported = [
|
||||
layer.impl.fused_output_quant_supported(quant_key)
|
||||
for key, layer in vllm_config.compilation_config.static_forward_context.items()
|
||||
]
|
||||
assert sum(attn_fusion_supported) == len(attn_fusion_supported), (
|
||||
"All layers should support attention fusion"
|
||||
)
|
||||
|
||||
# Check quantization ops in the graph before and after fusion
|
||||
quant_op = (
|
||||
torch.ops.aten.reciprocal
|
||||
if "-quant_fp8" in custom_ops_list
|
||||
else QUANT_OPS[quant_key]
|
||||
)
|
||||
|
||||
# Note: for fp8, fully_replaced=False because query quant ops remain in graph.
|
||||
# Only output quant ops are fused into attention.
|
||||
test_backend.check_before_ops([quant_op], fully_replaced=quant_key is kNvfp4Quant)
|
||||
if any(attn_fusion_supported):
|
||||
# Check quantization ops in the graph before and after fusion
|
||||
# Note: fully_replaced=False because query quant ops remain in graph.
|
||||
# Only output quant ops are fused into attention.
|
||||
test_backend.check_before_ops([QUANT_OPS[quant_key]], fully_replaced=False)
|
||||
|
||||
# access the underlying `AttnFusionPass` on the `LazyInitPass`
|
||||
assert attn_pass.pass_.matched_count == sum(attn_fusion_supported)
|
||||
|
||||
@ -1,305 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
from collections.abc import Iterable
|
||||
from typing import Any, NamedTuple
|
||||
|
||||
import pytest
|
||||
import regex as re
|
||||
|
||||
from tests.v1.attention.utils import _Backend
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.flashinfer import has_flashinfer
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
from ..utils import flat_product, multi_gpu_test
|
||||
|
||||
|
||||
class ModelBackendTestCase(NamedTuple):
|
||||
model_name: str
|
||||
model_kwargs: dict[str, Any]
|
||||
backend: _Backend
|
||||
attention_fusions: int
|
||||
allreduce_fusions: int | None = None
|
||||
|
||||
|
||||
MODELS_FP8: list[ModelBackendTestCase] = []
|
||||
MODELS_FP4: list[ModelBackendTestCase] = []
|
||||
MODELS: list[ModelBackendTestCase] = [] # tp-only
|
||||
|
||||
if current_platform.is_cuda():
|
||||
MODELS_FP8 = [
|
||||
ModelBackendTestCase(
|
||||
# Use smaller model for L40s in CI
|
||||
model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
|
||||
model_kwargs=dict(max_model_len=1024),
|
||||
backend=_Backend.TRITON_ATTN,
|
||||
attention_fusions=32,
|
||||
allreduce_fusions=65,
|
||||
),
|
||||
ModelBackendTestCase(
|
||||
model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
|
||||
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
|
||||
backend=_Backend.FLASHINFER,
|
||||
attention_fusions=48,
|
||||
allreduce_fusions=96,
|
||||
),
|
||||
]
|
||||
|
||||
MODELS_FP4 = [
|
||||
ModelBackendTestCase(
|
||||
model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
|
||||
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
|
||||
backend=_Backend.FLASHINFER,
|
||||
attention_fusions=48,
|
||||
allreduce_fusions=96,
|
||||
),
|
||||
]
|
||||
|
||||
# TP only
|
||||
MODELS = [
|
||||
ModelBackendTestCase(
|
||||
model_name="meta-llama/Llama-3.1-8B-Instruct",
|
||||
model_kwargs=dict(max_model_len=1024),
|
||||
backend=_Backend.TRITON_ATTN,
|
||||
attention_fusions=0,
|
||||
allreduce_fusions=65,
|
||||
),
|
||||
]
|
||||
|
||||
elif current_platform.is_rocm():
|
||||
MODELS_FP8 = [
|
||||
ModelBackendTestCase(
|
||||
model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
|
||||
model_kwargs=dict(max_model_len=1024),
|
||||
backend=_Backend.TRITON_ATTN,
|
||||
attention_fusions=32,
|
||||
),
|
||||
ModelBackendTestCase(
|
||||
model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
|
||||
model_kwargs=dict(max_model_len=1024),
|
||||
backend=_Backend.ROCM_ATTN,
|
||||
attention_fusions=32,
|
||||
),
|
||||
ModelBackendTestCase(
|
||||
model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
|
||||
model_kwargs=dict(max_model_len=1024),
|
||||
backend=_Backend.ROCM_AITER_UNIFIED_ATTN,
|
||||
attention_fusions=32,
|
||||
),
|
||||
]
|
||||
|
||||
# TODO(luka) test both in nightly
|
||||
CUSTOM_OPS_FP8 = ["-quant_fp8"] # , "+quant_fp8"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, model_kwargs, backend, "
|
||||
"attention_fusions, allreduce_fusions, custom_ops",
|
||||
# Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8
|
||||
list(flat_product(MODELS_FP8, CUSTOM_OPS_FP8))
|
||||
# quant_fp4 only has the custom impl
|
||||
+ list(flat_product(MODELS_FP4, [""])),
|
||||
)
|
||||
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
|
||||
def test_attn_quant(
|
||||
model_name: str,
|
||||
model_kwargs: dict[str, Any],
|
||||
backend: _Backend,
|
||||
attention_fusions: int,
|
||||
allreduce_fusions: int,
|
||||
custom_ops: str,
|
||||
inductor_graph_partition: bool,
|
||||
caplog_mp_spawn,
|
||||
monkeypatch,
|
||||
):
|
||||
if backend == _Backend.FLASHINFER and (
|
||||
not current_platform.is_device_capability((10, 0)) or not has_flashinfer()
|
||||
):
|
||||
pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
|
||||
if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("Inductor graph partition requires torch>=2.9")
|
||||
|
||||
custom_ops_list = custom_ops.split(",") if custom_ops else []
|
||||
|
||||
if inductor_graph_partition:
|
||||
mode = CUDAGraphMode.FULL_AND_PIECEWISE
|
||||
splitting_ops: list[str] | None = None
|
||||
else:
|
||||
mode = CUDAGraphMode.FULL_DECODE_ONLY
|
||||
splitting_ops = []
|
||||
|
||||
# Disable, compile cache to make sure custom passes run.
|
||||
# Otherwise, we can't verify fusion happened through the logs.
|
||||
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
|
||||
|
||||
# To capture subprocess logs, we need to know whether spawn or fork is used.
|
||||
# Force spawn as it is more general.
|
||||
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
|
||||
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
|
||||
|
||||
compilation_config = CompilationConfig(
|
||||
# Testing properties
|
||||
custom_ops=custom_ops_list,
|
||||
use_inductor_graph_partition=inductor_graph_partition,
|
||||
cudagraph_mode=mode,
|
||||
splitting_ops=splitting_ops,
|
||||
# Common
|
||||
level=CompilationMode.VLLM_COMPILE,
|
||||
pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True),
|
||||
# Inductor caches custom passes by default as well via uuid
|
||||
inductor_compile_config={"force_disable_caches": True},
|
||||
)
|
||||
|
||||
with caplog_mp_spawn(logging.DEBUG) as log_holder:
|
||||
run_model(compilation_config, model_name, **model_kwargs)
|
||||
|
||||
matches = re.findall(
|
||||
r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
|
||||
log_holder.text,
|
||||
)
|
||||
assert len(matches) == 1, log_holder.text
|
||||
assert int(matches[0]) == attention_fusions
|
||||
|
||||
|
||||
# TODO(luka) test both in nightly
|
||||
CUSTOM_OPS_RMS_NORM = ["-rms_norm"] # , "+rms_norm"]
|
||||
|
||||
|
||||
def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
|
||||
for op_list in itertools.product(*custom_ops_lists):
|
||||
yield ",".join(op_list)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, model_kwargs, backend, "
|
||||
"attention_fusions, allreduce_fusions, custom_ops",
|
||||
# Toggle RMSNorm and QuantFP8 for FP8 models
|
||||
list(
|
||||
flat_product(
|
||||
MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)
|
||||
)
|
||||
)
|
||||
# Toggle RMSNorm for FP4 models and unquant models
|
||||
+ list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),
|
||||
)
|
||||
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda()
|
||||
or not has_flashinfer()
|
||||
or not current_platform.has_device_capability(90),
|
||||
reason="allreduce+rmsnorm fusion requires flashinfer",
|
||||
)
|
||||
def test_tp2_attn_quant_allreduce_rmsnorm(
|
||||
model_name: str,
|
||||
model_kwargs: dict,
|
||||
backend: _Backend,
|
||||
attention_fusions: int,
|
||||
allreduce_fusions: int,
|
||||
custom_ops: str,
|
||||
inductor_graph_partition: bool,
|
||||
caplog_mp_spawn,
|
||||
monkeypatch,
|
||||
):
|
||||
if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("Inductor graph partition requires torch>=2.9")
|
||||
|
||||
custom_ops_list = custom_ops.split(",") if custom_ops else []
|
||||
|
||||
if inductor_graph_partition:
|
||||
mode = CUDAGraphMode.FULL_AND_PIECEWISE
|
||||
splitting_ops: list[str] | None = None
|
||||
else:
|
||||
mode = CUDAGraphMode.FULL_DECODE_ONLY
|
||||
splitting_ops = []
|
||||
|
||||
# Disable, compile cache to make sure custom passes run.
|
||||
# Otherwise, we can't verify fusion happened through the logs.
|
||||
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
|
||||
|
||||
# To capture subprocess logs, we need to know whether spawn or fork is used.
|
||||
# Force spawn as it is more general.
|
||||
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
|
||||
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
|
||||
|
||||
compilation_config = CompilationConfig(
|
||||
# Testing properties
|
||||
use_inductor_graph_partition=inductor_graph_partition,
|
||||
cudagraph_mode=mode,
|
||||
custom_ops=custom_ops_list,
|
||||
splitting_ops=splitting_ops,
|
||||
# Common
|
||||
level=CompilationMode.VLLM_COMPILE,
|
||||
pass_config=PassConfig(
|
||||
enable_attn_fusion=True,
|
||||
enable_noop=True,
|
||||
enable_fi_allreduce_fusion=True,
|
||||
),
|
||||
# Inductor caches custom passes by default as well via uuid
|
||||
inductor_compile_config={"force_disable_caches": True},
|
||||
)
|
||||
|
||||
with caplog_mp_spawn(logging.DEBUG) as log_holder:
|
||||
run_model(
|
||||
compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
|
||||
)
|
||||
matches = re.findall(
|
||||
r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
|
||||
log_holder.text,
|
||||
)
|
||||
assert len(matches) == 2, log_holder.text
|
||||
|
||||
assert int(matches[0]) == attention_fusions
|
||||
assert int(matches[1]) == attention_fusions
|
||||
|
||||
matches = re.findall(
|
||||
r"collective_fusion.py:\d+] Replaced (\d+) patterns",
|
||||
log_holder.text,
|
||||
)
|
||||
assert len(matches) == 2, log_holder.text
|
||||
|
||||
assert int(matches[0]) == allreduce_fusions
|
||||
assert int(matches[1]) == allreduce_fusions
|
||||
|
||||
|
||||
def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
|
||||
compilation_config = (
|
||||
compile_config
|
||||
if isinstance(compile_config, CompilationConfig)
|
||||
else CompilationConfig(level=compile_config)
|
||||
)
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
sampling_params = SamplingParams(temperature=0)
|
||||
# Allow override from model_kwargs
|
||||
model_kwargs = {"tensor_parallel_size": 1, **model_kwargs}
|
||||
model_kwargs = {"disable_custom_all_reduce": True, **model_kwargs}
|
||||
|
||||
# No cudagraphs by default
|
||||
if compilation_config.cudagraph_mode is None:
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||
|
||||
llm = LLM(
|
||||
model=model,
|
||||
compilation_config=compilation_config,
|
||||
**model_kwargs,
|
||||
)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
# Print the outputs.
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
@ -7,7 +7,7 @@ import torch
|
||||
|
||||
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
|
||||
from vllm.compilation.pass_manager import PostGradPassManager
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
from vllm.config import VllmConfig
|
||||
|
||||
|
||||
# dummy custom pass that doesn't inherit
|
||||
@ -42,8 +42,7 @@ class ProperPass(InductorPass):
|
||||
],
|
||||
)
|
||||
def test_pass_manager_uuid(callable):
|
||||
# Some passes need dtype to be set
|
||||
config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
|
||||
config = VllmConfig()
|
||||
|
||||
pass_manager = PostGradPassManager()
|
||||
pass_manager.configure(config)
|
||||
|
||||
@ -18,8 +18,6 @@ from vllm.config import (
|
||||
ModelConfig,
|
||||
PassConfig,
|
||||
VllmConfig,
|
||||
get_current_vllm_config,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
from vllm.distributed import tensor_model_parallel_all_reduce
|
||||
from vllm.distributed.parallel_state import (
|
||||
@ -44,7 +42,9 @@ prompts = [
|
||||
|
||||
|
||||
class TestModel(torch.nn.Module):
|
||||
def __init__(self, hidden_size=16, intermediate_size=32):
|
||||
def __init__(
|
||||
self, hidden_size=16, intermediate_size=32, vllm_config: VllmConfig = None
|
||||
):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
@ -95,11 +95,13 @@ class TestModel(torch.nn.Module):
|
||||
|
||||
|
||||
class TestQuantModel(torch.nn.Module):
|
||||
def __init__(self, hidden_size=16, intermediate_size=32):
|
||||
def __init__(
|
||||
self, hidden_size=16, intermediate_size=32, vllm_config: VllmConfig = None
|
||||
):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.vllm_config = get_current_vllm_config()
|
||||
self.vllm_config = vllm_config
|
||||
self.gate_proj = torch.nn.Parameter(
|
||||
torch.empty((intermediate_size, hidden_size)), requires_grad=False
|
||||
)
|
||||
@ -264,84 +266,76 @@ def sequence_parallelism_pass_on_test_model(
|
||||
initialize_model_parallel(tensor_model_parallel_size=world_size)
|
||||
|
||||
# configure vllm config for SequenceParallelismPass
|
||||
compilation_config = CompilationConfig(
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.compilation_config = CompilationConfig(
|
||||
pass_config=PassConfig(
|
||||
enable_sequence_parallelism=True,
|
||||
enable_fusion=enable_fusion,
|
||||
enable_noop=True,
|
||||
)
|
||||
) # NoOp needed for fusion
|
||||
device_config = DeviceConfig(device=torch.device("cuda"))
|
||||
vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))
|
||||
|
||||
# this is a fake model name to construct the model config
|
||||
# in the vllm_config, it's not really used.
|
||||
model_name = "RedHatAI/Llama-3.2-1B-Instruct-FP8"
|
||||
model_config = ModelConfig(
|
||||
vllm_config.model_config = ModelConfig(
|
||||
model=model_name, trust_remote_code=True, dtype=dtype, seed=42
|
||||
)
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
device_config=device_config,
|
||||
compilation_config=compilation_config,
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
|
||||
assert (
|
||||
sequence_parallelism_pass.compilation_config.splitting_ops
|
||||
== vllm_config.compilation_config.splitting_ops
|
||||
)
|
||||
assert (
|
||||
sequence_parallelism_pass.compilation_config.use_inductor_graph_partition
|
||||
== vllm_config.compilation_config.use_inductor_graph_partition
|
||||
)
|
||||
func_pass = FixFunctionalizationPass(vllm_config)
|
||||
cleanup_pass = PostCleanupPass(vllm_config)
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
|
||||
func_pass = FixFunctionalizationPass(vllm_config)
|
||||
cleanup_pass = PostCleanupPass(vllm_config)
|
||||
assert (
|
||||
sequence_parallelism_pass.compilation_config.splitting_ops
|
||||
== vllm_config.compilation_config.splitting_ops
|
||||
)
|
||||
assert (
|
||||
sequence_parallelism_pass.compilation_config.use_inductor_graph_partition
|
||||
== vllm_config.compilation_config.use_inductor_graph_partition
|
||||
)
|
||||
passes_for_backend: list[VllmInductorPass] = [
|
||||
noop_pass,
|
||||
sequence_parallelism_pass,
|
||||
]
|
||||
passes_for_backend: list[VllmInductorPass] = [noop_pass, sequence_parallelism_pass]
|
||||
|
||||
if enable_fusion:
|
||||
fusion_pass = RMSNormQuantFusionPass(vllm_config)
|
||||
passes_for_backend.append(fusion_pass)
|
||||
if enable_fusion:
|
||||
fusion_pass = RMSNormQuantFusionPass(vllm_config)
|
||||
passes_for_backend.append(fusion_pass)
|
||||
|
||||
passes_for_backend.append(cleanup_pass)
|
||||
passes_for_backend.append(cleanup_pass)
|
||||
|
||||
backend_no_func = TestBackend(*passes_for_backend)
|
||||
backend_func = TestBackend(*passes_for_backend, func_pass)
|
||||
backend_no_func = TestBackend(*passes_for_backend)
|
||||
backend_func = TestBackend(*passes_for_backend, func_pass)
|
||||
|
||||
model = test_model_cls(hidden_size, hidden_size * 2)
|
||||
model = test_model_cls(hidden_size, hidden_size * 2, vllm_config=vllm_config)
|
||||
|
||||
hidden_states = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
|
||||
residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
|
||||
hidden_states = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
|
||||
residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
|
||||
|
||||
compiled_model_no_func = torch.compile(model, backend=backend_no_func)
|
||||
compiled_model_no_func(hidden_states, residual)
|
||||
compiled_model_func = torch.compile(model, backend=backend_func)
|
||||
compiled_model_func(hidden_states, residual)
|
||||
compiled_model_no_func = torch.compile(model, backend=backend_no_func)
|
||||
compiled_model_no_func(hidden_states, residual)
|
||||
compiled_model_func = torch.compile(model, backend=backend_func)
|
||||
compiled_model_func(hidden_states, residual)
|
||||
|
||||
assert sequence_parallelism_pass.matched_count == 1
|
||||
assert sequence_parallelism_pass.matched_count == 1
|
||||
|
||||
# In pre-nodes, all reduce should be there,
|
||||
# reduce scatter and all gather should not
|
||||
backend_no_func.check_before_ops(model.ops_in_model_before())
|
||||
# In pre-nodes, all reduce should be there,
|
||||
# reduce scatter and all gather should not
|
||||
backend_no_func.check_before_ops(model.ops_in_model_before())
|
||||
|
||||
# In post-nodes, reduce scatter and all gather should be there,
|
||||
# all reduce should not
|
||||
backend_no_func.check_after_ops(model.ops_in_model_after())
|
||||
# In post-nodes, reduce scatter and all gather should be there,
|
||||
# all reduce should not
|
||||
backend_no_func.check_after_ops(model.ops_in_model_after())
|
||||
|
||||
# check if the functionalization pass is applied
|
||||
# check if the functionalization pass is applied
|
||||
for op in model.ops_in_model():
|
||||
find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
|
||||
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None
|
||||
|
||||
# make sure the ops were all de-functionalized
|
||||
found = dict()
|
||||
for node in backend_func.graph_post_pass.nodes:
|
||||
for op in model.ops_in_model():
|
||||
find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
|
||||
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None
|
||||
|
||||
# make sure the ops were all de-functionalized
|
||||
found = dict()
|
||||
for node in backend_func.graph_post_pass.nodes:
|
||||
for op in model.ops_in_model():
|
||||
if is_func(node, op):
|
||||
found[op] = True
|
||||
assert all(found[op] for op in model.ops_in_model())
|
||||
if is_func(node, op):
|
||||
found[op] = True
|
||||
assert all(found[op] for op in model.ops_in_model())
|
||||
|
||||
@ -1,13 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import contextlib
|
||||
import pathlib
|
||||
from copy import deepcopy
|
||||
|
||||
from tblib import pickling_support
|
||||
|
||||
# ruff: noqa
|
||||
|
||||
from tblib import pickling_support
|
||||
|
||||
# Install support for pickling exceptions so that we can nicely propagate
|
||||
# failures from tests running in a subprocess.
|
||||
# This should be run before any custom exception subclasses are defined.
|
||||
@ -43,7 +40,7 @@ from transformers import (
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
|
||||
from tests.models.utils import TokensTextLogprobs, TokensTextLogprobsPromptLogprobs
|
||||
from vllm import LLM, SamplingParams, envs
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
@ -60,8 +57,8 @@ from vllm.multimodal.utils import fetch_image
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import BeamSearchParams
|
||||
from vllm.transformers_utils.utils import maybe_model_redirect
|
||||
from vllm.utils import set_default_torch_num_threads
|
||||
from vllm.utils.collections import is_list_of
|
||||
from vllm.utils.torch_utils import set_default_torch_num_threads
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -1073,101 +1070,6 @@ def caplog_vllm(temporary_enable_log_propagate, caplog):
|
||||
yield caplog
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def caplog_mp_fork():
|
||||
"""
|
||||
This fixture enables capturing logs from a forked MP subprocess.
|
||||
It should be used in conjunction with caplog_vllm.
|
||||
|
||||
By default, subprocess logs do not go through the parent process.
|
||||
We instead create a queue listener in the parent process which
|
||||
forwards logs to the logger's other handlers, and add a QueueHandler
|
||||
to the root logger. Forked subprocesses will inherit the root logger
|
||||
and pass their messages to the queue, which the listener will forward
|
||||
to the root logger, which can be captured by caplog.
|
||||
|
||||
Note that this workaround only works for fork; with spawn, the subprocess
|
||||
reinitializes logging and does not automatically inherit the queue.
|
||||
We'd have to manually pass the queue to the subprocess at the spawn point.
|
||||
See caplog_mp_spawn below.
|
||||
"""
|
||||
|
||||
@contextlib.contextmanager
|
||||
def ctx():
|
||||
import logging.handlers
|
||||
import multiprocessing as mp
|
||||
|
||||
logger_queue: mp.Queue[logging.LogRecord] = mp.Queue()
|
||||
logger = logging.getLogger()
|
||||
handlers = logger.handlers
|
||||
|
||||
# The listener works on a background thread, not inherited by the child.
|
||||
queue_listener = logging.handlers.QueueListener(logger_queue, *handlers)
|
||||
queue_listener.start()
|
||||
|
||||
# Add queue handler after creating the listener to avoid cycle
|
||||
logger.addHandler(logging.handlers.QueueHandler(logger_queue))
|
||||
yield
|
||||
queue_listener.stop()
|
||||
|
||||
return ctx
|
||||
|
||||
|
||||
class LogHolder:
|
||||
def __init__(self):
|
||||
self.text = None
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def caplog_mp_spawn(tmp_path, monkeypatch):
|
||||
"""
|
||||
This fixture enables capturing logs from a forked MP subprocess.
|
||||
It does not require caplog_vllm (but it only contains logs from the child).
|
||||
|
||||
By default, subprocess logs do not go through the parent process.
|
||||
We instead add a FileHandler to the config so the spawned child process
|
||||
writes its logs to a temp file.
|
||||
In the parent, we read the file and return the contents.
|
||||
|
||||
Note: this method could be extended to fork by either reconfiguring logging
|
||||
in the parent or using a SocketHandler:
|
||||
https://docs.python.org/3/howto/logging-cookbook.html#sending-and-receiving-logging-events-across-a-network # noqa: E501
|
||||
"""
|
||||
|
||||
@contextlib.contextmanager
|
||||
def ctx(level: int | str):
|
||||
from vllm.logger import DEFAULT_LOGGING_CONFIG
|
||||
|
||||
config_path = tmp_path / "vllm_logging_config.json"
|
||||
log_path = tmp_path / "vllm.log"
|
||||
log_holder = LogHolder()
|
||||
|
||||
config = deepcopy(DEFAULT_LOGGING_CONFIG)
|
||||
if envs.VLLM_LOGGING_CONFIG_PATH:
|
||||
path = pathlib.Path(envs.VLLM_LOGGING_CONFIG_PATH)
|
||||
assert path.exists()
|
||||
config = json.loads(path.read_text())
|
||||
|
||||
config["loggers"]["vllm"]["handlers"] += ["vllm_file"]
|
||||
config["handlers"]["vllm_file"] = {
|
||||
"class": "logging.FileHandler",
|
||||
"formatter": "vllm",
|
||||
"level": level,
|
||||
"filename": log_path.as_posix(),
|
||||
}
|
||||
|
||||
config_path.write_text(json.dumps(config))
|
||||
|
||||
with monkeypatch.context() as monkeypatch_ctx:
|
||||
monkeypatch_ctx.setenv("VLLM_LOGGING_CONFIG_PATH", config_path.as_posix())
|
||||
monkeypatch_ctx.setenv("VLLM_CONFIGURE_LOGGING", "1")
|
||||
yield log_holder
|
||||
|
||||
log_holder.text = log_path.read_text()
|
||||
|
||||
return ctx
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def num_gpus_available():
|
||||
"""Get number of GPUs without initializing the CUDA context
|
||||
|
||||
@ -18,7 +18,7 @@ import pytest
|
||||
from vllm.config.compilation import CompilationMode
|
||||
from vllm.config.model import RunnerOption
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
|
||||
from ..models.registry import HF_EXAMPLE_MODELS
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
|
||||
@ -11,10 +11,10 @@ import vllm.envs as envs
|
||||
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
|
||||
from vllm.distributed.utils import StatelessProcessGroup
|
||||
from vllm.utils import (
|
||||
cuda_device_count_stateless,
|
||||
get_open_port,
|
||||
update_environment_variables,
|
||||
)
|
||||
from vllm.utils.torch_utils import cuda_device_count_stateless
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
|
||||
|
||||
@ -3,15 +3,12 @@
|
||||
|
||||
import asyncio
|
||||
from http import HTTPStatus
|
||||
from unittest.mock import AsyncMock, Mock
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import requests
|
||||
from fastapi import Request
|
||||
|
||||
from vllm.v1.engine.exceptions import EngineDeadError
|
||||
from vllm.version import __version__ as VLLM_VERSION
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
@ -227,24 +224,3 @@ async def test_server_load(server: RemoteOpenAIServer):
|
||||
response = requests.get(server.url_for("load"))
|
||||
assert response.status_code == HTTPStatus.OK
|
||||
assert response.json().get("server_load") == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_health_check_engine_dead_error():
|
||||
# Import the health function directly to test it in isolation
|
||||
from vllm.entrypoints.openai.api_server import health
|
||||
|
||||
# Create a mock request that simulates what FastAPI would provide
|
||||
mock_request = Mock(spec=Request)
|
||||
mock_app_state = Mock()
|
||||
mock_engine_client = AsyncMock()
|
||||
mock_engine_client.check_health.side_effect = EngineDeadError()
|
||||
mock_app_state.engine_client = mock_engine_client
|
||||
mock_request.app.state = mock_app_state
|
||||
|
||||
# Test the health function directly with our mocked request
|
||||
# This simulates what would happen if the engine dies
|
||||
response = await health(mock_request)
|
||||
|
||||
# Assert that it returns 503 Service Unavailable
|
||||
assert response.status_code == 503
|
||||
|
||||
@ -1,280 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""Integration tests for GPT-OSS structural tags functionality (PR #25515)."""
|
||||
|
||||
import json
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
StructuredOutputsParams,
|
||||
)
|
||||
from vllm.entrypoints.tool_server import ToolServer
|
||||
from vllm.reasoning.gptoss_reasoning_parser import (
|
||||
GptOssReasoningParser,
|
||||
)
|
||||
|
||||
|
||||
class TestGptOssStructuralTagsIntegration:
|
||||
"""Integration tests for structural tags in GPT-OSS tool calls."""
|
||||
|
||||
@pytest.fixture
|
||||
def mock_tokenizer(self):
|
||||
"""Create a mock tokenizer."""
|
||||
tokenizer = Mock()
|
||||
tokenizer.encode = Mock(return_value=[1, 2, 3, 4, 5])
|
||||
return tokenizer
|
||||
|
||||
@pytest.fixture
|
||||
def gptoss_parser(self, mock_tokenizer):
|
||||
"""Create a real GptOssReasoningParser instance."""
|
||||
return GptOssReasoningParser(mock_tokenizer)
|
||||
|
||||
@pytest.fixture
|
||||
def tool_server_with_python(self):
|
||||
"""Create a tool server with Python tool enabled."""
|
||||
tool_server = Mock(spec=ToolServer)
|
||||
tool_server.has_tool = Mock(side_effect=lambda tool: tool == "python")
|
||||
return tool_server
|
||||
|
||||
@pytest.fixture
|
||||
def tool_server_empty(self):
|
||||
"""Create a tool server with no tools."""
|
||||
tool_server = Mock(spec=ToolServer)
|
||||
tool_server.has_tool = Mock(return_value=False)
|
||||
return tool_server
|
||||
|
||||
def test_end_to_end_no_tools(self, gptoss_parser):
|
||||
"""Test end-to-end flow when no tools are available."""
|
||||
# Test the parser directly
|
||||
result = gptoss_parser.prepare_structured_tag(None, None)
|
||||
parsed_result = json.loads(result)
|
||||
|
||||
# Verify basic structure
|
||||
assert parsed_result["type"] == "structural_tag"
|
||||
assert parsed_result["format"]["type"] == "triggered_tags"
|
||||
assert len(parsed_result["format"]["tags"]) == 1
|
||||
|
||||
# Verify only analysis channel is allowed
|
||||
analysis_tag = parsed_result["format"]["tags"][0]
|
||||
assert analysis_tag["begin"] == "<|channel|>analysis<|message|>"
|
||||
assert analysis_tag["content"]["type"] == "any_text"
|
||||
assert analysis_tag["end"] == "<|end|>"
|
||||
|
||||
# Verify triggers
|
||||
assert parsed_result["format"]["triggers"] == ["<|channel|>analysis"]
|
||||
assert parsed_result["format"]["stop_after_first"] is False
|
||||
|
||||
def test_end_to_end_with_python_tool(self, gptoss_parser, tool_server_with_python):
|
||||
"""Test end-to-end flow with Python tool enabled."""
|
||||
result = gptoss_parser.prepare_structured_tag(None, tool_server_with_python)
|
||||
parsed_result = json.loads(result)
|
||||
|
||||
# Should have analysis tag + 2 python tags
|
||||
assert len(parsed_result["format"]["tags"]) == 3
|
||||
|
||||
# Verify all expected tags are present
|
||||
tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]]
|
||||
expected_begins = [
|
||||
"<|channel|>analysis<|message|>",
|
||||
"<|channel|>commentary to=python",
|
||||
"<|channel|>analysis to=python",
|
||||
]
|
||||
|
||||
for expected in expected_begins:
|
||||
assert expected in tag_begins
|
||||
|
||||
# Verify triggers include commentary
|
||||
assert "<|channel|>analysis" in parsed_result["format"]["triggers"]
|
||||
assert "<|channel|>commentary to=" in parsed_result["format"]["triggers"]
|
||||
|
||||
def test_structured_outputs_params_integration(
|
||||
self, gptoss_parser, tool_server_with_python
|
||||
):
|
||||
"""Test integration with StructuredOutputsParams."""
|
||||
# Generate structural tag
|
||||
structural_tag = gptoss_parser.prepare_structured_tag(
|
||||
None, tool_server_with_python
|
||||
)
|
||||
|
||||
# Create StructuredOutputsParams
|
||||
params = StructuredOutputsParams(structural_tag=structural_tag)
|
||||
|
||||
# Verify the tag is properly stored and accessible
|
||||
assert params.structural_tag == structural_tag
|
||||
|
||||
# Verify the tag is valid JSON
|
||||
parsed_tag = json.loads(params.structural_tag)
|
||||
assert parsed_tag["type"] == "structural_tag"
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"browser, python, container, expected_tags",
|
||||
[
|
||||
# No tools
|
||||
(False, False, False, 1),
|
||||
# Single tool
|
||||
(True, False, False, 3),
|
||||
# Multiple tools
|
||||
(True, True, False, 5),
|
||||
# All tools
|
||||
(True, True, True, 7),
|
||||
],
|
||||
)
|
||||
def test_tool_server_interaction_flow(
|
||||
self, gptoss_parser, browser, python, container, expected_tags
|
||||
):
|
||||
"""Test the complete tool server interaction flow."""
|
||||
|
||||
# Create a mock ToolServer
|
||||
tool_server = Mock(spec=ToolServer)
|
||||
|
||||
# Simulate tool availability based on parameters
|
||||
tool_server.has_tool = Mock(
|
||||
side_effect=lambda tool: {
|
||||
"browser": browser,
|
||||
"python": python,
|
||||
"container": container,
|
||||
}.get(tool, False)
|
||||
)
|
||||
|
||||
# Run the parser and verify results
|
||||
result = gptoss_parser.prepare_structured_tag(None, tool_server)
|
||||
parsed_result = json.loads(result)
|
||||
|
||||
# Validate number of tags
|
||||
assert len(parsed_result["format"]["tags"]) == expected_tags
|
||||
|
||||
# Verify tool-specific tags exist for enabled tools
|
||||
tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]]
|
||||
for tool, enabled in {
|
||||
"browser": browser,
|
||||
"python": python,
|
||||
"container": container,
|
||||
}.items():
|
||||
if enabled:
|
||||
assert f"<|channel|>commentary to={tool}" in tag_begins
|
||||
assert f"<|channel|>analysis to={tool}" in tag_begins
|
||||
|
||||
def test_original_tag_preservation(self, gptoss_parser, tool_server_with_python):
|
||||
"""Test that original tags are preserved when provided."""
|
||||
original_tag = '{"type": "custom_tag", "data": "preserved"}'
|
||||
|
||||
result = gptoss_parser.prepare_structured_tag(
|
||||
original_tag, tool_server_with_python
|
||||
)
|
||||
|
||||
# Should return original tag unchanged
|
||||
assert result == original_tag
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"tools",
|
||||
[
|
||||
[],
|
||||
["browser"],
|
||||
["python"],
|
||||
["container"],
|
||||
["browser", "python"],
|
||||
["browser", "container"],
|
||||
["python", "container"],
|
||||
["browser", "python", "container"],
|
||||
],
|
||||
)
|
||||
def test_json_validity_comprehensive(self, gptoss_parser, tools):
|
||||
"""Test JSON validity across all possible tool combinations."""
|
||||
|
||||
tool_server = Mock(spec=ToolServer)
|
||||
tool_server.has_tool = Mock(side_effect=lambda tool: tool in tools)
|
||||
|
||||
result = gptoss_parser.prepare_structured_tag(None, tool_server)
|
||||
|
||||
# Should be valid JSON
|
||||
parsed_result = json.loads(result)
|
||||
|
||||
# Should have correct structure
|
||||
assert parsed_result["type"] == "structural_tag"
|
||||
assert "format" in parsed_result
|
||||
assert "tags" in parsed_result["format"]
|
||||
assert "triggers" in parsed_result["format"]
|
||||
|
||||
# Tag count should be: 1 (analysis) + 2 * len(tools)
|
||||
expected_tag_count = 1 + (2 * len(tools))
|
||||
assert len(parsed_result["format"]["tags"]) == expected_tag_count
|
||||
|
||||
def test_error_handling_invalid_tool_server(self, gptoss_parser):
|
||||
"""Test error handling with invalid tool server."""
|
||||
# Tool server that raises exceptions
|
||||
tool_server = Mock(spec=ToolServer)
|
||||
tool_server.has_tool = Mock(side_effect=Exception("Tool server error"))
|
||||
|
||||
# Should handle gracefully and still return a valid tag
|
||||
with pytest.raises(Exception, match="Tool server error"):
|
||||
gptoss_parser.prepare_structured_tag(None, tool_server)
|
||||
|
||||
def test_concurrent_requests_isolation(self, gptoss_parser):
|
||||
"""Test that concurrent requests don't interfere with each other."""
|
||||
# Simulate concurrent requests with different tool servers
|
||||
tool_server_1 = Mock(spec=ToolServer)
|
||||
tool_server_1.has_tool = Mock(side_effect=lambda tool: tool == "python")
|
||||
|
||||
tool_server_2 = Mock(spec=ToolServer)
|
||||
tool_server_2.has_tool = Mock(side_effect=lambda tool: tool == "browser")
|
||||
|
||||
# Generate tags concurrently
|
||||
result_1 = gptoss_parser.prepare_structured_tag(None, tool_server_1)
|
||||
result_2 = gptoss_parser.prepare_structured_tag(None, tool_server_2)
|
||||
|
||||
# Parse results
|
||||
parsed_1 = json.loads(result_1)
|
||||
parsed_2 = json.loads(result_2)
|
||||
|
||||
# Verify they have different tool configurations
|
||||
tags_1 = [tag["begin"] for tag in parsed_1["format"]["tags"]]
|
||||
tags_2 = [tag["begin"] for tag in parsed_2["format"]["tags"]]
|
||||
|
||||
# Result 1 should have python tags
|
||||
assert "<|channel|>commentary to=python" in tags_1
|
||||
assert "<|channel|>commentary to=browser" not in tags_1
|
||||
|
||||
# Result 2 should have browser tags
|
||||
assert "<|channel|>commentary to=browser" in tags_2
|
||||
assert "<|channel|>commentary to=python" not in tags_2
|
||||
|
||||
def test_tag_format_consistency(self, gptoss_parser):
|
||||
"""Test that all generated tags follow consistent format."""
|
||||
tool_server = Mock(spec=ToolServer)
|
||||
tool_server.has_tool = Mock(
|
||||
side_effect=lambda tool: tool in ["python", "browser"]
|
||||
)
|
||||
|
||||
result = gptoss_parser.prepare_structured_tag(None, tool_server)
|
||||
parsed_result = json.loads(result)
|
||||
|
||||
# Verify all tags have required fields
|
||||
for tag in parsed_result["format"]["tags"]:
|
||||
assert "begin" in tag
|
||||
assert "content" in tag
|
||||
assert "end" in tag
|
||||
assert tag["content"]["type"] == "any_text"
|
||||
assert tag["end"] == "<|end|>"
|
||||
|
||||
# Verify begin format
|
||||
assert tag["begin"].startswith("<|channel|>")
|
||||
|
||||
def test_trigger_configuration(self, gptoss_parser):
|
||||
"""Test trigger configuration for different tool setups."""
|
||||
# Test with no tools
|
||||
result_no_tools = gptoss_parser.prepare_structured_tag(None, None)
|
||||
parsed_no_tools = json.loads(result_no_tools)
|
||||
assert parsed_no_tools["format"]["triggers"] == ["<|channel|>analysis"]
|
||||
|
||||
# Test with tools
|
||||
tool_server = Mock(spec=ToolServer)
|
||||
tool_server.has_tool = Mock(side_effect=lambda tool: tool == "python")
|
||||
|
||||
result_with_tools = gptoss_parser.prepare_structured_tag(None, tool_server)
|
||||
parsed_with_tools = json.loads(result_with_tools)
|
||||
|
||||
expected_triggers = ["<|channel|>analysis", "<|channel|>commentary to="]
|
||||
assert set(parsed_with_tools["format"]["triggers"]) == set(expected_triggers)
|
||||
@ -3,10 +3,7 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.utils.torch_utils import (
|
||||
create_kv_caches_with_random,
|
||||
create_kv_caches_with_random_flash,
|
||||
)
|
||||
from vllm.utils import create_kv_caches_with_random, create_kv_caches_with_random_flash
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
|
||||
@ -11,7 +11,7 @@ from tests.kernels.utils import opcheck
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.attention.layer import Attention, MultiHeadAttention
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.mem_utils import get_max_shared_memory_bytes
|
||||
from vllm.utils import get_max_shared_memory_bytes
|
||||
|
||||
if not current_platform.is_rocm():
|
||||
from xformers import ops as xops
|
||||
|
||||
@ -15,7 +15,7 @@ from tests.kernels.utils import make_alibi_bias
|
||||
from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode
|
||||
from vllm.attention.ops.prefix_prefill import context_attention_fwd
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
NUM_HEADS = [64]
|
||||
NUM_QUERIES_PER_KV = [1, 64]
|
||||
|
||||
@ -6,7 +6,7 @@ import torch
|
||||
|
||||
from tests.kernels.quant_utils import FP8_DTYPE
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.layernorm import PolyNorm, RMSNorm
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
||||
@ -70,6 +70,38 @@ def test_rms_norm(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@torch.inference_mode()
|
||||
def test_poly_norm(
|
||||
num_tokens: int,
|
||||
hidden_size: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
layer = PolyNorm().to(dtype=dtype)
|
||||
layer.weight.data.normal_(mean=1.0, std=0.1)
|
||||
layer.bias.data.normal_(mean=1.0, std=0.1)
|
||||
scale = 1 / (2 * hidden_size)
|
||||
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||
x *= scale
|
||||
|
||||
ref_out = layer.forward_native(x)
|
||||
out = layer(x)
|
||||
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
|
||||
|
||||
opcheck(
|
||||
torch.ops._C.poly_norm,
|
||||
(out, x, layer.weight.data, layer.bias.data, layer.variance_epsilon),
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||
@pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
|
||||
|
||||
@ -3,8 +3,7 @@
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.utils import is_uva_available
|
||||
from vllm.utils.torch_utils import get_cuda_view_from_cpu_tensor
|
||||
from vllm.utils import get_cuda_view_from_cpu_tensor, is_uva_available
|
||||
|
||||
CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
|
||||
|
||||
|
||||
@ -13,9 +13,8 @@ import torch
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
|
||||
from vllm.utils import cuda_device_count_stateless, has_deep_ep, has_deep_gemm, has_pplx
|
||||
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
|
||||
from vllm.utils.torch_utils import cuda_device_count_stateless
|
||||
|
||||
from .modular_kernel_tools.common import (
|
||||
Config,
|
||||
|
||||
@ -103,7 +103,7 @@ def ref_dynamic_per_tensor_fp8_quant(
|
||||
.clamp(fp8_traits_min, fp8_traits_max)
|
||||
.to(FP8_DTYPE)
|
||||
)
|
||||
return ref_out, ref_scale.view((1, 1))
|
||||
return ref_out, ref_scale.view((1,))
|
||||
|
||||
|
||||
def native_w8a8_block_matmul(
|
||||
|
||||
@ -22,8 +22,8 @@ from vllm.utils import (
|
||||
STR_BACKEND_ENV_VAR,
|
||||
STR_FLASH_ATTN_VAL,
|
||||
STR_XFORMERS_ATTN_VAL,
|
||||
make_tensor_with_pad,
|
||||
)
|
||||
from vllm.utils.torch_utils import make_tensor_with_pad
|
||||
|
||||
# For now, disable "test_aot_dispatch_dynamic" since there are some
|
||||
# bugs related to this test in PyTorch 2.4.
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
MODELS = ["google/gemma-2b", "google/gemma-2-2b"]
|
||||
MODELS = ["google/gemma-2b", "google/gemma-2-2b", "google/gemma-3-4b-it"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@ -14,8 +14,14 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
|
||||
model,
|
||||
load_format="dummy",
|
||||
) as llm:
|
||||
normalizers = llm.apply_model(
|
||||
lambda model: model.model.normalizer.cpu().item()
|
||||
)
|
||||
config = llm.llm.llm_engine.model_config.hf_config
|
||||
if model == "google/gemma-3-4b-it":
|
||||
normalizers = llm.llm.collective_rpc(
|
||||
lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item() # noqa: E501
|
||||
)
|
||||
config = llm.llm.llm_engine.model_config.hf_config.text_config
|
||||
else:
|
||||
normalizers = llm.llm.collective_rpc(
|
||||
lambda self: self.model_runner.model.model.normalizer.cpu().item()
|
||||
)
|
||||
config = llm.llm.llm_engine.model_config.hf_config
|
||||
assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3)
|
||||
|
||||
@ -113,6 +113,25 @@ VLM_TEST_SETTINGS = {
|
||||
dtype="bfloat16" if current_platform.is_cpu() else "auto",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
"paligemma": VLMTestInfo(
|
||||
models=["google/paligemma-3b-mix-224"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=identity,
|
||||
img_idx_to_prompt=lambda idx: "",
|
||||
# Paligemma uses its own sample prompts because the default one fails
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "caption es",
|
||||
"cherry_blossom": "What is in the picture?",
|
||||
}
|
||||
),
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
|
||||
dtype="bfloat16",
|
||||
marks=[
|
||||
pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
|
||||
],
|
||||
),
|
||||
"qwen2_5_vl": VLMTestInfo(
|
||||
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
||||
@ -177,24 +196,14 @@ VLM_TEST_SETTINGS = {
|
||||
# Gemma3 has bidirectional mask on images
|
||||
"gemma3-transformers": VLMTestInfo(
|
||||
models=["google/gemma-3-4b-it"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<start_of_image>What is the season?",
|
||||
}
|
||||
),
|
||||
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
|
||||
max_model_len=8192,
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=lambda vid_prompt: f"<'<bos><start_of_turn>user\n{vid_prompt}<start_of_image><end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
||||
max_model_len=4096,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
# TODO: Support `do_pan_and_scan` in transformers backend
|
||||
# patch_hf_runner=model_utils.gemma3_patch_hf_runner,
|
||||
vllm_output_post_proc=model_utils.gemma3_vllm_to_hf_output,
|
||||
image_size_factors=[(0.25, 0.5, 1.0)],
|
||||
vllm_runner_kwargs={
|
||||
"model_impl": "transformers",
|
||||
# "mm_processor_kwargs": {"do_pan_and_scan": True},
|
||||
},
|
||||
marks=[pytest.mark.core_model],
|
||||
),
|
||||
@ -213,27 +222,6 @@ VLM_TEST_SETTINGS = {
|
||||
},
|
||||
marks=[pytest.mark.core_model],
|
||||
),
|
||||
# PaliGemma has PrefixLM attention
|
||||
"paligemma-transformers": VLMTestInfo(
|
||||
models=["google/paligemma-3b-mix-224"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=identity,
|
||||
img_idx_to_prompt=lambda idx: "",
|
||||
# PaliGemma uses its own sample prompts because the default one fails
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "caption es",
|
||||
"cherry_blossom": "What is in the picture?",
|
||||
}
|
||||
),
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
|
||||
image_size_factors=[(0.25, 0.5, 1.0)],
|
||||
vllm_runner_kwargs={
|
||||
"model_impl": "transformers",
|
||||
},
|
||||
marks=[pytest.mark.core_model],
|
||||
),
|
||||
# Pixel values from processor are not 4D or 5D arrays
|
||||
"qwen2_5_vl-transformers": VLMTestInfo(
|
||||
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
|
||||
@ -360,6 +348,24 @@ VLM_TEST_SETTINGS = {
|
||||
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"gemma3": VLMTestInfo(
|
||||
models=["google/gemma-3-4b-it"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<start_of_image>What is the season?",
|
||||
}
|
||||
),
|
||||
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
|
||||
patch_hf_runner=model_utils.gemma3_patch_hf_runner,
|
||||
num_logprobs=10,
|
||||
),
|
||||
"glm4v": VLMTestInfo(
|
||||
models=["zai-org/glm-4v-9b"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
|
||||
@ -328,6 +328,16 @@ def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
|
||||
hf_model.processor = processor
|
||||
|
||||
orig_generate = hf_model.model.generate
|
||||
|
||||
def _generate(self, *args, **kwargs):
|
||||
# FIXME: https://github.com/huggingface/transformers/issues/38333
|
||||
kwargs["disable_compile"] = True
|
||||
|
||||
return orig_generate(*args, **kwargs)
|
||||
|
||||
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
|
||||
|
||||
return hf_model
|
||||
|
||||
|
||||
|
||||
@ -7,7 +7,7 @@ from huggingface_hub import snapshot_download
|
||||
from transformers import AutoConfig, AutoModel, CLIPImageProcessor
|
||||
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
from ....conftest import ImageTestAssets
|
||||
|
||||
|
||||
@ -9,7 +9,7 @@ from transformers import AutoConfig, AutoModel, CLIPImageProcessor
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.model_executor.models.radio import RadioModel
|
||||
from vllm.transformers_utils.configs.radio import RadioConfig
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
from ....conftest import ImageTestAssets
|
||||
|
||||
|
||||
@ -222,6 +222,7 @@ def _test_processing_correctness(
|
||||
_ADD_SPECIAL_TOKENS_OVERRIDES = {
|
||||
"ovis": False,
|
||||
"ovis2_5": False,
|
||||
"paligemma": False,
|
||||
"ultravox": False,
|
||||
"whisper": False,
|
||||
}
|
||||
@ -332,6 +333,7 @@ def _test_processing_correctness_one(
|
||||
"deepseek-ai/deepseek-vl2-tiny",
|
||||
"baidu/ERNIE-4.5-VL-28B-A3B-PT",
|
||||
"adept/fuyu-8b",
|
||||
"google/gemma-3-4b-it",
|
||||
"google/gemma-3n-E2B-it",
|
||||
"zai-org/glm-4v-9b",
|
||||
"zai-org/GLM-4.1V-9B-Thinking",
|
||||
@ -368,6 +370,8 @@ def _test_processing_correctness_one(
|
||||
"AIDC-AI/Ovis1.6-Llama3.2-3B",
|
||||
"AIDC-AI/Ovis2-1B",
|
||||
"AIDC-AI/Ovis2.5-2B",
|
||||
"google/paligemma-3b-mix-224",
|
||||
"google/paligemma2-3b-ft-docci-448",
|
||||
"microsoft/Phi-3.5-vision-instruct",
|
||||
"microsoft/Phi-4-multimodal-instruct",
|
||||
"mistralai/Pixtral-12B-2409",
|
||||
|
||||
@ -26,6 +26,7 @@ from vllm.distributed import (
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel,
|
||||
)
|
||||
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
|
||||
from vllm.model_executor.models.interfaces import (
|
||||
SupportsMultiModal,
|
||||
supports_multimodal,
|
||||
@ -35,7 +36,6 @@ from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingC
|
||||
from vllm.multimodal.utils import group_mm_kwargs_by_modality
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
from vllm.utils.collections import is_list_of
|
||||
from vllm.utils.torch_utils import set_default_torch_dtype
|
||||
|
||||
from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
|
||||
from ...utils import dummy_hf_overrides
|
||||
@ -48,6 +48,7 @@ ARCH_NEEDS_EXTRAS = [
|
||||
"Idefics3ForConditionalGeneration",
|
||||
"LlavaForConditionalGeneration",
|
||||
"MiniCPMV",
|
||||
"PaliGemmaForConditionalGeneration",
|
||||
]
|
||||
REPO_ID_TO_SKIP = {
|
||||
"nm-testing/pixtral-12b-FP8-dynamic": "duplicated test",
|
||||
|
||||
@ -67,17 +67,17 @@ class _HfExamplesInfo:
|
||||
|
||||
is_available_online: bool = True
|
||||
"""
|
||||
Set this to `False` if the name of this architecture no longer exists on
|
||||
Set this to ``False`` if the name of this architecture no longer exists on
|
||||
the HF repo. To maintain backwards compatibility, we have not removed them
|
||||
from the main model registry, so without this flag the registry tests will
|
||||
fail.
|
||||
"""
|
||||
|
||||
trust_remote_code: bool = False
|
||||
"""The `trust_remote_code` level required to load the model."""
|
||||
"""The ``trust_remote_code`` level required to load the model."""
|
||||
|
||||
hf_overrides: dict[str, Any] = field(default_factory=dict)
|
||||
"""The `hf_overrides` required to load the model."""
|
||||
"""The ``hf_overrides`` required to load the model."""
|
||||
|
||||
max_model_len: int | None = None
|
||||
"""
|
||||
@ -652,10 +652,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},
|
||||
trust_remote_code=True,
|
||||
),
|
||||
"LightOnOCRForConditionalGeneration": _HfExamplesInfo(
|
||||
"lightonai/LightOnOCR-1B",
|
||||
is_available_online=False,
|
||||
),
|
||||
"Llama4ForConditionalGeneration": _HfExamplesInfo(
|
||||
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
||||
max_model_len=10240,
|
||||
|
||||
@ -7,7 +7,7 @@ from unittest.mock import patch
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils import GiB_bytes
|
||||
from vllm.v1.core.kv_cache_utils import (
|
||||
generate_scheduler_kv_cache_config,
|
||||
get_kv_cache_configs,
|
||||
|
||||
@ -162,7 +162,7 @@ def check_logprobs_close(
|
||||
|
||||
# Test prompt logprobs closeness
|
||||
if prompt_logprobs_0 is not None and prompt_logprobs_1 is not None:
|
||||
# Both sequences' prompt logprobs lists are not `None`
|
||||
# Both sequences' prompt logprobs lists are not `None``
|
||||
# (although individual list elements may be `None`);
|
||||
# for each token's logprobs:
|
||||
for idx, (logprobs_elem_0, logprobs_elem_1) in enumerate(
|
||||
|
||||
@ -1,29 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm.v1.metrics.loggers import StatLoggerBase
|
||||
|
||||
|
||||
class DummyStatLogger(StatLoggerBase):
|
||||
"""
|
||||
A dummy stat logger for testing purposes.
|
||||
Implements the minimal interface expected by StatLoggerManager.
|
||||
"""
|
||||
|
||||
def __init__(self, vllm_config, engine_idx=0):
|
||||
self.vllm_config = vllm_config
|
||||
self.engine_idx = engine_idx
|
||||
self.recorded = []
|
||||
self.logged = False
|
||||
self.engine_initialized = False
|
||||
|
||||
def record(self, scheduler_stats, iteration_stats, mm_cache_stats, engine_idx):
|
||||
self.recorded.append(
|
||||
(scheduler_stats, iteration_stats, mm_cache_stats, engine_idx)
|
||||
)
|
||||
|
||||
def log(self):
|
||||
self.logged = True
|
||||
|
||||
def log_engine_initialized(self):
|
||||
self.engine_initialized = True
|
||||
@ -1,15 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from setuptools import setup
|
||||
|
||||
setup(
|
||||
name="dummy_stat_logger",
|
||||
version="0.1",
|
||||
packages=["dummy_stat_logger"],
|
||||
entry_points={
|
||||
"vllm.stat_logger_plugins": [
|
||||
"dummy_stat_logger = dummy_stat_logger.dummy_stat_logger:DummyStatLogger" # noqa
|
||||
]
|
||||
},
|
||||
)
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user