Compare commits
91 Commits
triton-con...codex/chan
| Author | SHA1 | Date |
|---|---|---|
| | 5bb81e284b | |
| | 01513a334a | |
| | ac2bf41e53 | |
| | a931b4cdcf | |
| | a0f8a79646 | |
| | 18bdcf4113 | |
| | 1c3198b6c4 | |
| | 260127ea54 | |
| | d0dc4cfca4 | |
| | d31a647124 | |
| | 85431bd9ad | |
| | c11013db8b | |
| | 1eb2b9c102 | |
| | 6ebf313790 | |
| | cfbcb9ed87 | |
| | 76ddeff293 | |
| | f46098335b | |
| | e9534c7202 | |
| | 7976446015 | |
| | fcb9f879c1 | |
| | 3ed94f9d0a | |
| | fa839565f2 | |
| | 75a99b98bf | |
| | b5c3b68359 | |
| | 6cbc4d4bea | |
| | 153c6f1e61 | |
| | 34cda778a0 | |
| | 30800b01c2 | |
| | 10be209493 | |
| | 19c863068b | |
| | f29fd8a7f8 | |
| | ed10f3cea1 | |
| | b637e9dcb8 | |
| | 1e36c8687e | |
| | 5bac61362b | |
| | 313ae8c16a | |
| | c847e34b39 | |
| | e7e3e6d263 | |
| | 4ffd963fa0 | |
| | 56fe4bedd6 | |
| | d91278181d | |
| | 20149d84d9 | |
| | 3534c39a20 | |
| | c586b55667 | |
| | 33d560001e | |
| | f148c44c6a | |
| | 235bfd5dfe | |
| | 68d28e37b0 | |
| | 37a7d5d74a | |
| | d4d309409f | |
| | 85bd6599e4 | |
| | 91b3d190ae | |
| | fc017915f5 | |
| | 9ad0a4588b | |
| | 016b8d1b7f | |
| | 80305c1b24 | |
| | 37e2ecace2 | |
| | 054c8657e3 | |
| | d4170fad39 | |
| | 946aadb4a0 | |
| | bcdfb2a330 | |
| | ba8c300018 | |
| | 8cdc371217 | |
| | 61e20828da | |
| | 55e1c66da5 | |
| | 86f3ac21ce | |
| | 149f2435a5 | |
| | c0569dbc82 | |
| | 8bb43b9c9e | |
| | 559756214b | |
| | 6d0cf239c6 | |
| | 3fc964433a | |
| | 0caf61c08a | |
| | 667624659b | |
| | 38efa28278 | |
| | e8cc53af5e | |
| | a4851cfe68 | |
| | 9887e8ec50 | |
| | f326ab9c88 | |
| | dcf2a5e208 | |
| | 1e9438e0b0 | |
| | 697ef765ee | |
| | a99b9f7dee | |
| | c488b928a7 | |
| | 2c7fa47161 | |
| | 88fc8a97e3 | |
| | 66f6fbd393 | |
| | 8632e831ba | |
| | 4bbfc36b16 | |
| | 80d38b8ac8 | |
| | 211b6a6113 | |
@@ -6,19 +6,17 @@ set -exuo pipefail

# Try building the docker image
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
FROM 1.22-413-pt2.7.1:latest
FROM gaudi-base-image:latest

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

RUN pip install -v -r requirements/hpu.txt
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git

ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
RUN VLLM_TARGET_DEVICE=empty pip install .
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git

# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
@@ -117,7 +117,7 @@ steps:
commands:
- pytest -v -s core

- label: Entrypoints Test # 40min
- label: Entrypoints Test (LLM) # 40min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
@@ -125,8 +125,6 @@ steps:
source_file_dependencies:
- vllm/
- tests/entrypoints/llm
- tests/entrypoints/openai
- tests/entrypoints/test_chat_utils
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -135,9 +133,21 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

- label: Entrypoints Test (API Server) # 40min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/entrypoints/openai
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
- pytest -v -s entrypoints/test_chat_utils.py
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

- label: Distributed Tests (4 GPUs) # 10min
mirror_hardwares: [amdexperimental]
@@ -630,6 +640,18 @@ steps:
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

- label: Transformers Nightly Models Test
working_dir: "/vllm-workspace/"
optional: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/audio_language.py --model-type whisper
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl

##### 1 GPU test #####
##### multi gpus test #####
.gemini/config.yaml (new file, 6 lines)
@@ -0,0 +1,6 @@
# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
have_fun: false # Just review the code
code_review:
  comment_severity_threshold: HIGH # Reduce quantity of comments
  pull_request_opened:
    summary: false # Don't summarize the PR in a separate comment
.github/CODEOWNERS (vendored, 1 change)
@@ -16,6 +16,7 @@
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm
/vllm/entrypoints @aarnphm
/vllm/compilation @zou3519 @youkaichao
CMakeLists.txt @tlrmchlsmth @LucasWilkinson

# Any change to the VllmConfig changes can have a large user-facing impact,
@@ -21,7 +21,7 @@ repos:
- id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos
rev: v1.32.0
rev: v1.34.0
hooks:
- id: typos
- repo: https://github.com/PyCQA/isort
@@ -166,7 +166,7 @@ repos:
language: python
types: [python]
pass_filenames: true
files: vllm/config.py|tests/test_config.py
files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
# Keep `suggestion` last
- id: suggestion
name: Suggestion
@@ -553,7 +553,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
set(SRCS
"csrc/attention/mla/cutlass_mla_kernels.cu")
"csrc/attention/mla/cutlass_mla_kernels.cu"
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${MLA_ARCHS}")
@@ -63,8 +63,6 @@ vLLM is fast with:
- Speculative decoding
- Chunked prefill

**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.

vLLM is flexible and easy to use with:

- Seamless integration with popular Hugging Face models
@@ -24,6 +24,7 @@

#include "attention_dtypes.h"
#include "attention_utils.cuh"
#include "cuda_compat.h"

#ifdef USE_ROCM
#include <hip/hip_bf16.h>
@@ -33,12 +34,6 @@ typedef __hip_bfloat16 __nv_bfloat16;
#include "../quantization/fp8/nvidia/quant_utils.cuh"
#endif

#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
@@ -670,7 +665,6 @@ __global__ void paged_attention_v2_reduce_kernel(

} // namespace vllm

#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP
csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp (new file, 372 lines)
@@ -0,0 +1,372 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
*this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
*ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
*LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
*CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
*SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
*INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
*CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
*ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
*POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*
|
||||
* Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929
|
||||
* by Alcanderian JieXin Liang
|
||||
*/
|
||||
|
||||
/*!
|
||||
\file
|
||||
\brief An universal device layer for cutlass 3.x-style kernels.
|
||||
*/
|
||||
|
||||
// clang-format off
|
||||
#pragma once
|
||||
|
||||
// common
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/device_kernel.h"
|
||||
|
||||
#if !defined(__CUDACC_RTC__)
|
||||
#include "cutlass/cluster_launch.hpp"
|
||||
#include "cutlass/trace.h"
|
||||
#endif // !defined(__CUDACC_RTC__)
|
||||
|
||||
#include "../kernel/sm100_fmha_mla_tma_warpspecialized.hpp"
|
||||
#include "../kernel/sm100_fmha_mla_reduction.hpp"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace cutlass::fmha::device {
|
||||
|
||||
using namespace cute;
|
||||
using namespace cutlass::fmha::kernel;
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////// CUTLASS 3.x API /////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<
|
||||
class Kernel_
|
||||
>
|
||||
class MLA {
|
||||
public:
|
||||
|
||||
using Kernel = Kernel_;
|
||||
|
||||
using ReductionKernel = cutlass::fmha::kernel::Sm100FmhaMlaReductionKernel<
|
||||
typename Kernel::ElementOut,
|
||||
typename Kernel::ElementAcc,
|
||||
typename Kernel::ElementAcc,
|
||||
Kernel::TileShapeH::value,
|
||||
Kernel::TileShapeL::value,
|
||||
256 /*Max split*/
|
||||
>;
|
||||
|
||||
/// Argument structure: User API
|
||||
using KernelArguments = typename Kernel::Arguments;
|
||||
using ReductionArguments = typename ReductionKernel::Arguments;
|
||||
|
||||
using Arguments = KernelArguments;
|
||||
|
||||
/// Argument structure: Kernel API
|
||||
using KernelParams = typename Kernel::Params;
|
||||
using ReductionParams = typename ReductionKernel::Params;
|
||||
struct Params {
|
||||
KernelParams fmha_params;
|
||||
ReductionParams reduction_params;
|
||||
};
|
||||
|
||||
private:
|
||||
|
||||
/// Kernel API parameters object
|
||||
Params params_;
|
||||
|
||||
bool is_initialized(bool set = false) {
|
||||
static bool initialized = false;
|
||||
if (set) initialized = true;
|
||||
return initialized;
|
||||
}
|
||||
|
||||
static ReductionArguments to_reduction_args(Arguments const& args) {
|
||||
auto [H, K, D, B] = args.problem_shape;
|
||||
return ReductionArguments{
|
||||
nullptr, args.epilogue.ptr_o, nullptr, args.epilogue.ptr_lse,
|
||||
args.mainloop.softmax_scale, B, args.split_kv, K, args.mainloop.ptr_seq,
|
||||
args.ptr_split_kv, Kernel::TileShapeS::value
|
||||
};
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
/// Access the Params structure
|
||||
Params const& params() const {
|
||||
return params_;
|
||||
}
|
||||
|
||||
static void set_split_kv (KernelArguments& args) {
|
||||
// printf("set_split_kv start");
|
||||
if (args.split_kv >= 1) return;
|
||||
auto [H, K, D, B] = args.problem_shape;
|
||||
// std::cout << H << " " << K << " " << D << " " << B << "\n";
|
||||
int sm_count = args.hw_info.sm_count;
|
||||
// printf(" sm_count = %d\n", sm_count);
|
||||
int max_splits = ceil_div(K, 128);
|
||||
max_splits = min(16, max_splits);
|
||||
// printf(" max_splits = %d\n", max_splits);
|
||||
int sms_per_batch = max(1, sm_count / B);
|
||||
// printf(" sms_per_batch = %d\n", sms_per_batch);
|
||||
int split_heur = min(max_splits, sms_per_batch);
|
||||
int waves = ceil_div(B * split_heur, sm_count);
|
||||
int k_waves = ceil_div(max_splits, split_heur);
|
||||
int split_wave_aware = ceil_div(max_splits, k_waves);
|
||||
args.split_kv = split_wave_aware;
|
||||
// printf(" args.split_kv = %d\n", args.split_kv);
|
||||
|
||||
}
|
||||
|
||||
/// Determines whether the GEMM can execute the given problem.
|
||||
static Status
|
||||
can_implement(Arguments const& args) {
|
||||
if (! Kernel::can_implement(args)) {
|
||||
return Status::kInvalid;
|
||||
}
|
||||
if (! ReductionKernel::can_implement(to_reduction_args(args))) {
|
||||
return Status::kInvalid;
|
||||
}
|
||||
return Status::kSuccess;
|
||||
}
|
||||
|
||||
/// Gets the workspace size
|
||||
static size_t
|
||||
get_workspace_size(Arguments const& args) {
|
||||
size_t workspace_bytes = 0;
|
||||
workspace_bytes += Kernel::get_workspace_size(args);
|
||||
workspace_bytes += ReductionKernel::get_workspace_size(to_reduction_args(args));
|
||||
return workspace_bytes;
|
||||
}
|
||||
|
||||
/// Computes the maximum number of active blocks per multiprocessor
|
||||
static int maximum_active_blocks(int /* smem_capacity */ = -1) {
|
||||
CUTLASS_TRACE_HOST("MLA::maximum_active_blocks()");
|
||||
int max_active_blocks = -1;
|
||||
int smem_size = Kernel::SharedStorageSize;
|
||||
|
||||
// first, account for dynamic smem capacity if needed
|
||||
cudaError_t result;
|
||||
if (smem_size >= (48 << 10)) {
|
||||
CUTLASS_TRACE_HOST(" Setting smem size to " << smem_size);
|
||||
result = cudaFuncSetAttribute(
|
||||
device_kernel<Kernel>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
smem_size);
|
||||
if (cudaSuccess != result) {
|
||||
result = cudaGetLastError(); // to clear the error bit
|
||||
CUTLASS_TRACE_HOST(
|
||||
" cudaFuncSetAttribute() returned error: "
|
||||
<< cudaGetErrorString(result));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// query occupancy after setting smem size
|
||||
result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks,
|
||||
device_kernel<Kernel>,
|
||||
Kernel::MaxThreadsPerBlock,
|
||||
smem_size);
|
||||
|
||||
if (cudaSuccess != result) {
|
||||
result = cudaGetLastError(); // to clear the error bit
|
||||
CUTLASS_TRACE_HOST(
|
||||
" cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error: "
|
||||
<< cudaGetErrorString(result));
|
||||
return -1;
|
||||
}
|
||||
|
||||
CUTLASS_TRACE_HOST(" max_active_blocks: " << max_active_blocks);
|
||||
return max_active_blocks;
|
||||
}
|
||||
|
||||
/// Initializes GEMM state from arguments.
|
||||
Status
|
||||
initialize(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
|
||||
CUTLASS_TRACE_HOST("MLA::initialize() - workspace "
|
||||
<< workspace << ", stream: " << (stream ? "non-null" : "null"));
|
||||
|
||||
// Initialize the workspace
|
||||
Status status = Kernel::initialize_workspace(args, workspace, stream);
|
||||
if (status != Status::kSuccess) {
|
||||
return status;
|
||||
}
|
||||
status = ReductionKernel::initialize_workspace(to_reduction_args(args), workspace, stream);
|
||||
if (status != Status::kSuccess) {
|
||||
return status;
|
||||
}
|
||||
KernelParams kernel_params = Kernel::to_underlying_arguments(args, workspace);
|
||||
|
||||
ReductionArguments reduction_args = to_reduction_args(args);
|
||||
if (reduction_args.split_kv > 1) {
|
||||
reduction_args.ptr_oaccum = kernel_params.epilogue.ptr_o_acc;
|
||||
reduction_args.ptr_lseaccum = kernel_params.epilogue.ptr_lse_acc;
|
||||
}
|
||||
ReductionParams reduction_params = ReductionKernel::to_underlying_arguments(reduction_args, workspace);
|
||||
// Initialize the Params structure
|
||||
params_ = Params {kernel_params, reduction_params};
|
||||
|
||||
if (is_initialized()) return Status::kSuccess;
|
||||
|
||||
// account for dynamic smem capacity if needed
|
||||
// no dynamic smem is needed for reduction kernel
|
||||
int smem_size = Kernel::SharedStorageSize;
|
||||
if (smem_size >= (48 << 10)) {
|
||||
CUTLASS_TRACE_HOST(" Setting smem size to " << smem_size);
|
||||
cudaError_t result = cudaFuncSetAttribute(
|
||||
device_kernel<Kernel>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
smem_size);
|
||||
if (cudaSuccess != result) {
|
||||
result = cudaGetLastError(); // to clear the error bit
|
||||
CUTLASS_TRACE_HOST(" cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result));
|
||||
return Status::kErrorInternal;
|
||||
}
|
||||
}
|
||||
|
||||
is_initialized(true);
|
||||
|
||||
return Status::kSuccess;
|
||||
}
|
||||
|
||||
/// Update API is preserved in 3.0, but does not guarantee a lightweight update of params.
|
||||
Status
|
||||
update(Arguments const& args, void* workspace = nullptr) {
|
||||
CUTLASS_TRACE_HOST("MLA()::update() - workspace: " << workspace);
|
||||
|
||||
size_t workspace_bytes = get_workspace_size(args);
|
||||
if (workspace_bytes > 0 && nullptr == workspace) {
|
||||
return Status::kErrorWorkspaceNull;
|
||||
}
|
||||
|
||||
auto fmha_params = Kernel::to_underlying_arguments(args, workspace);
|
||||
|
||||
ReductionArguments reduction_args = to_reduction_args(args);
|
||||
if (reduction_args.split_kv > 1) {
|
||||
reduction_args.ptr_oaccum = fmha_params.epilogue.ptr_o_acc;
|
||||
reduction_args.ptr_lseaccum = fmha_params.epilogue.ptr_lse_acc;
|
||||
}
|
||||
ReductionParams reduction_params = ReductionKernel::to_underlying_arguments(reduction_args, workspace);
|
||||
// Initialize the Params structure
|
||||
params_ = Params {fmha_params, reduction_params};
|
||||
|
||||
return Status::kSuccess;
|
||||
}
|
||||
|
||||
/// Primary run() entry point API that is static allowing users to create and manage their own params.
|
||||
/// Supplied params struct must be construct by calling Kernel::to_underling_arguments()
|
||||
static Status
|
||||
run(Params& params, cudaStream_t stream = nullptr) {
|
||||
CUTLASS_TRACE_HOST("MLA::run()");
|
||||
dim3 const block = Kernel::get_block_shape();
|
||||
dim3 const grid = Kernel::get_grid_shape(params.fmha_params);
|
||||
|
||||
// configure smem size and carveout
|
||||
int smem_size = Kernel::SharedStorageSize;
|
||||
|
||||
Status launch_result;
|
||||
// Use extended launch API only for mainloops that use it
|
||||
if constexpr(Kernel::ArchTag::kMinComputeCapability >= 90) {
|
||||
dim3 cluster(cute::size<0>(typename Kernel::ClusterShape{}),
|
||||
cute::size<1>(typename Kernel::ClusterShape{}),
|
||||
cute::size<2>(typename Kernel::ClusterShape{}));
|
||||
void const* kernel = (void const*) device_kernel<Kernel>;
|
||||
void* kernel_params[] = {¶ms.fmha_params};
|
||||
launch_result = ClusterLauncher::launch(grid, cluster, block, smem_size, stream, kernel, kernel_params);
|
||||
}
|
||||
else {
|
||||
launch_result = Status::kSuccess;
|
||||
device_kernel<Kernel><<<grid, block, smem_size, stream>>>(params.fmha_params);
|
||||
}
|
||||
|
||||
cudaError_t result = cudaGetLastError();
|
||||
if (cudaSuccess != result or Status::kSuccess != launch_result) {
|
||||
//return Status::kSuccess;
|
||||
CUTLASS_TRACE_HOST(" Kernel launch failed. Reason: " << result);
|
||||
return Status::kErrorInternal;
|
||||
}
|
||||
if (params.reduction_params.split_kv > 1) {
|
||||
// launch reduction kernel
|
||||
dim3 const block = ReductionKernel::get_block_shape();
|
||||
dim3 const grid = ReductionKernel::get_grid_shape(params.reduction_params);
|
||||
device_kernel<ReductionKernel><<<grid, block, 0, stream>>>(params.reduction_params);
|
||||
cudaError_t result = cudaGetLastError();
|
||||
if (cudaSuccess == result) {
|
||||
return Status::kSuccess;
|
||||
}
|
||||
else {
|
||||
CUTLASS_TRACE_HOST(" Kernel launch failed. Reason: " << result);
|
||||
return Status::kErrorInternal;
|
||||
}
|
||||
}
|
||||
else {
|
||||
return Status::kSuccess;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Non-static launch overloads that first create and set the internal params struct of this kernel handle.
|
||||
//
|
||||
|
||||
/// Launches the kernel after first constructing Params internal state from supplied arguments.
|
||||
Status
|
||||
run(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
|
||||
Status status = initialize(args, workspace, stream);
|
||||
if (Status::kSuccess == status) {
|
||||
status = run(params_, stream);
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
/// Launches the kernel after first constructing Params internal state from supplied arguments.
|
||||
Status
|
||||
operator()(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
|
||||
return run(args, workspace, stream);
|
||||
}
|
||||
|
||||
/// Overload that allows a user to re-launch the same kernel without updating internal params struct.
|
||||
Status
|
||||
run(cudaStream_t stream = nullptr) {
|
||||
return run(params_, stream);
|
||||
}
|
||||
|
||||
/// Overload that allows a user to re-launch the same kernel without updating internal params struct.
|
||||
Status
|
||||
operator()(cudaStream_t stream = nullptr) {
|
||||
return run(params_, stream);
|
||||
}
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace cutlass::fmha::device
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
@ -0,0 +1,203 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights
|
||||
*reserved. SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
*this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
*ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
*LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
*CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
*SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
*INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
*CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
*ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
*POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*
|
||||
* Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929
|
||||
* by Alcanderian JieXin Liang
|
||||
*/
|
||||
|
||||
// clang-format off
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/arch/arch.h"
|
||||
#include "cute/tensor.hpp"
|
||||
|
||||
namespace cutlass::fmha::kernel {
|
||||
|
||||
using namespace cute;
|
||||
template<
|
||||
class ElementOut,
|
||||
class ElementAcc,
|
||||
class ElementScale,
|
||||
size_t kNumHeads,
|
||||
size_t kHeadDimLatent,
|
||||
int kMaxSplits
|
||||
>
|
||||
struct Sm100FmhaMlaReductionKernel {
|
||||
|
||||
static const int SharedStorageSize = 0;
|
||||
static const int MaxThreadsPerBlock = 128;
|
||||
static const int MinBlocksPerMultiprocessor = 1;
|
||||
|
||||
using ArchTag = cutlass::arch::Sm100;
|
||||
|
||||
static_assert(kHeadDimLatent % MaxThreadsPerBlock == 0);
|
||||
struct Arguments {
|
||||
ElementAcc* ptr_oaccum = nullptr;
|
||||
ElementOut* ptr_o = nullptr;
|
||||
ElementAcc* ptr_lseaccum = nullptr;
|
||||
ElementAcc* ptr_lse = nullptr;
|
||||
ElementScale scale = 1.f;
|
||||
int num_batches = 0;
|
||||
int split_kv = -1;
|
||||
int dim_k = -1;
|
||||
int* ptr_seq = nullptr;
|
||||
int* ptr_split_kv = nullptr;
|
||||
int tile_shape_s = 128;
|
||||
};
|
||||
using Params = Arguments;
|
||||
|
||||
static Params to_underlying_arguments(Arguments const& args, void* workspace) {
|
||||
return {args.ptr_oaccum, args.ptr_o, args.ptr_lseaccum, args.ptr_lse,
|
||||
args.scale, args.num_batches, args.split_kv, args.dim_k, args.ptr_seq,
|
||||
args.ptr_split_kv, args.tile_shape_s};
|
||||
}
|
||||
|
||||
static size_t get_workspace_size(Arguments const& /*args*/) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static Status initialize_workspace(
|
||||
Arguments const& /*args*/, void* /*ws*/, cudaStream_t /*stream*/) {
|
||||
return Status::kSuccess;
|
||||
}
|
||||
|
||||
static dim3 get_grid_shape(Params const& params) {
|
||||
return dim3(kNumHeads, 1, params.num_batches);
|
||||
}
|
||||
|
||||
static dim3 get_block_shape() {
|
||||
return dim3(MaxThreadsPerBlock, 1, 1);
|
||||
}
|
||||
|
||||
static bool can_implement(Arguments const& args) {
|
||||
if (args.num_batches <= 0) return false;
|
||||
if (args.split_kv <= 0) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
CUTLASS_DEVICE void operator() (Params const& params, char* smem_raw) {
|
||||
if (params.split_kv <= 1) return;
|
||||
auto blk_coord = make_coord(blockIdx.x, _0{}, blockIdx.z);
|
||||
|
||||
__shared__ ElementAcc sLseScale[kMaxSplits];
|
||||
const size_t offset_lseaccum = get<0>(blk_coord) + kNumHeads * params.split_kv * get<2>(blk_coord);
|
||||
const size_t offset_lse = get<0>(blk_coord) + kNumHeads * get<2>(blk_coord);
|
||||
|
||||
Tensor gLSEaccum = make_tensor(make_gmem_ptr(params.ptr_lseaccum + offset_lseaccum),
|
||||
make_shape(params.split_kv), Stride<Int<kNumHeads>>{});
|
||||
|
||||
Tensor gLSE = make_tensor(make_gmem_ptr(params.ptr_lse + offset_lse),
|
||||
Shape<_1>{}, Stride<_1>{});
|
||||
|
||||
auto dim_k = params.ptr_seq == nullptr ? params.dim_k : params.ptr_seq[get<2>(blk_coord)];
|
||||
auto local_split_kv = params.ptr_split_kv == nullptr ? params.split_kv : params.ptr_split_kv[get<2>(blk_coord)];
|
||||
auto k_tile_total = ceil_div(dim_k, params.tile_shape_s);
|
||||
auto k_tile_per_cta = ceil_div(k_tile_total, local_split_kv);
|
||||
local_split_kv = ceil_div(k_tile_total, k_tile_per_cta);
|
||||
|
||||
int warp_idx = cutlass::canonical_warp_idx_sync();
|
||||
if (warp_idx == 0) {
|
||||
constexpr int kNLsePerThread = cute::ceil_div(kMaxSplits, 32);
|
||||
|
||||
ElementAcc local_lse[kNLsePerThread];
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kNLsePerThread; ++i) {
|
||||
const int split = i * 32 + threadIdx.x;
|
||||
local_lse[i] = split < local_split_kv ? gLSEaccum(split) : -std::numeric_limits<ElementAcc>::infinity();
|
||||
}
|
||||
|
||||
ElementAcc lse_max = -std::numeric_limits<ElementAcc>::infinity();
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kNLsePerThread; ++i) {
|
||||
lse_max = max(lse_max, local_lse[i]);
|
||||
}
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int offset = 16; offset >= 1; offset /= 2) {
|
||||
lse_max = max(lse_max, __shfl_xor_sync(0xffffffff, lse_max, offset));
|
||||
}
|
||||
lse_max = lse_max == -std::numeric_limits<ElementAcc>::infinity() ? 0.0f : lse_max; // In case all local LSEs are -inf
|
||||
lse_max = __shfl_sync(0xffffffff, lse_max, 0);
|
||||
|
||||
ElementAcc sum_lse = 0;
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kNLsePerThread; ++i) {
|
||||
sum_lse = sum_lse + expf(local_lse[i] - lse_max);
|
||||
}
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int offset = 16; offset >= 1; offset /= 2) {
|
||||
sum_lse = sum_lse + __shfl_xor_sync(0xffffffff, sum_lse, offset);
|
||||
}
|
||||
|
||||
sum_lse = __shfl_sync(0xffffffff, sum_lse, 0);
|
||||
|
||||
ElementAcc global_lse = (sum_lse == 0.f || sum_lse != sum_lse) ? std::numeric_limits<ElementAcc>::infinity() : logf(sum_lse) + lse_max;
|
||||
if (threadIdx.x == 0 and params.ptr_lse != nullptr) {
|
||||
gLSE(0) = global_lse;
|
||||
}
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kNLsePerThread; ++i) {
|
||||
const int split = i * 32 + threadIdx.x;
|
||||
if (split < local_split_kv) {
|
||||
sLseScale[split] = expf(local_lse[i] - global_lse);
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
constexpr int Elements = kHeadDimLatent / MaxThreadsPerBlock;
|
||||
const size_t offset_oaccum = kHeadDimLatent * params.split_kv * (get<0>(blk_coord) + kNumHeads * get<2>(blk_coord));
|
||||
Tensor gOaccum = make_tensor(make_gmem_ptr(params.ptr_oaccum + offset_oaccum),
|
||||
Shape<Int<kHeadDimLatent>>{}, Stride<_1>{});
|
||||
ElementAcc local_val[Elements] = {0};
|
||||
for (int split = 0; split < local_split_kv; ++split) {
|
||||
ElementAcc lse_scale = sLseScale[split];
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for(int i = 0; i < Elements; ++i) {
|
||||
local_val[i] += lse_scale * gOaccum(threadIdx.x + MaxThreadsPerBlock * i);
|
||||
}
|
||||
gOaccum.data() = gOaccum.data() + kHeadDimLatent;
|
||||
}
|
||||
auto ptr_o_local = params.ptr_o + (get<0>(blk_coord) + get<2>(blk_coord) * kNumHeads) * kHeadDimLatent;
|
||||
Tensor gO = make_tensor(make_gmem_ptr(ptr_o_local), Shape<Int<kHeadDimLatent>>{}, Stride<_1>{});
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for(int i = 0; i < Elements; ++i) {
|
||||
gO(threadIdx.x + MaxThreadsPerBlock * i) = static_cast<ElementOut>(local_val[i]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace cutlass::fmha::kernel
|
||||
File diff suppressed because it is too large
@ -0,0 +1,165 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights
|
||||
*reserved. SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
*this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
*ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
*LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
*CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
*SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
*INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
*CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
*ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
*POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*
|
||||
* Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929
|
||||
* by Alcanderian JieXin Liang
|
||||
*/
|
||||
|
||||
// clang-format off
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/fast_math.h"
|
||||
#include "cutlass/kernel_hardware_info.h"
|
||||
|
||||
namespace cutlass::fmha::kernel {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct Sm100MlaIndividualTileScheduler {
|
||||
|
||||
struct Params {
|
||||
dim3 grid;
|
||||
};
|
||||
|
||||
bool valid_ = true;
|
||||
|
||||
CUTLASS_DEVICE
|
||||
Sm100MlaIndividualTileScheduler(Params const&) {}
|
||||
|
||||
template<class ProblemShape, class ClusterShape>
|
||||
static Params to_underlying_arguments(
|
||||
ProblemShape const& problem_shape, KernelHardwareInfo hw_info,
|
||||
ClusterShape const& cluster_shape, int const& split_kv) {
|
||||
using namespace cute;
|
||||
dim3 grid(get<0>(cluster_shape), get<3>(problem_shape) /* Batch */, split_kv /*Maximum Split KV*/);
|
||||
return Params{ grid };
|
||||
}
|
||||
|
||||
static dim3 get_grid_shape(Params const& params) {
|
||||
return params.grid;
|
||||
}
|
||||
|
||||
CUTLASS_DEVICE
|
||||
bool is_valid() {
|
||||
return valid_;
|
||||
}
|
||||
|
||||
CUTLASS_DEVICE
|
||||
auto get_block_coord() {
|
||||
using namespace cute;
|
||||
return make_coord(blockIdx.x, _0{}, blockIdx.y, blockIdx.z);
|
||||
}
|
||||
|
||||
CUTLASS_DEVICE
|
||||
Sm100MlaIndividualTileScheduler& operator++() {
|
||||
valid_ = false;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct Sm100MlaPersistentTileScheduler {
|
||||
|
||||
struct Params {
|
||||
int num_blocks;
|
||||
FastDivmod divmod_m_block;
|
||||
FastDivmod divmod_b;
|
||||
FastDivmod divmod_split_kv;
|
||||
KernelHardwareInfo hw_info;
|
||||
};
|
||||
|
||||
int block_idx = 0;
|
||||
Params params;
|
||||
|
||||
CUTLASS_DEVICE
|
||||
Sm100MlaPersistentTileScheduler(Params const& params) : block_idx(blockIdx.x), params(params) {}
|
||||
|
||||
template<class ProblemShape, class ClusterShape>
|
||||
static Params to_underlying_arguments(
|
||||
ProblemShape const& problem_shape, KernelHardwareInfo hw_info,
|
||||
ClusterShape const& cluster_shape, int const& split_kv) {
|
||||
using namespace cute;
|
||||
// Get SM count if needed, otherwise use user supplied SM count
|
||||
int sm_count = hw_info.sm_count;
|
||||
if (sm_count <= 1 || sm_count % size<0>(cluster_shape) != 0) {
|
||||
CUTLASS_TRACE_HOST(" WARNING: Arguments do not include a valid SM count.\n"
|
||||
" For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
|
||||
sm_count = KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
|
||||
}
|
||||
|
||||
CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
|
||||
hw_info.sm_count = sm_count;
|
||||
|
||||
int num_m_blocks = size<0>(cluster_shape);
|
||||
int num_blocks = num_m_blocks * get<3>(problem_shape) /* Batch */;
|
||||
num_blocks *= split_kv; /* Maximum Split KV*/
|
||||
|
||||
return Params {
|
||||
num_blocks,
|
||||
{ num_m_blocks}, { get<3>(problem_shape) }, {split_kv},
|
||||
hw_info
|
||||
};
|
||||
}
|
||||
|
||||
static dim3 get_grid_shape(Params const& params) {
|
||||
dim3 grid(std::min(params.num_blocks, params.hw_info.sm_count), 1, 1);
|
||||
return grid;
|
||||
}
|
||||
|
||||
CUTLASS_DEVICE
|
||||
bool is_valid() {
|
||||
return block_idx < params.num_blocks;
|
||||
}
|
||||
|
||||
CUTLASS_DEVICE
|
||||
auto get_block_coord() {
|
||||
using namespace cute;
|
||||
int block_decode = block_idx;
|
||||
int m_block, bidb, n_split_kv;
|
||||
params.divmod_m_block(block_decode, m_block, block_decode);
|
||||
params.divmod_b(block_decode, bidb, block_decode);
|
||||
params.divmod_split_kv(block_decode, n_split_kv, block_decode);
|
||||
return make_coord(m_block, _0{}, bidb, n_split_kv);
|
||||
}
|
||||
|
||||
CUTLASS_DEVICE
|
||||
Sm100MlaPersistentTileScheduler& operator++() {
|
||||
block_idx += gridDim.x;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace cutlass::fmha::kernel
|
||||
csrc/attention/mla/sm100_cutlass_mla_kernel.cu (new file, 283 lines)
@@ -0,0 +1,283 @@
|
||||
/*
|
||||
Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
Copyright 2025 SGLang Team. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
/*
|
||||
* Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929
|
||||
* by Alcanderian JieXin Liang
|
||||
*/
|
||||
#include "core/registration.h"
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <cutlass/cutlass.h>
|
||||
#include <cutlass/kernel_hardware_info.h>
|
||||
#include <torch/all.h>
|
||||
|
||||
#include <cute/tensor.hpp>
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass_sm100_mla/device/sm100_mla.hpp"
|
||||
#include "cutlass_sm100_mla/kernel/sm100_mla_tile_scheduler.hpp"
|
||||
|
||||
// clang-format off
|
||||
#if !defined(CUDA_VERSION) || CUDA_VERSION < 12040
|
||||
void sm100_cutlass_mla_decode(
|
||||
torch::Tensor const& out,
|
||||
torch::Tensor const& q_nope,
|
||||
torch::Tensor const& q_pe,
|
||||
torch::Tensor const& kv_c_and_k_pe_cache,
|
||||
torch::Tensor const& seq_lens,
|
||||
torch::Tensor const& page_table,
|
||||
torch::Tensor const& workspace,
|
||||
int64_t num_kv_splits) {
|
||||
TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_decode");
|
||||
}
|
||||
int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) {
|
||||
TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_get_workspace_size");
|
||||
}
|
||||
#else
|
||||
|
||||
#define CUTLASS_CHECK(status) \
|
||||
{ \
|
||||
cutlass::Status error = status; \
|
||||
TORCH_CHECK(error == cutlass::Status::kSuccess, cutlassGetStatusString(error)); \
|
||||
}
|
||||
|
||||
using namespace cute;
|
||||
using namespace cutlass::fmha::kernel;
|
||||
|
||||
template <bool v>
|
||||
struct IsPersistent {
|
||||
static const bool value = v;
|
||||
};
|
||||
|
||||
template <typename T, bool IsPaged128, typename PersistenceOption = IsPersistent<true>>
|
||||
struct MlaSm100 {
|
||||
using Element = T;
|
||||
using ElementAcc = float;
|
||||
using ElementOut = T;
|
||||
|
||||
using TileShape = Shape<_128, _128, Shape<_512, _64>>;
|
||||
using TileShapeH = cute::tuple_element_t<0, TileShape>;
|
||||
using TileShapeD = cute::tuple_element_t<2, TileShape>;
|
||||
|
||||
// H K (D_latent D_rope) B
|
||||
using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>;
|
||||
|
||||
using StrideQ = cute::tuple<int64_t, _1, int64_t>; // H D B
|
||||
using StrideK = cute::tuple<int64_t, _1, int64_t>; // K D B
|
||||
using StrideO = StrideK; // H D B
|
||||
using StrideLSE = cute::tuple<_1, int>; // H B
|
||||
|
||||
using TileScheduler =
|
||||
std::conditional_t<PersistenceOption::value, Sm100MlaPersistentTileScheduler, Sm100MlaIndividualTileScheduler>;
|
||||
|
||||
using FmhaKernel = cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
|
||||
TileShape,
|
||||
Element,
|
||||
ElementAcc,
|
||||
ElementOut,
|
||||
ElementAcc,
|
||||
TileScheduler,
|
||||
/*kIsCpAsync=*/!IsPaged128>;
|
||||
using Fmha = cutlass::fmha::device::MLA<FmhaKernel>;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
typename T::Fmha::Arguments args_from_options(
|
||||
at::Tensor const& out,
|
||||
at::Tensor const& q_nope,
|
||||
at::Tensor const& q_pe,
|
||||
at::Tensor const& kv_c_and_k_pe_cache,
|
||||
at::Tensor const& seq_lens,
|
||||
at::Tensor const& page_table,
|
||||
double sm_scale,
|
||||
int64_t num_kv_splits) {
|
||||
cutlass::KernelHardwareInfo hw_info;
|
||||
hw_info.device_id = q_nope.device().index();
|
||||
hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
|
||||
|
||||
int batches = q_nope.sizes()[0];
|
||||
int page_count_per_seq = page_table.sizes()[1];
|
||||
int page_count_total = kv_c_and_k_pe_cache.sizes()[0];
|
||||
int page_size = kv_c_and_k_pe_cache.sizes()[1];
|
||||
int max_seq_len = page_size * page_count_per_seq;
|
||||
using TileShapeH = typename T::TileShapeH;
|
||||
using TileShapeD = typename T::TileShapeD;
|
||||
auto problem_shape = cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);
|
||||
|
||||
auto [H, K, D, B] = problem_shape;
|
||||
auto [D_latent, D_rope] = D;
|
||||
|
||||
float scale = float(sm_scale);
|
||||
|
||||
using StrideQ = typename T::StrideQ;
|
||||
using StrideK = typename T::StrideK;
|
||||
using StrideO = typename T::StrideO;
|
||||
using StrideLSE = typename T::StrideLSE;
|
||||
|
||||
StrideQ stride_Q_nope = cute::make_tuple(
|
||||
static_cast<int64_t>(q_nope.stride(1)), _1{}, static_cast<int64_t>(q_nope.stride(0)));
|
||||
StrideQ stride_Q_pe = cute::make_tuple(
|
||||
static_cast<int64_t>(q_pe.stride(1)), _1{}, static_cast<int64_t>(q_pe.stride(0)));
|
||||
|
||||
StrideK stride_C = cute::make_tuple(
|
||||
static_cast<int64_t>(0 + D_latent + D_rope), _1{}, static_cast<int64_t>(page_size * (D_latent + D_rope)));
|
||||
StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
|
||||
StrideLSE stride_LSE = cute::make_tuple(_1{}, 0 + H);
|
||||
StrideO stride_O = cute::make_tuple(static_cast<int64_t>(0 + D_latent), _1{}, static_cast<int64_t>(0 + H * D_latent));
|
||||
|
||||
using Element = typename T::Element;
|
||||
using ElementOut = typename T::ElementOut;
|
||||
using ElementAcc = typename T::ElementAcc;
|
||||
auto Q_nope_ptr = static_cast<Element*>(q_nope.data_ptr());
|
||||
auto Q_pe_ptr = static_cast<Element*>(q_pe.data_ptr());
|
||||
auto C_ptr = static_cast<Element*>(kv_c_and_k_pe_cache.data_ptr());
|
||||
typename T::Fmha::Arguments arguments{
|
||||
problem_shape,
|
||||
{scale,
|
||||
Q_nope_ptr,
|
||||
stride_Q_nope,
|
||||
Q_pe_ptr,
|
||||
stride_Q_pe,
|
||||
C_ptr,
|
||||
stride_C,
|
||||
C_ptr + D_latent,
|
||||
stride_C,
|
||||
static_cast<int*>(seq_lens.data_ptr()),
|
||||
static_cast<int*>(page_table.data_ptr()),
|
||||
stride_PT,
|
||||
page_count_total,
|
||||
page_size},
|
||||
{static_cast<ElementOut*>(out.data_ptr()), stride_O, static_cast<ElementAcc*>(nullptr), stride_LSE},
|
||||
hw_info,
|
||||
// TODO(trevor-m): Change split_kv back to -1 when
|
||||
// https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will
|
||||
// perform worse with larger context length and smaller batch sizes.
|
||||
num_kv_splits, // split_kv
|
||||
nullptr, // is_var_split_kv
|
||||
};
|
||||
// TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
|
||||
// split_kv automatically based on batch size and sequence length to balance
|
||||
// workload across available SMs. Consider using var_split_kv for manual
|
||||
// control if needed.
|
||||
T::Fmha::set_split_kv(arguments);
|
||||
return arguments;
|
||||
}
|
||||
|
||||
template <typename Element, bool IsPaged128, typename PersistenceOption>
|
||||
void runMla(
|
||||
at::Tensor const& out,
|
||||
at::Tensor const& q_nope,
|
||||
at::Tensor const& q_pe,
|
||||
at::Tensor const& kv_c_and_k_pe_cache,
|
||||
at::Tensor const& seq_lens,
|
||||
at::Tensor const& page_table,
|
||||
at::Tensor const& workspace,
|
||||
double sm_scale,
|
||||
int64_t num_kv_splits,
|
||||
cudaStream_t stream) {
|
||||
using MlaSm100Type = MlaSm100<Element, IsPaged128, PersistenceOption>;
|
||||
typename MlaSm100Type::Fmha fmha;
|
||||
auto arguments = args_from_options<MlaSm100Type>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
|
||||
|
||||
CUTLASS_CHECK(fmha.can_implement(arguments));
|
||||
|
||||
CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream));
|
||||
|
||||
CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream));
|
||||
}
|
||||
|
||||
#define DISPATCH_BOOL(expr, const_expr, ...) \
|
||||
[&]() -> bool { \
|
||||
if (expr) { \
|
||||
constexpr bool const_expr = true; \
|
||||
return __VA_ARGS__(); \
|
||||
} else { \
|
||||
constexpr bool const_expr = false; \
|
||||
return __VA_ARGS__(); \
|
||||
} \
|
||||
}()
|
||||
|
||||
void sm100_cutlass_mla_decode(
|
||||
torch::Tensor const& out,
|
||||
torch::Tensor const& q_nope,
|
||||
torch::Tensor const& q_pe,
|
||||
torch::Tensor const& kv_c_and_k_pe_cache,
|
||||
torch::Tensor const& seq_lens,
|
||||
torch::Tensor const& page_table,
|
||||
torch::Tensor const& workspace,
|
||||
double sm_scale,
|
||||
int64_t num_kv_splits) {
|
||||
auto in_dtype = q_nope.dtype();
|
||||
at::cuda::CUDAGuard device_guard{(char)q_nope.get_device()};
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(q_nope.get_device());
|
||||
const int page_size = kv_c_and_k_pe_cache.sizes()[1];
|
||||
|
||||
// NOTE(alcanderian): IsPersistent has bug with manual split_kv.
|
||||
// Kernel will hang if batch is too large with large num_kv_splits. (for example bs=8, num_kv_splits=8)
|
||||
// Maybe per batch split kv will fix this.
|
||||
DISPATCH_BOOL(page_size == 128, IsPaged128, [&] {
|
||||
DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] {
|
||||
if (in_dtype == at::ScalarType::Half) {
|
||||
runMla<cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
|
||||
out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
|
||||
} else if (in_dtype == at::ScalarType::BFloat16) {
|
||||
runMla<cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
|
||||
out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
|
||||
} else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
|
||||
runMla<cutlass::float_e4m3_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
|
||||
out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported input data type of MLA");
|
||||
}
|
||||
return true;
|
||||
});
|
||||
return true;
|
||||
});
|
||||
}
|
||||
|
||||
int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) {
|
||||
// Workspace size depends on ElementAcc and ElementLSE (same as ElementAcc)
|
||||
// which are float, so Element type here doesn't matter.
|
||||
using MlaSm100Type = MlaSm100<cutlass::half_t, true>;
|
||||
|
||||
// Get split kv. Requires problem shape and sm_count only.
|
||||
typename MlaSm100Type::Fmha::Arguments arguments;
|
||||
using TileShapeH = typename MlaSm100Type::TileShapeH;
|
||||
using TileShapeD = typename MlaSm100Type::TileShapeD;
|
||||
arguments.problem_shape =
|
||||
cute::make_tuple(TileShapeH{}, static_cast<int>(max_seq_len), TileShapeD{}, static_cast<int>(num_batches));
|
||||
// Assumes device 0 when getting sm_count.
|
||||
arguments.hw_info.sm_count =
|
||||
sm_count <= 0 ? cutlass::KernelHardwareInfo::query_device_multiprocessor_count(/*device_id=*/0) : sm_count;
|
||||
arguments.split_kv = num_kv_splits;
|
||||
MlaSm100Type::Fmha::set_split_kv(arguments);
|
||||
|
||||
return MlaSm100Type::Fmha::get_workspace_size(arguments);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||
m.impl("sm100_cutlass_mla_decode", &sm100_cutlass_mla_decode);
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CatchAll, m) {
|
||||
m.impl("sm100_cutlass_mla_get_workspace_size", &sm100_cutlass_mla_get_workspace_size);
|
||||
}
|
||||
|
||||
// clang-format on
|
||||
@@ -18,12 +18,7 @@
*/

#include "attention_kernels.cuh"

#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif
#include "cuda_compat.h"

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -187,7 +182,6 @@ void paged_attention_v1(
CALL_V1_LAUNCHER_BLOCK_SIZE)
}

#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP
@@ -18,12 +18,7 @@
*/

#include "attention_kernels.cuh"

#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif
#include "cuda_compat.h"

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -197,7 +192,6 @@ void paged_attention_v2(
CALL_V2_LAUNCHER_BLOCK_SIZE)
}

#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP
@@ -58,7 +58,7 @@ namespace {

#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_LAST_DIM_CONTIGUOUS(x) \
TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimention")
TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimension")

#define CHECK_INPUT(x) \
CHECK_CPU(x); \
@@ -126,7 +126,7 @@ void fused_experts_int4_w4a16_kernel_impl(
int64_t topk,
int64_t num_tokens_post_pad);

// shared expert implememntation for int8 w8a8
// shared expert implementation for int8 w8a8
template <typename scalar_t>
void shared_expert_int8_kernel_impl(
scalar_t* __restrict__ output,
@@ -41,7 +41,7 @@ struct tinygemm_kernel_nn<at::BFloat16, has_bias, BLOCK_M, BLOCK_N> {
__m512 vd0;
__m512 vd1[COLS];

// oops! 4x4 spills but luckly we use 4x2
// oops! 4x4 spills but luckily we use 4x2
__m512 vbias[COLS];

// [NOTE]: s8s8 igemm compensation in avx512-vnni
@@ -37,7 +37,7 @@ inline Vectorized<at::BFloat16> convert_from_float_ext<at::BFloat16>(const Vecto
#define CVT_FP16_TO_FP32(a) \
_mm512_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))

// this doesn't hanel NaN.
// this doesn't handle NaN.
inline __m512bh cvt_e4m3_bf16_intrinsic_no_nan(__m256i fp8_vec) {
const __m512i x = _mm512_cvtepu8_epi16(fp8_vec);
@@ -4,10 +4,10 @@
#include <hip/hip_runtime.h>
#endif

#ifndef USE_ROCM
#define WARP_SIZE 32
#if defined(USE_ROCM) && defined(__GFX9__)
#define WARP_SIZE 64
#else
#define WARP_SIZE warpSize
#define WARP_SIZE 32
#endif

#ifndef USE_ROCM
@@ -7,7 +7,11 @@

#include <c10/util/BFloat16.h>
#include <c10/util/Half.h>
#include <c10/cuda/CUDAException.h> // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK
#ifdef USE_ROCM
#include <c10/hip/HIPException.h> // For C10_HIP_CHECK and C10_HIP_KERNEL_LAUNCH_CHECK
#else
#include <c10/cuda/CUDAException.h> // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK
#endif

#ifndef USE_ROCM
#include <cub/block/block_load.cuh>
@@ -320,8 +324,13 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
dim3 grid(params.batch, params.dim / kNRows);
auto kernel = &selective_scan_fwd_kernel<Ktraits>;
if (kSmemSize >= 48 * 1024) {
#ifdef USE_ROCM
C10_HIP_CHECK(hipFuncSetAttribute(
reinterpret_cast<const void*>(kernel), hipFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
#else
C10_CUDA_CHECK(cudaFuncSetAttribute(
kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
#endif
}
kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
C10_CUDA_KERNEL_LAUNCH_CHECK();
@@ -514,6 +514,22 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor page_table, float scale) -> ()");
ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode);

// SM100 CUTLASS MLA decode
ops.def(
"sm100_cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe,"
" Tensor kv_c_and_k_pe_cache, Tensor seq_lens,"
" Tensor page_table, Tensor workspace, float "
"scale,"
" int num_kv_splits) -> ()");
// conditionally compiled so impl in source file

// SM100 CUTLASS MLA workspace
ops.def(
"sm100_cutlass_mla_get_workspace_size(int max_seq_len, int num_batches,"
" int sm_count, int num_kv_splits) "
"-> int");
// conditionally compiled so impl in source file

// Compute NVFP4 block quantized tensor.
ops.def(
"scaled_fp4_quant(Tensor! output, Tensor input,"
@@ -63,7 +63,7 @@ ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
ARG PIP_KEYRING_PROVIDER=disabled
ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}

# Flag enables build-in KV-connector dependency libs into docker images
# Flag enables built-in KV-connector dependency libs into docker images
ARG INSTALL_KV_CONNECTORS=false

#################### BASE BUILD IMAGE ####################
@@ -207,6 +207,19 @@ ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0

# Flag to control whether to use pre-built vLLM wheels
ARG VLLM_USE_PRECOMPILED
# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed
ENV VLLM_USE_PRECOMPILED=""
RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
export VLLM_USE_PRECOMPILED=1 && \
echo "Using precompiled wheels"; \
else \
unset VLLM_USE_PRECOMPILED && \
echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
fi

# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
|
||||
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
||||
ARG FA_BRANCH="1a7f4dfa"
|
||||
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
|
||||
ARG AITER_BRANCH="6487649"
|
||||
ARG AITER_BRANCH="916bf3c"
|
||||
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
||||
|
||||
FROM ${BASE_IMAGE} AS base
|
||||
|
||||
@ -8,7 +8,6 @@ API documentation for vLLM's configuration classes.
|
||||
|
||||
- [vllm.config.ModelConfig][]
|
||||
- [vllm.config.CacheConfig][]
|
||||
- [vllm.config.TokenizerPoolConfig][]
|
||||
- [vllm.config.LoadConfig][]
|
||||
- [vllm.config.ParallelConfig][]
|
||||
- [vllm.config.SchedulerConfig][]
|
||||
|
||||
Binary file not shown.
|
Before size: 68 KiB | After size: 57 KiB
@ -1,3 +1,7 @@
|
||||
---
|
||||
toc_depth: 4
|
||||
---
|
||||
|
||||
# vLLM CLI Guide
|
||||
|
||||
The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
|
||||
@ -37,8 +41,15 @@ Start the vLLM OpenAI Compatible API server.
|
||||
|
||||
# To search by keyword
|
||||
vllm serve --help=max
|
||||
|
||||
# To view full help with pager (less/more)
|
||||
vllm serve --help=page
|
||||
```
|
||||
|
||||
### Options
|
||||
|
||||
--8<-- "docs/argparse/serve.md"
|
||||
|
||||
## chat
|
||||
|
||||
Generate chat completions via the running API server.
|
||||
|
||||
@ -5,7 +5,7 @@ The `vllm serve` command is used to launch the OpenAI-compatible server.
|
||||
## CLI Arguments
|
||||
|
||||
The `vllm serve` command is used to launch the OpenAI-compatible server.
|
||||
To see the available CLI arguments, run `vllm serve --help`!
|
||||
To see the available options, take a look at the [CLI Reference](../cli/README.md#options)!
|
||||
|
||||
## Configuration file
|
||||
|
||||
|
||||
@ -3,6 +3,15 @@
|
||||
[](){ #deployment-anyscale }
|
||||
|
||||
[Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.
|
||||
It hosts Ray clusters inside your own AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
|
||||
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, or managing observability stacks.
|
||||
|
||||
Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
|
||||
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like <gh-file:examples/online_serving/run_cluster.sh>.
|
||||
|
||||
When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm).
|
||||
|
||||
## Production-ready vLLM on Anyscale quickstarts
|
||||
|
||||
- [Offline batch inference](https://console.anyscale.com/template-preview/llm_batch_inference?utm_source=vllm_docs)
|
||||
- [Deploy vLLM services](https://console.anyscale.com/template-preview/llm_serving?utm_source=vllm_docs)
|
||||
- [Curate a dataset](https://console.anyscale.com/template-preview/audio-dataset-curation-llm-judge?utm_source=vllm_docs)
|
||||
- [Finetune an LLM](https://console.anyscale.com/template-preview/entity-recognition-with-llms?utm_source=vllm_docs)
|
||||
|
||||
@ -1,26 +1,42 @@
|
||||
# Open WebUI
|
||||
|
||||
1. Install the [Docker](https://docs.docker.com/engine/install/)
|
||||
[Open WebUI](https://github.com/open-webui/open-webui) is an extensible, feature-rich,
|
||||
and user-friendly self-hosted AI platform designed to operate entirely offline.
|
||||
It supports various LLM runners like Ollama and OpenAI-compatible APIs,
|
||||
with built-in RAG capabilities, making it a powerful AI deployment solution.
|
||||
|
||||
2. Start the vLLM server with the supported chat completion model, e.g.
|
||||
To get started with Open WebUI using vLLM, follow these steps:
|
||||
|
||||
```bash
|
||||
vllm serve qwen/Qwen1.5-0.5B-Chat
|
||||
```
|
||||
1. Install [Docker](https://docs.docker.com/engine/install/).
|
||||
|
||||
1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port):
|
||||
2. Start the vLLM server with a supported chat completion model:
|
||||
|
||||
```bash
|
||||
docker run -d -p 3000:8080 \
|
||||
--name open-webui \
|
||||
-v open-webui:/app/backend/data \
|
||||
-e OPENAI_API_BASE_URL=http://<vllm serve host>:<vllm serve port>/v1 \
|
||||
--restart always \
|
||||
ghcr.io/open-webui/open-webui:main
|
||||
```
|
||||
```console
|
||||
vllm serve Qwen/Qwen3-0.6B-Chat
|
||||
```
|
||||
|
||||
1. Open it in the browser: <http://open-webui-host:3000/>
|
||||
!!! note
|
||||
When starting the vLLM server, be sure to specify the host and port using the `--host` and `--port` flags.
|
||||
For example:
|
||||
|
||||
On the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`.
|
||||
```console
|
||||
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000
|
||||
```
|
||||
|
||||

|
||||
3. Start the Open WebUI Docker container:
|
||||
|
||||
```console
|
||||
docker run -d \
|
||||
--name open-webui \
|
||||
-p 3000:8080 \
|
||||
-v open-webui:/app/backend/data \
|
||||
-e OPENAI_API_BASE_URL=http://0.0.0.0:8000/v1 \
|
||||
--restart always \
|
||||
ghcr.io/open-webui/open-webui:main
|
||||
```
|
||||
|
||||
4. Open it in the browser: <http://open-webui-host:3000/>
|
||||
|
||||
At the top of the page, you should see the model `Qwen/Qwen3-0.6B-Chat`.
|
||||
|
||||

|
||||
|
||||
20
docs/deployment/integrations/kuberay.md
Normal file
@ -0,0 +1,20 @@
|
||||
# KubeRay
|
||||
|
||||
[KubeRay](https://github.com/ray-project/kuberay) provides a Kubernetes-native way to run vLLM workloads on Ray clusters.
|
||||
A Ray cluster can be declared in YAML, and the operator then handles pod scheduling, networking configuration, restarts, and blue-green deployments — all while preserving the familiar Kubernetes experience.
|
||||
|
||||
## Why KubeRay instead of manual scripts?
|
||||
|
||||
| Feature | Manual scripts | KubeRay |
|
||||
|---------|-----------------------------------------------------------|---------|
|
||||
| Cluster bootstrap | Manually SSH into every node and run a script | One command to create or update the whole cluster: `kubectl apply -f cluster.yaml` |
|
||||
| Autoscaling | Manual | Automatically patches CRDs for adjusting cluster size |
|
||||
| Upgrades | Tear down & re-create manually | Blue/green deployment updates supported |
|
||||
| Declarative config | Bash flags & environment variables | Git-ops-friendly YAML CRDs (RayCluster/RayService) |
|
||||
|
||||
Using KubeRay reduces the operational burden and simplifies integration of Ray + vLLM with existing Kubernetes workflows (CI/CD, secrets, storage classes, etc.).
|
||||
|
||||
## Learn more
|
||||
|
||||
* ["Serve a Large Language Model using Ray Serve LLM on Kubernetes"](https://docs.ray.io/en/master/cluster/kubernetes/examples/rayserve-llm-example.html) - An end-to-end example of how to serve a model using vLLM, KubeRay, and Ray Serve.
|
||||
* [KubeRay documentation](https://docs.ray.io/en/latest/cluster/kubernetes/index.html)
|
||||
@ -13,6 +13,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
||||
- [Helm](frameworks/helm.md)
|
||||
- [InftyAI/llmaz](integrations/llmaz.md)
|
||||
- [KServe](integrations/kserve.md)
|
||||
- [KubeRay](integrations/kuberay.md)
|
||||
- [kubernetes-sigs/lws](frameworks/lws.md)
|
||||
- [meta-llama/llama-stack](integrations/llamastack.md)
|
||||
- [substratusai/kubeai](integrations/kubeai.md)
|
||||
|
||||
@ -10,6 +10,7 @@ Contents:
|
||||
- [BitBLAS](bitblas.md)
|
||||
- [GGUF](gguf.md)
|
||||
- [GPTQModel](gptqmodel.md)
|
||||
- [INC](inc.md)
|
||||
- [INT4 W4A16](int4.md)
|
||||
- [INT8 W8A8](int8.md)
|
||||
- [FP8 W8A8](fp8.md)
|
||||
|
||||
56
docs/features/quantization/inc.md
Normal file
@ -0,0 +1,56 @@
|
||||
---
|
||||
title: FP8 INC
|
||||
---
|
||||
[](){ #inc }
|
||||
|
||||
vLLM supports FP8 (8-bit floating point) weight and activation quantization using Intel® Neural Compressor (INC) on Intel® Gaudi® 2 and Intel® Gaudi® 3 AI accelerators.
|
||||
Currently, quantization has been validated only for Llama models.
|
||||
|
||||
Intel Gaudi supports quantization of various modules and functions, including, but not limited to `Linear`, `KVCache`, `Matmul` and `Softmax`. For more information, please refer to:
|
||||
[Supported Modules\\Supported Functions\\Custom Patched Modules](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html#supported-modules).
|
||||
|
||||
!!! note
|
||||
Measurement files are required to run quantized models with vLLM on Gaudi accelerators. The FP8 model calibration procedure is described in the [vllm-hpu-extension](https://github.com/HabanaAI/vllm-hpu-extension/tree/main/calibration/README.md) package.
|
||||
|
||||
!!! note
|
||||
`QUANT_CONFIG` is an environment variable that points to the measurement or quantization [JSON config file](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html#supported-json-config-file-options).
|
||||
The measurement configuration file is used during the calibration procedure to collect measurements for a given model. The quantization configuration is used during inference.
|
||||
|
||||
## Run Online Inference Using FP8
|
||||
|
||||
Once you've completed the model calibration process and collected the measurements, you can run FP8 inference with vLLM using the following command:
|
||||
|
||||
```bash
|
||||
export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxabs_measure_g3.json
|
||||
vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --tensor-parallel-size 8
|
||||
```
|
||||
|
||||
!!! tip
|
||||
If you are just prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which can take a long time. However, we do not recommend disabling this feature in production environments as it causes a significant performance drop.
|
||||
|
||||
!!! tip
|
||||
When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this problem, you can use the below environment variables:
|
||||
`VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.
|
||||
    `VLLM_RPC_TIMEOUT` - to adjust the RPC protocol timeout used by the OpenAI-compatible API. This value is in milliseconds, e.g., 600000 equals 10 minutes.
|
||||
|
||||
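If you launch vLLM from the same Python process (for example, offline inference with the `LLM` class), a minimal sketch of setting these timeouts programmatically is shown below; the values are illustrative and both correspond to 10 minutes.

```python
import os

# Illustrative values only; set them before vLLM is initialized.
os.environ["VLLM_ENGINE_ITERATION_TIMEOUT_S"] = "600"   # seconds (10 minutes)
os.environ["VLLM_RPC_TIMEOUT"] = "600000"               # milliseconds (10 minutes)
```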
## Run Offline Inference Using FP8
|
||||
|
||||
To run offline inference (after completing the model calibration process):
|
||||
|
||||
* Set the "QUANT_CONFIG" environment variable to point to a JSON configuration file with QUANTIZE mode.
|
||||
* Pass `quantization=inc` and `kv_cache_dtype=fp8_inc` as parameters to the `LLM` object.
|
||||
* Call the shutdown method of the model_executor at the end of the run.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
llm = LLM("llama3.1/Meta-Llama-3.1-8B-Instruct", quantization="inc", kv_cache_dtype="fp8_inc")
|
||||
...
|
||||
# Call llm.generate on the required prompts and sampling params.
|
||||
...
|
||||
llm.llm_engine.model_executor.shutdown()
|
||||
```
|
||||
|
||||
## Device for the Model's Weights Uploading
|
||||
|
||||
The unquantized weights are first loaded onto the CPU, then quantized and transferred to the target device (HPU) for model execution.
|
||||
This reduces the device memory footprint of model weights, as only quantized weights are stored in the device memory.
|
||||
@ -2,18 +2,19 @@
|
||||
|
||||
The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
|
||||
|
||||
| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | x86 CPU | AWS Neuron | Google TPU |
|
||||
|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-----------|------------------|--------------|
|
||||
| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | ❌ |
|
||||
| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | ❌ |
|
||||
| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ |
|
||||
| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ❌ |
|
||||
| BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ |
|
||||
| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | AWS Neuron | Google TPU |
|
||||
|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------|
|
||||
| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ |
|
||||
| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ |
|
||||
| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ |
|
||||
| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ |
|
||||
| BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | ❌ |
|
||||
|
||||
- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
|
||||
- ✅︎ indicates that the quantization method is supported on the specified hardware.
|
||||
|
||||
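As a rough illustration of how a method from the table is selected at load time, here is a minimal sketch using the offline `LLM` API; the checkpoint name is an illustrative pre-quantized AWQ model and is not taken from this document.

```python
from vllm import LLM, SamplingParams

# Sketch only: choose a quantization method that is supported for both the
# checkpoint and the hardware row in the table above.
llm = LLM(model="TheBloke/Llama-2-7B-Chat-AWQ", quantization="awq")
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```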
@ -103,9 +103,7 @@ When tool_choice='required' is set, the model is guaranteed to generate one or m
|
||||
|
||||
vLLM supports the `tool_choice='none'` option in the chat completion API. When this option is set, the model will not generate any tool calls and will respond with regular text content only, even if tools are defined in the request.
|
||||
|
||||
By default, when `tool_choice='none'` is specified, vLLM excludes tool definitions from the prompt to optimize context usage. To include tool definitions even with `tool_choice='none'`, use the `--expand-tools-even-if-tool-choice-none` option.
|
||||
|
||||
Note: This behavior will change in v0.10.0, where tool definitions will be included by default even with `tool_choice='none'`.
|
||||
However, when `tool_choice='none'` is specified, vLLM includes the tool definitions in the prompt.
|
||||
|
||||
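A minimal sketch of this behavior with the OpenAI Python client; the server URL, model name, and tool schema are illustrative assumptions rather than values taken from this document.

```python
from openai import OpenAI

# Assumes a vLLM OpenAI-compatible server is already running locally.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

# With tool_choice='none', the model answers in plain text and never emits
# tool calls, even though the tool definitions are present in the prompt.
response = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
    tool_choice="none",
)
print(response.choices[0].message.content)
```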
## Automatic Function Calling
|
||||
|
||||
|
||||
@ -28,7 +28,7 @@ To verify that the Intel Gaudi software was correctly installed, run:
|
||||
hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
|
||||
apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
|
||||
pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
|
||||
pip list | grep neural # verify that neural_compressor is installed
|
||||
pip list | grep neural # verify that neural_compressor_pt is installed
|
||||
```
|
||||
|
||||
Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade)
|
||||
@ -120,12 +120,13 @@ docker run \
|
||||
- Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html)
|
||||
for accelerating low-batch latency and throughput
|
||||
- Attention with Linear Biases (ALiBi)
|
||||
- INC quantization
|
||||
|
||||
### Unsupported features
|
||||
|
||||
- Beam search
|
||||
- LoRA adapters
|
||||
- Quantization
|
||||
- AWQ quantization
|
||||
- Prefill chunking (mixed-batch inferencing)
|
||||
|
||||
### Supported configurations
|
||||
|
||||
@ -16,6 +16,7 @@ sys.modules["blake3"] = MagicMock()
|
||||
sys.modules["vllm._C"] = MagicMock()
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser # noqa: E402
|
||||
from vllm.utils import FlexibleArgumentParser # noqa: E402
|
||||
|
||||
logger = logging.getLogger("mkdocs")
|
||||
@ -24,15 +25,18 @@ logger = logging.getLogger("mkdocs")
|
||||
class MarkdownFormatter(HelpFormatter):
|
||||
"""Custom formatter that generates markdown for argument groups."""
|
||||
|
||||
def __init__(self, prog):
|
||||
def __init__(self, prog, starting_heading_level=3):
|
||||
super().__init__(prog,
|
||||
max_help_position=float('inf'),
|
||||
width=float('inf'))
|
||||
self._section_heading_prefix = "#" * starting_heading_level
|
||||
self._argument_heading_prefix = "#" * (starting_heading_level + 1)
|
||||
self._markdown_output = []
|
||||
|
||||
def start_section(self, heading):
|
||||
if heading not in {"positional arguments", "options"}:
|
||||
self._markdown_output.append(f"\n### {heading}\n\n")
|
||||
heading_md = f"\n{self._section_heading_prefix} {heading}\n\n"
|
||||
self._markdown_output.append(heading_md)
|
||||
|
||||
def end_section(self):
|
||||
pass
|
||||
@ -46,9 +50,13 @@ class MarkdownFormatter(HelpFormatter):
|
||||
|
||||
def add_arguments(self, actions):
|
||||
for action in actions:
|
||||
if (len(action.option_strings) == 0
|
||||
or "--help" in action.option_strings):
|
||||
continue
|
||||
|
||||
option_strings = f'`{"`, `".join(action.option_strings)}`'
|
||||
self._markdown_output.append(f"#### {option_strings}\n\n")
|
||||
heading_md = f"{self._argument_heading_prefix} {option_strings}\n\n"
|
||||
self._markdown_output.append(heading_md)
|
||||
|
||||
if choices := action.choices:
|
||||
choices = f'`{"`, `".join(str(c) for c in choices)}`'
|
||||
@ -81,6 +89,14 @@ def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
|
||||
return cls.add_cli_args(parser, **kwargs)
|
||||
|
||||
|
||||
def create_serve_parser() -> FlexibleArgumentParser:
|
||||
"""Create a parser for the serve command with markdown formatting."""
|
||||
parser = FlexibleArgumentParser()
|
||||
parser.formatter_class = lambda prog: MarkdownFormatter(
|
||||
prog, starting_heading_level=4)
|
||||
return make_arg_parser(parser)
|
||||
|
||||
|
||||
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
logger.info("Generating argparse documentation")
|
||||
logger.debug("Root directory: %s", ROOT_DIR.resolve())
|
||||
@ -95,6 +111,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
"engine_args": create_parser(EngineArgs),
|
||||
"async_engine_args": create_parser(AsyncEngineArgs,
|
||||
async_args_only=True),
|
||||
"serve": create_serve_parser(),
|
||||
}
|
||||
|
||||
# Generate documentation for each parser
|
||||
|
||||
@ -316,6 +316,7 @@ Specified using `--task generate`.
|
||||
| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | | ✅︎ | ✅︎ |
|
||||
| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | |
|
||||
| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | |
|
||||
@ -580,14 +581,14 @@ Specified using `--task generate`.
|
||||
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
|
||||
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
|
||||
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaForConditionalGeneration` | LLaVA-1.5 | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ |
|
||||
| `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | | ✅︎ |
|
||||
| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Mistral3ForConditionalGeneration` | Mistral3 | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | |
|
||||
| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ |
|
||||
@ -595,7 +596,7 @@ Specified using `--task generate`.
|
||||
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
|
||||
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `PixtralForConditionalGeneration` | Pixtral | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ |
|
||||
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
|
||||
| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
|
||||
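For the Mistral-format rows above, a minimal offline-loading sketch follows; the flag combination reflects vLLM's Mistral-format loading convention and the model name is one of the examples listed in the table.

```python
from vllm import LLM

# Sketch only: Mistral-format checkpoints are loaded with the Mistral
# tokenizer, config, and weight formats rather than the HF Transformers path.
llm = LLM(
    model="mistralai/Pixtral-12B-2409",
    tokenizer_mode="mistral",
    config_format="mistral",
    load_format="mistral",
)
```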
@ -57,12 +57,20 @@ vllm serve $MODEL --headless --data-parallel-size 4 --data-parallel-size-local 4
|
||||
--data-parallel-address 10.99.48.128 --data-parallel-rpc-port 13345
|
||||
```
|
||||
|
||||
This DP mode can also be used with Ray, in which case only a single launch command is needed irrespective of the number of nodes:
|
||||
This DP mode can also be used with Ray by specifying `--data-parallel-backend=ray`:
|
||||
|
||||
```bash
|
||||
vllm serve $MODEL --data-parallel-size 16 --tensor-parallel-size 2 --data-parallel-backend=ray
|
||||
vllm serve $MODEL --data-parallel-size 4 --data-parallel-size-local 2 \
|
||||
--data-parallel-backend=ray
|
||||
```
|
||||
|
||||
There are several notable differences when using Ray:
|
||||
|
||||
- A single launch command (on any node) is needed to start all local and remote DP ranks, which makes it more convenient than launching separately on each node
|
||||
- There is no need to specify `--data-parallel-address`, and the node where the command is run is used as `--data-parallel-address`
|
||||
- There is no need to specify `--data-parallel-rpc-port`
|
||||
- Remote DP ranks will be allocated based on node resources of the Ray cluster
|
||||
|
||||
Currently, the internal DP load balancing is done within the API server process(es) and is based on the running and waiting queues in each of the engines. This could be made more sophisticated in future by incorporating KV cache aware logic.
|
||||
|
||||
When deploying large DP sizes using this method, the API server process can become a bottleneck. In this case, the orthogonal `--api-server-count` command line option can be used to scale this out (for example `--api-server-count=4`). This is transparent to users - a single HTTP endpoint / port is still exposed. Note that this API server scale-out is "internal" and still confined to the "head" node.
|
||||
|
||||
@ -106,14 +106,13 @@ to enable simultaneous generation and embedding using the same engine instance i
|
||||
|
||||
Models using selective state-space mechanisms instead of standard transformer attention are partially supported.
|
||||
Models that use Mamba-2 layers (e.g., `Mamba2ForCausalLM`) are supported, but models that use older Mamba-1 layers
|
||||
(e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet suported. Please note that these models currently require
|
||||
(e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet supported. Please note that these models currently require
|
||||
enforcing eager mode and disabling prefix caching in V1.
|
||||
|
||||
Models that combine Mamba-2 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
|
||||
`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`). Please note that
|
||||
these models currently require enforcing eager mode, disabling prefix caching, and using the FlashInfer attention
|
||||
backend in V1. It is also necessary to pass a non-standard block size for attention layers (this is not possible
|
||||
using the `vllm serve` CLI yet).
|
||||
backend in V1.
|
||||
|
||||
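A hedged sketch of the constraints described above for hybrid Mamba-2/attention models on V1; the model name comes from the supported-models table, and the FlashInfer backend is selected via the standard `VLLM_ATTENTION_BACKEND` setting.

```python
import os

# Select the FlashInfer attention backend before engine initialization.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"

from vllm import LLM

# Hybrid Mamba-2 + attention models currently require eager mode and
# disabled prefix caching when running on V1.
llm = LLM(
    model="ibm-ai-platform/Bamba-9B",
    enforce_eager=True,
    enable_prefix_caching=False,
)
```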
#### Encoder-Decoder Models
|
||||
|
||||
|
||||
@ -10,7 +10,7 @@ on HuggingFace model repository.
|
||||
|
||||
import os
|
||||
from dataclasses import asdict
|
||||
from typing import NamedTuple, Optional
|
||||
from typing import Any, NamedTuple, Optional
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
from transformers import AutoTokenizer
|
||||
@ -30,7 +30,9 @@ question_per_audio_count = {
|
||||
|
||||
class ModelRequestData(NamedTuple):
|
||||
engine_args: EngineArgs
|
||||
prompt: str
|
||||
prompt: Optional[str] = None
|
||||
prompt_token_ids: Optional[dict[str, list[int]]] = None
|
||||
multi_modal_data: Optional[dict[str, Any]] = None
|
||||
stop_token_ids: Optional[list[int]] = None
|
||||
lora_requests: Optional[list[LoRARequest]] = None
|
||||
|
||||
@ -40,6 +42,60 @@ class ModelRequestData(NamedTuple):
|
||||
# Unless specified, these settings have been tested to work on a single L4.
|
||||
|
||||
|
||||
# Voxtral
|
||||
def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
|
||||
from mistral_common.audio import Audio
|
||||
from mistral_common.protocol.instruct.messages import (
|
||||
AudioChunk,
|
||||
RawAudio,
|
||||
TextChunk,
|
||||
UserMessage,
|
||||
)
|
||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
|
||||
|
||||
model_name = "mistralai/Voxtral-Mini-3B-2507"
|
||||
tokenizer = MistralTokenizer.from_hf_hub(model_name)
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"audio": audio_count},
|
||||
config_format="mistral",
|
||||
load_format="mistral",
|
||||
tokenizer_mode="mistral",
|
||||
enforce_eager=True,
|
||||
enable_chunked_prefill=False,
|
||||
)
|
||||
|
||||
text_chunk = TextChunk(text=question)
|
||||
audios = [
|
||||
Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
|
||||
for i in range(audio_count)
|
||||
]
|
||||
audio_chunks = [
|
||||
AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
|
||||
]
|
||||
|
||||
messages = [UserMessage(content=[*audio_chunks, text_chunk])]
|
||||
|
||||
req = ChatCompletionRequest(messages=messages, model=model_name)
|
||||
|
||||
tokens = tokenizer.encode_chat_completion(req)
|
||||
prompt_ids, audios = tokens.tokens, tokens.audios
|
||||
|
||||
audios_and_sr = [(au.audio_array, au.sampling_rate) for au in audios]
|
||||
|
||||
multi_modal_data = {"audio": audios_and_sr}
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt_token_ids=prompt_ids,
|
||||
multi_modal_data=multi_modal_data,
|
||||
)
|
||||
|
||||
|
||||
# Granite Speech
|
||||
def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
|
||||
# NOTE - the settings in this example are somewhat different than what is
|
||||
@ -243,6 +299,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"voxtral": run_voxtral,
|
||||
"granite_speech": run_granite_speech,
|
||||
"minicpmo": run_minicpmo,
|
||||
"phi4_mm": run_phi4mm,
|
||||
@ -311,16 +368,24 @@ def main(args):
|
||||
temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
|
||||
)
|
||||
|
||||
mm_data = {}
|
||||
if audio_count > 0:
|
||||
mm_data = {
|
||||
"audio": [
|
||||
asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
|
||||
]
|
||||
}
|
||||
mm_data = req_data.multi_modal_data
|
||||
if not mm_data:
|
||||
mm_data = {}
|
||||
if audio_count > 0:
|
||||
mm_data = {
|
||||
"audio": [
|
||||
asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
|
||||
]
|
||||
}
|
||||
|
||||
assert args.num_prompts > 0
|
||||
inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
|
||||
inputs = {"multi_modal_data": mm_data}
|
||||
|
||||
if req_data.prompt:
|
||||
inputs["prompt"] = req_data.prompt
|
||||
else:
|
||||
inputs["prompt_token_ids"] = req_data.prompt_token_ids
|
||||
|
||||
if args.num_prompts > 1:
|
||||
# Batch inference
|
||||
inputs = [inputs] * args.num_prompts
|
||||
|
||||
@ -1,17 +1,31 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
a simple demonstration of RLHF with vLLM, inspired by
|
||||
the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF .
|
||||
It follows the design that, training processes and inference processes
|
||||
are different, and they live on different GPUs.
|
||||
Training processes send prompts to inference processes to generate data,
|
||||
and also synchronize the weights of the model by broadcasting the weights
|
||||
from the training process to the inference process.
|
||||
Note that this is a simple demonstration of one training instance and one
|
||||
inference instance. In practice, there could be multiple training instances
|
||||
and multiple inference instances. For the full implementation, please refer
|
||||
to the OpenRLHF framework.
|
||||
Demonstrates reinforcement learning from human feedback (RLHF) using vLLM and Ray.
|
||||
|
||||
The script separates training and inference workloads onto distinct GPUs
|
||||
so that Ray can manage process placement and inter-process communication.
|
||||
A Hugging Face Transformer model occupies GPU 0 for training, whereas a
|
||||
tensor-parallel vLLM inference engine occupies GPU 1–2.
|
||||
|
||||
The example performs the following steps:
|
||||
|
||||
* Load the training model on GPU 0.
|
||||
* Split the inference model across GPUs 1–2 using vLLM's tensor parallelism
|
||||
and Ray placement groups.
|
||||
* Generate text from a list of prompts using the inference engine.
|
||||
* Update the weights of the training model and broadcast the updated weights
|
||||
to the inference engine by using a Ray collective RPC group. Note that
|
||||
for demonstration purposes we simply zero out the weights.
|
||||
|
||||
For a production-ready implementation that supports multiple training and
|
||||
inference replicas, see the OpenRLHF framework:
|
||||
https://github.com/OpenRLHF/OpenRLHF
|
||||
|
||||
This example assumes a single-node cluster with three GPUs, but Ray
|
||||
supports multi-node clusters. vLLM expects the GPUs are only used for vLLM
|
||||
workloads. Residual GPU activity interferes with vLLM memory profiling and
|
||||
causes unexpected behavior.
|
||||
"""
|
||||
|
||||
import os
|
||||
@ -28,29 +42,27 @@ from vllm.utils import get_ip, get_open_port
|
||||
|
||||
|
||||
class MyLLM(LLM):
|
||||
"""Configure the vLLM worker for Ray placement group execution."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
# a hack to make the script work.
|
||||
# stop ray from manipulating CUDA_VISIBLE_DEVICES
|
||||
# at the top-level
|
||||
# Remove the top-level CUDA_VISIBLE_DEVICES variable set by Ray
|
||||
# so that vLLM can manage its own device placement within the worker.
|
||||
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
|
||||
"""
|
||||
Start the training process, here we use huggingface transformers
|
||||
as an example to hold a model on GPU 0.
|
||||
"""
|
||||
|
||||
# Load the OPT-125M model onto GPU 0 for the training workload.
|
||||
train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
|
||||
train_model.to("cuda:0")
|
||||
"""
|
||||
Start the inference process, here we use vLLM to hold a model on GPU 1 and
|
||||
GPU 2. For the details on how to use ray, please refer to the ray
|
||||
documentation https://docs.ray.io/en/latest/ .
|
||||
"""
|
||||
|
||||
# Initialize Ray and set the visible devices. The vLLM engine will
|
||||
# be placed on GPUs 1 and 2.
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
|
||||
ray.init()
|
||||
|
||||
# Create a placement group that reserves GPU 1–2 for the vLLM inference engine.
|
||||
# Learn more about Ray placement groups:
|
||||
# https://docs.ray.io/en/latest/placement-groups.html
|
||||
pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2)
|
||||
ray.get(pg_inference.ready())
|
||||
scheduling_inference = PlacementGroupSchedulingStrategy(
|
||||
@ -58,10 +70,9 @@ scheduling_inference = PlacementGroupSchedulingStrategy(
|
||||
placement_group_capture_child_tasks=True,
|
||||
placement_group_bundle_index=0,
|
||||
)
|
||||
"""
|
||||
launch the vLLM inference engine.
|
||||
here we use `enforce_eager` to reduce the start time.
|
||||
"""
|
||||
|
||||
# Launch the vLLM inference engine. The `enforce_eager` flag reduces
|
||||
# start-up latency.
|
||||
llm = ray.remote(
|
||||
num_cpus=0,
|
||||
num_gpus=0,
|
||||
@ -74,7 +85,7 @@ llm = ray.remote(
|
||||
distributed_executor_backend="ray",
|
||||
)
|
||||
|
||||
# Generate texts from the prompts.
|
||||
# Generate text from the prompts.
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
@ -93,8 +104,8 @@ for output in outputs:
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
||||
print("-" * 50)
|
||||
|
||||
# set up the communication between the training process
|
||||
# and the inference engine.
|
||||
# Set up the communication channel between the training process and the
|
||||
# inference engine.
|
||||
master_address = get_ip()
|
||||
master_port = get_open_port()
|
||||
|
||||
@ -107,21 +118,23 @@ model_update_group = stateless_init_process_group(
|
||||
)
|
||||
ray.get(handle)
|
||||
|
||||
# simulate training, modify the weights of the model.
|
||||
# Simulate a training step by zeroing out all model weights.
|
||||
# In a real RLHF training loop the weights would be updated using the gradient
|
||||
# from an RL objective such as PPO on a reward model.
|
||||
for name, p in train_model.named_parameters():
|
||||
p.data.zero_()
|
||||
|
||||
# sync weight from the training process to the inference engine.
|
||||
# Synchronize the updated weights to the inference engine.
|
||||
for name, p in train_model.named_parameters():
|
||||
handle = llm.collective_rpc.remote("update_weight", args=(name, p.dtype, p.shape))
|
||||
model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream())
|
||||
ray.get(handle)
|
||||
|
||||
# check if the weights are updated.
|
||||
# Verify that the inference weights have been updated.
|
||||
assert all(ray.get(llm.collective_rpc.remote("check_weights_changed")))
|
||||
|
||||
# use the updated model to generate texts, they will be nonsense
|
||||
# because the weights are all zeros.
|
||||
# Generate text with the updated model. The output is expected to be nonsense
|
||||
# because the weights are zero.
|
||||
outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
|
||||
print("-" * 50)
|
||||
for output in outputs_updated:
|
||||
|
||||
@ -84,6 +84,7 @@ def main():
|
||||
gpu_memory_utilization=0.8,
|
||||
speculative_config=speculative_config,
|
||||
disable_log_stats=False,
|
||||
max_model_len=16384,
|
||||
)
|
||||
|
||||
sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
|
||||
|
||||
@ -1,35 +1,81 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Launch a Ray cluster inside Docker for vLLM inference.
|
||||
#
|
||||
# This script can start either a head node or a worker node, depending on the
|
||||
# --head or --worker flag provided as the third positional argument.
|
||||
#
|
||||
# Usage:
|
||||
# 1. Designate one machine as the head node and execute:
|
||||
# bash run_cluster.sh \
|
||||
# vllm/vllm-openai \
|
||||
# <head_node_ip> \
|
||||
# --head \
|
||||
# /abs/path/to/huggingface/cache \
|
||||
# -e VLLM_HOST_IP=<head_node_ip>
|
||||
#
|
||||
# 2. On every worker machine, execute:
|
||||
# bash run_cluster.sh \
|
||||
# vllm/vllm-openai \
|
||||
# <head_node_ip> \
|
||||
# --worker \
|
||||
# /abs/path/to/huggingface/cache \
|
||||
# -e VLLM_HOST_IP=<worker_node_ip>
|
||||
#
|
||||
# Each worker requires a unique VLLM_HOST_IP value.
|
||||
# Keep each terminal session open. Closing a session stops the associated Ray
|
||||
# node and thereby shuts down the entire cluster.
|
||||
# Every machine must be reachable at the supplied IP address.
|
||||
#
|
||||
# The container is named "node-<random_suffix>". To open a shell inside
|
||||
# a container after launch, use:
|
||||
# docker exec -it node-<random_suffix> /bin/bash
|
||||
#
|
||||
# Then, you can execute vLLM commands on the Ray cluster as if it were a
|
||||
# single machine, e.g. vllm serve ...
|
||||
#
|
||||
# To stop the container, use:
|
||||
# docker stop node-<random_suffix>
|
||||
|
||||
# Check for minimum number of required arguments
|
||||
# Check for minimum number of required arguments.
|
||||
if [ $# -lt 4 ]; then
|
||||
echo "Usage: $0 docker_image head_node_address --head|--worker path_to_hf_home [additional_args...]"
|
||||
echo "Usage: $0 docker_image head_node_ip --head|--worker path_to_hf_home [additional_args...]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Assign the first three arguments and shift them away
|
||||
# Extract the mandatory positional arguments and remove them from $@.
|
||||
DOCKER_IMAGE="$1"
|
||||
HEAD_NODE_ADDRESS="$2"
|
||||
NODE_TYPE="$3" # Should be --head or --worker
|
||||
NODE_TYPE="$3" # Should be --head or --worker.
|
||||
PATH_TO_HF_HOME="$4"
|
||||
shift 4
|
||||
|
||||
# Additional arguments are passed directly to the Docker command
|
||||
# Preserve any extra arguments so they can be forwarded to Docker.
|
||||
ADDITIONAL_ARGS=("$@")
|
||||
|
||||
# Validate node type
|
||||
# Validate the NODE_TYPE argument.
|
||||
if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then
|
||||
echo "Error: Node type must be --head or --worker"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Define a function to cleanup on EXIT signal
|
||||
# Generate a unique container name with random suffix.
|
||||
# Docker container names must be unique on each host.
|
||||
# The random suffix allows multiple Ray containers to run simultaneously on the same machine,
|
||||
# for example, on a multi-GPU machine.
|
||||
CONTAINER_NAME="node-${RANDOM}"
|
||||
|
||||
# Define a cleanup routine that removes the container when the script exits.
|
||||
# This prevents orphaned containers from accumulating if the script is interrupted.
|
||||
cleanup() {
|
||||
docker stop node
|
||||
docker rm node
|
||||
docker stop "${CONTAINER_NAME}"
|
||||
docker rm "${CONTAINER_NAME}"
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
# Command setup for head or worker node
|
||||
# Build the Ray start command based on the node role.
|
||||
# The head node manages the cluster and accepts connections on port 6379,
|
||||
# while workers connect to the head's address.
|
||||
RAY_START_CMD="ray start --block"
|
||||
if [ "${NODE_TYPE}" == "--head" ]; then
|
||||
RAY_START_CMD+=" --head --port=6379"
|
||||
@ -37,11 +83,15 @@ else
|
||||
RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
|
||||
fi
|
||||
|
||||
# Run the docker command with the user specified parameters and additional arguments
|
||||
# Launch the container with the assembled parameters.
|
||||
# --network host: Allows Ray nodes to communicate directly via host networking
|
||||
# --shm-size 10.24g: Increases shared memory
|
||||
# --gpus all: Gives container access to all GPUs on the host
|
||||
# -v HF_HOME: Mounts HuggingFace cache to avoid re-downloading models
|
||||
docker run \
|
||||
--entrypoint /bin/bash \
|
||||
--network host \
|
||||
--name node \
|
||||
--name "${CONTAINER_NAME}" \
|
||||
--shm-size 10.24g \
|
||||
--gpus all \
|
||||
-v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
|
||||
|
||||
@ -61,6 +61,7 @@ plugins:
|
||||
- search
|
||||
- autorefs
|
||||
- awesome-nav
|
||||
- glightbox
|
||||
# For API reference generation
|
||||
- api-autonav:
|
||||
modules: ["vllm"]
|
||||
|
||||
183
pyproject.toml
@ -174,3 +174,186 @@ respect-ignore-files = true
|
||||
|
||||
[tool.ty.environment]
|
||||
python = "./.venv"
|
||||
|
||||
[tool.typos.files]
|
||||
# these files may be written in non english words
|
||||
extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
|
||||
"benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
|
||||
"vllm/third_party/*"]
|
||||
ignore-hidden = true
|
||||
ignore-files = true
|
||||
ignore-dot = true
|
||||
ignore-vcs = true
|
||||
ignore-global = true
|
||||
ignore-parent = true
|
||||
|
||||
[tool.typos.default]
|
||||
binary = false
|
||||
check-filename = false
|
||||
check-file = true
|
||||
unicode = true
|
||||
ignore-hex = true
|
||||
identifier-leading-digits = false
|
||||
locale = "en"
|
||||
extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
|
||||
".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*",
|
||||
".*[Tt]h[rR].*"]
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.default.extend-identifiers]
|
||||
bbc5b7ede = "bbc5b7ede"
|
||||
womens_doubles = "womens_doubles"
|
||||
v_2nd = "v_2nd"
|
||||
# splitted_input = "splitted_input"
|
||||
NOOPs = "NOOPs"
|
||||
typ = "typ"
|
||||
nin_shortcut = "nin_shortcut"
|
||||
UperNetDecoder = "UperNetDecoder"
|
||||
subtile = "subtile"
|
||||
cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
|
||||
SFOuput = "SFOuput"
|
||||
# huggingface transformers repo uses these words
|
||||
depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
|
||||
DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
|
||||
depthwise_seperable_CNN = "depthwise_seperable_CNN"
|
||||
|
||||
[tool.typos.default.extend-words]
|
||||
iy = "iy"
|
||||
tendencias = "tendencias"
|
||||
# intel cpu features
|
||||
tme = "tme"
|
||||
dout = "dout"
|
||||
Pn = "Pn"
|
||||
arange = "arange"
|
||||
|
||||
[tool.typos.type.py]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.py.extend-identifiers]
|
||||
arange = "arange"
|
||||
NDArray = "NDArray"
|
||||
EOFError = "EOFError"
|
||||
fo = "fo"
|
||||
ba = "ba"
|
||||
|
||||
[tool.typos.type.py.extend-words]
|
||||
|
||||
[tool.typos.type.cpp]
|
||||
extend-glob = ["*.cu"]
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.cpp.extend-identifiers]
|
||||
countr_one = "countr_one"
|
||||
k_ot = "k_ot"
|
||||
ot = "ot"
|
||||
|
||||
[tool.typos.type.cpp.extend-words]
|
||||
|
||||
[tool.typos.type.rust]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.rust.extend-identifiers]
|
||||
flate2 = "flate2"
|
||||
|
||||
[tool.typos.type.rust.extend-words]
|
||||
ser = "ser"
|
||||
|
||||
[tool.typos.type.lock]
|
||||
extend-glob = []
|
||||
check-file = false
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.lock.extend-identifiers]
|
||||
|
||||
[tool.typos.type.lock.extend-words]
|
||||
|
||||
[tool.typos.type.jl]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.jl.extend-identifiers]
|
||||
|
||||
[tool.typos.type.jl.extend-words]
|
||||
modul = "modul"
|
||||
egals = "egals"
|
||||
usig = "usig"
|
||||
egal = "egal"
|
||||
|
||||
[tool.typos.type.go]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.go.extend-identifiers]
|
||||
flate = "flate"
|
||||
|
||||
[tool.typos.type.go.extend-words]
|
||||
|
||||
[tool.typos.type.css]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.css.extend-identifiers]
|
||||
nd = "nd"
|
||||
|
||||
[tool.typos.type.css.extend-words]
|
||||
|
||||
[tool.typos.type.man]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.man.extend-identifiers]
|
||||
Nd = "Nd"
|
||||
|
||||
[tool.typos.type.man.extend-words]
|
||||
|
||||
[tool.typos.type.cert]
|
||||
extend-glob = []
|
||||
check-file = false
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.cert.extend-identifiers]
|
||||
|
||||
[tool.typos.type.cert.extend-words]
|
||||
|
||||
[tool.typos.type.sh]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.sh.extend-identifiers]
|
||||
ot = "ot"
|
||||
|
||||
[tool.typos.type.sh.extend-words]
|
||||
|
||||
[tool.typos.type.vimscript]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.vimscript.extend-identifiers]
|
||||
windo = "windo"
|
||||
|
||||
[tool.typos.type.vimscript.extend-words]
|
||||
|
||||
@ -25,7 +25,7 @@ outlines_core == 0.2.10
|
||||
# required for outlines backend disk cache
|
||||
diskcache == 5.6.3
|
||||
lark == 1.2.2
|
||||
xgrammar == 0.1.19; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
|
||||
xgrammar == 0.1.21; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
|
||||
typing_extensions >= 4.10
|
||||
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
||||
partial-json-parser # used for parsing partial JSON outputs
|
||||
@ -33,7 +33,7 @@ pyzmq >= 25.0.0
|
||||
msgspec
|
||||
gguf >= 0.13.0
|
||||
importlib_metadata; python_version < '3.10'
|
||||
mistral_common[opencv] >= 1.6.2
|
||||
mistral_common[opencv] >= 1.8.0
|
||||
opencv-python-headless >= 4.11.0 # required for video IO
|
||||
pyyaml
|
||||
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
|
||||
@ -47,3 +47,4 @@ python-json-logger # Used by logging as per examples/others/logging_configuratio
|
||||
scipy # Required for phi-4-multimodal-instruct
|
||||
ninja # Required for xgrammar, rocm, tpu, xpu
|
||||
pybase64 # fast base64 implementation
|
||||
cbor2 # Required for cross-language serialization of hashable objects
|
||||
|
||||
@ -4,6 +4,7 @@ mkdocs-material
|
||||
mkdocstrings-python
|
||||
mkdocs-gen-files
|
||||
mkdocs-awesome-nav
|
||||
mkdocs-glightbox
|
||||
python-markdown-math
|
||||
regex
|
||||
ruff
|
||||
@ -11,10 +12,12 @@ ruff
|
||||
# Required for argparse hook only
|
||||
-f https://download.pytorch.org/whl/cpu
|
||||
cachetools
|
||||
cbor2
|
||||
cloudpickle
|
||||
fastapi
|
||||
msgspec
|
||||
openai
|
||||
partial-json-parser
|
||||
pillow
|
||||
psutil
|
||||
pybase64
|
||||
|
||||
@ -23,7 +23,7 @@ jiwer # required for audio tests
|
||||
timm # required for internvl test
|
||||
transformers_stream_generator # required for qwen-vl test
|
||||
matplotlib # required for qwen-vl test
|
||||
mistral_common[opencv] >= 1.6.2 # required for pixtral test
|
||||
mistral_common[opencv] >= 1.8.0 # required for voxtral test
|
||||
num2words # required for smolvlm test
|
||||
opencv-python-headless >= 4.11.0 # required for video test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
|
||||
@ -28,7 +28,7 @@ torchvision==0.22.0
|
||||
transformers_stream_generator # required for qwen-vl test
|
||||
mamba_ssm # required for plamo2 test
|
||||
matplotlib # required for qwen-vl test
|
||||
mistral_common[opencv] >= 1.6.2 # required for pixtral test
|
||||
mistral_common[opencv] >= 1.8.0 # required for voxtral test
|
||||
num2words # required for smolvlm test
|
||||
opencv-python-headless >= 4.11.0 # required for video test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
|
||||
@ -305,7 +305,7 @@ mbstrdecoder==1.1.3
|
||||
# typepy
|
||||
mdurl==0.1.2
|
||||
# via markdown-it-py
|
||||
mistral-common==1.6.2
|
||||
mistral-common==1.8.0
|
||||
# via -r requirements/test.in
|
||||
more-itertools==10.5.0
|
||||
# via lm-eval
|
||||
@ -518,6 +518,8 @@ pyasn1-modules==0.4.2
|
||||
# via google-auth
|
||||
pybind11==2.13.6
|
||||
# via lm-eval
|
||||
pycountry==24.6.1
|
||||
# via pydantic-extra-types
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pycryptodomex==3.22.0
|
||||
@ -528,9 +530,12 @@ pydantic==2.11.5
|
||||
# datamodel-code-generator
|
||||
# mistral-common
|
||||
# mteb
|
||||
# pydantic-extra-types
|
||||
# ray
|
||||
pydantic-core==2.33.2
|
||||
# via pydantic
|
||||
pydantic-extra-types==2.10.5
|
||||
# via mistral-common
|
||||
pygments==2.18.0
|
||||
# via rich
|
||||
pyparsing==3.2.0
|
||||
@ -835,6 +840,7 @@ typing-extensions==4.12.2
|
||||
# pqdm
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
# pydantic-extra-types
|
||||
# torch
|
||||
# typer
|
||||
# typing-inspection
|
||||
|
||||
@ -18,9 +18,9 @@ setuptools==78.1.0
|
||||
--find-links https://storage.googleapis.com/libtpu-releases/index.html
|
||||
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
|
||||
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
||||
torch==2.9.0.dev20250703
|
||||
torchvision==0.24.0.dev20250703
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250703-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250703-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250703-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
torch==2.9.0.dev20250711
|
||||
torchvision==0.24.0.dev20250711
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250711-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250711-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250711-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
|
||||
|
||||
3
setup.py
@ -692,7 +692,8 @@ setup(
|
||||
"tensorizer": ["tensorizer==2.10.1"],
|
||||
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
|
||||
"runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
|
||||
"audio": ["librosa", "soundfile"], # Required for audio processing
|
||||
"audio": ["librosa", "soundfile",
|
||||
"mistral_common[audio]"], # Required for audio processing
|
||||
"video": [] # Kept for backwards compatibility
|
||||
},
|
||||
cmdclass=cmdclass,
|
||||
|
||||
@@ -29,7 +29,7 @@ def _query_server_long(prompt: str) -> dict:


@pytest.fixture
def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
def api_server(distributed_executor_backend: str):
script_path = Path(__file__).parent.joinpath(
"api_server_async_engine.py").absolute()
commands = [
@@ -40,8 +40,6 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
"facebook/opt-125m",
"--host",
"127.0.0.1",
"--tokenizer-pool-size",
str(tokenizer_pool_size),
"--distributed-executor-backend",
distributed_executor_backend,
]
@@ -54,10 +52,8 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
uvicorn_process.terminate()


@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
def test_api_server(api_server, tokenizer_pool_size: int,
distributed_executor_backend: str):
def test_api_server(api_server, distributed_executor_backend: str):
"""
Run the API server and test it.
@@ -26,6 +26,30 @@ def test_use_cudagraphs_dynamic(monkeypatch):
assert not vllm_config.compilation_config.use_cudagraph


# NB: We don't test VLLM_DISABLE_COMPILE_CACHE=0 because that depends
# on the state of the cache directory on the current machine, which
# may be influenced by other tests.
@pytest.mark.parametrize("val", ["1"])
def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
assert vllm.envs.VLLM_USE_V1

# spawn means that the counters are in the same process.
monkeypatch.setenv('VLLM_WORKER_MULTIPROC_METHOD', "spawn")
monkeypatch.setenv('VLLM_DISABLE_COMPILE_CACHE', val)

compilation_config = {
"use_cudagraph": False, # speed things up a bit
}
with (
compilation_counter.expect(num_cache_entries_updated=0,
num_compiled_artifacts_saved=0),
# loading the model causes compilation (if enabled) to happen
vllm_runner('facebook/opt-125m',
compilation_config=compilation_config,
gpu_memory_utilization=0.4) as _):
pass


@pytest.mark.parametrize("enabled", [True, False])
def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
assert vllm.envs.VLLM_USE_V1
@ -132,9 +132,7 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int,
|
||||
dtype=dtype,
|
||||
seed=42)
|
||||
|
||||
all_reduce_fusion_pass = AllReduceFusionPass(
|
||||
vllm_config, vllm_config.compilation_config.pass_config.
|
||||
fi_allreduce_fusion_max_token_num)
|
||||
all_reduce_fusion_pass = AllReduceFusionPass(vllm_config)
|
||||
backend = TestBackend(all_reduce_fusion_pass)
|
||||
|
||||
model = test_model_cls(hidden_size)
|
||||
|
||||
@ -804,7 +804,7 @@ class VllmRunner:
|
||||
|
||||
def get_inputs(
|
||||
self,
|
||||
prompts: Union[list[str], list[torch.Tensor]],
|
||||
prompts: Union[list[str], list[torch.Tensor], list[int]],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
@ -826,11 +826,16 @@ class VllmRunner:
|
||||
if audios is not None and (audio := audios[i]) is not None:
|
||||
multi_modal_data["audio"] = audio
|
||||
|
||||
text_prompt_kwargs = {
|
||||
("prompt" if isinstance(prompt, str) else "prompt_embeds"):
|
||||
prompt,
|
||||
text_prompt_kwargs: dict[str, Any] = {
|
||||
"multi_modal_data": multi_modal_data or None
|
||||
}
|
||||
if isinstance(prompt, str):
|
||||
text_prompt_kwargs["prompt"] = prompt
|
||||
elif isinstance(prompt, list):
|
||||
text_prompt_kwargs["prompt_token_ids"] = prompt
|
||||
else:
|
||||
text_prompt_kwargs["prompt_embeds"] = prompt
|
||||
|
||||
inputs.append(TextPrompt(**text_prompt_kwargs))
|
||||
|
||||
return inputs
|
||||
|
||||
@ -14,8 +14,9 @@ from typing import Literal, NamedTuple, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import TaskOption
|
||||
from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, TaskOption
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.config import get_config
|
||||
|
||||
from ..models.registry import HF_EXAMPLE_MODELS
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
@ -158,7 +159,7 @@ TEXT_GENERATION_MODELS = {
|
||||
"databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
|
||||
"Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
|
||||
"deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(),
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(tp_base=2),
|
||||
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
|
||||
"tiiuae/falcon-7b": PPTestSettings.fast(),
|
||||
"google/gemma-1.1-2b-it": PPTestSettings.fast(),
|
||||
@ -210,9 +211,11 @@ TEXT_GENERATION_MODELS = {
|
||||
|
||||
EMBEDDING_MODELS = { # type: ignore[var-annotated]
|
||||
# [Text-only]
|
||||
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
|
||||
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
|
||||
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(load_format="dummy"),
|
||||
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(task="embed"),
|
||||
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(task="embed"),
|
||||
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
|
||||
load_format="dummy", task="embed"
|
||||
),
|
||||
}
|
||||
|
||||
MULTIMODAL_MODELS = {
|
||||
@ -248,6 +251,7 @@ TEST_MODELS = [
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"ArthurZ/Ilama-3.2-1B",
|
||||
"ibm/PowerLM-3b",
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat",
|
||||
# [LANGUAGE EMBEDDING]
|
||||
"intfloat/e5-mistral-7b-instruct",
|
||||
"BAAI/bge-multilingual-gemma2",
|
||||
@ -287,6 +291,11 @@ def _compare_tp(
|
||||
trust_remote_code = model_info.trust_remote_code
|
||||
tokenizer_mode = model_info.tokenizer_mode
|
||||
hf_overrides = model_info.hf_overrides
|
||||
hf_config = get_config(model_id, trust_remote_code)
|
||||
|
||||
dtype = "float16"
|
||||
if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
|
||||
dtype = "bfloat16"
|
||||
|
||||
if load_format == "dummy":
|
||||
# Avoid OOM
|
||||
@ -316,7 +325,7 @@ def _compare_tp(
|
||||
common_args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"float16",
|
||||
dtype,
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--max-num-seqs",
|
||||
@ -338,6 +347,7 @@ def _compare_tp(
|
||||
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
|
||||
|
||||
specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
|
||||
testing_ray_compiled_graph = False
|
||||
if distributed_backend == "ray" and (vllm_major_version == "1"
|
||||
or specific_case):
|
||||
# For V1, test Ray Compiled Graph for all the tests
|
||||
@ -351,6 +361,7 @@ def _compare_tp(
|
||||
# Temporary. Currently when zeromq + SPMD is used, it does not properly
|
||||
# terminate because of a Ray Compiled Graph issue.
|
||||
common_args.append("--disable-frontend-multiprocessing")
|
||||
testing_ray_compiled_graph = True
|
||||
elif distributed_backend == "mp":
|
||||
# Both V0/V1 of multiprocessing executor support PP
|
||||
pp_env = {
|
||||
@ -394,7 +405,6 @@ def _compare_tp(
|
||||
tp_env,
|
||||
method=method)
|
||||
except Exception:
|
||||
testing_ray_compiled_graph = pp_env is not None
|
||||
if testing_ray_compiled_graph and vllm_major_version == "0":
|
||||
# Ray Compiled Graph tests are flaky for V0,
|
||||
# so we don't want to fail the test
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
from argparse import ArgumentError, ArgumentTypeError
|
||||
from argparse import ArgumentError
|
||||
from contextlib import nullcontext
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Annotated, Literal, Optional
|
||||
@ -12,8 +12,8 @@ import pytest
|
||||
from vllm.config import CompilationConfig, config
|
||||
from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs,
|
||||
get_type, get_type_hints, is_not_builtin,
|
||||
is_type, literal_to_kwargs, nullable_kvs,
|
||||
optional_type, parse_type)
|
||||
is_type, literal_to_kwargs, optional_type,
|
||||
parse_type)
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
@ -25,18 +25,10 @@ from vllm.utils import FlexibleArgumentParser
|
||||
"foo": 1,
|
||||
"bar": 2
|
||||
}),
|
||||
(json.loads, "foo=1,bar=2", {
|
||||
"foo": 1,
|
||||
"bar": 2
|
||||
}),
|
||||
])
|
||||
def test_parse_type(type, value, expected):
|
||||
parse_type_func = parse_type(type)
|
||||
context = nullcontext()
|
||||
if value == "foo=1,bar=2":
|
||||
context = pytest.warns(DeprecationWarning)
|
||||
with context:
|
||||
assert parse_type_func(value) == expected
|
||||
assert parse_type_func(value) == expected
|
||||
|
||||
|
||||
def test_optional_type():
|
||||
@ -203,34 +195,6 @@ def test_get_kwargs():
|
||||
assert kwargs["from_cli_config2"]["type"]('{"field": 2}').field == 4
|
||||
|
||||
|
||||
@pytest.mark.parametrize(("arg", "expected"), [
|
||||
(None, dict()),
|
||||
("image=16", {
|
||||
"image": 16
|
||||
}),
|
||||
("image=16,video=2", {
|
||||
"image": 16,
|
||||
"video": 2
|
||||
}),
|
||||
("Image=16, Video=2", {
|
||||
"image": 16,
|
||||
"video": 2
|
||||
}),
|
||||
])
|
||||
def test_limit_mm_per_prompt_parser(arg, expected):
|
||||
"""This functionality is deprecated and will be removed in the future.
|
||||
This argument should be passed as JSON string instead.
|
||||
|
||||
TODO: Remove with nullable_kvs."""
|
||||
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
||||
if arg is None:
|
||||
args = parser.parse_args([])
|
||||
else:
|
||||
args = parser.parse_args(["--limit-mm-per-prompt", arg])
|
||||
|
||||
assert args.limit_mm_per_prompt == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("arg", "expected"),
|
||||
[
|
||||
@ -326,18 +290,6 @@ def test_prefix_cache_default():
|
||||
assert not engine_args.enable_prefix_caching
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("arg"),
|
||||
[
|
||||
"image", # Missing =
|
||||
"image=4,image=5", # Conflicting values
|
||||
"image=video=4" # Too many = in tokenized arg
|
||||
])
|
||||
def test_bad_nullable_kvs(arg):
|
||||
with pytest.raises(ArgumentTypeError):
|
||||
nullable_kvs(arg)
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(("arg", "expected", "option"), [
|
||||
(None, None, "mm-processor-kwargs"),
|
||||
|
||||
@ -176,4 +176,8 @@ async def test_invocations(server: RemoteOpenAIServer):
|
||||
invocation_output = invocation_response.json()
|
||||
|
||||
assert classification_output.keys() == invocation_output.keys()
|
||||
assert classification_output["data"] == invocation_output["data"]
|
||||
for classification_data, invocation_data in zip(
|
||||
classification_output["data"], invocation_output["data"]):
|
||||
assert classification_data.keys() == invocation_data.keys()
|
||||
assert classification_data["probs"] == pytest.approx(
|
||||
invocation_data["probs"], rel=0.01)
|
||||
|
||||
@ -14,6 +14,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
from ...models.language.pooling.embed_utils import (
|
||||
run_embedding_correctness_test)
|
||||
from ...models.utils import check_embeddings_close
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
@ -321,7 +322,13 @@ async def test_invocations(server: RemoteOpenAIServer,
|
||||
invocation_output = invocation_response.json()
|
||||
|
||||
assert completion_output.keys() == invocation_output.keys()
|
||||
assert completion_output["data"] == invocation_output["data"]
|
||||
for completion_data, invocation_data in zip(completion_output["data"],
|
||||
invocation_output["data"]):
|
||||
assert completion_data.keys() == invocation_data.keys()
|
||||
check_embeddings_close(embeddings_0_lst=[completion_data["embedding"]],
|
||||
embeddings_1_lst=[invocation_data["embedding"]],
|
||||
name_0="completion",
|
||||
name_1="invocation")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -355,4 +362,10 @@ async def test_invocations_conversation(server: RemoteOpenAIServer):
|
||||
invocation_output = invocation_response.json()
|
||||
|
||||
assert chat_output.keys() == invocation_output.keys()
|
||||
assert chat_output["data"] == invocation_output["data"]
|
||||
for chat_data, invocation_data in zip(chat_output["data"],
|
||||
invocation_output["data"]):
|
||||
assert chat_data.keys() == invocation_data.keys()
|
||||
check_embeddings_close(embeddings_0_lst=[chat_data["embedding"]],
|
||||
embeddings_1_lst=[invocation_data["embedding"]],
|
||||
name_0="chat",
|
||||
name_1="invocation")
|
||||
|
||||
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from typing import Final

import pytest
@@ -29,7 +30,7 @@ def server():
"--enforce-eager",
"--trust-remote-code",
"--limit-mm-per-prompt",
f"image={MAXIMUM_IMAGES}",
json.dumps({"image": MAXIMUM_IMAGES}),
]

with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@ -281,7 +281,13 @@ async def test_invocations(server: RemoteOpenAIServer):
|
||||
invocation_output = invocation_response.json()
|
||||
|
||||
assert completion_output.keys() == invocation_output.keys()
|
||||
assert completion_output["data"] == invocation_output["data"]
|
||||
for completion_data, invocation_data in zip(completion_output["data"],
|
||||
invocation_output["data"]):
|
||||
assert completion_data.keys() == invocation_data.keys()
|
||||
check_embeddings_close(embeddings_0_lst=completion_data["data"],
|
||||
embeddings_1_lst=invocation_data["data"],
|
||||
name_0="completion",
|
||||
name_1="invocation")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -314,4 +320,10 @@ async def test_invocations_conversation(server: RemoteOpenAIServer):
|
||||
invocation_output = invocation_response.json()
|
||||
|
||||
assert chat_output.keys() == invocation_output.keys()
|
||||
assert chat_output["data"] == invocation_output["data"]
|
||||
for chat_data, invocation_data in zip(chat_output["data"],
|
||||
invocation_output["data"]):
|
||||
assert chat_data.keys() == invocation_data.keys()
|
||||
check_embeddings_close(embeddings_0_lst=chat_data["data"],
|
||||
embeddings_1_lst=invocation_data["data"],
|
||||
name_0="chat",
|
||||
name_1="invocation")
|
||||
|
||||
@ -120,4 +120,8 @@ def test_invocations(server: RemoteOpenAIServer):
|
||||
invocation_output = invocation_response.json()
|
||||
|
||||
assert rerank_output.keys() == invocation_output.keys()
|
||||
assert rerank_output["results"] == invocation_output["results"]
|
||||
for rerank_result, invocations_result in zip(rerank_output["results"],
|
||||
invocation_output["results"]):
|
||||
assert rerank_result.keys() == invocations_result.keys()
|
||||
assert rerank_result["relevance_score"] == pytest.approx(
|
||||
invocations_result["relevance_score"], rel=0.01)
|
||||
|
||||
@ -215,4 +215,8 @@ class TestModel:
|
||||
invocation_output = invocation_response.json()
|
||||
|
||||
assert score_output.keys() == invocation_output.keys()
|
||||
assert score_output["data"] == invocation_output["data"]
|
||||
for score_data, invocation_data in zip(score_output["data"],
|
||||
invocation_output["data"]):
|
||||
assert score_data.keys() == invocation_data.keys()
|
||||
assert score_data["score"] == pytest.approx(
|
||||
invocation_data["score"], rel=0.01)
|
||||
|
||||
@ -32,6 +32,7 @@ def server(zephyr_lora_added_tokens_files: str): # noqa: F811
|
||||
f"zephyr-lora2={zephyr_lora_added_tokens_files}",
|
||||
"--max-lora-rank",
|
||||
"64",
|
||||
"--enable-tokenizer-info-endpoint",
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
@ -283,3 +284,106 @@ async def test_detokenize(
|
||||
response.raise_for_status()
|
||||
|
||||
assert response.json() == {"prompt": prompt}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name,tokenizer_name",
|
||||
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
|
||||
indirect=["tokenizer_name"],
|
||||
)
|
||||
async def test_tokenizer_info_basic(
|
||||
server: RemoteOpenAIServer,
|
||||
model_name: str,
|
||||
tokenizer_name: str,
|
||||
):
|
||||
"""Test basic tokenizer info endpoint functionality."""
|
||||
response = requests.get(server.url_for("tokenizer_info"))
|
||||
response.raise_for_status()
|
||||
result = response.json()
|
||||
assert "tokenizer_class" in result
|
||||
assert isinstance(result["tokenizer_class"], str)
|
||||
assert result["tokenizer_class"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tokenizer_info_schema(server: RemoteOpenAIServer):
|
||||
"""Test that the response matches expected schema types."""
|
||||
response = requests.get(server.url_for("tokenizer_info"))
|
||||
response.raise_for_status()
|
||||
result = response.json()
|
||||
field_types = {
|
||||
"add_bos_token": bool,
|
||||
"add_prefix_space": bool,
|
||||
"clean_up_tokenization_spaces": bool,
|
||||
"split_special_tokens": bool,
|
||||
"bos_token": str,
|
||||
"eos_token": str,
|
||||
"pad_token": str,
|
||||
"unk_token": str,
|
||||
"chat_template": str,
|
||||
"errors": str,
|
||||
"model_max_length": int,
|
||||
"additional_special_tokens": list,
|
||||
"added_tokens_decoder": dict,
|
||||
}
|
||||
for field, expected_type in field_types.items():
|
||||
if field in result and result[field] is not None:
|
||||
assert isinstance(
|
||||
result[field],
|
||||
expected_type), (f"{field} should be {expected_type.__name__}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tokenizer_info_added_tokens_structure(
|
||||
server: RemoteOpenAIServer, ):
|
||||
"""Test added_tokens_decoder structure if present."""
|
||||
response = requests.get(server.url_for("tokenizer_info"))
|
||||
response.raise_for_status()
|
||||
result = response.json()
|
||||
added_tokens = result.get("added_tokens_decoder")
|
||||
if added_tokens:
|
||||
for token_id, token_info in added_tokens.items():
|
||||
assert isinstance(token_id, str), "Token IDs should be strings"
|
||||
assert isinstance(token_info, dict), "Token info should be a dict"
|
||||
assert "content" in token_info, "Token info should have content"
|
||||
assert "special" in token_info, (
|
||||
"Token info should have special flag")
|
||||
assert isinstance(token_info["special"],
|
||||
bool), ("Special flag should be boolean")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tokenizer_info_consistency_with_tokenize(
|
||||
server: RemoteOpenAIServer, ):
|
||||
"""Test that tokenizer info is consistent with tokenization endpoint."""
|
||||
info_response = requests.get(server.url_for("tokenizer_info"))
|
||||
info_response.raise_for_status()
|
||||
info = info_response.json()
|
||||
tokenize_response = requests.post(
|
||||
server.url_for("tokenize"),
|
||||
json={
|
||||
"model": MODEL_NAME,
|
||||
"prompt": "Hello world!"
|
||||
},
|
||||
)
|
||||
tokenize_response.raise_for_status()
|
||||
tokenize_result = tokenize_response.json()
|
||||
info_max_len = info.get("model_max_length")
|
||||
tokenize_max_len = tokenize_result.get("max_model_len")
|
||||
if info_max_len and tokenize_max_len:
|
||||
assert info_max_len >= tokenize_max_len, (
|
||||
"Info max length should be >= tokenize max length")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tokenizer_info_chat_template(server: RemoteOpenAIServer):
|
||||
"""Test chat template is properly included."""
|
||||
response = requests.get(server.url_for("tokenizer_info"))
|
||||
response.raise_for_status()
|
||||
result = response.json()
|
||||
chat_template = result.get("chat_template")
|
||||
if chat_template:
|
||||
assert isinstance(chat_template,
|
||||
str), ("Chat template should be a string")
|
||||
assert chat_template.strip(), "Chat template should not be empty"
|
||||
@ -17,6 +17,11 @@ from vllm.assets.audio import AudioAsset
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MISTRAL_FORMAT_ARGS = [
|
||||
"--tokenizer_mode", "mistral", "--config_format", "mistral",
|
||||
"--load_format", "mistral"
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mary_had_lamb():
|
||||
@ -33,9 +38,15 @@ def winning_call():
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_basic_audio(mary_had_lamb):
|
||||
model_name = "openai/whisper-large-v3-turbo"
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"])
|
||||
async def test_basic_audio(mary_had_lamb, model_name):
|
||||
server_args = ["--enforce-eager"]
|
||||
|
||||
if model_name.startswith("mistralai"):
|
||||
server_args += MISTRAL_FORMAT_ARGS
|
||||
|
||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
@ -65,10 +76,13 @@ async def test_bad_requests(mary_had_lamb):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_long_audio_request(mary_had_lamb):
|
||||
model_name = "openai/whisper-large-v3-turbo"
|
||||
@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3-turbo"])
|
||||
async def test_long_audio_request(mary_had_lamb, model_name):
|
||||
server_args = ["--enforce-eager"]
|
||||
|
||||
if model_name.startswith("openai"):
|
||||
return
|
||||
|
||||
mary_had_lamb.seek(0)
|
||||
audio, sr = librosa.load(mary_had_lamb)
|
||||
# Add small silence after each audio for repeatability in the split process
|
||||
@ -87,7 +101,8 @@ async def test_long_audio_request(mary_had_lamb):
|
||||
response_format="text",
|
||||
temperature=0.0)
|
||||
out = json.loads(transcription)['text']
|
||||
assert out.count("Mary had a little lamb") == 10
|
||||
counts = out.count("Mary had a little lamb")
|
||||
assert counts == 10, counts
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -154,7 +169,8 @@ async def test_streaming_response(winning_call):
|
||||
file=winning_call,
|
||||
language="en",
|
||||
temperature=0.0,
|
||||
extra_body=dict(stream=True))
|
||||
extra_body=dict(stream=True),
|
||||
timeout=30)
|
||||
# Reconstruct from chunks and validate
|
||||
async for chunk in res:
|
||||
# just a chunk
|
||||
@ -184,7 +200,8 @@ async def test_stream_options(winning_call):
|
||||
temperature=0.0,
|
||||
extra_body=dict(stream=True,
|
||||
stream_include_usage=True,
|
||||
stream_continuous_usage_stats=True))
|
||||
stream_continuous_usage_stats=True),
|
||||
timeout=30)
|
||||
final = False
|
||||
continuous = True
|
||||
async for chunk in res:
|
||||
|
||||
@ -39,8 +39,8 @@ async def test_basic_audio(foscolo):
|
||||
# TODO remove once language detection is implemented
|
||||
extra_body=dict(language="it"),
|
||||
temperature=0.0)
|
||||
out = json.loads(translation)['text'].strip()
|
||||
assert "Nor will I ever touch the sacred" in out
|
||||
out = json.loads(translation)['text'].strip().lower()
|
||||
assert "greek sea" in out
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -168,5 +168,4 @@ async def test_long_audio_request(foscolo):
|
||||
response_format="text",
|
||||
temperature=0.0)
|
||||
out = json.loads(translation)['text'].strip().lower()
|
||||
# TODO investigate higher model uncertainty in for longer translations.
|
||||
assert out.count("nor will i ever") == 2
|
||||
assert out.count("greek sea") == 2
|
||||
|
||||
@@ -416,7 +416,7 @@ class RankTensors:
# We dequant and use that as hidden_states so the tests are stable.
# quantizing and dequantizing yield slightly different results
# depending on the hardware. Here we, quantize and dequantize
# first - so further quantize and dequantize will yeild the same
# first - so further quantize and dequantize will yield the same
# values.
if config.is_per_tensor_act_quant:
a_q, a_scales = ops.scaled_fp8_quant(

@@ -95,7 +95,7 @@ def run_single_case(m, n, k, topk, num_experts, block_size):
topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1)
topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1)

# triton referrence
# triton reference
out_triton = fused_experts(
hidden_states=tokens_bf16,
w1=w1,
@@ -61,14 +61,6 @@ V1_SUPPORTED_MODELS = [
"tiiuae/Falcon-H1-0.5B-Base",
]

ATTN_BLOCK_SIZES = {
"ibm-ai-platform/Bamba-9B-v1": 528,
"Zyphra/Zamba2-1.2B-instruct": 80,
"nvidia/Nemotron-H-8B-Base-8K": 528,
"ibm-granite/granite-4.0-tiny-preview": 400,
"tiiuae/Falcon-H1-0.5B-Base": 800,
}

# Avoid OOM
MAX_NUM_SEQS = 4

@@ -105,11 +97,6 @@ def test_models(
example_prompts, max_tokens, num_logprobs)

if model in V1_SUPPORTED_MODELS:
if model in HYBRID_MODELS and model in ATTN_BLOCK_SIZES:
block_size = ATTN_BLOCK_SIZES[model]
else:
block_size = 16

with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
if model in HYBRID_MODELS:
@@ -118,8 +105,7 @@ def test_models(
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
enforce_eager=True,
enable_prefix_caching=False,
block_size=block_size) as vllm_model:
enable_prefix_caching=False) as vllm_model:
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
else:
tests/models/multimodal/generation/test_voxtral.py (new file, 115 lines)
@@ -0,0 +1,115 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from mistral_common.audio import Audio
|
||||
from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
|
||||
TextChunk, UserMessage)
|
||||
|
||||
from vllm.transformers_utils.tokenizer import MistralTokenizer
|
||||
|
||||
from ....conftest import AudioTestAssets
|
||||
from ....utils import RemoteOpenAIServer
|
||||
from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
|
||||
|
||||
MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
|
||||
MISTRAL_FORMAT_ARGS = [
|
||||
"--tokenizer_mode", "mistral", "--config_format", "mistral",
|
||||
"--load_format", "mistral"
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def server(request, audio_assets: AudioTestAssets):
|
||||
args = [
|
||||
"--enforce-eager",
|
||||
"--limit-mm-per-prompt",
|
||||
json.dumps({"audio": len(audio_assets)}),
|
||||
] + MISTRAL_FORMAT_ARGS
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME,
|
||||
args,
|
||||
env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
|
||||
"30"}) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
def _get_prompt(audio_assets, question):
|
||||
tokenizer = MistralTokenizer.from_pretrained(MODEL_NAME)
|
||||
|
||||
audios = [
|
||||
Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
|
||||
for i in range(len(audio_assets))
|
||||
]
|
||||
audio_chunks = [
|
||||
AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
|
||||
]
|
||||
|
||||
text_chunk = TextChunk(text=question)
|
||||
messages = [UserMessage(content=[*audio_chunks, text_chunk]).to_openai()]
|
||||
|
||||
return tokenizer.apply_chat_template(messages=messages)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models_with_multiple_audios(vllm_runner,
|
||||
audio_assets: AudioTestAssets, dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
|
||||
run_multi_audio_test(
|
||||
vllm_runner,
|
||||
[(vllm_prompt, [audio.audio_and_sample_rate
|
||||
for audio in audio_assets])],
|
||||
MODEL_NAME,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tokenizer_mode="mistral",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_online_serving(client, audio_assets: AudioTestAssets):
|
||||
"""Exercises online serving with/without chunked prefill enabled."""
|
||||
|
||||
def asset_to_chunk(asset):
|
||||
audio = Audio.from_file(str(asset.get_local_path()), strict=False)
|
||||
audio.format = "wav"
|
||||
audio_dict = AudioChunk.from_audio(audio).to_openai()
|
||||
return audio_dict
|
||||
|
||||
audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
*audio_chunks,
|
||||
{
|
||||
"type":
|
||||
"text",
|
||||
"text":
|
||||
f"What's happening in these {len(audio_assets)} audio clips?"
|
||||
},
|
||||
],
|
||||
}]
|
||||
|
||||
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_tokens=10)
|
||||
|
||||
assert len(chat_completion.choices) == 1
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.finish_reason == "length"
|
||||
@ -1,9 +1,15 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Union
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModel
|
||||
|
||||
from vllm.entrypoints.chat_utils import ChatCompletionContentPartImageParam
|
||||
from vllm.entrypoints.score_utils import ScoreMultiModalParam
|
||||
|
||||
from ....conftest import HfRunner, VllmRunner
|
||||
|
||||
model_name = "jinaai/jina-reranker-m0"
|
||||
|
||||
mm_processor_kwargs = {
|
||||
@ -14,73 +20,90 @@ mm_processor_kwargs = {
|
||||
limit_mm_per_prompt = {"image": 2}
|
||||
|
||||
|
||||
def vllm_reranker(model_name,
|
||||
query,
|
||||
documents,
|
||||
query_type="text",
|
||||
doc_type="text"):
|
||||
from vllm import LLM
|
||||
def vllm_reranker(
|
||||
vllm_runner: type[VllmRunner],
|
||||
model_name: str,
|
||||
dtype: str,
|
||||
query_strs: list[str],
|
||||
document_strs: list[str],
|
||||
query_type: str = "text",
|
||||
doc_type: str = "text",
|
||||
):
|
||||
|
||||
model = LLM(
|
||||
model=model_name,
|
||||
task="score",
|
||||
max_model_len=32768,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
)
|
||||
|
||||
def create_image_param(url: str):
|
||||
def create_image_param(url: str) -> ChatCompletionContentPartImageParam:
|
||||
return {"type": "image_url", "image_url": {"url": f"{url}"}}
|
||||
|
||||
if query_type == "image":
|
||||
query = {"content": [create_image_param(url) for url in query]}
|
||||
query: Union[list[str], ScoreMultiModalParam]
|
||||
if query_type == "text":
|
||||
query = query_strs
|
||||
elif query_type == "image":
|
||||
query = ScoreMultiModalParam(
|
||||
content=[create_image_param(url) for url in query_strs])
|
||||
|
||||
if doc_type == "image":
|
||||
documents = {"content": [create_image_param(url) for url in documents]}
|
||||
documents: Union[list[str], ScoreMultiModalParam]
|
||||
if doc_type == "text":
|
||||
documents = document_strs
|
||||
elif doc_type == "image":
|
||||
documents = ScoreMultiModalParam(
|
||||
content=[create_image_param(url) for url in document_strs])
|
||||
|
||||
outputs = model.score(query, documents)
|
||||
with vllm_runner(
|
||||
model_name,
|
||||
task="score",
|
||||
dtype=dtype,
|
||||
max_num_seqs=2,
|
||||
max_model_len=2048,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
) as vllm_model:
|
||||
outputs = vllm_model.model.score(query, documents)
|
||||
|
||||
return [output.outputs.score for output in outputs]
|
||||
|
||||
|
||||
def hf_reranker(model_name,
|
||||
query,
|
||||
documents,
|
||||
query_type="text",
|
||||
doc_type="text"):
|
||||
|
||||
def hf_reranker(
|
||||
hf_runner: type[HfRunner],
|
||||
model_name: str,
|
||||
dtype: str,
|
||||
query_strs: list[str],
|
||||
document_strs: list[str],
|
||||
query_type: str = "text",
|
||||
doc_type: str = "text",
|
||||
):
|
||||
checkpoint_to_hf_mapper = {
|
||||
"visual.": "model.visual.",
|
||||
"model.": "model.language_model.",
|
||||
}
|
||||
|
||||
model = AutoModel.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype="auto",
|
||||
trust_remote_code=True,
|
||||
key_mapping=checkpoint_to_hf_mapper).to("cuda").eval()
|
||||
data_pairs = [[query_strs[0], d] for d in document_strs]
|
||||
|
||||
data_pairs = [[query[0], d] for d in documents]
|
||||
|
||||
scores = model.compute_score(data_pairs,
|
||||
max_length=2048,
|
||||
query_type=query_type,
|
||||
doc_type=doc_type)
|
||||
return scores
|
||||
with hf_runner(
|
||||
model_name,
|
||||
dtype=dtype,
|
||||
trust_remote_code=True,
|
||||
auto_cls=AutoModel,
|
||||
model_kwargs={"key_mapping": checkpoint_to_hf_mapper},
|
||||
) as hf_model:
|
||||
return hf_model.model.compute_score(data_pairs,
|
||||
max_length=2048,
|
||||
query_type=query_type,
|
||||
doc_type=doc_type)
|
||||
|
||||
|
||||
# Visual Documents Reranking
|
||||
@pytest.mark.parametrize("model_name", [model_name])
|
||||
def test_model_text_image(model_name):
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_model_text_image(hf_runner, vllm_runner, model_name, dtype):
|
||||
query = ["slm markdown"]
|
||||
documents = [
|
||||
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
|
||||
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
|
||||
]
|
||||
|
||||
hf_outputs = hf_reranker(model_name, query, documents, "text", "image")
|
||||
vllm_outputs = vllm_reranker(model_name, query, documents, "text", "image")
|
||||
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
|
||||
"text", "image")
|
||||
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
|
||||
documents, "text", "image")
|
||||
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
|
||||
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
|
||||
@ -88,8 +111,8 @@ def test_model_text_image(model_name):
|
||||
|
||||
# Textual Documents Reranking
|
||||
@pytest.mark.parametrize("model_name", [model_name])
|
||||
def test_model_text_text(model_name):
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_model_text_text(hf_runner, vllm_runner, model_name, dtype):
|
||||
query = ["slm markdown"]
|
||||
documents = [
|
||||
"""We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
|
||||
@ -104,9 +127,10 @@ def test_model_text_text(model_name):
|
||||
lower computational requirements.""", # noqa: E501
|
||||
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
|
||||
]
|
||||
|
||||
hf_outputs = hf_reranker(model_name, query, documents, "text", "text")
|
||||
vllm_outputs = vllm_reranker(model_name, query, documents, "text", "text")
|
||||
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
|
||||
"text", "text")
|
||||
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
|
||||
documents, "text", "text")
|
||||
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
|
||||
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
|
||||
@ -114,8 +138,8 @@ def test_model_text_text(model_name):
|
||||
|
||||
# Image Querying for Textual Documents
|
||||
@pytest.mark.parametrize("model_name", [model_name])
|
||||
def test_model_image_text(model_name):
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_model_image_text(hf_runner, vllm_runner, model_name, dtype):
|
||||
query = [
|
||||
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
|
||||
]
|
||||
@ -133,8 +157,10 @@ def test_model_image_text(model_name):
|
||||
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
|
||||
]
|
||||
|
||||
hf_outputs = hf_reranker(model_name, query, documents, "image", "text")
|
||||
vllm_outputs = vllm_reranker(model_name, query, documents, "image", "text")
|
||||
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
|
||||
"image", "text")
|
||||
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
|
||||
documents, "image", "text")
|
||||
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
|
||||
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
|
||||
@ -142,8 +168,8 @@ def test_model_image_text(model_name):
|
||||
|
||||
# Image Querying for Image Documents
|
||||
@pytest.mark.parametrize("model_name", [model_name])
|
||||
def test_model_image_image(model_name):
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_model_image_image(hf_runner, vllm_runner, model_name, dtype):
|
||||
query = [
|
||||
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
|
||||
]
|
||||
@ -152,9 +178,10 @@ def test_model_image_image(model_name):
|
||||
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
|
||||
]
|
||||
|
||||
hf_outputs = hf_reranker(model_name, query, documents, "image", "image")
|
||||
vllm_outputs = vllm_reranker(model_name, query, documents, "image",
|
||||
"image")
|
||||
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
|
||||
"image", "image")
|
||||
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
|
||||
documents, "image", "image")
|
||||
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
|
||||
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
|
||||
|
||||
@ -141,6 +141,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True),
|
||||
"BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat",
|
||||
trust_remote_code=True),
|
||||
"BailingMoeForCausalLM": _HfExamplesInfo("inclusionAI/Ling-lite-1.5",
|
||||
trust_remote_code=True),
|
||||
"BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B",
|
||||
extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}), # noqa: E501
|
||||
"BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m",
|
||||
@ -438,6 +440,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
tokenizer="Isotr0py/Florence-2-tokenizer", # noqa: E501
|
||||
trust_remote_code=True), # noqa: E501
|
||||
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
|
||||
"VoxtralForConditionalGeneration": _HfExamplesInfo("mistralai/Voxtral-Mini-3B-2507", tokenizer_mode="mistral"), # noqa: E501
|
||||
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
|
||||
|
||||
# [Cross-encoder]
|
||||
@ -462,6 +465,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True,
|
||||
speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
|
||||
tokenizer="meta-llama/Llama-3.1-8B-Instruct"),
|
||||
"EagleLlama4ForCausalLM": _HfExamplesInfo(
|
||||
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
|
||||
trust_remote_code=True,
|
||||
speculative_model="morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
|
||||
tokenizer="meta-llama/Llama-4-Scout-17B-16E-Instruct"), # noqa: E501
|
||||
"EagleMiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-1B-sft-bf16",
|
||||
trust_remote_code=True,
|
||||
is_available_online=False,
|
||||
@ -511,4 +519,4 @@ class HfExampleModels:
|
||||
raise ValueError(f"No example model defined for {model_id}")
|
||||
|
||||
|
||||
HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
|
||||
HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
|
||||
|
||||
@ -36,6 +36,11 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
|
||||
"KimiVLForConditionalGeneration"):
|
||||
pytest.skip("Avoid OOM")
|
||||
|
||||
if model_arch in ("Llama4ForCausalLM", "EagleLlama4ForCausalLM"):
|
||||
from vllm.model_executor.models.llama4 import Llama4ForCausalLM
|
||||
from vllm.model_executor.models.registry import ModelRegistry
|
||||
ModelRegistry.register_model("Llama4ForCausalLM", Llama4ForCausalLM)
|
||||
|
||||
# Avoid OOM and reduce initialization time by only using 1 layer
|
||||
def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
|
||||
hf_config.update(model_info.hf_overrides)
|
||||
@@ -43,7 +48,7 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
text_config = hf_config.get_text_config()

# Ensure at least 2 expert per group
# Since `grouped_topk` assums top-2
# Since `grouped_topk` assumes top-2
n_group = getattr(text_config, 'n_group', None)
num_experts = n_group * 2 if n_group is not None else 2

@ -7,7 +7,7 @@ import pytest
|
||||
|
||||
from vllm.compilation.backends import VllmBackend
|
||||
from vllm.config import (LoadConfig, ModelConfig, PoolerConfig, VllmConfig,
|
||||
get_field)
|
||||
get_field, update_config)
|
||||
from vllm.model_executor.layers.pooler import PoolingType
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
@ -46,6 +46,34 @@ def test_get_field():
|
||||
assert c.default_factory is MISSING
|
||||
|
||||
|
||||
@dataclass
|
||||
class _TestNestedConfig:
|
||||
a: _TestConfigFields = field(
|
||||
default_factory=lambda: _TestConfigFields(a=0))
|
||||
|
||||
|
||||
def test_update_config():
|
||||
# Simple update
|
||||
config1 = _TestConfigFields(a=0)
|
||||
new_config1 = update_config(config1, {"a": 42})
|
||||
assert new_config1.a == 42
|
||||
# Nonexistent field
|
||||
with pytest.raises(AssertionError):
|
||||
new_config1 = update_config(config1, {"nonexistent": 1})
|
||||
# Nested update with dataclass
|
||||
config2 = _TestNestedConfig()
|
||||
new_inner_config = _TestConfigFields(a=1, c="new_value")
|
||||
new_config2 = update_config(config2, {"a": new_inner_config})
|
||||
assert new_config2.a == new_inner_config
|
||||
# Nested update with dict
|
||||
config3 = _TestNestedConfig()
|
||||
new_config3 = update_config(config3, {"a": {"c": "new_value"}})
|
||||
assert new_config3.a.c == "new_value"
|
||||
# Nested update with invalid type
|
||||
with pytest.raises(AssertionError):
|
||||
new_config3 = update_config(config3, {"a": "new_value"})
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "expected_runner_type", "expected_task"),
|
||||
[
|
||||
|
||||
@@ -14,7 +14,7 @@ RTOL = 0.03
@dataclass
class GSM8KAccuracyTestConfig:
model_name: str
excepted_value: float
expected_value: float

def get_model_args(self) -> str:
return (f"pretrained={self.model_name},"
@@ -25,13 +25,13 @@ class GSM8KAccuracyTestConfig:
ACCURACY_CONFIGS = [
GSM8KAccuracyTestConfig(
model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
excepted_value=0.76), # no bias
expected_value=0.76), # no bias
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# so only one of these tests can run in a single call to pytest. As
# a follow up, move this into the LM-EVAL section of the CI.
# GSM8KAccuracyTestConfig(
# model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
# excepted_value=0.66), # bias in QKV layers
# expected_value=0.66), # bias in QKV layers
]

@@ -45,7 +45,7 @@ def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
batch_size="auto",
)

EXPECTED_VALUE = config.excepted_value
EXPECTED_VALUE = config.expected_value
measured_value = results["results"][TASK][FILTER]
assert (measured_value - RTOL < EXPECTED_VALUE
and measured_value + RTOL > EXPECTED_VALUE
tests/v1/core/__init__.py (new file, 0 lines)

tests/v1/core/test_async_scheduler.py (new file, 228 lines)
@@ -0,0 +1,228 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections import deque
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.request import RequestStatus
|
||||
|
||||
from .utils import create_requests, create_scheduler
|
||||
|
||||
|
||||
def _make_model_runner_output(
|
||||
scheduler_output: SchedulerOutput, ) -> ModelRunnerOutput:
|
||||
req_ids = list(scheduler_output.num_scheduled_tokens.keys())
|
||||
return ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index={
|
||||
req_id: i
|
||||
for i, req_id in enumerate(req_ids)
|
||||
},
|
||||
sampled_token_ids=[[i] for i in range(len(req_ids))],
|
||||
spec_token_ids=None,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[],
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("max_tokens", [1, 2, 3, 5])
|
||||
def test_stop_by_max_tokens(max_tokens: int):
|
||||
scheduler = create_scheduler(async_scheduling=True)
|
||||
requests = create_requests(num_requests=2, max_tokens=max_tokens)
|
||||
req0, req1 = requests
|
||||
|
||||
sched_outputs: deque[SchedulerOutput] = deque()
|
||||
scheduler.add_request(req0)
|
||||
sched_outputs.append(scheduler.schedule())
|
||||
|
||||
scheduler.add_request(req1)
|
||||
sched_outputs.append(scheduler.schedule())
|
||||
|
||||
while sched_outputs:
|
||||
sched_output = sched_outputs.popleft()
|
||||
model_runner_output = _make_model_runner_output(sched_output)
|
||||
scheduler.update_from_output(sched_output, model_runner_output)
|
||||
|
||||
sched_output = scheduler.schedule()
|
||||
if sched_output.num_scheduled_tokens:
|
||||
sched_outputs.append(sched_output)
|
||||
|
||||
assert scheduler.get_num_unfinished_requests() == 0
|
||||
assert req0.num_output_tokens == max_tokens
|
||||
assert req1.num_output_tokens == max_tokens
|
||||
|
||||
|
||||
def test_abort():
|
||||
scheduler = create_scheduler(async_scheduling=True)
|
||||
requests = create_requests(num_requests=10, max_tokens=20)
|
||||
|
||||
for req in requests:
|
||||
scheduler.add_request(req)
|
||||
|
||||
sched_outputs: deque[SchedulerOutput] = deque()
|
||||
sched_outputs.append(scheduler.schedule())
|
||||
sched_outputs.append(scheduler.schedule())
|
||||
|
||||
abort_order = [0, 8, 3, 1, 6, 4, 2, 5, 7, 9]
|
||||
abort_order_copy = abort_order.copy()
|
||||
|
||||
def abort_request():
|
||||
if not abort_order:
|
||||
return
|
||||
req = requests[abort_order.pop(0)]
|
||||
scheduler.finish_requests(req.request_id,
|
||||
RequestStatus.FINISHED_ABORTED)
|
||||
|
||||
while sched_outputs:
|
||||
# Abort a scheduled request.
|
||||
abort_request()
|
||||
sched_output = sched_outputs.popleft()
|
||||
model_runner_output = _make_model_runner_output(sched_output)
|
||||
scheduler.update_from_output(sched_output, model_runner_output)
|
||||
|
||||
sched_output = scheduler.schedule()
|
||||
if sched_output.num_scheduled_tokens:
|
||||
sched_outputs.append(sched_output)
|
||||
|
||||
for i, req in enumerate(requests):
|
||||
assert req.status == RequestStatus.FINISHED_ABORTED
|
||||
assert req.num_output_tokens == abort_order_copy.index(i)
|
||||
|
||||
|
||||
def test_preempt():
|
||||
scheduler = create_scheduler(async_scheduling=True)
|
||||
requests = create_requests(num_requests=10, max_tokens=20)
|
||||
|
||||
for req in requests:
|
||||
scheduler.add_request(req)
|
||||
|
||||
sched_outputs: deque[SchedulerOutput] = deque()
|
||||
sched_outputs.append(scheduler.schedule())
|
||||
sched_outputs.append(scheduler.schedule())
|
||||
|
||||
abort_order = [0, 8, 3, 1, 6, 4, 2, 5, 7, 9]
|
||||
abort_order_copy = abort_order.copy()
|
||||
|
||||
def abort_request():
|
||||
if not abort_order:
|
||||
return
|
||||
req = requests[abort_order.pop(0)]
|
||||
scheduler.finish_requests(req.request_id,
|
||||
RequestStatus.FINISHED_ABORTED)
|
||||
|
||||
while sched_outputs:
|
||||
# Abort a scheduled request.
|
||||
abort_request()
|
||||
sched_output = sched_outputs.popleft()
|
||||
model_runner_output = _make_model_runner_output(sched_output)
|
||||
scheduler.update_from_output(sched_output, model_runner_output)
|
||||
|
||||
sched_output = scheduler.schedule()
|
||||
if sched_output.num_scheduled_tokens:
|
||||
sched_outputs.append(sched_output)
|
||||
|
||||
for i, req in enumerate(requests):
|
||||
assert req.status == RequestStatus.FINISHED_ABORTED
|
||||
assert req.num_output_tokens == abort_order_copy.index(i)
|
||||
|
||||
|
||||
def test_prefix_caching_for_prefill_dedup():
|
||||
CHUNK_SIZE = 1000
|
||||
BLOCK_SIZE = 16
|
||||
num_prompt_tokens = 100
|
||||
scheduler = create_scheduler(async_scheduling=True,
|
||||
max_num_batched_tokens=CHUNK_SIZE,
|
||||
enable_prefix_caching=True,
|
||||
block_size=BLOCK_SIZE)
|
||||
requests = create_requests(num_requests=5,
|
||||
num_tokens=num_prompt_tokens,
|
||||
max_tokens=3,
|
||||
same_prompt=True)
|
||||
requests_copy = requests.copy()
|
||||
|
||||
# Two requests with the same prompt.
|
||||
req0 = requests.pop(0)
|
||||
req1 = requests.pop(0)
|
||||
scheduler.add_request(req0)
|
||||
scheduler.add_request(req1)
|
||||
|
||||
sched_outputs: deque[SchedulerOutput] = deque()
|
||||
sched_output = scheduler.schedule()
|
||||
sched_outputs.append(sched_output)
|
||||
# Make sure prefix caching de-duplicates the prompts in the same step,
|
||||
# so all the blocks except the last are shared between the two requests.
|
||||
assert len(sched_output.num_scheduled_tokens) == 2
|
||||
num_blocks = num_prompt_tokens // BLOCK_SIZE
|
||||
assert req0.num_cached_tokens == 0
|
||||
assert req1.num_cached_tokens >= num_blocks * BLOCK_SIZE
|
||||
|
||||
sched_outputs.append(scheduler.schedule())
|
||||
while sched_outputs:
|
||||
if requests:
|
||||
scheduler.add_request(requests.pop(0))
|
||||
sched_output = sched_outputs.popleft()
|
||||
model_runner_output = _make_model_runner_output(sched_output)
|
||||
scheduler.update_from_output(sched_output, model_runner_output)
|
||||
sched_output = scheduler.schedule()
|
||||
if sched_output.num_scheduled_tokens:
|
||||
sched_outputs.append(sched_output)
|
||||
|
||||
# Other requests scheduled after the two requests should also get
|
||||
# prefix cache hit.
|
||||
assert scheduler.get_num_unfinished_requests() == 0
|
||||
for req in requests_copy[1:]:
|
||||
assert req.num_cached_tokens >= num_blocks * BLOCK_SIZE
|
||||
|
||||
|
||||
def test_prefix_caching_for_multi_turn():
|
||||
CHUNK_SIZE = 1000
|
||||
BLOCK_SIZE = 16
|
||||
num_prompt_tokens = 100
|
||||
num_output_tokens = 200
|
||||
scheduler = create_scheduler(async_scheduling=True,
|
||||
max_num_batched_tokens=CHUNK_SIZE,
|
||||
enable_prefix_caching=True,
|
||||
block_size=BLOCK_SIZE)
|
||||
requests = create_requests(num_requests=5,
|
||||
num_tokens=num_prompt_tokens,
|
||||
max_tokens=num_output_tokens)
|
||||
|
||||
for req in requests:
|
||||
scheduler.add_request(req)
|
||||
sched_outputs: deque[SchedulerOutput] = deque()
|
||||
sched_outputs.append(scheduler.schedule())
|
||||
sched_outputs.append(scheduler.schedule())
|
||||
|
||||
# Process the requests.
|
||||
while sched_outputs:
|
||||
sched_output = sched_outputs.popleft()
|
||||
model_runner_output = _make_model_runner_output(sched_output)
|
||||
scheduler.update_from_output(sched_output, model_runner_output)
|
||||
sched_output = scheduler.schedule()
|
||||
if sched_output.num_scheduled_tokens:
|
||||
sched_outputs.append(sched_output)
|
||||
assert scheduler.get_num_unfinished_requests() == 0
|
||||
|
||||
# Create next-turn requests whose prompts are the full output of the
|
||||
# previous turn.
|
||||
next_turn_requests = create_requests(
|
||||
num_requests=5,
|
||||
num_tokens=num_prompt_tokens + num_output_tokens,
|
||||
max_tokens=num_output_tokens,
|
||||
)
|
||||
for i, req in enumerate(next_turn_requests):
|
||||
req.prompt_token_ids = (requests[i].prompt_token_ids +
|
||||
list(requests[i].output_token_ids))
|
||||
# Schedule the next-turn requests.
|
||||
for req in next_turn_requests:
|
||||
scheduler.add_request(req)
|
||||
sched_outputs.append(scheduler.schedule())
|
||||
|
||||
# Make sure the next-turn requests get prefix cache hit by the previous
|
||||
# requests.
|
||||
for req in next_turn_requests:
|
||||
assert (req.num_cached_tokens == req.num_prompt_tokens // BLOCK_SIZE *
|
||||
BLOCK_SIZE)
|
||||
@ -8,7 +8,7 @@ import torch
|
||||
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import GiB_bytes, sha256
|
||||
from vllm.utils import GiB_bytes, sha256, sha256_cbor_64bit
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheManager
|
||||
# disable yapf here as it formats differently than isort such that both fail
|
||||
# yapf: disable
|
||||
@ -16,7 +16,8 @@ from vllm.v1.core.kv_cache_utils import (
|
||||
FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics,
|
||||
estimate_max_model_len, generate_block_hash_extra_keys,
|
||||
get_kv_cache_config, get_max_concurrency_for_kv_cache_config,
|
||||
hash_block_tokens, hash_request_tokens, unify_kv_cache_configs)
|
||||
hash_block_tokens, hash_request_tokens, init_none_hash,
|
||||
unify_kv_cache_configs)
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheGroupSpec, KVCacheTensor,
|
||||
SlidingWindowSpec)
|
||||
@ -78,24 +79,27 @@ def new_sliding_window_spec(block_size=16,
|
||||
sliding_window=sliding_window)
|
||||
|
||||
|
||||
def test_none_hash(monkeypatch):
|
||||
@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash])
|
||||
def test_none_hash(monkeypatch, hash_fn):
|
||||
import vllm.v1.core.kv_cache_utils
|
||||
|
||||
# case 1: PYTHONHASHSEED is not set, use random
|
||||
with monkeypatch.context() as m:
|
||||
m.delenv('PYTHONHASHSEED', raising=False)
|
||||
reloaded_kv_cache_utils = importlib.reload(vllm.v1.core.kv_cache_utils)
|
||||
reloaded_kv_cache_utils.init_none_hash(hash_fn)
|
||||
assert reloaded_kv_cache_utils.NONE_HASH is not None
|
||||
assert isinstance(reloaded_kv_cache_utils.NONE_HASH, int)
|
||||
assert reloaded_kv_cache_utils.NONE_HASH != 0
|
||||
|
||||
# case 2: PYTHONHASHSEED is set, use the seed
|
||||
# case 2: PYTHONHASHSEED is set, use the seed and hash_fn
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv('PYTHONHASHSEED', 'python hash seed')
|
||||
reloaded_kv_cache_utils = importlib.reload(vllm.v1.core.kv_cache_utils)
|
||||
reloaded_kv_cache_utils.init_none_hash(hash_fn)
|
||||
assert reloaded_kv_cache_utils.NONE_HASH is not None
|
||||
assert isinstance(reloaded_kv_cache_utils.NONE_HASH, int)
|
||||
assert sha256('python hash seed') == reloaded_kv_cache_utils.NONE_HASH
|
||||
assert hash_fn('python hash seed') == reloaded_kv_cache_utils.NONE_HASH
|
||||
|
||||
|
||||
def test_kv_cache_block():
|
||||
@ -287,9 +291,10 @@ def test_generate_block_hash_extra_keys_cache_salt():
|
||||
assert next_mm_idx == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("hash_fn", [sha256, hash])
|
||||
@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash])
|
||||
def test_hash_block_tokens(hash_fn):
|
||||
import vllm.v1.core.kv_cache_utils
|
||||
init_none_hash(hash_fn)
|
||||
parent_block_hash = 123
|
||||
curr_block_token_ids = (1, 2, 3)
|
||||
extra_keys = ("key1", "key2")
|
||||
@ -303,9 +308,10 @@ def test_hash_block_tokens(hash_fn):
|
||||
assert block_hash.extra_keys == extra_keys
|
||||
|
||||
|
||||
@pytest.mark.parametrize("hash_fn", [sha256, hash])
|
||||
@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash])
|
||||
def test_hash_request_tokens(hash_fn):
|
||||
import vllm.v1.core.kv_cache_utils
|
||||
init_none_hash(hash_fn)
|
||||
request = make_request(
|
||||
request_id=0,
|
||||
prompt_token_ids=[_ for _ in range(6)],
|
||||
@ -332,8 +338,10 @@ def test_hash_request_tokens(hash_fn):
|
||||
assert block_hashes[1].extra_keys == ("hash2", )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("hash_fn", [sha256, hash])
|
||||
@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash])
|
||||
def test_hash_tokens_different_mm_input(hash_fn):
|
||||
init_none_hash(hash_fn)
|
||||
|
||||
request1 = make_request(
|
||||
request_id=0,
|
||||
prompt_token_ids=[_ for _ in range(6)],
|
||||
@ -359,8 +367,10 @@ def test_hash_tokens_different_mm_input(hash_fn):
|
||||
assert block_hashes1[1] != block_hashes2[1]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("hash_fn", [sha256, hash])
|
||||
@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash])
|
||||
def test_hash_request_tokens_no_mm_inputs(hash_fn):
|
||||
init_none_hash(hash_fn)
|
||||
|
||||
request = make_request(
|
||||
request_id=0,
|
||||
prompt_token_ids=[_ for _ in range(6)],
|
||||
@ -916,4 +926,4 @@ def test_get_kv_cache_config():
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec())
|
||||
])
|
||||
])
|
||||
|
||||
@ -11,11 +11,12 @@ import torch
|
||||
from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import sha256
|
||||
from vllm.utils import sha256, sha256_cbor_64bit
|
||||
from vllm.v1.core.block_pool import BlockPool
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheManager, Request
|
||||
from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
|
||||
KVCacheBlock, hash_block_tokens)
|
||||
KVCacheBlock, hash_block_tokens,
|
||||
init_none_hash)
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheGroupSpec, SlidingWindowSpec)
|
||||
|
||||
@ -91,7 +92,7 @@ def make_kv_cache_config_hybrid_model(block_size: int,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("hash_algo", ["sha256", "hash"])
|
||||
@pytest.mark.parametrize("hash_algo", ["sha256", "sha256_cbor_64bit", "hash"])
|
||||
def test_prefill(hash_algo):
|
||||
manager = KVCacheManager(
|
||||
make_kv_cache_config(16, 11),
|
||||
@ -101,7 +102,8 @@ def test_prefill(hash_algo):
|
||||
)
|
||||
|
||||
# choose the hash function according to the parameter
|
||||
hash_fn = sha256 if hash_algo == "sha256" else hash
|
||||
hash_fn = (sha256_cbor_64bit if hash_algo == "sha256_cbor_64bit" else
|
||||
sha256 if hash_algo == "sha256" else hash)
|
||||
|
||||
# Complete 3 blocks (48 tokens)
|
||||
common_token_ids = [i for i in range(3) for _ in range(16)]
|
||||
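The nested conditional above maps the `hash_algo` string to a callable. An equivalent lookup-table formulation is sketched below purely for illustration; the imports mirror the ones the test file itself uses.

```python
# Sketch: selecting the block-hash function by name via a mapping.
from vllm.utils import sha256, sha256_cbor_64bit

_HASH_FNS = {
    "sha256": sha256,
    "sha256_cbor_64bit": sha256_cbor_64bit,
    "hash": hash,  # Python's builtin hash()
}

def pick_hash_fn(hash_algo: str):
    # Unknown algorithm names raise KeyError instead of silently falling back.
    return _HASH_FNS[hash_algo]
```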
@ -696,12 +698,14 @@ def test_basic_prefix_caching_disabled():
|
||||
assert not blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("hash_fn", [sha256, hash])
|
||||
@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash])
|
||||
def test_cache_blocks(hash_fn):
|
||||
"""
|
||||
This is a unit test that tests the correctness of the _cache_full_blocks
|
||||
function of KVCacheManager.
|
||||
"""
|
||||
init_none_hash(hash_fn)
|
||||
|
||||
block_size = 4
|
||||
block_pool = BlockPool(
|
||||
num_gpu_blocks=5,
|
||||
|
||||
@ -19,133 +19,7 @@ from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
from vllm.v1.structured_output.request import StructuredOutputRequest
|
||||
|
||||
EOS_TOKEN_ID = 50256
|
||||
|
||||
|
||||
def create_scheduler(
|
||||
model: str = "facebook/opt-125m",
|
||||
max_num_seqs: int = 16,
|
||||
max_num_batched_tokens: int = 8192,
|
||||
enable_prefix_caching: Optional[bool] = None,
|
||||
long_prefill_token_threshold: int = 0,
|
||||
disable_chunked_mm_input: bool = False,
|
||||
use_kv_connector: bool = False,
|
||||
num_blocks: int = 10000,
|
||||
block_size: int = 16,
|
||||
max_model_len: Optional[int] = None,
|
||||
num_speculative_tokens: Optional[int] = None,
|
||||
skip_tokenizer_init: bool = False,
|
||||
) -> Scheduler:
|
||||
'''Create scheduler under test.
|
||||
|
||||
Args:
|
||||
model: model under test
|
||||
max_num_seqs: max sequences to schedule
|
||||
max_num_batch_tokens: max num tokens to batch
|
||||
enable_prefix_caching: optionally force APC config
|
||||
(True/False) or use default
|
||||
(None)
|
||||
|
||||
Returns:
|
||||
{class}`Scheduler` instance
|
||||
'''
|
||||
if max_model_len is None:
|
||||
max_model_len = max_num_batched_tokens
|
||||
scheduler_config = SchedulerConfig(
|
||||
max_num_seqs=max_num_seqs,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
max_model_len=max_model_len,
|
||||
long_prefill_token_threshold=long_prefill_token_threshold,
|
||||
disable_chunked_mm_input=disable_chunked_mm_input,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
model_config = ModelConfig(
|
||||
model=model,
|
||||
task="auto",
|
||||
tokenizer=model,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="float16",
|
||||
seed=42,
|
||||
skip_tokenizer_init=skip_tokenizer_init,
|
||||
)
|
||||
# Cache config, optionally force APC
|
||||
kwargs_cache = ({} if enable_prefix_caching is None else {
|
||||
'enable_prefix_caching': enable_prefix_caching
|
||||
})
|
||||
cache_config = CacheConfig(
|
||||
block_size=block_size,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
**kwargs_cache,
|
||||
)
|
||||
kv_transfer_config = KVTransferConfig(
|
||||
kv_connector="SharedStorageConnector",
|
||||
kv_role="kv_both",
|
||||
kv_connector_extra_config={"shared_storage_path": "local_storage"},
|
||||
) if use_kv_connector else None
|
||||
|
||||
speculative_config: Optional[SpeculativeConfig] = None
|
||||
if num_speculative_tokens is not None:
|
||||
speculative_config = SpeculativeConfig(
|
||||
model="ngram", num_speculative_tokens=num_speculative_tokens)
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
scheduler_config=scheduler_config,
|
||||
model_config=model_config,
|
||||
cache_config=cache_config,
|
||||
kv_transfer_config=kv_transfer_config,
|
||||
speculative_config=speculative_config,
|
||||
)
|
||||
kv_cache_config = KVCacheConfig(
|
||||
num_blocks=num_blocks, # A large number of blocks to hold all requests
|
||||
kv_cache_tensors=[],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(['layer'],
|
||||
FullAttentionSpec(block_size, 1, 1, torch.float32,
|
||||
False))
|
||||
],
|
||||
)
|
||||
cache_config.num_gpu_blocks = num_blocks
|
||||
return Scheduler(
|
||||
vllm_config=vllm_config,
|
||||
kv_cache_config=kv_cache_config,
|
||||
log_stats=True,
|
||||
structured_output_manager=StructuredOutputManager(vllm_config),
|
||||
)
|
||||
|
||||
|
||||
def create_requests(num_requests: int,
|
||||
num_tokens: int = 10,
|
||||
mm_positions: Optional[list[PlaceholderRange]] = None,
|
||||
max_tokens: int = 16,
|
||||
stop_token_ids: Optional[list[int]] = None,
|
||||
prompt_logprobs: Optional[int] = None):
|
||||
sampling_params = SamplingParams(ignore_eos=False,
|
||||
max_tokens=max_tokens,
|
||||
stop_token_ids=stop_token_ids,
|
||||
prompt_logprobs=prompt_logprobs)
|
||||
requests = []
|
||||
for i in range(num_requests):
|
||||
if mm_positions is not None:
|
||||
mm_position = mm_positions[i]
|
||||
mm_inputs = [MultiModalKwargs({})] * len(mm_position)
|
||||
else:
|
||||
mm_position = None
|
||||
mm_inputs = None
|
||||
request = Request(
|
||||
request_id=f"{i}",
|
||||
prompt_token_ids=[i] * num_tokens,
|
||||
sampling_params=sampling_params,
|
||||
pooling_params=None,
|
||||
multi_modal_inputs=mm_inputs,
|
||||
multi_modal_placeholders=mm_position,
|
||||
multi_modal_hashes=None,
|
||||
eos_token_id=EOS_TOKEN_ID,
|
||||
)
|
||||
requests.append(request)
|
||||
return requests
|
||||
from .utils import EOS_TOKEN_ID, create_requests, create_scheduler
|
||||
|
||||
|
||||
def test_add_requests():
|
||||
|
||||
152 tests/v1/core/utils.py Normal file
@ -0,0 +1,152 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
|
||||
SchedulerConfig, SpeculativeConfig, VllmConfig)
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.core.sched.async_scheduler import AsyncScheduler
|
||||
from vllm.v1.core.sched.scheduler import Scheduler
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheGroupSpec)
|
||||
from vllm.v1.request import Request
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
|
||||
EOS_TOKEN_ID = 50256
|
||||
|
||||
|
||||
def create_scheduler(
|
||||
model: str = "facebook/opt-125m",
|
||||
max_num_seqs: int = 16,
|
||||
max_num_batched_tokens: int = 8192,
|
||||
enable_prefix_caching: Optional[bool] = None,
|
||||
long_prefill_token_threshold: int = 0,
|
||||
disable_chunked_mm_input: bool = False,
|
||||
use_kv_connector: bool = False,
|
||||
num_blocks: int = 10000,
|
||||
block_size: int = 16,
|
||||
max_model_len: Optional[int] = None,
|
||||
num_speculative_tokens: Optional[int] = None,
|
||||
skip_tokenizer_init: bool = False,
|
||||
async_scheduling: bool = False,
|
||||
) -> Union[Scheduler, AsyncScheduler]:
|
||||
'''Create scheduler under test.
|
||||
|
||||
Args:
|
||||
model: model under test
|
||||
max_num_seqs: max sequences to schedule
|
||||
max_num_batch_tokens: max num tokens to batch
|
||||
enable_prefix_caching: optionally force APC config
|
||||
(True/False) or use default
|
||||
(None)
|
||||
|
||||
Returns:
|
||||
{class}`Scheduler` instance
|
||||
'''
|
||||
if max_model_len is None:
|
||||
max_model_len = max_num_batched_tokens
|
||||
scheduler_config = SchedulerConfig(
|
||||
max_num_seqs=max_num_seqs,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
max_model_len=max_model_len,
|
||||
long_prefill_token_threshold=long_prefill_token_threshold,
|
||||
disable_chunked_mm_input=disable_chunked_mm_input,
|
||||
enable_chunked_prefill=True,
|
||||
async_scheduling=async_scheduling,
|
||||
)
|
||||
model_config = ModelConfig(
|
||||
model=model,
|
||||
task="auto",
|
||||
tokenizer=model,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="float16",
|
||||
seed=42,
|
||||
skip_tokenizer_init=skip_tokenizer_init,
|
||||
)
|
||||
# Cache config, optionally force APC
|
||||
kwargs_cache = ({} if enable_prefix_caching is None else {
|
||||
'enable_prefix_caching': enable_prefix_caching
|
||||
})
|
||||
cache_config = CacheConfig(
|
||||
block_size=block_size,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
**kwargs_cache,
|
||||
)
|
||||
kv_transfer_config = KVTransferConfig(
|
||||
kv_connector="SharedStorageConnector",
|
||||
kv_role="kv_both",
|
||||
kv_connector_extra_config={"shared_storage_path": "local_storage"},
|
||||
) if use_kv_connector else None
|
||||
|
||||
speculative_config: Optional[SpeculativeConfig] = None
|
||||
if num_speculative_tokens is not None:
|
||||
speculative_config = SpeculativeConfig(
|
||||
model="ngram", num_speculative_tokens=num_speculative_tokens)
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
scheduler_config=scheduler_config,
|
||||
model_config=model_config,
|
||||
cache_config=cache_config,
|
||||
kv_transfer_config=kv_transfer_config,
|
||||
speculative_config=speculative_config,
|
||||
)
|
||||
kv_cache_config = KVCacheConfig(
|
||||
num_blocks=num_blocks, # A large number of blocks to hold all requests
|
||||
kv_cache_tensors=[],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(['layer'],
|
||||
FullAttentionSpec(block_size, 1, 1, torch.float32,
|
||||
False))
|
||||
],
|
||||
)
|
||||
cache_config.num_gpu_blocks = num_blocks
|
||||
scheduler_cls = AsyncScheduler if async_scheduling else Scheduler
|
||||
return scheduler_cls(
|
||||
vllm_config=vllm_config,
|
||||
kv_cache_config=kv_cache_config,
|
||||
log_stats=True,
|
||||
structured_output_manager=StructuredOutputManager(vllm_config),
|
||||
)
|
||||
|
||||
|
||||
def create_requests(
|
||||
num_requests: int,
|
||||
num_tokens: int = 10,
|
||||
mm_positions: Optional[list[PlaceholderRange]] = None,
|
||||
max_tokens: int = 16,
|
||||
stop_token_ids: Optional[list[int]] = None,
|
||||
prompt_logprobs: Optional[int] = None,
|
||||
same_prompt: bool = False,
|
||||
) -> list[Request]:
|
||||
sampling_params = SamplingParams(ignore_eos=False,
|
||||
max_tokens=max_tokens,
|
||||
stop_token_ids=stop_token_ids,
|
||||
prompt_logprobs=prompt_logprobs)
|
||||
requests = []
|
||||
for i in range(num_requests):
|
||||
if mm_positions is not None:
|
||||
mm_position = mm_positions[i]
|
||||
mm_inputs = [MultiModalKwargs({})] * len(mm_position)
|
||||
else:
|
||||
mm_position = None
|
||||
mm_inputs = None
|
||||
prompt_token_ids = ([0] * num_tokens if same_prompt else [i] *
|
||||
num_tokens)
|
||||
request = Request(
|
||||
request_id=f"{i}",
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
sampling_params=sampling_params,
|
||||
pooling_params=None,
|
||||
multi_modal_inputs=mm_inputs,
|
||||
multi_modal_placeholders=mm_position,
|
||||
multi_modal_hashes=None,
|
||||
eos_token_id=EOS_TOKEN_ID,
|
||||
)
|
||||
requests.append(request)
|
||||
return requests
|
||||
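The shared helpers above replace the per-file copies of `create_scheduler` and `create_requests`. A hedged usage sketch from a test's point of view follows; the `add_request`/`schedule` calls assume the v1 `Scheduler` API that the scheduler tests exercise elsewhere.

```python
# Sketch of a test using the new shared helpers (relative import as in the
# scheduler test above).
from .utils import create_requests, create_scheduler

def test_example_scheduling():
    scheduler = create_scheduler(enable_prefix_caching=True)
    requests = create_requests(num_requests=4, num_tokens=32, same_prompt=True)
    for request in requests:
        scheduler.add_request(request)
    # ... then drive scheduler.schedule() and assert on the scheduled tokens
```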
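The `same_prompt` flag makes every request share identical prompt token IDs, which is useful for prefix-caching assertions; leaving it `False` keeps the prompts distinct per request, as before.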
@ -6,8 +6,10 @@ import random
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@ -53,14 +55,6 @@ def model_name():
|
||||
return "meta-llama/Llama-3.1-8B-Instruct"
|
||||
|
||||
|
||||
def eagle_model_name():
|
||||
return "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
|
||||
|
||||
|
||||
def eagle3_model_name():
|
||||
return "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
|
||||
|
||||
|
||||
def test_ngram_correctness(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
test_prompts: list[list[dict[str, Any]]],
|
||||
@ -77,6 +71,8 @@ def test_ngram_correctness(
|
||||
ref_llm = LLM(model=model_name, max_model_len=1024)
|
||||
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
|
||||
del ref_llm
|
||||
torch.cuda.empty_cache()
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
spec_llm = LLM(
|
||||
model=model_name,
|
||||
@ -103,34 +99,50 @@ def test_ngram_correctness(
|
||||
# Upon failure, inspect the outputs to check for inaccuracy.
|
||||
assert matches > int(0.7 * len(ref_outputs))
|
||||
del spec_llm
|
||||
torch.cuda.empty_cache()
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
|
||||
@pytest.mark.parametrize("model_setup", [
|
||||
("eagle", "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1),
|
||||
("eagle3", "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1),
|
||||
pytest.param(
|
||||
("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
||||
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
|
||||
marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
|
||||
],
|
||||
ids=["llama3_eagle", "llama3_eagle3", "llama4_eagle"])
|
||||
def test_eagle_correctness(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
test_prompts: list[list[dict[str, Any]]],
|
||||
sampling_config: SamplingParams,
|
||||
model_name: str,
|
||||
use_eagle3: bool,
|
||||
model_setup: tuple[str, str, str, int],
|
||||
):
|
||||
'''
|
||||
Compare the outputs of an original LLM and a speculative LLM;
they should be the same when using eagle speculative decoding.
|
||||
model_setup: (method, model_name, eagle_model_name, tp_size)
|
||||
'''
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
method, model_name, spec_model_name, tp_size = model_setup
|
||||
|
||||
ref_llm = LLM(model=model_name, max_model_len=2048)
|
||||
ref_llm = LLM(model=model_name,
|
||||
max_model_len=2048,
|
||||
tensor_parallel_size=tp_size)
|
||||
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
|
||||
del ref_llm
|
||||
torch.cuda.empty_cache()
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
spec_model_name = eagle3_model_name(
|
||||
) if use_eagle3 else eagle_model_name()
|
||||
spec_llm = LLM(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
tensor_parallel_size=tp_size,
|
||||
speculative_config={
|
||||
"method": "eagle3" if use_eagle3 else "eagle",
|
||||
"method": method,
|
||||
"model": spec_model_name,
|
||||
"num_speculative_tokens": 3,
|
||||
"max_model_len": 2048,
|
||||
@ -152,3 +164,5 @@ def test_eagle_correctness(
|
||||
# Upon failure, inspect the outputs to check for inaccuracy.
|
||||
assert matches > int(0.66 * len(ref_outputs))
|
||||
del spec_llm
|
||||
torch.cuda.empty_cache()
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
166 tests/v1/entrypoints/openai/responses/test_image.py Normal file
@ -0,0 +1,166 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.multimodal.utils import encode_image_base64, fetch_image
|
||||
|
||||
# Use a small vision model for testing
|
||||
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
MAXIMUM_IMAGES = 2
|
||||
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
|
||||
TEST_IMAGE_URLS = [
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def default_image_server_args():
|
||||
return [
|
||||
"--enforce-eager",
|
||||
"--max-model-len",
|
||||
"6000",
|
||||
"--max-num-seqs",
|
||||
"128",
|
||||
"--limit-mm-per-prompt",
|
||||
json.dumps({"image": MAXIMUM_IMAGES}),
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def image_server(default_image_server_args):
|
||||
with RemoteOpenAIServer(MODEL_NAME,
|
||||
default_image_server_args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(image_server):
|
||||
async with image_server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def base64_encoded_image() -> dict[str, str]:
|
||||
return {
|
||||
image_url: encode_image_base64(fetch_image(image_url))
|
||||
for image_url in TEST_IMAGE_URLS
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
|
||||
async def test_single_chat_session_image(client: openai.AsyncOpenAI,
|
||||
model_name: str, image_url: str):
|
||||
content_text = "What's in this image?"
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "input_image",
|
||||
"image_url": image_url,
|
||||
"detail": "auto",
|
||||
},
|
||||
{
|
||||
"type": "input_text",
|
||||
"text": content_text
|
||||
},
|
||||
],
|
||||
}]
|
||||
|
||||
# test image url
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=messages,
|
||||
)
|
||||
assert len(response.output_text) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
|
||||
async def test_single_chat_session_image_base64encoded(
|
||||
client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
image_url: str,
|
||||
base64_encoded_image: dict[str, str],
|
||||
):
|
||||
content_text = "What's in this image?"
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "input_image",
|
||||
"image_url":
|
||||
f"data:image/jpeg;base64,{base64_encoded_image[image_url]}",
|
||||
"detail": "auto",
|
||||
},
|
||||
{
|
||||
"type": "input_text",
|
||||
"text": content_text
|
||||
},
|
||||
],
|
||||
}]
|
||||
# test image base64
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=messages,
|
||||
)
|
||||
assert len(response.output_text) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize(
|
||||
"image_urls",
|
||||
[TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
|
||||
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
|
||||
image_urls: list[str]):
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
*({
|
||||
"type": "input_image",
|
||||
"image_url": image_url,
|
||||
"detail": "auto",
|
||||
} for image_url in image_urls),
|
||||
{
|
||||
"type": "input_text",
|
||||
"text": "What's in this image?"
|
||||
},
|
||||
],
|
||||
}]
|
||||
|
||||
if len(image_urls) > MAXIMUM_IMAGES:
|
||||
with pytest.raises(openai.BadRequestError): # test multi-image input
|
||||
await client.responses.create(
|
||||
model=model_name,
|
||||
input=messages,
|
||||
)
|
||||
# the server should still work afterwards
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=[{
|
||||
"role": "user",
|
||||
"content": "What's the weather like in Paris today?",
|
||||
}],
|
||||
)
|
||||
assert len(response.output_text) > 0
|
||||
else:
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=messages,
|
||||
)
|
||||
assert len(response.output_text) > 0
|
||||
@ -17,7 +17,7 @@ MODEL_NAME = "ibm-research/PowerMoE-3b"
|
||||
|
||||
# Number of data parallel ranks for external LB testing
|
||||
DP_SIZE = int(os.getenv("DP_SIZE", "2"))
|
||||
# Default tensor parallell size to use
|
||||
# Default tensor parallel size to use
|
||||
TP_SIZE = int(os.getenv("TP_SIZE", "1"))
|
||||
|
||||
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.v1.utils import bind_kv_cache
|
||||
from vllm.v1.worker.utils import bind_kv_cache
|
||||
|
||||
|
||||
def test_bind_kv_cache():
|
||||
|
||||
@ -145,3 +145,35 @@ def test_gemma3_27b_with_text_input_and_tp(
|
||||
for output, answer in zip(vllm_outputs, answers):
|
||||
generated_text = output[1]
|
||||
assert answer in generated_text
|
||||
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_tpu(),
|
||||
reason="This is a basic test for TPU only")
|
||||
def test_w8a8_quantization(
|
||||
vllm_runner: type[VllmRunner],
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
|
||||
max_tokens = 5
|
||||
tensor_parallel_size = 1
|
||||
max_num_seqs = 4
|
||||
|
||||
prompt = "The next numbers of the sequence " + ", ".join(
|
||||
str(i) for i in range(1024)) + " are:"
|
||||
example_prompts = [prompt]
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_num_batched_tokens=64,
|
||||
max_model_len=4096,
|
||||
gpu_memory_utilization=0.7,
|
||||
max_num_seqs=max_num_seqs,
|
||||
tensor_parallel_size=tensor_parallel_size) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts,
|
||||
max_tokens)
|
||||
output = vllm_outputs[0][1]
|
||||
|
||||
assert "1024" in output or "0, 1" in output
|
||||
|
||||
@ -434,16 +434,28 @@ def test_kv_cache_stride_order(monkeypatch, model_runner):
|
||||
assert all(not kv.is_contiguous() for kv in model_runner.kv_caches)
|
||||
|
||||
|
||||
def test_update_config(model_runner):
|
||||
# Simple update
|
||||
model_runner.update_config({"load_config": {"load_format": "dummy"}})
|
||||
assert model_runner.load_config.load_format == "dummy"
|
||||
# Raise error on non-existing config
|
||||
with pytest.raises(AssertionError):
|
||||
model_runner.update_config({"do_not_exist_config": "dummy"})
|
||||
|
||||
|
||||
def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2):
|
||||
# In this test, model_runner loads model + weights in one go, while
|
||||
# model_runner_2 loads dummy weights first then load real weights inplace
|
||||
model_runner.load_model()
|
||||
original_load_format = model_runner_2.load_config.load_format
|
||||
model_runner_2.load_config.load_format = "dummy"
|
||||
model_runner_2.update_config({"load_config": {"load_format": "dummy"}})
|
||||
model_runner_2.load_model() # Initial model loading with dummy weights
|
||||
assert str(model_runner.get_model().state_dict()) != str(
|
||||
model_runner_2.get_model().state_dict())
|
||||
model_runner_2.load_config.load_format = original_load_format
|
||||
model_runner_2.update_config(
|
||||
{"load_config": {
|
||||
"load_format": original_load_format
|
||||
}})
|
||||
model_runner_2.load_model() # Load real weights inplace
|
||||
assert str(model_runner.get_model().state_dict()) == str(
|
||||
model_runner_2.get_model().state_dict())
|
||||
|
||||
@ -1,36 +0,0 @@
|
||||
# set this on your machine
|
||||
vllm-directory := "/home/rshaw/vllm/"
|
||||
|
||||
launch_dp_ep MODEL SIZE:
|
||||
VLLM_ALL2ALL_BACKEND="pplx" vllm serve {{MODEL}} --data-parallel-size {{SIZE}} --enable-expert-parallel --disable-log-requests --max-model-len 32000 --enforce-eager
|
||||
|
||||
launch_tp MODEL SIZE:
|
||||
vllm serve {{MODEL}} --tensor-parallel-size {{SIZE}} --disable-log-requests --max-model-len 32000
|
||||
|
||||
eval MODEL:
|
||||
lm_eval --model local-completions --tasks gsm8k \
|
||||
--model_args model={{MODEL}},base_url=http://127.0.0.1:8000/v1/completions,num_concurrent=100,tokenized_requests=False
|
||||
|
||||
benchmark MODEL NUM_PROMPTS:
|
||||
python {{vllm-directory}}/benchmarks/benchmark_serving.py \
|
||||
--model {{MODEL}} \
|
||||
--dataset-name random \
|
||||
--random-input-len 1000 \
|
||||
--random-output-len 100 \
|
||||
--num-prompts {{NUM_PROMPTS}} \
|
||||
--percentile-metrics ttft,tpot,itl,e2el \
|
||||
--metric-percentiles 90,95,99 \
|
||||
--ignore-eos \
|
||||
--seed $(date +%s)
|
||||
|
||||
benchmark_all_decode MODEL NUM_PROMPTS:
|
||||
python {{vllm-directory}}/benchmarks/benchmark_serving.py \
|
||||
--model {{MODEL}} \
|
||||
--dataset-name random \
|
||||
--random-input-len 1 \
|
||||
--random-output-len 1000 \
|
||||
--num-prompts {{NUM_PROMPTS}} \
|
||||
--percentile-metrics ttft,tpot,itl,e2el \
|
||||
--metric-percentiles 90,95,99 \
|
||||
--ignore-eos \
|
||||
--seed $(date +%s)
|
||||
179 typos.toml
@ -1,179 +0,0 @@
|
||||
[files]
|
||||
# these files may be written in non english words
|
||||
extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
|
||||
"benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
|
||||
"vllm/third_party/*"]
|
||||
ignore-hidden = true
|
||||
ignore-files = true
|
||||
ignore-dot = true
|
||||
ignore-vcs = true
|
||||
ignore-global = true
|
||||
ignore-parent = true
|
||||
|
||||
[default]
|
||||
binary = false
|
||||
check-filename = false
|
||||
check-file = true
|
||||
unicode = true
|
||||
ignore-hex = true
|
||||
identifier-leading-digits = false
|
||||
locale = "en"
|
||||
extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
|
||||
".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*", ".*fo.*", ".*ba.*",
|
||||
".*ot.*", ".*[Tt]h[rR].*"]
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[default.extend-identifiers]
|
||||
bbc5b7ede = "bbc5b7ede"
|
||||
womens_doubles = "womens_doubles"
|
||||
v_2nd = "v_2nd"
|
||||
splitted_input = "splitted_input"
|
||||
NOOPs = "NOOPs"
|
||||
typ = "typ"
|
||||
nin_shortcut = "nin_shortcut"
|
||||
UperNetDecoder = "UperNetDecoder"
|
||||
subtile = "subtile"
|
||||
cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
|
||||
SFOuput = "SFOuput"
|
||||
# huggingface transformers repo uses these words
|
||||
depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
|
||||
DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
|
||||
depthwise_seperable_CNN = "depthwise_seperable_CNN"
|
||||
|
||||
[default.extend-words]
|
||||
iy = "iy"
|
||||
tendencias = "tendencias"
|
||||
# intel cpu features
|
||||
tme = "tme"
|
||||
dout = "dout"
|
||||
Pn = "Pn"
|
||||
arange = "arange"
|
||||
|
||||
[type.py]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.py.extend-identifiers]
|
||||
arange = "arange"
|
||||
NDArray = "NDArray"
|
||||
EOFError = "EOFError"
|
||||
|
||||
[type.py.extend-words]
|
||||
|
||||
[type.cpp]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.cpp.extend-identifiers]
|
||||
countr_one = "countr_one"
|
||||
|
||||
[type.cpp.extend-words]
|
||||
|
||||
[type.rust]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.rust.extend-identifiers]
|
||||
flate2 = "flate2"
|
||||
|
||||
[type.rust.extend-words]
|
||||
ser = "ser"
|
||||
|
||||
[type.lock]
|
||||
extend-glob = []
|
||||
check-file = false
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.lock.extend-identifiers]
|
||||
|
||||
[type.lock.extend-words]
|
||||
|
||||
[type.jl]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.jl.extend-identifiers]
|
||||
|
||||
[type.jl.extend-words]
|
||||
modul = "modul"
|
||||
egals = "egals"
|
||||
usig = "usig"
|
||||
egal = "egal"
|
||||
|
||||
[type.go]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.go.extend-identifiers]
|
||||
flate = "flate"
|
||||
|
||||
[type.go.extend-words]
|
||||
|
||||
[type.css]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.css.extend-identifiers]
|
||||
nd = "nd"
|
||||
|
||||
[type.css.extend-words]
|
||||
|
||||
[type.man]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.man.extend-identifiers]
|
||||
Nd = "Nd"
|
||||
|
||||
[type.man.extend-words]
|
||||
|
||||
[type.cert]
|
||||
extend-glob = []
|
||||
check-file = false
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.cert.extend-identifiers]
|
||||
|
||||
[type.cert.extend-words]
|
||||
|
||||
[type.sh]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.sh.extend-identifiers]
|
||||
stap = "stap"
|
||||
ot = "ot"
|
||||
|
||||
[type.sh.extend-words]
|
||||
|
||||
[type.vimscript]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.vimscript.extend-identifiers]
|
||||
windo = "windo"
|
||||
|
||||
[type.vimscript.extend-words]
|
||||
@ -1843,6 +1843,26 @@ def cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor,
|
||||
return out
|
||||
|
||||
|
||||
def sm100_cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor,
|
||||
q_pe: torch.Tensor,
|
||||
kv_c_and_k_pe_cache: torch.Tensor,
|
||||
seq_lens: torch.Tensor, page_table: torch.Tensor,
|
||||
workspace: torch.Tensor, scale: float,
|
||||
num_kv_splits: int) -> torch.Tensor:
|
||||
torch.ops._C.sm100_cutlass_mla_decode(out, q_nope, q_pe,
|
||||
kv_c_and_k_pe_cache, seq_lens,
|
||||
page_table, workspace, scale,
|
||||
num_kv_splits)
|
||||
return out
|
||||
|
||||
|
||||
def sm100_cutlass_mla_get_workspace_size(max_seq_len: int, num_batches: int,
|
||||
sm_count: int,
|
||||
num_kv_splits: int) -> int:
|
||||
return torch.ops._C.sm100_cutlass_mla_get_workspace_size(
|
||||
max_seq_len, num_batches, sm_count, num_kv_splits)
|
||||
|
||||
|
||||
if hasattr(torch.ops._C, "weight_packed_linear"):
|
||||
|
||||
@register_fake("_C::weight_packed_linear")
|
||||
|
||||
@ -961,7 +961,7 @@ class DifferentialFlashAttentionImpl(AttentionImpl):
|
||||
"... H (two D) -> ... (H two) D",
|
||||
two=2)
|
||||
|
||||
else: # re-use the kv cache, full attention
|
||||
else: # reuse the kv cache, full attention
|
||||
q = q.view(-1, self.num_heads, self.head_size)
|
||||
q1, q2 = self.split_heads(q)
|
||||
# kv_cache shape is (2, num_blocks, block_size, num_kv_heads, head_size) # noqa: E501
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from functools import cache
|
||||
from typing import Generator, Optional, Union
|
||||
|
||||
@ -79,31 +80,61 @@ def get_global_forced_attn_backend() -> Optional[_Backend]:
|
||||
return forced_attn_backend
|
||||
|
||||
|
||||
def supports_head_size(
|
||||
@dataclass(frozen=True)
|
||||
class _IsSupported:
|
||||
can_import: bool
|
||||
head_size: bool
|
||||
dtype: bool
|
||||
|
||||
def __bool__(self) -> bool:
|
||||
return self.can_import and self.head_size and self.dtype
|
||||
|
||||
|
||||
def is_attn_backend_supported(
|
||||
attn_backend: Union[str, type[AttentionBackend]],
|
||||
head_size: int,
|
||||
) -> bool:
|
||||
dtype: torch.dtype,
|
||||
*,
|
||||
allow_import_error: bool = True,
|
||||
) -> _IsSupported:
|
||||
if isinstance(attn_backend, str):
|
||||
try:
|
||||
attn_backend = resolve_obj_by_qualname(attn_backend)
|
||||
except ImportError:
|
||||
return False
|
||||
if not allow_import_error:
|
||||
raise
|
||||
|
||||
return _IsSupported(can_import=False, head_size=False, dtype=False)
|
||||
|
||||
assert isinstance(attn_backend, type)
|
||||
|
||||
# TODO: Update the interface once V0 is removed
|
||||
if get_supported_head_sizes := getattr(attn_backend,
|
||||
"get_supported_head_sizes", None):
|
||||
return head_size in get_supported_head_sizes()
|
||||
if validate_head_size := getattr(attn_backend, "validate_head_size", None):
|
||||
is_head_size_supported = head_size in get_supported_head_sizes()
|
||||
elif validate_head_size := getattr(attn_backend, "validate_head_size",
|
||||
None):
|
||||
try:
|
||||
validate_head_size(head_size)
|
||||
return True
|
||||
is_head_size_supported = True
|
||||
except Exception:
|
||||
return False
|
||||
is_head_size_supported = False
|
||||
else:
|
||||
raise NotImplementedError(f"{attn_backend.__name__} does not support "
|
||||
"head size validation")
|
||||
|
||||
raise NotImplementedError(f"{attn_backend.__name__} does not support "
|
||||
"head size validation")
|
||||
if get_supported_dtypes := getattr(attn_backend, "get_supported_dtypes",
|
||||
None):
|
||||
is_dtype_supported = dtype in get_supported_dtypes()
|
||||
else:
|
||||
raise NotImplementedError(f"{attn_backend.__name__} does not support "
|
||||
"dtype validation")
|
||||
|
||||
return _IsSupported(
|
||||
can_import=True,
|
||||
head_size=is_head_size_supported,
|
||||
dtype=is_dtype_supported,
|
||||
)
|
||||
|
||||
|
||||
def get_attn_backend(
|
||||
|
||||
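The refactor above returns a richer `_IsSupported` record instead of a bare bool, so callers can distinguish an import failure from an unsupported head size or dtype. A usage sketch follows; the backend qualified name is illustrative only, and the import path for `is_attn_backend_supported` is assumed to be the attention selector module.

```python
# Sketch: consuming the structured support check.
import torch
from vllm.attention.selector import is_attn_backend_supported  # path assumed

support = is_attn_backend_supported(
    "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend",  # illustrative
    head_size=128,
    dtype=torch.bfloat16,
)
if not support:  # __bool__ ANDs can_import, head_size and dtype
    if not support.can_import:
        reason = "backend import failed"
    elif not support.head_size:
        reason = "head size not supported"
    else:
        reason = "dtype not supported"
    print(f"falling back to another backend: {reason}")
```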
0 vllm/attention/utils/__init__.py Normal file
@ -654,6 +654,9 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
|
||||
elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
|
||||
dataset_class = ASRDataset
|
||||
args.hf_split = "train"
|
||||
elif args.dataset_path in MLPerfDataset.SUPPORTED_DATASET_PATHS:
|
||||
dataset_class = MLPerfDataset
|
||||
args.hf_split = "train"
|
||||
else:
|
||||
supported_datasets = set([
|
||||
dataset_name for cls in HuggingFaceDataset.__subclasses__()
|
||||
@ -1447,3 +1450,82 @@ class ASRDataset(HuggingFaceDataset):
|
||||
)
|
||||
self.maybe_oversample_requests(sampled_requests, num_requests)
|
||||
return sampled_requests
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# MLPerf Dataset Implementation
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class MLPerfDataset(HuggingFaceDataset):
|
||||
"""
|
||||
MLPerf Inference Dataset.
|
||||
|
||||
Dataset on HF:
|
||||
https://huggingface.co/datasets/mgoin/mlperf-inference-llama2-data
|
||||
https://huggingface.co/datasets/mgoin/mlperf-inference-llama3.1-data
|
||||
|
||||
Each record contains:
|
||||
- "system_prompt": system role instruction.
|
||||
- "question": user question.
|
||||
- "output": reference answer.
|
||||
|
||||
We combine the system prompt and question into a chat-formatted prompt
|
||||
(using the tokenizer's chat template) and set the expected output length to
|
||||
the tokenized length of the provided reference answer.
|
||||
"""
|
||||
|
||||
SUPPORTED_DATASET_PATHS = {
|
||||
"mgoin/mlperf-inference-llama2-data",
|
||||
"mgoin/mlperf-inference-llama3.1-data",
|
||||
}
|
||||
|
||||
def sample(
|
||||
self,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
num_requests: int,
|
||||
output_len: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> list[SampleRequest]:
|
||||
# Force dynamic output length based on reference completion.
|
||||
dynamic_output = output_len is None
|
||||
sampled_requests: list[SampleRequest] = []
|
||||
|
||||
for item in self.data:
|
||||
if len(sampled_requests) >= num_requests:
|
||||
break
|
||||
|
||||
system_prompt = item["system_prompt"]
|
||||
question = item["question"]
|
||||
reference_answer = item["output"]
|
||||
|
||||
# Build chat-style prompt using tokenizer template, if available.
|
||||
messages = [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": question},
|
||||
]
|
||||
prompt_formatted = tokenizer.apply_chat_template(
|
||||
messages, add_generation_prompt=True, tokenize=False
|
||||
)
|
||||
prompt_len = len(tokenizer(prompt_formatted).input_ids)
|
||||
|
||||
# Determine output length from reference answer tokens.
|
||||
ref_out_len = len(
|
||||
tokenizer(reference_answer, add_special_tokens=False).input_ids
|
||||
)
|
||||
expected_output_len = ref_out_len if dynamic_output else output_len
|
||||
|
||||
# Validate sequence lengths.
|
||||
if not is_valid_sequence(prompt_len, expected_output_len):
|
||||
continue
|
||||
|
||||
sampled_requests.append(
|
||||
SampleRequest(
|
||||
prompt=prompt_formatted,
|
||||
prompt_len=prompt_len,
|
||||
expected_output_len=expected_output_len,
|
||||
)
|
||||
)
|
||||
|
||||
self.maybe_oversample_requests(sampled_requests, num_requests)
|
||||
return sampled_requests
|
||||
|
||||
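For context, here is a hedged sketch of driving the new dataset class directly; the constructor arguments follow the `HuggingFaceDataset` pattern shown elsewhere in the benchmark suite and may not match the real signature exactly.

```python
# Sketch only: sampling a few MLPerf requests outside the benchmark CLI.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
dataset = MLPerfDataset(                      # class defined in the hunk above
    dataset_path="mgoin/mlperf-inference-llama3.1-data",  # a SUPPORTED_DATASET_PATHS entry
    dataset_split="train",                    # split forced to "train" by get_samples
)
requests = dataset.sample(tokenizer=tokenizer, num_requests=8)
for req in requests:
    print(req.prompt_len, req.expected_output_len)
```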
@ -96,25 +96,30 @@ DEFAULT_PIP_PATTERNS = {
|
||||
def run(command):
|
||||
"""Return (return-code, stdout, stderr)."""
|
||||
shell = True if type(command) is str else False
|
||||
p = subprocess.Popen(command,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
shell=shell)
|
||||
raw_output, raw_err = p.communicate()
|
||||
rc = p.returncode
|
||||
if get_platform() == 'win32':
|
||||
enc = 'oem'
|
||||
else:
|
||||
enc = locale.getpreferredencoding()
|
||||
output = raw_output.decode(enc)
|
||||
if command == 'nvidia-smi topo -m':
|
||||
# don't remove the leading whitespace of `nvidia-smi topo -m`
|
||||
# because they are meaningful
|
||||
output = output.rstrip()
|
||||
else:
|
||||
output = output.strip()
|
||||
err = raw_err.decode(enc)
|
||||
return rc, output, err.strip()
|
||||
try:
|
||||
p = subprocess.Popen(command,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
shell=shell)
|
||||
raw_output, raw_err = p.communicate()
|
||||
rc = p.returncode
|
||||
if get_platform() == 'win32':
|
||||
enc = 'oem'
|
||||
else:
|
||||
enc = locale.getpreferredencoding()
|
||||
output = raw_output.decode(enc)
|
||||
if command == 'nvidia-smi topo -m':
|
||||
# don't remove the leading whitespace of `nvidia-smi topo -m`
|
||||
# because they are meaningful
|
||||
output = output.rstrip()
|
||||
else:
|
||||
output = output.strip()
|
||||
err = raw_err.decode(enc)
|
||||
return rc, output, err.strip()
|
||||
|
||||
except FileNotFoundError:
|
||||
cmd_str = command if isinstance(command, str) else command[0]
|
||||
return 127, '', f"Command not found: {cmd_str}"
|
||||
|
||||
|
||||
def run_and_read_all(run_lambda, command):
|
||||
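The try/except above lets `collect_env` degrade gracefully when a probed tool is absent instead of crashing with `FileNotFoundError`. The expected behaviour, sketched with a deliberately fake command name:

```python
# Sketch: a missing binary now reports a 127 return code instead of raising.
rc, out, err = run("definitely-not-a-real-command")
assert rc == 127 and out == ""
assert err.startswith("Command not found")
```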
@ -148,7 +153,7 @@ def get_conda_packages(run_lambda, patterns=None):
|
||||
if patterns is None:
|
||||
patterns = DEFAULT_CONDA_PATTERNS
|
||||
conda = os.environ.get('CONDA_EXE', 'conda')
|
||||
out = run_and_read_all(run_lambda, "{} list".format(conda))
|
||||
out = run_and_read_all(run_lambda, [conda, 'list'])
|
||||
if out is None:
|
||||
return out
|
||||
|
||||
|
||||
@ -183,9 +183,10 @@ class CompilerManager:
|
||||
assert compiled_graph is not None, "Failed to compile the graph"
|
||||
|
||||
# store the artifact in the cache
|
||||
if handle is not None:
|
||||
if not envs.VLLM_DISABLE_COMPILE_CACHE and handle is not None:
|
||||
self.cache[(runtime_shape, graph_index,
|
||||
self.compiler.name)] = handle
|
||||
compilation_counter.num_cache_entries_updated += 1
|
||||
self.is_cache_updated = True
|
||||
if graph_index == 0:
|
||||
# adds some info logging for the first graph
|
||||
|
||||
@ -20,10 +20,12 @@ from vllm.utils import direct_register_custom_op
|
||||
from .vllm_inductor_pass import VllmInductorPass
|
||||
|
||||
if find_spec("flashinfer"):
|
||||
import flashinfer.comm as flashinfer_comm
|
||||
|
||||
flashinfer_comm = (flashinfer_comm if hasattr(
|
||||
flashinfer_comm, "trtllm_allreduce_fusion") else None)
|
||||
try:
|
||||
import flashinfer.comm as flashinfer_comm
|
||||
flashinfer_comm = (flashinfer_comm if hasattr(
|
||||
flashinfer_comm, "trtllm_allreduce_fusion") else None)
|
||||
except ImportError:
|
||||
flashinfer_comm = None
|
||||
else:
|
||||
flashinfer_comm = None
|
||||
from vllm.platforms import current_platform
|
||||
@ -395,7 +397,7 @@ class AllReduceFusedAddRMSNormPattern(BasePattern):
|
||||
|
||||
class AllReduceFusionPass(VllmInductorPass):
|
||||
|
||||
def __init__(self, config: VllmConfig, max_token_num: int):
|
||||
def __init__(self, config: VllmConfig):
|
||||
super().__init__(config)
|
||||
self.disabled = True
|
||||
self.tp_size = get_tensor_model_parallel_world_size()
|
||||
@ -411,7 +413,8 @@ class AllReduceFusionPass(VllmInductorPass):
|
||||
use_fp32_lamport = self.model_dtype == torch.float32
|
||||
if flashinfer_comm is None:
|
||||
logger.warning(
|
||||
"Flashinfer is not installed, skipping allreduce fusion pass")
|
||||
"Flashinfer is not installed or comm module not found, "
|
||||
"skipping allreduce fusion pass")
|
||||
return
|
||||
# Check if the world size is supported
|
||||
if self.tp_size not in _FI_MAX_SIZES:
|
||||
@ -426,7 +429,8 @@ class AllReduceFusionPass(VllmInductorPass):
|
||||
flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(
|
||||
tp_rank=rank,
|
||||
tp_size=self.tp_size,
|
||||
max_token_num=max_token_num,
|
||||
max_token_num=config.compilation_config.pass_config.
|
||||
fi_allreduce_fusion_max_token_num,
|
||||
hidden_dim=self.hidden_dim,
|
||||
group=self.group,
|
||||
use_fp32_lamport=use_fp32_lamport,
|
||||
@ -438,7 +442,8 @@ class AllReduceFusionPass(VllmInductorPass):
|
||||
rank=rank,
|
||||
world_size=self.tp_size,
|
||||
use_fp32_lamport=use_fp32_lamport,
|
||||
max_token_num=max_token_num,
|
||||
max_token_num=config.compilation_config.pass_config.
|
||||
fi_allreduce_fusion_max_token_num,
|
||||
)
|
||||
|
||||
for epsilon in [1e-5, 1e-6]:
|
||||
|
||||
Some files were not shown because too many files have changed in this diff.