Compare commits

..

1 Commit

SHA1: ca15f0afe6
Message: ci(Mergify): configuration update
Signed-off-by: Lu Fang <null>
Date: 2025-06-09 15:44:44 +08:00
30 changed files with 359 additions and 437 deletions

.github/mergify.yml vendored (357 changed lines)
View File

@ -1,196 +1,181 @@
pull_request_rules:
- name: label-documentation
description: Automatically apply documentation label
conditions:
- or:
- files~=^[^/]+\.md$
- files~=^docs/
- files~=^examples/
actions:
label:
add:
- documentation
- name: label-ci-build
description: Automatically apply ci/build label
conditions:
- or:
- files~=^\.github/
- files~=\.buildkite/
- files~=^cmake/
- files=CMakeLists.txt
- files~=^docker/Dockerfile
- files~=^requirements.*\.txt
- files=setup.py
actions:
label:
add:
- ci/build
- name: label-frontend
description: Automatically apply frontend label
conditions:
- files~=^vllm/entrypoints/
actions:
label:
add:
- frontend
- name: label-llama
description: Automatically apply llama label
conditions:
- or:
- files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py
- files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
- files~=^vllm/model_executor/models/.*llama.*\.py
- files~=^vllm/transformers_utils/configs/.*llama.*\.py
actions:
label:
add:
- llama
- name: label-multi-modality
description: Automatically apply multi-modality label
conditions:
- or:
- files~=^vllm/multimodal/
- files~=^tests/multimodal/
- files~=^tests/models/multimodal/
- files~=^tests/models/*/audio_language/
- files~=^tests/models/*/vision_language/
- files=tests/models/test_vision.py
actions:
label:
add:
- multi-modality
- name: label-structured-output
description: Automatically apply structured-output label
conditions:
- or:
- files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py
- files=benchmarks/run_structured_output_benchmark.sh
- files=docs/features/structured_outputs.md
- files=examples/offline_inference/structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^vllm/model_executor/guided_decoding/
- files=tests/model_executor/test_guided_processors.py
- files=tests/entrypoints/llm/test_guided_generate.py
- files~=^tests/v1/structured_output/
- files=tests/v1/entrypoints/llm/test_guided_generate.py
- files~=^vllm/v1/structured_output/
actions:
label:
add:
- structured-output
- name: label-speculative-decoding
description: Automatically apply speculative-decoding label
conditions:
- or:
- files~=^vllm/spec_decode/
- files=vllm/model_executor/layers/spec_decode_base_sampler.py
- files~=^tests/spec_decode/
actions:
label:
add:
- speculative-decoding
- name: label-v1
description: Automatically apply v1 label
conditions:
- or:
- files~=^vllm/v1/
- files~=^tests/v1/
actions:
label:
add:
- v1
- name: label-tpu
description: Automatically apply tpu label
# Keep this list in sync with `label-tpu-remove` conditions
conditions:
- or:
- files~=tpu.py
- files~=_tpu
- files~=tpu_
- files~=/tpu/
- files~=pallas
actions:
label:
add:
- tpu
- name: label-tpu-remove
description: Automatically remove tpu label
# Keep this list in sync with `label-tpu` conditions
conditions:
- and:
- -files~=tpu.py
- -files~=_tpu
- -files~=tpu_
- -files~=/tpu/
- -files~=pallas
actions:
label:
remove:
- tpu
- name: label-tool-calling
description: Automatically add tool-calling label
conditions:
- or:
- files~=^tests/tool_use/
- files~=^tests/mistral_tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/features/tool_calling.md
- files~=^examples/tool_chat_*
- files=examples/offline_inference/chat_with_tools.py
- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
- files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
- files=examples/online_serving/openai_chat_completion_client_with_tools.py
actions:
label:
add:
- tool-calling
- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- name: label-llama
description: Automatically apply llama label
conditions:
- or:
- files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py
- files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
- files~=^vllm/model_executor/models/.*llama.*\.py
- files~=^vllm/transformers_utils/configs/.*llama.*\.py
actions:
label:
add:
- llama
- name: label-documentation
description: Automatically apply documentation label
conditions:
- or:
- files~=^[^/]+\.md$
- files~=^docs/
- files~=^examples/
actions:
label:
add:
- documentation
- name: label-ci-build
description: Automatically apply ci/build label
conditions:
- or:
- files~=^\.github/
- files~=\.buildkite/
- files~=^cmake/
- files=CMakeLists.txt
- files~=^docker/Dockerfile
- files~=^requirements.*\.txt
- files=setup.py
actions:
label:
add:
- ci/build
- name: label-frontend
description: Automatically apply frontend label
conditions:
- files~=^vllm/entrypoints/
actions:
label:
add:
- frontend
- name: label-multi-modality
description: Automatically apply multi-modality label
conditions:
- or:
- files~=^vllm/multimodal/
- files~=^tests/multimodal/
- files~=^tests/models/multimodal/
- files~=^tests/models/*/audio_language/
- files~=^tests/models/*/vision_language/
- files=tests/models/test_vision.py
actions:
label:
add:
- multi-modality
- name: label-structured-output
description: Automatically apply structured-output label
conditions:
- or:
- files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py
- files=benchmarks/run_structured_output_benchmark.sh
- files=docs/features/structured_outputs.md
- files=examples/offline_inference/structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^vllm/model_executor/guided_decoding/
- files=tests/model_executor/test_guided_processors.py
- files=tests/entrypoints/llm/test_guided_generate.py
- files~=^tests/v1/structured_output/
- files=tests/v1/entrypoints/llm/test_guided_generate.py
- files~=^vllm/v1/structured_output/
actions:
label:
add:
- structured-output
- name: label-speculative-decoding
description: Automatically apply speculative-decoding label
conditions:
- or:
- files~=^vllm/spec_decode/
- files=vllm/model_executor/layers/spec_decode_base_sampler.py
- files~=^tests/spec_decode/
actions:
label:
add:
- speculative-decoding
- name: label-v1
description: Automatically apply v1 label
conditions:
- or:
- files~=^vllm/v1/
- files~=^tests/v1/
actions:
label:
add:
- v1
- name: label-tpu
description: Automatically apply tpu label
conditions:
- or:
- files~=tpu.py
- files~=_tpu
- files~=tpu_
- files~=/tpu/
- files~=pallas
actions:
label:
add:
- tpu
- name: label-tpu-remove
description: Automatically remove tpu label
conditions:
- and:
- -files~=tpu.py
- -files~=_tpu
- -files~=tpu_
- -files~=/tpu/
- -files~=pallas
actions:
label:
remove:
- tpu
- name: label-tool-calling
description: Automatically add tool-calling label
conditions:
- or:
- files~=^tests/tool_use/
- files~=^tests/mistral_tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/features/tool_calling.md
- files~=^examples/tool_chat_*
- files=examples/offline_inference/chat_with_tools.py
- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
- files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
- files=examples/online_serving/openai_chat_completion_client_with_tools.py
actions:
label:
add:
- tool-calling
- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- conflict
- -closed
actions:
label:
add:
- needs-rebase
comment:
message: |
This pull request has merge conflicts that must be resolved before it can be
merged. Please rebase the PR, @{{author}}.
actions:
label:
add:
- needs-rebase
comment:
message: |
This pull request has merge conflicts that must be resolved before it can be
merged. Please rebase the PR, @{{author}}.
https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
- name: assign reviewer for tensorizer changes
conditions:
https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
- name: assign reviewer for tensorizer changes
conditions:
- files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
- files~=^tests/tensorizer_loader/
actions:
assign:
users:
- "sangstar"
- name: remove 'needs-rebase' label when conflict is resolved
conditions:
actions:
assign:
users:
- sangstar
- name: remove 'needs-rebase' label when conflict is resolved
conditions:
- -conflict
- -closed
actions:
label:
remove:
- needs-rebase
actions:
label:
remove:
- needs-rebase

View File

@ -308,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
# 9.0 for latest bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_ARCHS)
#
@ -454,7 +454,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# kernels for the remaining archs that are not already built for 3x.
# (Build 8.9 for FP8)
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
"7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
@ -684,7 +684,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
# 9.0 for latest bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
#

View File

@ -144,7 +144,7 @@ As a result, we will have the following components when the KV cache manager is
**Running request:** Workflow for the scheduler to schedule a running request with KV cache block allocation:
1. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps:
1. The scheduler calls `kv_cache_manager.append_slots()`. It does the following steps:
1. Compute the number of new required blocks, and return if there are no sufficient blocks to allocate.
2. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on.
3. Append token IDs to the slots in existing blocks as well as the new blocks. If a block is full, we add it to the Cache Block to cache it.
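A minimal sketch of the three steps above, assuming hypothetical names (KVBlock, free_queue, cached_blocks); this illustrates the described workflow and is not the actual kv_cache_manager implementation:

import math
from collections import deque

class KVBlock:
    """Illustrative KV cache block; not the real vLLM data structure."""
    def __init__(self, block_id: int):
        self.block_id = block_id
        self.block_hash = None          # set once the block is full and cached
        self.token_ids: list[int] = []

def allocate_slots(req_blocks: list, num_total_tokens: int,
                   new_token_ids: list, block_size: int,
                   free_queue: deque, cached_blocks: dict):
    # 1. Compute the number of new blocks required; return if there are
    #    not enough free blocks to allocate.
    num_required = math.ceil(num_total_tokens / block_size)
    num_new = num_required - len(req_blocks)
    if num_new > len(free_queue):
        return None
    # 2. Allocate new blocks by popping the heads of the free queue. If a
    #    popped block is still cached, "evict" it so no other request can
    #    reuse it from now on.
    for _ in range(num_new):
        block = free_queue.popleft()
        if block.block_hash is not None:
            cached_blocks.pop(block.block_hash, None)
            block.block_hash = None
        req_blocks.append(block)
    # 3. Append token IDs to the slots in existing blocks as well as the
    #    new blocks; a block that fills up would then be hashed and cached.
    for tok in new_token_ids:
        target = next(b for b in req_blocks if len(b.token_ids) < block_size)
        target.token_ids.append(tok)
    return req_blocks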

View File

@ -31,7 +31,6 @@ refer to the [PyTorch Security
Guide](https://github.com/pytorch/pytorch/security/policy#using-distributed-features).
Key points from the PyTorch security guide:
- PyTorch Distributed features are intended for internal communication only
- They are not built for use in untrusted environments or networks
- No authorization protocol is included for performance reasons

View File

@ -37,7 +37,7 @@ pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL.
compressed-tensors == 0.10.1 # required for compressed-tensors
compressed-tensors == 0.10.0 # required for compressed-tensors
depyf==0.18.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files

View File

@ -8,7 +8,6 @@ import uvicorn
from fastapi.responses import JSONResponse, Response
import vllm.entrypoints.api_server
import vllm.envs as envs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.utils import FlexibleArgumentParser
@ -47,8 +46,9 @@ if __name__ == "__main__":
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
vllm.entrypoints.api_server.engine = engine
uvicorn.run(app,
host=args.host,
port=args.port,
log_level="debug",
timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE)
uvicorn.run(
app,
host=args.host,
port=args.port,
log_level="debug",
timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE)

View File

@ -25,12 +25,6 @@ TOKEN_IDS = [
]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
"""We can run both engines for this test."""
pass
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
@ -110,19 +104,3 @@ def test_multiple_sampling_params(llm: LLM):
# sampling_params is None, default params should be applied
outputs = llm.generate(PROMPTS, sampling_params=None)
assert len(PROMPTS) == len(outputs)
def test_max_model_len():
max_model_len = 20
llm = LLM(
model=MODEL_NAME,
max_model_len=max_model_len,
gpu_memory_utilization=0.10,
enforce_eager=True, # reduce test time
)
sampling_params = SamplingParams(max_tokens=max_model_len + 10)
outputs = llm.generate(PROMPTS, sampling_params)
for output in outputs:
num_total_tokens = len(output.prompt_token_ids) + len(
output.outputs[0].token_ids)
assert num_total_tokens == max_model_len

View File

@ -183,34 +183,6 @@ def test_env(
assert backend.get_name() == expected
@pytest.mark.parametrize("device", ["cpu", "cuda"])
@pytest.mark.parametrize("use_v1", [True, False])
def test_fp32_fallback(
device: str,
use_v1: bool,
monkeypatch: pytest.MonkeyPatch,
):
"""Test attention backend selection with fp32."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
if device == "cpu":
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float32, torch.float32,
16, False)
assert (backend.get_name() == "TORCH_SDPA_VLLM_V1"
if use_v1 else "TORCH_SDPA")
elif device == "cuda":
with patch("vllm.attention.selector.current_platform",
CudaPlatform()):
backend = get_attn_backend(16, torch.float32, torch.float32,
16, False)
assert (backend.get_name() == "FLEX_ATTENTION"
if use_v1 else "XFORMERS")
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
"""Test FlashAttn validation."""
# TODO: When testing for v1, pipe in `use_v1` as an argument to

View File

@ -274,7 +274,7 @@ def pplx_prepare_finalize(pgi: ProcessGroupInfo, dp_size: int, a: torch.Tensor,
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare(
b_a, b_a_scale, expert_num_tokens = prepare_finalize.prepare(
a_chunk,
None,
None,

View File

@ -164,6 +164,11 @@ def mixtral_lora_files():
return snapshot_download(repo_id="SangBinCho/mixtral-lora")
@pytest.fixture(scope="session")
def gemma_lora_files():
return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
@pytest.fixture(scope="session")
def chatglm3_lora_files():
return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider")

View File

@ -4,6 +4,9 @@ import subprocess
import sys
from typing import Union
import pytest
import ray
import vllm
from vllm import LLM
from vllm.lora.request import LoRARequest
@ -118,6 +121,37 @@ def test_llama_lora(sql_lora_files):
generate_and_test(llm, sql_lora_files)
# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
# used by the engine yet.
@pytest.mark.skip_v1
@create_new_process_for_each_test()
def test_llama_lora_warmup(sql_lora_files):
"""Test that the LLM initialization works with a warmup LORA path and
is more conservative"""
@ray.remote(num_gpus=1)
def get_num_gpu_blocks_lora():
llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16)
num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks
return num_gpu_blocks_lora_warmup
@ray.remote(num_gpus=1)
def get_num_gpu_blocks_no_lora():
llm = vllm.LLM(MODEL_PATH, max_num_seqs=16)
num_gpu_blocks_no_lora_warmup = (
llm.llm_engine.cache_config.num_gpu_blocks)
return num_gpu_blocks_no_lora_warmup
num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote())
num_gpu_blocks_no_lora_warmup = ray.get(
get_num_gpu_blocks_no_lora.remote())
assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, (
"The warmup with lora should be more "
"conservative than without lora, therefore the number of "
"memory blocks for the KV cache should be "
"less when using lora than when not using lora")
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_llama_lora_tp4(sql_lora_files):

View File

@ -15,6 +15,13 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf"
LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
LORA_RANK = 8
# @pytest.fixture(autouse=True)
# def v1(run_with_both_engines_lora):
# # Simple autouse wrapper to run both engines for each test
# # This can be promoted up to conftest.py to run for every
# # test in a package
# pass
def make_lora_request(lora_id: int):
return LoRARequest(lora_name=f"{lora_id}",

View File

@ -11,6 +11,14 @@ MODEL_PATH = "microsoft/phi-2"
PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [
PROMPT_TEMPLATE.format(
@ -51,7 +59,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
# Skipping for V1 for now as we are hitting,
# "Head size 80 is not supported by FlashAttention." error.
@pytest.mark.skip(reason="Head size 80 is not supported by FlashAttention")
@pytest.mark.skip_v1
def test_phi2_lora(phi2_lora_files):
# We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
# Otherwise, the lora-test will fail due to CUDA OOM.

View File

@ -16,8 +16,6 @@ from vllm.lora.request import LoRARequest
from vllm.v1.worker.gpu_worker import Worker as V1Worker
from vllm.worker.worker import Worker
NUM_LORAS = 16
@patch.dict(os.environ, {"RANK": "0"})
def test_worker_apply_lora(sql_lora_files):
@ -60,12 +58,12 @@ def test_worker_apply_lora(sql_lora_files):
device_config=DeviceConfig("cuda"),
cache_config=CacheConfig(
block_size=16,
gpu_memory_utilization=1.0,
swap_space=0,
cache_dtype="auto",
),
lora_config=LoRAConfig(max_lora_rank=8,
max_cpu_loras=NUM_LORAS,
max_loras=NUM_LORAS),
lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
max_loras=32),
)
worker = worker_cls(
vllm_config=vllm_config,
@ -80,9 +78,9 @@ def test_worker_apply_lora(sql_lora_files):
set_active_loras(worker, [])
assert worker.list_loras() == set()
n_loras = 32
lora_requests = [
LoRARequest(str(i + 1), i + 1, sql_lora_files)
for i in range(NUM_LORAS)
LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
]
set_active_loras(worker, lora_requests)
@ -91,12 +89,12 @@ def test_worker_apply_lora(sql_lora_files):
for lora_request in lora_requests
}
for i in range(NUM_LORAS):
for i in range(32):
random.seed(i)
iter_lora_requests = random.choices(lora_requests,
k=random.randint(1, NUM_LORAS))
k=random.randint(1, n_loras))
random.shuffle(iter_lora_requests)
iter_lora_requests = iter_lora_requests[:-random.randint(0, NUM_LORAS)]
iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
set_active_loras(worker, lora_requests)
assert worker.list_loras().issuperset(
{lora_request.lora_int_id

View File

@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import unittest.mock as mock
import pytest
@ -16,8 +17,24 @@ from vllm.v1.worker.tpu_model_runner import (
TPUModelRunner, _get_padded_num_reqs_with_upper_limit,
_get_padded_token_len, _get_req_paddings, _get_token_paddings)
# Mock torch_xla module since it may not be available in the test environments
torch_xla_patcher = mock.patch.dict(
"sys.modules", {
"torch_xla": mock.MagicMock(),
"torch_xla.core.xla_model": mock.MagicMock(),
"torch_xla.runtime": mock.MagicMock(),
})
torch_xla_patcher.start()
def get_vllm_config():
# Mock the PallasAttentionBackend
pallas_attention_backend_patcher = mock.patch(
"vllm.v1.worker.tpu_model_runner.PallasAttentionBackend", )
pallas_attention_backend_patcher.start()
@pytest.fixture
def model_runner():
# Patchers have already been started at module level.
scheduler_config = SchedulerConfig(
max_num_seqs=10,
max_num_batched_tokens=512,
@ -43,19 +60,18 @@ def get_vllm_config():
cache_config=cache_config,
scheduler_config=scheduler_config,
)
return vllm_config
def get_model_runner(vllm_config):
device = "xla:0" # Mocking TPU device
return TPUModelRunner(vllm_config, device)
with mock.patch("vllm.v1.worker.tpu_model_runner.torch"), \
mock.patch("vllm.v1.worker.tpu_model_runner.xm"), \
mock.patch("vllm.v1.worker.tpu_model_runner.xr"):
return TPUModelRunner(vllm_config, device)
@pytest.fixture
def model_runner():
# Patchers have already been started at module level.
vllm_config = get_vllm_config()
return get_model_runner(vllm_config)
@pytest.fixture(autouse=True, scope="session")
def cleanup_patches():
yield
torch_xla_patcher.stop()
pallas_attention_backend_patcher.stop()
def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
@ -354,14 +370,12 @@ def test_get_req_paddings():
assert _get_req_paddings(8, 36) == [8, 16, 32, 36]
def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(
model_runner):
@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
error_msg = f"{layer_1} must come before the current layer"
vllm_config = model_runner.vllm_config
with pytest.raises(ValueError, match=error_msg), \
set_current_vllm_config(vllm_config):
with pytest.raises(ValueError, match=error_msg):
fwd_context = {
# initialization below will fail because target layer is invalid;
# the target layer needs to come before layer 1
@ -385,14 +399,13 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(
assert fwd_context is not None
def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(model_runner):
@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
invalid_layer = "model.layers.0.cross_attn.attn"
error_msg = f"{invalid_layer} is not a valid Attention layer in the model"
vllm_config = model_runner.vllm_config
with pytest.raises(ValueError, match=error_msg), \
set_current_vllm_config(vllm_config):
with pytest.raises(ValueError, match=error_msg):
fwd_context = {
layer_0:
Attention(
@ -415,13 +428,12 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(model_runner):
assert fwd_context is not None
def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner):
@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
def test_init_kv_cache_with_kv_sharing_target_same_as_current():
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
error_msg = f"{layer_1} cannot be the same as the current layer"
vllm_config = model_runner.vllm_config
with pytest.raises(ValueError, match=error_msg), \
set_current_vllm_config(vllm_config):
with pytest.raises(ValueError, match=error_msg):
fwd_context = {
# initialization below will fail because target layer is invalid;
# the target layer needs to come before layer 1
@ -445,10 +457,11 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner):
assert fwd_context is not None
def test_init_kv_cache_without_kv_sharing():
@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
def test_init_kv_cache_without_kv_sharing(model_runner):
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
vllm_config = get_vllm_config()
vllm_config = model_runner.vllm_config
with set_current_vllm_config(vllm_config):
fwd_context = {
layer_0:
@ -469,38 +482,33 @@ def test_init_kv_cache_without_kv_sharing():
# suppress var not used error
assert fwd_context is not None
# Set high context length to test max context length estimation
vllm_config.model_config.max_model_len = 1_000_000
vllm_config.model_config.max_model_len = 3_000_000
vllm_ctx = vllm_config.compilation_config.static_forward_context
model_runner = get_model_runner(vllm_config)
kv_cache_spec = model_runner.get_kv_cache_spec()
assert len(kv_cache_spec) == 2
assert len(model_runner.shared_kv_cache_layers) == 0
available_memory = 20 * GiB_bytes
# page size for each layer KV can be calculated as
# 2 (non-MLA) * 8 (num_heads) * 128 (head_dim)
# * 2 (bfloat16, kv_cache dtype) * 128 (block_size) = 512KB
num_expected_blocks = 20480 # 20GB / 512KB / 2 (num layers)
# page size for layer 0's kv_cache_spec is 32KB
num_expected_blocks = 327680 # 20GB / 32KB / 2 (num layers)
kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
available_memory)
assert kv_cache_config.num_blocks == num_expected_blocks
assert len(kv_cache_config.kv_cache_tensors) == 2
assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2
assert kv_cache_config.kv_cache_tensors[1].size == available_memory // 2
assert len(kv_cache_config.tensors) == 2
assert kv_cache_config.tensors[layer_0].size == available_memory // 2
assert kv_cache_config.tensors[layer_1].size == available_memory // 2
max_context_len =\
estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
# max context len with KV sharing should be 2x as large as without
# max_context_len = available_memory / (page_size / block_size) / num_caches
# max_context_len = 5GB / (512KB / 128) / 2 = 655360
assert max_context_len == 655360
assert max_context_len == 1310720
# important: override tensor size to prevent large mem alloc during test
# this will only allocate 2 block worth of memory (2 * 512kb)
# this will only allocate 2 block worth of memory (2 * 32kb)
kv_cache_config.num_blocks = 1
for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
kv_cache_tensor.size = (
kv_cache_spec[kv_cache_tensor.shared_by[0]].page_size_bytes)
for layer in kv_cache_config.tensors:
kv_cache_config.tensors[layer].size =\
kv_cache_spec[layer].page_size_bytes
model_runner.initialize_kv_cache(kv_cache_config)
@ -516,10 +524,11 @@ def test_init_kv_cache_without_kv_sharing():
assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
def test_init_kv_cache_with_kv_sharing_valid():
@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
def test_init_kv_cache_with_kv_sharing_valid(model_runner):
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
vllm_config = get_vllm_config()
vllm_config = model_runner.vllm_config
with set_current_vllm_config(vllm_config):
fwd_context = {
layer_0:
@ -543,34 +552,33 @@ def test_init_kv_cache_with_kv_sharing_valid():
# Set high context length to test max context length estimation
vllm_config.model_config.max_model_len = 3_000_000
vllm_ctx = vllm_config.compilation_config.static_forward_context
model_runner = get_model_runner(vllm_config)
kv_cache_spec = model_runner.get_kv_cache_spec()
assert len(kv_cache_spec) == 1
assert layer_0 in kv_cache_spec
assert model_runner.shared_kv_cache_layers[layer_1] == layer_0
available_memory = 20 * GiB_bytes
# page size for layer 0's kv_cache_spec is 512KB
# page size for layer 0's kv_cache_spec is 32KB
# with KV sharing, we can allocate (available_mem//page_size//1) blocks
# which is twice as many as without KV sharing
num_expected_blocks = 2 * 20480 # 20GB / 512KB
num_expected_blocks = 655360 # 20GB / 32KB
kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
available_memory)
assert kv_cache_config.num_blocks == num_expected_blocks
assert len(kv_cache_config.kv_cache_tensors) == 1
assert len(kv_cache_config.tensors) == 1
# Each layer now has twice the available memory for KV cache
# compared to no KV sharing
assert kv_cache_config.kv_cache_tensors[0].size == available_memory
assert kv_cache_config.tensors[layer_0].size == available_memory
max_context_len =\
estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
# max context len with KV sharing should be 2x as large as without
assert max_context_len == (2 * 655360)
assert max_context_len == 2 * 1310720
# important: override tensor size to prevent large mem alloc during test
# this will only allocate 1 block worth of memory (512kb)
# this will only allocate 1 block worth of memory (32kb)
kv_cache_config.num_blocks = 1
kv_cache_config.kv_cache_tensors[0].size =\
kv_cache_config.tensors[layer_0].size =\
kv_cache_spec[layer_0].page_size_bytes
model_runner.initialize_kv_cache(kv_cache_config)
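The expected block counts in this hunk follow from the page-size arithmetic spelled out in the test comments (2 non-MLA caches x 8 heads x 128 head_dim x 2 bytes for bfloat16 x 128 block_size = 512KB per block, two layers). A standalone check of those figures, using only values quoted in the comments:

# Arithmetic check for the 512KB / 20480-block / 655360-token figures in
# the comments above; every input is copied from those comments, nothing
# here comes from vLLM itself.
GiB = 1 << 30
KiB = 1 << 10

num_kv = 2        # non-MLA: one K and one V cache per layer
num_heads = 8
head_dim = 128
dtype_size = 2    # bfloat16
block_size = 128
num_layers = 2

page_size = num_kv * num_heads * head_dim * dtype_size * block_size
assert page_size == 512 * KiB                                  # 512KB per block

num_expected_blocks = (20 * GiB) // page_size // num_layers
assert num_expected_blocks == 20480                            # 20GB / 512KB / 2

# max_context_len = available_memory / (page_size / block_size) / num_caches
max_context_len = (5 * GiB) // (page_size // block_size) // num_layers
assert max_context_len == 655360                               # 5GB / 4KB / 2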

View File

@ -4,7 +4,6 @@
import random
import pytest
import torch
from vllm.attention import Attention
from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
@ -400,7 +399,6 @@ def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2):
def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
error_msg = f"{layer_1} must come before the current layer"
@ -429,7 +427,6 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
invalid_layer = "model.layers.0.cross_attn.attn"
@ -458,7 +455,6 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
def test_init_kv_cache_with_kv_sharing_target_same_as_current():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
error_msg = f"{layer_1} cannot be the same as the current layer"
@ -487,7 +483,6 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current():
def test_init_kv_cache_without_kv_sharing():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
vllm_config = get_vllm_config()
@ -555,7 +550,6 @@ def test_init_kv_cache_without_kv_sharing():
def test_init_kv_cache_with_kv_sharing_valid():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
vllm_config = get_vllm_config()

View File

@ -4497,13 +4497,13 @@ class VllmConfig:
# warning message here and will log it later.
if not (current_platform.is_cuda() or current_platform.is_rocm()):
# Hybrid KV cache manager is not supported on non-GPU platforms.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
self.disable_hybrid_kv_cache_manager = True
if self.kv_transfer_config is not None:
# Hybrid KV cache manager is not compatible with KV transfer.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
self.disable_hybrid_kv_cache_manager = True
if self.kv_events_config is not None:
# Hybrid KV cache manager is not compatible with KV events.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
self.disable_hybrid_kv_cache_manager = True
def update_sizes_for_sequence_parallelism(self,
possible_sizes: list) -> list:

View File

@ -233,11 +233,16 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
# Defaults for internode and intranode are taken from DeepEP tests.
num_nvl_bytes = 1024 * 1024 * 1024
num_qps_per_rank = num_local_experts
num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
hidden=token_hidden_size,
num_ranks=num_ep_ranks,
num_experts=num_global_experts)
num_rdma_bytes = None
if self.internode:
num_rdma_bytes = 1024 * 1024 * 1024
else:
num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
hidden=token_hidden_size,
num_ranks=num_ep_ranks,
num_experts=num_global_experts)
assert num_rdma_bytes is not None
return dict(group=self.cpu_group,

View File

@ -1337,6 +1337,13 @@ class EngineArgs:
recommend_to_remove=False)
return False
# Only Fp16 and Bf16 dtypes since we only support FA.
V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
if model_config.dtype not in V1_SUPPORTED_DTYPES:
_raise_or_fallback(feature_name=f"--dtype {model_config.dtype}",
recommend_to_remove=False)
return False
# No Embedding Models so far.
if model_config.task not in ["generate"]:
_raise_or_fallback(feature_name=f"--task {model_config.task}",

View File

@ -82,7 +82,7 @@ class StopChecker:
return
# Check if the sequence has reached max_model_len.
if seq.get_len() >= self._get_max_model_len(lora_req):
if seq.get_len() > self._get_max_model_len(lora_req):
seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
return

View File

@ -17,7 +17,6 @@ from typing import Any, Optional
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse
import vllm.envs as envs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.launcher import serve_http
@ -30,6 +29,7 @@ from vllm.version import __version__ as VLLM_VERSION
logger = init_logger("vllm.entrypoints.api_server")
TIMEOUT_KEEP_ALIVE = 5 # seconds.
app = FastAPI()
engine = None
@ -134,7 +134,7 @@ async def run_server(args: Namespace,
host=args.host,
port=args.port,
log_level=args.log_level,
timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
ssl_keyfile=args.ssl_keyfile,
ssl_certfile=args.ssl_certfile,
ssl_ca_certs=args.ssl_ca_certs,

View File

@ -103,6 +103,8 @@ from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path,
from vllm.v1.metrics.prometheus import get_prometheus_registry
from vllm.version import __version__ as VLLM_VERSION
TIMEOUT_KEEP_ALIVE = 5 # seconds
prometheus_multiproc_dir: tempfile.TemporaryDirectory
# Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
@ -1358,7 +1360,7 @@ async def run_server_worker(listen_address,
# NOTE: When the 'disable_uvicorn_access_log' value is True,
# no access log will be output.
access_log=not args.disable_uvicorn_access_log,
timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
ssl_keyfile=args.ssl_keyfile,
ssl_certfile=args.ssl_certfile,
ssl_ca_certs=args.ssl_ca_certs,

View File

@ -71,7 +71,6 @@ if TYPE_CHECKING:
VERBOSE: bool = False
VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
VLLM_RPC_TIMEOUT: int = 10000 # ms
VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds
VLLM_PLUGINS: Optional[list[str]] = None
VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None
VLLM_TORCH_PROFILER_DIR: Optional[str] = None
@ -111,7 +110,6 @@ if TYPE_CHECKING:
VLLM_DP_SIZE: int = 1
VLLM_DP_MASTER_IP: str = ""
VLLM_DP_MASTER_PORT: int = 0
VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
VLLM_V0_USE_OUTLINES_CACHE: bool = False
VLLM_TPU_BUCKET_PADDING_GAP: int = 0
@ -558,10 +556,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_RPC_TIMEOUT":
lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),
# Timeout in seconds for keeping HTTP connections alive in API server
"VLLM_HTTP_TIMEOUT_KEEP_ALIVE":
lambda: int(os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")),
# a list of plugin names to load, separated by commas.
# if this is not set, it means all plugins will be loaded
# if this is set to an empty string, no plugins will be loaded
@ -767,10 +761,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_DP_MASTER_PORT":
lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")),
# Randomize inputs during dummy runs when using Data Parallel
"VLLM_RANDOMIZE_DP_DUMMY_INPUTS":
lambda: os.environ.get("VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0") == "1",
# Whether to use S3 path for model loading in CI via RunAI Streamer
"VLLM_CI_USE_S3":
lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",

View File

@ -80,13 +80,11 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
topk: int,
num_experts: int,
) -> tuple[int, int, torch.dtype]:
block_m = self.block_shape[0]
M_sum = (M * topk) + num_experts * (block_m - 1)
M_sum = round_up(M_sum, block_m)
workspace1 = M_sum * max(N * 2, K)
workspace2 = M_sum * max(N, K)
workspace2 = M_sum * N
return (workspace1, workspace2, a.dtype)
def apply(
@ -137,31 +135,26 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
# Note: M_sum is different than the pre-permuted shape of a1q.
M_sum = a1q.size(0)
mm1_out = _resize_cache(workspace13, (M_sum, N))
act_out = _resize_cache(workspace2, (M_sum, N // 2))
quant_out = _resize_cache(workspace13.view(dtype=torch.float8_e4m3fn),
(M_sum, N // 2))
mm2_out = _resize_cache(workspace2, (M_sum, K))
out = _resize_cache(workspace13, (inv_perm.size(0), K))
workspace1 = _resize_cache(workspace13, (M_sum, N))
workspace2 = _resize_cache(workspace2, (M_sum, N // 2))
workspace3 = _resize_cache(workspace13, (M_sum, K))
dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
(a1q, a1q_scale), (w1, w1_scale), mm1_out, expert_ids)
(a1q, a1q_scale), (w1, w1_scale), workspace1, expert_ids)
self.activation(activation, act_out, mm1_out.view(-1, N))
self.activation(activation, workspace2, workspace1.view(-1, N))
a2q_scale: Optional[torch.Tensor] = None
a2q, a2q_scale = per_token_group_quant_fp8(act_out,
a2q, a2q_scale = per_token_group_quant_fp8(workspace2,
self.block_shape[1],
column_major_scales=True,
out_q=quant_out)
column_major_scales=True)
dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
(a2q, a2q_scale), (w2, w2_scale), mm2_out, expert_ids)
(a2q, a2q_scale), (w2, w2_scale), workspace3, expert_ids)
torch.index_select(mm2_out, 0, inv_perm, out=out)
workspace3 = workspace3[inv_perm, ...]
return out
return workspace3
def deep_gemm_moe_fp8(

View File

@ -5,7 +5,6 @@ import deep_ep
import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.utils import (
moe_kernel_quantize_input)
@ -194,23 +193,20 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
apply_router_weight_on_input: bool,
output_dtype: torch.dtype):
hidden_dim = fused_expert_output.size(-1)
if fused_expert_output.ndim == 2:
hidden_dim = fused_expert_output.size(-1)
fused_expert_output = fused_expert_output.view(
num_tokens, -1, hidden_dim)
if not apply_router_weight_on_input:
# The DeepEP combine kernels don't do the topk weight
# multiplication. We multiply the weights locally.
m_x_topk = fused_expert_output.size(0)
fused_expert_output.mul_(topk_weights.view(m_x_topk, -1, 1))
fused_expert_output = fused_expert_output.to(torch.float32)
fused_expert_output = fused_expert_output * topk_weights.view(
fused_expert_output.size(0), -1, 1)
fused_expert_output = fused_expert_output.to(output_dtype)
out = torch.empty((num_tokens, hidden_dim),
device=fused_expert_output.device,
dtype=output_dtype)
ops.moe_sum(fused_expert_output, out)
return out
return fused_expert_output.sum(dim=1).to(output_dtype)
def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor,
topk_weights: torch.Tensor, topk_ids: torch.Tensor,

View File

@ -18,7 +18,7 @@ def _moe_permute(
expert_map: Optional[torch.Tensor],
block_m: int,
) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
torch.Tensor]:
Optional[torch.Tensor]]:
"""
Determine the sorted_token_ids, expert_ids for the given problem size.
Permute the hidden states and scales according to `sorted_token_ids`.

View File

@ -234,13 +234,8 @@ def _per_token_group_quant_fp8(
row = g_id // groups_per_row
row_g_id = g_id % groups_per_row
# Ensure offset calculations use int64 to prevent overflow
y_ptr_offset = (row.to(tl.int64) * y_row_stride) + (row_g_id.to(tl.int64) *
group_size)
y_ptr += y_ptr_offset
y_q_ptr_offset = g_id.to(tl.int64) * group_size
y_q_ptr += y_q_ptr_offset
y_ptr += (row * y_row_stride) + (row_g_id * group_size)
y_q_ptr += g_id * group_size
y_s_ptr += g_id
cols = tl.arange(0, BLOCK) # N <= BLOCK
@ -287,23 +282,15 @@ def _per_token_group_quant_fp8_colmajor(
row = g_id // groups_per_row
row_g_id = g_id % groups_per_row
# Ensure offset calculations use int64 to prevent overflow
y_ptr_offset = (row.to(tl.int64) * y_row_stride) + (row_g_id.to(tl.int64) *
group_size)
y_ptr += y_ptr_offset
y_q_ptr_offset = g_id.to(tl.int64) * group_size
y_q_ptr += y_q_ptr_offset
y_ptr += (row * y_row_stride) + (row_g_id * group_size)
y_q_ptr += g_id * group_size
# Convert g_id the flattened block coordinate to 2D so we can index
# into the output y_scales matrix
blocks_per_row = y_num_columns // group_size
scale_col = g_id % blocks_per_row
scale_row = g_id // blocks_per_row
# Ensure offset calculation uses int64 for y_s_ptr
y_s_ptr_offset = (scale_col.to(tl.int64) * y_s_col_stride) + scale_row.to(
tl.int64)
y_s_ptr += y_s_ptr_offset
y_s_ptr += scale_col * y_s_col_stride + scale_row
cols = tl.arange(0, BLOCK) # group_size <= BLOCK
mask = cols < group_size
@ -324,7 +311,6 @@ def per_token_group_quant_fp8(
eps: float = 1e-10,
dtype: Optional[torch.dtype] = None,
column_major_scales: bool = False,
out_q: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
"""Function to perform per-token-group quantization on an input tensor `x`.
It converts the tensor values into signed float8 values and returns the
@ -335,8 +321,6 @@ def per_token_group_quant_fp8(
eps: The minimum to avoid dividing zero.
dtype: The dype of output tensor. Note that only `torch.float8_e4m3fn`
is supported for now.
column_major_scales: Outputs scales in column major.
out_q: Optional output tensor. If not provided, function will create.
Returns:
tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
scaling factor for quantization.
@ -351,11 +335,7 @@ def per_token_group_quant_fp8(
fp8_min = finfo.min
fp8_max = finfo.max
assert out_q is None or out_q.shape == x.shape
x_q = out_q
if x_q is None:
x_q = torch.empty_like(x, device=x.device, dtype=dtype)
x_q = torch.empty_like(x, device=x.device, dtype=dtype)
M = x.numel() // group_size
N = group_size
if column_major_scales:
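The "use int64 to prevent overflow" comments refer to the flat pointer offsets derived from the program id: integer arithmetic in Triton is 32-bit by default, so a product like row * y_row_stride can silently wrap once the tensor is large enough. A rough standalone illustration with made-up sizes (hypothetical values, not taken from any real model):

# Illustration only: shows when a 32-bit flat offset would wrap.
INT32_MAX = 2**31 - 1

num_rows = 2_000_000     # e.g. tokens in a very large batch (hypothetical)
row_stride = 7_168       # e.g. hidden size (hypothetical)
worst_offset = (num_rows - 1) * row_stride

print(worst_offset)                  # 14335992832
print(worst_offset > INT32_MAX)      # True -> the offset must be computed in int64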

View File

@ -233,10 +233,6 @@ class CudaPlatformBase(Platform):
logger.info_once("Using Triton backend on V1 engine.")
return ("vllm.v1.attention.backends."
"triton_attn.TritonAttentionBackend")
if dtype not in (torch.float16, torch.bfloat16):
logger.info_once(
f"Using FlexAttenion backend for {dtype} on V1 engine.")
return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501
if cls.is_device_capability(100):
# Prefer FlashInfer for V1 on Blackwell GPUs if installed
try:

View File

@ -40,13 +40,12 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
blocksparse_params: Optional[dict[str, Any]],
logits_soft_cap: Optional[float],
attn_type: str,
kv_sharing_target_layer_name: Optional[str],
# MLA Specific Arguments
**mla_args) -> None:
super().__init__(num_heads, head_size, scale, num_kv_heads,
alibi_slopes, sliding_window, kv_cache_dtype,
blocksparse_params, logits_soft_cap, attn_type,
kv_sharing_target_layer_name, **mla_args)
**mla_args)
unsupported_features = [
alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap

View File

@ -5,7 +5,6 @@ import copy
import gc
import time
import weakref
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any, Optional, Union
import numpy as np
@ -13,7 +12,6 @@ import torch
import torch.distributed
import torch.nn as nn
import vllm.envs as envs
from vllm.attention import AttentionType, get_attn_backend
from vllm.attention.backends.abstract import (AttentionBackend,
AttentionMetadataBuilder)
@ -657,10 +655,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# Fill unused with -1. Needed for reshape_and_cache
self.seq_lens[num_reqs:].fill_(0)
# Note: pad query_start_loc to be non-decreasing, as kernels
# like FlashAttention requires that
self.query_start_loc[num_reqs + 1:].fill_(
self.query_start_loc_cpu[num_reqs].item())
self.query_start_loc[num_reqs + 1:].fill_(-1)
query_start_loc = self.query_start_loc[:num_reqs + 1]
seq_lens = self.seq_lens[:num_reqs]
@ -1729,35 +1724,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
return prompt_logprobs_dict
@contextmanager
def maybe_randomize_inputs(self, input_ids: torch.Tensor):
"""
Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set.
This is to help balance expert-selection
- during profile_run
- during DP rank dummy run
"""
dp_size = self.vllm_config.parallel_config.data_parallel_size
randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1
if not randomize_inputs:
yield
else:
import functools
@functools.cache
def rand_input_ids() -> torch.Tensor:
return torch.randint_like(
self.input_ids,
low=0,
high=self.model_config.get_vocab_size(),
dtype=input_ids.dtype)
logger.debug("Randomizing dummy data for DP Rank")
input_ids.copy_(rand_input_ids()[:input_ids.size(0)],
non_blocking=True)
yield
input_ids.fill_(0)
@torch.inference_mode()
def _dummy_run(
self,
@ -1838,7 +1804,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
intermediate_tensors = self.sync_and_slice_intermediate_tensors(
num_tokens, None, False)
with self.maybe_randomize_inputs(input_ids), set_forward_context(
with set_forward_context(
attn_metadata,
self.vllm_config,
num_tokens=num_tokens,