Compare commits: v0.9.1rc1...mergify/ho
1 commit

| Author | SHA1 | Date |
|---|---|---|
| | ca15f0afe6 | |

.github/mergify.yml (vendored): 357 changes
@@ -1,196 +1,181 @@

```yaml
pull_request_rules:
- name: label-documentation
  description: Automatically apply documentation label
  conditions:
    - or:
      - files~=^[^/]+\.md$
      - files~=^docs/
      - files~=^examples/
  actions:
    label:
      add:
        - documentation

- name: label-ci-build
  description: Automatically apply ci/build label
  conditions:
    - or:
      - files~=^\.github/
      - files~=\.buildkite/
      - files~=^cmake/
      - files=CMakeLists.txt
      - files~=^docker/Dockerfile
      - files~=^requirements.*\.txt
      - files=setup.py
  actions:
    label:
      add:
        - ci/build

- name: label-frontend
  description: Automatically apply frontend label
  conditions:
    - files~=^vllm/entrypoints/
  actions:
    label:
      add:
        - frontend

- name: label-llama
  description: Automatically apply llama label
  conditions:
    - or:
      - files~=^examples/.*llama.*\.py
      - files~=^tests/.*llama.*\.py
      - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
      - files~=^vllm/model_executor/models/.*llama.*\.py
      - files~=^vllm/transformers_utils/configs/.*llama.*\.py
  actions:
    label:
      add:
        - llama

- name: label-multi-modality
  description: Automatically apply multi-modality label
  conditions:
    - or:
      - files~=^vllm/multimodal/
      - files~=^tests/multimodal/
      - files~=^tests/models/multimodal/
      - files~=^tests/models/*/audio_language/
      - files~=^tests/models/*/vision_language/
      - files=tests/models/test_vision.py
  actions:
    label:
      add:
        - multi-modality

- name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
    - or:
      - files~=^benchmarks/structured_schemas/
      - files=benchmarks/benchmark_serving_structured_output.py
      - files=benchmarks/run_structured_output_benchmark.sh
      - files=docs/features/structured_outputs.md
      - files=examples/offline_inference/structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
      - files~=^vllm/model_executor/guided_decoding/
      - files=tests/model_executor/test_guided_processors.py
      - files=tests/entrypoints/llm/test_guided_generate.py
      - files~=^tests/v1/structured_output/
      - files=tests/v1/entrypoints/llm/test_guided_generate.py
      - files~=^vllm/v1/structured_output/
  actions:
    label:
      add:
        - structured-output

- name: label-speculative-decoding
  description: Automatically apply speculative-decoding label
  conditions:
    - or:
      - files~=^vllm/spec_decode/
      - files=vllm/model_executor/layers/spec_decode_base_sampler.py
      - files~=^tests/spec_decode/
  actions:
    label:
      add:
        - speculative-decoding

- name: label-v1
  description: Automatically apply v1 label
  conditions:
    - or:
      - files~=^vllm/v1/
      - files~=^tests/v1/
  actions:
    label:
      add:
        - v1

- name: label-tpu
  description: Automatically apply tpu label
  # Keep this list in sync with `label-tpu-remove` conditions
  conditions:
    - or:
      - files~=tpu.py
      - files~=_tpu
      - files~=tpu_
      - files~=/tpu/
      - files~=pallas
  actions:
    label:
      add:
        - tpu

- name: label-tpu-remove
  description: Automatically remove tpu label
  # Keep this list in sync with `label-tpu` conditions
  conditions:
    - and:
      - -files~=tpu.py
      - -files~=_tpu
      - -files~=tpu_
      - -files~=/tpu/
      - -files~=pallas
  actions:
    label:
      remove:
        - tpu

- name: label-tool-calling
  description: Automatically add tool-calling label
  conditions:
    - or:
      - files~=^tests/tool_use/
      - files~=^tests/mistral_tool_use/
      - files~=^tests/entrypoints/openai/tool_parsers/
      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
      - files~=^vllm/entrypoints/openai/tool_parsers/
      - files=docs/features/tool_calling.md
      - files~=^examples/tool_chat_*
      - files=examples/offline_inference/chat_with_tools.py
      - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
      - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
      - files=examples/online_serving/openai_chat_completion_client_with_tools.py
  actions:
    label:
      add:
        - tool-calling

- name: ping author on conflicts and add 'needs-rebase' label
  conditions:
    - conflict
    - -closed
  actions:
    label:
      add:
        - needs-rebase
    comment:
      message: |
        This pull request has merge conflicts that must be resolved before it can be
        merged. Please rebase the PR, @{{author}}.

        https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork

- name: assign reviewer for tensorizer changes
  conditions:
    - files~=^vllm/model_executor/model_loader/tensorizer.py
    - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
    - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
    - files~=^tests/tensorizer_loader/
  actions:
    assign:
      users:
        - "sangstar"

- name: remove 'needs-rebase' label when conflict is resolved
  conditions:
    - -conflict
    - -closed
  actions:
    label:
      remove:
        - needs-rebase
```
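A note on the rule syntax above: `files~=` is a regular-expression match against each file path the pull request touches, while `files=` is an exact path match. A quick way to sanity-check a pattern locally is plain Python `re` (a sketch; Mergify's own matching engine may differ in details):

```python
import re

# Patterns copied from the label-documentation rule above.
DOC_PATTERNS = [r"^[^/]+\.md$", r"^docs/", r"^examples/"]

def matches_documentation_rule(changed_files: list[str]) -> bool:
    # The rule's `or:` block fires if any changed file matches any pattern.
    return any(
        re.search(pattern, path)
        for pattern in DOC_PATTERNS
        for path in changed_files
    )

print(matches_documentation_rule(["README.md"]))       # True
print(matches_documentation_rule(["docs/index.md"]))   # True
print(matches_documentation_rule(["vllm/config.py"]))  # False
```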
```diff
@@ -308,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)

   #
@@ -454,7 +454,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     # kernels for the remaining archs that are not already built for 3x.
     # (Build 8.9 for FP8)
     cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-      "7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
+      "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
     # subtract out the archs that are already built for 3x
     list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
     if (SCALED_MM_2X_ARCHS)
@@ -684,7 +684,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

   list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)

   #
```
```diff
@@ -144,7 +144,7 @@ As a result, we will have the following components when the KV cache manager is

 **Running request:** Workflow for the scheduler to schedule a running request with KV cache block allocation:

-1. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps:
+1. The scheduler calls `kv_cache_manager.append_slots()`. It does the following steps:
     1. Compute the number of new required blocks, and return if there are no sufficient blocks to allocate.
     2. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on.
     3. Append token IDs to the slots in existing blocks as well as the new blocks. If a block is full, we add it to the Cache Block to cache it.
```
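The three numbered steps in that doc passage read directly as code. A minimal sketch of the flow (hypothetical names and structure, not vLLM's actual `KVCacheManager`):

```python
from collections import deque
from dataclasses import dataclass, field

@dataclass
class Request:
    num_tokens: int = 0
    blocks: list = field(default_factory=list)

class ToyKVCacheManager:
    def __init__(self, num_blocks: int, block_size: int):
        self.block_size = block_size
        self.free_queue = deque(range(num_blocks))  # free block IDs, LRU at head
        self.cached_blocks = {}                     # block_id -> prefix hash

    def allocate_slots(self, req: Request, num_new_tokens: int):
        # Step 1: compute the number of new required blocks; return None
        # (i.e. schedule nothing) if there are not enough free blocks.
        total = req.num_tokens + num_new_tokens
        needed = -(-total // self.block_size) - len(req.blocks)  # ceil division
        if needed > len(self.free_queue):
            return None
        # Step 2: pop block IDs off the head of the free queue; popping a
        # still-cached block "evicts" it so no other request can reuse it.
        new_blocks = [self.free_queue.popleft() for _ in range(needed)]
        for block_id in new_blocks:
            self.cached_blocks.pop(block_id, None)
        # Step 3: append token IDs into existing and new blocks (full
        # blocks would then be hashed and cached for prefix reuse).
        req.blocks.extend(new_blocks)
        req.num_tokens = total
        return new_blocks

mgr = ToyKVCacheManager(num_blocks=8, block_size=16)
req = Request()
print(mgr.allocate_slots(req, 40))  # needs ceil(40/16) = 3 blocks -> [0, 1, 2]
```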
```diff
@@ -31,7 +31,6 @@ refer to the [PyTorch Security
 Guide](https://github.com/pytorch/pytorch/security/policy#using-distributed-features).

 Key points from the PyTorch security guide:

 - PyTorch Distributed features are intended for internal communication only
 - They are not built for use in untrusted environments or networks
 - No authorization protocol is included for performance reasons
```
```diff
@@ -37,7 +37,7 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.10.1 # required for compressed-tensors
+compressed-tensors == 0.10.0 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
```
```diff
@@ -8,7 +8,6 @@ import uvicorn
 from fastapi.responses import JSONResponse, Response

 import vllm.entrypoints.api_server
-import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.utils import FlexibleArgumentParser
@@ -47,8 +46,9 @@ if __name__ == "__main__":
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
     vllm.entrypoints.api_server.engine = engine
-    uvicorn.run(app,
-                host=args.host,
-                port=args.port,
-                log_level="debug",
-                timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE)
+    uvicorn.run(
+        app,
+        host=args.host,
+        port=args.port,
+        log_level="debug",
+        timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE)
```
```diff
@@ -25,12 +25,6 @@ TOKEN_IDS = [
 ]


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    """We can run both engines for this test."""
-    pass
-
-
 @pytest.fixture(scope="module")
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
@@ -110,19 +104,3 @@ def test_multiple_sampling_params(llm: LLM):
     # sampling_params is None, default params should be applied
     outputs = llm.generate(PROMPTS, sampling_params=None)
     assert len(PROMPTS) == len(outputs)
-
-
-def test_max_model_len():
-    max_model_len = 20
-    llm = LLM(
-        model=MODEL_NAME,
-        max_model_len=max_model_len,
-        gpu_memory_utilization=0.10,
-        enforce_eager=True,  # reduce test time
-    )
-    sampling_params = SamplingParams(max_tokens=max_model_len + 10)
-    outputs = llm.generate(PROMPTS, sampling_params)
-    for output in outputs:
-        num_total_tokens = len(output.prompt_token_ids) + len(
-            output.outputs[0].token_ids)
-        assert num_total_tokens == max_model_len
```
```diff
@@ -183,34 +183,6 @@ def test_env(
             assert backend.get_name() == expected


-@pytest.mark.parametrize("device", ["cpu", "cuda"])
-@pytest.mark.parametrize("use_v1", [True, False])
-def test_fp32_fallback(
-    device: str,
-    use_v1: bool,
-    monkeypatch: pytest.MonkeyPatch,
-):
-    """Test attention backend selection with fp32."""
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
-
-        if device == "cpu":
-            with patch("vllm.attention.selector.current_platform",
-                       CpuPlatform()):
-                backend = get_attn_backend(16, torch.float32, torch.float32,
-                                           16, False)
-            assert (backend.get_name() == "TORCH_SDPA_VLLM_V1"
-                    if use_v1 else "TORCH_SDPA")
-
-        elif device == "cuda":
-            with patch("vllm.attention.selector.current_platform",
-                       CudaPlatform()):
-                backend = get_attn_backend(16, torch.float32, torch.float32,
-                                           16, False)
-            assert (backend.get_name() == "FLEX_ATTENTION"
-                    if use_v1 else "XFORMERS")
-
-
 def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
     """Test FlashAttn validation."""
     # TODO: When testing for v1, pipe in `use_v1` as an argument to
```
```diff
@@ -274,7 +274,7 @@ def pplx_prepare_finalize(pgi: ProcessGroupInfo, dp_size: int, a: torch.Tensor,
     chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
     chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)

-    b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare(
+    b_a, b_a_scale, expert_num_tokens = prepare_finalize.prepare(
         a_chunk,
         None,
         None,
```
```diff
@@ -164,6 +164,11 @@ def mixtral_lora_files():
     return snapshot_download(repo_id="SangBinCho/mixtral-lora")


+@pytest.fixture(scope="session")
+def gemma_lora_files():
+    return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
+
+
 @pytest.fixture(scope="session")
 def chatglm3_lora_files():
     return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider")
```
```diff
@@ -4,6 +4,9 @@ import subprocess
 import sys
 from typing import Union

+import pytest
+import ray
+
 import vllm
 from vllm import LLM
 from vllm.lora.request import LoRARequest
@@ -118,6 +121,37 @@ def test_llama_lora(sql_lora_files):
     generate_and_test(llm, sql_lora_files)


+# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
+# used by the engine yet.
+@pytest.mark.skip_v1
+@create_new_process_for_each_test()
+def test_llama_lora_warmup(sql_lora_files):
+    """Test that the LLM initialization works with a warmup LORA path and
+    is more conservative"""
+
+    @ray.remote(num_gpus=1)
+    def get_num_gpu_blocks_lora():
+        llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16)
+        num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks
+        return num_gpu_blocks_lora_warmup
+
+    @ray.remote(num_gpus=1)
+    def get_num_gpu_blocks_no_lora():
+        llm = vllm.LLM(MODEL_PATH, max_num_seqs=16)
+        num_gpu_blocks_no_lora_warmup = (
+            llm.llm_engine.cache_config.num_gpu_blocks)
+        return num_gpu_blocks_no_lora_warmup
+
+    num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote())
+    num_gpu_blocks_no_lora_warmup = ray.get(
+        get_num_gpu_blocks_no_lora.remote())
+    assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, (
+        "The warmup with lora should be more "
+        "conservative than without lora, therefore the number of "
+        "memory blocks for the KV cache should be "
+        "less when using lora than when not using lora")
+
+
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_llama_lora_tp4(sql_lora_files):
```
```diff
@@ -15,6 +15,13 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf"
 LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
 LORA_RANK = 8

+# @pytest.fixture(autouse=True)
+# def v1(run_with_both_engines_lora):
+#     # Simple autouse wrapper to run both engines for each test
+#     # This can be promoted up to conftest.py to run for every
+#     # test in a package
+#     pass
+

 def make_lora_request(lora_id: int):
     return LoRARequest(lora_name=f"{lora_id}",
```
```diff
@@ -11,6 +11,14 @@ MODEL_PATH = "microsoft/phi-2"
 PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:"  # noqa: E501


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(
@@ -51,7 +59,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:

 # Skipping for V1 for now as we are hitting,
 # "Head size 80 is not supported by FlashAttention." error.
-@pytest.mark.skip(reason="Head size 80 is not supported by FlashAttention")
+@pytest.mark.skip_v1
 def test_phi2_lora(phi2_lora_files):
     # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
     # Otherwise, the lora-test will fail due to CUDA OOM.
```
```diff
@@ -16,8 +16,6 @@ from vllm.lora.request import LoRARequest
 from vllm.v1.worker.gpu_worker import Worker as V1Worker
 from vllm.worker.worker import Worker

-NUM_LORAS = 16
-

 @patch.dict(os.environ, {"RANK": "0"})
 def test_worker_apply_lora(sql_lora_files):
@@ -60,12 +58,12 @@ def test_worker_apply_lora(sql_lora_files):
         device_config=DeviceConfig("cuda"),
         cache_config=CacheConfig(
             block_size=16,
             gpu_memory_utilization=1.0,
             swap_space=0,
             cache_dtype="auto",
         ),
-        lora_config=LoRAConfig(max_lora_rank=8,
-                               max_cpu_loras=NUM_LORAS,
-                               max_loras=NUM_LORAS),
+        lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
+                               max_loras=32),
     )
     worker = worker_cls(
         vllm_config=vllm_config,
@@ -80,9 +78,9 @@ def test_worker_apply_lora(sql_lora_files):
     set_active_loras(worker, [])
     assert worker.list_loras() == set()

+    n_loras = 32
     lora_requests = [
-        LoRARequest(str(i + 1), i + 1, sql_lora_files)
-        for i in range(NUM_LORAS)
+        LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
     ]

     set_active_loras(worker, lora_requests)
@@ -91,12 +89,12 @@ def test_worker_apply_lora(sql_lora_files):
         for lora_request in lora_requests
     }

-    for i in range(NUM_LORAS):
+    for i in range(32):
         random.seed(i)
         iter_lora_requests = random.choices(lora_requests,
-                                            k=random.randint(1, NUM_LORAS))
+                                            k=random.randint(1, n_loras))
         random.shuffle(iter_lora_requests)
-        iter_lora_requests = iter_lora_requests[:-random.randint(0, NUM_LORAS)]
+        iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
         set_active_loras(worker, lora_requests)
         assert worker.list_loras().issuperset(
             {lora_request.lora_int_id
```
```diff
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import unittest.mock as mock

 import pytest
@@ -16,8 +17,24 @@ from vllm.v1.worker.tpu_model_runner import (
     TPUModelRunner, _get_padded_num_reqs_with_upper_limit,
     _get_padded_token_len, _get_req_paddings, _get_token_paddings)

+# Mock torch_xla module since it may not be available in the test environments
+torch_xla_patcher = mock.patch.dict(
+    "sys.modules", {
+        "torch_xla": mock.MagicMock(),
+        "torch_xla.core.xla_model": mock.MagicMock(),
+        "torch_xla.runtime": mock.MagicMock(),
+    })
+torch_xla_patcher.start()

-def get_vllm_config():
+# Mock the PallasAttentionBackend
+pallas_attention_backend_patcher = mock.patch(
+    "vllm.v1.worker.tpu_model_runner.PallasAttentionBackend", )
+pallas_attention_backend_patcher.start()
+
+
+@pytest.fixture
+def model_runner():
+    # Patchers have already been started at module level.
     scheduler_config = SchedulerConfig(
         max_num_seqs=10,
         max_num_batched_tokens=512,
@@ -43,19 +60,18 @@ def get_vllm_config():
         cache_config=cache_config,
         scheduler_config=scheduler_config,
     )
-    return vllm_config
-
-
-def get_model_runner(vllm_config):
-    device = "xla:0"  # Mocking TPU device
-    with mock.patch("vllm.v1.worker.tpu_model_runner.torch"), \
-            mock.patch("vllm.v1.worker.tpu_model_runner.xm"), \
-            mock.patch("vllm.v1.worker.tpu_model_runner.xr"):
-        return TPUModelRunner(vllm_config, device)
-
-
-@pytest.fixture
-def model_runner():
-    vllm_config = get_vllm_config()
-    return get_model_runner(vllm_config)
+    device = "xla:0"  # Mocking TPU device
+    return TPUModelRunner(vllm_config, device)
+
+
+@pytest.fixture(autouse=True, scope="session")
+def cleanup_patches():
+    yield
+    torch_xla_patcher.stop()
+    pallas_attention_backend_patcher.stop()


 def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
@@ -354,14 +370,12 @@ def test_get_req_paddings():
     assert _get_req_paddings(8, 36) == [8, 16, 32, 36]


-def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(
-        model_runner):
+@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
+def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     error_msg = f"{layer_1} must come before the current layer"
-    vllm_config = model_runner.vllm_config
-    with pytest.raises(ValueError, match=error_msg), \
-            set_current_vllm_config(vllm_config):
+    with pytest.raises(ValueError, match=error_msg):
         fwd_context = {
             # initialization below will fail because target layer is invalid;
             # the target layer needs to come before layer 1
@@ -385,14 +399,13 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(
     assert fwd_context is not None


-def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(model_runner):
+@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
+def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     invalid_layer = "model.layers.0.cross_attn.attn"
     error_msg = f"{invalid_layer} is not a valid Attention layer in the model"
-    vllm_config = model_runner.vllm_config
-    with pytest.raises(ValueError, match=error_msg), \
-            set_current_vllm_config(vllm_config):
+    with pytest.raises(ValueError, match=error_msg):
         fwd_context = {
             layer_0:
             Attention(
@@ -415,13 +428,12 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(model_runner):
     assert fwd_context is not None


-def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner):
+@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
+def test_init_kv_cache_with_kv_sharing_target_same_as_current():
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     error_msg = f"{layer_1} cannot be the same as the current layer"
-    vllm_config = model_runner.vllm_config
-    with pytest.raises(ValueError, match=error_msg), \
-            set_current_vllm_config(vllm_config):
+    with pytest.raises(ValueError, match=error_msg):
         fwd_context = {
             # initialization below will fail because target layer is invalid;
             # the target layer needs to come before layer 1
@@ -445,10 +457,11 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner):
     assert fwd_context is not None


-def test_init_kv_cache_without_kv_sharing():
+@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
+def test_init_kv_cache_without_kv_sharing(model_runner):
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
-    vllm_config = get_vllm_config()
+    vllm_config = model_runner.vllm_config
     with set_current_vllm_config(vllm_config):
         fwd_context = {
             layer_0:
@@ -469,38 +482,33 @@ def test_init_kv_cache_without_kv_sharing():
         # suppress var not used error
         assert fwd_context is not None
     # Set high context length to test max context length estimation
-    vllm_config.model_config.max_model_len = 1_000_000
+    vllm_config.model_config.max_model_len = 3_000_000
     vllm_ctx = vllm_config.compilation_config.static_forward_context
-    model_runner = get_model_runner(vllm_config)
     kv_cache_spec = model_runner.get_kv_cache_spec()
     assert len(kv_cache_spec) == 2
     assert len(model_runner.shared_kv_cache_layers) == 0

     available_memory = 20 * GiB_bytes
-    # page size for each layer KV can be calculated as
-    # 2 (non-MLA) * 8 (num_heads) * 128 (head_dim)
-    # * 2 (bfloat16, kv_cache dtype) * 128 (block_size) = 512KB
-    num_expected_blocks = 20480  # 20GB / 512KB / 2 (num layers)
+    # page size for layer 0's kv_cache_spec is 32KB
+    num_expected_blocks = 327680  # 20GB / 32KB / 2 (num layers)
     kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
                                           available_memory)
     assert kv_cache_config.num_blocks == num_expected_blocks
-    assert len(kv_cache_config.kv_cache_tensors) == 2
-    assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2
-    assert kv_cache_config.kv_cache_tensors[1].size == available_memory // 2
+    assert len(kv_cache_config.tensors) == 2
+    assert kv_cache_config.tensors[layer_0].size == available_memory // 2
+    assert kv_cache_config.tensors[layer_1].size == available_memory // 2

     max_context_len =\
         estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
     # max context len with KV sharing should be 2x as large as without
     # max_context_len = available_memory / (page_size / block_size) / num_caches
-    # max_context_len = 5GB / (512KB / 128) / 2 = 655360
-    assert max_context_len == 655360
+    assert max_context_len == 1310720

     # important: override tensor size to prevent large mem alloc during test
-    # this will only allocate 2 block worth of memory (2 * 512kb)
+    # this will only allocate 2 block worth of memory (2 * 32kb)
     kv_cache_config.num_blocks = 1
-    for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
-        kv_cache_tensor.size = (
-            kv_cache_spec[kv_cache_tensor.shared_by[0]].page_size_bytes)
+    for layer in kv_cache_config.tensors:
+        kv_cache_config.tensors[layer].size =\
+            kv_cache_spec[layer].page_size_bytes

     model_runner.initialize_kv_cache(kv_cache_config)
@@ -516,10 +524,11 @@ def test_init_kv_cache_without_kv_sharing():
     assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1


-def test_init_kv_cache_with_kv_sharing_valid():
+@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
+def test_init_kv_cache_with_kv_sharing_valid(model_runner):
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
-    vllm_config = get_vllm_config()
+    vllm_config = model_runner.vllm_config
     with set_current_vllm_config(vllm_config):
         fwd_context = {
             layer_0:
@@ -543,34 +552,33 @@ def test_init_kv_cache_with_kv_sharing_valid():
     # Set high context length to test max context length estimation
     vllm_config.model_config.max_model_len = 3_000_000
     vllm_ctx = vllm_config.compilation_config.static_forward_context
-    model_runner = get_model_runner(vllm_config)
     kv_cache_spec = model_runner.get_kv_cache_spec()
     assert len(kv_cache_spec) == 1
     assert layer_0 in kv_cache_spec
     assert model_runner.shared_kv_cache_layers[layer_1] == layer_0

     available_memory = 20 * GiB_bytes
-    # page size for layer 0's kv_cache_spec is 512KB
+    # page size for layer 0's kv_cache_spec is 32KB
     # with KV sharing, we can allocate (available_mem//page_size//1) blocks
     # which is twice as many as without KV sharing
-    num_expected_blocks = 2 * 20480  # 20GB / 512KB
+    num_expected_blocks = 655360  # 20GB / 32KB
     kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
                                           available_memory)
     assert kv_cache_config.num_blocks == num_expected_blocks
-    assert len(kv_cache_config.kv_cache_tensors) == 1
+    assert len(kv_cache_config.tensors) == 1
     # Each layer now has twice the available memory for KV cache
     # compared to no KV sharing
-    assert kv_cache_config.kv_cache_tensors[0].size == available_memory
+    assert kv_cache_config.tensors[layer_0].size == available_memory

     max_context_len =\
         estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
     # max context len with KV sharing should be 2x as large as without
-    assert max_context_len == (2 * 655360)
+    assert max_context_len == 2 * 1310720

     # important: override tensor size to prevent large mem alloc during test
-    # this will only allocate 1 block worth of memory (512kb)
+    # this will only allocate 1 block worth of memory (32kb)
     kv_cache_config.num_blocks = 1
-    kv_cache_config.kv_cache_tensors[0].size =\
+    kv_cache_config.tensors[layer_0].size =\
         kv_cache_spec[layer_0].page_size_bytes

     model_runner.initialize_kv_cache(kv_cache_config)
```
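The expected-block constants asserted in these tests follow directly from the stated page sizes. A worked check of the 512 KB-page numbers, using the values given in the test comments above:

```python
GiB = 1 << 30
available_memory = 20 * GiB
# 2 (K and V) * 8 heads * 128 head_dim * 2 bytes (bf16) * 128 block_size = 512 KB
page_size = 512 * 1024
num_layers = 2

# Without KV sharing, the two layers split the memory budget evenly:
print(available_memory // page_size // num_layers)  # 20480 blocks

# With KV sharing, layer 1 reuses layer 0's cache, so a single layer
# gets the whole budget and the block count doubles:
print(available_memory // page_size)                # 40960 == 2 * 20480
```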
```diff
@@ -4,7 +4,6 @@
 import random

 import pytest
-import torch

 from vllm.attention import Attention
 from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
@@ -400,7 +399,6 @@ def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2):


 def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
-    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     error_msg = f"{layer_1} must come before the current layer"
@@ -429,7 +427,6 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():


 def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
-    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     invalid_layer = "model.layers.0.cross_attn.attn"
@@ -458,7 +455,6 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():


 def test_init_kv_cache_with_kv_sharing_target_same_as_current():
-    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     error_msg = f"{layer_1} cannot be the same as the current layer"
@@ -487,7 +483,6 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current():


 def test_init_kv_cache_without_kv_sharing():
-    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     vllm_config = get_vllm_config()
@@ -555,7 +550,6 @@ def test_init_kv_cache_without_kv_sharing():


 def test_init_kv_cache_with_kv_sharing_valid():
-    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     vllm_config = get_vllm_config()
```
```diff
@@ -4497,13 +4497,13 @@ class VllmConfig:
             # warning message here and will log it later.
             if not (current_platform.is_cuda() or current_platform.is_rocm()):
                 # Hybrid KV cache manager is not supported on non-GPU platforms.
-                self.scheduler_config.disable_hybrid_kv_cache_manager = True
+                self.disable_hybrid_kv_cache_manager = True
             if self.kv_transfer_config is not None:
                 # Hybrid KV cache manager is not compatible with KV transfer.
-                self.scheduler_config.disable_hybrid_kv_cache_manager = True
+                self.disable_hybrid_kv_cache_manager = True
             if self.kv_events_config is not None:
                 # Hybrid KV cache manager is not compatible with KV events.
-                self.scheduler_config.disable_hybrid_kv_cache_manager = True
+                self.disable_hybrid_kv_cache_manager = True

     def update_sizes_for_sequence_parallelism(self,
                                               possible_sizes: list) -> list:
```
```diff
@@ -233,11 +233,16 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
         # Defaults for internode and intranode are taken from DeepEP tests.
         num_nvl_bytes = 1024 * 1024 * 1024
         num_qps_per_rank = num_local_experts
-        num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
-            num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
-            hidden=token_hidden_size,
-            num_ranks=num_ep_ranks,
-            num_experts=num_global_experts)
+        num_rdma_bytes = None
+
+        if self.internode:
+            num_rdma_bytes = 1024 * 1024 * 1024
+        else:
+            num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
+                num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
+                hidden=token_hidden_size,
+                num_ranks=num_ep_ranks,
+                num_experts=num_global_experts)
+
+        assert num_rdma_bytes is not None
         return dict(group=self.cpu_group,
```
```diff
@@ -1337,6 +1337,13 @@ class EngineArgs:
                                recommend_to_remove=False)
             return False

+        # Only Fp16 and Bf16 dtypes since we only support FA.
+        V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
+        if model_config.dtype not in V1_SUPPORTED_DTYPES:
+            _raise_or_fallback(feature_name=f"--dtype {model_config.dtype}",
+                               recommend_to_remove=False)
+            return False
+
         # No Embedding Models so far.
         if model_config.task not in ["generate"]:
             _raise_or_fallback(feature_name=f"--task {model_config.task}",
```
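The guard added above is a plain dtype allowlist in front of the V1 engine; in isolation the logic is just the following (a generic sketch, not vLLM's `_raise_or_fallback` machinery):

```python
import torch

V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]

def v1_supports_dtype(dtype: torch.dtype) -> bool:
    # FlashAttention only ships fp16/bf16 kernels, so other dtypes
    # (fp32, fp8, ...) must fall back to the V0 engine instead.
    return dtype in V1_SUPPORTED_DTYPES

print(v1_supports_dtype(torch.float16))  # True
print(v1_supports_dtype(torch.float32))  # False
```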
```diff
@@ -82,7 +82,7 @@ class StopChecker:
             return

         # Check if the sequence has reached max_model_len.
-        if seq.get_len() >= self._get_max_model_len(lora_req):
+        if seq.get_len() > self._get_max_model_len(lora_req):
             seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
             return
```
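The operator choice here is an off-by-one boundary: `>=` caps a sequence the moment it reaches `max_model_len`, while `>` lets it grow one token past the limit before stopping. A toy illustration (hypothetical values, not vLLM code):

```python
def finished_length_capped(seq_len: int, max_model_len: int,
                           inclusive: bool) -> bool:
    # inclusive=True mirrors the `>=` check; False mirrors `>`.
    return seq_len >= max_model_len if inclusive else seq_len > max_model_len

MAX_MODEL_LEN = 20
print(finished_length_capped(20, MAX_MODEL_LEN, inclusive=True))   # True: capped at the limit
print(finished_length_capped(20, MAX_MODEL_LEN, inclusive=False))  # False: one extra step slips through
```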
```diff
@@ -17,7 +17,6 @@ from typing import Any, Optional
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, Response, StreamingResponse

-import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.launcher import serve_http
@@ -30,6 +29,7 @@ from vllm.version import __version__ as VLLM_VERSION

 logger = init_logger("vllm.entrypoints.api_server")

+TIMEOUT_KEEP_ALIVE = 5  # seconds.
 app = FastAPI()
 engine = None

@@ -134,7 +134,7 @@ async def run_server(args: Namespace,
         host=args.host,
         port=args.port,
         log_level=args.log_level,
-        timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
+        timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
         ssl_keyfile=args.ssl_keyfile,
         ssl_certfile=args.ssl_certfile,
         ssl_ca_certs=args.ssl_ca_certs,
@@ -103,6 +103,8 @@ from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path,
 from vllm.v1.metrics.prometheus import get_prometheus_registry
 from vllm.version import __version__ as VLLM_VERSION

+TIMEOUT_KEEP_ALIVE = 5  # seconds
+
 prometheus_multiproc_dir: tempfile.TemporaryDirectory

 # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
@@ -1358,7 +1360,7 @@ async def run_server_worker(listen_address,
         # NOTE: When the 'disable_uvicorn_access_log' value is True,
         # no access log will be output.
         access_log=not args.disable_uvicorn_access_log,
-        timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
+        timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
         ssl_keyfile=args.ssl_keyfile,
         ssl_certfile=args.ssl_certfile,
         ssl_ca_certs=args.ssl_ca_certs,
```
vllm/envs.py: 10 changes

```diff
@@ -71,7 +71,6 @@ if TYPE_CHECKING:
     VERBOSE: bool = False
     VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
     VLLM_RPC_TIMEOUT: int = 10000  # ms
-    VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5  # seconds
     VLLM_PLUGINS: Optional[list[str]] = None
     VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None
     VLLM_TORCH_PROFILER_DIR: Optional[str] = None
@@ -111,7 +110,6 @@ if TYPE_CHECKING:
     VLLM_DP_SIZE: int = 1
     VLLM_DP_MASTER_IP: str = ""
     VLLM_DP_MASTER_PORT: int = 0
-    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
     VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
     VLLM_V0_USE_OUTLINES_CACHE: bool = False
     VLLM_TPU_BUCKET_PADDING_GAP: int = 0
@@ -558,10 +556,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_RPC_TIMEOUT":
     lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),

-    # Timeout in seconds for keeping HTTP connections alive in API server
-    "VLLM_HTTP_TIMEOUT_KEEP_ALIVE":
-    lambda: int(os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")),
-
     # a list of plugin names to load, separated by commas.
     # if this is not set, it means all plugins will be loaded
     # if this is set to an empty string, no plugins will be loaded
@@ -767,10 +761,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_DP_MASTER_PORT":
     lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")),

-    # Randomize inputs during dummy runs when using Data Parallel
-    "VLLM_RANDOMIZE_DP_DUMMY_INPUTS":
-    lambda: os.environ.get("VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0") == "1",
-
     # Whether to use S3 path for model loading in CI via RunAI Streamer
     "VLLM_CI_USE_S3":
     lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",
```

```diff
@@ -80,13 +80,11 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk: int,
         num_experts: int,
     ) -> tuple[int, int, torch.dtype]:
-
         block_m = self.block_shape[0]
         M_sum = (M * topk) + num_experts * (block_m - 1)
         M_sum = round_up(M_sum, block_m)
         workspace1 = M_sum * max(N * 2, K)
-        workspace2 = M_sum * max(N, K)
-
+        workspace2 = M_sum * N
         return (workspace1, workspace2, a.dtype)

     def apply(
@@ -137,31 +135,26 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):

         # Note: M_sum is different than the pre-permuted shape of a1q.
         M_sum = a1q.size(0)

-        mm1_out = _resize_cache(workspace13, (M_sum, N))
-        act_out = _resize_cache(workspace2, (M_sum, N // 2))
-        quant_out = _resize_cache(workspace13.view(dtype=torch.float8_e4m3fn),
-                                  (M_sum, N // 2))
-        mm2_out = _resize_cache(workspace2, (M_sum, K))
-        out = _resize_cache(workspace13, (inv_perm.size(0), K))
+        workspace1 = _resize_cache(workspace13, (M_sum, N))
+        workspace2 = _resize_cache(workspace2, (M_sum, N // 2))
+        workspace3 = _resize_cache(workspace13, (M_sum, K))

         dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
-            (a1q, a1q_scale), (w1, w1_scale), mm1_out, expert_ids)
+            (a1q, a1q_scale), (w1, w1_scale), workspace1, expert_ids)

-        self.activation(activation, act_out, mm1_out.view(-1, N))
+        self.activation(activation, workspace2, workspace1.view(-1, N))

         a2q_scale: Optional[torch.Tensor] = None
-        a2q, a2q_scale = per_token_group_quant_fp8(act_out,
+        a2q, a2q_scale = per_token_group_quant_fp8(workspace2,
                                                    self.block_shape[1],
-                                                   column_major_scales=True,
-                                                   out_q=quant_out)
+                                                   column_major_scales=True)

         dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
-            (a2q, a2q_scale), (w2, w2_scale), mm2_out, expert_ids)
+            (a2q, a2q_scale), (w2, w2_scale), workspace3, expert_ids)

-        torch.index_select(mm2_out, 0, inv_perm, out=out)
+        workspace3 = workspace3[inv_perm, ...]

-        return out
+        return workspace3


 def deep_gemm_moe_fp8(
```

```diff
@@ -5,7 +5,6 @@ import deep_ep
 import torch

 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.utils import (
     moe_kernel_quantize_input)
@@ -194,23 +193,20 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                            apply_router_weight_on_input: bool,
                            output_dtype: torch.dtype):

-        hidden_dim = fused_expert_output.size(-1)
         if fused_expert_output.ndim == 2:
+            hidden_dim = fused_expert_output.size(-1)
             fused_expert_output = fused_expert_output.view(
                 num_tokens, -1, hidden_dim)

         if not apply_router_weight_on_input:
             # The DeepEP combine kernels don't do the topk weight
             # multiplication. We multiply the weights locally.
-            m_x_topk = fused_expert_output.size(0)
-            fused_expert_output.mul_(topk_weights.view(m_x_topk, -1, 1))
+            fused_expert_output = fused_expert_output.to(torch.float32)
+            fused_expert_output = fused_expert_output * topk_weights.view(
+                fused_expert_output.size(0), -1, 1)
+            fused_expert_output = fused_expert_output.to(output_dtype)

-        out = torch.empty((num_tokens, hidden_dim),
-                          device=fused_expert_output.device,
-                          dtype=output_dtype)
-        ops.moe_sum(fused_expert_output, out)
-
-        return out
+        return fused_expert_output.sum(dim=1).to(output_dtype)

     def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor,
                  topk_weights: torch.Tensor, topk_ids: torch.Tensor,
```

```diff
@@ -18,7 +18,7 @@ def _moe_permute(
     expert_map: Optional[torch.Tensor],
     block_m: int,
 ) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
-           torch.Tensor]:
+           Optional[torch.Tensor]]:
     """
     Determine the sorted_token_ids, expert_ids for the given problem size.
     Permute the hidden states and scales according to `sorted_token_ids`.
```

```diff
@@ -234,13 +234,8 @@ def _per_token_group_quant_fp8(
     row = g_id // groups_per_row
     row_g_id = g_id % groups_per_row

-    # Ensure offset calculations use int64 to prevent overflow
-    y_ptr_offset = (row.to(tl.int64) * y_row_stride) + (row_g_id.to(tl.int64) *
-                                                        group_size)
-    y_ptr += y_ptr_offset
-
-    y_q_ptr_offset = g_id.to(tl.int64) * group_size
-    y_q_ptr += y_q_ptr_offset
+    y_ptr += (row * y_row_stride) + (row_g_id * group_size)
+    y_q_ptr += g_id * group_size
     y_s_ptr += g_id

     cols = tl.arange(0, BLOCK)  # N <= BLOCK
@@ -287,23 +282,15 @@ def _per_token_group_quant_fp8_colmajor(
     row = g_id // groups_per_row
     row_g_id = g_id % groups_per_row

-    # Ensure offset calculations use int64 to prevent overflow
-    y_ptr_offset = (row.to(tl.int64) * y_row_stride) + (row_g_id.to(tl.int64) *
-                                                        group_size)
-    y_ptr += y_ptr_offset
-
-    y_q_ptr_offset = g_id.to(tl.int64) * group_size
-    y_q_ptr += y_q_ptr_offset
+    y_ptr += (row * y_row_stride) + (row_g_id * group_size)
+    y_q_ptr += g_id * group_size

     # Convert g_id the flattened block coordinate to 2D so we can index
     # into the output y_scales matrix
     blocks_per_row = y_num_columns // group_size
     scale_col = g_id % blocks_per_row
     scale_row = g_id // blocks_per_row
-    # Ensure offset calculation uses int64 for y_s_ptr
-    y_s_ptr_offset = (scale_col.to(tl.int64) * y_s_col_stride) + scale_row.to(
-        tl.int64)
-    y_s_ptr += y_s_ptr_offset
+    y_s_ptr += scale_col * y_s_col_stride + scale_row

     cols = tl.arange(0, BLOCK)  # group_size <= BLOCK
     mask = cols < group_size
@@ -324,7 +311,6 @@ def per_token_group_quant_fp8(
     eps: float = 1e-10,
     dtype: Optional[torch.dtype] = None,
     column_major_scales: bool = False,
-    out_q: Optional[torch.Tensor] = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """Function to perform per-token-group quantization on an input tensor `x`.
     It converts the tensor values into signed float8 values and returns the
@@ -335,8 +321,6 @@ def per_token_group_quant_fp8(
         eps: The minimum to avoid dividing zero.
         dtype: The dype of output tensor. Note that only `torch.float8_e4m3fn`
             is supported for now.
-        column_major_scales: Outputs scales in column major.
-        out_q: Optional output tensor. If not provided, function will create.
     Returns:
         tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
             scaling factor for quantization.
@@ -351,11 +335,7 @@ def per_token_group_quant_fp8(
     fp8_min = finfo.min
     fp8_max = finfo.max

-    assert out_q is None or out_q.shape == x.shape
-    x_q = out_q
-    if x_q is None:
-        x_q = torch.empty_like(x, device=x.device, dtype=dtype)
-
+    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
     M = x.numel() // group_size
     N = group_size
     if column_major_scales:
```

```diff
@@ -233,10 +233,6 @@ class CudaPlatformBase(Platform):
                 logger.info_once("Using Triton backend on V1 engine.")
                 return ("vllm.v1.attention.backends."
                         "triton_attn.TritonAttentionBackend")
-            if dtype not in (torch.float16, torch.bfloat16):
-                logger.info_once(
-                    f"Using FlexAttenion backend for {dtype} on V1 engine.")
-                return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"  # noqa: E501
             if cls.is_device_capability(100):
                 # Prefer FlashInfer for V1 on Blackwell GPUs if installed
                 try:
```

```diff
@@ -40,13 +40,12 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
             blocksparse_params: Optional[dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
-            kv_sharing_target_layer_name: Optional[str],
             # MLA Specific Arguments
             **mla_args) -> None:
         super().__init__(num_heads, head_size, scale, num_kv_heads,
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
-                         kv_sharing_target_layer_name, **mla_args)
+                         **mla_args)

         unsupported_features = [
             alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
```

```diff
@@ -5,7 +5,6 @@
 import gc
 import time
 import weakref
-from contextlib import contextmanager
 from typing import TYPE_CHECKING, Any, Optional, Union

 import numpy as np
@@ -13,7 +12,6 @@ import torch
 import torch.distributed
 import torch.nn as nn

-import vllm.envs as envs
 from vllm.attention import AttentionType, get_attn_backend
 from vllm.attention.backends.abstract import (AttentionBackend,
                                               AttentionMetadataBuilder)
@@ -657,10 +655,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):

         # Fill unused with -1. Needed for reshape_and_cache
         self.seq_lens[num_reqs:].fill_(0)
-        # Note: pad query_start_loc to be non-decreasing, as kernels
-        # like FlashAttention requires that
-        self.query_start_loc[num_reqs + 1:].fill_(
-            self.query_start_loc_cpu[num_reqs].item())
+        self.query_start_loc[num_reqs + 1:].fill_(-1)

         query_start_loc = self.query_start_loc[:num_reqs + 1]
         seq_lens = self.seq_lens[:num_reqs]
@@ -1729,35 +1724,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):

         return prompt_logprobs_dict

-    @contextmanager
-    def maybe_randomize_inputs(self, input_ids: torch.Tensor):
-        """
-        Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set.
-        This is to help balance expert-selection
-         - during profile_run
-         - during DP rank dummy run
-        """
-        dp_size = self.vllm_config.parallel_config.data_parallel_size
-        randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1
-        if not randomize_inputs:
-            yield
-        else:
-            import functools
-
-            @functools.cache
-            def rand_input_ids() -> torch.Tensor:
-                return torch.randint_like(
-                    self.input_ids,
-                    low=0,
-                    high=self.model_config.get_vocab_size(),
-                    dtype=input_ids.dtype)
-
-            logger.debug("Randomizing dummy data for DP Rank")
-            input_ids.copy_(rand_input_ids()[:input_ids.size(0)],
-                            non_blocking=True)
-            yield
-            input_ids.fill_(0)
-
     @torch.inference_mode()
     def _dummy_run(
         self,
@@ -1838,7 +1804,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             intermediate_tensors = self.sync_and_slice_intermediate_tensors(
                 num_tokens, None, False)

-        with self.maybe_randomize_inputs(input_ids), set_forward_context(
+        with set_forward_context(
                 attn_metadata,
                 self.vllm_config,
                 num_tokens=num_tokens,
```