Compare commits
1 Commits
remove-reg
...
remove-asy
| Author | SHA1 | Date | |
|---|---|---|---|
| 0470cac520 |
@ -56,7 +56,6 @@ steps:
|
|||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/mq_llm_engine
|
- tests/mq_llm_engine
|
||||||
- tests/async_engine
|
|
||||||
- tests/test_inputs.py
|
- tests/test_inputs.py
|
||||||
- tests/test_outputs.py
|
- tests/test_outputs.py
|
||||||
- tests/multimodal
|
- tests/multimodal
|
||||||
@ -66,7 +65,6 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- python3 standalone_tests/lazy_imports.py
|
- python3 standalone_tests/lazy_imports.py
|
||||||
- pytest -v -s mq_llm_engine # MQLLMEngine
|
- pytest -v -s mq_llm_engine # MQLLMEngine
|
||||||
- pytest -v -s async_engine # AsyncLLMEngine
|
|
||||||
- pytest -v -s test_inputs.py
|
- pytest -v -s test_inputs.py
|
||||||
- pytest -v -s test_outputs.py
|
- pytest -v -s test_outputs.py
|
||||||
- pytest -v -s multimodal
|
- pytest -v -s multimodal
|
||||||
@ -227,6 +225,16 @@ steps:
|
|||||||
##### fast check tests #####
|
##### fast check tests #####
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
|
|
||||||
|
- label: Regression Test # 5min
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/test_regression
|
||||||
|
commands:
|
||||||
|
- pip install modelscope
|
||||||
|
- pytest -v -s test_regression.py
|
||||||
|
working_dir: "/vllm-workspace/tests" # optional
|
||||||
|
|
||||||
- label: Engine Test # 10min
|
- label: Engine Test # 10min
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
|
|||||||
@ -1,54 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
"""vllm.entrypoints.api_server with some extra logging for testing."""
|
|
||||||
from collections.abc import Iterable
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import uvicorn
|
|
||||||
from fastapi.responses import JSONResponse, Response
|
|
||||||
|
|
||||||
import vllm.entrypoints.api_server
|
|
||||||
import vllm.envs as envs
|
|
||||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
|
||||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
|
||||||
from vllm.utils import FlexibleArgumentParser
|
|
||||||
|
|
||||||
app = vllm.entrypoints.api_server.app
|
|
||||||
|
|
||||||
|
|
||||||
class AsyncLLMEngineWithStats(AsyncLLMEngine):
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
super().__init__(*args, **kwargs)
|
|
||||||
self._num_aborts = 0
|
|
||||||
|
|
||||||
async def _engine_abort(self, request_ids: Iterable[str]):
|
|
||||||
ids = list(request_ids)
|
|
||||||
self._num_aborts += len(ids)
|
|
||||||
await super()._engine_abort(ids)
|
|
||||||
|
|
||||||
def testing_stats(self) -> dict[str, Any]:
|
|
||||||
return {"num_aborted_requests": self._num_aborts}
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/stats")
|
|
||||||
def stats() -> Response:
|
|
||||||
"""Get the statistics of the engine."""
|
|
||||||
return JSONResponse(engine.testing_stats())
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = FlexibleArgumentParser()
|
|
||||||
parser.add_argument("--host", type=str, default="localhost")
|
|
||||||
parser.add_argument("--port", type=int, default=8000)
|
|
||||||
parser = AsyncEngineArgs.add_cli_args(parser)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
engine_args = AsyncEngineArgs.from_cli_args(args)
|
|
||||||
engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
|
|
||||||
vllm.entrypoints.api_server.engine = engine
|
|
||||||
uvicorn.run(app,
|
|
||||||
host=args.host,
|
|
||||||
port=args.port,
|
|
||||||
log_level="debug",
|
|
||||||
timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE)
|
|
||||||
@ -1,12 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function", autouse=True)
|
|
||||||
def use_v0_only(monkeypatch):
|
|
||||||
"""
|
|
||||||
Since this module is V0 only, set VLLM_USE_V1=0 for
|
|
||||||
all tests in the module.
|
|
||||||
"""
|
|
||||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
|
||||||
@ -1,113 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
import os
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
from multiprocessing import Pool
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import requests
|
|
||||||
|
|
||||||
|
|
||||||
def _query_server(prompt: str, max_tokens: int = 5) -> dict:
|
|
||||||
response = requests.post("http://localhost:8000/generate",
|
|
||||||
json={
|
|
||||||
"prompt": prompt,
|
|
||||||
"max_tokens": max_tokens,
|
|
||||||
"temperature": 0,
|
|
||||||
"ignore_eos": True
|
|
||||||
})
|
|
||||||
response.raise_for_status()
|
|
||||||
return response.json()
|
|
||||||
|
|
||||||
|
|
||||||
def _query_server_long(prompt: str) -> dict:
|
|
||||||
return _query_server(prompt, max_tokens=500)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def api_server(distributed_executor_backend: str):
|
|
||||||
script_path = Path(__file__).parent.joinpath(
|
|
||||||
"api_server_async_engine.py").absolute()
|
|
||||||
commands = [
|
|
||||||
sys.executable,
|
|
||||||
"-u",
|
|
||||||
str(script_path),
|
|
||||||
"--model",
|
|
||||||
"facebook/opt-125m",
|
|
||||||
"--host",
|
|
||||||
"127.0.0.1",
|
|
||||||
"--distributed-executor-backend",
|
|
||||||
distributed_executor_backend,
|
|
||||||
]
|
|
||||||
|
|
||||||
# API Server Test Requires V0.
|
|
||||||
my_env = os.environ.copy()
|
|
||||||
my_env["VLLM_USE_V1"] = "0"
|
|
||||||
uvicorn_process = subprocess.Popen(commands, env=my_env)
|
|
||||||
yield
|
|
||||||
uvicorn_process.terminate()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
|
|
||||||
def test_api_server(api_server, distributed_executor_backend: str):
|
|
||||||
"""
|
|
||||||
Run the API server and test it.
|
|
||||||
|
|
||||||
We run both the server and requests in separate processes.
|
|
||||||
|
|
||||||
We test that the server can handle incoming requests, including
|
|
||||||
multiple requests at the same time, and that it can handle requests
|
|
||||||
being cancelled without crashing.
|
|
||||||
"""
|
|
||||||
with Pool(32) as pool:
|
|
||||||
# Wait until the server is ready
|
|
||||||
prompts = ["warm up"] * 1
|
|
||||||
result = None
|
|
||||||
while not result:
|
|
||||||
try:
|
|
||||||
for r in pool.map(_query_server, prompts):
|
|
||||||
result = r
|
|
||||||
break
|
|
||||||
except requests.exceptions.ConnectionError:
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
# Actual tests start here
|
|
||||||
# Try with 1 prompt
|
|
||||||
for result in pool.map(_query_server, prompts):
|
|
||||||
assert result
|
|
||||||
|
|
||||||
num_aborted_requests = requests.get(
|
|
||||||
"http://localhost:8000/stats").json()["num_aborted_requests"]
|
|
||||||
assert num_aborted_requests == 0
|
|
||||||
|
|
||||||
# Try with 100 prompts
|
|
||||||
prompts = ["test prompt"] * 100
|
|
||||||
for result in pool.map(_query_server, prompts):
|
|
||||||
assert result
|
|
||||||
|
|
||||||
with Pool(32) as pool:
|
|
||||||
# Cancel requests
|
|
||||||
prompts = ["canceled requests"] * 100
|
|
||||||
pool.map_async(_query_server_long, prompts)
|
|
||||||
time.sleep(0.01)
|
|
||||||
pool.terminate()
|
|
||||||
pool.join()
|
|
||||||
|
|
||||||
# check cancellation stats
|
|
||||||
# give it some times to update the stats
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
num_aborted_requests = requests.get(
|
|
||||||
"http://localhost:8000/stats").json()["num_aborted_requests"]
|
|
||||||
assert num_aborted_requests > 0
|
|
||||||
|
|
||||||
# check that server still runs after cancellations
|
|
||||||
with Pool(32) as pool:
|
|
||||||
# Try with 100 prompts
|
|
||||||
prompts = ["test prompt after canceled"] * 100
|
|
||||||
for result in pool.map(_query_server, prompts):
|
|
||||||
assert result
|
|
||||||
@ -1,71 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from vllm.engine.async_llm_engine import RequestTracker
|
|
||||||
from vllm.outputs import RequestOutput
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_request_tracker():
|
|
||||||
tracker = RequestTracker()
|
|
||||||
stream_1 = tracker.add_request("1")
|
|
||||||
assert tracker.new_requests_event.is_set()
|
|
||||||
await tracker.wait_for_new_requests()
|
|
||||||
new, aborted = tracker.get_new_and_aborted_requests()
|
|
||||||
assert not tracker.new_requests_event.is_set()
|
|
||||||
assert len(new) == 1
|
|
||||||
assert new[0]["request_id"] == "1"
|
|
||||||
assert not aborted
|
|
||||||
assert not stream_1.finished
|
|
||||||
|
|
||||||
stream_2 = tracker.add_request("2")
|
|
||||||
stream_3 = tracker.add_request("3")
|
|
||||||
assert tracker.new_requests_event.is_set()
|
|
||||||
await tracker.wait_for_new_requests()
|
|
||||||
new, aborted = tracker.get_new_and_aborted_requests()
|
|
||||||
assert not tracker.new_requests_event.is_set()
|
|
||||||
assert len(new) == 2
|
|
||||||
assert new[0]["request_id"] == "2"
|
|
||||||
assert new[1]["request_id"] == "3"
|
|
||||||
assert not aborted
|
|
||||||
assert not stream_2.finished
|
|
||||||
assert not stream_3.finished
|
|
||||||
|
|
||||||
# request_ids must be unique
|
|
||||||
with pytest.raises(KeyError):
|
|
||||||
tracker.add_request("1")
|
|
||||||
assert not tracker.new_requests_event.is_set()
|
|
||||||
|
|
||||||
tracker.abort_request("1")
|
|
||||||
new, aborted = tracker.get_new_and_aborted_requests()
|
|
||||||
assert len(aborted) == 1
|
|
||||||
assert "1" in aborted
|
|
||||||
assert not new
|
|
||||||
assert stream_1.finished
|
|
||||||
|
|
||||||
stream_4 = tracker.add_request("4")
|
|
||||||
tracker.abort_request("4")
|
|
||||||
assert tracker.new_requests_event.is_set()
|
|
||||||
await tracker.wait_for_new_requests()
|
|
||||||
new, aborted = tracker.get_new_and_aborted_requests()
|
|
||||||
# aborted new requests will cancel each other out -
|
|
||||||
# there's no need for them to propagate into the
|
|
||||||
# engine
|
|
||||||
assert not aborted
|
|
||||||
assert not new
|
|
||||||
assert stream_4.finished
|
|
||||||
|
|
||||||
stream_5 = tracker.add_request("5")
|
|
||||||
assert tracker.new_requests_event.is_set()
|
|
||||||
tracker.process_request_output(
|
|
||||||
RequestOutput("2", "output", [], [], [], finished=True))
|
|
||||||
await tracker.wait_for_new_requests()
|
|
||||||
new, aborted = tracker.get_new_and_aborted_requests()
|
|
||||||
assert not tracker.new_requests_event.is_set()
|
|
||||||
assert not aborted
|
|
||||||
assert len(new) == 1
|
|
||||||
assert new[0]["request_id"] == "5"
|
|
||||||
assert stream_2.finished
|
|
||||||
assert not stream_5.finished
|
|
||||||
@ -54,54 +54,38 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
|
|||||||
op = context.operation
|
op = context.operation
|
||||||
assert op is not None
|
assert op is not None
|
||||||
|
|
||||||
def no_invalid_types(case: schemathesis.models.Case):
|
def no_file_type(case: schemathesis.models.Case):
|
||||||
"""
|
"""
|
||||||
This filter skips test cases with invalid data that schemathesis
|
This filter skips test cases for the `POST /tokenize` endpoint where the
|
||||||
incorrectly generates due to permissive schema configurations.
|
HTTP request body uses `"type": "file"` in any message's content.
|
||||||
|
We expect these cases to fail because that type isn't implemented here
|
||||||
1. Skips `POST /tokenize` endpoint cases with `"type": "file"` in
|
https://github.com/vllm-project/vllm/blob/0b34593017953051b3225b1483ce0f4670e3eb0e/vllm/entrypoints/chat_utils.py#L1038-L1095
|
||||||
message content, which isn't implemented.
|
|
||||||
|
|
||||||
2. Skips tool_calls with `"type": "custom"` which schemathesis
|
|
||||||
incorrectly generates instead of the valid `"type": "function"`.
|
|
||||||
|
|
||||||
Example test cases that are skipped:
|
Example test cases that are skipped:
|
||||||
curl -X POST -H 'Content-Type: application/json' \
|
curl -X POST -H 'Content-Type: application/json' \
|
||||||
-d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
|
-d '{"messages": [{"role": "assistant"}, {"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
|
||||||
http://localhost:8000/tokenize
|
http://localhost:8000/tokenize
|
||||||
|
|
||||||
curl -X POST -H 'Content-Type: application/json' \
|
curl -X POST -H 'Content-Type: application/json' \
|
||||||
-d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \
|
-d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
|
||||||
http://localhost:8000/v1/chat/completions
|
http://localhost:8000/tokenize
|
||||||
""" # noqa: E501
|
""" # noqa: E501
|
||||||
if (hasattr(case, "body") and isinstance(case.body, dict)
|
if (op.method.lower() == "post" and op.path == "/tokenize"
|
||||||
|
and hasattr(case, "body") and isinstance(case.body, dict)
|
||||||
and "messages" in case.body
|
and "messages" in case.body
|
||||||
and isinstance(case.body["messages"], list)
|
and isinstance(case.body["messages"], list)
|
||||||
and len(case.body["messages"]) > 0):
|
and len(case.body["messages"]) > 0):
|
||||||
|
|
||||||
for message in case.body["messages"]:
|
for message in case.body["messages"]:
|
||||||
if not isinstance(message, dict):
|
if not isinstance(message, dict):
|
||||||
continue
|
continue
|
||||||
|
content = message.get("content", [])
|
||||||
# Check for invalid file type in tokenize endpoint
|
if not isinstance(content, list) or len(content) == 0:
|
||||||
if op.method.lower() == "post" and op.path == "/tokenize":
|
continue
|
||||||
content = message.get("content", [])
|
if any(item.get("type") == "file" for item in content):
|
||||||
if (isinstance(content, list) and len(content) > 0 and any(
|
return False
|
||||||
item.get("type") == "file" for item in content)):
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Check for invalid tool_calls with non-function types
|
|
||||||
tool_calls = message.get("tool_calls", [])
|
|
||||||
if isinstance(tool_calls, list):
|
|
||||||
for tool_call in tool_calls:
|
|
||||||
if isinstance(tool_call, dict):
|
|
||||||
if tool_call.get("type") != "function":
|
|
||||||
return False
|
|
||||||
if "custom" in tool_call:
|
|
||||||
return False
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return strategy.filter(no_invalid_types)
|
return strategy.filter(no_file_type)
|
||||||
|
|
||||||
|
|
||||||
@schema.parametrize()
|
@schema.parametrize()
|
||||||
|
|||||||
78
tests/test_regression.py
Normal file
78
tests/test_regression.py
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
"""Containing tests that check for regressions in vLLM's behavior.
|
||||||
|
|
||||||
|
It should include tests that are reported by users and making sure they
|
||||||
|
will never happen again.
|
||||||
|
|
||||||
|
"""
|
||||||
|
import gc
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len")
|
||||||
|
def test_duplicated_ignored_sequence_group():
|
||||||
|
"""https://github.com/vllm-project/vllm/issues/1655"""
|
||||||
|
|
||||||
|
sampling_params = SamplingParams(temperature=0.01,
|
||||||
|
top_p=0.1,
|
||||||
|
max_tokens=256)
|
||||||
|
llm = LLM(model="distilbert/distilgpt2",
|
||||||
|
max_num_batched_tokens=4096,
|
||||||
|
tensor_parallel_size=1)
|
||||||
|
prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
|
||||||
|
outputs = llm.generate(prompts, sampling_params=sampling_params)
|
||||||
|
|
||||||
|
assert len(prompts) == len(outputs)
|
||||||
|
|
||||||
|
|
||||||
|
def test_max_tokens_none():
|
||||||
|
sampling_params = SamplingParams(temperature=0.01,
|
||||||
|
top_p=0.1,
|
||||||
|
max_tokens=None)
|
||||||
|
llm = LLM(model="distilbert/distilgpt2",
|
||||||
|
max_num_batched_tokens=4096,
|
||||||
|
tensor_parallel_size=1)
|
||||||
|
prompts = ["Just say hello!"]
|
||||||
|
outputs = llm.generate(prompts, sampling_params=sampling_params)
|
||||||
|
|
||||||
|
assert len(prompts) == len(outputs)
|
||||||
|
|
||||||
|
|
||||||
|
def test_gc():
|
||||||
|
llm = LLM(model="distilbert/distilgpt2", enforce_eager=True)
|
||||||
|
del llm
|
||||||
|
|
||||||
|
gc.collect()
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
|
# The memory allocated for model and KV cache should be released.
|
||||||
|
# The memory allocated for PyTorch and others should be less than 50MB.
|
||||||
|
# Usually, it's around 10MB.
|
||||||
|
allocated = torch.cuda.memory_allocated()
|
||||||
|
assert allocated < 50 * 1024 * 1024
|
||||||
|
|
||||||
|
|
||||||
|
def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
|
||||||
|
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
|
||||||
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv("VLLM_USE_MODELSCOPE", "True")
|
||||||
|
# Don't use HF_TOKEN for ModelScope repos, otherwise it will fail
|
||||||
|
# with 400 Client Error: Bad Request.
|
||||||
|
m.setenv("HF_TOKEN", "")
|
||||||
|
llm = LLM(model="qwen/Qwen1.5-0.5B-Chat")
|
||||||
|
|
||||||
|
prompts = [
|
||||||
|
"Hello, my name is",
|
||||||
|
"The president of the United States is",
|
||||||
|
"The capital of France is",
|
||||||
|
"The future of AI is",
|
||||||
|
]
|
||||||
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||||
|
|
||||||
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
|
assert len(outputs) == 4
|
||||||
@ -297,7 +297,7 @@ class CustomAllreduce:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def free_shared_buffer(pointers: list[int],
|
def free_shared_buffer(pointers: list[int],
|
||||||
group: Optional[ProcessGroup] = None,
|
group: Optional[ProcessGroup] = None,
|
||||||
rank: Optional[int] = None) -> None:
|
rank: Optional[int] = 0) -> None:
|
||||||
if rank is None:
|
if rank is None:
|
||||||
rank = dist.get_rank(group=group)
|
rank = dist.get_rank(group=group)
|
||||||
if ops is not None:
|
if ops is not None:
|
||||||
|
|||||||
Reference in New Issue
Block a user