Compare commits


3 Commits

Author SHA1 Message Date
36ccdcad2c updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-08-14 03:34:37 +00:00
1d20c34717 [CI] Fix tests/distributed/test_ca_buffer_sharing.py (#22849)
Signed-off-by: ilmarkov <imarkov@redhat.com>
Co-authored-by: ilmarkov <imarkov@redhat.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-08-13 20:09:30 -07:00
b6af24fba7 [CI][Entrypoints]: add filter to generation to filter out invalid tool calls (#22826)
Signed-off-by: Will Eaton <weaton@redhat.com>
2025-08-13 20:09:07 -07:00
9 changed files with 285 additions and 105 deletions

View File

@@ -56,6 +56,7 @@ steps:
source_file_dependencies:
- vllm/
- tests/mq_llm_engine
- tests/async_engine
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
@@ -65,6 +66,7 @@ steps:
commands:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s multimodal
@@ -225,16 +227,6 @@ steps:
##### fast check tests #####
##### 1 GPU test #####
- label: Regression Test # 5min
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/test_regression
commands:
- pip install modelscope
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 10min
mirror_hardwares: [amdexperimental]
source_file_dependencies:

View File

View File

@@ -0,0 +1,54 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""vllm.entrypoints.api_server with some extra logging for testing."""
from collections.abc import Iterable
from typing import Any

import uvicorn
from fastapi.responses import JSONResponse, Response

import vllm.entrypoints.api_server
import vllm.envs as envs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.utils import FlexibleArgumentParser

app = vllm.entrypoints.api_server.app


class AsyncLLMEngineWithStats(AsyncLLMEngine):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._num_aborts = 0

    async def _engine_abort(self, request_ids: Iterable[str]):
        ids = list(request_ids)
        self._num_aborts += len(ids)
        await super()._engine_abort(ids)

    def testing_stats(self) -> dict[str, Any]:
        return {"num_aborted_requests": self._num_aborts}


@app.get("/stats")
def stats() -> Response:
    """Get the statistics of the engine."""
    return JSONResponse(engine.testing_stats())


if __name__ == "__main__":
    parser = FlexibleArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()

    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
    vllm.entrypoints.api_server.engine = engine
    uvicorn.run(app,
                host=args.host,
                port=args.port,
                log_level="debug",
                timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE)

View File

@@ -0,0 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest


@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """
    Since this module is V0 only, set VLLM_USE_V1=0 for
    all tests in the module.
    """
    monkeypatch.setenv('VLLM_USE_V1', '0')

View File

@@ -0,0 +1,113 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import subprocess
import sys
import time
from multiprocessing import Pool
from pathlib import Path

import pytest
import requests


def _query_server(prompt: str, max_tokens: int = 5) -> dict:
    response = requests.post("http://localhost:8000/generate",
                             json={
                                 "prompt": prompt,
                                 "max_tokens": max_tokens,
                                 "temperature": 0,
                                 "ignore_eos": True
                             })
    response.raise_for_status()
    return response.json()


def _query_server_long(prompt: str) -> dict:
    return _query_server(prompt, max_tokens=500)


@pytest.fixture
def api_server(distributed_executor_backend: str):
    script_path = Path(__file__).parent.joinpath(
        "api_server_async_engine.py").absolute()
    commands = [
        sys.executable,
        "-u",
        str(script_path),
        "--model",
        "facebook/opt-125m",
        "--host",
        "127.0.0.1",
        "--distributed-executor-backend",
        distributed_executor_backend,
    ]

    # API Server Test Requires V0.
    my_env = os.environ.copy()
    my_env["VLLM_USE_V1"] = "0"

    uvicorn_process = subprocess.Popen(commands, env=my_env)
    yield
    uvicorn_process.terminate()


@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
def test_api_server(api_server, distributed_executor_backend: str):
    """
    Run the API server and test it.

    We run both the server and requests in separate processes.

    We test that the server can handle incoming requests, including
    multiple requests at the same time, and that it can handle requests
    being cancelled without crashing.
    """
    with Pool(32) as pool:
        # Wait until the server is ready
        prompts = ["warm up"] * 1
        result = None
        while not result:
            try:
                for r in pool.map(_query_server, prompts):
                    result = r
                    break
            except requests.exceptions.ConnectionError:
                time.sleep(1)

        # Actual tests start here
        # Try with 1 prompt
        for result in pool.map(_query_server, prompts):
            assert result

        num_aborted_requests = requests.get(
            "http://localhost:8000/stats").json()["num_aborted_requests"]
        assert num_aborted_requests == 0

        # Try with 100 prompts
        prompts = ["test prompt"] * 100
        for result in pool.map(_query_server, prompts):
            assert result

    with Pool(32) as pool:
        # Cancel requests
        prompts = ["canceled requests"] * 100
        pool.map_async(_query_server_long, prompts)
        time.sleep(0.01)
        pool.terminate()
        pool.join()

        # check cancellation stats
        # give it some times to update the stats
        time.sleep(1)

        num_aborted_requests = requests.get(
            "http://localhost:8000/stats").json()["num_aborted_requests"]
        assert num_aborted_requests > 0

    # check that server still runs after cancellations
    with Pool(32) as pool:
        # Try with 100 prompts
        prompts = ["test prompt after canceled"] * 100
        for result in pool.map(_query_server, prompts):
            assert result

View File

@@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

from vllm.engine.async_llm_engine import RequestTracker
from vllm.outputs import RequestOutput


@pytest.mark.asyncio
async def test_request_tracker():
    tracker = RequestTracker()
    stream_1 = tracker.add_request("1")
    assert tracker.new_requests_event.is_set()
    await tracker.wait_for_new_requests()
    new, aborted = tracker.get_new_and_aborted_requests()
    assert not tracker.new_requests_event.is_set()
    assert len(new) == 1
    assert new[0]["request_id"] == "1"
    assert not aborted
    assert not stream_1.finished

    stream_2 = tracker.add_request("2")
    stream_3 = tracker.add_request("3")
    assert tracker.new_requests_event.is_set()
    await tracker.wait_for_new_requests()
    new, aborted = tracker.get_new_and_aborted_requests()
    assert not tracker.new_requests_event.is_set()
    assert len(new) == 2
    assert new[0]["request_id"] == "2"
    assert new[1]["request_id"] == "3"
    assert not aborted
    assert not stream_2.finished
    assert not stream_3.finished

    # request_ids must be unique
    with pytest.raises(KeyError):
        tracker.add_request("1")
    assert not tracker.new_requests_event.is_set()

    tracker.abort_request("1")
    new, aborted = tracker.get_new_and_aborted_requests()
    assert len(aborted) == 1
    assert "1" in aborted
    assert not new
    assert stream_1.finished

    stream_4 = tracker.add_request("4")
    tracker.abort_request("4")
    assert tracker.new_requests_event.is_set()
    await tracker.wait_for_new_requests()
    new, aborted = tracker.get_new_and_aborted_requests()
    # aborted new requests will cancel each other out -
    # there's no need for them to propagate into the
    # engine
    assert not aborted
    assert not new
    assert stream_4.finished

    stream_5 = tracker.add_request("5")
    assert tracker.new_requests_event.is_set()
    tracker.process_request_output(
        RequestOutput("2", "output", [], [], [], finished=True))
    await tracker.wait_for_new_requests()
    new, aborted = tracker.get_new_and_aborted_requests()
    assert not tracker.new_requests_event.is_set()
    assert not aborted
    assert len(new) == 1
    assert new[0]["request_id"] == "5"
    assert stream_2.finished
    assert not stream_5.finished

View File

@@ -54,38 +54,54 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
op = context.operation
assert op is not None
def no_file_type(case: schemathesis.models.Case):
def no_invalid_types(case: schemathesis.models.Case):
"""
This filter skips test cases for the `POST /tokenize` endpoint where the
HTTP request body uses `"type": "file"` in any message's content.
We expect these cases to fail because that type isn't implemented here
https://github.com/vllm-project/vllm/blob/0b34593017953051b3225b1483ce0f4670e3eb0e/vllm/entrypoints/chat_utils.py#L1038-L1095
This filter skips test cases with invalid data that schemathesis
incorrectly generates due to permissive schema configurations.
1. Skips `POST /tokenize` endpoint cases with `"type": "file"` in
message content, which isn't implemented.
2. Skips tool_calls with `"type": "custom"` which schemathesis
incorrectly generates instead of the valid `"type": "function"`.
Example test cases that are skipped:
curl -X POST -H 'Content-Type: application/json' \
-d '{"messages": [{"role": "assistant"}, {"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
-d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
http://localhost:8000/tokenize
curl -X POST -H 'Content-Type: application/json' \
-d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
http://localhost:8000/tokenize
-d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \
http://localhost:8000/v1/chat/completions
""" # noqa: E501
if (op.method.lower() == "post" and op.path == "/tokenize"
and hasattr(case, "body") and isinstance(case.body, dict)
if (hasattr(case, "body") and isinstance(case.body, dict)
and "messages" in case.body
and isinstance(case.body["messages"], list)
and len(case.body["messages"]) > 0):
for message in case.body["messages"]:
if not isinstance(message, dict):
continue
content = message.get("content", [])
if not isinstance(content, list) or len(content) == 0:
continue
if any(item.get("type") == "file" for item in content):
return False
# Check for invalid file type in tokenize endpoint
if op.method.lower() == "post" and op.path == "/tokenize":
content = message.get("content", [])
if (isinstance(content, list) and len(content) > 0 and any(
item.get("type") == "file" for item in content)):
return False
# Check for invalid tool_calls with non-function types
tool_calls = message.get("tool_calls", [])
if isinstance(tool_calls, list):
for tool_call in tool_calls:
if isinstance(tool_call, dict):
if tool_call.get("type") != "function":
return False
if "custom" in tool_call:
return False
return True
return strategy.filter(no_file_type)
return strategy.filter(no_invalid_types)
@schema.parametrize()

View File

@@ -1,78 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Containing tests that check for regressions in vLLM's behavior.

It should include tests that are reported by users and making sure they
will never happen again.

"""
import gc

import pytest
import torch

from vllm import LLM, SamplingParams


@pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len")
def test_duplicated_ignored_sequence_group():
    """https://github.com/vllm-project/vllm/issues/1655"""

    sampling_params = SamplingParams(temperature=0.01,
                                     top_p=0.1,
                                     max_tokens=256)
    llm = LLM(model="distilbert/distilgpt2",
              max_num_batched_tokens=4096,
              tensor_parallel_size=1)
    prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
    outputs = llm.generate(prompts, sampling_params=sampling_params)

    assert len(prompts) == len(outputs)


def test_max_tokens_none():
    sampling_params = SamplingParams(temperature=0.01,
                                     top_p=0.1,
                                     max_tokens=None)
    llm = LLM(model="distilbert/distilgpt2",
              max_num_batched_tokens=4096,
              tensor_parallel_size=1)
    prompts = ["Just say hello!"]
    outputs = llm.generate(prompts, sampling_params=sampling_params)

    assert len(prompts) == len(outputs)


def test_gc():
    llm = LLM(model="distilbert/distilgpt2", enforce_eager=True)
    del llm

    gc.collect()
    torch.cuda.empty_cache()

    # The memory allocated for model and KV cache should be released.
    # The memory allocated for PyTorch and others should be less than 50MB.
    # Usually, it's around 10MB.
    allocated = torch.cuda.memory_allocated()
    assert allocated < 50 * 1024 * 1024


def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
    # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_MODELSCOPE", "True")
        # Don't use HF_TOKEN for ModelScope repos, otherwise it will fail
        # with 400 Client Error: Bad Request.
        m.setenv("HF_TOKEN", "")
        llm = LLM(model="qwen/Qwen1.5-0.5B-Chat")

        prompts = [
            "Hello, my name is",
            "The president of the United States is",
            "The capital of France is",
            "The future of AI is",
        ]
        sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
        outputs = llm.generate(prompts, sampling_params)

        assert len(outputs) == 4

View File

@@ -297,7 +297,7 @@ class CustomAllreduce:
@staticmethod
def free_shared_buffer(pointers: list[int],
group: Optional[ProcessGroup] = None,
rank: Optional[int] = 0) -> None:
rank: Optional[int] = None) -> None:
if rank is None:
rank = dist.get_rank(group=group)
if ops is not None: