updaed

Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-08-14 02:14:03 +00:00
9 changed files with 105 additions and 285 deletions
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -56,7 +56,6 @@ steps:
  source_file_dependencies:
  - vllm/
  - tests/mq_llm_engine
  - tests/async_engine
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
@ -66,7 +65,6 @@ steps:
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s mq_llm_engine # MQLLMEngine
  - pytest -v -s async_engine # AsyncLLMEngine
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s multimodal
@ -227,6 +225,16 @@ steps:
 ##### fast check tests  #####
 #####  1 GPU test  #####
 - label: Regression Test # 5min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/test_regression
  commands:
  - pip install modelscope
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional
 - label: Engine Test # 10min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
--- a/tests/async_engine/init.py
+++ b/tests/async_engine/init.py
--- a/tests/async_engine/api_server_async_engine.py
+++ b/tests/async_engine/api_server_async_engine.py
@ -1,54 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """vllm.entrypoints.api_server with some extra logging for testing."""
 from collections.abc import Iterable
 from typing import Any
 import uvicorn
 from fastapi.responses import JSONResponse, Response
 import vllm.entrypoints.api_server
 import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.utils import FlexibleArgumentParser
 app = vllm.entrypoints.api_server.app
 class AsyncLLMEngineWithStats(AsyncLLMEngine):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._num_aborts = 0
    async def _engine_abort(self, request_ids: Iterable[str]):
        ids = list(request_ids)
        self._num_aborts += len(ids)
        await super()._engine_abort(ids)
    def testing_stats(self) -> dict[str, Any]:
        return {"num_aborted_requests": self._num_aborts}
@app.get("/stats")
 def stats() -> Response:
    """Get the statistics of the engine."""
    return JSONResponse(engine.testing_stats())
 if __name__ == "__main__":
    parser = FlexibleArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
    vllm.entrypoints.api_server.engine = engine
    uvicorn.run(app,
                host=args.host,
                port=args.port,
                log_level="debug",
                timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE)
--- a/tests/async_engine/conftest.py
+++ b/tests/async_engine/conftest.py
@ -1,12 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
@pytest.fixture(scope="function", autouse=True)
 def use_v0_only(monkeypatch):
    """
    Since this module is V0 only, set VLLM_USE_V1=0 for
    all tests in the module.
    """
    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@ -1,113 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 import subprocess
 import sys
 import time
 from multiprocessing import Pool
 from pathlib import Path
 import pytest
 import requests
 def _query_server(prompt: str, max_tokens: int = 5) -> dict:
    response = requests.post("http://localhost:8000/generate",
                             json={
                                 "prompt": prompt,
                                 "max_tokens": max_tokens,
                                 "temperature": 0,
                                 "ignore_eos": True
                             })
    response.raise_for_status()
    return response.json()
 def _query_server_long(prompt: str) -> dict:
    return _query_server(prompt, max_tokens=500)
@pytest.fixture
 def api_server(distributed_executor_backend: str):
    script_path = Path(__file__).parent.joinpath(
        "api_server_async_engine.py").absolute()
    commands = [
        sys.executable,
        "-u",
        str(script_path),
        "--model",
        "facebook/opt-125m",
        "--host",
        "127.0.0.1",
        "--distributed-executor-backend",
        distributed_executor_backend,
    ]
    # API Server Test Requires V0.
    my_env = os.environ.copy()
    my_env["VLLM_USE_V1"] = "0"
    uvicorn_process = subprocess.Popen(commands, env=my_env)
    yield
    uvicorn_process.terminate()
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
 def test_api_server(api_server, distributed_executor_backend: str):
    """
    Run the API server and test it.
    We run both the server and requests in separate processes.
    We test that the server can handle incoming requests, including
    multiple requests at the same time, and that it can handle requests
    being cancelled without crashing.
    """
    with Pool(32) as pool:
        # Wait until the server is ready
        prompts = ["warm up"] * 1
        result = None
        while not result:
            try:
                for r in pool.map(_query_server, prompts):
                    result = r
                    break
            except requests.exceptions.ConnectionError:
                time.sleep(1)
        # Actual tests start here
        # Try with 1 prompt
        for result in pool.map(_query_server, prompts):
            assert result
        num_aborted_requests = requests.get(
            "http://localhost:8000/stats").json()["num_aborted_requests"]
        assert num_aborted_requests == 0
        # Try with 100 prompts
        prompts = ["test prompt"] * 100
        for result in pool.map(_query_server, prompts):
            assert result
    with Pool(32) as pool:
        # Cancel requests
        prompts = ["canceled requests"] * 100
        pool.map_async(_query_server_long, prompts)
        time.sleep(0.01)
        pool.terminate()
        pool.join()
        # check cancellation stats
        # give it some times to update the stats
        time.sleep(1)
        num_aborted_requests = requests.get(
            "http://localhost:8000/stats").json()["num_aborted_requests"]
        assert num_aborted_requests > 0
    # check that server still runs after cancellations
    with Pool(32) as pool:
        # Try with 100 prompts
        prompts = ["test prompt after canceled"] * 100
        for result in pool.map(_query_server, prompts):
            assert result
--- a/tests/async_engine/test_request_tracker.py
+++ b/tests/async_engine/test_request_tracker.py
@ -1,71 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 from vllm.engine.async_llm_engine import RequestTracker
 from vllm.outputs import RequestOutput
@pytest.mark.asyncio
 async def test_request_tracker():
    tracker = RequestTracker()
    stream_1 = tracker.add_request("1")
    assert tracker.new_requests_event.is_set()
    await tracker.wait_for_new_requests()
    new, aborted = tracker.get_new_and_aborted_requests()
    assert not tracker.new_requests_event.is_set()
    assert len(new) == 1
    assert new[0]["request_id"] == "1"
    assert not aborted
    assert not stream_1.finished
    stream_2 = tracker.add_request("2")
    stream_3 = tracker.add_request("3")
    assert tracker.new_requests_event.is_set()
    await tracker.wait_for_new_requests()
    new, aborted = tracker.get_new_and_aborted_requests()
    assert not tracker.new_requests_event.is_set()
    assert len(new) == 2
    assert new[0]["request_id"] == "2"
    assert new[1]["request_id"] == "3"
    assert not aborted
    assert not stream_2.finished
    assert not stream_3.finished
    # request_ids must be unique
    with pytest.raises(KeyError):
        tracker.add_request("1")
    assert not tracker.new_requests_event.is_set()
    tracker.abort_request("1")
    new, aborted = tracker.get_new_and_aborted_requests()
    assert len(aborted) == 1
    assert "1" in aborted
    assert not new
    assert stream_1.finished
    stream_4 = tracker.add_request("4")
    tracker.abort_request("4")
    assert tracker.new_requests_event.is_set()
    await tracker.wait_for_new_requests()
    new, aborted = tracker.get_new_and_aborted_requests()
    # aborted new requests will cancel each other out -
    # there's no need for them to propagate into the
    # engine
    assert not aborted
    assert not new
    assert stream_4.finished
    stream_5 = tracker.add_request("5")
    assert tracker.new_requests_event.is_set()
    tracker.process_request_output(
        RequestOutput("2", "output", [], [], [], finished=True))
    await tracker.wait_for_new_requests()
    new, aborted = tracker.get_new_and_aborted_requests()
    assert not tracker.new_requests_event.is_set()
    assert not aborted
    assert len(new) == 1
    assert new[0]["request_id"] == "5"
    assert stream_2.finished
    assert not stream_5.finished
--- a/tests/entrypoints/openai/test_openai_schema.py
+++ b/tests/entrypoints/openai/test_openai_schema.py
@ -54,54 +54,38 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
    op = context.operation
    assert op is not None
-    def no_invalid_types(case: schemathesis.models.Case):
+    def no_file_type(case: schemathesis.models.Case):
        """
-        This filter skips test cases with invalid data that schemathesis
+        This filter skips test cases for the `POST /tokenize` endpoint where the
-        incorrectly generates due to permissive schema configurations.
+        HTTP request body uses `"type": "file"` in any message's content.
-        
+        We expect these cases to fail because that type isn't implemented here
-        1. Skips `POST /tokenize` endpoint cases with `"type": "file"` in 
+        https://github.com/vllm-project/vllm/blob/0b34593017953051b3225b1483ce0f4670e3eb0e/vllm/entrypoints/chat_utils.py#L1038-L1095
           message content, which isn't implemented.
        2. Skips tool_calls with `"type": "custom"` which schemathesis 
           incorrectly generates instead of the valid `"type": "function"`.
        Example test cases that are skipped:
        curl -X POST -H 'Content-Type: application/json' \
-            -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
+            -d '{"messages": [{"role": "assistant"}, {"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
            http://localhost:8000/tokenize
        curl -X POST -H 'Content-Type: application/json' \
-            -d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \
+            -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
-            http://localhost:8000/v1/chat/completions
+            http://localhost:8000/tokenize
        """  # noqa: E501
-        if (hasattr(case, "body") and isinstance(case.body, dict)
+        if (op.method.lower() == "post" and op.path == "/tokenize"
                and hasattr(case, "body") and isinstance(case.body, dict)
                and "messages" in case.body
                and isinstance(case.body["messages"], list)
                and len(case.body["messages"]) > 0):
            for message in case.body["messages"]:
                if not isinstance(message, dict):
                    continue
-
+                content = message.get("content", [])
-                # Check for invalid file type in tokenize endpoint
+                if not isinstance(content, list) or len(content) == 0:
-                if op.method.lower() == "post" and op.path == "/tokenize":
+                    continue
-                    content = message.get("content", [])
+                if any(item.get("type") == "file" for item in content):
-                    if (isinstance(content, list) and len(content) > 0 and any(
+                    return False
                            item.get("type") == "file" for item in content)):
                        return False
                # Check for invalid tool_calls with non-function types
                tool_calls = message.get("tool_calls", [])
                if isinstance(tool_calls, list):
                    for tool_call in tool_calls:
                        if isinstance(tool_call, dict):
                            if tool_call.get("type") != "function":
                                return False
                            if "custom" in tool_call:
                                return False
        return True
-    return strategy.filter(no_invalid_types)
+    return strategy.filter(no_file_type)
@schema.parametrize()
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@ -0,0 +1,78 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Containing tests that check for regressions in vLLM's behavior.
 It should include tests that are reported by users and making sure they
 will never happen again.
 """
 import gc
 import pytest
 import torch
 from vllm import LLM, SamplingParams
@pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len")
 def test_duplicated_ignored_sequence_group():
    """https://github.com/vllm-project/vllm/issues/1655"""
    sampling_params = SamplingParams(temperature=0.01,
                                     top_p=0.1,
                                     max_tokens=256)
    llm = LLM(model="distilbert/distilgpt2",
              max_num_batched_tokens=4096,
              tensor_parallel_size=1)
    prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
    outputs = llm.generate(prompts, sampling_params=sampling_params)
    assert len(prompts) == len(outputs)
 def test_max_tokens_none():
    sampling_params = SamplingParams(temperature=0.01,
                                     top_p=0.1,
                                     max_tokens=None)
    llm = LLM(model="distilbert/distilgpt2",
              max_num_batched_tokens=4096,
              tensor_parallel_size=1)
    prompts = ["Just say hello!"]
    outputs = llm.generate(prompts, sampling_params=sampling_params)
    assert len(prompts) == len(outputs)
 def test_gc():
    llm = LLM(model="distilbert/distilgpt2", enforce_eager=True)
    del llm
    gc.collect()
    torch.cuda.empty_cache()
    # The memory allocated for model and KV cache should be released.
    # The memory allocated for PyTorch and others should be less than 50MB.
    # Usually, it's around 10MB.
    allocated = torch.cuda.memory_allocated()
    assert allocated < 50 * 1024 * 1024
 def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
    # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_MODELSCOPE", "True")
        # Don't use HF_TOKEN for ModelScope repos, otherwise it will fail
        # with 400 Client Error: Bad Request.
        m.setenv("HF_TOKEN", "")
        llm = LLM(model="qwen/Qwen1.5-0.5B-Chat")
        prompts = [
            "Hello, my name is",
            "The president of the United States is",
            "The capital of France is",
            "The future of AI is",
        ]
        sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
        outputs = llm.generate(prompts, sampling_params)
        assert len(outputs) == 4
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@ -297,7 +297,7 @@ class CustomAllreduce:
    @staticmethod
    def free_shared_buffer(pointers: list[int],
                           group: Optional[ProcessGroup] = None,
-                           rank: Optional[int] = None) -> None:
+                           rank: Optional[int] = 0) -> None:
        if rank is None:
            rank = dist.get_rank(group=group)
        if ops is not None: