Compare commits

...

3 Commits

Author SHA1 Message Date
36ccdcad2c updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-08-14 03:34:37 +00:00
1d20c34717 [CI] Fix tests/distributed/test_ca_buffer_sharing.py (#22849)
Signed-off-by: ilmarkov <imarkov@redhat.com>
Co-authored-by: ilmarkov <imarkov@redhat.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-08-13 20:09:30 -07:00
b6af24fba7 [CI][Entrypoints]: add filter to generation to filter out invalid tool calls (#22826)
Signed-off-by: Will Eaton <weaton@redhat.com>
2025-08-13 20:09:07 -07:00
4 changed files with 33 additions and 105 deletions

View File

@ -227,16 +227,6 @@ steps:
##### fast check tests ##### ##### fast check tests #####
##### 1 GPU test ##### ##### 1 GPU test #####
- label: Regression Test # 5min
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/test_regression
commands:
- pip install modelscope
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 10min - label: Engine Test # 10min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:

View File

@ -54,38 +54,54 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
op = context.operation op = context.operation
assert op is not None assert op is not None
def no_file_type(case: schemathesis.models.Case): def no_invalid_types(case: schemathesis.models.Case):
""" """
This filter skips test cases for the `POST /tokenize` endpoint where the This filter skips test cases with invalid data that schemathesis
HTTP request body uses `"type": "file"` in any message's content. incorrectly generates due to permissive schema configurations.
We expect these cases to fail because that type isn't implemented here
https://github.com/vllm-project/vllm/blob/0b34593017953051b3225b1483ce0f4670e3eb0e/vllm/entrypoints/chat_utils.py#L1038-L1095 1. Skips `POST /tokenize` endpoint cases with `"type": "file"` in
message content, which isn't implemented.
2. Skips tool_calls with `"type": "custom"` which schemathesis
incorrectly generates instead of the valid `"type": "function"`.
Example test cases that are skipped: Example test cases that are skipped:
curl -X POST -H 'Content-Type: application/json' \ curl -X POST -H 'Content-Type: application/json' \
-d '{"messages": [{"role": "assistant"}, {"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \ -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
http://localhost:8000/tokenize http://localhost:8000/tokenize
curl -X POST -H 'Content-Type: application/json' \ curl -X POST -H 'Content-Type: application/json' \
-d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \ -d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \
http://localhost:8000/tokenize http://localhost:8000/v1/chat/completions
""" # noqa: E501 """ # noqa: E501
if (op.method.lower() == "post" and op.path == "/tokenize" if (hasattr(case, "body") and isinstance(case.body, dict)
and hasattr(case, "body") and isinstance(case.body, dict)
and "messages" in case.body and "messages" in case.body
and isinstance(case.body["messages"], list) and isinstance(case.body["messages"], list)
and len(case.body["messages"]) > 0): and len(case.body["messages"]) > 0):
for message in case.body["messages"]: for message in case.body["messages"]:
if not isinstance(message, dict): if not isinstance(message, dict):
continue continue
content = message.get("content", [])
if not isinstance(content, list) or len(content) == 0: # Check for invalid file type in tokenize endpoint
continue if op.method.lower() == "post" and op.path == "/tokenize":
if any(item.get("type") == "file" for item in content): content = message.get("content", [])
return False if (isinstance(content, list) and len(content) > 0 and any(
item.get("type") == "file" for item in content)):
return False
# Check for invalid tool_calls with non-function types
tool_calls = message.get("tool_calls", [])
if isinstance(tool_calls, list):
for tool_call in tool_calls:
if isinstance(tool_call, dict):
if tool_call.get("type") != "function":
return False
if "custom" in tool_call:
return False
return True return True
return strategy.filter(no_file_type) return strategy.filter(no_invalid_types)
@schema.parametrize() @schema.parametrize()

View File

@ -1,78 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Containing tests that check for regressions in vLLM's behavior.
It should include tests for issues reported by users, to make sure they
never happen again.
"""
import gc
import pytest
import torch
from vllm import LLM, SamplingParams
@pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len")
def test_duplicated_ignored_sequence_group():
    """Regression test for https://github.com/vllm-project/vllm/issues/1655.

    Submits one short prompt together with one very long prompt (a phrase
    repeated 1000 times) and checks that exactly one output is returned
    per prompt. Currently skipped; see the skip reason on the decorator.
    """
    prompts = ["This is a short prompt", "This is a very long prompt " * 1000]

    sampling_params = SamplingParams(
        temperature=0.01,
        top_p=0.1,
        max_tokens=256,
    )
    engine = LLM(
        model="distilbert/distilgpt2",
        max_num_batched_tokens=4096,
        tensor_parallel_size=1,
    )

    results = engine.generate(prompts, sampling_params=sampling_params)

    # Every prompt must yield an output entry, even the over-long one.
    assert len(results) == len(prompts)
def test_max_tokens_none():
    """Check that generation works when ``max_tokens`` is ``None``.

    Runs a single prompt through the engine with ``max_tokens=None`` and
    verifies that one output is returned for it.
    """
    prompts = ["Just say hello!"]

    sampling_params = SamplingParams(
        temperature=0.01,
        top_p=0.1,
        max_tokens=None,
    )
    engine = LLM(
        model="distilbert/distilgpt2",
        max_num_batched_tokens=4096,
        tensor_parallel_size=1,
    )

    results = engine.generate(prompts, sampling_params=sampling_params)

    assert len(results) == len(prompts)
def test_gc():
    """Check that CUDA memory is released once the engine is deleted.

    Builds an engine in eager mode, drops the only reference to it, forces
    a garbage-collection pass, empties the CUDA caching allocator, and then
    asserts that the memory still allocated is below 50 MB.
    """
    engine = LLM(model="distilbert/distilgpt2", enforce_eager=True)
    del engine

    gc.collect()
    torch.cuda.empty_cache()

    # The memory allocated for model and KV cache should be released.
    # The memory allocated for PyTorch and others should be less than 50MB.
    # Usually, it's around 10MB.
    limit_bytes = 50 * 1024 * 1024
    assert torch.cuda.memory_allocated() < limit_bytes
def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
    """Check that a model hosted on ModelScope can be loaded and used.

    Args:
        monkeypatch: pytest fixture used to set environment variables for
            the duration of the test only.

    The test sets ``VLLM_USE_MODELSCOPE`` and clears ``HF_TOKEN``, loads the
    model, generates from a fixed list of prompts, and asserts that one
    output is returned per prompt.
    """
    # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_MODELSCOPE", "True")
        # Don't use HF_TOKEN for ModelScope repos, otherwise it will fail
        # with 400 Client Error: Bad Request.
        m.setenv("HF_TOKEN", "")
        llm = LLM(model="qwen/Qwen1.5-0.5B-Chat")

        prompts = [
            "Hello, my name is",
            "The president of the United States is",
            "The capital of France is",
            "The future of AI is",
        ]

        sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

        outputs = llm.generate(prompts, sampling_params)
        # Compare against the prompt list rather than a hard-coded count so
        # the assertion stays correct if prompts are added or removed.
        assert len(outputs) == len(prompts)

View File

@ -297,7 +297,7 @@ class CustomAllreduce:
@staticmethod @staticmethod
def free_shared_buffer(pointers: list[int], def free_shared_buffer(pointers: list[int],
group: Optional[ProcessGroup] = None, group: Optional[ProcessGroup] = None,
rank: Optional[int] = 0) -> None: rank: Optional[int] = None) -> None:
if rank is None: if rank is None:
rank = dist.get_rank(group=group) rank = dist.get_rank(group=group)
if ops is not None: if ops is not None: