[Deprecation][2/N] Replace --task with --runner and --convert (#21470)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@ -148,9 +148,6 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
|
||||
# in the vllm_config, it's not really used.
|
||||
model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
|
||||
vllm_config.model_config = ModelConfig(model=model_name,
|
||||
task="auto",
|
||||
tokenizer=model_name,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype=dtype,
|
||||
seed=42)
|
||||
|
||||
@ -62,8 +62,8 @@ class TestSetting:
|
||||
TestSetting(
|
||||
model="BAAI/bge-multilingual-gemma2",
|
||||
model_args=[
|
||||
"--task", "embed", "--dtype", "bfloat16", "--max-model-len",
|
||||
"2048"
|
||||
"--runner", "pooling", "--dtype", "bfloat16",
|
||||
"--max-model-len", "2048"
|
||||
],
|
||||
pp_size=1,
|
||||
tp_size=1,
|
||||
@ -75,7 +75,7 @@ class TestSetting:
|
||||
# # encoder-based embedding model (BERT)
|
||||
# TestSetting(
|
||||
# model="BAAI/bge-base-en-v1.5",
|
||||
# model_args=["--task", "embed"],
|
||||
# model_args=["--runner", "pooling"],
|
||||
# pp_size=1,
|
||||
# tp_size=1,
|
||||
# attn_backend="XFORMERS",
|
||||
|
||||
@ -125,9 +125,6 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int,
|
||||
# in the vllm_config, it's not really used.
|
||||
model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
|
||||
vllm_config.model_config = ModelConfig(model=model_name,
|
||||
task="auto",
|
||||
tokenizer=model_name,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype=dtype,
|
||||
seed=42)
|
||||
|
||||
@ -250,9 +250,6 @@ def sequence_parallelism_pass_on_test_model(
|
||||
# in the vllm_config, it's not really used.
|
||||
model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
|
||||
vllm_config.model_config = ModelConfig(model=model_name,
|
||||
task="auto",
|
||||
tokenizer=model_name,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype=dtype,
|
||||
seed=42)
|
||||
|
||||
@ -23,7 +23,7 @@ from vllm import LLM, SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.config import TaskOption, _get_and_verify_dtype
|
||||
from vllm.config import ConvertOption, RunnerOption, _get_and_verify_dtype
|
||||
from vllm.connections import global_http_connection
|
||||
from vllm.distributed import (cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
@ -769,7 +769,8 @@ class VllmRunner:
|
||||
def __init__(
|
||||
self,
|
||||
model_name: str,
|
||||
task: TaskOption = "auto",
|
||||
runner: RunnerOption = "auto",
|
||||
convert: ConvertOption = "auto",
|
||||
tokenizer_name: Optional[str] = None,
|
||||
tokenizer_mode: str = "auto",
|
||||
trust_remote_code: bool = True,
|
||||
@ -786,7 +787,8 @@ class VllmRunner:
|
||||
) -> None:
|
||||
self.llm = LLM(
|
||||
model=model_name,
|
||||
task=task,
|
||||
runner=runner,
|
||||
convert=convert,
|
||||
tokenizer=tokenizer_name,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
trust_remote_code=trust_remote_code,
|
||||
|
||||
@ -6,7 +6,7 @@ from typing import Literal, NamedTuple, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import TaskOption
|
||||
from vllm.config import RunnerOption
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
@ -31,14 +31,14 @@ class EPTestOptions(NamedTuple):
|
||||
class EPTestSettings:
|
||||
parallel_setups: list[ParallelSetup]
|
||||
distributed_backends: list[str]
|
||||
task: TaskOption
|
||||
runner: RunnerOption
|
||||
test_options: EPTestOptions
|
||||
|
||||
@staticmethod
|
||||
def detailed(
|
||||
*,
|
||||
tp_base: int = 2,
|
||||
task: TaskOption = "auto",
|
||||
runner: RunnerOption = "auto",
|
||||
trust_remote_code: bool = False,
|
||||
tokenizer_mode: Optional[str] = None,
|
||||
load_format: Optional[str] = None,
|
||||
@ -63,7 +63,7 @@ class EPTestSettings:
|
||||
chunked_prefill=False),
|
||||
],
|
||||
distributed_backends=["mp", "ray"],
|
||||
task=task,
|
||||
runner=runner,
|
||||
test_options=EPTestOptions(trust_remote_code=trust_remote_code,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
load_format=load_format,
|
||||
@ -74,7 +74,7 @@ class EPTestSettings:
|
||||
def fast(
|
||||
*,
|
||||
tp_base: int = 2,
|
||||
task: TaskOption = "auto",
|
||||
runner: RunnerOption = "auto",
|
||||
trust_remote_code: bool = False,
|
||||
tokenizer_mode: Optional[str] = None,
|
||||
load_format: Optional[str] = None,
|
||||
@ -87,7 +87,7 @@ class EPTestSettings:
|
||||
chunked_prefill=False),
|
||||
],
|
||||
distributed_backends=["mp"],
|
||||
task=task,
|
||||
runner=runner,
|
||||
test_options=EPTestOptions(trust_remote_code=trust_remote_code,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
load_format=load_format,
|
||||
@ -100,7 +100,7 @@ class EPTestSettings:
|
||||
for parallel_setup in self.parallel_setups:
|
||||
for distributed_backend in self.distributed_backends:
|
||||
yield (model_name, parallel_setup, distributed_backend,
|
||||
self.task, opts)
|
||||
self.runner, opts)
|
||||
|
||||
|
||||
# NOTE: You can adjust tp_base locally to fit the model in GPU
|
||||
@ -118,7 +118,7 @@ def _compare_tp(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
task: TaskOption,
|
||||
runner: RunnerOption,
|
||||
test_options: EPTestOptions,
|
||||
num_gpus_available: int,
|
||||
*,
|
||||
@ -154,8 +154,8 @@ def _compare_tp(
|
||||
common_args.append("--enable-chunked-prefill")
|
||||
if eager_mode:
|
||||
common_args.append("--enforce-eager")
|
||||
if task != "auto":
|
||||
common_args.extend(["--task", task])
|
||||
if runner != "auto":
|
||||
common_args.extend(["--runner", runner])
|
||||
if trust_remote_code:
|
||||
common_args.append("--trust-remote-code")
|
||||
if tokenizer_mode:
|
||||
@ -203,7 +203,7 @@ def _compare_tp(
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_name", "parallel_setup", "distributed_backend", "task",
|
||||
("model_name", "parallel_setup", "distributed_backend", "runner",
|
||||
"test_options"),
|
||||
[
|
||||
params for model_name, settings in TEST_MODELS.items()
|
||||
@ -215,14 +215,14 @@ def test_ep(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
task: TaskOption,
|
||||
runner: RunnerOption,
|
||||
test_options: EPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_tp(model_name,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
task,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate")
|
||||
|
||||
@ -14,7 +14,7 @@ from typing import Literal, NamedTuple, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, TaskOption
|
||||
from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.config import get_config
|
||||
|
||||
@ -60,7 +60,7 @@ class PPTestSettings:
|
||||
distributed_backends: list[str]
|
||||
# vllm major version: "0" for V0, "1" for V1
|
||||
vllm_major_versions: list[str]
|
||||
task: TaskOption
|
||||
runner: RunnerOption
|
||||
test_options: PPTestOptions
|
||||
|
||||
def __post_init__(self):
|
||||
@ -76,7 +76,7 @@ class PPTestSettings:
|
||||
tp_base: int = 1,
|
||||
pp_base: int = 2,
|
||||
multi_node_only: bool = False,
|
||||
task: TaskOption = "auto",
|
||||
runner: RunnerOption = "auto",
|
||||
load_format: Optional[str] = None,
|
||||
):
|
||||
return PPTestSettings(
|
||||
@ -104,7 +104,7 @@ class PPTestSettings:
|
||||
],
|
||||
distributed_backends=["mp", "mp", "ray", "ray"],
|
||||
vllm_major_versions=["0", "1", "0", "1"],
|
||||
task=task,
|
||||
runner=runner,
|
||||
test_options=PPTestOptions(multi_node_only=multi_node_only,
|
||||
load_format=load_format),
|
||||
)
|
||||
@ -114,7 +114,7 @@ class PPTestSettings:
|
||||
*,
|
||||
tp_base: int = 1,
|
||||
pp_base: int = 2,
|
||||
task: TaskOption = "auto",
|
||||
runner: RunnerOption = "auto",
|
||||
multi_node_only: bool = False,
|
||||
load_format: Optional[str] = None,
|
||||
):
|
||||
@ -127,7 +127,7 @@ class PPTestSettings:
|
||||
],
|
||||
distributed_backends=["mp"],
|
||||
vllm_major_versions=["0"],
|
||||
task=task,
|
||||
runner=runner,
|
||||
test_options=PPTestOptions(multi_node_only=multi_node_only,
|
||||
load_format=load_format),
|
||||
)
|
||||
@ -139,7 +139,7 @@ class PPTestSettings:
|
||||
for backend, vllm_major_version in zip(self.distributed_backends,
|
||||
self.vllm_major_versions):
|
||||
yield (model_id, parallel_setup, backend, vllm_major_version,
|
||||
self.task, opts)
|
||||
self.runner, opts)
|
||||
|
||||
|
||||
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
|
||||
@ -211,10 +211,10 @@ TEXT_GENERATION_MODELS = {
|
||||
|
||||
EMBEDDING_MODELS = { # type: ignore[var-annotated]
|
||||
# [Text-only]
|
||||
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(task="embed"),
|
||||
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(task="embed"),
|
||||
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"),
|
||||
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
|
||||
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
|
||||
load_format="dummy", task="embed"
|
||||
load_format="dummy", runner="pooling"
|
||||
),
|
||||
}
|
||||
|
||||
@ -269,7 +269,7 @@ def _compare_tp(
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
vllm_major_version: str,
|
||||
task: TaskOption,
|
||||
runner: RunnerOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available: int,
|
||||
*,
|
||||
@ -335,8 +335,8 @@ def _compare_tp(
|
||||
common_args.append("--enable-chunked-prefill")
|
||||
if eager_mode:
|
||||
common_args.append("--enforce-eager")
|
||||
if task != "auto":
|
||||
common_args.extend(["--task", task])
|
||||
if runner != "auto":
|
||||
common_args.extend(["--runner", runner])
|
||||
if trust_remote_code:
|
||||
common_args.append("--trust-remote-code")
|
||||
if tokenizer_mode:
|
||||
@ -415,7 +415,7 @@ def _compare_tp(
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
|
||||
"task", "test_options"),
|
||||
"runner", "test_options"),
|
||||
[
|
||||
params for model_id, settings in TEXT_GENERATION_MODELS.items()
|
||||
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
||||
@ -427,7 +427,7 @@ def test_tp_language_generation(
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
vllm_major_version: str,
|
||||
task: TaskOption,
|
||||
runner: RunnerOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
@ -435,7 +435,7 @@ def test_tp_language_generation(
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
vllm_major_version,
|
||||
task,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate",
|
||||
@ -444,7 +444,7 @@ def test_tp_language_generation(
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
|
||||
"task", "test_options"),
|
||||
"runner", "test_options"),
|
||||
[
|
||||
params for model_id, settings in EMBEDDING_MODELS.items()
|
||||
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
||||
@ -456,7 +456,7 @@ def test_tp_language_embedding(
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
vllm_major_version: str,
|
||||
task: TaskOption,
|
||||
runner: RunnerOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
@ -464,7 +464,7 @@ def test_tp_language_embedding(
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
vllm_major_version,
|
||||
task,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="encode",
|
||||
@ -473,7 +473,7 @@ def test_tp_language_embedding(
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
|
||||
"task", "test_options"),
|
||||
"runner", "test_options"),
|
||||
[
|
||||
params for model_id, settings in MULTIMODAL_MODELS.items()
|
||||
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
||||
@ -485,7 +485,7 @@ def test_tp_multimodal_generation(
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
vllm_major_version: str,
|
||||
task: TaskOption,
|
||||
runner: RunnerOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
@ -493,7 +493,7 @@ def test_tp_multimodal_generation(
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
vllm_major_version,
|
||||
task,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate",
|
||||
|
||||
@ -14,7 +14,7 @@ from typing import Literal, NamedTuple, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import TaskOption
|
||||
from vllm.config import RunnerOption
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from ..models.registry import HF_EXAMPLE_MODELS
|
||||
@ -48,7 +48,7 @@ class SPTestSettings:
|
||||
distributed_backends: list[str]
|
||||
# vllm major version: "0" for V0, "1" for V1
|
||||
vllm_major_versions: list[str]
|
||||
task: TaskOption
|
||||
runner: RunnerOption
|
||||
test_options: SPTestOptions
|
||||
|
||||
def __post_init__(self):
|
||||
@ -64,7 +64,7 @@ class SPTestSettings:
|
||||
tp_base: int = 2,
|
||||
pp_base: int = 1,
|
||||
multi_node_only: bool = False,
|
||||
task: TaskOption = "auto",
|
||||
runner: RunnerOption = "auto",
|
||||
load_format: Optional[str] = None,
|
||||
):
|
||||
parallel_setups = []
|
||||
@ -81,7 +81,7 @@ class SPTestSettings:
|
||||
parallel_setups=parallel_setups,
|
||||
distributed_backends=["mp", "ray"],
|
||||
vllm_major_versions=["1", "1"],
|
||||
task=task,
|
||||
runner=runner,
|
||||
test_options=SPTestOptions(multi_node_only=multi_node_only,
|
||||
load_format=load_format),
|
||||
)
|
||||
@ -91,7 +91,7 @@ class SPTestSettings:
|
||||
*,
|
||||
tp_base: int = 2,
|
||||
pp_base: int = 1,
|
||||
task: TaskOption = "auto",
|
||||
runner: RunnerOption = "auto",
|
||||
multi_node_only: bool = False,
|
||||
load_format: Optional[str] = None,
|
||||
):
|
||||
@ -109,7 +109,7 @@ class SPTestSettings:
|
||||
parallel_setups=parallel_setups,
|
||||
distributed_backends=["mp", "ray"],
|
||||
vllm_major_versions=["1", "1"],
|
||||
task=task,
|
||||
runner=runner,
|
||||
test_options=SPTestOptions(multi_node_only=multi_node_only,
|
||||
load_format=load_format),
|
||||
)
|
||||
@ -119,7 +119,7 @@ class SPTestSettings:
|
||||
*,
|
||||
tp_base: int = 2,
|
||||
pp_base: int = 1,
|
||||
task: TaskOption = "auto",
|
||||
runner: RunnerOption = "auto",
|
||||
multi_node_only: bool = False,
|
||||
load_format: Optional[str] = None,
|
||||
):
|
||||
@ -135,7 +135,7 @@ class SPTestSettings:
|
||||
parallel_setups=parallel_setups,
|
||||
distributed_backends=["mp", "ray"],
|
||||
vllm_major_versions=["1", "1"],
|
||||
task=task,
|
||||
runner=runner,
|
||||
test_options=SPTestOptions(multi_node_only=multi_node_only,
|
||||
load_format=load_format),
|
||||
)
|
||||
@ -147,7 +147,7 @@ class SPTestSettings:
|
||||
for backend, vllm_major_version in zip(self.distributed_backends,
|
||||
self.vllm_major_versions):
|
||||
yield (model_id, parallel_setup, backend, vllm_major_version,
|
||||
self.task, opts)
|
||||
self.runner, opts)
|
||||
|
||||
|
||||
def _compare_sp(
|
||||
@ -155,7 +155,7 @@ def _compare_sp(
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
vllm_major_version: str,
|
||||
task: TaskOption,
|
||||
runner: RunnerOption,
|
||||
test_options: SPTestOptions,
|
||||
num_gpus_available: int,
|
||||
*,
|
||||
@ -217,8 +217,8 @@ def _compare_sp(
|
||||
common_args.append("--enable-chunked-prefill")
|
||||
if eager_mode:
|
||||
common_args.append("--enforce-eager")
|
||||
if task != "auto":
|
||||
common_args.extend(["--task", task])
|
||||
if runner != "auto":
|
||||
common_args.extend(["--runner", runner])
|
||||
if trust_remote_code:
|
||||
common_args.append("--trust-remote-code")
|
||||
if tokenizer_mode:
|
||||
@ -298,7 +298,7 @@ SP_TEST_MODELS = [
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
|
||||
"task", "test_options"),
|
||||
"runner", "test_options"),
|
||||
[
|
||||
params for model_id, settings in SP_TEXT_GENERATION_MODELS.items()
|
||||
for params in settings.iter_params(model_id)
|
||||
@ -311,7 +311,7 @@ def test_tp_sp_generation(
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
vllm_major_version: str,
|
||||
task: TaskOption,
|
||||
runner: RunnerOption,
|
||||
test_options: SPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
@ -319,7 +319,7 @@ def test_tp_sp_generation(
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
vllm_major_version,
|
||||
task,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate",
|
||||
|
||||
@ -19,7 +19,8 @@ MAIN_SCORE = 0.7422994752439667
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--task", "embed", "--enforce-eager", "--disable-uvicorn-access-log"
|
||||
"--runner", "pooling", "--enforce-eager",
|
||||
"--disable-uvicorn-access-log"
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
|
||||
@ -21,7 +21,8 @@ MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--task", "score", "--enforce-eager", "--disable-uvicorn-access-log"
|
||||
"--runner", "pooling", "--enforce-eager",
|
||||
"--disable-uvicorn-access-log"
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
|
||||
@ -15,10 +15,6 @@ MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
|
||||
def get_vocab_size(model_name):
|
||||
config = ModelConfig(
|
||||
model=model_name,
|
||||
task="auto",
|
||||
tokenizer=model_name,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="bfloat16",
|
||||
)
|
||||
|
||||
@ -102,6 +102,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
|
||||
tokenizer=model_info.tokenizer or model,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
revision=model_info.revision,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
)
|
||||
|
||||
|
||||
@ -33,8 +33,8 @@ def v1(run_with_both_engines):
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--task",
|
||||
"embed",
|
||||
"--runner",
|
||||
"pooling",
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
DTYPE,
|
||||
|
||||
@ -42,8 +42,8 @@ def dtype(request):
|
||||
@pytest.fixture(scope="module")
|
||||
def server(model_info, dtype: str):
|
||||
args = [
|
||||
"--task",
|
||||
"embed",
|
||||
"--runner",
|
||||
"pooling",
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
dtype,
|
||||
|
||||
@ -21,7 +21,7 @@ LONG_TIMEOUT_SECONDS: Final[int] = 60
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--task",
|
||||
"--runner",
|
||||
"generate",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
|
||||
@ -27,8 +27,8 @@ def server(request: pytest.FixtureRequest):
|
||||
passed_params = [passed_params]
|
||||
|
||||
args = [
|
||||
"--task",
|
||||
"embed",
|
||||
"--runner",
|
||||
"pooling",
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"float16",
|
||||
|
||||
@ -20,8 +20,8 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' +
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--task",
|
||||
"reward",
|
||||
"--runner",
|
||||
"pooling",
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
|
||||
@ -26,8 +26,8 @@ def v1(run_with_both_engines):
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--task",
|
||||
"embed",
|
||||
"--runner",
|
||||
"pooling",
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
DTYPE,
|
||||
|
||||
@ -29,8 +29,8 @@ input = """Immerse yourself in the enchanting chronicle of calculus, a
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--task",
|
||||
"embed",
|
||||
"--runner",
|
||||
"pooling",
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--enforce-eager",
|
||||
|
||||
@ -25,7 +25,7 @@ TEST_VIDEO_URLS = [
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--task",
|
||||
"--runner",
|
||||
"generate",
|
||||
"--max-model-len",
|
||||
"32768",
|
||||
|
||||
@ -48,7 +48,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--task",
|
||||
"--runner",
|
||||
"generate",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
|
||||
@ -31,8 +31,8 @@ TEST_IMAGE_URLS = [
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--task",
|
||||
"embed",
|
||||
"--runner",
|
||||
"pooling",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--max-num-seqs",
|
||||
|
||||
@ -47,12 +47,8 @@ MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||
@pytest.fixture(scope="function")
|
||||
def phi3v_model_config():
|
||||
return ModelConfig(PHI3V_MODEL_ID,
|
||||
task="generate",
|
||||
tokenizer=PHI3V_MODEL_ID,
|
||||
tokenizer_mode="auto",
|
||||
runner="generate",
|
||||
trust_remote_code=True,
|
||||
dtype="auto",
|
||||
seed=0,
|
||||
limit_mm_per_prompt={
|
||||
"image": 2,
|
||||
})
|
||||
@ -61,12 +57,8 @@ def phi3v_model_config():
|
||||
@pytest.fixture(scope="function")
|
||||
def phi3v_model_config_mm_interleaved():
|
||||
return ModelConfig(PHI3V_MODEL_ID,
|
||||
task="generate",
|
||||
tokenizer=PHI3V_MODEL_ID,
|
||||
tokenizer_mode="auto",
|
||||
runner="generate",
|
||||
trust_remote_code=True,
|
||||
dtype="auto",
|
||||
seed=0,
|
||||
interleave_mm_strings=True,
|
||||
limit_mm_per_prompt={
|
||||
"image": 2,
|
||||
@ -86,11 +78,7 @@ def phi3v_tokenizer():
|
||||
@pytest.fixture(scope="function")
|
||||
def qwen25omni_model_config_mm_interleaved():
|
||||
return ModelConfig(QWEN25OMNI_MODEL_ID,
|
||||
task="generate",
|
||||
tokenizer=QWEN25OMNI_MODEL_ID,
|
||||
tokenizer_mode="auto",
|
||||
dtype="auto",
|
||||
seed=0,
|
||||
runner="generate",
|
||||
interleave_mm_strings=True,
|
||||
limit_mm_per_prompt={
|
||||
"image": 2,
|
||||
@ -112,12 +100,7 @@ def qwen25omni_tokenizer():
|
||||
@pytest.fixture(scope="module")
|
||||
def mllama_model_config():
|
||||
return ModelConfig(MLLAMA_MODEL_ID,
|
||||
task="generate",
|
||||
tokenizer=MLLAMA_MODEL_ID,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="auto",
|
||||
seed=0,
|
||||
runner="generate",
|
||||
limit_mm_per_prompt={
|
||||
"image": 2,
|
||||
})
|
||||
@ -136,12 +119,7 @@ def mllama_tokenizer():
|
||||
@pytest.fixture(scope="function")
|
||||
def mistral_model_config():
|
||||
return ModelConfig(MISTRAL_MODEL_ID,
|
||||
task="generate",
|
||||
tokenizer=MISTRAL_MODEL_ID,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="auto",
|
||||
seed=0,
|
||||
runner="generate",
|
||||
limit_mm_per_prompt={
|
||||
"image": 2,
|
||||
})
|
||||
@ -1105,12 +1083,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
|
||||
|
||||
# Build a config for the model
|
||||
model_config = ModelConfig(model,
|
||||
task="generate",
|
||||
tokenizer=model,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="auto",
|
||||
seed=0,
|
||||
runner="generate",
|
||||
limit_mm_per_prompt={
|
||||
"image": 2,
|
||||
})
|
||||
@ -1170,6 +1143,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
|
||||
model,
|
||||
tokenizer=model_info.tokenizer or model,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
revision=model_info.revision,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
)
|
||||
@ -1225,6 +1199,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
||||
model,
|
||||
tokenizer=model_info.tokenizer or model,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
revision=model_info.revision,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
)
|
||||
@ -1284,6 +1259,7 @@ def test_resolve_content_format_fallbacks(model, expected_format):
|
||||
model,
|
||||
tokenizer=model_info.tokenizer or model,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
revision=model_info.revision,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
)
|
||||
|
||||
@ -38,13 +38,8 @@ def test_worker_apply_lora(sql_lora_files):
|
||||
vllm_config = VllmConfig(
|
||||
model_config=ModelConfig(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
task="auto",
|
||||
tokenizer="meta-llama/Llama-2-7b-hf",
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
revision=None,
|
||||
enforce_eager=True,
|
||||
),
|
||||
load_config=LoadConfig(
|
||||
|
||||
@ -69,10 +69,7 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
|
||||
|
||||
config = ModelConfig(
|
||||
MODEL_NAME,
|
||||
task="generate",
|
||||
tokenizer=MODEL_NAME,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
runner="generate",
|
||||
seed=0,
|
||||
dtype="bfloat16",
|
||||
)
|
||||
@ -113,10 +110,7 @@ async def test_guided_logits_processor_with_reasoning(
|
||||
|
||||
config = ModelConfig(
|
||||
REASONING_MODEL_NAME,
|
||||
task="generate",
|
||||
tokenizer=REASONING_MODEL_NAME,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
runner="generate",
|
||||
seed=0,
|
||||
dtype="bfloat16",
|
||||
)
|
||||
|
||||
@ -57,7 +57,6 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
|
||||
|
||||
vllm_model.apply_model(check_model)
|
||||
|
||||
# assert output
|
||||
assert output
|
||||
|
||||
|
||||
@ -99,7 +98,6 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
|
||||
|
||||
vllm_model.apply_model(check_model)
|
||||
|
||||
# assert output
|
||||
assert output
|
||||
|
||||
|
||||
|
||||
@ -52,7 +52,7 @@ def correctness_test_embed_models(hf_runner,
|
||||
vllm_extra_kwargs["dtype"] = model_info.dtype
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
@ -172,7 +172,7 @@ def mteb_test_embed_models(hf_runner,
|
||||
vllm_extra_kwargs["dtype"] = model_info.dtype
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
|
||||
@ -279,15 +279,12 @@ def mteb_test_rerank_models(hf_runner,
|
||||
vllm_extra_kwargs["dtype"] = model_info.dtype
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
task="score",
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
max_num_seqs=8,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
|
||||
if model_info.architecture:
|
||||
assert (model_info.architecture in model_config.architectures)
|
||||
assert model_config.hf_config.num_labels == 1
|
||||
|
||||
vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
|
||||
|
||||
@ -85,7 +85,7 @@ def test_models(
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
|
||||
with vllm_runner(model,
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
max_model_len=max_model_len,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
@ -28,10 +28,7 @@ def test_find_array():
|
||||
|
||||
model_config = ModelConfig(
|
||||
MODEL_NAME,
|
||||
task="embed",
|
||||
tokenizer=MODEL_NAME,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
runner="pooling",
|
||||
dtype="bfloat16",
|
||||
seed=0,
|
||||
)
|
||||
@ -117,7 +114,7 @@ def test_gritlm_offline_embedding(vllm_runner):
|
||||
|
||||
with vllm_runner(
|
||||
MODEL_NAME,
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
) as vllm_model:
|
||||
llm = vllm_model.llm
|
||||
@ -140,7 +137,7 @@ def test_gritlm_offline_embedding(vllm_runner):
|
||||
async def test_gritlm_api_server_embedding():
|
||||
queries, q_instruction, documents, d_instruction = get_test_data()
|
||||
|
||||
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
|
||||
args = ["--runner", "pooling", "--max_model_len", str(MAX_MODEL_LEN)]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as server:
|
||||
client_embedding = server.get_async_client()
|
||||
@ -164,7 +161,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
|
||||
|
||||
with vllm_runner(
|
||||
MODEL_NAME,
|
||||
task="generate",
|
||||
runner="generate",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
) as vllm_model:
|
||||
llm = vllm_model.llm
|
||||
@ -179,7 +176,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
|
||||
async def test_gritlm_api_server_generate():
|
||||
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
|
||||
|
||||
args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
|
||||
args = ["--runner", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as server:
|
||||
client_generate = server.get_async_client()
|
||||
|
||||
@ -4,6 +4,7 @@ from functools import partial
|
||||
|
||||
import pytest
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm import PoolingParams
|
||||
|
||||
from ...utils import EmbedModelInfo, RerankModelInfo
|
||||
@ -62,6 +63,10 @@ def test_embed_models_correctness(hf_runner, vllm_runner,
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: RerankModelInfo) -> None:
|
||||
if (model_info.architecture == "XLMRobertaForSequenceClassification"
|
||||
and envs.VLLM_USE_V1):
|
||||
pytest.skip("Not supported yet")
|
||||
|
||||
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@ -92,7 +97,7 @@ def test_matryoshka(
|
||||
hf_outputs = matryoshka_fy(hf_outputs, dimensions)
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
max_model_len=None) as vllm_model:
|
||||
assert vllm_model.llm.llm_engine.model_config.is_matryoshka
|
||||
|
||||
@ -21,7 +21,7 @@ max_model_len = int(original_max_position_embeddings * factor)
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_default(model_info, vllm_runner):
|
||||
with vllm_runner(model_info.name, task="embed",
|
||||
with vllm_runner(model_info.name, runner="pooling",
|
||||
max_model_len=None) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
|
||||
@ -36,7 +36,7 @@ def test_default(model_info, vllm_runner):
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_set_max_model_len_legal(model_info, vllm_runner):
|
||||
# set max_model_len <= 512
|
||||
with vllm_runner(model_info.name, task="embed",
|
||||
with vllm_runner(model_info.name, runner="pooling",
|
||||
max_model_len=256) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.max_model_len == 256
|
||||
@ -46,11 +46,12 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
|
||||
# For nomic-embed-text-v2-moe the length is set to 512
|
||||
# by sentence_bert_config.json.
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name, task="embed",
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=1024):
|
||||
pass
|
||||
else:
|
||||
with vllm_runner(model_info.name, task="embed",
|
||||
with vllm_runner(model_info.name, runner="pooling",
|
||||
max_model_len=1024) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.max_model_len == 1024
|
||||
@ -60,14 +61,15 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
|
||||
def test_set_max_model_len_illegal(model_info, vllm_runner):
|
||||
# set max_model_len > 2048
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name, task="embed", max_model_len=4096):
|
||||
with vllm_runner(model_info.name, runner="pooling",
|
||||
max_model_len=4096):
|
||||
pass
|
||||
|
||||
# set max_model_len > 2048 by hf_overrides
|
||||
hf_overrides = {"max_model_len": 4096}
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name,
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides):
|
||||
pass
|
||||
@ -87,7 +89,7 @@ def test_use_rope_scaling_legal(model_info, vllm_runner):
|
||||
}
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides):
|
||||
pass
|
||||
@ -107,7 +109,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
|
||||
# illegal max_model_len
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name,
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
max_model_len=max_model_len + 1,
|
||||
hf_overrides=hf_overrides):
|
||||
pass
|
||||
@ -125,7 +127,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
|
||||
# illegal max_model_len by hf_overrides
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name,
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides):
|
||||
pass
|
||||
|
||||
@ -37,7 +37,9 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name):
|
||||
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
|
||||
hf_outputs = hf_model.predict([text_pair]).tolist()
|
||||
|
||||
with vllm_runner(model_name, task="score", dtype=DTYPE,
|
||||
with vllm_runner(model_name,
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
|
||||
|
||||
@ -56,7 +58,9 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
|
||||
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
|
||||
hf_outputs = hf_model.predict(text_pairs).tolist()
|
||||
|
||||
with vllm_runner(model_name, task="score", dtype=DTYPE,
|
||||
with vllm_runner(model_name,
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
|
||||
|
||||
@ -76,7 +80,9 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
|
||||
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
|
||||
hf_outputs = hf_model.predict(text_pairs).tolist()
|
||||
|
||||
with vllm_runner(model_name, task="score", dtype=DTYPE,
|
||||
with vllm_runner(model_name,
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
|
||||
|
||||
@ -103,7 +109,7 @@ def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name):
|
||||
]
|
||||
|
||||
with vllm_runner(emb_model_name,
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
|
||||
@ -131,7 +137,7 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
|
||||
]
|
||||
|
||||
with vllm_runner(emb_model_name,
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
|
||||
@ -160,7 +166,7 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
|
||||
]
|
||||
|
||||
with vllm_runner(emb_model_name,
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
|
||||
|
||||
@ -26,7 +26,7 @@ def test_smaller_truncation_size(vllm_runner,
|
||||
|
||||
truncate_prompt_tokens = 10
|
||||
|
||||
with vllm_runner(model_name, task="embed",
|
||||
with vllm_runner(model_name, runner="pooling",
|
||||
max_model_len=max_model_len) as vllm_model:
|
||||
vllm_output = vllm_model.llm.encode(
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
|
||||
@ -41,7 +41,7 @@ def test_max_truncation_size(vllm_runner,
|
||||
input_str=input_str):
|
||||
truncate_prompt_tokens = -1
|
||||
|
||||
with vllm_runner(model_name, task="embed",
|
||||
with vllm_runner(model_name, runner="pooling",
|
||||
max_model_len=max_model_len) as vllm_model:
|
||||
vllm_output = vllm_model.llm.encode(
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
|
||||
@ -58,7 +58,7 @@ def test_bigger_truncation_size(vllm_runner,
|
||||
truncate_prompt_tokens = max_model_len + 1
|
||||
|
||||
with pytest.raises(ValueError), vllm_runner(
|
||||
model_name, task="embed",
|
||||
model_name, runner="pooling",
|
||||
max_model_len=max_model_len) as vllm_model:
|
||||
|
||||
llm_output = vllm_model.llm.encode(
|
||||
|
||||
@ -222,7 +222,6 @@ VLM_TEST_SETTINGS = {
|
||||
},
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
# Check "auto" with fallback to transformers
|
||||
"internvl-transformers": VLMTestInfo(
|
||||
models=["OpenGVLab/InternVL3-1B-hf"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
@ -232,7 +231,7 @@ VLM_TEST_SETTINGS = {
|
||||
use_tokenizer_eos=True,
|
||||
image_size_factors=[(0.25, 0.5, 1.0)],
|
||||
vllm_runner_kwargs={
|
||||
"model_impl": "auto",
|
||||
"model_impl": "transformers",
|
||||
},
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
marks=[pytest.mark.core_model],
|
||||
@ -638,7 +637,7 @@ VLM_TEST_SETTINGS = {
|
||||
img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
task="generate",
|
||||
runner="generate",
|
||||
# use sdpa mode for hf runner since phi3v didn't work with flash_attn
|
||||
hf_model_kwargs={"_attn_implementation": "sdpa"},
|
||||
use_tokenizer_eos=True,
|
||||
|
||||
@ -65,7 +65,7 @@ def run_test(
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
model,
|
||||
task="generate",
|
||||
runner="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=1,
|
||||
dtype=dtype,
|
||||
|
||||
@ -48,7 +48,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
|
||||
]
|
||||
|
||||
with vllm_runner(model,
|
||||
task="generate",
|
||||
runner="generate",
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": 2},
|
||||
max_model_len=32768,
|
||||
|
||||
@ -99,7 +99,7 @@ def run_test(
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
model,
|
||||
task="generate",
|
||||
runner="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
|
||||
@ -267,7 +267,7 @@ def run_embedding_input_test(
|
||||
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(model,
|
||||
task="generate",
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=3,
|
||||
dtype=dtype,
|
||||
|
||||
@ -6,7 +6,7 @@ from typing import Any, Callable, Optional
|
||||
import torch
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
|
||||
from vllm.config import TaskOption
|
||||
from vllm.config import RunnerOption
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
from .....conftest import HfRunner, VllmRunner
|
||||
@ -37,7 +37,7 @@ def run_test(
|
||||
vllm_runner_kwargs: Optional[dict[str, Any]],
|
||||
hf_model_kwargs: Optional[dict[str, Any]],
|
||||
patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]],
|
||||
task: TaskOption = "auto",
|
||||
runner: RunnerOption = "auto",
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
tensor_parallel_size: int = 1,
|
||||
vllm_embeddings: Optional[torch.Tensor] = None,
|
||||
@ -83,7 +83,7 @@ def run_test(
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=enforce_eager,
|
||||
task=task,
|
||||
runner=runner,
|
||||
**vllm_runner_kwargs_) as vllm_model:
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
|
||||
|
||||
@ -11,7 +11,7 @@ from pytest import MarkDecorator
|
||||
from transformers import AutoModelForCausalLM
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
|
||||
from vllm.config import TaskOption
|
||||
from vllm.config import RunnerOption
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
@ -109,7 +109,7 @@ class VLMTestInfo(NamedTuple):
|
||||
enforce_eager: bool = True
|
||||
max_model_len: int = 1024
|
||||
max_num_seqs: int = 256
|
||||
task: TaskOption = "auto"
|
||||
runner: RunnerOption = "auto"
|
||||
tensor_parallel_size: int = 1
|
||||
vllm_runner_kwargs: Optional[dict[str, Any]] = None
|
||||
|
||||
@ -173,7 +173,7 @@ class VLMTestInfo(NamedTuple):
|
||||
"enforce_eager": self.enforce_eager,
|
||||
"max_model_len": self.max_model_len,
|
||||
"max_num_seqs": self.max_num_seqs,
|
||||
"task": self.task,
|
||||
"runner": self.runner,
|
||||
"tensor_parallel_size": self.tensor_parallel_size,
|
||||
"vllm_runner_kwargs": self.vllm_runner_kwargs,
|
||||
"hf_output_post_proc": self.hf_output_post_proc,
|
||||
|
||||
@ -92,7 +92,7 @@ def _run_test(
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(model,
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
enforce_eager=True,
|
||||
max_model_len=8192) as vllm_model:
|
||||
|
||||
@ -49,7 +49,7 @@ def vllm_reranker(
|
||||
|
||||
with vllm_runner(
|
||||
model_name,
|
||||
task="score",
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
max_num_seqs=2,
|
||||
max_model_len=2048,
|
||||
|
||||
@ -64,7 +64,7 @@ def _run_test(
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(model,
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
max_model_len=4096,
|
||||
enforce_eager=True) as vllm_model:
|
||||
|
||||
@ -44,7 +44,7 @@ def _run_test(
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(model, task="embed", dtype=dtype,
|
||||
with vllm_runner(model, runner="pooling", dtype=dtype,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(input_texts, images=input_images)
|
||||
|
||||
|
||||
@ -34,7 +34,7 @@ def _run_test(
|
||||
set_default_torch_num_threads(1),
|
||||
vllm_runner(
|
||||
model,
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
dtype=torch.float16,
|
||||
enforce_eager=True,
|
||||
skip_tokenizer_init=True,
|
||||
|
||||
@ -58,13 +58,10 @@ def _test_processing_correctness(
|
||||
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
task="auto",
|
||||
tokenizer=model_info.tokenizer or model_id,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
seed=0,
|
||||
dtype="auto",
|
||||
revision=model_info.revision,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
)
|
||||
|
||||
|
||||
@ -54,13 +54,10 @@ def test_hf_model_weights_mapper(model_arch: str):
|
||||
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
task="auto",
|
||||
tokenizer=model_info.tokenizer or model_id,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
revision=model_info.revision,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
seed=0,
|
||||
dtype="auto",
|
||||
revision=None,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
)
|
||||
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
||||
|
||||
@ -172,7 +172,7 @@ def test_4bit_bnb_embedding_model(
|
||||
|
||||
# Inflight 4bit quantization
|
||||
with vllm_runner(model_name,
|
||||
task="embed",
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
gpu_memory_utilization=0.5,
|
||||
quantization="bitsandbytes") as vllm_model:
|
||||
|
||||
@ -7,13 +7,15 @@ import pytest
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.config import ModelImpl
|
||||
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
|
||||
from vllm.utils import GiB_bytes
|
||||
from vllm.v1.core.kv_cache_utils import get_kv_cache_config
|
||||
from vllm.v1.engine.core import EngineCore as V1EngineCore
|
||||
|
||||
from ..utils import create_new_process_for_each_test
|
||||
from .registry import AUTO_EXAMPLE_MODELS, HF_EXAMPLE_MODELS, HfExampleModels
|
||||
from .registry import (_TRANSFORMERS_BACKEND_MODELS, AUTO_EXAMPLE_MODELS,
|
||||
HF_EXAMPLE_MODELS, HfExampleModels)
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
@ -126,6 +128,8 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
|
||||
# these tests seem to produce leftover memory
|
||||
gpu_memory_utilization=0.80,
|
||||
load_format="dummy",
|
||||
model_impl=ModelImpl.TRANSFORMERS
|
||||
if model_arch in _TRANSFORMERS_BACKEND_MODELS else ModelImpl.VLLM,
|
||||
hf_overrides=hf_overrides,
|
||||
)
|
||||
|
||||
|
||||
@ -24,11 +24,9 @@ from .registry import HF_EXAMPLE_MODELS
|
||||
|
||||
@pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
|
||||
def test_registry_imports(model_arch):
|
||||
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
# Ensure all model classes can be imported successfully
|
||||
model_cls, _ = ModelRegistry.resolve_model_cls(model_arch)
|
||||
model_cls = ModelRegistry._try_load_model_cls(model_arch)
|
||||
assert model_cls is not None
|
||||
|
||||
if model_arch in _SPECULATIVE_DECODING_MODELS:
|
||||
return # Ignore these models which do not have a unified format
|
||||
@ -56,14 +54,16 @@ def test_registry_imports(model_arch):
|
||||
("XLMRobertaForSequenceClassification", False, False, True),
|
||||
])
|
||||
def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
|
||||
assert ModelRegistry.is_multimodal_model(model_arch) is is_mm
|
||||
model_info = ModelRegistry._try_inspect_model_cls(model_arch)
|
||||
assert model_info is not None
|
||||
|
||||
assert ModelRegistry.is_cross_encoder_model(model_arch) is is_ce
|
||||
assert model_info.supports_multimodal is is_mm
|
||||
assert model_info.supports_cross_encoding is is_ce
|
||||
|
||||
if init_cuda and current_platform.is_cuda_alike():
|
||||
assert not torch.cuda.is_initialized()
|
||||
|
||||
ModelRegistry.resolve_model_cls(model_arch)
|
||||
ModelRegistry._try_load_model_cls(model_arch)
|
||||
if not torch.cuda.is_initialized():
|
||||
warnings.warn(
|
||||
"This model no longer initializes CUDA on import. "
|
||||
@ -82,12 +82,15 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
|
||||
("Qwen2VLForConditionalGeneration", True, True),
|
||||
])
|
||||
def test_registry_is_pp(model_arch, is_pp, init_cuda):
|
||||
assert ModelRegistry.is_pp_supported_model(model_arch) is is_pp
|
||||
model_info = ModelRegistry._try_inspect_model_cls(model_arch)
|
||||
assert model_info is not None
|
||||
|
||||
assert model_info.supports_pp is is_pp
|
||||
|
||||
if init_cuda and current_platform.is_cuda_alike():
|
||||
assert not torch.cuda.is_initialized()
|
||||
|
||||
ModelRegistry.resolve_model_cls(model_arch)
|
||||
ModelRegistry._try_load_model_cls(model_arch)
|
||||
if not torch.cuda.is_initialized():
|
||||
warnings.warn(
|
||||
"This model no longer initializes CUDA on import. "
|
||||
|
||||
@ -33,6 +33,10 @@ def check_implementation(
|
||||
args = (example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with runner_test(model, **kwargs_test, **kwargs) as model_test:
|
||||
model_config = model_test.llm.llm_engine.model_config
|
||||
assert model_config.architecture == (
|
||||
model_config._get_transformers_backend_cls())
|
||||
|
||||
outputs_test = model_test.generate_greedy_logprobs(*args)
|
||||
|
||||
with runner_ref(model, **kwargs_ref) as model_ref:
|
||||
@ -130,8 +134,13 @@ def test_quantization(
|
||||
model_impl="transformers",
|
||||
enforce_eager=True,
|
||||
**quantization_kwargs) as vllm_model: # type: ignore[arg-type]
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.architecture == (
|
||||
model_config._get_transformers_backend_cls())
|
||||
|
||||
transformers_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=transformers_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
@ -151,7 +160,6 @@ def test_classify(
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
import torch
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
@ -160,6 +168,10 @@ def test_classify(
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
model_impl="transformers") as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.architecture == (
|
||||
model_config._get_transformers_backend_cls())
|
||||
|
||||
vllm_outputs = vllm_model.classify(example_prompts)
|
||||
|
||||
with hf_runner(model,
|
||||
|
||||
@ -8,7 +8,7 @@ from typing import Any, NamedTuple, Optional, Union
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm.config import ModelConfig, TaskOption
|
||||
from vllm.config import ModelConfig, RunnerOption
|
||||
from vllm.inputs import InputContext
|
||||
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
|
||||
|
||||
@ -255,7 +255,7 @@ def check_logprobs_close(
|
||||
|
||||
def build_model_context(
|
||||
model_id: str,
|
||||
task: TaskOption = "auto",
|
||||
runner: RunnerOption = "auto",
|
||||
dtype: Union[str, torch.dtype] = "auto",
|
||||
model_config_kwargs: Optional[dict[str, Any]] = None,
|
||||
mm_processor_kwargs: Optional[dict[str, Any]] = None,
|
||||
@ -280,9 +280,10 @@ def build_model_context(
|
||||
model_config_kwargs = model_config_kwargs or {}
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
task=task,
|
||||
runner=runner,
|
||||
tokenizer=model_info.tokenizer or model_id,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
revision=model_info.revision,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
dtype=dtype,
|
||||
seed=0,
|
||||
|
||||
@ -954,13 +954,6 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
|
||||
|
||||
model_config = ModelConfig(
|
||||
model=model_id,
|
||||
task="auto",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="auto",
|
||||
revision=None,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
)
|
||||
|
||||
@ -993,13 +986,6 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
|
||||
|
||||
model_config = ModelConfig(
|
||||
model=model_id,
|
||||
task="auto",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="auto",
|
||||
revision=None,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
)
|
||||
|
||||
@ -1061,16 +1047,7 @@ class _ProcessorProxy:
|
||||
)
|
||||
# yapf: enable
|
||||
def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
|
||||
model_config = ModelConfig(
|
||||
model=model_id,
|
||||
task="auto",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="auto",
|
||||
revision=None,
|
||||
)
|
||||
model_config = ModelConfig(model_id)
|
||||
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
|
||||
orig_get_hf_processor = processor.info.get_hf_processor
|
||||
|
||||
@ -57,15 +57,7 @@ def test_auto_gptq(model_arg_exptype: tuple[str, None, str]) -> None:
|
||||
model_path, quantization_arg, expected_type = model_arg_exptype
|
||||
|
||||
try:
|
||||
model_config = ModelConfig(model_path,
|
||||
task="auto",
|
||||
tokenizer=model_path,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
revision=None,
|
||||
quantization=quantization_arg)
|
||||
model_config = ModelConfig(model_path, quantization=quantization_arg)
|
||||
found_quantization_type = model_config.quantization
|
||||
except ValueError:
|
||||
found_quantization_type = "ERROR"
|
||||
|
||||
@ -74,115 +74,116 @@ def test_update_config():
|
||||
new_config3 = update_config(config3, {"a": "new_value"})
|
||||
|
||||
|
||||
# Can remove once --task option is fully deprecated
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "expected_runner_type", "expected_task"),
|
||||
("model_id", "expected_runner_type", "expected_convert_type",
|
||||
"expected_task"),
|
||||
[
|
||||
("distilbert/distilgpt2", "generate", "generate"),
|
||||
("intfloat/multilingual-e5-small", "pooling", "embed"),
|
||||
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
|
||||
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify"),
|
||||
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"),
|
||||
("openai/whisper-small", "generate", "transcription"),
|
||||
("distilbert/distilgpt2", "generate", "none", "generate"),
|
||||
("intfloat/multilingual-e5-small", "pooling", "none", "embed"),
|
||||
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
|
||||
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none",
|
||||
"classify"),
|
||||
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none", "reward"),
|
||||
("openai/whisper-small", "generate", "none", "transcription"),
|
||||
],
|
||||
)
|
||||
def test_auto_task(model_id, expected_runner_type, expected_task):
|
||||
config = ModelConfig(
|
||||
model_id,
|
||||
task="auto",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
)
|
||||
def test_auto_task(model_id, expected_runner_type, expected_convert_type,
|
||||
expected_task):
|
||||
config = ModelConfig(model_id, task="auto")
|
||||
|
||||
assert config.runner_type == expected_runner_type
|
||||
assert config.convert_type == expected_convert_type
|
||||
assert expected_task in config.supported_tasks
|
||||
|
||||
if config.runner_type == "pooling":
|
||||
assert config.task == expected_task
|
||||
else:
|
||||
assert expected_task in config.supported_tasks
|
||||
|
||||
# Can remove once --task option is fully deprecated
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "expected_runner_type", "expected_convert_type",
|
||||
"expected_task"),
|
||||
[
|
||||
("distilbert/distilgpt2", "pooling", "embed", "embed"),
|
||||
("intfloat/multilingual-e5-small", "pooling", "embed", "embed"),
|
||||
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
|
||||
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify",
|
||||
"classify"),
|
||||
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed", "embed"),
|
||||
("openai/whisper-small", "pooling", "embed", "embed"),
|
||||
],
|
||||
)
|
||||
def test_score_task(model_id, expected_runner_type, expected_convert_type,
|
||||
expected_task):
|
||||
config = ModelConfig(model_id, task="score")
|
||||
|
||||
assert config.runner_type == expected_runner_type
|
||||
assert config.convert_type == expected_convert_type
|
||||
assert expected_task in config.supported_tasks
|
||||
|
||||
|
||||
# Can remove once --task option is fully deprecated
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "expected_runner_type", "expected_convert_type",
|
||||
"expected_task"),
|
||||
[
|
||||
("openai/whisper-small", "generate", "none", "transcription"),
|
||||
],
|
||||
)
|
||||
def test_transcription_task(model_id, expected_runner_type,
|
||||
expected_convert_type, expected_task):
|
||||
config = ModelConfig(model_id, task="transcription")
|
||||
|
||||
assert config.runner_type == expected_runner_type
|
||||
assert config.convert_type == expected_convert_type
|
||||
assert expected_task in config.supported_tasks
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "expected_runner_type", "expected_task"),
|
||||
("model_id", "expected_runner_type", "expected_convert_type"),
|
||||
[
|
||||
("distilbert/distilgpt2", "generate", "none"),
|
||||
("intfloat/multilingual-e5-small", "pooling", "none"),
|
||||
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
|
||||
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none"),
|
||||
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none"),
|
||||
("openai/whisper-small", "generate", "none"),
|
||||
],
|
||||
)
|
||||
def test_auto_runner(model_id, expected_runner_type, expected_convert_type):
|
||||
config = ModelConfig(model_id, runner="auto")
|
||||
|
||||
assert config.runner_type == expected_runner_type
|
||||
assert config.convert_type == expected_convert_type
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "expected_runner_type", "expected_convert_type"),
|
||||
[
|
||||
("distilbert/distilgpt2", "pooling", "embed"),
|
||||
("intfloat/multilingual-e5-small", "pooling", "embed"),
|
||||
("intfloat/multilingual-e5-small", "pooling", "none"),
|
||||
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
|
||||
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify"),
|
||||
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed"),
|
||||
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none"),
|
||||
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none"),
|
||||
("openai/whisper-small", "pooling", "embed"),
|
||||
],
|
||||
)
|
||||
def test_score_task(model_id, expected_runner_type, expected_task):
|
||||
config = ModelConfig(
|
||||
model_id,
|
||||
task="score",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
)
|
||||
def test_pooling_runner(model_id, expected_runner_type, expected_convert_type):
|
||||
config = ModelConfig(model_id, runner="pooling")
|
||||
|
||||
assert config.runner_type == expected_runner_type
|
||||
assert config.task == expected_task
|
||||
|
||||
|
||||
@pytest.mark.parametrize(("model_id", "expected_runner_type", "expected_task"),
|
||||
[
|
||||
("Qwen/Qwen2.5-1.5B-Instruct", "draft", "auto"),
|
||||
])
|
||||
def test_draft_task(model_id, expected_runner_type, expected_task):
|
||||
config = ModelConfig(
|
||||
model_id,
|
||||
runner="draft",
|
||||
tokenizer=model_id,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
)
|
||||
|
||||
assert config.runner_type == expected_runner_type
|
||||
assert config.task == expected_task
|
||||
assert config.convert_type == expected_convert_type
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "expected_runner_type", "expected_task"),
|
||||
("model_id", "expected_runner_type", "expected_convert_type"),
|
||||
[
|
||||
("openai/whisper-small", "generate", "transcription"),
|
||||
("Qwen/Qwen2.5-1.5B-Instruct", "draft", "none"),
|
||||
],
|
||||
)
|
||||
def test_transcription_task(model_id, expected_runner_type, expected_task):
|
||||
config = ModelConfig(
|
||||
model_id,
|
||||
task="transcription",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
)
|
||||
def test_draft_runner(model_id, expected_runner_type, expected_convert_type):
|
||||
config = ModelConfig(model_id, runner="draft")
|
||||
|
||||
assert config.runner_type == expected_runner_type
|
||||
assert config.task == expected_task
|
||||
|
||||
|
||||
@pytest.mark.parametrize(("model_id", "bad_task"), [
|
||||
("Qwen/Qwen2.5-Math-RM-72B", "generate"),
|
||||
("Qwen/Qwen3-0.6B", "transcription"),
|
||||
])
|
||||
def test_incorrect_task(model_id, bad_task):
|
||||
with pytest.raises(ValueError, match=r"does not support task=.*"):
|
||||
ModelConfig(
|
||||
model_id,
|
||||
task=bad_task,
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
)
|
||||
assert config.convert_type == expected_convert_type
|
||||
|
||||
|
||||
MODEL_IDS_EXPECTED = [
|
||||
@ -195,17 +196,7 @@ MODEL_IDS_EXPECTED = [
|
||||
@pytest.mark.parametrize("model_id_expected", MODEL_IDS_EXPECTED)
|
||||
def test_disable_sliding_window(model_id_expected):
|
||||
model_id, expected = model_id_expected
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
task="auto",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
revision=None,
|
||||
disable_sliding_window=True,
|
||||
)
|
||||
model_config = ModelConfig(model_id, disable_sliding_window=True)
|
||||
assert model_config.max_model_len == expected
|
||||
|
||||
|
||||
@ -214,16 +205,7 @@ def test_get_sliding_window():
|
||||
# Test that the sliding window is correctly computed.
|
||||
# For Qwen1.5/Qwen2, get_sliding_window() should be None
|
||||
# when use_sliding_window is False.
|
||||
qwen2_model_config = ModelConfig(
|
||||
"Qwen/Qwen1.5-7B",
|
||||
task="auto",
|
||||
tokenizer="Qwen/Qwen1.5-7B",
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
revision=None,
|
||||
)
|
||||
qwen2_model_config = ModelConfig("Qwen/Qwen1.5-7B")
|
||||
|
||||
qwen2_model_config.hf_config.use_sliding_window = False
|
||||
qwen2_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW
|
||||
@ -232,16 +214,7 @@ def test_get_sliding_window():
|
||||
qwen2_model_config.hf_config.use_sliding_window = True
|
||||
assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW
|
||||
|
||||
mistral_model_config = ModelConfig(
|
||||
"mistralai/Mistral-7B-v0.1",
|
||||
task="auto",
|
||||
tokenizer="mistralai/Mistral-7B-v0.1",
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
revision=None,
|
||||
)
|
||||
mistral_model_config = ModelConfig("mistralai/Mistral-7B-v0.1")
|
||||
mistral_model_config.hf_config.sliding_window = None
|
||||
assert mistral_model_config.get_sliding_window() is None
|
||||
|
||||
@ -253,16 +226,7 @@ def test_get_sliding_window():
|
||||
reason="Xformers backend is not supported on ROCm.")
|
||||
def test_get_pooling_config():
|
||||
model_id = "sentence-transformers/all-MiniLM-L12-v2"
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
task="auto",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
revision=None,
|
||||
)
|
||||
model_config = ModelConfig(model_id)
|
||||
|
||||
pooling_config = model_config._init_pooler_config()
|
||||
assert pooling_config is not None
|
||||
@ -275,14 +239,7 @@ def test_get_pooling_config():
|
||||
reason="Xformers backend is not supported on ROCm.")
|
||||
def test_get_pooling_config_from_args():
|
||||
model_id = "sentence-transformers/all-MiniLM-L12-v2"
|
||||
model_config = ModelConfig(model_id,
|
||||
task="auto",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
revision=None)
|
||||
model_config = ModelConfig(model_id)
|
||||
|
||||
override_pooler_config = PoolerConfig(pooling_type='CLS', normalize=True)
|
||||
model_config.override_pooler_config = override_pooler_config
|
||||
@ -295,16 +252,8 @@ def test_get_pooling_config_from_args():
|
||||
@pytest.mark.skipif(current_platform.is_rocm(),
|
||||
reason="Xformers backend is not supported on ROCm.")
|
||||
def test_get_bert_tokenization_sentence_transformer_config():
|
||||
bge_model_config = ModelConfig(
|
||||
model="BAAI/bge-base-en-v1.5",
|
||||
task="auto",
|
||||
tokenizer="BAAI/bge-base-en-v1.5",
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
revision=None,
|
||||
)
|
||||
model_id = "BAAI/bge-base-en-v1.5"
|
||||
bge_model_config = ModelConfig(model_id)
|
||||
|
||||
bert_bge_model_config = bge_model_config._get_encoder_config()
|
||||
|
||||
@ -317,27 +266,13 @@ def test_rope_customization():
|
||||
TEST_ROPE_THETA = 16_000_000.0
|
||||
LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0}
|
||||
|
||||
llama_model_config = ModelConfig(
|
||||
"meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
task="auto",
|
||||
tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
dtype="float16",
|
||||
seed=0,
|
||||
)
|
||||
llama_model_config = ModelConfig("meta-llama/Meta-Llama-3-8B-Instruct")
|
||||
assert getattr(llama_model_config.hf_config, "rope_scaling", None) is None
|
||||
assert getattr(llama_model_config.hf_config, "rope_theta", None) == 500_000
|
||||
assert llama_model_config.max_model_len == 8192
|
||||
|
||||
llama_model_config = ModelConfig(
|
||||
"meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
task="auto",
|
||||
tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
dtype="float16",
|
||||
seed=0,
|
||||
hf_overrides={
|
||||
"rope_scaling": TEST_ROPE_SCALING,
|
||||
"rope_theta": TEST_ROPE_THETA,
|
||||
@ -349,15 +284,7 @@ def test_rope_customization():
|
||||
None) == TEST_ROPE_THETA
|
||||
assert llama_model_config.max_model_len == 16384
|
||||
|
||||
longchat_model_config = ModelConfig(
|
||||
"lmsys/longchat-13b-16k",
|
||||
task="auto",
|
||||
tokenizer="lmsys/longchat-13b-16k",
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
dtype="float16",
|
||||
seed=0,
|
||||
)
|
||||
longchat_model_config = ModelConfig("lmsys/longchat-13b-16k")
|
||||
# Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config
|
||||
assert all(
|
||||
longchat_model_config.hf_config.rope_scaling.get(key) == value
|
||||
@ -366,12 +293,6 @@ def test_rope_customization():
|
||||
|
||||
longchat_model_config = ModelConfig(
|
||||
"lmsys/longchat-13b-16k",
|
||||
task="auto",
|
||||
tokenizer="lmsys/longchat-13b-16k",
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
dtype="float16",
|
||||
seed=0,
|
||||
hf_overrides={
|
||||
"rope_scaling": TEST_ROPE_SCALING,
|
||||
},
|
||||
@ -390,15 +311,7 @@ def test_rope_customization():
|
||||
("meta-llama/Llama-3.2-11B-Vision", True),
|
||||
])
|
||||
def test_is_encoder_decoder(model_id, is_encoder_decoder):
|
||||
config = ModelConfig(
|
||||
model_id,
|
||||
task="auto",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
dtype="float16",
|
||||
seed=0,
|
||||
)
|
||||
config = ModelConfig(model_id)
|
||||
|
||||
assert config.is_encoder_decoder == is_encoder_decoder
|
||||
|
||||
@ -408,15 +321,7 @@ def test_is_encoder_decoder(model_id, is_encoder_decoder):
|
||||
("Qwen/Qwen2-VL-2B-Instruct", True),
|
||||
])
|
||||
def test_uses_mrope(model_id, uses_mrope):
|
||||
config = ModelConfig(
|
||||
model_id,
|
||||
task="auto",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
dtype="float16",
|
||||
seed=0,
|
||||
)
|
||||
config = ModelConfig(model_id)
|
||||
|
||||
assert config.uses_mrope == uses_mrope
|
||||
|
||||
@ -426,26 +331,12 @@ def test_generation_config_loading():
|
||||
|
||||
# When set generation_config to "vllm", the default generation config
|
||||
# will not be loaded.
|
||||
model_config = ModelConfig(model_id,
|
||||
task="auto",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
generation_config="vllm")
|
||||
model_config = ModelConfig(model_id, generation_config="vllm")
|
||||
assert model_config.get_diff_sampling_param() == {}
|
||||
|
||||
# When set generation_config to "auto", the default generation config
|
||||
# should be loaded.
|
||||
model_config = ModelConfig(model_id,
|
||||
task="auto",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
generation_config="auto")
|
||||
model_config = ModelConfig(model_id, generation_config="auto")
|
||||
|
||||
correct_generation_config = {
|
||||
"repetition_penalty": 1.1,
|
||||
@ -461,12 +352,6 @@ def test_generation_config_loading():
|
||||
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
task="auto",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
generation_config="auto",
|
||||
override_generation_config=override_generation_config)
|
||||
|
||||
@ -479,12 +364,6 @@ def test_generation_config_loading():
|
||||
# is set, the override_generation_config should be used directly.
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
task="auto",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
generation_config="vllm",
|
||||
override_generation_config=override_generation_config)
|
||||
|
||||
@ -515,16 +394,7 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
|
||||
def test_get_and_verify_max_len(model_id, max_model_len, expected_max_len,
|
||||
should_raise):
|
||||
"""Test get_and_verify_max_len with different configurations."""
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
task="auto",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
revision=None,
|
||||
)
|
||||
model_config = ModelConfig(model_id)
|
||||
|
||||
if should_raise:
|
||||
with pytest.raises(ValueError):
|
||||
|
||||
@ -21,13 +21,8 @@ def test_max_tokens_none():
|
||||
def model_config():
|
||||
return ModelConfig(
|
||||
MODEL_NAME,
|
||||
task="auto",
|
||||
tokenizer=MODEL_NAME,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
revision=None,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@ -695,11 +695,7 @@ def test_estimate_max_model_len(model_id, max_model_len,
|
||||
# Create a VllmConfig
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
task="generate",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
runner="generate",
|
||||
dtype="float16",
|
||||
max_model_len=max_model_len,
|
||||
)
|
||||
@ -733,11 +729,7 @@ def test_get_max_concurrency_for_kv_cache_config():
|
||||
max_model_len = 16384
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
task="generate",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
runner="generate",
|
||||
dtype="float16",
|
||||
max_model_len=max_model_len,
|
||||
)
|
||||
|
||||
@ -1248,9 +1248,6 @@ def create_scheduler_with_priority(
|
||||
)
|
||||
model_config = ModelConfig(
|
||||
model=model,
|
||||
task="auto",
|
||||
tokenizer=model,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="float16",
|
||||
seed=42,
|
||||
|
||||
@ -59,9 +59,6 @@ def create_scheduler(
|
||||
)
|
||||
model_config = ModelConfig(
|
||||
model=model,
|
||||
task="auto",
|
||||
tokenizer=model,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="float16",
|
||||
seed=42,
|
||||
|
||||
@ -68,9 +68,6 @@ def create_vllm_config(
|
||||
)
|
||||
model_config = ModelConfig(
|
||||
model=model,
|
||||
task="auto",
|
||||
tokenizer=model,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="float16",
|
||||
seed=42,
|
||||
|
||||
@ -24,13 +24,8 @@ eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
|
||||
|
||||
def _create_proposer(method: str, k: int) -> EagleProposer:
|
||||
model_config = ModelConfig(model=model_dir,
|
||||
task="generate",
|
||||
max_model_len=100,
|
||||
tokenizer=model_dir,
|
||||
tokenizer_mode="auto",
|
||||
dtype="auto",
|
||||
seed=None,
|
||||
trust_remote_code=False)
|
||||
runner="generate",
|
||||
max_model_len=100)
|
||||
|
||||
# Choose model directory based on method
|
||||
draft_model_dir = eagle_dir if method == "eagle" else eagle3_dir
|
||||
|
||||
@ -44,14 +44,7 @@ def test_ngram_proposer():
|
||||
|
||||
def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer:
|
||||
# Dummy model config. Just to set max_model_len.
|
||||
model_config = ModelConfig(model="facebook/opt-125m",
|
||||
task="generate",
|
||||
max_model_len=100,
|
||||
tokenizer="facebook/opt-125m",
|
||||
tokenizer_mode="auto",
|
||||
dtype="auto",
|
||||
seed=None,
|
||||
trust_remote_code=False)
|
||||
model_config = ModelConfig(model="facebook/opt-125m")
|
||||
return NgramProposer(
|
||||
vllm_config=VllmConfig(model_config=model_config,
|
||||
speculative_config=SpeculativeConfig.
|
||||
|
||||
@ -26,10 +26,6 @@ def get_vllm_config():
|
||||
)
|
||||
model_config = ModelConfig(
|
||||
model="facebook/opt-125m",
|
||||
task="generate",
|
||||
tokenizer="facebook/opt-125m",
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16", # TPUs typically use bfloat16
|
||||
seed=42,
|
||||
)
|
||||
|
||||
@ -76,10 +76,6 @@ def get_vllm_config():
|
||||
)
|
||||
model_config = ModelConfig(
|
||||
model="facebook/opt-125m",
|
||||
task="generate",
|
||||
tokenizer="facebook/opt-125m",
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="float16",
|
||||
seed=42,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user