[Deprecation][2/N] Replace --task with --runner and --convert (#21470)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author:    Cyrus Leung
Date:      2025-07-28 10:42:40 +08:00
Committed: GitHub
Parent:    8f605ee309
Commit:    86ae693f20

94 changed files with 1117 additions and 1083 deletions
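
Every hunk in this commit applies the same mechanical migration: the deprecated `task` argument (and `--task` CLI flag) becomes `runner`, where pooling tasks such as "embed" and "score" map to runner="pooling" and text generation maps to runner="generate". The `--convert` half of the change is not exercised by the hunks shown here. A minimal before/after sketch of the offline API, assuming a vLLM build that includes this change (the model name is illustrative):

    from vllm import LLM

    # Before (deprecated): the task name selected the runner implicitly.
    #   llm = LLM(model="BAAI/bge-base-en-v1.5", task="embed")

    # After: choose the runner group explicitly. "pooling" covers the old
    # "embed"/"score" tasks; "generate" covers text generation.
    llm = LLM(model="BAAI/bge-base-en-v1.5", runner="pooling")

    # The pooling runner exposes embedding-style entry points.
    (output, ) = llm.embed(["What is the capital of France?"])
    print(len(output.outputs.embedding))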

View File

@@ -52,7 +52,7 @@ def correctness_test_embed_models(hf_runner,
         vllm_extra_kwargs["dtype"] = model_info.dtype
 
     with vllm_runner(model_info.name,
-                     task="embed",
+                     runner="pooling",
                      max_model_len=None,
                      **vllm_extra_kwargs) as vllm_model:
         vllm_outputs = vllm_model.embed(example_prompts)

View File

@@ -172,7 +172,7 @@ def mteb_test_embed_models(hf_runner,
         vllm_extra_kwargs["dtype"] = model_info.dtype
 
     with vllm_runner(model_info.name,
-                     task="embed",
+                     runner="pooling",
                      max_model_len=None,
                      **vllm_extra_kwargs) as vllm_model:
@@ -279,15 +279,12 @@ def mteb_test_rerank_models(hf_runner,
         vllm_extra_kwargs["dtype"] = model_info.dtype
 
     with vllm_runner(model_info.name,
-                     task="score",
+                     runner="pooling",
                      max_model_len=None,
                      max_num_seqs=8,
                      **vllm_extra_kwargs) as vllm_model:
         model_config = vllm_model.llm.llm_engine.model_config
 
         if model_info.architecture:
             assert (model_info.architecture in model_config.architectures)
         assert model_config.hf_config.num_labels == 1
 
         vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),

View File

@@ -85,7 +85,7 @@ def test_models(
         hf_outputs = hf_model.encode(example_prompts)
 
     with vllm_runner(model,
-                     task="embed",
+                     runner="pooling",
                      max_model_len=max_model_len,
                      **vllm_extra_kwargs) as vllm_model:
         vllm_outputs = vllm_model.embed(example_prompts)

View File

@@ -28,10 +28,7 @@ def test_find_array():
     model_config = ModelConfig(
         MODEL_NAME,
-        task="embed",
-        tokenizer=MODEL_NAME,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
+        runner="pooling",
         dtype="bfloat16",
         seed=0,
     )
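
Note that this hunk does more than swap `task` for `runner`: the explicit `tokenizer`, `tokenizer_mode`, and `trust_remote_code` arguments are dropped, which suggests `ModelConfig` now defaults the tokenizer to the model name. Treat that as an inference from the diff rather than a documented guarantee.
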
@@ -117,7 +114,7 @@ def test_gritlm_offline_embedding(vllm_runner):
     with vllm_runner(
         MODEL_NAME,
-        task="embed",
+        runner="pooling",
         max_model_len=MAX_MODEL_LEN,
     ) as vllm_model:
         llm = vllm_model.llm
@@ -140,7 +137,7 @@ def test_gritlm_offline_embedding(vllm_runner):
 async def test_gritlm_api_server_embedding():
     queries, q_instruction, documents, d_instruction = get_test_data()
 
-    args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
+    args = ["--runner", "pooling", "--max_model_len", str(MAX_MODEL_LEN)]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as server:
         client_embedding = server.get_async_client()
@@ -164,7 +161,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
     with vllm_runner(
         MODEL_NAME,
-        task="generate",
+        runner="generate",
         max_model_len=MAX_MODEL_LEN,
     ) as vllm_model:
         llm = vllm_model.llm
@@ -179,7 +176,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
 async def test_gritlm_api_server_generate():
     input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
 
-    args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
+    args = ["--runner", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as server:
         client_generate = server.get_async_client()
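
In the two API-server tests above, `RemoteOpenAIServer` launches an OpenAI-compatible server with the given flags, so `--runner pooling` and `--runner generate` replace `--task` on the command line as well. A sketch of the client side of the embedding test, assuming a server is already running locally with `--runner pooling` (the base URL, API key, and model name are illustrative; the model matches the test file's apparent MODEL_NAME):

    import asyncio

    from openai import AsyncOpenAI

    async def main() -> None:
        client = AsyncOpenAI(base_url="http://localhost:8000/v1",
                             api_key="EMPTY")
        result = await client.embeddings.create(
            model="parasail-ai/GritLM-7B-vllm",  # assumed MODEL_NAME
            input=["What is the capital of France?"],
        )
        print(len(result.data[0].embedding))

    asyncio.run(main())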

View File

@@ -4,6 +4,7 @@ from functools import partial
 import pytest
 
+import vllm.envs as envs
 from vllm import PoolingParams
 
 from ...utils import EmbedModelInfo, RerankModelInfo
@@ -62,6 +63,10 @@ def test_embed_models_correctness(hf_runner, vllm_runner,
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
 def test_rerank_models_mteb(hf_runner, vllm_runner,
                             model_info: RerankModelInfo) -> None:
+    if (model_info.architecture == "XLMRobertaForSequenceClassification"
+            and envs.VLLM_USE_V1):
+        pytest.skip("Not supported yet")
+
     mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
@@ -92,7 +97,7 @@ def test_matryoshka(
         hf_outputs = matryoshka_fy(hf_outputs, dimensions)
 
     with vllm_runner(model_info.name,
-                     task="embed",
+                     runner="pooling",
                      dtype=dtype,
                      max_model_len=None) as vllm_model:
         assert vllm_model.llm.llm_engine.model_config.is_matryoshka

View File

@@ -21,7 +21,7 @@ max_model_len = int(original_max_position_embeddings * factor)
 
 @pytest.mark.parametrize("model_info", MODELS)
 def test_default(model_info, vllm_runner):
-    with vllm_runner(model_info.name, task="embed",
+    with vllm_runner(model_info.name, runner="pooling",
                      max_model_len=None) as vllm_model:
         model_config = vllm_model.llm.llm_engine.model_config
         if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
@@ -36,7 +36,7 @@ def test_default(model_info, vllm_runner):
 @pytest.mark.parametrize("model_info", MODELS)
 def test_set_max_model_len_legal(model_info, vllm_runner):
     # set max_model_len <= 512
-    with vllm_runner(model_info.name, task="embed",
+    with vllm_runner(model_info.name, runner="pooling",
                      max_model_len=256) as vllm_model:
         model_config = vllm_model.llm.llm_engine.model_config
         assert model_config.max_model_len == 256
@@ -46,11 +46,12 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
         # For nomic-embed-text-v2-moe the length is set to 512
         # by sentence_bert_config.json.
         with pytest.raises(ValueError):
-            with vllm_runner(model_info.name, task="embed",
+            with vllm_runner(model_info.name,
+                             runner="pooling",
                              max_model_len=1024):
                 pass
     else:
-        with vllm_runner(model_info.name, task="embed",
+        with vllm_runner(model_info.name, runner="pooling",
                          max_model_len=1024) as vllm_model:
             model_config = vllm_model.llm.llm_engine.model_config
             assert model_config.max_model_len == 1024
@@ -60,14 +61,15 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
 def test_set_max_model_len_illegal(model_info, vllm_runner):
     # set max_model_len > 2048
     with pytest.raises(ValueError):
-        with vllm_runner(model_info.name, task="embed", max_model_len=4096):
+        with vllm_runner(model_info.name, runner="pooling",
+                         max_model_len=4096):
             pass
 
     # set max_model_len > 2048 by hf_overrides
     hf_overrides = {"max_model_len": 4096}
     with pytest.raises(ValueError):
         with vllm_runner(model_info.name,
-                         task="embed",
+                         runner="pooling",
                          max_model_len=None,
                          hf_overrides=hf_overrides):
             pass
@@ -87,7 +89,7 @@ def test_use_rope_scaling_legal(model_info, vllm_runner):
     }
 
     with vllm_runner(model_info.name,
-                     task="embed",
+                     runner="pooling",
                      max_model_len=None,
                      hf_overrides=hf_overrides):
         pass
@@ -107,7 +109,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
     # illegal max_model_len
     with pytest.raises(ValueError):
         with vllm_runner(model_info.name,
-                         task="embed",
+                         runner="pooling",
                          max_model_len=max_model_len + 1,
                          hf_overrides=hf_overrides):
             pass
@@ -125,7 +127,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
     # illegal max_model_len by hf_overrides
     with pytest.raises(ValueError):
         with vllm_runner(model_info.name,
-                         task="embed",
+                         runner="pooling",
                          max_model_len=None,
                          hf_overrides=hf_overrides):
             pass

View File

@@ -37,7 +37,9 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name):
     with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
         hf_outputs = hf_model.predict([text_pair]).tolist()
 
-    with vllm_runner(model_name, task="score", dtype=DTYPE,
+    with vllm_runner(model_name,
+                     runner="pooling",
+                     dtype=DTYPE,
                      max_model_len=None) as vllm_model:
         vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
@@ -56,7 +58,9 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
     with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
         hf_outputs = hf_model.predict(text_pairs).tolist()
 
-    with vllm_runner(model_name, task="score", dtype=DTYPE,
+    with vllm_runner(model_name,
+                     runner="pooling",
+                     dtype=DTYPE,
                      max_model_len=None) as vllm_model:
         vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
@@ -76,7 +80,9 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
     with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
         hf_outputs = hf_model.predict(text_pairs).tolist()
 
-    with vllm_runner(model_name, task="score", dtype=DTYPE,
+    with vllm_runner(model_name,
+                     runner="pooling",
+                     dtype=DTYPE,
                      max_model_len=None) as vllm_model:
         vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
@@ -103,7 +109,7 @@ def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name):
     ]
 
     with vllm_runner(emb_model_name,
-                     task="embed",
+                     runner="pooling",
                      dtype=DTYPE,
                      max_model_len=None) as vllm_model:
         vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
@@ -131,7 +137,7 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
     ]
 
     with vllm_runner(emb_model_name,
-                     task="embed",
+                     runner="pooling",
                      dtype=DTYPE,
                      max_model_len=None) as vllm_model:
         vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
@@ -160,7 +166,7 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
     ]
 
     with vllm_runner(emb_model_name,
-                     task="embed",
+                     runner="pooling",
                      dtype=DTYPE,
                      max_model_len=None) as vllm_model:
         vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
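
The score tests above route through `LLM.score` under runner="pooling": cross-encoders score each pair directly, while embedding models embed both sides and compare them. A sketch of equivalent direct usage, with an illustrative cross-encoder model (the tests parametrize over model_name and DTYPE):

    from vllm import LLM

    # Illustrative model name; any cross-encoder or embedding model works.
    llm = LLM(model="cross-encoder/ms-marco-MiniLM-L-6-v2", runner="pooling")

    # 1-to-N scoring, as in test_cross_encoder_1_to_N above.
    outputs = llm.score("What is the capital of France?",
                        ["Paris is the capital of France.",
                         "The Eiffel Tower is in Paris."])
    for output in outputs:
        print(output.outputs.score)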

View File

@@ -26,7 +26,7 @@ def test_smaller_truncation_size(vllm_runner,
     truncate_prompt_tokens = 10
 
-    with vllm_runner(model_name, task="embed",
+    with vllm_runner(model_name, runner="pooling",
                      max_model_len=max_model_len) as vllm_model:
         vllm_output = vllm_model.llm.encode(
             input_str, truncate_prompt_tokens=truncate_prompt_tokens)
@@ -41,7 +41,7 @@ def test_max_truncation_size(vllm_runner,
                              input_str=input_str):
     truncate_prompt_tokens = -1
 
-    with vllm_runner(model_name, task="embed",
+    with vllm_runner(model_name, runner="pooling",
                      max_model_len=max_model_len) as vllm_model:
         vllm_output = vllm_model.llm.encode(
             input_str, truncate_prompt_tokens=truncate_prompt_tokens)
@@ -58,7 +58,7 @@ def test_bigger_truncation_size(vllm_runner,
     truncate_prompt_tokens = max_model_len + 1
 
     with pytest.raises(ValueError), vllm_runner(
-            model_name, task="embed",
+            model_name, runner="pooling",
             max_model_len=max_model_len) as vllm_model:
         llm_output = vllm_model.llm.encode(