[Deprecation][2/N] Replace --task with --runner and --convert (#21470)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
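Migration sketch (editor's note, not part of the commit): every hunk below rewrites the deprecated `task` argument to the new `runner` argument — `task="embed"` and `task="score"` become `runner="pooling"`, while `task="generate"` becomes `runner="generate"`. A minimal offline example of the rename; the model name is a placeholder, and the `--convert` flag named in the title is not exercised by these hunks:

    # Hedged sketch of the renamed argument; the model name is a placeholder.
    from vllm import LLM

    # Before (deprecated): llm = LLM(model="intfloat/e5-small", task="embed")
    # After: pooling models (embedding/scoring) select the pooling runner.
    llm = LLM(model="intfloat/e5-small", runner="pooling")

    outputs = llm.embed(["What is the capital of France?"])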
@@ -52,7 +52,7 @@ def correctness_test_embed_models(hf_runner,
         vllm_extra_kwargs["dtype"] = model_info.dtype

     with vllm_runner(model_info.name,
-                     task="embed",
+                     runner="pooling",
                      max_model_len=None,
                      **vllm_extra_kwargs) as vllm_model:
         vllm_outputs = vllm_model.embed(example_prompts)
@@ -172,7 +172,7 @@ def mteb_test_embed_models(hf_runner,
         vllm_extra_kwargs["dtype"] = model_info.dtype

     with vllm_runner(model_info.name,
-                     task="embed",
+                     runner="pooling",
                      max_model_len=None,
                      **vllm_extra_kwargs) as vllm_model:
@@ -279,15 +279,12 @@ def mteb_test_rerank_models(hf_runner,
         vllm_extra_kwargs["dtype"] = model_info.dtype

     with vllm_runner(model_info.name,
-                     task="score",
+                     runner="pooling",
                      max_model_len=None,
                      max_num_seqs=8,
                      **vllm_extra_kwargs) as vllm_model:

         model_config = vllm_model.llm.llm_engine.model_config

         if model_info.architecture:
             assert (model_info.architecture in model_config.architectures)
         assert model_config.hf_config.num_labels == 1

         vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
@@ -85,7 +85,7 @@ def test_models(
        hf_outputs = hf_model.encode(example_prompts)

    with vllm_runner(model,
-                    task="embed",
+                    runner="pooling",
                     max_model_len=max_model_len,
                     **vllm_extra_kwargs) as vllm_model:
        vllm_outputs = vllm_model.embed(example_prompts)
@@ -28,10 +28,7 @@ def test_find_array():

     model_config = ModelConfig(
         MODEL_NAME,
-        task="embed",
-        tokenizer=MODEL_NAME,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
+        runner="pooling",
         dtype="bfloat16",
         seed=0,
     )
@@ -117,7 +114,7 @@ def test_gritlm_offline_embedding(vllm_runner):

     with vllm_runner(
         MODEL_NAME,
-        task="embed",
+        runner="pooling",
         max_model_len=MAX_MODEL_LEN,
     ) as vllm_model:
         llm = vllm_model.llm
@@ -140,7 +137,7 @@ def test_gritlm_offline_embedding(vllm_runner):
 async def test_gritlm_api_server_embedding():
     queries, q_instruction, documents, d_instruction = get_test_data()

-    args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
+    args = ["--runner", "pooling", "--max_model_len", str(MAX_MODEL_LEN)]

     with RemoteOpenAIServer(MODEL_NAME, args) as server:
         client_embedding = server.get_async_client()
@@ -164,7 +161,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):

     with vllm_runner(
         MODEL_NAME,
-        task="generate",
+        runner="generate",
         max_model_len=MAX_MODEL_LEN,
     ) as vllm_model:
         llm = vllm_model.llm
@@ -179,7 +176,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
 async def test_gritlm_api_server_generate():
     input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"

-    args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
+    args = ["--runner", "generate", "--max_model_len", str(MAX_MODEL_LEN)]

     with RemoteOpenAIServer(MODEL_NAME, args) as server:
         client_generate = server.get_async_client()
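The same rename applies to the server CLI flags exercised by the two GritLM API-server tests above: `--task embed` becomes `--runner pooling`, and `--task generate` becomes `--runner generate`. A hedged sketch using the test helper from this diff; the import path and model name are assumptions:

    # Sketch only: RemoteOpenAIServer is the helper used in the hunks above;
    # its import path and the model name here are placeholders.
    from tests.utils import RemoteOpenAIServer

    args = ["--runner", "pooling", "--max_model_len", "2048"]
    with RemoteOpenAIServer("my-org/embedding-model", args) as server:
        client = server.get_async_client()  # OpenAI-compatible async client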
@@ -4,6 +4,7 @@ from functools import partial

 import pytest

+import vllm.envs as envs
 from vllm import PoolingParams

 from ...utils import EmbedModelInfo, RerankModelInfo
@@ -62,6 +63,10 @@ def test_embed_models_correctness(hf_runner, vllm_runner,
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
 def test_rerank_models_mteb(hf_runner, vllm_runner,
                             model_info: RerankModelInfo) -> None:
+    if (model_info.architecture == "XLMRobertaForSequenceClassification"
+            and envs.VLLM_USE_V1):
+        pytest.skip("Not supported yet")
+
     mteb_test_rerank_models(hf_runner, vllm_runner, model_info)

@@ -92,7 +97,7 @@ def test_matryoshka(
         hf_outputs = matryoshka_fy(hf_outputs, dimensions)

     with vllm_runner(model_info.name,
-                     task="embed",
+                     runner="pooling",
                      dtype=dtype,
                      max_model_len=None) as vllm_model:
         assert vllm_model.llm.llm_engine.model_config.is_matryoshka
@@ -21,7 +21,7 @@ max_model_len = int(original_max_position_embeddings * factor)

 @pytest.mark.parametrize("model_info", MODELS)
 def test_default(model_info, vllm_runner):
-    with vllm_runner(model_info.name, task="embed",
+    with vllm_runner(model_info.name, runner="pooling",
                      max_model_len=None) as vllm_model:
         model_config = vllm_model.llm.llm_engine.model_config
         if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
@@ -36,7 +36,7 @@ def test_default(model_info, vllm_runner):
 @pytest.mark.parametrize("model_info", MODELS)
 def test_set_max_model_len_legal(model_info, vllm_runner):
     # set max_model_len <= 512
-    with vllm_runner(model_info.name, task="embed",
+    with vllm_runner(model_info.name, runner="pooling",
                      max_model_len=256) as vllm_model:
         model_config = vllm_model.llm.llm_engine.model_config
         assert model_config.max_model_len == 256
@@ -46,11 +46,12 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
         # For nomic-embed-text-v2-moe the length is set to 512
         # by sentence_bert_config.json.
         with pytest.raises(ValueError):
-            with vllm_runner(model_info.name, task="embed",
+            with vllm_runner(model_info.name,
+                             runner="pooling",
                              max_model_len=1024):
                 pass
     else:
-        with vllm_runner(model_info.name, task="embed",
+        with vllm_runner(model_info.name, runner="pooling",
                          max_model_len=1024) as vllm_model:
             model_config = vllm_model.llm.llm_engine.model_config
             assert model_config.max_model_len == 1024
@@ -60,14 +61,15 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
 def test_set_max_model_len_illegal(model_info, vllm_runner):
     # set max_model_len > 2048
     with pytest.raises(ValueError):
-        with vllm_runner(model_info.name, task="embed", max_model_len=4096):
+        with vllm_runner(model_info.name, runner="pooling",
+                         max_model_len=4096):
             pass

     # set max_model_len > 2048 by hf_overrides
     hf_overrides = {"max_model_len": 4096}
     with pytest.raises(ValueError):
         with vllm_runner(model_info.name,
-                         task="embed",
+                         runner="pooling",
                          max_model_len=None,
                          hf_overrides=hf_overrides):
             pass
@@ -87,7 +89,7 @@ def test_use_rope_scaling_legal(model_info, vllm_runner):
     }

     with vllm_runner(model_info.name,
-                     task="embed",
+                     runner="pooling",
                      max_model_len=None,
                      hf_overrides=hf_overrides):
         pass
@@ -107,7 +109,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
     # illegal max_model_len
     with pytest.raises(ValueError):
         with vllm_runner(model_info.name,
-                         task="embed",
+                         runner="pooling",
                          max_model_len=max_model_len + 1,
                          hf_overrides=hf_overrides):
             pass
@@ -125,7 +127,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
     # illegal max_model_len by hf_overrides
     with pytest.raises(ValueError):
         with vllm_runner(model_info.name,
-                         task="embed",
+                         runner="pooling",
                          max_model_len=None,
                          hf_overrides=hf_overrides):
             pass

@@ -37,7 +37,9 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name):
     with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
         hf_outputs = hf_model.predict([text_pair]).tolist()

-    with vllm_runner(model_name, task="score", dtype=DTYPE,
+    with vllm_runner(model_name,
+                     runner="pooling",
+                     dtype=DTYPE,
                      max_model_len=None) as vllm_model:
         vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])

@@ -56,7 +58,9 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
     with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
         hf_outputs = hf_model.predict(text_pairs).tolist()

-    with vllm_runner(model_name, task="score", dtype=DTYPE,
+    with vllm_runner(model_name,
+                     runner="pooling",
+                     dtype=DTYPE,
                      max_model_len=None) as vllm_model:
         vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)

@@ -76,7 +80,9 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
     with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
         hf_outputs = hf_model.predict(text_pairs).tolist()

-    with vllm_runner(model_name, task="score", dtype=DTYPE,
+    with vllm_runner(model_name,
+                     runner="pooling",
+                     dtype=DTYPE,
                      max_model_len=None) as vllm_model:
         vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)

@@ -103,7 +109,7 @@ def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name):
     ]

     with vllm_runner(emb_model_name,
-                     task="embed",
+                     runner="pooling",
                      dtype=DTYPE,
                      max_model_len=None) as vllm_model:
         vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
@@ -131,7 +137,7 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
     ]

     with vllm_runner(emb_model_name,
-                     task="embed",
+                     runner="pooling",
                      dtype=DTYPE,
                      max_model_len=None) as vllm_model:
         vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
@@ -160,7 +166,7 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
     ]

     with vllm_runner(emb_model_name,
-                     task="embed",
+                     runner="pooling",
                      dtype=DTYPE,
                      max_model_len=None) as vllm_model:
         vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)

@@ -26,7 +26,7 @@ def test_smaller_truncation_size(vllm_runner,

     truncate_prompt_tokens = 10

-    with vllm_runner(model_name, task="embed",
+    with vllm_runner(model_name, runner="pooling",
                      max_model_len=max_model_len) as vllm_model:
         vllm_output = vllm_model.llm.encode(
             input_str, truncate_prompt_tokens=truncate_prompt_tokens)
@@ -41,7 +41,7 @@ def test_max_truncation_size(vllm_runner,
                             input_str=input_str):
     truncate_prompt_tokens = -1

-    with vllm_runner(model_name, task="embed",
+    with vllm_runner(model_name, runner="pooling",
                      max_model_len=max_model_len) as vllm_model:
         vllm_output = vllm_model.llm.encode(
             input_str, truncate_prompt_tokens=truncate_prompt_tokens)
@@ -58,7 +58,7 @@ def test_bigger_truncation_size(vllm_runner,
     truncate_prompt_tokens = max_model_len + 1

     with pytest.raises(ValueError), vllm_runner(
-            model_name, task="embed",
+            model_name, runner="pooling",
             max_model_len=max_model_len) as vllm_model:

         llm_output = vllm_model.llm.encode(