Support encoder-only models without KV-Cache (#21270)
Signed-off-by: Max de Bayser <maxdebayser@gmail.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
This commit is contained in:
committed by
GitHub
parent
f27fdfc3ed
commit
1cd6eaba54
@ -1062,8 +1062,17 @@ class VllmRunner:
|
||||
return [req_output.outputs.score for req_output in req_outputs]
|
||||
|
||||
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
|
||||
executor = self.llm.llm_engine.model_executor
|
||||
return executor.apply_model(func)
|
||||
if hasattr(self.llm.llm_engine, "model_executor"):
|
||||
# This works either in V0 or in V1 with
|
||||
# VLLM_ENABLE_V1_MULTIPROCESSING=0
|
||||
executor = self.llm.llm_engine.model_executor
|
||||
return executor.apply_model(func)
|
||||
|
||||
# This works in V1 with VLLM_ALLOW_INSECURE_SERIALIZATION=1
|
||||
def _apply_model(self):
|
||||
return func(self.get_model())
|
||||
|
||||
return self.llm.llm_engine.collective_rpc(_apply_model)
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
@ -22,10 +22,12 @@ REVISION_ROBERTA = os.environ.get("REVISION", "main")
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_rocm(),
|
||||
reason="Xformers backend is not supported on ROCm.")
|
||||
def test_model_loading_with_params(vllm_runner):
|
||||
def test_model_loading_with_params(vllm_runner, monkeypatch):
|
||||
"""
|
||||
Test parameter weight loading with tp>1.
|
||||
"""
|
||||
# to use apply_model
|
||||
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
with vllm_runner(model_name=MODEL_NAME,
|
||||
revision=REVISION,
|
||||
dtype="float16",
|
||||
@ -61,10 +63,12 @@ def test_model_loading_with_params(vllm_runner):
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_rocm(),
|
||||
reason="Xformers backend is not supported on ROCm.")
|
||||
def test_roberta_model_loading_with_params(vllm_runner):
|
||||
def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
|
||||
"""
|
||||
Test parameter weight loading with tp>1.
|
||||
"""
|
||||
# to use apply_model
|
||||
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
with vllm_runner(model_name=MODEL_NAME_ROBERTA,
|
||||
revision=REVISION_ROBERTA,
|
||||
dtype="float16",
|
||||
@ -101,10 +105,12 @@ def test_roberta_model_loading_with_params(vllm_runner):
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_rocm(),
|
||||
reason="Xformers backend is not supported on ROCm.")
|
||||
def test_facebook_roberta_model_loading_with_params(vllm_runner):
|
||||
def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch):
|
||||
"""
|
||||
Test loading roberta-base model with no lm_head.
|
||||
"""
|
||||
# to use apply_model
|
||||
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
model_name = "FacebookAI/roberta-base"
|
||||
with vllm_runner(model_name=model_name,
|
||||
dtype="float16",
|
||||
|
||||
@ -39,17 +39,9 @@ def v1(run_with_both_engines):
|
||||
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
|
||||
marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
|
||||
# [Encoder-only]
|
||||
pytest.param(
|
||||
"BAAI/bge-base-en-v1.5",
|
||||
marks=[
|
||||
# CPU only supports V1
|
||||
pytest.mark.core_model,
|
||||
pytest.mark.skip_v1
|
||||
]),
|
||||
pytest.param("sentence-transformers/all-MiniLM-L12-v2",
|
||||
marks=[pytest.mark.skip_v1]),
|
||||
pytest.param("intfloat/multilingual-e5-small",
|
||||
marks=[pytest.mark.skip_v1]),
|
||||
pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
|
||||
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
|
||||
pytest.param("intfloat/multilingual-e5-small"),
|
||||
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
||||
marks=[pytest.mark.skip_v1]),
|
||||
# [Cross-Encoder]
|
||||
|
||||
@ -23,6 +23,14 @@ RERANK_MODELS = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
|
||||
@ -93,6 +93,7 @@ def create_common_attn_metadata(
|
||||
max_query_len=max_query_len,
|
||||
block_table_tensor=block_table_tensor,
|
||||
slot_mapping=slot_mapping,
|
||||
causal=True,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@ -13,7 +13,6 @@ UNSUPPORTED_MODELS_V1 = [
|
||||
"openai/whisper-large-v3", # transcription
|
||||
"facebook/bart-large-cnn", # encoder decoder
|
||||
"state-spaces/mamba-130m-hf", # mamba1
|
||||
"BAAI/bge-m3", # embedding
|
||||
]
|
||||
|
||||
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
|
||||
|
||||
@ -1,9 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import re
|
||||
|
||||
import pytest
|
||||
import regex as re
|
||||
import requests
|
||||
import torch
|
||||
|
||||
|
||||
Reference in New Issue
Block a user