Support encoder-only models without KV-Cache (#21270)

Signed-off-by: Max de Bayser <maxdebayser@gmail.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Maximilien de Bayser committed 2025-07-26 10:09:52 -03:00 (committed by GitHub)
parent f27fdfc3ed, commit 1cd6eaba54
17 changed files with 352 additions and 99 deletions
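What the change enables, as a hedged sketch: with encoder-only support in the V1 engine, an embedding model can be loaded and queried without allocating a KV-cache. The model name is one used in the tests below; the `task` argument and `embed()` call reflect the public vLLM API but are illustrative here, not code from this commit.

    from vllm import LLM

    # Encoder-only embedding model; bidirectional attention needs no
    # KV-cache, which this commit supports on the V1 engine.
    llm = LLM(model="BAAI/bge-base-en-v1.5", task="embed")
    outputs = llm.embed(["vLLM now supports encoder-only models"])
    print(len(outputs[0].outputs.embedding))  # embedding dimensionality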

@@ -1062,8 +1062,17 @@ class VllmRunner:
         return [req_output.outputs.score for req_output in req_outputs]
 
     def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
-        executor = self.llm.llm_engine.model_executor
-        return executor.apply_model(func)
+        if hasattr(self.llm.llm_engine, "model_executor"):
+            # This works either in V0 or in V1 with
+            # VLLM_ENABLE_V1_MULTIPROCESSING=0
+            executor = self.llm.llm_engine.model_executor
+            return executor.apply_model(func)
+
+        # This works in V1 with VLLM_ALLOW_INSECURE_SERIALIZATION=1
+        def _apply_model(self):
+            return func(self.get_model())
+
+        return self.llm.llm_engine.collective_rpc(_apply_model)
 
     def __enter__(self):
         return self
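The fallback branch above reaches the model through `collective_rpc`, which pickles the callback to ship it to the engine-core process; that is why the tests below opt in with `VLLM_ALLOW_INSECURE_SERIALIZATION=1`. A hedged usage sketch, where `runner` is a hypothetical `VllmRunner` instance and the counting function is illustrative:

    # Runs on each worker; the return values are collected into a list.
    def count_parameters(model):
        return sum(p.numel() for p in model.parameters())

    per_worker_totals = runner.apply_model(count_parameters)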

@@ -22,10 +22,12 @@ REVISION_ROBERTA = os.environ.get("REVISION", "main")
 
 @pytest.mark.skipif(current_platform.is_rocm(),
                     reason="Xformers backend is not supported on ROCm.")
-def test_model_loading_with_params(vllm_runner):
+def test_model_loading_with_params(vllm_runner, monkeypatch):
     """
     Test parameter weight loading with tp>1.
     """
+    # to use apply_model
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
     with vllm_runner(model_name=MODEL_NAME,
                      revision=REVISION,
                      dtype="float16",
@@ -61,10 +63,12 @@ def test_model_loading_with_params(vllm_runner):
 
 @pytest.mark.skipif(current_platform.is_rocm(),
                     reason="Xformers backend is not supported on ROCm.")
-def test_roberta_model_loading_with_params(vllm_runner):
+def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
     """
     Test parameter weight loading with tp>1.
     """
+    # to use apply_model
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
     with vllm_runner(model_name=MODEL_NAME_ROBERTA,
                      revision=REVISION_ROBERTA,
                      dtype="float16",
@@ -101,10 +105,12 @@ def test_roberta_model_loading_with_params(vllm_runner):
 
 @pytest.mark.skipif(current_platform.is_rocm(),
                     reason="Xformers backend is not supported on ROCm.")
-def test_facebook_roberta_model_loading_with_params(vllm_runner):
+def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch):
     """
     Test loading roberta-base model with no lm_head.
     """
+    # to use apply_model
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
     model_name = "FacebookAI/roberta-base"
     with vllm_runner(model_name=model_name,
                      dtype="float16",
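The `monkeypatch.setenv` calls above scope the variable to a single test rather than the whole session. A minimal self-contained illustration of that pytest behavior:

    import os

    def test_env_var_is_scoped(monkeypatch):
        monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
        assert os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] == "1"
        # pytest restores the previous environment when the test finishes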

@@ -39,17 +39,9 @@ def v1(run_with_both_engines):
     pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
                  marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
     # [Encoder-only]
-    pytest.param(
-        "BAAI/bge-base-en-v1.5",
-        marks=[
-            # CPU only supports V1
-            pytest.mark.core_model,
-            pytest.mark.skip_v1
-        ]),
-    pytest.param("sentence-transformers/all-MiniLM-L12-v2",
-                 marks=[pytest.mark.skip_v1]),
-    pytest.param("intfloat/multilingual-e5-small",
-                 marks=[pytest.mark.skip_v1]),
+    pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
+    pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
+    pytest.param("intfloat/multilingual-e5-small"),
     pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
                  marks=[pytest.mark.skip_v1]),
     # [Cross-Encoder]

@@ -23,6 +23,14 @@ RERANK_MODELS = [
 ]
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 @pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
 def test_embed_models_mteb(hf_runner, vllm_runner,
                            model_info: EmbedModelInfo) -> None:
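The `v1` fixture depends on `run_with_both_engines`, which lives in a conftest. A hedged sketch of what such a fixture could look like; this is an assumption for illustration, not the conftest code from this commit:

    import pytest

    @pytest.fixture(params=[False, True], ids=["v0", "v1"])
    def run_with_both_engines(request, monkeypatch):
        use_v1 = request.param
        # Honor per-test opt-outs such as @pytest.mark.skip_v1 / skip_v0
        skip_mark = "skip_v1" if use_v1 else "skip_v0"
        if request.node.get_closest_marker(skip_mark):
            pytest.skip(f"test is marked {skip_mark}")
        monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
        yield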

@@ -93,6 +93,7 @@ def create_common_attn_metadata(
         max_query_len=max_query_len,
         block_table_tensor=block_table_tensor,
         slot_mapping=slot_mapping,
+        causal=True,
     )
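`causal=True` keeps the previous decoder-style behavior as the explicit default for this test helper; encoder-only models are the case that needs `causal=False`. A hedged sketch of what the flag means for mask construction (illustrative, not vLLM's actual backend code):

    import torch

    def build_attn_bias(seq_len: int, causal: bool) -> torch.Tensor:
        # Causal (decoder) attention hides future positions; encoder-only
        # (bidirectional) attention leaves the full matrix visible.
        bias = torch.zeros(seq_len, seq_len)
        if causal:
            future = torch.triu(
                torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
            bias = bias.masked_fill(future, float("-inf"))
        return bias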

@@ -13,7 +13,6 @@ UNSUPPORTED_MODELS_V1 = [
     "openai/whisper-large-v3",  # transcription
     "facebook/bart-large-cnn",  # encoder decoder
     "state-spaces/mamba-130m-hf",  # mamba1
-    "BAAI/bge-m3",  # embedding
 ]
 
 MODEL = "meta-llama/Llama-3.2-1B-Instruct"

@@ -1,9 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import re
-
 import pytest
+import regex as re
 import requests
 import torch
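Aliasing the third-party `regex` package as `re` keeps existing call sites unchanged, since `regex` implements a superset of the stdlib `re` API. A small illustration of the drop-in property:

    import regex as re

    # Same call signature as stdlib re.search; `regex` adds features on top.
    assert re.search(r"(?i)encoder-only", "Supports Encoder-Only models")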