[Model] Pooling model activation supports per-request control via PoolingParams (#20538)
Signed-off-by: wang.yuqi <noooop@126.com>
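
The tests below exercise three per-request switches on PoolingParams: activation (classify/score), normalize (embed), and softmax (reward). A minimal offline sketch of the control this enables, not part of the diff, assuming a build that includes this change:

    from vllm import LLM, PoolingParams

    llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach")

    # Per request: skip the activation function and return raw scores;
    # passing None (the default) keeps the model's usual behavior.
    outputs = llm.classify(["The chef prepared a delicious meal."],
                           pooling_params=PoolingParams(activation=False),
                           use_tqdm=False)
    raw_scores = outputs[0].outputs.probs  # pre-activation, since activation=False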
tests/entrypoints/llm/test_classify.py  (new file, 67 lines)
@@ -0,0 +1,67 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import weakref

import pytest
import torch

from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory

from ...models.utils import softmax

MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"

prompts = ["The chef prepared a delicious meal."]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


@pytest.fixture(scope="module")
def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,
              enforce_eager=True,
              seed=0)

    with llm.deprecate_legacy_api():
        yield weakref.proxy(llm)

    del llm

    cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):

    def get_outputs(activation):
        outputs = llm.classify(
            prompts,
            pooling_params=PoolingParams(activation=activation),
            use_tqdm=False)
        return torch.tensor([x.outputs.probs for x in outputs])

    default = get_outputs(activation=None)
    w_activation = get_outputs(activation=True)
    wo_activation = get_outputs(activation=False)

    assert torch.allclose(default, w_activation,
                          atol=1e-2), "Default should use activation."
    assert not torch.allclose(
        w_activation, wo_activation,
        atol=1e-2), "wo_activation should not use activation."
    assert torch.allclose(
        softmax(wo_activation), w_activation, atol=1e-2
    ), "w_activation should be close to activation(wo_activation)."
tests/entrypoints/llm/test_embedding.py  (new file, 56 lines)
@@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import weakref

import pytest
import torch
import torch.nn.functional as F

from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory

MODEL_NAME = "intfloat/multilingual-e5-small"

prompts = ["The chef prepared a delicious meal."]


@pytest.fixture(scope="module")
def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,
              enforce_eager=True,
              seed=0)

    with llm.deprecate_legacy_api():
        yield weakref.proxy(llm)

    del llm

    cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):

    def get_outputs(normalize):
        outputs = llm.embed(prompts,
                            pooling_params=PoolingParams(normalize=normalize),
                            use_tqdm=False)
        return torch.tensor([x.outputs.embedding for x in outputs])

    default = get_outputs(normalize=None)
    w_normal = get_outputs(normalize=True)
    wo_normal = get_outputs(normalize=False)

    assert torch.allclose(default, w_normal,
                          atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal,
                              atol=1e-2), "wo_normal should not use normal."
    assert torch.allclose(
        w_normal, F.normalize(wo_normal, p=2, dim=-1),
        atol=1e-2), "w_normal should be close to normal(wo_normal)."
tests/entrypoints/llm/test_reward.py  (new file, 66 lines)
@@ -0,0 +1,66 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import weakref

import pytest
import torch

from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory

from ...models.utils import softmax

MODEL_NAME = "internlm/internlm2-1_8b-reward"

prompts = ["The chef prepared a delicious meal."]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


@pytest.fixture(scope="module")
def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,
              enforce_eager=True,
              trust_remote_code=True,
              seed=0)

    with llm.deprecate_legacy_api():
        yield weakref.proxy(llm)

    del llm

    cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):

    def get_outputs(softmax):
        outputs = llm.reward(prompts,
                             pooling_params=PoolingParams(softmax=softmax),
                             use_tqdm=False)
        return torch.cat([x.outputs.data for x in outputs])

    default = get_outputs(softmax=None)
    w_softmax = get_outputs(softmax=True)
    wo_softmax = get_outputs(softmax=False)

    assert torch.allclose(default, w_softmax,
                          atol=1e-2), "Default should use softmax."
    assert not torch.allclose(w_softmax, wo_softmax,
                              atol=1e-2), "wo_softmax should not use softmax."
    assert torch.allclose(
        softmax(wo_softmax), w_softmax,
        atol=1e-2), "w_softmax should be close to softmax(wo_softmax)."
tests/entrypoints/llm/test_score.py  (new file, 69 lines)
@@ -0,0 +1,69 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import weakref

import pytest
import torch

from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory

from ...models.utils import softmax

MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


@pytest.fixture(scope="module")
def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,
              enforce_eager=True,
              seed=0)

    with llm.deprecate_legacy_api():
        yield weakref.proxy(llm)

    del llm

    cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):

    def get_outputs(activation):
        text_1 = "What is the capital of France?"
        text_2 = "The capital of France is Paris."

        outputs = llm.score(
            text_1,
            text_2,
            pooling_params=PoolingParams(activation=activation),
            use_tqdm=False)
        return torch.tensor([x.outputs.score for x in outputs])

    default = get_outputs(activation=None)
    w_activation = get_outputs(activation=True)
    wo_activation = get_outputs(activation=False)

    assert torch.allclose(default, w_activation,
                          atol=1e-2), "Default should use activation."
    assert not torch.allclose(
        w_activation, wo_activation,
        atol=1e-2), "wo_activation should not use activation."
    assert torch.allclose(
        softmax(wo_activation), w_activation, atol=1e-2
    ), "w_activation should be close to activation(wo_activation)."
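
The remaining hunks extend the OpenAI-compatible server tests: the same switches travel as top-level JSON fields of the classify, embeddings, rerank, and score requests. A hedged client-side sketch (request shape taken from the tests below; the server address is an assumption, since the tests use RemoteOpenAIServer):

    import requests

    base_url = "http://localhost:8000"  # assumed server address
    response = requests.post(
        f"{base_url}/classify",
        json={
            "model": "jason9693/Qwen2.5-1.5B-apeach",
            "input": ["This product was excellent and exceeded my expectations"],
            "activation": False,  # per-request: return pre-activation scores
        })
    probs = [x["probs"] for x in response.json()["data"]]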
@@ -3,6 +3,8 @@

import pytest
import requests
import torch
import torch.nn.functional as F

from vllm.entrypoints.openai.protocol import ClassificationResponse
@@ -181,3 +183,32 @@ async def test_invocations(server: RemoteOpenAIServer):
    assert classification_data.keys() == invocation_data.keys()
    assert classification_data["probs"] == pytest.approx(
        invocation_data["probs"], rel=0.01)


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_activation(server: RemoteOpenAIServer, model_name: str):
    input_text = ["This product was excellent and exceeded my expectations"]

    async def get_outputs(activation):
        response = requests.post(server.url_for("classify"),
                                 json={
                                     "model": model_name,
                                     "input": input_text,
                                     "activation": activation
                                 })
        outputs = response.json()
        return torch.tensor([x['probs'] for x in outputs["data"]])

    default = await get_outputs(activation=None)
    w_activation = await get_outputs(activation=True)
    wo_activation = await get_outputs(activation=False)

    assert torch.allclose(default, w_activation,
                          atol=1e-2), "Default should use activation."
    assert not torch.allclose(
        w_activation, wo_activation,
        atol=1e-2), "wo_activation should not use activation."
    assert torch.allclose(
        F.softmax(wo_activation, dim=-1), w_activation, atol=1e-2
    ), "w_activation should be close to activation(wo_activation)."
@@ -8,6 +8,8 @@ import openai
import pytest
import pytest_asyncio
import requests
import torch
import torch.nn.functional as F

from vllm.entrypoints.openai.protocol import EmbeddingResponse
from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -369,3 +371,35 @@ async def test_invocations_conversation(server: RemoteOpenAIServer):
        embeddings_1_lst=[invocation_data["embedding"]],
        name_0="chat",
        name_1="invocation")


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_normalize(server: RemoteOpenAIServer, model_name: str):
    input_text = ["The chef prepared a delicious meal."]

    async def get_outputs(normalize):
        request_args = {
            "model": MODEL_NAME,
            "input": input_text,
            "encoding_format": "float",
            "normalize": normalize
        }

        response = requests.post(server.url_for("v1/embeddings"),
                                 json=request_args)
        outputs = response.json()

        return torch.tensor([x['embedding'] for x in outputs["data"]])

    default = await get_outputs(normalize=None)
    w_normal = await get_outputs(normalize=True)
    wo_normal = await get_outputs(normalize=False)

    assert torch.allclose(default, w_normal,
                          atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal,
                              atol=1e-2), "wo_normal should not use normal."
    assert torch.allclose(
        w_normal, F.normalize(wo_normal, p=2, dim=-1),
        atol=1e-2), "w_normal should be close to normal(wo_normal)."
@@ -3,6 +3,8 @@

import pytest
import requests
import torch
import torch.nn.functional as F

from vllm.entrypoints.openai.protocol import RerankResponse
@@ -125,3 +127,39 @@ def test_invocations(server: RemoteOpenAIServer):
    assert rerank_result.keys() == invocations_result.keys()
    assert rerank_result["relevance_score"] == pytest.approx(
        invocations_result["relevance_score"], rel=0.01)


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_activation(server: RemoteOpenAIServer, model_name: str):

    async def get_outputs(activation):
        query = "What is the capital of France?"
        documents = [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris."
        ]

        response = requests.post(server.url_for("rerank"),
                                 json={
                                     "model": model_name,
                                     "query": query,
                                     "documents": documents,
                                     "activation": activation
                                 })
        outputs = response.json()

        return torch.tensor([x['relevance_score'] for x in outputs["results"]])

    default = await get_outputs(activation=None)
    w_activation = await get_outputs(activation=True)
    wo_activation = await get_outputs(activation=False)

    assert torch.allclose(default, w_activation,
                          atol=1e-2), "Default should use activation."
    assert not torch.allclose(
        w_activation, wo_activation,
        atol=1e-2), "wo_activation should not use activation."
    assert torch.allclose(
        F.sigmoid(wo_activation), w_activation, atol=1e-2
    ), "w_activation should be close to activation(wo_activation)."
@@ -4,6 +4,7 @@ from typing import Any

import pytest
import requests
import torch
import torch.nn.functional as F
from torch import tensor
@@ -220,3 +221,43 @@ class TestModel:
        assert score_data.keys() == invocation_data.keys()
        assert score_data["score"] == pytest.approx(
            invocation_data["score"], rel=0.01)

    def test_activation(self, server: RemoteOpenAIServer, model: dict[str,
                                                                      Any]):

        def get_outputs(activation):
            text_1 = "What is the capital of France?"
            text_2 = "The capital of France is Paris."
            response = requests.post(server.url_for("score"),
                                     json={
                                         "model": model["name"],
                                         "text_1": text_1,
                                         "text_2": text_2,
                                         "activation": activation
                                     })
            if response.status_code != 200:
                return response

            outputs = response.json()
            return torch.tensor([x['score'] for x in outputs["data"]])

        if model["is_cross_encoder"]:

            default = get_outputs(activation=None)
            w_activation = get_outputs(activation=True)
            wo_activation = get_outputs(activation=False)

            assert torch.allclose(default, w_activation,
                                  atol=1e-2), "Default should use activation."
            assert not torch.allclose(
                w_activation, wo_activation,
                atol=1e-2), "wo_activation should not use activation."
            assert torch.allclose(
                F.sigmoid(wo_activation), w_activation, atol=1e-2
            ), "w_activation should be close to activation(wo_activation)."
        else:
            get_outputs(activation=None)

            # The activation parameter only works for the is_cross_encoder model
            response = get_outputs(activation=True)
            assert response.status_code == 400