[Misc] IO Processor plugins for pooling models (#22820)

Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Max de Bayser <mbayser@br.ibm.com>
This commit is contained in:
Christian Pinto
2025-09-01 07:07:12 +01:00
committed by GitHub
parent 437c3ce026
commit 1cb39dbcdd
25 changed files with 1183 additions and 43 deletions

View File

@ -6,7 +6,8 @@
import json
import time
from http import HTTPStatus
from typing import Annotated, Any, ClassVar, Literal, Optional, Union
from typing import (Annotated, Any, ClassVar, Generic, Literal, Optional,
TypeVar, Union)
import regex as re
import torch
@ -1405,7 +1406,46 @@ EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest]
# Pooling endpoints reuse the embedding request schemas; the aliases let the
# pooling API evolve independently of the embedding API later.
PoolingCompletionRequest = EmbeddingCompletionRequest
PoolingChatRequest = EmbeddingChatRequest
# NOTE: this union is re-assigned further down in this file to additionally
# include IOProcessorRequest; that later assignment is the effective one.
PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest]
# Generic payload type for the IOProcessor request/response models below:
# the plugin, not the server, defines the concrete data type.
T = TypeVar("T")
class IOProcessorRequest(OpenAIBaseModel, Generic[T]):
    """Request schema for pooling models served via an IOProcessor plugin.

    The request payload is generic (``T``) because the plugin itself, not
    the server, interprets the input data.
    """
    # Served model name; optional so the server's default model can be used.
    model: Optional[str] = None
    priority: int = Field(default=0)
    """
    The priority of the request (lower means earlier handling;
    default: 0). Any priority other than 0 will raise an error
    if the served model does not use priority scheduling.
    """
    data: T
    """
    When using IOProcessor plugins, the actual input is processed
    by the plugin itself. Hence, we use a generic type for the request data.
    """

    def to_pooling_params(self):
        # IOProcessor requests always map to the "encode" pooling task.
        return PoolingParams(task="encode")
class IOProcessorResponse(OpenAIBaseModel, Generic[T]):
    """Response schema for pooling models served via an IOProcessor plugin.

    The response payload is generic (``T``) because the plugin itself, not
    the server, produces the output data.
    """
    request_id: Optional[str] = None
    """
    The request_id associated with this response
    """
    # Unix timestamp (seconds) captured when the response object is created.
    created_at: int = Field(default_factory=lambda: int(time.time()))
    data: T
    """
    When using IOProcessor plugins, the actual output is generated
    by the plugin itself. Hence, we use a generic type for the response data.
    """
# Widen PoolingRequest to also accept plugin-driven IOProcessor requests.
# This assignment supersedes the earlier PoolingRequest alias above.
PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest,
                       IOProcessorRequest]
class ScoreRequest(OpenAIBaseModel):