[Doc] Improve MM Pooling model documentation (#25966)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
@@ -4,69 +4,137 @@
 """Example Python client for multimodal embedding API using vLLM API server
 NOTE:
     start a supported multimodal embeddings model server with `vllm serve`, e.g.
-    vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling --trust_remote_code --max_model_len=1024
+    vllm serve TIGER-Lab/VLM2Vec-Full \
+        --runner pooling \
+        --trust-remote-code \
+        --max-model-len 4096 \
+        --chat-template examples/template_vlm2vec_phi3v.jinja
 """
 
 import argparse
 import base64
 import io
+from typing import Literal, Union
 
-import requests
+from openai import OpenAI
+from openai._types import NOT_GIVEN, NotGiven
+from openai.types.chat import ChatCompletionMessageParam
+from openai.types.create_embedding_response import CreateEmbeddingResponse
 from PIL import Image
 
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
 image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
 
 
-def vlm2vec():
-    response = requests.post(
-        "http://localhost:8000/v1/embeddings",
-        json={
-            "model": "TIGER-Lab/VLM2Vec-Full",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image_url", "image_url": {"url": image_url}},
-                        {"type": "text", "text": "Represent the given image."},
-                    ],
-                }
-            ],
-            "encoding_format": "float",
-        },
+def create_chat_embeddings(
+    client: OpenAI,
+    *,
+    messages: list[ChatCompletionMessageParam],
+    model: str,
+    encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
+) -> CreateEmbeddingResponse:
+    """
+    Convenience function for accessing vLLM's Chat Embeddings API,
+    which is an extension of OpenAI's existing Embeddings API.
+    """
+    return client.post(
+        "/embeddings",
+        cast_to=CreateEmbeddingResponse,
+        body={"messages": messages, "model": model, "encoding_format": encoding_format},
     )
-    response.raise_for_status()
-    response_json = response.json()
 
-    print("Embedding output:", response_json["data"][0]["embedding"])
 
-
-def dse_qwen2_vl(inp: dict):
-    # Embedding an Image
-    if inp["type"] == "image":
-        messages = [
+def run_vlm2vec(client: OpenAI, model: str):
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "Represent the given image."},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Image embedding output:", response.data[0].embedding)
+
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {
+                        "type": "text",
+                        "text": "Represent the given image with the following question: What is in the image.",
+                    },
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Image+Text embedding output:", response.data[0].embedding)
+
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "A cat and a dog"},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Text embedding output:", response.data[0].embedding)
+
+
+def run_dse_qwen2_vl(client: OpenAI, model: str):
+    response = create_chat_embeddings(
+        client,
+        messages=[
             {
                 "role": "user",
                 "content": [
                     {
                         "type": "image_url",
                         "image_url": {
-                            "url": inp["image_url"],
+                            "url": image_url,
                         },
                     },
                     {"type": "text", "text": "What is shown in this image?"},
                 ],
             }
-        ]
-    # Embedding a Text Query
-    else:
-        # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
-        # of the minimum input size
-        buffer = io.BytesIO()
-        image_placeholder = Image.new("RGB", (56, 56))
-        image_placeholder.save(buffer, "png")
-        buffer.seek(0)
-        image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
-        messages = [
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Image embedding output:", response.data[0].embedding)
+
+    # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
+    # of the minimum input size
+    buffer = io.BytesIO()
+    image_placeholder = Image.new("RGB", (56, 56))
+    image_placeholder.save(buffer, "png")
+    buffer.seek(0)
+    image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
+    response = create_chat_embeddings(
+        client,
+        messages=[
             {
                 "role": "user",
                 "content": [
@@ -76,23 +144,21 @@ def dse_qwen2_vl(inp: dict):
                             "url": f"data:image/jpeg;base64,{image_placeholder}",
                         },
                     },
-                    {"type": "text", "text": f"Query: {inp['content']}"},
+                    {"type": "text", "text": "Query: What is the weather like today?"},
                 ],
             }
-        ]
-
-    response = requests.post(
-        "http://localhost:8000/v1/embeddings",
-        json={
-            "model": "MrLight/dse-qwen2-2b-mrl-v1",
-            "messages": messages,
-            "encoding_format": "float",
-        },
+        ],
+        model=model,
+        encoding_format="float",
     )
-    response.raise_for_status()
-    response_json = response.json()
 
-    print("Embedding output:", response_json["data"][0]["embedding"])
+    print("Text embedding output:", response.data[0].embedding)
 
 
+model_example_map = {
+    "vlm2vec": run_vlm2vec,
+    "dse_qwen2_vl": run_dse_qwen2_vl,
+}
+
+
 def parse_args():
@@ -103,29 +169,24 @@ def parse_args():
     parser.add_argument(
         "--model",
         type=str,
-        choices=["vlm2vec", "dse_qwen2_vl"],
+        choices=model_example_map.keys(),
         required=True,
-        help="Which model to call.",
+        help="The name of the embedding model.",
     )
     return parser.parse_args()
 
 
 def main(args):
-    if args.model == "vlm2vec":
-        vlm2vec()
-    elif args.model == "dse_qwen2_vl":
-        dse_qwen2_vl(
-            {
-                "type": "image",
-                "image_url": image_url,
-            }
-        )
-        dse_qwen2_vl(
-            {
-                "type": "text",
-                "content": "What is the weather like today?",
-            }
-        )
+    client = OpenAI(
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    models = client.models.list()
+    model_id = models.data[0].id
+
+    model_example_map[args.model](client, model_id)
 
 
 if __name__ == "__main__":
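
For readers comparing the two versions: the new create_chat_embeddings helper issues the same HTTP request that the removed requests-based code sent by hand, against vLLM's Chat Embeddings API (an extension of OpenAI's Embeddings API that accepts chat-style messages). A minimal raw-HTTP sketch, not part of the commit, assuming a server started as in the docstring is running on localhost:8000:

    import requests

    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

    # Equivalent of create_chat_embeddings(client, messages=..., model=...,
    # encoding_format="float") for a server serving TIGER-Lab/VLM2Vec-Full.
    response = requests.post(
        "http://localhost:8000/v1/embeddings",
        json={
            "model": "TIGER-Lab/VLM2Vec-Full",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": image_url}},
                        {"type": "text", "text": "Represent the given image."},
                    ],
                }
            ],
            "encoding_format": "float",
        },
    )
    response.raise_for_status()
    print("Embedding output:", response.json()["data"][0]["embedding"])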
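
The helper's encoding_format parameter also accepts "base64", which the example never exercises. A sketch of decoding such a response, assuming the server follows OpenAI's convention of returning each embedding as a base64-encoded buffer of little-endian float32 values (the helper name is ours):

    import base64
    import struct

    def decode_base64_embedding(data: str) -> list[float]:
        # Assumes the OpenAI convention: base64 over packed
        # little-endian float32 values, 4 bytes per dimension.
        raw = base64.b64decode(data)
        return list(struct.unpack(f"<{len(raw) // 4}f", raw))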
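
Neither version shows what to do with the printed embeddings. Models like VLM2Vec and DSE are typically used for multimodal retrieval, ranking documents by cosine similarity between a query embedding and candidate embeddings; a minimal sketch, not part of the commit:

    import math

    def cosine_similarity(a: list[float], b: list[float]) -> float:
        # e.g. score a text query from run_dse_qwen2_vl against a
        # previously embedded image.
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = math.sqrt(sum(x * x for x in a))
        norm_b = math.sqrt(sum(y * y for y in b))
        return dot / (norm_a * norm_b)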