[Doc] Improve MM Pooling model documentation (#25966)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-01 02:58:29 +08:00
parent e6a226efba
commit 2f652e6cdf
9 changed files with 292 additions and 100 deletions
--- a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
@@ -4,69 +4,137 @@
 """Example Python client for multimodal embedding API using vLLM API server
 NOTE:
    start a supported multimodal embeddings model server with `vllm serve`, e.g.
-    vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling --trust_remote_code --max_model_len=1024
+    vllm serve TIGER-Lab/VLM2Vec-Full \
+        --runner pooling \
+        --trust-remote-code \
+        --max-model-len 4096 \
+        --chat-template examples/template_vlm2vec_phi3v.jinja
 """

 import argparse
 import base64
 import io
+from typing import Literal, Union

-import requests
+from openai import OpenAI
+from openai._types import NOT_GIVEN, NotGiven
+from openai.types.chat import ChatCompletionMessageParam
+from openai.types.create_embedding_response import CreateEmbeddingResponse
 from PIL import Image

+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
 image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"


-def vlm2vec():
-    response = requests.post(
-        "http://localhost:8000/v1/embeddings",
-        json={
-            "model": "TIGER-Lab/VLM2Vec-Full",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image_url", "image_url": {"url": image_url}},
-                        {"type": "text", "text": "Represent the given image."},
-                    ],
-                }
-            ],
-            "encoding_format": "float",
-        },
+def create_chat_embeddings(
+    client: OpenAI,
+    *,
+    messages: list[ChatCompletionMessageParam],
+    model: str,
+    encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
+) -> CreateEmbeddingResponse:
+    """
+    Convenience function for accessing vLLM's Chat Embeddings API,
+    which is an extension of OpenAI's existing Embeddings API.
+    """
+    return client.post(
+        "/embeddings",
+        cast_to=CreateEmbeddingResponse,
+        body={"messages": messages, "model": model, "encoding_format": encoding_format},
    )
-    response.raise_for_status()
-    response_json = response.json()
-
-    print("Embedding output:", response_json["data"][0]["embedding"])


-def dse_qwen2_vl(inp: dict):
-    # Embedding an Image
-    if inp["type"] == "image":
-        messages = [
+def run_vlm2vec(client: OpenAI, model: str):
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "Represent the given image."},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Image embedding output:", response.data[0].embedding)
+
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {
+                        "type": "text",
+                        "text": "Represent the given image with the following question: What is in the image.",
+                    },
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Image+Text embedding output:", response.data[0].embedding)
+
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "A cat and a dog"},
+                ],
+            }
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Text embedding output:", response.data[0].embedding)
+
+
+def run_dse_qwen2_vl(client: OpenAI, model: str):
+    response = create_chat_embeddings(
+        client,
+        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
-                            "url": inp["image_url"],
+                            "url": image_url,
                        },
                    },
                    {"type": "text", "text": "What is shown in this image?"},
                ],
            }
-        ]
-    # Embedding a Text Query
-    else:
-        # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
-        # of the minimum input size
-        buffer = io.BytesIO()
-        image_placeholder = Image.new("RGB", (56, 56))
-        image_placeholder.save(buffer, "png")
-        buffer.seek(0)
-        image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
-        messages = [
+        ],
+        model=model,
+        encoding_format="float",
+    )
+
+    print("Image embedding output:", response.data[0].embedding)
+
+    # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
+    # of the minimum input size
+    buffer = io.BytesIO()
+    image_placeholder = Image.new("RGB", (56, 56))
+    image_placeholder.save(buffer, "png")
+    buffer.seek(0)
+    image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
+    response = create_chat_embeddings(
+        client,
+        messages=[
            {
                "role": "user",
                "content": [
@@ -76,23 +144,21 @@ def dse_qwen2_vl(inp: dict):
                            "url": f"data:image/jpeg;base64,{image_placeholder}",
                        },
                    },
-                    {"type": "text", "text": f"Query: {inp['content']}"},
+                    {"type": "text", "text": "Query: What is the weather like today?"},
                ],
            }
-        ]
-
-    response = requests.post(
-        "http://localhost:8000/v1/embeddings",
-        json={
-            "model": "MrLight/dse-qwen2-2b-mrl-v1",
-            "messages": messages,
-            "encoding_format": "float",
-        },
+        ],
+        model=model,
+        encoding_format="float",
    )
-    response.raise_for_status()
-    response_json = response.json()

-    print("Embedding output:", response_json["data"][0]["embedding"])
+    print("Text embedding output:", response.data[0].embedding)
+
+
+model_example_map = {
+    "vlm2vec": run_vlm2vec,
+    "dse_qwen2_vl": run_dse_qwen2_vl,
+}


 def parse_args():
@@ -103,29 +169,24 @@ def parse_args():
    parser.add_argument(
        "--model",
        type=str,
-        choices=["vlm2vec", "dse_qwen2_vl"],
+        choices=model_example_map.keys(),
        required=True,
-        help="Which model to call.",
+        help="The name of the embedding model.",
    )
    return parser.parse_args()


 def main(args):
-    if args.model == "vlm2vec":
-        vlm2vec()
-    elif args.model == "dse_qwen2_vl":
-        dse_qwen2_vl(
-            {
-                "type": "image",
-                "image_url": image_url,
-            }
-        )
-        dse_qwen2_vl(
-            {
-                "type": "text",
-                "content": "What is the weather like today?",
-            }
-        )
+    client = OpenAI(
+        # If api_key is omitted, it defaults to os.environ.get("OPENAI_API_KEY")
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    models = client.models.list()
+    model_id = models.data[0].id
+
+    model_example_map[args.model](client, model_id)


 if __name__ == "__main__":