[Misc] refactor prompt embedding examples (#18405)

Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-20 23:26:12 +08:00
parent be48360c1f
commit 8f55962a7f
3 changed files with 191 additions and 102 deletions
--- a/examples/online_serving/prompt_embed_inference_with_openai_client.py
+++ b/examples/online_serving/prompt_embed_inference_with_openai_client.py
@ -0,0 +1,86 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+vLLM OpenAI-Compatible Client with Prompt Embeddings
+
+This script demonstrates how to:
+1. Generate prompt embeddings using Hugging Face Transformers
+2. Encode them in base64 format
+3. Send them to a vLLM server via the OpenAI-compatible Completions API
+
+Run the vLLM server first:
+vllm serve meta-llama/Llama-3.2-1B-Instruct \
+  --task generate \
+  --max-model-len 4096 \
+  --enable-prompt-embeds
+
+Run the client:
+python examples/online_serving/prompt_embed_inference_with_openai_client.py
+
+Model: meta-llama/Llama-3.2-1B-Instruct
+Note: This model is gated on Hugging Face Hub.
+      You must request access to use it:
+      https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
+
+Dependencies:
+- transformers
+- torch
+- openai
+"""
+import base64
+import io
+
+import torch
+import transformers
+from openai import OpenAI
+
+
+def main():
+    """Embed a chat prompt locally and complete it through a vLLM server.
+
+    Loads the tokenizer and model with Hugging Face Transformers, turns one
+    chat message into prompt embeddings, serializes the tensor with
+    ``torch.save``, base64-encodes the bytes, and sends them as
+    ``prompt_embeds`` to the OpenAI-compatible Completions endpoint at
+    ``http://localhost:8000/v1``. The generated text is printed to stdout.
+    """
+    client = OpenAI(
+        api_key="EMPTY",
+        base_url="http://localhost:8000/v1",
+    )
+
+    model_name = "meta-llama/Llama-3.2-1B-Instruct"
+
+    # Transformers
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
+    transformers_model = transformers.AutoModelForCausalLM.from_pretrained(
+        model_name)
+
+    # Refer to the HuggingFace repo for the correct format to use
+    chat = [{
+        "role": "user",
+        "content": "Please tell me about the capital of France."
+    }]
+    token_ids = tokenizer.apply_chat_template(chat,
+                                              add_generation_prompt=True,
+                                              return_tensors='pt')
+
+    embedding_layer = transformers_model.get_input_embeddings()
+    # Inference only: disable autograd so no graph is built and the tensor
+    # that gets serialized below is not flagged as requiring gradients.
+    with torch.no_grad():
+        # squeeze(0) drops the leading dimension when it has size 1
+        # (presumably the batch dimension added by return_tensors='pt').
+        prompt_embeds = embedding_layer(token_ids).squeeze(0)
+
+    # Prompt embeddings: serialize the tensor in memory, then base64-encode
+    # the raw bytes so they can travel inside the JSON request body.
+    buffer = io.BytesIO()
+    torch.save(prompt_embeds, buffer)
+    encoded_embeds = base64.b64encode(buffer.getvalue()).decode('utf-8')
+
+    completion = client.completions.create(
+        model=model_name,
+        # NOTE: The OpenAI client does not allow `None` as an input to
+        # `prompt`. Use an empty string if you have no text prompts.
+        prompt="",
+        max_tokens=5,
+        temperature=0.0,
+        # NOTE: The OpenAI client allows passing in extra JSON body via the
+        # `extra_body` argument.
+        extra_body={"prompt_embeds": encoded_embeds})
+
+    print("-" * 30)
+    print(completion.choices[0].text)
+    print("-" * 30)
+
+# Run the example only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()