[Model] Add UltravoxModel and UltravoxConfig (#7615)

2024-08-21 15:49:39 -07:00
parent dd53c4b023
commit 1ca0d4f86b
33 changed files with 1087 additions and 261 deletions
--- a/examples/offline_inference_audio_language.py
+++ b/examples/offline_inference_audio_language.py
@ -0,0 +1,97 @@
+"""
+This example shows how to use vLLM for running offline inference 
+with the correct prompt format on vision language models.
+
+For most models, the prompt format should follow corresponding examples
+on HuggingFace model repository.
+"""
+from transformers import AutoTokenizer
+
+from vllm import LLM, SamplingParams
+from vllm.assets.audio import AudioAsset
+from vllm.utils import FlexibleArgumentParser
+
+# Input audio and question
+audio_and_sample_rate = AudioAsset("mary_had_lamb").audio_and_sample_rate
+question = "What is recited in the audio?"
+
+
+# Ultravox 0.3
+def run_ultravox(question):
+    model_name = "fixie-ai/ultravox-v0_3"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    messages = [{
+        'role': 'user',
+        'content': f"<|reserved_special_token_0|>\n{question}"
+    }]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    llm = LLM(model=model_name)
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
+model_example_map = {
+    "ultravox": run_ultravox,
+}
+
+
+def main(args):
+    model = args.model_type
+    if model not in model_example_map:
+        raise ValueError(f"Model type {model} is not supported.")
+
+    llm, prompt, stop_token_ids = model_example_map[model](question)
+
+    # We set temperature to 0.2 so that outputs can be different
+    # even when all prompts are identical when running batch inference.
+    sampling_params = SamplingParams(temperature=0.2,
+                                     max_tokens=64,
+                                     stop_token_ids=stop_token_ids)
+
+    assert args.num_prompts > 0
+    if args.num_prompts == 1:
+        # Single inference
+        inputs = {
+            "prompt": prompt,
+            "multi_modal_data": {
+                "audio": audio_and_sample_rate
+            },
+        }
+
+    else:
+        # Batch inference
+        inputs = [{
+            "prompt": prompt,
+            "multi_modal_data": {
+                "audio": audio_and_sample_rate
+            },
+        } for _ in range(args.num_prompts)]
+
+    outputs = llm.generate(inputs, sampling_params=sampling_params)
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description='Demo on using vLLM for offline inference with '
+        'audio language models')
+    parser.add_argument('--model-type',
+                        '-m',
+                        type=str,
+                        default="ultravox",
+                        choices=model_example_map.keys(),
+                        help='Huggingface "model_type".')
+    parser.add_argument('--num-prompts',
+                        type=int,
+                        default=1,
+                        help='Number of prompts to run.')
+
+    args = parser.parse_args()
+    main(args)
--- a/examples/openai_audio_api_client.py
+++ b/examples/openai_audio_api_client.py
@ -0,0 +1,90 @@
+"""An example showing how to use vLLM to serve VLMs.
+
+Launch the vLLM server with the following command:
+vllm serve fixie-ai/ultravox-v0_3
+"""
+import base64
+
+import requests
+from openai import OpenAI
+
+from vllm.assets.audio import AudioAsset
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    # defaults to os.environ.get("OPENAI_API_KEY")
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+models = client.models.list()
+model = models.data[0].id
+
+# Any format supported by librosa is supported
+audio_url = AudioAsset("winning_call").url
+
+# Use audio url in the payload
+chat_completion_from_url = client.chat.completions.create(
+    messages=[{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "What's in this audio?"
+            },
+            {
+                "type": "audio_url",
+                "audio_url": {
+                    "url": audio_url
+                },
+            },
+        ],
+    }],
+    model=model,
+    max_tokens=64,
+)
+
+result = chat_completion_from_url.choices[0].message.content
+print(f"Chat completion output:{result}")
+
+
+# Use base64 encoded audio in the payload
+def encode_audio_base64_from_url(audio_url: str) -> str:
+    """Encode an audio retrieved from a remote url to base64 format."""
+
+    with requests.get(audio_url) as response:
+        response.raise_for_status()
+        result = base64.b64encode(response.content).decode('utf-8')
+
+    return result
+
+
+audio_base64 = encode_audio_base64_from_url(audio_url=audio_url)
+chat_completion_from_base64 = client.chat.completions.create(
+    messages=[{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "What's in this audio?"
+            },
+            {
+                "type": "audio_url",
+                "audio_url": {
+                    # Any format supported by librosa is supported
+                    "url": f"data:audio/ogg;base64,{audio_base64}"
+                },
+            },
+        ],
+    }],
+    model=model,
+    max_tokens=64,
+)
+
+result = chat_completion_from_base64.choices[0].message.content
+print(f"Chat completion output:{result}")