[Core][VLM] Add precise multi-modal placeholder tracking (#8346)

Signed-off-by: Peter Salas <peter@fixie.ai>
2024-11-01 16:21:10 -07:00
parent d151fde834
commit 6c0b7f548d
53 changed files with 913 additions and 281 deletions
--- a/examples/offline_inference_audio_language.py
+++ b/examples/offline_inference_audio_language.py
@ -34,11 +34,7 @@ def run_ultravox(question: str, audio_count: int):
                                           tokenize=False,
                                           add_generation_prompt=True)

-    llm = LLM(model=model_name,
-              enforce_eager=True,
-              enable_chunked_prefill=False,
-              max_model_len=8192,
-              limit_mm_per_prompt={"audio": audio_count})
+    llm = LLM(model=model_name, limit_mm_per_prompt={"audio": audio_count})
    stop_token_ids = None
    return llm, prompt, stop_token_ids