[Model] Expose size to Idefics3 as mm_processor_kwargs (#10146)

Signed-off-by: Isotr0py <2037008807@qq.com>
2024-11-08 17:56:58 +08:00
parent f10797c0ce
commit 1ff4aed5bd
4 changed files with 271 additions and 24 deletions
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -382,10 +382,19 @@ def run_idefics3(question: str, modality: str):
    assert modality == "image"
    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

-    llm = LLM(model=model_name,
-              max_model_len=8192,
-              max_num_seqs=2,
-              enforce_eager=True)
+    llm = LLM(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=2,
+        enforce_eager=True,
+        # If you are running out of memory, you can reduce "longest_edge".
+        # See: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
+        mm_processor_kwargs={
+            "size": {
+                "longest_edge": 3 * 364
+            },
+        },
+    )
    prompt = (
        f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
    )
@@ -518,4 +527,4 @@ if __name__ == "__main__":
                        default=16,
                        help='Number of frames to extract from the video.')
    args = parser.parse_args()
-    main(args)
+    main(args)