[Frontend] Use engine argument to control MM cache size (#22441)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-08 00:47:10 +08:00
parent 8c9da6be22
commit 139d155781
13 changed files with 101 additions and 47 deletions
--- a/examples/offline_inference/mistral-small.py
+++ b/examples/offline_inference/mistral-small.py
@@ -68,7 +68,7 @@ def run_simple_demo(args: argparse.Namespace):
        max_model_len=4096,
        max_num_seqs=2,
        tensor_parallel_size=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        mm_processor_cache_gb=0 if args.disable_mm_processor_cache else 4,
    )

    prompt = "Describe this image in one sentence."
@@ -105,7 +105,7 @@ def run_advanced_demo(args: argparse.Namespace):
        limit_mm_per_prompt={"image": max_img_per_msg},
        max_model_len=max_img_per_msg * max_tokens_per_img,
        tensor_parallel_size=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        mm_processor_cache_gb=0 if args.disable_mm_processor_cache else 4,
    )

    prompt = "Describe the following image."
@@ -164,7 +164,7 @@ def parse_args():
    )

    parser.add_argument(
-        "--disable-mm-preprocessor-cache",
+        "--disable-mm-processor-cache",
        action="store_true",
        help="If True, disables caching of multi-modal processor.",
    )
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -1563,7 +1563,7 @@ def parse_args():
    )

    parser.add_argument(
-        "--disable-mm-preprocessor-cache",
+        "--disable-mm-processor-cache",
        action="store_true",
        help="If True, disables caching of multi-modal processor.",
    )
@@ -1603,7 +1603,7 @@ def main(args):

    engine_args = asdict(req_data.engine_args) | {
        "seed": args.seed,
-        "disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache,
+        "mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
    }
    llm = LLM(**engine_args)