[VLM] Optimize GLM4.5-V-style video processing to decode only the necessary frames (#24161)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@ -5,6 +5,7 @@ import pytest
|
||||
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
|
||||
|
||||
from ...utils import build_model_context
|
||||
|
||||
@ -50,3 +51,49 @@ def test_processor_override(
|
||||
|
||||
assert grid_t == expected_grid_t
|
||||
assert video_tok_count == expected_toks_per_frame * grid_t
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("fps", [2])
def test_video_loader_consistency(
    model_id: str,
    fps: int,
):
    """
    Check that the dynamic video loader (frames pre-sampled while decoding)
    and the plain video loader (frames post-sampled by the processor) yield
    identical multimodal processing outputs.
    """
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"video": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    mm_kwargs = {"fps": fps}

    prompt = "<|begin_of_video|><|video|><|end_of_video|>"

    # Read the raw video file once; both backends decode from the same bytes.
    video_path = VideoAsset(name="baby_reading", num_frames=-1).video_path
    with open(video_path, "rb") as src:
        raw_bytes = src.read()

    full_video, full_meta = OpenCVVideoBackend.load_bytes(raw_bytes)
    sampled_video, sampled_meta = OpenCVDynamicVideoBackend.load_bytes(
        raw_bytes, requested_fps=fps)

    # The dynamic backend should decode only a subset of the frames.
    assert len(sampled_video) < len(full_video)

    full_outputs = processor.apply(
        prompt,
        {"video": [(full_video, full_meta)]},
        mm_kwargs,
    )
    sampled_outputs = processor.apply(
        prompt,
        {"video": [(sampled_video, sampled_meta)]},
        mm_kwargs,
    )

    # Same prompt tokens and same processed video tensors either way.
    assert (full_outputs["prompt_token_ids"] ==
            sampled_outputs["prompt_token_ids"])
    assert (full_outputs["mm_kwargs"].get_data() ==
            sampled_outputs["mm_kwargs"].get_data())
|
||||
|
||||
@ -204,6 +204,32 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
|
||||
assert metadata_sync == metadata_async
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("max_duration", [1, 60, 1800])
@pytest.mark.parametrize("requested_fps", [2, 24])
async def test_fetch_video_http_with_dynamic_loader(
        video_url: str, max_duration: int, requested_fps: int,
        monkeypatch: pytest.MonkeyPatch):
    """Fetch a video over HTTP with the dynamic loader backend enabled and
    verify the sync and async fetch paths produce identical results."""
    with monkeypatch.context() as patched:
        patched.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")

        io_kwargs = {
            "video": {
                "max_duration": max_duration,
                "requested_fps": requested_fps,
            }
        }
        connector = MediaConnector(media_io_kwargs=io_kwargs)

        frames_sync, meta_sync = connector.fetch_video(video_url)
        frames_async, meta_async = await connector.fetch_video_async(
            video_url)

        # Both code paths must agree frame-for-frame and on metadata,
        # and the dynamic backend must actually have been selected.
        assert np.array_equal(frames_sync, frames_async)
        assert meta_sync == meta_async
        assert meta_sync["video_backend"] == "opencv_dynamic"
|
||||
|
||||
|
||||
# Used for `test_argsort_mm_positions`.
|
||||
class TestCase(NamedTuple):
|
||||
mm_positions: "MultiModalPlaceholderDict"
|
||||
|
||||
Reference in New Issue
Block a user