[VLM] Optimize GLM4.5-V-style video processing to decode only the necessary frames (#24161)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@ -5,6 +5,7 @@ import pytest
|
||||
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
|
||||
|
||||
from ...utils import build_model_context
|
||||
|
||||
@ -50,3 +51,49 @@ def test_processor_override(
|
||||
|
||||
assert grid_t == expected_grid_t
|
||||
assert video_tok_count == expected_toks_per_frame * grid_t
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("fps", [2])
def test_video_loader_consistency(
    model_id: str,
    fps: int,
):
    """
    Check that the dynamic video loader (frames pre-sampled while decoding)
    and the plain video loader (frames post-sampled by the processor) yield
    identical multimodal processing outputs.
    """
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"video": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    mm_kwargs = {"fps": fps}

    prompt = "<|begin_of_video|><|video|><|end_of_video|>"

    # Read the raw video file once; both backends decode from the same bytes.
    video_path = VideoAsset(name="baby_reading", num_frames=-1).video_path
    with open(video_path, "rb") as src:
        raw_bytes = src.read()

    full_video, full_meta = OpenCVVideoBackend.load_bytes(raw_bytes)
    sampled_video, sampled_meta = OpenCVDynamicVideoBackend.load_bytes(
        raw_bytes, requested_fps=fps)

    # The dynamic backend should decode only a subset of the frames.
    assert len(sampled_video) < len(full_video)

    full_outputs = processor.apply(
        prompt,
        {"video": [(full_video, full_meta)]},
        mm_kwargs,
    )
    sampled_outputs = processor.apply(
        prompt,
        {"video": [(sampled_video, sampled_meta)]},
        mm_kwargs,
    )

    # Same prompt tokens and same processed video tensors either way.
    assert (full_outputs["prompt_token_ids"] ==
            sampled_outputs["prompt_token_ids"])
    assert (full_outputs["mm_kwargs"].get_data() ==
            sampled_outputs["mm_kwargs"].get_data())
|
||||
|
||||
@ -204,6 +204,32 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
|
||||
assert metadata_sync == metadata_async
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("max_duration", [1, 60, 1800])
@pytest.mark.parametrize("requested_fps", [2, 24])
async def test_fetch_video_http_with_dynamic_loader(
        video_url: str, max_duration: int, requested_fps: int,
        monkeypatch: pytest.MonkeyPatch):
    """Fetch a video over HTTP with the dynamic loader backend enabled and
    verify the sync and async fetch paths produce identical results."""
    with monkeypatch.context() as patched:
        patched.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")

        io_kwargs = {
            "video": {
                "max_duration": max_duration,
                "requested_fps": requested_fps,
            }
        }
        connector = MediaConnector(media_io_kwargs=io_kwargs)

        frames_sync, meta_sync = connector.fetch_video(video_url)
        frames_async, meta_async = await connector.fetch_video_async(
            video_url)

        # Both code paths must agree frame-for-frame and on metadata,
        # and the dynamic backend must actually have been selected.
        assert np.array_equal(frames_sync, frames_async)
        assert meta_sync == meta_async
        assert meta_sync["video_backend"] == "opencv_dynamic"
|
||||
|
||||
|
||||
# Used for `test_argsort_mm_positions`.
|
||||
class TestCase(NamedTuple):
|
||||
mm_positions: "MultiModalPlaceholderDict"
|
||||
|
||||
Reference in New Issue
Block a user