[VLM] Merged multimodal processor for Qwen2-Audio (#11303)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-19 14:14:17 +08:00
parent c6b0a7d3ba
commit 6142ef0ada
11 changed files with 414 additions and 358 deletions
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@ -5,6 +5,7 @@ import pytest
 import pytest_asyncio
 from transformers import AutoModel, AutoTokenizer, BatchEncoding

+from vllm.multimodal.audio import resample_audio
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

@ -130,16 +131,14 @@ def run_test(
                   dtype=dtype,
                   postprocess_inputs=process,
                   auto_cls=AutoModel) as hf_model:
-        import librosa
-
        hf_outputs_per_audio = [
            hf_model.generate_greedy_logprobs_limit(
                [hf_prompt],
                max_tokens,
                num_logprobs=num_logprobs,
-                audios=[(librosa.resample(audio[0],
-                                          orig_sr=audio[1],
-                                          target_sr=16000), 16000)])
+                audios=[(resample_audio(audio[0],
+                                        orig_sr=audio[1],
+                                        target_sr=16000), 16000)])
            for _, hf_prompt, audio in prompts_and_audios
        ]