[CI/Build] Split up models tests (#10069)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
@@ -56,11 +56,13 @@ def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next,
     ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
     seq_len = 5000  # bigger than the max feature size for any image

-    seq_data, mm_data = dummy_data_for_llava_next(
+    dummy_data = dummy_data_for_llava_next(
         ctx,
         seq_len=seq_len,
         mm_counts={"image": 1},
     )
+    seq_data = dummy_data.seq_data
+    mm_data = dummy_data.multi_modal_data

     # The dummy data dims should match the gridpoint with the biggest feat size
     assert mm_data["image"].height == expected_size[0]

@@ -131,12 +131,13 @@ def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int,
         mm_processor_kwargs=None,
     )

-    sequence_data, _, = dummy_data_for_phi3v(
+    dummy_data = dummy_data_for_phi3v(
         ctx=ctx,
         seq_len=8192,  # Should be bigger than num_imgs * toks_per_img
         mm_counts={"image": num_imgs},
         num_crops=num_crops,
     )
+    sequence_data = dummy_data.seq_data
     # Ensure we have the right number of placeholders per num_crops size
     img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID)
     assert img_tok_count == toks_per_img * num_imgs

@@ -86,10 +86,17 @@ def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl,

     # NOTE: video value is required, but isn't actually used
     # when making the dummy data except for error handling currently
-    seq_data, mm_data = dummy_data_for_qwen2_vl(qwen2_vl_context, seq_len, {
-        "image": 1,
-        "video": 0
-    }, **mm_processor_kwargs)
+    dummy_data = dummy_data_for_qwen2_vl(
+        ctx=qwen2_vl_context,
+        seq_len=seq_len,
+        mm_counts={
+            "image": 1,
+            "video": 0
+        },
+        **mm_processor_kwargs,
+    )
+    seq_data = dummy_data.seq_data
+    mm_data = dummy_data.multi_modal_data

     # Ensure we have the right number of placeholders for min/max pixel values
     assert seq_data.get_token_ids().count(image_token_id) == token_count

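Note: all three dummy-data tests above switch from positional tuple unpacking to named attributes on the factory's return value. A minimal, self-contained sketch of the pattern — `DummyData`, `dummy_data_factory`, and the field comments are illustrative stand-ins; only the `seq_data` and `multi_modal_data` field names are taken from the diff:

    from typing import Any, NamedTuple, Optional

    class DummyData(NamedTuple):
        # Stand-in for vLLM's DummyData return type; the two field names
        # below are the ones the updated tests rely on.
        seq_data: Any                      # placeholder token ids for profiling
        multi_modal_data: Optional[dict]   # e.g. {"image": <a PIL image>}

    def dummy_data_factory(seq_len: int, mm_counts: dict) -> DummyData:
        # Toy factory standing in for dummy_data_for_llava_next / phi3v / qwen2_vl.
        image = object() if mm_counts.get("image") else None
        return DummyData(seq_data=[0] * seq_len,
                         multi_modal_data={"image": image} if image else None)

    # Callers unpack by attribute, so new fields can be appended to DummyData
    # without breaking existing call sites.
    dummy_data = dummy_data_factory(seq_len=16, mm_counts={"image": 1})
    seq_data = dummy_data.seq_data
    mm_data = dummy_data.multi_modal_data
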
@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Type

 import pytest
 import torch

@@ -19,7 +19,8 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
 def run_awq_test(
     vllm_runner: Type[VllmRunner],
     image_assets: _ImageAssets,
-    models: Tuple[str, str],
+    source_model: str,
+    quant_model: str,
     *,
     size_factors: List[float],
     dtype: str,

@@ -28,8 +29,6 @@ def run_awq_test(
     tensor_parallel_size: int,
     distributed_executor_backend: Optional[str] = None,
 ):
-    source_model, quant_model = models
-
     images = [asset.pil_image for asset in image_assets]

     inputs_per_image = [(

@@ -84,8 +83,11 @@ def run_awq_test(
     )


+@pytest.mark.quant_model
 @pytest.mark.parametrize(
-    "models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")])
+    ("source_model", "quant_model"),
+    [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")],
+)
 @pytest.mark.parametrize(
     "size_factors",
     [

@@ -103,12 +105,13 @@ def run_awq_test(
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
 @torch.inference_mode()
-def test_awq_models(vllm_runner, image_assets, models, size_factors,
-                    dtype: str, max_tokens: int, num_logprobs: int) -> None:
+def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
+                    size_factors, dtype, max_tokens, num_logprobs) -> None:
     run_awq_test(
         vllm_runner,
         image_assets,
-        models,
+        source_model,
+        quant_model,
         size_factors=size_factors,
         dtype=dtype,
         max_tokens=max_tokens,

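Note: the `("source_model", "quant_model")` parametrization above relies on standard pytest behavior — a tuple (or comma-separated string) of argument names paired with a list of value tuples is unpacked into separate test arguments. A toy, runnable sketch with illustrative names:

    import pytest

    @pytest.mark.parametrize(
        ("source_model", "quant_model"),
        [("org/some-model", "org/some-model-AWQ")],
    )
    def test_pair(source_model: str, quant_model: str) -> None:
        # pytest binds each element of the value tuple to its own argument,
        # so the test body no longer has to unpack a `models` pair itself.
        assert quant_model.startswith(source_model)
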
@@ -11,21 +11,17 @@ from ....conftest import _ImageAssets
 # we use snapshot_download to prevent conflicts between
 # dynamic_module and trust_remote_code for hf_runner
 DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
-models = [
-    snapshot_download("OpenGVLab/InternViT-300M-448px",
-                      allow_patterns=DOWNLOAD_PATTERN),
-    snapshot_download("OpenGVLab/InternViT-6B-448px-V1-5",
-                      allow_patterns=DOWNLOAD_PATTERN),
-]


 def run_intern_vit_test(
     image_assets: _ImageAssets,
-    model: str,
+    model_id: str,
     *,
     dtype: str,
     distributed_executor_backend: Optional[str] = None,
 ):
+    model = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN)
+
     img_processor = CLIPImageProcessor.from_pretrained(model)
     images = [asset.pil_image for asset in image_assets]
     pixel_values = [

@@ -67,12 +63,15 @@ def run_intern_vit_test(
     assert cos_similar(vllm_output, hf_output).mean() > 0.99


-@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("model_id", [
+    "OpenGVLab/InternViT-300M-448px",
+    "OpenGVLab/InternViT-6B-448px-V1-5",
+])
 @pytest.mark.parametrize("dtype", [torch.half])
 @torch.inference_mode()
-def test_models(dist_init, image_assets, model, dtype: str) -> None:
+def test_models(dist_init, image_assets, model_id, dtype: str) -> None:
     run_intern_vit_test(
         image_assets,
-        model,
+        model_id,
         dtype=dtype,
     )

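Note: moving `snapshot_download` out of module scope and into the helper means weights are fetched only when a test actually runs, not whenever pytest imports the file for collection. A sketch of the resulting shape — the helper name is illustrative; `snapshot_download` with `allow_patterns` is the real `huggingface_hub` API:

    from huggingface_hub import snapshot_download

    DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]

    def run_model_test(model_id: str) -> None:
        # The download now happens at call time, so merely collecting or
        # deselecting the test triggers no network access.
        model_path = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN)
        ...
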
@@ -130,8 +130,8 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
-        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
     #### Extended model tests
     "blip2": VLMTestInfo(

@@ -159,9 +159,9 @@ VLM_TEST_SETTINGS = {
         dtype="bfloat16",
         marks=[
             pytest.mark.skipif(
-                transformers.__version__.startswith("4.46"),
+                transformers.__version__ < "4.46.2",
                 reason="Model broken in HF, see huggingface/transformers#34379"
             )
         ]
     ),
     "fuyu": VLMTestInfo(

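Note: `transformers.__version__ < "4.46.2"` is a lexicographic string comparison. It happens to order the 4.4x releases correctly, but misorders versions in general, so a parsed comparison is the safer pattern if a guard ever needs to span minor releases. A sketch using `packaging` (not what the diff does):

    from packaging.version import Version

    # String compare misorders these: "4.5.0" < "4.46.2" is False,
    # even though 4.5.0 is the older release.
    assert ("4.5.0" < "4.46.2") is False
    # Parsed compare gets the ordering right.
    assert Version("4.5.0") < Version("4.46.2")
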
@@ -185,8 +185,8 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         dtype="bfloat16",
         get_stop_token_ids=lambda tok: [151329, 151336, 151338],
-        marks=[large_gpu_mark(min_gb=48)],
         patch_hf_runner=model_utils.glm_patch_hf_runner,
+        marks=[large_gpu_mark(min_gb=48)],
     ),
     "h2ovl": VLMTestInfo(
         models=[

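Note: several entries here move `marks=[large_gpu_mark(min_gb=48)]` to the end of the kwargs. For reference, a hypothetical reimplementation of what a helper with that signature typically does — the real one lives in vLLM's test utils and may differ:

    import pytest
    import torch

    def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
        # Skip the test unless the first CUDA device has at least
        # `min_gb` GiB of total memory.
        if torch.cuda.is_available():
            total_gb = torch.cuda.get_device_properties(0).total_memory / 2**30
        else:
            total_gb = 0
        return pytest.mark.skipif(
            total_gb < min_gb,
            reason=f"Need at least {min_gb} GiB of GPU memory",
        )
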
@@ -205,6 +205,22 @@ VLM_TEST_SETTINGS = {
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
     ),
+    "idefics3": VLMTestInfo(
+        models=["HuggingFaceM4/Idefics3-8B-Llama3"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<image>",
+        max_model_len=8192,
+        max_num_seqs=2,
+        auto_cls=AutoModelForVision2Seq,
+        marks=[
+            pytest.mark.skipif(
+                transformers.__version__ < "4.46.0",
+                reason="Model introduced in HF >= 4.46.0"
+            ),
+            large_gpu_mark(min_gb=48),
+        ],
+    ),
     "intern_vl": VLMTestInfo(
         models=[
             "OpenGVLab/InternVL2-1B",

@@ -263,7 +279,6 @@ VLM_TEST_SETTINGS = {
             runner_mm_key="videos",
         )],
     ),
-    # FIXME
     "llava_next_video": VLMTestInfo(
         models=["llava-hf/LLaVA-NeXT-Video-7B-hf"],
         test_type=VLMTestType.VIDEO,

@@ -275,7 +290,7 @@ VLM_TEST_SETTINGS = {
         image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
         marks=[
             pytest.mark.skipif(
-                transformers.__version__.startswith("4.46"),
+                transformers.__version__ < "4.46.2",
                 reason="Model broken with changes in transformers 4.46"
             )
         ],

@@ -316,6 +331,7 @@ VLM_TEST_SETTINGS = {
         max_model_len=8192,
         max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,
+        marks=[large_gpu_mark(min_gb=48)],
     ),
     "qwen": VLMTestInfo(
         models=["Qwen/Qwen-VL"],

@@ -327,22 +343,6 @@ VLM_TEST_SETTINGS = {
         vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
         prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
     ),
-    "idefics3": VLMTestInfo(
-        models=["HuggingFaceM4/Idefics3-8B-Llama3"],
-        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
-        img_idx_to_prompt=lambda idx: "<image>",
-        max_model_len=8192,
-        max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
-        marks=[
-            pytest.mark.skipif(
-                transformers.__version__ < "4.46.0",
-                reason="Model introduced in HF >= 4.46.0"
-            ),
-            large_gpu_mark(min_gb=48),
-        ],
-    ),
     ### Tensor parallel / multi-gpu broadcast tests
     "broadcast-chameleon": VLMTestInfo(
         models=["facebook/chameleon-7b"],

@@ -362,7 +362,7 @@ VLM_TEST_SETTINGS = {
                 reason="Need at least 2 GPUs to run the test.",
             ),
             pytest.mark.skipif(
-                transformers.__version__.startswith("4.46"),
+                transformers.__version__ < "4.46.2",
                 reason="Model broken in HF, see huggingface/transformers#34379"
             )
         ],

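Note: the new `@pytest.mark.quant_model` marker lets CI split the slow quantized runs from the rest (e.g. `pytest -m quant_model` vs. `pytest -m "not quant_model"`). Custom markers must be registered to avoid `PytestUnknownMarkWarning`; a minimal sketch of how that is typically done in a `conftest.py` — the description string is illustrative:

    # conftest.py (sketch)
    def pytest_configure(config):
        config.addinivalue_line(
            "markers",
            "quant_model: tests that exercise a quantized variant of a model",
        )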