[Model] Enable quantization support for transformers backend (#12960)
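In short: the Transformers modeling backend (model_impl="transformers") can now be combined with vLLM's quantized weight loading, such as bitsandbytes. A minimal sketch of the user-facing effect, assuming the offline LLM entry point and reusing the model name and engine kwargs that appear in the test diff below:

    from vllm import LLM, SamplingParams

    # Force the Transformers implementation and load the weights with
    # in-flight bitsandbytes quantization (the combination this commit
    # enables). The kwargs mirror the vllm_runner kwargs in the test.
    llm = LLM(
        model="meta-llama/Llama-3.2-1B-Instruct",
        model_impl="transformers",
        quantization="bitsandbytes",
        load_format="bitsandbytes",
        enforce_eager=True,
    )

    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=32))
    print(outputs[0].outputs[0].text)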
@@ -45,10 +45,14 @@ def check_implementation(
     ("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
     ("openai-community/gpt2", "transformers"),
     ("ArthurZ/Ilama-3.2-1B", "auto"),  # CUSTOM CODE
     ("meta-llama/Llama-3.2-1B-Instruct", "auto"),
 ])  # trust_remote_code=True by default
-def test_models(hf_runner, vllm_runner, example_prompts, model,
-                model_impl) -> None:
+def test_models(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    example_prompts: list[str],
+    model: str,
+    model_impl: str,
+) -> None:
 
     maybe_raises = nullcontext()
     if model == "openai-community/gpt2" and model_impl == "transformers":
@@ -67,10 +71,50 @@ def test_models(hf_runner, vllm_runner, example_prompts, model,
 
 @multi_gpu_test(num_gpus=2)
 def test_distributed(
-    hf_runner,
-    vllm_runner,
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
     example_prompts,
 ):
     kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
     check_implementation(hf_runner, vllm_runner, example_prompts,
                          "meta-llama/Llama-3.2-1B-Instruct", **kwargs)
+
+
+@pytest.mark.parametrize("model, quantization_kwargs", [
+    (
+        "meta-llama/Llama-3.2-1B-Instruct",
+        {
+            "quantization": "bitsandbytes",
+            "load_format": "bitsandbytes",
+        },
+    ),
+])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_quantization(
+    vllm_runner: Type[VllmRunner],
+    example_prompts: list[str],
+    model: str,
+    quantization_kwargs: dict[str, str],
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    with vllm_runner(
+            model, model_impl="auto", enforce_eager=True,
+            **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
+
+    with vllm_runner(
+            model,
+            model_impl="transformers",
+            enforce_eager=True,
+            **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
+        transformers_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
+    check_logprobs_close(
+        outputs_0_lst=transformers_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="transformers",
+        name_1="vllm",
+    )
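For reference, one way to run only the new comparison test from Python; the file path is a hypothetical assumption, since the diff shown here does not name the file:

    import pytest

    # Select just the new test by name; the path below is assumed,
    # not taken from the diff.
    pytest.main(["tests/models/test_transformers.py", "-k", "test_quantization"])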