Fix GPTQ model loading in Transformers backend (#25770)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Author: Harry Mellor
Date: 2025-09-27 13:18:20 +01:00
Committed by: GitHub
Parent: 7977e5027c
Commit: ec152c8748

3 changed files with 29 additions and 10 deletions
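The test diff below (one of the three changed files) broadens the Transformers-backend quantization test to cover GPTQ and AWQ checkpoints, and narrows the ROCm skip to the one quantization method that actually lacks support there. As a minimal usage sketch, not taken from this commit: loading one of the newly covered GPTQ checkpoints through the Transformers backend might look like the following, assuming vLLM's `model_impl` engine argument for backend selection (the prompt and sampling settings are illustrative only).

    from vllm import LLM, SamplingParams

    # Force the Transformers modeling backend instead of vLLM's native one.
    llm = LLM(
        model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
        model_impl="transformers",
        enforce_eager=True,
    )
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=16))
    print(outputs[0].outputs[0].text)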

@@ -100,10 +100,9 @@ def test_distributed(
         kwargs_test=kwargs)


-@pytest.mark.skipif(
-    current_platform.is_rocm(),
-    reason="bitsandbytes quantization is currently not supported in rocm.")
 @pytest.mark.parametrize("model, quantization_kwargs", [
+    ("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {}),
+    ("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {}),
     (
         "meta-llama/Llama-3.2-1B-Instruct",
         {
@@ -121,6 +120,11 @@ def test_quantization(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
+    if (current_platform.is_rocm()
+            and quantization_kwargs.get("quantization", "") == "bitsandbytes"):
+        pytest.skip(
+            "bitsandbytes quantization is currently not supported in rocm.")
     with vllm_runner(
             model, model_impl="auto", enforce_eager=True,
             **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
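The moved skip is the key test-side change: the old decorator-level `skipif` disabled every parametrized case on ROCm, including the AWQ and GPTQ cases this commit adds, whereas the runtime `pytest.skip` drops only the bitsandbytes combination. A minimal standalone sketch of that pattern, where the `is_rocm` stub is hypothetical and stands in for `current_platform.is_rocm()`:

    import pytest

    def is_rocm() -> bool:
        # Stub for current_platform.is_rocm(); always False for illustration.
        return False

    @pytest.mark.parametrize("quantization", ["awq", "gptq", "bitsandbytes"])
    def test_skip_pattern(quantization: str) -> None:
        # Skip only the unsupported combination at runtime, rather than
        # skipping the entire parametrized test with a decorator.
        if is_rocm() and quantization == "bitsandbytes":
            pytest.skip(
                "bitsandbytes quantization is currently not supported in rocm.")
        assert quantization in {"awq", "gptq", "bitsandbytes"}

Keeping the platform check inside the test body means each parametrized case makes its own skip decision, so adding new quantization methods does not require touching the skip logic.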