Fix GPTQ model loading in Transformers backend (#25770)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Author: Harry Mellor
Date: 2025-09-27 13:18:20 +01:00
Committed by: GitHub
Parent: 7977e5027c
Commit: ec152c8748

3 changed files with 29 additions and 10 deletions
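The test diff below (one of the three changed files) broadens the Transformers-backend quantization test to cover GPTQ and AWQ checkpoints, and narrows the ROCm skip to the one quantization method that actually lacks support there. As a minimal usage sketch, not taken from this commit: loading one of the newly covered GPTQ checkpoints through the Transformers backend might look like the following, assuming vLLM's `model_impl` engine argument for backend selection (the prompt and sampling settings are illustrative only).

    from vllm import LLM, SamplingParams

    # Force the Transformers modeling backend instead of vLLM's native one.
    llm = LLM(
        model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
        model_impl="transformers",
        enforce_eager=True,
    )
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=16))
    print(outputs[0].outputs[0].text)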

@@ -100,10 +100,9 @@ def test_distributed(
         kwargs_test=kwargs)


-@pytest.mark.skipif(
-    current_platform.is_rocm(),
-    reason="bitsandbytes quantization is currently not supported in rocm.")
 @pytest.mark.parametrize("model, quantization_kwargs", [
+    ("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {}),
+    ("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {}),
     (
         "meta-llama/Llama-3.2-1B-Instruct",
         {
@@ -121,6 +120,11 @@ def test_quantization(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
+    if (current_platform.is_rocm()
+            and quantization_kwargs.get("quantization", "") == "bitsandbytes"):
+        pytest.skip(
+            "bitsandbytes quantization is currently not supported in rocm.")
     with vllm_runner(
             model, model_impl="auto", enforce_eager=True,
             **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
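The moved skip is the key test-side change: the old decorator-level `skipif` disabled every parametrized case on ROCm, including the AWQ and GPTQ cases this commit adds, whereas the runtime `pytest.skip` drops only the bitsandbytes combination. A minimal standalone sketch of that pattern, where the `is_rocm` stub is hypothetical and stands in for `current_platform.is_rocm()`:

    import pytest

    def is_rocm() -> bool:
        # Stub for current_platform.is_rocm(); always False for illustration.
        return False

    @pytest.mark.parametrize("quantization", ["awq", "gptq", "bitsandbytes"])
    def test_skip_pattern(quantization: str) -> None:
        # Skip only the unsupported combination at runtime, rather than
        # skipping the entire parametrized test with a decorator.
        if is_rocm() and quantization == "bitsandbytes":
            pytest.skip(
                "bitsandbytes quantization is currently not supported in rocm.")
        assert quantization in {"awq", "gptq", "bitsandbytes"}

Keeping the platform check inside the test body means each parametrized case makes its own skip decision, so adding new quantization methods does not require touching the skip logic.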