[Model] Enable quantization support for transformers backend (#12960)

Isotr0py authored on 2025-02-18 11:52:47 +08:00, committed by GitHub
parent efbe854448
commit 67ef8f666a
3 changed files with 66 additions and 23 deletions


@@ -45,10 +45,14 @@ def check_implementation(
     ("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
     ("openai-community/gpt2", "transformers"),
     ("ArthurZ/Ilama-3.2-1B", "auto"),  # CUSTOM CODE
     ("meta-llama/Llama-3.2-1B-Instruct", "auto"),
 ])  # trust_remote_code=True by default
-def test_models(hf_runner, vllm_runner, example_prompts, model,
-                model_impl) -> None:
+def test_models(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    example_prompts: list[str],
+    model: str,
+    model_impl: str,
+) -> None:
     maybe_raises = nullcontext()
     if model == "openai-community/gpt2" and model_impl == "transformers":
@@ -67,10 +71,50 @@ def test_models(hf_runner, vllm_runner, example_prompts, model,
 @multi_gpu_test(num_gpus=2)
 def test_distributed(
-    hf_runner,
-    vllm_runner,
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
     example_prompts,
 ):
     kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
     check_implementation(hf_runner, vllm_runner, example_prompts,
                          "meta-llama/Llama-3.2-1B-Instruct", **kwargs)
+
+
+@pytest.mark.parametrize("model, quantization_kwargs", [
+    (
+        "meta-llama/Llama-3.2-1B-Instruct",
+        {
+            "quantization": "bitsandbytes",
+            "load_format": "bitsandbytes",
+        },
+    ),
+])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_quantization(
+    vllm_runner: Type[VllmRunner],
+    example_prompts: list[str],
+    model: str,
+    quantization_kwargs: dict[str, str],
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    with vllm_runner(
+            model, model_impl="auto", enforce_eager=True,
+            **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
+
+    with vllm_runner(
+            model,
+            model_impl="transformers",
+            enforce_eager=True,
+            **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
+        transformers_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
+
+    check_logprobs_close(
+        outputs_0_lst=transformers_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="transformers",
+        name_1="vllm",
+    )
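For context on what this change enables end to end: the new test drives the same knobs a user would set when loading a bitsandbytes-quantized model through the transformers fallback. Below is a minimal offline-inference sketch, assuming the quantization, load_format, and model_impl arguments exercised by test_quantization above are passed straight to vLLM's LLM entrypoint; the prompt and sampling values are illustrative only.

# Sketch: load a bitsandbytes-quantized model via the transformers backend.
# Mirrors the kwargs used in test_quantization; requires `bitsandbytes`.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    model_impl="transformers",    # force the transformers model implementation
    quantization="bitsandbytes",  # quantize weights with bitsandbytes
    load_format="bitsandbytes",   # load the checkpoint in bitsandbytes format
    enforce_eager=True,
)

outputs = llm.generate(
    ["Hello, my name is"],
    SamplingParams(temperature=0.0, max_tokens=32),
)
print(outputs[0].outputs[0].text)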