[Model] Enable quantization support for transformers backend (#12960)
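In short: the Transformers modeling backend (model_impl="transformers") can now be combined with vLLM's quantized weight loading, such as bitsandbytes. A minimal sketch of the user-facing effect, assuming the offline LLM entry point and reusing the model name and engine kwargs that appear in the test diff below:

    from vllm import LLM, SamplingParams

    # Force the Transformers implementation and load the weights with
    # in-flight bitsandbytes quantization (the combination this commit
    # enables). The kwargs mirror the vllm_runner kwargs in the test.
    llm = LLM(
        model="meta-llama/Llama-3.2-1B-Instruct",
        model_impl="transformers",
        quantization="bitsandbytes",
        load_format="bitsandbytes",
        enforce_eager=True,
    )

    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=32))
    print(outputs[0].outputs[0].text)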
@@ -45,10 +45,14 @@ def check_implementation(
     ("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
     ("openai-community/gpt2", "transformers"),
     ("ArthurZ/Ilama-3.2-1B", "auto"),  # CUSTOM CODE
     ("meta-llama/Llama-3.2-1B-Instruct", "auto"),
 ])  # trust_remote_code=True by default
-def test_models(hf_runner, vllm_runner, example_prompts, model,
-                model_impl) -> None:
+def test_models(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    example_prompts: list[str],
+    model: str,
+    model_impl: str,
+) -> None:
 
     maybe_raises = nullcontext()
     if model == "openai-community/gpt2" and model_impl == "transformers":
@@ -67,10 +71,50 @@ def test_models(hf_runner, vllm_runner, example_prompts, model,
 
 @multi_gpu_test(num_gpus=2)
 def test_distributed(
-    hf_runner,
-    vllm_runner,
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
     example_prompts,
 ):
     kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
     check_implementation(hf_runner, vllm_runner, example_prompts,
                          "meta-llama/Llama-3.2-1B-Instruct", **kwargs)
+
+
+@pytest.mark.parametrize("model, quantization_kwargs", [
+    (
+        "meta-llama/Llama-3.2-1B-Instruct",
+        {
+            "quantization": "bitsandbytes",
+            "load_format": "bitsandbytes",
+        },
+    ),
+])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_quantization(
+    vllm_runner: Type[VllmRunner],
+    example_prompts: list[str],
+    model: str,
+    quantization_kwargs: dict[str, str],
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    with vllm_runner(
+            model, model_impl="auto", enforce_eager=True,
+            **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
+
+    with vllm_runner(
+            model,
+            model_impl="transformers",
+            enforce_eager=True,
+            **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
+        transformers_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
+    check_logprobs_close(
+        outputs_0_lst=transformers_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="transformers",
+        name_1="vllm",
+    )
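For reference, one way to run only the new comparison test from Python; the file path is a hypothetical assumption, since the diff shown here does not name the file:

    import pytest

    # Select just the new test by name; the path below is assumed,
    # not taken from the diff.
    pytest.main(["tests/models/test_transformers.py", "-k", "test_quantization"])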