[bugfix] Fix static asymmetric quantization case (#10334)
Signed-off-by: Daniël de Kok <me@danieldk.eu> Signed-off-by: luka <luka@neuralmagic.com> Co-authored-by: Daniël de Kok <me@danieldk.eu>
This commit is contained in:
@ -8,6 +8,7 @@ import pytest
|
||||
import torch
|
||||
from compressed_tensors.quantization import QuantizationType
|
||||
|
||||
from tests.models.utils import check_logprobs_close
|
||||
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
|
||||
CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
|
||||
CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
|
||||
@ -74,6 +75,35 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model_path",
|
||||
[
|
||||
"neuralmagic/Llama-3.2-1B-quantized.w8a8"
|
||||
# TODO static & asymmetric
|
||||
])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner,
|
||||
example_prompts, model_path,
|
||||
max_tokens, num_logprobs):
|
||||
dtype = "bfloat16"
|
||||
|
||||
with hf_runner(model_path, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with vllm_runner(model_path, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
def test_compressed_tensors_no_enforce_eager(vllm_runner):
|
||||
model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
|
||||
with vllm_runner(model_path) as llm:
|
||||
|
||||
Reference in New Issue
Block a user