[Quantization] Remove FP4 emulation; Fall back to marlin for device < 100 (#19563)

This commit is contained in:
Dipika Sikka
2025-06-16 17:33:51 -04:00
committed by GitHub
parent 90f9c2eb5c
commit 6bc7b57315
5 changed files with 79 additions and 60 deletions

View File

@@ -667,7 +667,13 @@ def test_compressed_tensors_nvfp4(vllm_runner, args):
qkv_proj = layer.self_attn.qkv_proj
assert isinstance(qkv_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, scheme)
if isinstance(qkv_proj.scheme, scheme) or isinstance(
qkv_proj.scheme, CompressedTensorsW4A16Fp4
) and not CompressedTensorsW4A4Fp4.cutlass_fp4_supported():
assert True
else:
raise AssertionError("FP4 Scheme Mismatch")
assert qkv_proj.scheme.group_size == 16
llm.apply_model(check_model)