[BugFix][torch.compile] KV scale calculation issues with FP8 quantization (#25513)

Signed-off-by: adabeyta <aabeyta@redhat.com>
This commit is contained in:
Adrian Abeyta
2025-09-29 14:52:04 -05:00
committed by GitHub
parent d5ab28511c
commit c42ff4f4fd
3 changed files with 64 additions and 3 deletions

View File

@ -139,6 +139,21 @@ def test_custom_compile_config(
run_model(compilation_config, model, model_kwargs)
@pytest.mark.parametrize(
    "optimization_level",
    [CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE],
)
def test_fp8_kv_scale_compile(optimization_level: int):
    """Smoke-test on-the-fly KV-cache scale calculation with FP8 quantization.

    Runs a small FP8-quantized model with ``calculate_kv_scales`` enabled,
    both without compilation and with piecewise compilation, so a
    torch.compile regression in the KV scale path would surface here.
    """
    model_name = "Qwen/Qwen2-0.5B"
    # Short context keeps the test cheap; the scale-calculation code path
    # is exercised regardless of sequence length.
    engine_kwargs = {
        "quantization": "fp8",
        "kv_cache_dtype": "fp8_e4m3",
        "calculate_kv_scales": True,
        "max_model_len": 512,
    }
    run_model(optimization_level, model_name, engine_kwargs)
def test_inductor_graph_partition_attn_fusion(caplog_vllm):
if not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available "