[BugFix][torch.compile] KV scale calculation issues with FP8 quantization (#25513)

Signed-off-by: adabeyta <aabeyta@redhat.com>
2025-09-29 14:52:04 -05:00
parent d5ab28511c
commit c42ff4f4fd
3 changed files with 64 additions and 3 deletions
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -139,6 +139,21 @@ def test_custom_compile_config(
    run_model(compilation_config, model, model_kwargs)


+@pytest.mark.parametrize(
+    "optimization_level",
+    [CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE],
+)
+def test_fp8_kv_scale_compile(optimization_level: int):
+    model = "Qwen/Qwen2-0.5B"
+    model_kwargs = {
+        "quantization": "fp8",
+        "kv_cache_dtype": "fp8_e4m3",
+        "calculate_kv_scales": True,
+        "max_model_len": 512,
+    }
+    run_model(optimization_level, model, model_kwargs)
+
+
 def test_inductor_graph_partition_attn_fusion(caplog_vllm):
    if not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("inductor graph partition is only available "