[ROCm][FP8][Kernel] FP8 quantization fused into Custom Paged Attention (#17139)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
This commit is contained in:
Gregory Shtrasberg
2025-05-07 10:12:35 -04:00
committed by GitHub
parent 7377dd0307
commit 32aa74c09c
4 changed files with 73 additions and 43 deletions


@@ -47,7 +47,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
   " int max_context_len,"
   " Tensor? alibi_slopes,"
   " str kv_cache_dtype,"
-  " Tensor k_scale, Tensor v_scale) -> ()");
+  " Tensor k_scale, Tensor v_scale,"
+  " Tensor? fp8_out_scale) -> ()");
   rocm_ops.impl("paged_attention", torch::kCUDA, &paged_attention);
 }
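The diff above adds an optional `fp8_out_scale` tensor to the ROCm `paged_attention` op so that output quantization to FP8 can be fused into the kernel epilogue instead of running as a separate pass. A minimal sketch of the per-element scale-and-clamp step such an epilogue would perform is shown below; the function name `quantize_fp8_out` and the pure-Python formulation are illustrative assumptions, not the actual kernel code, though the FP8 E4M3 maximum finite value of 448 is the real format limit.

```python
# Hedged sketch: fold FP8 output quantization into the attention
# epilogue. FP8 E4M3 has a maximum finite value of 448; each output
# element is divided by the scale, then clamped into that range.
# This mirrors conceptually what the fused kernel does per element;
# it is NOT the actual HIP kernel from the commit.

FP8_E4M3_MAX = 448.0


def quantize_fp8_out(out_vals, fp8_out_scale=None):
    """Scale and clamp attention outputs into the FP8 E4M3 range.

    If fp8_out_scale is None (the optional tensor was not passed),
    the outputs are returned unchanged, matching the pre-existing
    fp16/bf16 output path.
    """
    if fp8_out_scale is None:
        return list(out_vals)
    inv = 1.0 / fp8_out_scale
    return [max(-FP8_E4M3_MAX, min(FP8_E4M3_MAX, v * inv))
            for v in out_vals]
```

With no scale the values pass through untouched; with a scale the values are rescaled and saturated at the E4M3 limit, e.g. an out-of-range value like 1000.0 clamps to 448.0.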