[CPU] Refactor CPU W8A8 scaled_mm (#23071)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-08-21 09:34:24 +08:00
parent b029de9902
commit 7be5d113d8
17 changed files with 1525 additions and 1273 deletions
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@ -6,25 +6,20 @@

 std::string init_cpu_threads_env(const std::string& cpu_ids);

-void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
-                    const torch::Tensor& b, const torch::Tensor& a_scales,
-                    const torch::Tensor& b_scales,
-                    const std::optional<torch::Tensor>& bias);
+void release_dnnl_matmul_handler(int64_t handler);

-void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a,
-                        const torch::Tensor& b, const torch::Tensor& a_scales,
-                        const torch::Tensor& b_scales,
-                        const torch::Tensor& azp_adj,
-                        const std::optional<torch::Tensor>& azp,
-                        const std::optional<torch::Tensor>& bias);
+int64_t create_onednn_scaled_mm_handler(const torch::Tensor& b,
+                                        const torch::Tensor& b_scales,
+                                        at::ScalarType output_type,
+                                        bool dynamic_act_quant, bool use_azp,
+                                        int64_t primitive_cache_size);

-#if defined(__powerpc64__)
-void int8_scaled_mm_ppc64le(torch::Tensor& c, const torch::Tensor& a,
-                            const torch::Tensor& b,
-                            const torch::Tensor& a_scales,
-                            const torch::Tensor& b_scales,
-                            const std::optional<torch::Tensor>& bias);
-#endif
+void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
+                      const torch::Tensor& a_scales,
+                      const std::optional<torch::Tensor>& azp,
+                      const std::optional<torch::Tensor>& azp_adj,
+                      const std::optional<torch::Tensor>& bias,
+                      int64_t handler);

 void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
                        torch::Tensor& kv_cache, double scale,
@ -151,8 +146,25 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);

  // Quantization
-#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
+#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \
+    defined(__powerpc64__)
  at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
+  // Release a oneDNN matmul handler identified by its integer handle
+  ops.def("release_dnnl_matmul_handler(int handler) -> ()",
+          &release_dnnl_matmul_handler);
+
+  // Create a oneDNN W8A8 scaled_mm handler; returns it as an integer handle
+  ops.def(
+      "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "
+      "output_type, bool dynamic_act_quant, bool use_azp, int "
+      "primitive_cache_size) -> int",
+      &create_onednn_scaled_mm_handler);
+
+  // oneDNN scaled_mm for W8A8, run via a handler (static or dynamic act quant)
+  ops.def(
+      "onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, "
+      "Tensor? azp_adj, Tensor? bias, int handler) -> ()");
+  ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm);

  // Compute int8 quantized tensor for given scaling factor.
  ops.def(
@ -168,50 +180,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      {stride_tag});
  ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
           &dynamic_scaled_int8_quant);
-  // W8A8 GEMM, supporting symmetric per-tensor or per-row/column
-  // quantization.
-  ops.def(
-      "cutlass_scaled_mm(Tensor! out, Tensor a,"
-      "                  Tensor b, Tensor a_scales,"
-      "                  Tensor b_scales, Tensor? bias) -> ()",
-      {stride_tag});
-  ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
-  // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
-  // quantization.
-  ops.def(
-      "cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
-      "                  Tensor b, Tensor a_scales,"
-      "                  Tensor b_scales, Tensor azp_adj,"
-      "                  Tensor? azp, Tensor? bias) -> ()",
-      {stride_tag});
-  ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
-#elif defined(__powerpc64__)
-  // Compute int8 quantized tensor for given scaling factor.
-  ops.def(
-      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
-      "Tensor? azp) -> ()");
-  ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);
-
-  // Compute int8 quantized tensor and scaling factor
-  ops.def(
-      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
-      "Tensor!? azp) -> ()");
-  ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
-           &dynamic_scaled_int8_quant);
-  // W8A8 GEMM, supporting symmetric quantization.
-  ops.def(
-      "cutlass_scaled_mm(Tensor! out, Tensor a,"
-      "                  Tensor b, Tensor a_scales,"
-      "                  Tensor b_scales, Tensor? bias) -> ()");
-  ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm_ppc64le);
-  // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
-  // quantization.
-  ops.def(
-      "cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
-      "                  Tensor b, Tensor a_scales,"
-      "                  Tensor b_scales, Tensor azp_adj,"
-      "                  Tensor? azp, Tensor? bias) -> ()");
-  ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
 #endif

 // SHM CCL