[Kernel] Initial Activation Quantization Support (#4525)

Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-05-23 17:29:18 -04:00
parent 5eda2ea02a
commit a1242324c9
17 changed files with 683 additions and 94 deletions
--- a/csrc/pybind.cpp
+++ b/csrc/pybind.cpp
@@ -67,6 +67,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
          "Aligning the number of tokens to be processed by each expert such "
          "that it is divisible by the block size.");

+  ops.def("static_scaled_int8_quant", &static_scaled_int8_quant,
+          "Compute int8 quantized tensor for given scaling factor");
+
  // Cache ops
  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
  cache_ops.def("swap_blocks", &swap_blocks,