Fused MOE for Mixtral (#2542)

Co-authored-by: chen shen <scv119@gmail.com>
Philipp Moritz
2024-01-29 22:43:37 -08:00
committed by GitHub
parent 5d60def02c
commit ab40644669
4 changed files with 114 additions and 108 deletions


@@ -57,9 +57,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
ops.def(
"moe_align_block_size",
&moe_align_block_size,
"Aligning the number of tokens to be processed by each expert such that it is divisible by the block size.");
"moe_align_block_size",
&moe_align_block_size,
"Aligning the number of tokens to be processed by each expert such that it is divisible by the block size.");
// Cache ops
pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
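For context, the op registered above pads each expert's token count so the fused MoE kernel can work on fixed-size blocks of tokens, as its docstring describes. Below is a minimal, illustrative Python sketch of just that padding arithmetic; the names are hypothetical, and the actual op is a CUDA kernel that (to the best of my understanding) also emits sorted token indices and per-block expert ids, which this sketch omits.

# Illustrative sketch only -- not the vLLM CUDA kernel. It rounds each
# expert's token count up to the nearest multiple of block_size so the
# fused MoE GEMM can tile its work evenly.

def align_block_size(tokens_per_expert, block_size):
    # Round each count up to the nearest multiple of block_size.
    return [((n + block_size - 1) // block_size) * block_size
            for n in tokens_per_expert]

# Example: three experts receive 5, 16, and 1 tokens; with block_size=8 the
# padded workloads become 8, 16, and 8 tokens (in the real kernel the padding
# slots would hold dummy token indices).
print(align_block_size([5, 16, 1], block_size=8))  # -> [8, 16, 8]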