Add fused top-K softmax kernel for MoE (#2769)

Author: Woosuk Kwon
Date: 2024-02-05 17:38:02 -08:00
Committed by: GitHub
Parent: 2ccee3def6
Commit: f0d4e14557
9 changed files with 591 additions and 50 deletions
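For context on the title: in a Mixture-of-Experts layer, the router produces per-token logits over the experts, applies a softmax, and keeps the top-K experts together with their weights. The fused kernel combines these steps into one launch. The sketch below is a minimal unfused PyTorch reference for the same computation, not the kernel's actual API; the names `topk_softmax_ref`, `gating_logits`, and `top_k` are illustrative assumptions.

# Minimal unfused reference (illustrative only; not the kernel's API).
import torch

def topk_softmax_ref(gating_logits: torch.Tensor, top_k: int):
    # gating_logits: [num_tokens, num_experts] router output.
    # Softmax over the expert dimension, then keep the K largest probabilities.
    probs = torch.softmax(gating_logits, dim=-1)
    topk_weights, topk_ids = torch.topk(probs, top_k, dim=-1)
    return topk_weights, topk_ids

# Example: route 4 tokens across 8 experts, 2 experts per token.
logits = torch.randn(4, 8)
weights, ids = topk_softmax_ref(logits, top_k=2)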

setup.py

@@ -339,6 +339,17 @@ if _is_cuda():
vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")
vllm_extension_sources.append("csrc/custom_all_reduce.cu")
# Add MoE kernels.
ext_modules.append(
CUDAExtension(
name="vllm._moe_C",
sources=glob("csrc/moe/*.cu") + glob("csrc/moe/*.cpp"),
extra_compile_args={
"cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS,
},
))
if not _is_neuron():
vllm_extension = CUDAExtension(
name="vllm._C",