Add fused top-K softmax kernel for MoE (#2769)
setup.py (+11, -0)
@@ -339,6 +339,17 @@ if _is_cuda():
     vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")
     vllm_extension_sources.append("csrc/custom_all_reduce.cu")
 
+    # Add MoE kernels.
+    ext_modules.append(
+        CUDAExtension(
+            name="vllm._moe_C",
+            sources=glob("csrc/moe/*.cu") + glob("csrc/moe/*.cpp"),
+            extra_compile_args={
+                "cxx": CXX_FLAGS,
+                "nvcc": NVCC_FLAGS,
+            },
+        ))
+
 if not _is_neuron():
     vllm_extension = CUDAExtension(
         name="vllm._C",
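For context: the new vllm._moe_C extension packages the fused top-K softmax CUDA kernel used for MoE expert routing, compiled from everything under csrc/moe/ with the same CXX/NVCC flags as the main extension. Below is a minimal, unfused PyTorch sketch of the routing math such a kernel fuses (softmax over router logits, then per-token top-k selection); the function name, shapes, and the absence of weight renormalization are illustrative assumptions, not taken from this commit.

# Reference (unfused) routing math for a fused top-K softmax MoE kernel.
# All names and shapes here are illustrative assumptions.
import torch


def topk_softmax_reference(gating_output: torch.Tensor, topk: int):
    """Softmax over expert logits, then per-token top-k expert selection.

    gating_output: [num_tokens, num_experts] router logits.
    Returns ([num_tokens, topk] weights, [num_tokens, topk] expert ids).
    """
    probs = torch.softmax(gating_output, dim=-1)
    topk_weights, topk_ids = torch.topk(probs, k=topk, dim=-1)
    return topk_weights, topk_ids


# Example: route 4 tokens among 8 experts, picking 2 experts per token.
weights, ids = topk_softmax_reference(torch.randn(4, 8), topk=2)

After rebuilding (e.g. pip install -e .), the extension should be importable as vllm._moe_C; the point of fusing is presumably to do both steps in a single kernel launch instead of separate softmax and top-k kernels with an intermediate write.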