Replace 0x1f with 0xffffffff in __shfl_sync (#1097)
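This fixes compatibility with H100 and resolves #1094. The first argument of `__shfl_sync` is a 32-bit lane mask: 0x1f names only lanes 0-4, whereas 0xffffffff names all 32 lanes of the warp.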
This commit is contained in:
@@ -331,7 +331,7 @@ class gen_Kernel:
|
||||
operator_code += " " + helper.var_idx("FusedAddBiasEpilogue", i ) + helper.var_idx(" epilogue_", i ) + ";\n"
|
||||
|
||||
|
||||
operator_code += " " + "int warp_idx = __shfl_sync(0x1f, threadIdx.x / 32, 0);\n"
|
||||
operator_code += " " + "int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);\n"
|
||||
operator_code += " " + "int lane_idx = threadIdx.x % 32;\n"
|
||||
|
||||
for i in range (self.b2bnum - 1):
|
||||
|
||||
Reference in New Issue
Block a user