Implements dual-chunk-flash-attn backend for dual chunk attention with sparse attention support (#11844)
@@ -77,6 +77,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       " Tensor suffix_output,"
       " Tensor suffix_lse) -> ()");
   ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states);
+
+  ops.def(
+      "convert_vertical_slash_indexes("
+      " Tensor! block_count, Tensor! block_offset, "
+      " Tensor! column_count, Tensor! column_index, "
" Tensor q_seqlens, Tensor q_seqlens, "
+      " Tensor vertical_indexes, Tensor slash_indexes, "
+      " int context_size, int block_size_M, int block_size_N, "
+      " bool causal) -> ()");
+  ops.impl("convert_vertical_slash_indexes", torch::kCUDA,
+           &convert_vertical_slash_indexes);
+
+  ops.def(
+      "convert_vertical_slash_indexes_mergehead("
+      " Tensor! block_count, Tensor! block_offset, "
+      " Tensor! column_count, Tensor! column_index, "
" Tensor q_seqlens, Tensor q_seqlens, "
+      " Tensor vertical_indexes, Tensor slash_indexes, "
+      " Tensor vertical_indices_count, Tensor slash_indices_count, "
+      " int context_size, int block_size_M, int block_size_N, "
+      " bool causal) -> ()");
+  ops.impl("convert_vertical_slash_indexes_mergehead", torch::kCUDA,
+           &convert_vertical_slash_indexes_mergehead);
 #endif
 
   // Activation ops
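In these schemas, `Tensor!` marks an output the kernel writes in place, plain `Tensor` is a read-only input, and schema `int` corresponds to `int64_t` on the C++ side. As a rough illustration of what the `ops.impl` registrations bind against, here is a minimal sketch of compatible host-side declarations; the parameter names come from the schemas above, while the comments interpreting the seqlen and `*_indices_count` tensors are assumptions, not taken from this diff:

// Sketch only: declarations compatible with the schemas registered above,
// assuming the standard TORCH_LIBRARY type mapping (Tensor! -> Tensor&,
// int -> int64_t). The real signatures live in the project's ops header.
#include <torch/all.h>

void convert_vertical_slash_indexes(
    torch::Tensor& block_count,      // Tensor!: written in place
    torch::Tensor& block_offset,     // Tensor!: written in place
    torch::Tensor& column_count,     // Tensor!: written in place
    torch::Tensor& column_index,     // Tensor!: written in place
    torch::Tensor q_seqlens,         // read-only; assumed per-batch query lengths
    torch::Tensor kv_seqlens,        // read-only; assumed per-batch KV lengths
    torch::Tensor vertical_indexes,  // sparse vertical (column) indices
    torch::Tensor slash_indexes,     // sparse slash (diagonal) indices
    int64_t context_size, int64_t block_size_M, int64_t block_size_N,
    bool causal);

void convert_vertical_slash_indexes_mergehead(
    torch::Tensor& block_count, torch::Tensor& block_offset,
    torch::Tensor& column_count, torch::Tensor& column_index,
    torch::Tensor q_seqlens, torch::Tensor kv_seqlens,
    torch::Tensor vertical_indexes, torch::Tensor slash_indexes,
    torch::Tensor vertical_indices_count,  // assumed: per-head vertical budget
    torch::Tensor slash_indices_count,     // assumed: per-head slash budget
    int64_t context_size, int64_t block_size_M, int64_t block_size_N,
    bool causal);

The extra `vertical_indices_count`/`slash_indices_count` arguments on the mergehead variant suggest that heads with different sparsity budgets can be batched through a single launch; that reading is inferred from the names, not stated in the diff.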