Add movmatrix support (movmatrix.sync.aligned.m8n8.trans.b16) (#2562)

This commit is contained in:
Inoday Yadav
2025-08-19 22:22:02 -04:00
committed by GitHub
parent ec18e8043b
commit 42e7c546c4
4 changed files with 197 additions and 0 deletions

View File

@ -60,6 +60,12 @@
#define CUTE_ARCH_LDSM_SM75_ACTIVATED 1
#endif
// CUTE_ARCH_MOVM_SM75_ACTIVATED is always defined: 1 when compiling device
// code for compute capability 7.5 or newer (__CUDA_ARCH__ >= 750), and 0
// otherwise (host pass, or an older device target).
// NOTE(review): only the target architecture is checked here, not the toolkit
// version; the PTX ISA documentation lists a minimum PTX version for
// movmatrix -- confirm that older toolkits do not need to be guarded against.
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 750)
#  define CUTE_ARCH_MOVM_SM75_ACTIVATED 0
#else
#  define CUTE_ARCH_MOVM_SM75_ACTIVATED 1
#endif
namespace cute
{
@ -183,6 +189,25 @@ struct SM75_U16x8_LDSM_T
}
};
// Copy operation that wraps the PTX instruction
//   movmatrix.sync.aligned.m8n8.trans.b16
// Each calling thread supplies one 32-bit source register and receives one
// 32-bit destination register.
struct SM75_U32x1_MOVM_T
{
  // One 32-bit register in, one 32-bit register out.
  using SRegisters = uint32_t[1];
  using DRegisters = uint32_t[1];

  // src : this thread's 32-bit input to movmatrix.
  // dst : overwritten with this thread's 32-bit output of movmatrix.
  //
  // When CUTE_ARCH_MOVM_SM75_ACTIVATED is false the instruction is not emitted
  // and the call is reported through CUTE_INVALID_CONTROL_PATH instead.
  CUTE_HOST_DEVICE static void
  copy(uint32_t  src,
       uint32_t& dst)
  {
#if !CUTE_ARCH_MOVM_SM75_ACTIVATED
    CUTE_INVALID_CONTROL_PATH("Trying to use movmatrix without CUTE_ARCH_MOVM_SM75_ACTIVATED.");
#else
    // volatile: keep the compiler from eliding or reordering the .sync instruction.
    asm volatile("movmatrix.sync.aligned.m8n8.trans.b16 %0, %1;\n" : "=r"(dst) : "r"(src));
#endif
  }
};
//
// Legacy LDSM interfaces that aren't very useful
//

View File

@ -140,4 +140,21 @@ struct Copy_Traits<SM75_U16x8_LDSM_T>
using RefLayout = DstLayout;
};
// Copy traits for the movmatrix copy operation SM75_U32x1_MOVM_T.
// Every layout below maps a (thread, value) coordinate to a bit index:
// shape (32,32) with stride (32,1) places bit v of thread t at 32*t + v,
// i.e. 32 threads that each own 32 consecutive bits.
// NOTE(review): the source and destination layouts are identical, so the
// transpose performed by the instruction is not expressed in these layouts --
// confirm this is intended.
template <>
struct Copy_Traits<SM75_U32x1_MOVM_T>
{
  // Logical thread id -> thread index within the warp
  using ThrID = Layout<_32>;

  // (src-thr, src-val) -> bit
  using SrcLayout = Layout<Shape<_32,_32>, Stride<_32,_1>>;

  // (dst-thr, dst-val) -> bit
  using DstLayout = Layout<Shape<_32,_32>, Stride<_32,_1>>;

  // Reference (thr, val) -> bit map; same as the destination map
  using RefLayout = DstLayout;
};
} // end namespace cute