================
@@ -615,3 +615,38 @@ scale variant.
   matrix A or B data can be reused from a previous WMMA instruction.
 }];
 }
+
+//===----------------------------------------------------------------------===//
+// Tensor DMA Builtins
+//===----------------------------------------------------------------------===//
+
+def DocCatTensorDMA : DocumentationCategory<"Tensor DMA Builtins"> {
+  let Content = [{
+Asynchronous tensor DMA transfers between global memory and LDS, tracked by
+the ``TENSOR_CNT`` hardware counter. The caller must order the transfer
+against later LDS accesses, either via ``s_wait_tensorcnt`` or via
+``__builtin_amdgcn_wait_asyncmark``. See the LLVM ``AMDGPUAsyncOperations``
+document for the async-operation model.
+}];
+}
+
+def DocTensorLoadToLDS_GFX1250 : Documentation {
+  let Category = DocCatTensorDMA;
+  let Content = [{
+Asynchronously copies a tensor from global memory into LDS.
+
+``D0``..``D4`` are the five chunks of the hardware tensor descriptor. The
+``_d2`` machine instruction is selected when ``D2`` and ``D3`` are
+zero-initialized; otherwise ``_d4`` is used. ``D4`` is reserved and silently
+ignored on gfx1250. ``cpol`` is a compile-time cache-policy bitfield
+(``th`` in bits [0:2], scope in bits [3:4]).
----------------
ssahasra wrote:

Why not move this to the common doc section earlier?

https://github.com/llvm/llvm-project/pull/200775
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to