[PATCH] D60279: [CUDA] Implemented _[bi]mma* builtins.

2019-04-25 Thread Artem Belevich via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rC359248: [CUDA] Implemented _[bi]mma* builtins. (authored by 
tra, committed by ).
Herald added a subscriber: kristina.
Herald added a project: clang.

Changed prior to commit:
  https://reviews.llvm.org/D60279?vs=194226&id=196738#toc

Repository:
  rC Clang

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D60279/new/

https://reviews.llvm.org/D60279

Files:
  include/clang/Basic/BuiltinsNVPTX.def
  lib/Basic/Targets/NVPTX.cpp
  lib/CodeGen/CGBuiltin.cpp
  lib/Driver/ToolChains/Cuda.cpp
  test/CodeGen/builtins-nvptx-mma.cu
  test/CodeGen/builtins-nvptx-mma.py

Index: include/clang/Basic/BuiltinsNVPTX.def
===================================================================
--- include/clang/Basic/BuiltinsNVPTX.def
+++ include/clang/Basic/BuiltinsNVPTX.def
@@ -18,13 +18,22 @@
 #endif
 
 #pragma push_macro("SM_70")
-#define SM_70 "sm_70|sm_71"
+#pragma push_macro("SM_72")
+#pragma push_macro("SM_75")
+#define SM_75 "sm_75"
+#define SM_72 "sm_72|" SM_75
+#define SM_70 "sm_70|" SM_72
+
 #pragma push_macro("SM_60")
 #define SM_60 "sm_60|sm_61|sm_62|" SM_70
 
-#pragma push_macro("PTX61")
-#define PTX61 "ptx61"
 #pragma push_macro("PTX60")
+#pragma push_macro("PTX61")
+#pragma push_macro("PTX63")
+#pragma push_macro("PTX64")
+#define PTX64 "ptx64"
+#define PTX63 "ptx63|" PTX64
+#define PTX61 "ptx61|" PTX63
 #define PTX60 "ptx60|" PTX61
 
 #pragma push_macro("AND")
@@ -666,10 +675,53 @@
 TARGET_BUILTIN(__hmma_m8n32k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", AND(SM_70,PTX61))
 TARGET_BUILTIN(__hmma_m8n32k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", AND(SM_70,PTX61))
 
+// Builtins to support integer and sub-integer WMMA instructions on sm_72/sm_75
+TARGET_BUILTIN(__bmma_m8n8k128_ld_a_b1, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__bmma_m8n8k128_ld_b_b1, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__bmma_m8n8k128_ld_c, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__bmma_m8n8k128_mma_xor_popc_b1, "vi*iC*iC*iC*Ii", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__bmma_m8n8k128_st_c_i32, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_ld_a_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_ld_a_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_ld_b_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_ld_b_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_ld_c, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_mma_s8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_mma_u8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_st_c_i32, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_ld_a_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_ld_a_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_ld_b_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_ld_b_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_ld_c, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_mma_s8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_mma_u8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_st_c_i32, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_ld_a_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_ld_a_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_ld_b_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_ld_b_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_ld_c, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_mma_s8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_mma_u8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_st_c_i32, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_ld_a_s4, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_ld_a_u4, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_ld_b_s4, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_ld_b_u4, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_ld_c, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_mma_s4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_mma_u4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_st_c_i32, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
 #pragma pop_macro("AND")
 #pragma pop_macro("SM_60")
 #pragma pop_macro("SM_70")
+#pragma pop_macro("SM_72")
+#pragma pop_macro("SM_75")
 #pragma pop_macro("PTX60")
 #pragma pop_macro("PTX61")
+#pragma pop_macro("PTX63")
+#pragma pop_macro("PTX64")
Index: test/CodeGen/builtins-nvptx-mma.cu

[PATCH] D60279: [CUDA] Implemented _[bi]mma* builtins.

2019-04-08 Thread Artem Belevich via Phabricator via cfe-commits
tra updated this revision to Diff 194226.
tra added a comment.

- Converted class to struct+function as Tim suggested.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D60279/new/

https://reviews.llvm.org/D60279

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/Basic/Targets/NVPTX.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Driver/ToolChains/Cuda.cpp
  clang/test/CodeGen/builtins-nvptx-mma.cu
  clang/test/CodeGen/builtins-nvptx-mma.py
  llvm/lib/Target/NVPTX/NVPTX.td

Index: llvm/lib/Target/NVPTX/NVPTX.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTX.td
+++ llvm/lib/Target/NVPTX/NVPTX.td
@@ -75,6 +75,8 @@
                              "Use PTX version 6.1">;
 def PTX63 : SubtargetFeature<"ptx63", "PTXVersion", "63",
                              "Use PTX version 6.3">;
+def PTX64 : SubtargetFeature<"ptx64", "PTXVersion", "64",
+                             "Use PTX version 6.4">;
 
 //===----------------------------------------------------------------------===//
 // NVPTX supported processors.
Index: clang/test/CodeGen/builtins-nvptx-mma.py
===================================================================
--- /dev/null
+++ clang/test/CodeGen/builtins-nvptx-mma.py
@@ -0,0 +1,343 @@
+# This script generates all variants of wmma builtins, verifies that clang calls
+# the correct LLVM intrinsics, and checks that availability of specific builtins
+# is constrained by the correct PTX version and the target GPU variant.
+
+# Dummy test run to avoid lit warnings.
+# RUN: echo "This is not a real test. It's a generator for builtins-nvptx-mma.cu" >/dev/null
+
+from __future__ import print_function
+
+import argparse
+from collections import defaultdict
+from itertools import product
+from string import Template
+
+class MMAFrag:
+  def __init__(self, geom, frag, ptx_elt_type):
+    self.geom = geom
+    self.frag = frag
+    self.ptx_type = ptx_elt_type
+
+  def __repr__(self):
+    return "%s:%s:%s" % (self.geom, self.frag, self.ptx_type)
+
+class MMAOp:
+  def __init__(self, a, b, c, d):
+    self.a = a
+    self.b = b
+    self.c = c
+    self.d = d
+
+  def __repr__(self):
+    return ("{A:%s, B:%s, C:%s, D:%s}" % (self.a, self.b, self.c, self.d))
+
+def make_mma_ops(geoms, types_a, types_b, types_c, types_d):
+  ops = []
+  for geom, type_a, type_c in product(geoms, types_a, types_c):
+    for type_b, type_d in product(types_b if types_b else [type_a],
+                                  types_d if types_d else [type_c]):
+      ops.append(MMAOp(MMAFrag(geom, "a", type_a),
+                       MMAFrag(geom, "b", type_b),
+                       MMAFrag(geom, "c", type_c),
+                       MMAFrag(geom, "d", type_d)))
+  return ops
+
+def make_ldst_ops(geoms, frags, types):
+  return [MMAFrag(geom, frag, ptx_type) for (geom, frag, ptx_type)
+          in product(geoms, frags, types)]
+
+def get_mma_ops():
+  return (make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                       ["f16"], [], ["f16", "f32"], ["f16", "f32"]) +
+          make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                       ["s8", "u8"], [], ["s32"], []) +
+          make_mma_ops(["m8n8k32"],
+                       ["s4", "u4"], [], ["s32"], []) +
+          make_mma_ops(["m8n8k128"],
+                       ["b1"], [], ["s32"], []))
+def get_ldst_ops():
+  return (make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                        ["a", "b"], ["f16", "u8", "s8"]) +
+          make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                        ["c", "d"], ["f16", "f32", "s32"]) +
+          make_ldst_ops(["m8n8k32"], ["a", "b"], ["s4", "u4"]) +
+          make_ldst_ops(["m8n8k128"], ["a", "b"], ["b1"]) +
+          make_ldst_ops(["m8n8k32", "m8n8k128"], ["c", "d"], ["s32"]))
+
+def is_geom_supported(geom):
+  # Geometries for FP and integer ops.
+  if geom in ["m8n32k16", "m32n8k16"]:
+    return ptx_version >= 61
+  # Geometries for sub-integer ops.
+  if geom in ["m8n8k32", "m8n8k128"]:
+    return ptx_version >= 63 and gpu_arch >= 75
+  if geom == "m16n16k16":
+    return ptx_version >= 60
+  assert(False) # Unexpected geometry.
+
+def is_type_supported(ptx_type):
+  if ptx_type in ["s8", "u8", "s32"]:
+    return ptx_version >= 63 and gpu_arch >= 72
+  if ptx_type in ["s4", "u4", "b1"]:
+    return ptx_version >= 63 and gpu_arch >= 75
+  return ptx_version >= 60 and gpu_arch >= 70
+
+def is_mma_variant_supported(op, layout_a, layout_b, satf):
+  if not (is_type_supported(op.a.ptx_type)
+          and is_geom_supported(op.a.geom)):
+    return False
+  # Sub-integer ops require row/col layout, and no satf.
+  if op.a.ptx_type in ["s4", "u4", "b1"]:
+    if op.a.ptx_type == "b1" and satf:
+      return False
+    return layout_a == "row" and layout_b == "col"
+  return True
+
+def is_ldst_variant_supported(frag, layout):
+  if not (is_type_supported(frag.ptx_type)
+          and is_geom_supported(frag.geom)):
+

[PATCH] D60279: [CUDA] Implemented _[bi]mma* builtins.

2019-04-05 Thread Tim Shen via Phabricator via cfe-commits
timshen added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:12884
+// Helper classes for mapping MMA builtins to particular LLVM intrinsic variant.
+class NVPTXMmaLdstInfo {
+public:

How about having a simple struct and a function?
```
struct NvptxMmaLdstInfo {
  unsigned NumResults;
  unsigned IID_col;
  unsigned IID_row;
};

NvptxMmaLdstInfo getNvptxMmaLdstInfo(unsigned BuiltinID) { ... }
```

I don't see the need for classes here.
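
For illustration, a self-contained sketch of that shape (the IDs and element counts below are placeholders, not the real clang builtin IDs or LLVM intrinsic IDs):

```
#include <cassert>

// Placeholder sketch; real code would switch over clang's NVPTX::BI*
// builtin IDs and return llvm::Intrinsic::* IDs.
struct NvptxMmaLdstInfo {
  unsigned NumResults; // number of values the builtin loads or stores
  unsigned IID_col;    // intrinsic ID for column-major layout
  unsigned IID_row;    // intrinsic ID for row-major layout
};

NvptxMmaLdstInfo getNvptxMmaLdstInfo(unsigned BuiltinID) {
  switch (BuiltinID) {
  case 1: return {8, 100, 101}; // hypothetical f16 'a'-fragment load
  case 2: return {4, 102, 103}; // hypothetical s8 'a'-fragment load
  default:
    assert(false && "unexpected builtin ID");
    return {0, 0, 0};
  }
}
```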



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:13020
+
+class NVPTXMmaInfo {
+private:

ditto (struct + function)?


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D60279/new/

https://reviews.llvm.org/D60279





[PATCH] D60279: [CUDA] Implemented _[bi]mma* builtins.

2019-04-04 Thread Artem Belevich via Phabricator via cfe-commits
tra updated this revision to Diff 193809.
tra added a comment.

- Added PTX64 to the list of builtins' constraints.
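
Concretely, PTX64 slots into the chained macros in BuiltinsNVPTX.def, so builtins gated on PTX63 stay available when targeting ptx64. A minimal stand-alone illustration of the expansion (the AND definition here is a simplified stand-in for the one in the header):

```
// Minimal illustration of the macro chaining in BuiltinsNVPTX.def.
#include <stdio.h>

#define PTX64 "ptx64"
#define PTX63 "ptx63|" PTX64
#define SM_75 "sm_75"
#define AND(a, b) a "," b // simplified stand-in for the real AND macro

int main(void) {
  puts(AND(SM_75, PTX63)); // prints: sm_75,ptx63|ptx64
}
```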


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D60279/new/

https://reviews.llvm.org/D60279

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/Basic/Targets/NVPTX.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Driver/ToolChains/Cuda.cpp
  clang/test/CodeGen/builtins-nvptx-mma.cu
  clang/test/CodeGen/builtins-nvptx-mma.py
  llvm/lib/Target/NVPTX/NVPTX.td

Index: llvm/lib/Target/NVPTX/NVPTX.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTX.td
+++ llvm/lib/Target/NVPTX/NVPTX.td
@@ -75,6 +75,8 @@
                              "Use PTX version 6.1">;
 def PTX63 : SubtargetFeature<"ptx63", "PTXVersion", "63",
                              "Use PTX version 6.3">;
+def PTX64 : SubtargetFeature<"ptx64", "PTXVersion", "64",
+                             "Use PTX version 6.4">;
 
 //===----------------------------------------------------------------------===//
 // NVPTX supported processors.
Index: clang/test/CodeGen/builtins-nvptx-mma.py
===================================================================
--- /dev/null
+++ clang/test/CodeGen/builtins-nvptx-mma.py
@@ -0,0 +1,343 @@
+# This script generates all variants of wmma builtins, verifies that clang calls
+# the correct LLVM intrinsics, and checks that availability of specific builtins
+# is constrained by the correct PTX version and the target GPU variant.
+
+# Dummy test run to avoid lit warnings.
+# RUN: echo "This is not a real test. It's a generator for builtins-nvptx-mma.cu" >/dev/null
+
+from __future__ import print_function
+
+import argparse
+from collections import defaultdict
+from itertools import product
+from string import Template
+
+class MMAFrag:
+  def __init__(self, geom, frag, ptx_elt_type):
+    self.geom = geom
+    self.frag = frag
+    self.ptx_type = ptx_elt_type
+
+  def __repr__(self):
+    return "%s:%s:%s" % (self.geom, self.frag, self.ptx_type)
+
+class MMAOp:
+  def __init__(self, a, b, c, d):
+    self.a = a
+    self.b = b
+    self.c = c
+    self.d = d
+
+  def __repr__(self):
+    return ("{A:%s, B:%s, C:%s, D:%s}" % (self.a, self.b, self.c, self.d))
+
+def make_mma_ops(geoms, types_a, types_b, types_c, types_d):
+  ops = []
+  for geom, type_a, type_c in product(geoms, types_a, types_c):
+    for type_b, type_d in product(types_b if types_b else [type_a],
+                                  types_d if types_d else [type_c]):
+      ops.append(MMAOp(MMAFrag(geom, "a", type_a),
+                       MMAFrag(geom, "b", type_b),
+                       MMAFrag(geom, "c", type_c),
+                       MMAFrag(geom, "d", type_d)))
+  return ops
+
+def make_ldst_ops(geoms, frags, types):
+  return [MMAFrag(geom, frag, ptx_type) for (geom, frag, ptx_type)
+          in product(geoms, frags, types)]
+
+def get_mma_ops():
+  return (make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                       ["f16"], [], ["f16", "f32"], ["f16", "f32"]) +
+          make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                       ["s8", "u8"], [], ["s32"], []) +
+          make_mma_ops(["m8n8k32"],
+                       ["s4", "u4"], [], ["s32"], []) +
+          make_mma_ops(["m8n8k128"],
+                       ["b1"], [], ["s32"], []))
+def get_ldst_ops():
+  return (make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                        ["a", "b"], ["f16", "u8", "s8"]) +
+          make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                        ["c", "d"], ["f16", "f32", "s32"]) +
+          make_ldst_ops(["m8n8k32"], ["a", "b"], ["s4", "u4"]) +
+          make_ldst_ops(["m8n8k128"], ["a", "b"], ["b1"]) +
+          make_ldst_ops(["m8n8k32", "m8n8k128"], ["c", "d"], ["s32"]))
+
+def is_geom_supported(geom):
+  # Geometries for FP and integer ops.
+  if geom in ["m8n32k16", "m32n8k16"]:
+    return ptx_version >= 61
+  # Geometries for sub-integer ops.
+  if geom in ["m8n8k32", "m8n8k128"]:
+    return ptx_version >= 63 and gpu_arch >= 75
+  if geom == "m16n16k16":
+    return ptx_version >= 60
+  assert(False) # Unexpected geometry.
+
+def is_type_supported(ptx_type):
+  if ptx_type in ["s8", "u8", "s32"]:
+    return ptx_version >= 63 and gpu_arch >= 72
+  if ptx_type in ["s4", "u4", "b1"]:
+    return ptx_version >= 63 and gpu_arch >= 75
+  return ptx_version >= 60 and gpu_arch >= 70
+
+def is_mma_variant_supported(op, layout_a, layout_b, satf):
+  if not (is_type_supported(op.a.ptx_type)
+          and is_geom_supported(op.a.geom)):
+    return False
+  # Sub-integer ops require row/col layout, and no satf.
+  if op.a.ptx_type in ["s4", "u4", "b1"]:
+    if op.a.ptx_type == "b1" and satf:
+      return False
+    return layout_a == "row" and layout_b == "col"
+  return True
+
+def is_ldst_variant_supported(frag, layout):
+  if not (is_type_supported(frag.ptx_type)
+          and is_geom_supported(frag.geom)):
+

[PATCH] D60279: [CUDA] Implemented _[bi]mma* builtins.

2019-04-04 Thread Artem Belevich via Phabricator via cfe-commits
tra updated this revision to Diff 193796.
tra added a comment.

- Fixed minor issues with the parameters of the new builtins:
  - __imma*_st_c_i32 builtins now take 'const int *src'.
  - __bmma_m8n8k128_mma_xor_popc_b1 does not have a 'satf' argument (see the sketch below).
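
A hedged sketch of the resulting call shape (the meaning and value of the single trailing constant are assumptions):

```
// Sketch only: __bmma_m8n8k128_mma_xor_popc_b1 takes one trailing integer
// constant and no satf flag; interpreting it as a layout selector with
// value 0 is an assumption.
__device__ void example_bmma(int *d, const int *a, const int *b,
                             const int *c) {
#if __CUDA_ARCH__ >= 750
  __bmma_m8n8k128_mma_xor_popc_b1(d, a, b, c, /*layout=*/0);
#endif
}
```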


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D60279/new/

https://reviews.llvm.org/D60279

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/Basic/Targets/NVPTX.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Driver/ToolChains/Cuda.cpp
  clang/test/CodeGen/builtins-nvptx-mma.cu
  clang/test/CodeGen/builtins-nvptx-mma.py
  llvm/lib/Target/NVPTX/NVPTX.td

Index: llvm/lib/Target/NVPTX/NVPTX.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTX.td
+++ llvm/lib/Target/NVPTX/NVPTX.td
@@ -75,6 +75,8 @@
                              "Use PTX version 6.1">;
 def PTX63 : SubtargetFeature<"ptx63", "PTXVersion", "63",
                              "Use PTX version 6.3">;
+def PTX64 : SubtargetFeature<"ptx64", "PTXVersion", "64",
+                             "Use PTX version 6.4">;
 
 //===----------------------------------------------------------------------===//
 // NVPTX supported processors.
Index: clang/test/CodeGen/builtins-nvptx-mma.py
===================================================================
--- /dev/null
+++ clang/test/CodeGen/builtins-nvptx-mma.py
@@ -0,0 +1,343 @@
+# This script generates all variants of wmma builtins, verifies that clang calls
+# the correct LLVM intrinsics, and checks that availability of specific builtins
+# is constrained by the correct PTX version and the target GPU variant.
+
+# Dummy test run to avoid lit warnings.
+# RUN: echo "This is not a real test. It's a generator for builtins-nvptx-mma.cu" >/dev/null
+
+from __future__ import print_function
+
+import argparse
+from collections import defaultdict
+from itertools import product
+from string import Template
+
+class MMAFrag:
+  def __init__(self, geom, frag, ptx_elt_type):
+    self.geom = geom
+    self.frag = frag
+    self.ptx_type = ptx_elt_type
+
+  def __repr__(self):
+    return "%s:%s:%s" % (self.geom, self.frag, self.ptx_type)
+
+class MMAOp:
+  def __init__(self, a, b, c, d):
+    self.a = a
+    self.b = b
+    self.c = c
+    self.d = d
+
+  def __repr__(self):
+    return ("{A:%s, B:%s, C:%s, D:%s}" % (self.a, self.b, self.c, self.d))
+
+def make_mma_ops(geoms, types_a, types_b, types_c, types_d):
+  ops = []
+  for geom, type_a, type_c in product(geoms, types_a, types_c):
+    for type_b, type_d in product(types_b if types_b else [type_a],
+                                  types_d if types_d else [type_c]):
+      ops.append(MMAOp(MMAFrag(geom, "a", type_a),
+                       MMAFrag(geom, "b", type_b),
+                       MMAFrag(geom, "c", type_c),
+                       MMAFrag(geom, "d", type_d)))
+  return ops
+
+def make_ldst_ops(geoms, frags, types):
+  return [MMAFrag(geom, frag, ptx_type) for (geom, frag, ptx_type)
+          in product(geoms, frags, types)]
+
+def get_mma_ops():
+  return (make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                       ["f16"], [], ["f16", "f32"], ["f16", "f32"]) +
+          make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                       ["s8", "u8"], [], ["s32"], []) +
+          make_mma_ops(["m8n8k32"],
+                       ["s4", "u4"], [], ["s32"], []) +
+          make_mma_ops(["m8n8k128"],
+                       ["b1"], [], ["s32"], []))
+def get_ldst_ops():
+  return (make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                        ["a", "b"], ["f16", "u8", "s8"]) +
+          make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                        ["c", "d"], ["f16", "f32", "s32"]) +
+          make_ldst_ops(["m8n8k32"], ["a", "b"], ["s4", "u4"]) +
+          make_ldst_ops(["m8n8k128"], ["a", "b"], ["b1"]) +
+          make_ldst_ops(["m8n8k32", "m8n8k128"], ["c", "d"], ["s32"]))
+
+def is_geom_supported(geom):
+  # Geometries for FP and integer ops.
+  if geom in ["m8n32k16", "m32n8k16"]:
+    return ptx_version >= 61
+  # Geometries for sub-integer ops.
+  if geom in ["m8n8k32", "m8n8k128"]:
+    return ptx_version >= 63 and gpu_arch >= 75
+  if geom == "m16n16k16":
+    return ptx_version >= 60
+  assert(False) # Unexpected geometry.
+
+def is_type_supported(ptx_type):
+  if ptx_type in ["s8", "u8", "s32"]:
+    return ptx_version >= 63 and gpu_arch >= 72
+  if ptx_type in ["s4", "u4", "b1"]:
+    return ptx_version >= 63 and gpu_arch >= 75
+  return ptx_version >= 60 and gpu_arch >= 70
+
+def is_mma_variant_supported(op, layout_a, layout_b, satf):
+  if not (is_type_supported(op.a.ptx_type)
+          and is_geom_supported(op.a.geom)):
+    return False
+  # Sub-integer ops require row/col layout, and no satf.
+  if op.a.ptx_type in ["s4", "u4", "b1"]:
+    if op.a.ptx_type == "b1" and satf:
+      return False
+    return layout_a == "row" and layout_b == "col"
+  return True
+
+def 

[PATCH] D60279: [CUDA] Implemented _[bi]mma* builtins.

2019-04-04 Thread Artem Belevich via Phabricator via cfe-commits
tra updated this revision to Diff 193774.
tra edited the summary of this revision.
tra added a comment.

Cleaned up mma test generation.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D60279/new/

https://reviews.llvm.org/D60279

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/Basic/Targets/NVPTX.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Driver/ToolChains/Cuda.cpp
  clang/test/CodeGen/builtins-nvptx-mma.cu
  clang/test/CodeGen/builtins-nvptx-mma.py
  llvm/lib/Target/NVPTX/NVPTX.td

Index: llvm/lib/Target/NVPTX/NVPTX.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTX.td
+++ llvm/lib/Target/NVPTX/NVPTX.td
@@ -75,6 +75,8 @@
                              "Use PTX version 6.1">;
 def PTX63 : SubtargetFeature<"ptx63", "PTXVersion", "63",
                              "Use PTX version 6.3">;
+def PTX64 : SubtargetFeature<"ptx64", "PTXVersion", "64",
+                             "Use PTX version 6.4">;
 
 //===----------------------------------------------------------------------===//
 // NVPTX supported processors.
Index: clang/test/CodeGen/builtins-nvptx-mma.py
===================================================================
--- /dev/null
+++ clang/test/CodeGen/builtins-nvptx-mma.py
@@ -0,0 +1,339 @@
+# This script generates all variants of wmma builtins, verifies that clang calls
+# the correct LLVM intrinsics, and checks that availability of specific builtins
+# is constrained by the correct PTX version and the target GPU variant.
+
+# Dummy test run to avoid lit warnings.
+# RUN: echo "This is not a real test. It's a generator for builtins-nvptx-mma.cu" >/dev/null
+
+from __future__ import print_function
+
+import argparse
+from collections import defaultdict
+from itertools import product
+from string import Template
+
+class MMAFrag:
+  def __init__(self, geom, frag, ptx_elt_type):
+    self.geom = geom
+    self.frag = frag
+    self.ptx_type = ptx_elt_type
+
+  def __repr__(self):
+    return "%s:%s:%s" % (self.geom, self.frag, self.ptx_type)
+
+class MMAOp:
+  def __init__(self, a, b, c, d):
+    self.a = a
+    self.b = b
+    self.c = c
+    self.d = d
+
+  def __repr__(self):
+    return ("{A:%s, B:%s, C:%s, D:%s}" % (self.a, self.b, self.c, self.d))
+
+def make_mma_ops(geoms, types_a, types_b, types_c, types_d):
+  ops = []
+  for geom, type_a, type_c in product(geoms, types_a, types_c):
+    for type_b, type_d in product(types_b if types_b else [type_a],
+                                  types_d if types_d else [type_c]):
+      ops.append(MMAOp(MMAFrag(geom, "a", type_a),
+                       MMAFrag(geom, "b", type_b),
+                       MMAFrag(geom, "c", type_c),
+                       MMAFrag(geom, "d", type_d)))
+  return ops
+
+def make_ldst_ops(geoms, frags, types):
+  return [MMAFrag(geom, frag, ptx_type) for (geom, frag, ptx_type)
+          in product(geoms, frags, types)]
+
+def get_mma_ops():
+  return (make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                       ["f16"], [], ["f16", "f32"], ["f16", "f32"]) +
+          make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                       ["s8", "u8"], [], ["s32"], []) +
+          make_mma_ops(["m8n8k32"],
+                       ["s4", "u4"], [], ["s32"], []) +
+          make_mma_ops(["m8n8k128"],
+                       ["b1"], [], ["s32"], []))
+def get_ldst_ops():
+  return (make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                        ["a", "b"], ["f16", "u8", "s8"]) +
+          make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                        ["c", "d"], ["f16", "f32", "s32"]) +
+          make_ldst_ops(["m8n8k32"], ["a", "b"], ["s4", "u4"]) +
+          make_ldst_ops(["m8n8k128"], ["a", "b"], ["b1"]) +
+          make_ldst_ops(["m8n8k32", "m8n8k128"], ["c", "d"], ["s32"]))
+
+def is_geom_supported(geom):
+  # Geometries for FP and integer ops.
+  if geom in ["m8n32k16", "m32n8k16"]:
+    return ptx_version >= 61
+  # Geometries for sub-integer ops.
+  if geom in ["m8n8k32", "m8n8k128"]:
+    return ptx_version >= 63 and gpu_arch >= 75
+  if geom == "m16n16k16":
+    return ptx_version >= 60
+  assert(False) # Unexpected geometry.
+
+def is_type_supported(ptx_type):
+  if ptx_type in ["s8", "u8", "s32"]:
+    return ptx_version >= 63 and gpu_arch >= 72
+  if ptx_type in ["s4", "u4", "b1"]:
+    return ptx_version >= 63 and gpu_arch >= 75
+  return ptx_version >= 60 and gpu_arch >= 70
+
+def is_mma_variant_supported(op, layout_a, layout_b, satf):
+  if not (is_type_supported(op.a.ptx_type)
+          and is_geom_supported(op.a.geom)):
+    return False
+  # Sub-integer ops require row/col layout, and no satf.
+  if op.a.ptx_type in ["s4", "u4", "b1"]:
+    if op.a.ptx_type == "b1" and satf:
+      return False
+    return layout_a == "row" and layout_b == "col"
+  return True
+
+def is_ldst_variant_supported(frag, layout):
+  if not (is_type_supported(frag.ptx_type)
+  and 

[PATCH] D60279: [CUDA] Implemented _[bi]mma* builtins.

2019-04-04 Thread Artem Belevich via Phabricator via cfe-commits
tra created this revision.
tra added reviewers: timshen, jlebar.
Herald added subscribers: llvm-commits, bixia, hiraditya, jholewinski.
Herald added a project: LLVM.

These builtins provide access to the new integer and
sub-integer variants of MMA (matrix multiply-accumulate) instructions
introduced by CUDA-10.x for sm_75 (AKA Turing) GPUs.

Also added a feature for PTX 6.4. While Clang/LLVM does not generate
any PTX instructions that need it, we still need to pass it through to
ptxas in order to be able to compile code that uses the new 'mma'
instruction as inline assembly (e.g. used by NVIDIA's CUTLASS library
https://github.com/NVIDIA/cutlass/blob/master/cutlass/arch/mma.h#L101)
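
To make the ptxas requirement concrete, here is a hedged sketch of inline assembly in the spirit of the CUTLASS code linked above; clang passes the asm string through verbatim, and ptxas needs PTX 6.4 to accept the Turing 'mma' instruction. The operand shapes below follow the PTX ISA for the m8n8k16 s8 variant but should be treated as illustrative:

```
// Sketch: inline 'mma' asm similar to CUTLASS. Without the ptx64 feature
// reaching ptxas, assembling this would fail. Operand details assumed.
__device__ void mma_s8_inline_asm(int &d0, int &d1, unsigned a, unsigned b,
                                  int c0, int c1) {
  asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 "
      "{%0,%1}, {%2}, {%3}, {%4,%5};"
      : "=r"(d0), "=r"(d1)
      : "r"(a), "r"(b), "r"(c0), "r"(c1));
}
```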


https://reviews.llvm.org/D60279

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/Basic/Targets/NVPTX.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Driver/ToolChains/Cuda.cpp
  clang/test/CodeGen/builtins-nvptx-mma.cu
  clang/test/CodeGen/builtins-nvptx-mma.py
  llvm/lib/Target/NVPTX/NVPTX.td

Index: llvm/lib/Target/NVPTX/NVPTX.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTX.td
+++ llvm/lib/Target/NVPTX/NVPTX.td
@@ -75,6 +75,8 @@
                              "Use PTX version 6.1">;
 def PTX63 : SubtargetFeature<"ptx63", "PTXVersion", "63",
                              "Use PTX version 6.3">;
+def PTX64 : SubtargetFeature<"ptx64", "PTXVersion", "64",
+                             "Use PTX version 6.4">;
 
 //===----------------------------------------------------------------------===//
 // NVPTX supported processors.
Index: clang/test/CodeGen/builtins-nvptx-mma.py
===================================================================
--- /dev/null
+++ clang/test/CodeGen/builtins-nvptx-mma.py
@@ -0,0 +1,339 @@
+# This script generates all variants of wmma builtins, verifies that clang calls
+# the correct LLVM intrinsics, and checks that availability of specific builtins
+# is constrained by the correct PTX version and the target GPU variant.
+
+# Dummy test run to avoid lit warnings.
+# RUN: echo "This is not a real test. It's a generator for builtins-nvptx-mma.cu" >/dev/null
+
+from __future__ import print_function
+
+import argparse
+from collections import defaultdict
+from itertools import product
+from string import Template
+
+class MMAFrag:
+  def __init__(self, geom, frag, ptx_elt_type):
+    self.geom = geom
+    self.frag = frag
+    self.ptx_type = ptx_elt_type
+
+  def __repr__(self):
+    return "%s:%s:%s" % (self.geom, self.frag, self.ptx_type)
+
+class MMAOp:
+  def __init__(self, a, b, c, d):
+    self.a = a
+    self.b = b
+    self.c = c
+    self.d = d
+
+  def __repr__(self):
+    return ("{A:%s, B:%s, C:%s, D:%s}" % (self.a, self.b, self.c, self.d))
+
+def make_mma_ops(geoms, types_a, types_b, types_c, types_d):
+  ops = []
+  for geom, type_a, type_c in product(geoms, types_a, types_c):
+    for type_b, type_d in product(types_b if types_b else [type_a],
+                                  types_d if types_d else [type_c]):
+      ops.append(MMAOp(MMAFrag(geom, "a", type_a),
+                       MMAFrag(geom, "b", type_b),
+                       MMAFrag(geom, "c", type_c),
+                       MMAFrag(geom, "d", type_d)))
+  return ops
+
+def make_ldst_ops(geoms, frags, types):
+  return [MMAFrag(geom, frag, ptx_type) for (geom, frag, ptx_type)
+          in product(geoms, frags, types)]
+
+def get_mma_ops():
+  return (make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                       ["f16"], [], ["f16", "f32"], ["f16", "f32"]) +
+          make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                       ["s8", "u8"], [], ["s32"], []) +
+          make_mma_ops(["m8n8k32"],
+                       ["s4", "u4"], [], ["s32"], []) +
+          make_mma_ops(["m8n8k128"],
+                       ["b1"], [], ["s32"], []))
+def get_ldst_ops():
+  return (make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                        ["a", "b"], ["f16", "u8", "s8"]) +
+          make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                        ["c", "d"], ["f16", "f32", "s32"]) +
+          make_ldst_ops(["m8n8k32"], ["a", "b"], ["s4", "u4"]) +
+          make_ldst_ops(["m8n8k128"], ["a", "b"], ["b1"]) +
+          make_ldst_ops(["m8n8k32", "m8n8k128"], ["c", "d"], ["s32"]))
+
+def is_geom_supported(geom):
+  # Geometries for FP and integer ops.
+  if geom in ["m8n32k16", "m32n8k16"]:
+    return ptx_version >= 61
+  # Geometries for sub-integer ops.
+  if geom in ["m8n8k32", "m8n8k128"]:
+    return ptx_version >= 63 and gpu_arch >= 75
+  if geom == "m16n16k16":
+    return ptx_version >= 60
+  assert(False) # Unexpected geometry.
+
+def is_type_supported(ptx_type):
+  if ptx_type in ["s8", "u8", "s32"]:
+    return ptx_version >= 63 and gpu_arch >= 72
+  if ptx_type in ["s4", "u4", "b1"]:
+    return ptx_version >= 63 and gpu_arch >= 75
+  return ptx_version >= 60 and gpu_arch >= 70
+
+def is_mma_variant_supported(op,