tra created this revision.
Herald added subscribers: mattd, gchakrabarti, asavonic, bixia, hiraditya, 
yaxunl.
Herald added a project: All.
tra updated this revision to Diff 525307.
tra added a comment.
tra published this revision for review.
tra added a reviewer: jlebar.
Herald added subscribers: llvm-commits, cfe-commits, jdoerfert, jholewinski.
Herald added projects: clang, LLVM.

Re-enabled the `barrier.cluster.arrive.relaxed` test, now that ptx80 is available.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D151363

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenCUDA/builtins-sm90.cu
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll

Index: llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
===================================================================
--- llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
+++ llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78| FileCheck --check-prefixes=CHECK %s
-; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
 
 ; CHECK-LABEL: test_isspacep
 define i1 @test_isspacep_shared_cluster(ptr %p) {
@@ -120,6 +120,19 @@
         ret i1 %x
 }
 
+; CHECK-LABEL: test_barrier_cluster(
+define void @test_barrier_cluster() {
+; CHECK: barrier.cluster.arrive;
+       call void @llvm.nvvm.barrier.cluster.arrive()
+; CHECK: barrier.cluster.arrive.relaxed;
+       call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
+; CHECK: barrier.cluster.wait;
+       call void @llvm.nvvm.barrier.cluster.wait()
+; CHECK: fence.sc.cluster
+       call void @llvm.nvvm.fence.sc.cluster()
+       ret void
+}
+
 
 declare i1 @llvm.nvvm.isspacep.shared.cluster(ptr %p);
 declare ptr @llvm.nvvm.mapa(ptr %p, i32 %r);
@@ -137,3 +150,7 @@
 declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctarank()
 declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctarank()
 declare i1 @llvm.nvvm.is_explicit_cluster()
+declare void @llvm.nvvm.barrier.cluster.arrive()
+declare void @llvm.nvvm.barrier.cluster.arrive.relaxed()
+declare void @llvm.nvvm.barrier.cluster.wait()
+declare void @llvm.nvvm.fence.sc.cluster()
Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -132,6 +132,18 @@
                  "barrier.sync \t$id, $cnt;",
                  [(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>,
         Requires<[hasPTX<60>, hasSM<30>]>;
+class INT_BARRIER_CLUSTER<string variant, Intrinsic Intr,
+                          list<Predicate> Preds = [hasPTX<78>, hasSM<90>]>:
+        NVPTXInst<(outs), (ins), "barrier.cluster."# variant #";", [(Intr)]>,
+        Requires<Preds>;
+
+def barrier_cluster_arrive:
+        INT_BARRIER_CLUSTER<"arrive", int_nvvm_barrier_cluster_arrive>;
+def barrier_cluster_arrive_relaxed:
+        INT_BARRIER_CLUSTER<"arrive.relaxed",
+        int_nvvm_barrier_cluster_arrive_relaxed, [hasPTX<80>, hasSM<90>]>;
+def barrier_cluster_wait:
+        INT_BARRIER_CLUSTER<"wait", int_nvvm_barrier_cluster_wait>;
 
 class SHFL_INSTR<bit sync, string mode, string reg, bit return_pred,
                  bit offset_imm, bit mask_imm, bit threadmask_imm>
@@ -303,6 +315,9 @@
 def INT_MEMBAR_GL  : MEMBAR<"membar.gl;",  int_nvvm_membar_gl>;
 def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
 
+def INT_FENCE_SC_CLUSTER:
+       MEMBAR<"fence.sc.cluster;", int_nvvm_fence_sc_cluster>,
+       Requires<[hasPTX<78>, hasSM<90>]>;
 
 //-----------------------------------
 // Async Copy Functions
Index: llvm/include/llvm/IR/IntrinsicsNVVM.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1358,6 +1358,14 @@
       Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>,
       ClangBuiltin<"__nvvm_barrier_sync_cnt">;
 
+  // barrier.cluster.[wait, arrive, arrive.relaxed]
+  def int_nvvm_barrier_cluster_arrive :
+      Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
+  def int_nvvm_barrier_cluster_arrive_relaxed :
+      Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
+  def int_nvvm_barrier_cluster_wait :
+      Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
+
   // Membar
   def int_nvvm_membar_cta : ClangBuiltin<"__nvvm_membar_cta">,
       Intrinsic<[], [], [IntrNoCallback]>;
@@ -1365,6 +1373,8 @@
       Intrinsic<[], [], [IntrNoCallback]>;
   def int_nvvm_membar_sys : ClangBuiltin<"__nvvm_membar_sys">,
       Intrinsic<[], [], [IntrNoCallback]>;
+  def int_nvvm_fence_sc_cluster:
+      Intrinsic<[], [], [IntrNoCallback]>;
 
 // Async Copy
 def int_nvvm_cp_async_mbarrier_arrive :
Index: clang/test/CodeGenCUDA/builtins-sm90.cu
===================================================================
--- clang/test/CodeGenCUDA/builtins-sm90.cu
+++ clang/test/CodeGenCUDA/builtins-sm90.cu
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx78" "-target-cpu" "sm_90" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx80" "-target-cpu" "sm_90" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
 
 // CHECK: define{{.*}} void @_Z6kernelPlPvj(
 __attribute__((global)) void kernel(long *out, void *ptr, unsigned u) {
@@ -57,5 +57,14 @@
   // CHECK: call i32 @llvm.nvvm.getctarank.shared.cluster(ptr addrspace(3) {{.*}})
   out[i++] = __nvvm_getctarank_shared_cluster(sptr);
 
+  // CHECK: call void @llvm.nvvm.barrier.cluster.arrive()
+  __nvvm_barrier_cluster_arrive();
+  // CHECK: call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
+  __nvvm_barrier_cluster_arrive_relaxed();
+  // CHECK: call void @llvm.nvvm.barrier.cluster.wait()
+  __nvvm_barrier_cluster_wait();
+    // CHECK: call void @llvm.nvvm.fence.sc.cluster()
+  __nvvm_fence_sc_cluster();
+
   // CHECK: ret void
 }
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -18962,6 +18962,18 @@
     return Builder.CreateCall(
         CGM.getIntrinsic(Intrinsic::nvvm_getctarank_shared_cluster),
         EmitScalarExpr(E->getArg(0)));
+  case NVPTX::BI__nvvm_barrier_cluster_arrive:
+    return Builder.CreateCall(
+        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive));
+  case NVPTX::BI__nvvm_barrier_cluster_arrive_relaxed:
+    return Builder.CreateCall(
+        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive_relaxed));
+  case NVPTX::BI__nvvm_barrier_cluster_wait:
+    return Builder.CreateCall(
+        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_wait));
+  case NVPTX::BI__nvvm_fence_sc_cluster:
+    return Builder.CreateCall(
+        CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
   default:
     return nullptr;
   }
Index: clang/include/clang/Basic/BuiltinsNVPTX.def
===================================================================
--- clang/include/clang/Basic/BuiltinsNVPTX.def
+++ clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -582,6 +582,11 @@
 TARGET_BUILTIN(__nvvm_barrier_sync, "vUi", "n", PTX60)
 TARGET_BUILTIN(__nvvm_barrier_sync_cnt, "vUiUi", "n", PTX60)
 
+TARGET_BUILTIN(__nvvm_barrier_cluster_arrive, "v", "n", AND(SM_90,PTX78))
+TARGET_BUILTIN(__nvvm_barrier_cluster_arrive_relaxed, "v", "n", AND(SM_90,PTX80))
+TARGET_BUILTIN(__nvvm_barrier_cluster_wait, "v", "n", AND(SM_90,PTX78))
+TARGET_BUILTIN(__nvvm_fence_sc_cluster, "v", "n", AND(SM_90,PTX78))
+
 // Shuffle
 
 BUILTIN(__nvvm_shfl_down_i32, "iiii", "")
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to