[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*

2023-05-19 Thread Artem Belevich via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rG6963c61f0f6e: [NVPTX/CUDA] added an optional src_size 
argument to __nvvm_cp_async* (authored by tra).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150820/new/

https://reviews.llvm.org/D150820

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/include/clang/Sema/Sema.h
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Sema/SemaChecking.cpp
  clang/test/CodeGen/builtins-nvptx.c
  clang/test/SemaCUDA/builtins.cu
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/test/CodeGen/NVPTX/async-copy.ll

Index: llvm/test/CodeGen/NVPTX/async-copy.ll
===
--- llvm/test/CodeGen/NVPTX/async-copy.ll
+++ llvm/test/CodeGen/NVPTX/async-copy.ll
@@ -1,35 +1,35 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s
 ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 
 declare void @llvm.nvvm.cp.async.wait.group(i32)
 
-; ALL-LABEL: asyncwaitgroup
+; CHECK-LABEL: asyncwaitgroup
 define void @asyncwaitgroup() {
-  ; ALL: cp.async.wait_group 8;
+  ; CHECK: cp.async.wait_group 8;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 8)
-  ; ALL: cp.async.wait_group 0;
+  ; CHECK: cp.async.wait_group 0;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 0)
-  ; ALL: cp.async.wait_group 16;
+  ; CHECK: cp.async.wait_group 16;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 16)
   ret void
 }
 
 declare void @llvm.nvvm.cp.async.wait.all()
 
-; ALL-LABEL: asyncwaitall
+; CHECK-LABEL: asyncwaitall
 define void @asyncwaitall() {
-; ALL: cp.async.wait_all
+; CHECK: cp.async.wait_all
   tail call void @llvm.nvvm.cp.async.wait.all()
   ret void
 }
 
 declare void @llvm.nvvm.cp.async.commit.group()
 
-; ALL-LABEL: asynccommitgroup
+; CHECK-LABEL: asynccommitgroup
 define void @asynccommitgroup() {
-; ALL: cp.async.commit_group
+; CHECK: cp.async.commit_group
   tail call void @llvm.nvvm.cp.async.commit.group()
   ret void
 }
@@ -41,72 +41,87 @@
 
 ; CHECK-LABEL: asyncmbarrier
 define void @asyncmbarrier(ptr %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}];
+; The distinction between PTX32/PTX64 here is only to capture pointer register type
+; in R to be used in subsequent tests.
+; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%[[R:r]]{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%[[R:rd]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriershared
 define void @asyncmbarriershared(ptr addrspace(3) %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}];
+; CHECK: cp.async.mbarrier.arrive.shared.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriernoinc
 define void @asyncmbarriernoinc(ptr %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriernoincshared
 define void @asyncmbarriernoincshared(ptr addrspace(3) %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}];
+; CHECK: cp.async.mbarrier.arrive.noinc.shared.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %a)
   ret void
 }
 
 declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b)
+declare void @llvm.nvvm.cp.async.ca.shared.global.4.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
 
 ; CHECK-LABEL: asynccasharedglobal4i8
-define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b) {
-; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4;
-; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4;
+define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) {
+; CHECK: 

[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*

2023-05-18 Thread Justin Lebar via Phabricator via cfe-commits
jlebar accepted this revision.
jlebar added a comment.
This revision is now accepted and ready to land.

Re-approval.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150820/new/

https://reviews.llvm.org/D150820

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*

2023-05-18 Thread Artem Belevich via Phabricator via cfe-commits
tra requested review of this revision.
tra added a comment.

PTAL.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150820/new/

https://reviews.llvm.org/D150820

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*

2023-05-18 Thread Artem Belevich via Phabricator via cfe-commits
tra updated this revision to Diff 523566.
tra added a comment.

Instead of changing existing intrinsic, introduce a new set which takes an
additional src_size argument. This should keep existing users working.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150820/new/

https://reviews.llvm.org/D150820

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/include/clang/Sema/Sema.h
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Sema/SemaChecking.cpp
  clang/test/CodeGen/builtins-nvptx.c
  clang/test/SemaCUDA/builtins.cu
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/test/CodeGen/NVPTX/async-copy.ll

Index: llvm/test/CodeGen/NVPTX/async-copy.ll
===
--- llvm/test/CodeGen/NVPTX/async-copy.ll
+++ llvm/test/CodeGen/NVPTX/async-copy.ll
@@ -1,35 +1,35 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s
 ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 
 declare void @llvm.nvvm.cp.async.wait.group(i32)
 
-; ALL-LABEL: asyncwaitgroup
+; CHECK-LABEL: asyncwaitgroup
 define void @asyncwaitgroup() {
-  ; ALL: cp.async.wait_group 8;
+  ; CHECK: cp.async.wait_group 8;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 8)
-  ; ALL: cp.async.wait_group 0;
+  ; CHECK: cp.async.wait_group 0;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 0)
-  ; ALL: cp.async.wait_group 16;
+  ; CHECK: cp.async.wait_group 16;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 16)
   ret void
 }
 
 declare void @llvm.nvvm.cp.async.wait.all()
 
-; ALL-LABEL: asyncwaitall
+; CHECK-LABEL: asyncwaitall
 define void @asyncwaitall() {
-; ALL: cp.async.wait_all
+; CHECK: cp.async.wait_all
   tail call void @llvm.nvvm.cp.async.wait.all()
   ret void
 }
 
 declare void @llvm.nvvm.cp.async.commit.group()
 
-; ALL-LABEL: asynccommitgroup
+; CHECK-LABEL: asynccommitgroup
 define void @asynccommitgroup() {
-; ALL: cp.async.commit_group
+; CHECK: cp.async.commit_group
   tail call void @llvm.nvvm.cp.async.commit.group()
   ret void
 }
@@ -41,72 +41,87 @@
 
 ; CHECK-LABEL: asyncmbarrier
 define void @asyncmbarrier(ptr %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}];
+; The distinction between PTX32/PTX64 here is only to capture pointer register type
+; in R to be used in subsequent tests.
+; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%[[R:r]]{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%[[R:rd]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriershared
 define void @asyncmbarriershared(ptr addrspace(3) %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}];
+; CHECK: cp.async.mbarrier.arrive.shared.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriernoinc
 define void @asyncmbarriernoinc(ptr %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriernoincshared
 define void @asyncmbarriernoincshared(ptr addrspace(3) %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}];
+; CHECK: cp.async.mbarrier.arrive.noinc.shared.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %a)
   ret void
 }
 
 declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b)
+declare void @llvm.nvvm.cp.async.ca.shared.global.4.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
 
 ; CHECK-LABEL: asynccasharedglobal4i8
-define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b) {
-; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4;
-; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4;
+define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) {
+; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], 

[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*

2023-05-18 Thread Artem Belevich via Phabricator via cfe-commits
tra added a comment.

Looks like the extra intrinsic argument broke MLIR. I'll need to figure out how 
to deal with that.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150820/new/

https://reviews.llvm.org/D150820

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*

2023-05-18 Thread Artem Belevich via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rGe7b9c2f00fa0: [NVPTX/CUDA] added an optional src_size 
argument to __nvvm_cp_async* (authored by tra).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150820/new/

https://reviews.llvm.org/D150820

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/include/clang/Sema/Sema.h
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Sema/SemaChecking.cpp
  clang/test/CodeGen/builtins-nvptx.c
  clang/test/SemaCUDA/builtins.cu
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/test/CodeGen/NVPTX/async-copy.ll

Index: llvm/test/CodeGen/NVPTX/async-copy.ll
===
--- llvm/test/CodeGen/NVPTX/async-copy.ll
+++ llvm/test/CodeGen/NVPTX/async-copy.ll
@@ -1,35 +1,35 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s
 ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 
 declare void @llvm.nvvm.cp.async.wait.group(i32)
 
-; ALL-LABEL: asyncwaitgroup
+; CHECK-LABEL: asyncwaitgroup
 define void @asyncwaitgroup() {
-  ; ALL: cp.async.wait_group 8;
+  ; CHECK: cp.async.wait_group 8;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 8)
-  ; ALL: cp.async.wait_group 0;
+  ; CHECK: cp.async.wait_group 0;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 0)
-  ; ALL: cp.async.wait_group 16;
+  ; CHECK: cp.async.wait_group 16;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 16)
   ret void
 }
 
 declare void @llvm.nvvm.cp.async.wait.all()
 
-; ALL-LABEL: asyncwaitall
+; CHECK-LABEL: asyncwaitall
 define void @asyncwaitall() {
-; ALL: cp.async.wait_all
+; CHECK: cp.async.wait_all
   tail call void @llvm.nvvm.cp.async.wait.all()
   ret void
 }
 
 declare void @llvm.nvvm.cp.async.commit.group()
 
-; ALL-LABEL: asynccommitgroup
+; CHECK-LABEL: asynccommitgroup
 define void @asynccommitgroup() {
-; ALL: cp.async.commit_group
+; CHECK: cp.async.commit_group
   tail call void @llvm.nvvm.cp.async.commit.group()
   ret void
 }
@@ -41,72 +41,75 @@
 
 ; CHECK-LABEL: asyncmbarrier
 define void @asyncmbarrier(ptr %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}];
+; The distinction between PTX32/PTX64 here is only to capture pointer register type
+; in R to be used in subsequent tests.
+; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%[[R:r]]{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%[[R:rd]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriershared
 define void @asyncmbarriershared(ptr addrspace(3) %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}];
+; CHECK: cp.async.mbarrier.arrive.shared.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriernoinc
 define void @asyncmbarriernoinc(ptr %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriernoincshared
 define void @asyncmbarriernoincshared(ptr addrspace(3) %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}];
+; CHECK: cp.async.mbarrier.arrive.noinc.shared.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %a)
   ret void
 }
 
-declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b)
+declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
 
 ; CHECK-LABEL: asynccasharedglobal4i8
-define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b) {
-; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4;
-; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4;
-  tail call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b)
+define void 

[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*

2023-05-18 Thread Artem Belevich via Phabricator via cfe-commits
tra updated this revision to Diff 523428.
tra added a comment.

Cosmetic test cleanup.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150820/new/

https://reviews.llvm.org/D150820

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/include/clang/Sema/Sema.h
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Sema/SemaChecking.cpp
  clang/test/CodeGen/builtins-nvptx.c
  clang/test/SemaCUDA/builtins.cu
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/test/CodeGen/NVPTX/async-copy.ll

Index: llvm/test/CodeGen/NVPTX/async-copy.ll
===
--- llvm/test/CodeGen/NVPTX/async-copy.ll
+++ llvm/test/CodeGen/NVPTX/async-copy.ll
@@ -1,35 +1,35 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s
 ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 
 declare void @llvm.nvvm.cp.async.wait.group(i32)
 
-; ALL-LABEL: asyncwaitgroup
+; CHECK-LABEL: asyncwaitgroup
 define void @asyncwaitgroup() {
-  ; ALL: cp.async.wait_group 8;
+  ; CHECK: cp.async.wait_group 8;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 8)
-  ; ALL: cp.async.wait_group 0;
+  ; CHECK: cp.async.wait_group 0;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 0)
-  ; ALL: cp.async.wait_group 16;
+  ; CHECK: cp.async.wait_group 16;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 16)
   ret void
 }
 
 declare void @llvm.nvvm.cp.async.wait.all()
 
-; ALL-LABEL: asyncwaitall
+; CHECK-LABEL: asyncwaitall
 define void @asyncwaitall() {
-; ALL: cp.async.wait_all
+; CHECK: cp.async.wait_all
   tail call void @llvm.nvvm.cp.async.wait.all()
   ret void
 }
 
 declare void @llvm.nvvm.cp.async.commit.group()
 
-; ALL-LABEL: asynccommitgroup
+; CHECK-LABEL: asynccommitgroup
 define void @asynccommitgroup() {
-; ALL: cp.async.commit_group
+; CHECK: cp.async.commit_group
   tail call void @llvm.nvvm.cp.async.commit.group()
   ret void
 }
@@ -41,72 +41,75 @@
 
 ; CHECK-LABEL: asyncmbarrier
 define void @asyncmbarrier(ptr %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}];
+; The distinction between PTX32/PTX64 here is only to capture pointer register type
+; in R to be used in subsequent tests.
+; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%[[R:r]]{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%[[R:rd]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriershared
 define void @asyncmbarriershared(ptr addrspace(3) %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}];
+; CHECK: cp.async.mbarrier.arrive.shared.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriernoinc
 define void @asyncmbarriernoinc(ptr %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriernoincshared
 define void @asyncmbarriernoincshared(ptr addrspace(3) %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}];
+; CHECK: cp.async.mbarrier.arrive.noinc.shared.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %a)
   ret void
 }
 
-declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b)
+declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
 
 ; CHECK-LABEL: asynccasharedglobal4i8
-define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b) {
-; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4;
-; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4;
-  tail call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b)
+define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) {
+; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 4, 

[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*

2023-05-18 Thread Artem Belevich via Phabricator via cfe-commits
tra updated this revision to Diff 523426.
tra added a comment.

Actually connected the Sema check for the optional argument, and added a test 
to cover it.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150820/new/

https://reviews.llvm.org/D150820

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/include/clang/Sema/Sema.h
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Sema/SemaChecking.cpp
  clang/test/CodeGen/builtins-nvptx.c
  clang/test/SemaCUDA/builtins.cu
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/test/CodeGen/NVPTX/async-copy.ll

Index: llvm/test/CodeGen/NVPTX/async-copy.ll
===
--- llvm/test/CodeGen/NVPTX/async-copy.ll
+++ llvm/test/CodeGen/NVPTX/async-copy.ll
@@ -1,35 +1,35 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s
 ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 
 declare void @llvm.nvvm.cp.async.wait.group(i32)
 
-; ALL-LABEL: asyncwaitgroup
+; CHECK-LABEL: asyncwaitgroup
 define void @asyncwaitgroup() {
-  ; ALL: cp.async.wait_group 8;
+  ; CHECK: cp.async.wait_group 8;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 8)
-  ; ALL: cp.async.wait_group 0;
+  ; CHECK: cp.async.wait_group 0;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 0)
-  ; ALL: cp.async.wait_group 16;
+  ; CHECK: cp.async.wait_group 16;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 16)
   ret void
 }
 
 declare void @llvm.nvvm.cp.async.wait.all()
 
-; ALL-LABEL: asyncwaitall
+; CHECK-LABEL: asyncwaitall
 define void @asyncwaitall() {
-; ALL: cp.async.wait_all
+; CHECK: cp.async.wait_all
   tail call void @llvm.nvvm.cp.async.wait.all()
   ret void
 }
 
 declare void @llvm.nvvm.cp.async.commit.group()
 
-; ALL-LABEL: asynccommitgroup
+; CHECK-LABEL: asynccommitgroup
 define void @asynccommitgroup() {
-; ALL: cp.async.commit_group
+; CHECK: cp.async.commit_group
   tail call void @llvm.nvvm.cp.async.commit.group()
   ret void
 }
@@ -41,72 +41,75 @@
 
 ; CHECK-LABEL: asyncmbarrier
 define void @asyncmbarrier(ptr %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}];
+; The distinction between PTX32/PTX64 here is only to capture pointer register type
+; in R to be used in subsequent tests.
+; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%[[R:r]]{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%[[R:rd]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriershared
 define void @asyncmbarriershared(ptr addrspace(3) %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}];
+; CHECK: cp.async.mbarrier.arrive.shared.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriernoinc
 define void @asyncmbarriernoinc(ptr %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriernoincshared
 define void @asyncmbarriernoincshared(ptr addrspace(3) %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}];
+; CHECK: cp.async.mbarrier.arrive.noinc.shared.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %a)
   ret void
 }
 
-declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b)
+declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
 
 ; CHECK-LABEL: asynccasharedglobal4i8
-define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b) {
-; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4;
-; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4;
-  tail call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b)
+define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) {
+; CHECK: 

[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*

2023-05-17 Thread Artem Belevich via Phabricator via cfe-commits
tra created this revision.
Herald added subscribers: mattd, gchakrabarti, asavonic, bixia, hiraditya.
Herald added a project: All.
tra updated this revision to Diff 523216.
tra added a comment.
tra retitled this revision from "[NVPTX] added src_size argument to 
__nvvm_cp_async* intrinsics." to "[NVPTX, CUDA] added optional src_size 
argument to __nvvm_cp_async*".
tra edited the summary of this revision.
Herald added a subscriber: yaxunl.
tra published this revision for review.
tra added reviewers: jlebar, nyalloc.
Herald added subscribers: llvm-commits, cfe-commits, jdoerfert, jholewinski.
Herald added projects: clang, LLVM.

Updated clang side.


The optional argument is needed for CUDA-11+ headers when we're compiling for  
sm_80+ GPUs.

For the intrinsics, the src_size argument is required now. Old calls w/o the 
src_size argument can be upgraded by using src_size=transfer size of the 
intrinsic.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D150820

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/include/clang/Sema/Sema.h
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Sema/SemaChecking.cpp
  clang/test/CodeGen/builtins-nvptx.c
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/test/CodeGen/NVPTX/async-copy.ll

Index: llvm/test/CodeGen/NVPTX/async-copy.ll
===
--- llvm/test/CodeGen/NVPTX/async-copy.ll
+++ llvm/test/CodeGen/NVPTX/async-copy.ll
@@ -1,35 +1,35 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s
 ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 
 declare void @llvm.nvvm.cp.async.wait.group(i32)
 
-; ALL-LABEL: asyncwaitgroup
+; CHECK-LABEL: asyncwaitgroup
 define void @asyncwaitgroup() {
-  ; ALL: cp.async.wait_group 8;
+  ; CHECK: cp.async.wait_group 8;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 8)
-  ; ALL: cp.async.wait_group 0;
+  ; CHECK: cp.async.wait_group 0;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 0)
-  ; ALL: cp.async.wait_group 16;
+  ; CHECK: cp.async.wait_group 16;
   tail call void @llvm.nvvm.cp.async.wait.group(i32 16)
   ret void
 }
 
 declare void @llvm.nvvm.cp.async.wait.all()
 
-; ALL-LABEL: asyncwaitall
+; CHECK-LABEL: asyncwaitall
 define void @asyncwaitall() {
-; ALL: cp.async.wait_all
+; CHECK: cp.async.wait_all
   tail call void @llvm.nvvm.cp.async.wait.all()
   ret void
 }
 
 declare void @llvm.nvvm.cp.async.commit.group()
 
-; ALL-LABEL: asynccommitgroup
+; CHECK-LABEL: asynccommitgroup
 define void @asynccommitgroup() {
-; ALL: cp.async.commit_group
+; CHECK: cp.async.commit_group
   tail call void @llvm.nvvm.cp.async.commit.group()
   ret void
 }
@@ -41,72 +41,75 @@
 
 ; CHECK-LABEL: asyncmbarrier
 define void @asyncmbarrier(ptr %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}];
+; The distinction between PTX32/PTX64 here is only to capture pointer register type
+; in R to be used in subsequent tests.
+; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%[[R:r]]{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%[[R:rd]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriershared
 define void @asyncmbarriershared(ptr addrspace(3) %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}];
+; CHECK: cp.async.mbarrier.arrive.shared.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriernoinc
 define void @asyncmbarriernoinc(ptr %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %a)
   ret void
 }
 
 ; CHECK-LABEL: asyncmbarriernoincshared
 define void @asyncmbarriernoincshared(ptr addrspace(3) %a) {
-; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}];
+; CHECK: cp.async.mbarrier.arrive.noinc.shared.b64 [%[[R]]{{[0-9]+}}];
   tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %a)
   ret void
 }
 
-declare void