[PATCH] D124700: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic

2022-05-11 Thread Austin Kerbow via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG2db700215a2e: [AMDGPU] Add llvm.amdgcn.sched.barrier 
intrinsic (authored by kerbowa).
Herald added a subscriber: kosarev.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124700/new/

https://reviews.llvm.org/D124700

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
  llvm/lib/Target/AMDGPU/SIInstructions.td
  llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
  llvm/test/CodeGen/AMDGPU/hazard-pseudo-machineinstrs.mir
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
  llvm/test/CodeGen/AMDGPU/sched_barrier.mir

Index: llvm/test/CodeGen/AMDGPU/sched_barrier.mir
===
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/sched_barrier.mir
@@ -0,0 +1,99 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @no_sched_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+
+  !0 = distinct !{!0}
+  !1 = !{!1, !0}
+...
+
+---
+name: no_sched_barrier
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: no_sched_barrier
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_NOP 0
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_ENDPGM 0
+...
+
+---
+name: sched_barrier_0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_0
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: SCHED_BARRIER 0
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)

[PATCH] D124700: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic

2022-05-06 Thread Stanislav Mekhanoshin via Phabricator via cfe-commits
rampitec accepted this revision.
rampitec added a comment.

LGTM


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124700/new/

https://reviews.llvm.org/D124700



[PATCH] D124700: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic

2022-05-06 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa updated this revision to Diff 427747.
kerbowa added a comment.
Herald added a subscriber: jsilvanus.

Use i32.
Output hex.
Fix hazard rec tests for pseudo instructions.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124700/new/

https://reviews.llvm.org/D124700

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
  llvm/lib/Target/AMDGPU/SIInstructions.td
  llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
  llvm/test/CodeGen/AMDGPU/hazard-pseudo-machineinstrs.mir
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
  llvm/test/CodeGen/AMDGPU/sched_barrier.mir

Index: llvm/test/CodeGen/AMDGPU/sched_barrier.mir
===
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/sched_barrier.mir
@@ -0,0 +1,99 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @no_sched_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+
+  !0 = distinct !{!0}
+  !1 = !{!1, !0}
+...
+
+---
+name: no_sched_barrier
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: no_sched_barrier
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_NOP 0
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_ENDPGM 0
+...
+
+---
+name: sched_barrier_0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_0
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: SCHED_BARRIER 0
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0

[PATCH] D124700: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic

2022-04-29 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added a comment.

Can you add a test to make sure the hazard recognizer and code size estimate
don't think this is a real instruction?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124700/new/

https://reviews.llvm.org/D124700



[PATCH] D124700: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic

2022-04-29 Thread Stanislav Mekhanoshin via Phabricator via cfe-commits
rampitec accepted this revision.
rampitec added a comment.
This revision is now accepted and ready to land.

In D124700#3483715, @kerbowa wrote:

> In D124700#3483633, @rampitec wrote:
>
>> In D124700#3483609, @kerbowa wrote:
>>
>>> In D124700#3483556, @rampitec wrote:
>>>
>>>> You do not handle masks other than 0 yet?
>>>
>>> We handle 0 and 1 only.
>>
>> Do you mean 1 is supported simply because it has side effects? If I
>> understand it right you will need to remove this to support more flexible
>> masks, right?
>
> Yes.

LGTM given that. But change imm to i32 before committing.
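
For reference, a sketch of the suggested i32 form (illustrative only; the
committed definition may differ in detail):

  def int_amdgcn_sched_barrier : GCCBuiltin<"__builtin_amdgcn_sched_barrier">,
    Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                  IntrHasSideEffects, IntrConvergent,
                                  IntrWillReturn]>;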


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124700/new/

https://reviews.llvm.org/D124700



[PATCH] D124700: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic

2022-04-29 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa added a comment.

In D124700#3483633, @rampitec wrote:

> In D124700#3483609, @kerbowa wrote:
>
>> In D124700#3483556, @rampitec wrote:
>>
>>> You do not handle masks other than 0 yet?
>>
>> We handle 0 and 1 only.
>
> Do you mean 1 is supported simply because it has side effects? If I
> understand it right you will need to remove this to support more flexible
> masks, right?

Yes.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124700/new/

https://reviews.llvm.org/D124700



[PATCH] D124700: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic

2022-04-29 Thread Stanislav Mekhanoshin via Phabricator via cfe-commits
rampitec added a comment.

In D124700#3483609, @kerbowa wrote:

> In D124700#3483556, @rampitec wrote:
>
>> You do not handle masks other than 0 yet?
>
> We handle 0 and 1 only.

Do you mean 1 is supported simply because it has side effects? If I understand
it right you will need to remove this to support more flexible masks, right?




Comment at: llvm/include/llvm/IR/IntrinsicsAMDGPU.td:219
+// MASK = 0: No instructions may be scheduled across SCHED_BARRIER.
+// MASK = 1: Non-memory, non-side-effect producing instructions may be
+//           scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.

kerbowa wrote:
> rampitec wrote:
> > Since you are going to extend it I'd suggest this be -1. Then you will
> > start carving bits out of it. That way, if someone starts to use it, it
> > will still work after an update.
> Since the most common use case will be to block all instruction types, I
> thought having that be MASK = 0 made the most sense. After that, we carve out
> bits for the types of instructions that should be scheduled across it.
>
> There may be modes where we restrict certain types of memops, so we cannot
> have MASK = 1 above changed to -1. Since this (MASK = 1) allows all ALU
> across, we could define which bits mean VALU/SALU/MFMA etc. and use that mask
> if you think it's better. I'm worried we won't be able to anticipate all the
> types that we could want to be maskable. It might be better to just have a
> single bit that can mean all ALU, or all MemOps, and so on, to avoid this
> problem.
Ok. Let it be 1.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124700/new/

https://reviews.llvm.org/D124700



[PATCH] D124700: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic

2022-04-29 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa added a comment.

In D124700#3483556, @rampitec wrote:

> You do not handle masks other than 0 yet?

We handle 0 and 1 only.




Comment at: llvm/include/llvm/IR/IntrinsicsAMDGPU.td:219
+// MASK = 0: No instructions may be scheduled across SCHED_BARRIER.
+// MASK = 1: Non-memory, non-side-effect producing instructions may be
+//           scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.

rampitec wrote:
> Since you are going to extend it I'd suggest this be -1. Then you will start
> carving bits out of it. That way, if someone starts to use it, it will still
> work after an update.
Since the most common use case will be to block all instruction types, I
thought having that be MASK = 0 made the most sense. After that, we carve out
bits for the types of instructions that should be scheduled across it.

There may be modes where we restrict certain types of memops, so we cannot
have MASK = 1 above changed to -1. Since this (MASK = 1) allows all ALU
across, we could define which bits mean VALU/SALU/MFMA etc. and use that mask
if you think it's better. I'm worried we won't be able to anticipate all the
types that we could want to be maskable. It might be better to just have a
single bit that can mean all ALU, or all MemOps, and so on, to avoid this
problem.
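
To make the possible carving concrete, a purely hypothetical extension of the
comment block above might read as follows (the bit assignments are
illustrative only and not part of this patch):

  // MASK = 0x0: No instructions may be scheduled across SCHED_BARRIER.
  // MASK = 0x1: All ALU instructions may be scheduled across SCHED_BARRIER.
  // MASK = 0x2: VALU instructions may be scheduled across SCHED_BARRIER.
  // MASK = 0x4: SALU instructions may be scheduled across SCHED_BARRIER.
  // MASK = 0x8: MFMA instructions may be scheduled across SCHED_BARRIER.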



Comment at: llvm/include/llvm/IR/IntrinsicsAMDGPU.td:222
+def int_amdgcn_sched_barrier : GCCBuiltin<"__builtin_amdgcn_sched_barrier">,
+  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
+                                IntrHasSideEffects, IntrConvergent, IntrWillReturn]>;

rampitec wrote:
> Why not a full i32? It is an immediate anyway, but you will have more bits
> for the future.
Good point, thanks.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124700/new/

https://reviews.llvm.org/D124700



[PATCH] D124700: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic

2022-04-29 Thread Stanislav Mekhanoshin via Phabricator via cfe-commits
rampitec added a comment.

You do not handle masks other than 0 yet?




Comment at: llvm/include/llvm/IR/IntrinsicsAMDGPU.td:219
+// MASK = 0: No instructions may be scheduled across SCHED_BARRIER.
+// MASK = 1: Non-memory, non-side-effect producing instructions may be
+//           scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.

Since you are going to extend it I'd suggest this be -1. Then you will start
carving bits out of it. That way, if someone starts to use it, it will still
work after an update.



Comment at: llvm/include/llvm/IR/IntrinsicsAMDGPU.td:222
+def int_amdgcn_sched_barrier : GCCBuiltin<"__builtin_amdgcn_sched_barrier">,
+  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
+                                IntrHasSideEffects, IntrConvergent, IntrWillReturn]>;

Why not a full i32? It is an immediate anyway, but you will have more bits for
the future.



Comment at: llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp:213
+    OutStreamer->emitRawComment(" sched_barrier mask(" +
+                                Twine(MI->getOperand(0).getImm()) + ")");
+  }

Use hex?
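
One possible shape for hex output (a sketch only; Twine::utohexstr is the
existing LLVM helper assumed here):

  if (MI->getOpcode() == AMDGPU::SCHED_BARRIER) {
    // Print the mask in hex so individual bits are easy to read.
    OutStreamer->emitRawComment(" sched_barrier mask(0x" +
                                Twine::utohexstr(MI->getOperand(0).getImm()) +
                                ")");
  }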


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124700/new/

https://reviews.llvm.org/D124700



[PATCH] D124700: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic

2022-04-29 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa updated this revision to Diff 426169.
kerbowa added a comment.

Add mir tests.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124700/new/

https://reviews.llvm.org/D124700

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
  llvm/lib/Target/AMDGPU/SIInstructions.td
  llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
  llvm/test/CodeGen/AMDGPU/sched_barrier.mir

Index: llvm/test/CodeGen/AMDGPU/sched_barrier.mir
===
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/sched_barrier.mir
@@ -0,0 +1,99 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @no_sched_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+
+  !0 = distinct !{!0}
+  !1 = !{!1, !0}
+...
+
+---
+name: no_sched_barrier
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: no_sched_barrier
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_NOP 0
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_ENDPGM 0
+...
+
+---
+name: sched_barrier_0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_0
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: SCHED_BARRIER 0
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)

[PATCH] D124700: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic

2022-04-29 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa created this revision.
Herald added subscribers: hsmhsm, foad, hiraditya, t-tye, tpr, dstuttard, 
yaxunl, nhaehnle, jvesely, kzhuravl, arsenm.
Herald added a project: All.
kerbowa requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, wdng.
Herald added projects: clang, LLVM.

Adds an intrinsic/builtin that can be used to fine-tune scheduler behavior. If
there is a need for highly optimized codegen, and kernel developers have
knowledge of inter-wave runtime behavior that is unknown to the compiler, this
builtin can be used to tune scheduling.

This intrinsic creates a barrier between scheduling regions. The immediate
parameter is a mask that determines which types of instructions should be
prevented from crossing the sched_barrier. In this initial patch there are only
two variations: a mask of 0 means that no instructions may be scheduled across
the sched_barrier, and a mask of 1 means that non-memory, non-side-effect
inducing instructions may cross it.

Note that this intrinsic is only meant to work with the scheduling passes. Any
other transformations that may move code are not affected in the ways described
above.
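
By way of illustration, a minimal IR-level sketch of the intended use (a
hypothetical kernel; the i16 signature matches this initial revision):

  define amdgpu_kernel void @demo(float addrspace(1)* %in,
                                  float addrspace(1)* %out) {
  entry:
    %v = load float, float addrspace(1)* %in
    ; Mask 0: nothing may be scheduled across this point.
    call void @llvm.amdgcn.sched.barrier(i16 0)
    %w = fadd float %v, %v
    store float %w, float addrspace(1)* %out
    ret void
  }

  declare void @llvm.amdgcn.sched.barrier(i16)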


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D124700

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
  llvm/lib/Target/AMDGPU/SIInstructions.td
  llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll

Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
===
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_kernel void @test_wave_barrier() #0 {
+; GCN-LABEL: test_wave_barrier:
+; GCN:   ; %bb.0: ; %entry
+; GCN-NEXT:; sched_barrier mask(0)
+; GCN-NEXT:; sched_barrier mask(1)
+; GCN-NEXT:; sched_barrier mask(4)
+; GCN-NEXT:; sched_barrier mask(15)
+; GCN-NEXT:s_endpgm
+entry:
+  call void @llvm.amdgcn.sched.barrier(i16 0) #1
+  call void @llvm.amdgcn.sched.barrier(i16 1) #1
+  call void @llvm.amdgcn.sched.barrier(i16 4) #1
+  call void @llvm.amdgcn.sched.barrier(i16 15) #1
+  ret void
+}
+
+declare void @llvm.amdgcn.sched.barrier(i16) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
===
--- llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -148,6 +148,7 @@
 switch (II->getIntrinsicID()) {
 case Intrinsic::amdgcn_s_barrier:
 case Intrinsic::amdgcn_wave_barrier:
+case Intrinsic::amdgcn_sched_barrier:
   return false;
 default:
   break;
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -313,6 +313,18 @@
   let Size = 0;
 }
 
+def SCHED_BARRIER : SPseudoInstSI<(outs), (ins i16imm:$mask),
+  [(int_amdgcn_sched_barrier (i16 timm:$mask))]> {
+  let SchedRW = [];
+  let hasNoSchedulingInfo = 1;
+  let hasSideEffects = 1;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let isConvergent = 1;
+  let FixedSize = 1;
+  let Size = 0;
+}
+
 // SI pseudo instructions. These are used by the CFG structurizer pass
 // and should be lowered to ISA instructions prior to codegen.
 
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1773,6 +1773,7 @@
   // hazard, even if one exist, won't really be visible. Should we handle it?
   case AMDGPU::SI_MASKED_UNREACHABLE:
   case AMDGPU::WAVE_BARRIER:
+  case AMDGPU::SCHED_BARRIER:
 return 0;
   }
 }
@@ -3490,6 +3491,9 @@
   if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
 return true;
 
+  if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
+return true;
+
   // Target-independent instructions do not have an implicit-use of EXEC, even
   // when they operate on VGPRs. Treating EXEC modifications as scheduling
   // boundaries prevents incorrect movements of such instructions.
Index: llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
===
--- llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -207,6 +207,14 @@
   return;