[PATCH] D128158: [AMDGPU] Add amdgcn_sched_group_barrier builtin
kerbowa marked an inline comment as done. kerbowa added inline comments. Comment at: llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp:314 + +bool SchedGroup::isFull() const { + return MaxSize && Collection.size() >= *MaxSize; uabelho wrote: > Compiling with gcc, I get a warning that this function is unused. > I'm wondering, there seems to be both a const and a non-const version of the > isFull method now, but they are identical? Perhaps the non-const version > could be removed? Removed in 7898426a72, thanks! Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D128158/new/ https://reviews.llvm.org/D128158 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D128158: [AMDGPU] Add amdgcn_sched_group_barrier builtin
uabelho added inline comments. Comment at: llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp:314 + +bool SchedGroup::isFull() const { + return MaxSize && Collection.size() >= *MaxSize; Compiling with gcc, I get a warning that this function is unused. I'm wondering, there seems to be both a const and a non-const version of the isFull method now, but they are identical? Perhaps the non-const version could be removed? Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D128158/new/ https://reviews.llvm.org/D128158 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D128158: [AMDGPU] Add amdgcn_sched_group_barrier builtin
This revision was landed with ongoing or failed builds. This revision was automatically updated to reflect the committed changes. Closed by commit rGf5b21680d122: [AMDGPU] Add amdgcn_sched_group_barrier builtin (authored by kerbowa). Changed prior to commit: https://reviews.llvm.org/D128158?vs=445965=448378#toc Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D128158/new/ https://reviews.llvm.org/D128158 Files: clang/include/clang/Basic/BuiltinsAMDGPU.def clang/test/CodeGenOpenCL/builtins-amdgcn.cl clang/test/SemaOpenCL/builtins-amdgcn-error.cl llvm/include/llvm/IR/IntrinsicsAMDGPU.td llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp llvm/lib/Target/AMDGPU/SIInstructions.td llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir Index: llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir === --- /dev/null +++ llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir @@ -0,0 +1,254 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s + +--- | + define amdgpu_kernel void @no_sched_group_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_group_barrier_2_VMEM_1000_ALU_5_MFMA_2_VMEM_WRITE(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_group_barrier_MFMA_VALU_and_SALU_alternating(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + + !0 = distinct !{!0} + !1 = !{!1, !0} +... 
+ +--- +name: no_sched_group_barrier +tracksRegLiveness: true +body: | + bb.0: +; CHECK-LABEL: name: no_sched_group_barrier +; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF +; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF +; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) +; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) +; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF +; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec +; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec +; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec +; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec +; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec +; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec +; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) +; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec +; 
CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec +; CHECK-NEXT: S_NOP 0 +; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec +; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) +; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]] +%0:sreg_64 = IMPLICIT_DEF +%1:vgpr_32 = IMPLICIT_DEF +%2:areg_128 = IMPLICIT_DEF +%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from
[PATCH] D128158: [AMDGPU] Add amdgcn_sched_group_barrier builtin
jrbyrnes added a comment. LGTM Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D128158/new/ https://reviews.llvm.org/D128158 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D128158: [AMDGPU] Add amdgcn_sched_group_barrier builtin
kerbowa updated this revision to Diff 445965. kerbowa added a comment. Fix some bugs. Add better pipeline fitting. Address comments. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D128158/new/ https://reviews.llvm.org/D128158 Files: clang/include/clang/Basic/BuiltinsAMDGPU.def clang/test/CodeGenOpenCL/builtins-amdgcn.cl clang/test/SemaOpenCL/builtins-amdgcn-error.cl llvm/include/llvm/IR/IntrinsicsAMDGPU.td llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp llvm/lib/Target/AMDGPU/SIInstructions.td llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir Index: llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir === --- /dev/null +++ llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir @@ -0,0 +1,254 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s + +--- | + define amdgpu_kernel void @no_sched_group_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_group_barrier_2_VMEM_1000_ALU_5_MFMA_2_VMEM_WRITE(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_group_barrier_MFMA_VALU_and_SALU_alternating(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + + !0 = distinct !{!0} + !1 = !{!1, !0} +... 
+ +--- +name: no_sched_group_barrier +tracksRegLiveness: true +body: | + bb.0: +; CHECK-LABEL: name: no_sched_group_barrier +; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF +; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF +; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) +; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) +; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF +; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec +; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec +; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec +; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec +; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec +; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec +; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) +; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec +; 
CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec +; CHECK-NEXT: S_NOP 0 +; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec +; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) +; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]] +%0:sreg_64 = IMPLICIT_DEF +%1:vgpr_32 = IMPLICIT_DEF +%2:areg_128 = IMPLICIT_DEF +%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) +%4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec +GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32)
[PATCH] D128158: [AMDGPU] Add amdgcn_sched_group_barrier builtin
arsenm added inline comments. Comment at: clang/test/SemaOpenCL/builtins-amdgcn-error.cl:70 +{ + __builtin_amdgcn_sched_group_barrier(x, 0, 1); // expected-error {{argument to '__builtin_amdgcn_sched_group_barrier' must be a constant integer}} +} Test error for each argument? Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D128158/new/ https://reviews.llvm.org/D128158 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D128158: [AMDGPU] Add amdgcn_sched_group_barrier builtin
jrbyrnes added a comment. Hey Austin -- I like the removal of canAddMIs. In the original design, I was leaving open the possibility for users to pass in canAddMIs rather than a mask / SchedGroup name, but it looks like this isn't the direction we're going, and having the classification functions defined in a general canAddMI makes things easier. I see this is a WIP, but I've added some thoughts I had from reading it over. I may have more as I use the design for my patch. Comment at: llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp:199 + // SchedGroupMask of instructions that should be barred. + SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const; + I find it confusing that SchedBarrier uses inversion while SchedGroupBarrier doesn't. Comment at: llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp:306 +bool SchedGroup::isFull() const { + return MaxSize.hasValue() && Collection.size() >= *MaxSize; +} As in the update to IGroupLP.cpp in trunk, it seems like we are not supposed to use hasValue. Comment at: llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp:349 + add(InitSU); + assert(MaxSize.hasValue()); + (*MaxSize)++; Is it not possible to have unsized groups? Comment at: llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp:445 + // initialized all of the SCHED_GROUP_BARRIER SchedGroups. + addSchedGroupBarrierEdges(); } If both types of barriers are present, the SchedBarriers are handled first. However, if there is a conflict between SchedBarrier and SchedGroupBarrier, should SchedBarrier always get the priority? Maybe SchedBarrier should only handle groups not present in SchedGroupBarrier? Comment at: llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir:104 +GLOBAL_STORE_DWORD_SADDR %1, %13, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) +; 1 VMEM_READ +SCHED_GROUP_BARRIER 32, 1, 0 I think you are aware of this issue. 
But the ability of the mutation to match the pipeline depends on which instructions go into which group (when an instruction can be mapped to multiple groups). For example, suppose we had the SchedGroups 2 VMEM_READ, 1 VALU, 1 MFMA, 2 VMEM_READ and the initial schedule VMEMR, VALU, VMEMR, MFMA, VMEMR, with a dependency from the middle VMEMR to the MFMA. initSchedGroup will add the middle VMEMR to the last VMEMR group, but we could get a more accurate pipeline by adding it to the first group. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D128158/new/ https://reviews.llvm.org/D128158 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D128158: [AMDGPU] Add amdgcn_sched_group_barrier builtin
kerbowa added a comment. Somewhat WIP; needs more tests and cleanup. Posted for dependent work. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D128158/new/ https://reviews.llvm.org/D128158 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D128158: [AMDGPU] Add amdgcn_sched_group_barrier builtin
kerbowa created this revision. kerbowa added reviewers: rampitec, jrbyrnes, vangthao95, arsenm. Herald added subscribers: kosarev, jsilvanus, foad, hiraditya, t-tye, tpr, dstuttard, yaxunl, nhaehnle, jvesely, kzhuravl. Herald added a project: All. kerbowa requested review of this revision. Herald added subscribers: llvm-commits, cfe-commits, wdng. Herald added projects: clang, LLVM. This builtin allows the creation of custom scheduling pipelines on a per-region basis. Like the sched_barrier builtin, this is intended to be used for testing, in situations where the default scheduler heuristics cannot be improved, or in critical kernels where users are trying to get performance that is close to handwritten assembly. Obviously, using these builtins will require extra work from the kernel writer to maintain the desired behavior. The builtin can be used to create groups of instructions called "scheduling groups", where ordering between the groups is enforced by the scheduler. __builtin_amdgcn_sched_group_barrier takes three parameters. The first parameter is a mask that determines the types of instructions that you would like to synchronize around and add to a scheduling group. These instructions will be selected from the bottom up, starting from the sched_group_barrier's location, during instruction scheduling. The second parameter is the number of matching instructions that will be associated with this sched_group_barrier. The third parameter is an identifier that describes which other sched_group_barriers this one should be synchronized with. Note that multiple sched_group_barriers must be added in order for them to be useful, since they only synchronize with other sched_group_barriers. Only "scheduling groups" with a matching third parameter will have any enforced ordering between them. As an example, the code below tries to create a pipeline of 1 VMEM_READ instruction followed by 1 VALU instruction followed by 5 MFMA instructions... 
// 1 VMEM_READ __builtin_amdgcn_sched_group_barrier(32, 1, 0) // 1 VALU __builtin_amdgcn_sched_group_barrier(2, 1, 0) // 5 MFMA __builtin_amdgcn_sched_group_barrier(8, 5, 0) // 1 VMEM_READ __builtin_amdgcn_sched_group_barrier(32, 1, 0) // 3 VALU __builtin_amdgcn_sched_group_barrier(2, 3, 0) // 2 VMEM_WRITE __builtin_amdgcn_sched_group_barrier(64, 2, 0) Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D128158 Files: clang/include/clang/Basic/BuiltinsAMDGPU.def clang/test/CodeGenOpenCL/builtins-amdgcn.cl clang/test/SemaOpenCL/builtins-amdgcn-error.cl llvm/include/llvm/IR/IntrinsicsAMDGPU.td llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp llvm/lib/Target/AMDGPU/SIInstrInfo.cpp llvm/lib/Target/AMDGPU/SIInstructions.td llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir Index: llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir === --- /dev/null +++ llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir @@ -0,0 +1,173 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx908 -misched-cluster=false -amdgpu-disable-power-sched=true -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s + +--- | + define amdgpu_kernel void @no_sched_group_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_group_barrier_2_VMEM_1000_ALU_5_MFMA_2_VMEM_WRITE(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + + !0 = distinct !{!0} + !1 = !{!1, !0} +... 
+ +--- +name: no_sched_group_barrier +tracksRegLiveness: true +body: | + bb.0: +; CHECK-LABEL: name: no_sched_group_barrier +; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF +; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF +; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) +; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) +; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF +; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec +; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec +; CHECK-NEXT: