kerbowa created this revision. Herald added subscribers: hsmhsm, foad, hiraditya, t-tye, tpr, dstuttard, yaxunl, nhaehnle, jvesely, kzhuravl, arsenm. Herald added a project: All. kerbowa requested review of this revision. Herald added subscribers: llvm-commits, cfe-commits, wdng. Herald added projects: clang, LLVM.
Adds an intrinsic/builtin that can be used to fine-tune scheduler behavior. If there is a need for highly optimized codegen, and kernel developers have knowledge of inter-wave runtime behavior that is unknown to the compiler, this builtin can be used to tune scheduling. This intrinsic creates a barrier between scheduling regions. The immediate parameter is a mask that determines which types of instructions are allowed to be scheduled across the sched_barrier; instructions of any type not enabled in the mask are prevented from crossing it. In this initial patch, there are only two variations. A mask of 0 means that no instructions may be scheduled across the sched_barrier. A mask of 1 means that non-memory, non-side-effect-inducing instructions may cross the sched_barrier. Note that this intrinsic is only meant to work with the scheduling passes. Any other transformations that may move code will not be impacted in the ways described above. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D124700 Files: clang/include/clang/Basic/BuiltinsAMDGPU.def clang/test/CodeGenOpenCL/builtins-amdgcn.cl clang/test/SemaOpenCL/builtins-amdgcn-error.cl llvm/include/llvm/IR/IntrinsicsAMDGPU.td llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp llvm/lib/Target/AMDGPU/SIInstrInfo.cpp llvm/lib/Target/AMDGPU/SIInstructions.td llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +define amdgpu_kernel void @test_wave_barrier() #0 { +; GCN-LABEL: test_wave_barrier: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: ; sched_barrier mask(0) +; GCN-NEXT: ; sched_barrier mask(1) +; GCN-NEXT: ; sched_barrier mask(4) +; GCN-NEXT: ; sched_barrier mask(15) +; GCN-NEXT: s_endpgm +entry: + call void @llvm.amdgcn.sched.barrier(i16 0) #1 + call void @llvm.amdgcn.sched.barrier(i16 1) #1 + call void @llvm.amdgcn.sched.barrier(i16 4) #1 + call void @llvm.amdgcn.sched.barrier(i16 15) #1 + ret void +} + +declare void @llvm.amdgcn.sched.barrier(i16) #1 + +attributes #0 = { nounwind } +attributes #1 = { convergent nounwind } Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp =================================================================== --- llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp @@ -148,6 +148,7 @@ switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_s_barrier: case Intrinsic::amdgcn_wave_barrier: + case Intrinsic::amdgcn_sched_barrier: return false; default: break; Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -313,6 +313,18 @@ let Size = 0; } +def SCHED_BARRIER : SPseudoInstSI<(outs), (ins i16imm:$mask), + [(int_amdgcn_sched_barrier (i16 timm:$mask))]> { + let SchedRW = []; + let hasNoSchedulingInfo = 1; + let hasSideEffects = 1; + let mayLoad = 0; + let mayStore = 0; + let isConvergent = 1; + let FixedSize = 1; + let Size = 0; +} + // SI pseudo 
instructions. These are used by the CFG structurizer pass // and should be lowered to ISA instructions prior to codegen. Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1773,6 +1773,7 @@ // hazard, even if one exist, won't really be visible. Should we handle it? case AMDGPU::SI_MASKED_UNREACHABLE: case AMDGPU::WAVE_BARRIER: + case AMDGPU::SCHED_BARRIER: return 0; } } @@ -3490,6 +3491,9 @@ if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) return true; + if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0) + return true; + // Target-independent instructions do not have an implicit-use of EXEC, even // when they operate on VGPRs. Treating EXEC modifications as scheduling // boundaries prevents incorrect movements of such instructions. Index: llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -207,6 +207,14 @@ return; } + if (MI->getOpcode() == AMDGPU::SCHED_BARRIER) { + if (isVerbose()) { + OutStreamer->emitRawComment(" sched_barrier mask(" + + Twine(MI->getOperand(0).getImm()) + ")"); + } + return; + } + if (MI->getOpcode() == AMDGPU::SI_MASKED_UNREACHABLE) { if (isVerbose()) OutStreamer->emitRawComment(" divergent unreachable"); Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -213,6 +213,15 @@ def int_amdgcn_wave_barrier : GCCBuiltin<"__builtin_amdgcn_wave_barrier">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn]>; +// The 1st parameter is a mask for the types of instructions that may be allowed +// to cross the SCHED_BARRIER during 
scheduling. +// MASK = 0: No instructions may be scheduled across SCHED_BARRIER. +// MASK = 1: Non-memory, non-side-effect producing instructions may be +// scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass. +def int_amdgcn_sched_barrier : GCCBuiltin<"__builtin_amdgcn_sched_barrier">, + Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, + IntrHasSideEffects, IntrConvergent, IntrWillReturn]>; + def int_amdgcn_s_waitcnt : GCCBuiltin<"__builtin_amdgcn_s_waitcnt">, Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; Index: clang/test/SemaOpenCL/builtins-amdgcn-error.cl =================================================================== --- clang/test/SemaOpenCL/builtins-amdgcn-error.cl +++ clang/test/SemaOpenCL/builtins-amdgcn-error.cl @@ -60,6 +60,12 @@ __builtin_amdgcn_s_setprio(65536); // expected-warning {{implicit conversion from 'int' to 'short' changes value from 65536 to 0}} } +void test_sched_barrier(int x) +{ + __builtin_amdgcn_sched_barrier(x); // expected-error {{argument to '__builtin_amdgcn_sched_barrier' must be a constant integer}} + __builtin_amdgcn_sched_barrier(65536); // expected-warning {{implicit conversion from 'int' to 'short' changes value from 65536 to 0}} +} + void test_sicmp_i32(global ulong* out, int a, int b, uint c) { *out = __builtin_amdgcn_sicmp(a, b, c); // expected-error {{argument to '__builtin_amdgcn_sicmp' must be a constant integer}} Index: clang/test/CodeGenOpenCL/builtins-amdgcn.cl =================================================================== --- clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -396,6 +396,19 @@ __builtin_amdgcn_wave_barrier(); } +// CHECK-LABEL: @test_sched_barrier +// CHECK: call void @llvm.amdgcn.sched.barrier(i16 0) +// CHECK: call void @llvm.amdgcn.sched.barrier(i16 1) +// CHECK: call void @llvm.amdgcn.sched.barrier(i16 4) +// CHECK: call void @llvm.amdgcn.sched.barrier(i16 15) 
+void test_sched_barrier() +{ + __builtin_amdgcn_sched_barrier(0); + __builtin_amdgcn_sched_barrier(1); + __builtin_amdgcn_sched_barrier(4); + __builtin_amdgcn_sched_barrier(15); +} + // CHECK-LABEL: @test_s_sleep // CHECK: call void @llvm.amdgcn.s.sleep(i32 1) // CHECK: call void @llvm.amdgcn.s.sleep(i32 15) Index: clang/include/clang/Basic/BuiltinsAMDGPU.def =================================================================== --- clang/include/clang/Basic/BuiltinsAMDGPU.def +++ clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -62,6 +62,7 @@ BUILTIN(__builtin_amdgcn_s_sendmsghalt, "vIiUi", "n") BUILTIN(__builtin_amdgcn_s_barrier, "v", "n") BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n") +BUILTIN(__builtin_amdgcn_sched_barrier, "vIs", "n") BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n") BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n") BUILTIN(__builtin_amdgcn_ds_gws_init, "vUiUi", "n")
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits