llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: hidekisaito <details> <summary>Changes</summary> Add MFMAValuSpacingOpt strategy in pre-RA scheduler. Tries to schedule equal number of VALU MIR instructions between MFMAs. Assisted-by: Cursor --- Full diff: https://github.com/llvm/llvm-project/pull/190916.diff 7 Files Affected: - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn.cl (+4-2) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+7-2) - (modified) llvm/lib/IR/Verifier.cpp (+10) - (modified) llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp (+74-1) - (modified) llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp (+26) - (added) llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.ll (+72) - (added) llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.mir (+35) ``````````diff diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 0fc40d396a87d..cd4cb103291dd 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -980,14 +980,16 @@ void test_sched_group_barrier() // CHECK-LABEL: @test_iglp_opt // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 0) // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 1) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 2) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 3) // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 4) -// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 15) void test_iglp_opt() { __builtin_amdgcn_iglp_opt(0); __builtin_amdgcn_iglp_opt(1); + __builtin_amdgcn_iglp_opt(2); + __builtin_amdgcn_iglp_opt(3); __builtin_amdgcn_iglp_opt(4); - __builtin_amdgcn_iglp_opt(15); } // CHECK-LABEL: @test_s_sleep diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index f576972183eca..3efa959241266 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -377,8 +377,13 @@ def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_ [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; -// Scheduler optimization hint. -// MASK = 0: Small gemm opt +// Scheduler optimization hint: immediate selects the IGLP strategy in +// AMDGPUIGroupLP.cpp (createIGLPStrategy). Must be a constant in [0,4]. Mappings: +// 0 - MFMA small-GEMM scheduling (MFMASmallGemmOpt). +// 1 - MFMA small-GEMM single-wave variant (MFMASmallGemmSingleWaveOpt). +// 2 - MFMAExpInterleaveOpt (TRANS/MFMA scheduling pipeline; see AMDGPUIGroupLP.cpp). +// 3 - MFMAExpSimpleInterleaveOpt (simpler TRANS-then-MFMA interleave pattern). +// 4 - MFMA (or WMMA) / VALU spacing (MFMAValuSpacingOpt). def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">, Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index cf9131c66d6c3..36e5f96215783 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -7097,6 +7097,16 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { "llvm.amdgcn.s.prefetch.data only supports global or constant memory"); break; } + case Intrinsic::amdgcn_iglp_opt: { + const auto *Mask = dyn_cast<ConstantInt>(Call.getArgOperand(0)); + Check(Mask, "llvm.amdgcn.iglp.opt requires a constant mask argument", &Call, + Call.getArgOperand(0)); + const int64_t V = Mask->getSExtValue(); + Check(V >= 0 && V <= 4, + "llvm.amdgcn.iglp.opt mask must be in the range [0,4]", &Call, + Call.getArgOperand(0)); + break; + } case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4: case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: { Value *Src0 = Call.getArgOperand(0); diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index d49ec90e4c212..363c34c07e2fe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -823,7 +823,8 @@ enum IGLPStrategyID : int { MFMASmallGemmOptID = 0, MFMASmallGemmSingleWaveOptID = 1, MFMAExpInterleaveID = 2, - MFMAExpSimpleInterleaveID = 3 + MFMAExpSimpleInterleaveID = 3, + MFMAValuSpacingOptID = 4, }; // Implement a IGLP scheduling strategy. @@ -896,6 +897,76 @@ bool MFMASmallGemmOpt::applyIGLPStrategy( return true; } +static bool isMFMAValuSpacingGapValu(const MachineInstr &MI, + const SIInstrInfo *TII) { + if (MI.isMetaInstruction()) + return false; + return TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI) && + !MI.mayLoadOrStore(); +} + +/// Interleave MFMA/WMMA with VALU slots: each repeating stage is one MFMA (or +/// WMMA), then up to N VALU ops per gap where N = floor(#VALU / #MFMA) in this +/// schedule region (same predicate as \c isMFMAValuSpacingGapValu), at least 1. +/// Template length uses MFMACount * 3 for slack, like MFMASmallGemmOpt. +/// \p IsBottomUp is false so SchedGroup pipeline order matches forward program +/// order (MFMA before its VALU gap). 
+class MFMAValuSpacingOpt final : public IGLPStrategy { +public: + bool applyIGLPStrategy( + DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, + DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, + AMDGPU::SchedulingPhase Phase) override; + + bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, + AMDGPU::SchedulingPhase Phase) override { + for (const MachineInstr &I : *DAG) + if (TII->isMFMAorWMMA(I)) + return true; + return false; + } + + MFMAValuSpacingOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) + : IGLPStrategy(DAG, TII) { + IsBottomUp = false; + } +}; + +bool MFMAValuSpacingOpt::applyIGLPStrategy( + DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, + DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, + AMDGPU::SchedulingPhase Phase) { + unsigned MFMACount = 0; + unsigned ValuCount = 0; + for (const MachineInstr &I : *DAG) { + if (TII->isMFMAorWMMA(I)) + ++MFMACount; + else if (isMFMAValuSpacingGapValu(I, TII)) + ++ValuCount; + } + + unsigned ValuGap = 1; + if (MFMACount > 0) { + ValuGap = ValuCount / MFMACount; + if (ValuGap < 1) + ValuGap = 1; + } + + const unsigned PipelineSyncID = 0; + SchedGroup *SG = nullptr; + for (unsigned I = 0; I < MFMACount * 3; ++I) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, ValuGap, PipelineSyncID, DAG, TII); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); + } + + return true; +} + class MFMAExpInterleaveOpt final : public IGLPStrategy { private: // The count of TRANS SUs involved in the interleaved pipeline @@ -2327,6 +2398,8 @@ createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG, return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII); case MFMAExpSimpleInterleaveID: return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII); + case MFMAValuSpacingOptID: + return 
std::make_unique<MFMAValuSpacingOpt>(DAG, TII); } llvm_unreachable("Unknown IGLPStrategyID"); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index ad24bad1fd5d7..94d13bd01af26 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -3244,6 +3244,21 @@ static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { }); } +static bool hasIGLPOpt(ScheduleDAGInstrs *DAG, int64_t StrategyImm) { + return any_of(*DAG, [StrategyImm](MachineBasicBlock::iterator MI) { + return MI->getOpcode() == AMDGPU::IGLP_OPT && MI->getNumOperands() >= 1 && + MI->getOperand(0).isImm() && + MI->getOperand(0).getImm() == StrategyImm; + }); +} + +static bool hasSchedBarrier(ScheduleDAGInstrs *DAG) { + return any_of(*DAG, [](MachineBasicBlock::iterator MI) { + unsigned Opc = MI->getOpcode(); + return Opc == AMDGPU::SCHED_BARRIER || Opc == AMDGPU::SCHED_GROUP_BARRIER; + }); +} + GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive( MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S, bool RemoveKillFlags) @@ -3252,6 +3267,17 @@ GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive( void GCNPostScheduleDAGMILive::schedule() { HasIGLPInstrs = hasIGLPInstrs(this); if (HasIGLPInstrs) { + // MFMAValuSpacingOpt is a pre-RA strategy whose interleaving is correct + // after the initial machine scheduler. The post-RA scheduler would undo + // the reordering, so preserve the pre-RA schedule by skipping here. + // When SCHED_[GROUP_]BARRIER coexists with IGLP_OPT, IGroupLP ignores the + // IGLP_OPT (they are mutually exclusive), so let post-RA scheduling proceed + // normally. + // Immediate 4 == MFMAValuSpacingOpt in AMDGPUIGroupLP.cpp (IGLPStrategyID). 
+ if (hasIGLPOpt(this, 4) && !hasSchedBarrier(this)) { + HasIGLPInstrs = false; + return; + } SavedMutations.clear(); SavedMutations.swap(Mutations); addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA)); diff --git a/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.ll b/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.ll new file mode 100644 index 0000000000000..fad3dde3d06bf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Full codegen on gfx950. Two MFMAs fed by loaded floats; three independent +; i32 muls stored to a second buffer. sched.barrier(0) isolates the MUL+MFMA +; region so that address-computation VALUs don't inflate the VALU gap in +; MFMAValuSpacingOpt. +; +; With iglp_opt(4) the expected MFMA/VALU interleaving (ValuGap=1) is: +; MFMA, MUL, MFMA, MUL, MUL +; +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -o - %s | FileCheck %s + +declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) +declare i32 @llvm.amdgcn.workitem.id.x() +declare void @llvm.amdgcn.iglp.opt(i32 immarg) +declare void @llvm.amdgcn.sched.barrier(i32 immarg) + +define amdgpu_kernel void @mfma_valu_iglp4(ptr addrspace(1) %p, ptr addrspace(1) %q) #0 { +; CHECK-LABEL: mfma_valu_iglp4: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v8, 2, v0 +; CHECK-NEXT: v_mov_b32_e32 v9, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx2 v[6:7], v8, s[0:1] +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; CHECK-NEXT: ; sched_barrier mask(0x00000000) +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: 
v_mfma_f32_4x4x1_16b_f32 v[0:3], v6, v7, v[0:3] +; CHECK-NEXT: v_mul_lo_u32 v4, v6, v6 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v6, v7, v[0:3] +; CHECK-NEXT: v_mul_lo_u32 v5, v6, v7 +; CHECK-NEXT: v_mul_lo_u32 v6, v7, v7 +; CHECK-NEXT: ; iglp_opt mask(0x00000004) +; CHECK-NEXT: ; sched_barrier mask(0x00000000) +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; CHECK-NEXT: global_store_dwordx3 v8, v[4:6], s[2:3] +; CHECK-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %t = load <4 x float>, ptr addrspace(1) %p + %gep0 = getelementptr inbounds float, ptr addrspace(1) %p, i32 %tid + %gep1 = getelementptr inbounds float, ptr addrspace(1) %gep0, i32 1 + %f0 = load float, ptr addrspace(1) %gep0 + %f1 = load float, ptr addrspace(1) %gep1 + %i0 = bitcast float %f0 to i32 + %i1 = bitcast float %f1 to i32 + call void @llvm.amdgcn.sched.barrier(i32 0) + %m0 = mul nsw i32 %i0, %i0 + %m1 = mul nsw i32 %i0, %i1 + %m2 = mul nsw i32 %i1, %i1 + call void @llvm.amdgcn.iglp.opt(i32 4) + %mai = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %f0, float %f1, <4 x float> %t, i32 0, i32 0, i32 0) + %mai2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %f0, float %f1, <4 x float> %mai, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.sched.barrier(i32 0) + store <4 x float> %mai2, ptr addrspace(1) %p + %qgep0 = getelementptr inbounds i32, ptr addrspace(1) %q, i32 %tid + %qgep1 = getelementptr inbounds i32, ptr addrspace(1) %qgep0, i32 1 + %qgep2 = getelementptr inbounds i32, ptr addrspace(1) %qgep0, i32 2 + store i32 %m0, ptr addrspace(1) %qgep0 + store i32 %m1, ptr addrspace(1) %qgep1 + store i32 %m2, ptr addrspace(1) %qgep2 + ret void +} + +attributes #0 = { "uniform-work-group-size"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.mir b/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.mir new file mode 100644 index 
0000000000000..7f173ddc8e5aa --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.mir @@ -0,0 +1,35 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# Pre-RA machine scheduler with IGroupLP / MFMAValuSpacingOpt (iglp_opt(4)). +# With IGLP_OPT 4 the expected MFMA/VALU interleaving (ValuGap=1) is: +# MFMA, MUL, MFMA, MUL, MUL +# +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s + +--- +name: mfma_valu_iglp4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: mfma_valu_iglp4 + ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 [[DEF]], [[DEF1]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DEF]], [[DEF]], implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 [[DEF]], [[DEF1]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DEF]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DEF1]], [[DEF1]], implicit $exec + ; CHECK-NEXT: IGLP_OPT 4 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_]], implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_1]] + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:areg_128_align2 = IMPLICIT_DEF + %3:vgpr_32 = nsw V_MUL_LO_U32_e64 %0, %0, implicit $exec + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %0, %1, implicit $exec + %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %1, %1, implicit $exec + %6:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 %0, %1, %2, 0, 0, 0, implicit 
$mode, implicit $exec + %7:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 %0, %1, %6, 0, 0, 0, implicit $mode, implicit $exec + IGLP_OPT 4 + S_ENDPGM 0, implicit %3, implicit %4, implicit %5, implicit %7 + +... `````````` </details> https://github.com/llvm/llvm-project/pull/190916 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
