https://github.com/lalaniket8 updated https://github.com/llvm/llvm-project/pull/133228
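The patch below adds a Clang builtin, __builtin_amdgcn_wave_reduce_mask_max_i32(value, mask, strategy), on top of the existing wave-reduce intrinsics. The following sketch is illustrative only and is not part of the patch: the kernel, the ballot-based mask construction, and the grouping scheme are assumptions about intended usage (each lane passes a mask of the lanes it should be reduced with, and strategy 1 selects the iterative lowering), inferred from the "iiii" signature in BuiltinsAMDGPU.def and the test updates further down.

    // HIP device-code sketch; assumes a wave32 target where
    // __builtin_amdgcn_ballot_w32 is available. Not taken from the patch.
    #include <hip/hip_runtime.h>

    __global__ void group_max(const int *key, const int *val, int *out, int n) {
      int gid = blockIdx.x * blockDim.x + threadIdx.x;
      if (gid >= n)
        return;
      // Split the active lanes of the wave into two groups by a predicate.
      bool even = (key[gid] & 1) == 0;
      unsigned active = __builtin_amdgcn_ballot_w32(true); // currently active lanes
      unsigned grp = __builtin_amdgcn_ballot_w32(even);
      int mask = static_cast<int>(even ? grp : (active & ~grp)); // my group's lanes
      // Arguments: value to reduce, divergent mask, strategy (1 = iterative).
      out[gid] = __builtin_amdgcn_wave_reduce_mask_max_i32(val[gid], mask, 1);
    }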
>From c8989dc07dec2af1ecc7e8fd07e422e760d3bfb6 Mon Sep 17 00:00:00 2001
From: anikelal <anike...@amd.com>
Date: Tue, 8 Apr 2025 14:14:10 +0530
Subject: [PATCH] reduce wrt divergent mask

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   3 +
 clang/lib/CodeGen/CGBuiltin.cpp               |  18 ++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   5 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 171 +++++++++++-------
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  12 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll |  16 +-
 6 files changed, 141 insertions(+), 84 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 44ef404aee72f..762e74461a835 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -363,6 +363,8 @@ BUILTIN(__builtin_r600_read_tidig_z, "Ui", "nc")
 BUILTIN(__builtin_r600_recipsqrt_ieee, "dd", "nc")
 BUILTIN(__builtin_r600_recipsqrt_ieeef, "ff", "nc")
 
+BUILTIN(__builtin_amdgcn_wave_reduce_mask_max_i32, "iiii", "nc")
+
 //===----------------------------------------------------------------------===//
 // MFMA builtins.
 //===----------------------------------------------------------------------===//
@@ -620,5 +622,6 @@ TARGET_BUILTIN(__builtin_amdgcn_bitop3_b16, "ssssIUi", "nc", "bitop3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf16_f32, "V2yV2yfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
 
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c126f88b9e3a5..7d07ea0b45f68 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -20053,6 +20053,15 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
   Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
 }
 
+static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
+  switch (BuiltinID) {
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_mask_max_i32:
+    return Intrinsic::amdgcn_wave_reduce_umax;
+  default:
+    llvm_unreachable("Unknown BuiltinID for wave reduction");
+  }
+}
+
 Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                                               const CallExpr *E) {
   llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
@@ -20360,6 +20369,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false);
   case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
     return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true);
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_mask_max_i32:{
+    Intrinsic::ID IID = getIntrinsicIDforWaveReduction(BuiltinID);
+    llvm::Value *Value = EmitScalarExpr(E->getArg(0));
+    llvm::Value *Mask = EmitScalarExpr(E->getArg(1));
+    llvm::Value *Strategy = EmitScalarExpr(E->getArg(2));
+    // llvm::errs() << "Value->getType():" << Value->getType() << "\n";
+    llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()});
+    return Builder.CreateCall(F, {Value, Mask, Strategy});
+  }
   case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
   case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
   case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 86e050333acc7..b85648e6c3077 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2316,12 +2316,13 @@ def int_amdgcn_s_wqm :
 class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
     [data_ty],
     [
-      LLVMMatchType<0>,   // llvm value to reduce (SGPR/VGPR)
+      LLVMMatchType<0>,   // llvm value to reduce (SGPR/VGPR),
+      llvm_i32_ty,        // Divergent mask
       llvm_i32_ty         // Reduction Strategy Switch for lowering ( 0: Default,
                           // 1: Iterative strategy, and
                           // 2. DPP)
     ],
-    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
+    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<2>>]>;
 
 def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9743320601ed4..e39dd79c4fd62 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4970,114 +4970,149 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   const DebugLoc &DL = MI.getDebugLoc();
   const SIInstrInfo *TII = ST.getInstrInfo();
-
+  // const MachineFunction *MF = BB.getParent();
+  // const TargetRegisterInfo *TrgtRegInfo = MF->getSubtarget().getRegisterInfo();
   // Reduction operations depend on whether the input operand is SGPR or VGPR.
   Register SrcReg = MI.getOperand(1).getReg();
-  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
+  auto SrcRegClass = MRI.getRegClass(SrcReg);
+  // llvm::errs() << TrgtRegInfo->getRegClassName(SrcRegClass) << "\n";
+  bool isSGPR = TRI->isSGPRClass(SrcRegClass);
   Register DstReg = MI.getOperand(0).getReg();
+  // llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(DstReg)) << "\n";
+  Register DivergentMaskReg = MI.getOperand(2).getReg();
+  // llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(DivergentMaskReg)) << "\n";
+
   MachineBasicBlock *RetBB = nullptr;
   if (isSGPR) {
-    // These operations with a uniform value i.e. SGPR are idempotent.
-    // Reduced value will be same as given sgpr.
-    // clang-format off
     BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
         .addReg(SrcReg);
-    // clang-format on
     RetBB = &BB;
   } else {
-    // TODO: Implement DPP Strategy and switch based on immediate strategy
-    // operand. For now, for all the cases (default, Iterative and DPP we use
-    // iterative approach by default.)
-
-    // To reduce the VGPR using iterative approach, we need to iterate
-    // over all the active lanes. Lowering consists of ComputeLoop,
-    // which iterate over only active lanes. We use copy of EXEC register
-    // as induction variable and every active lane modifies it using bitset0
-    // so that we will get the next active lane for next iteration.
+
     MachineBasicBlock::iterator I = BB.end();
-    Register SrcReg = MI.getOperand(1).getReg();
 
-    // Create Control flow for loop
-    // Split MI's Machine Basic block into For loop
     auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
 
-    // Create virtual registers required for lowering.
+    auto SReg32XM0RegClass = &AMDGPU::SReg_32_XM0RegClass;
+    auto SReg32RegClass = &AMDGPU::SReg_32RegClass;
+
     const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
     const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
-    Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
-    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
-
-    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
-    Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
-    Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
-
-    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
-    Register LaneValueReg =
-        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    Register ExecCopyReg = MRI.createVirtualRegister(WaveMaskRegClass);
+    Register ExecCopyReg1 = MRI.createVirtualRegister(WaveMaskRegClass);
+    Register AccSGPRReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+    Register UpdatedAccSGPRReg = MRI.createVirtualRegister(SReg32RegClass);
+    Register AccReg1 = MRI.createVirtualRegister(DstRegClass);
+    Register AccReg = MRI.createVirtualRegister(DstRegClass);
+    Register BPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+    Register UpdatedBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+    Register InitialBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+    Register UpdatedAccReg = MRI.createVirtualRegister(DstRegClass);
+    Register ActiveLanesReg = MRI.createVirtualRegister(WaveMaskRegClass);
+    Register UpdatedActiveLanesReg = MRI.createVirtualRegister(WaveMaskRegClass);
+    Register FF1ActiveLanesReg = MRI.createVirtualRegister(SReg32RegClass);
+    Register FF1MaskReg = MRI.createVirtualRegister(SReg32RegClass);
+    Register FF1MaskX4Reg = MRI.createVirtualRegister(SReg32RegClass);
+    Register ValReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+    Register MaskReg = MRI.createVirtualRegister(SReg32XM0RegClass);
 
     bool IsWave32 = ST.isWave32();
-    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-
-    // Create initail values of induction variable from Exec, Accumulator and
-    // insert branch instr to newly created ComputeBlockk
-    uint32_t InitalValue =
+    uint32_t IdentityValue =
         (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
-    auto TmpSReg =
-        BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
-    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
-        .addImm(InitalValue);
-    // clang-format off
+
+    BuildMI(BB, I, DL, TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), ExecCopyReg).addReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC); //%19:sreg_64_xexec = S_MOV_B64 $exec
+
+    BuildMI(BB, I, DL, TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), ExecCopyReg1).addReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC); //%19:sreg_64_xexec = S_MOV_B64 $exec
+
+    BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), AccReg)
+        .addImm(IdentityValue);// %24:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), InitialBPermAddrReg)
+        .addImm(0);
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
         .addMBB(ComputeLoop);
-    // clang-format on
 
-    // Start constructing ComputeLoop
     I = ComputeLoop->end();
-    auto Accumulator =
-        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
-            .addReg(InitalValReg)
-            .addMBB(&BB);
-    auto ActiveBits =
-        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
-            .addReg(TmpSReg->getOperand(0).getReg())
-            .addMBB(&BB);
+    auto PhiActiveLanesInst =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveLanesReg)
+            .addReg(ExecCopyReg)
+            .addMBB(&BB);// %25:sreg_64_xexec = PHI %19:sreg_64_xexec, %bb.0, %26:sreg_64_xexec, %bb.1
+    auto PhiAccInst =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccReg1)
+            .addReg(AccReg)
+            .addMBB(&BB);//%23:vgpr_32 = PHI %24:vgpr_32, %bb.0, %22:vgpr_32, %bb.1
+    auto PhiBPermAddrInst =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), BPermAddrReg)
+            .addReg(InitialBPermAddrReg)
+            .addMBB(&BB);//%23:vgpr_32 = PHI %24:vgpr_32, %bb.0, %22:vgpr_32, %bb.1
 
     // Perform the computations
-    unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
-    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
-                   .addReg(ActiveBits->getOperand(0).getReg());
-    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
-                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
+    BuildMI(*ComputeLoop, I, DL, TII->get(IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64), FF1ActiveLanesReg)
+        .addReg(ActiveLanesReg);//%27:sreg_32 = S_FF1_I32_B64 %25:sreg_64_xexec
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), ValReg)
         .addReg(SrcReg)
-        .addReg(FF1->getOperand(0).getReg());
-    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
-                              .addReg(Accumulator->getOperand(0).getReg())
-                              .addReg(LaneValue->getOperand(0).getReg());
+        .addReg(FF1ActiveLanesReg);//%29:sreg_32_xm0 = V_READLANE_B32 %10:vgpr_32, %27:sreg_32
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), MaskReg)
+        .addReg(DivergentMaskReg)
+        .addReg(FF1ActiveLanesReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_FF1_I32_B32), FF1MaskReg).addReg(MaskReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), AccSGPRReg)
+        .addReg(AccReg1)
+        .addReg(FF1MaskReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(Opc), UpdatedAccSGPRReg).addReg(AccSGPRReg).addReg(ValReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(FF1MaskReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32), UpdatedAccReg)
+        .addReg(UpdatedAccSGPRReg)
+        .addReg(AMDGPU::M0)
+        .addReg(AccReg1);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_LSHL_B32), FF1MaskX4Reg)
+        .addReg(FF1MaskReg)
+        .addImm(2);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(FF1ActiveLanesReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32), UpdatedBPermAddrReg)
+        .addReg(FF1MaskX4Reg)
+        .addReg(AMDGPU::M0)
+        .addReg(BPermAddrReg);
 
-    // Manipulate the iterator to get the next active lane
     unsigned BITSETOpc =
         IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
-    auto NewActiveBits =
-        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
-            .addReg(FF1->getOperand(0).getReg())
-            .addReg(ActiveBits->getOperand(0).getReg());
+    BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), UpdatedActiveLanesReg)
+        .addReg(FF1ActiveLanesReg)
+        .addReg(ActiveLanesReg);
 
-    // Add phi nodes
-    Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
+    PhiActiveLanesInst.addReg(UpdatedActiveLanesReg)
         .addMBB(ComputeLoop);
-    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
+    PhiAccInst.addReg(UpdatedAccReg)
+        .addMBB(ComputeLoop);
+    PhiBPermAddrInst.addReg(UpdatedBPermAddrReg)
         .addMBB(ComputeLoop);
 
-    // Creating branching
     unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
     BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
-        .addReg(NewActiveBits->getOperand(0).getReg())
+        .addReg(UpdatedActiveLanesReg)
         .addImm(0);
     BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
         .addMBB(ComputeLoop);
 
+    BuildMI(*ComputeEnd, ComputeEnd->begin(), DL, TII->get(AMDGPU::DS_BPERMUTE_B32), DstReg)
+        .addReg(UpdatedBPermAddrReg)
+        .addReg(UpdatedAccReg)
+        .addImm(0);
+
     RetBB = ComputeEnd;
+
   }
   MI.eraseFromParent();
   return RetBB;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index de77401eb0137..20192647dfeeb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -298,14 +298,14 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
   (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
 
 let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
-  def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
-    (ins VSrc_b32: $src, VSrc_b32:$strategy),
-    [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
+  def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs VGPR_32:$vdst),
+    (ins VSrc_b32: $src, VSrc_b32: $mask, VSrc_b32:$strategy),
+    [(set i32:$vdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$mask, i32:$strategy))]> {
   }
 
-  def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
-    (ins VSrc_b32: $src, VSrc_b32:$strategy),
-    [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
+  def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs VGPR_32:$vdst),
+    (ins VSrc_b32: $src, VSrc_b32: $mask, VSrc_b32:$strategy),
+    [(set i32:$vdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$mask, i32:$strategy))]> {
   }
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index deeceed3a19be..f85b94198c390 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -12,7 +12,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
 
-declare i32 @llvm.amdgcn.wave.reduce.umax.i32(i32, i32 immarg)
+declare i32 @llvm.amdgcn.wave.reduce.umax.i32(i32, i32, i32 immarg)
 declare i32 @llvm.amdgcn.workitem.id.x()
 
 define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
@@ -122,12 +122,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX1132GISEL-NEXT:    s_endpgm
 entry:
-  %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
+  %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 15, i32 1)
   store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-LABEL: const_value:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -218,7 +218,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX1132GISEL-NEXT:    s_endpgm
 entry:
-  %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 123, i32 1)
+  %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 123, i32 %in, i32 1)
   store i32 %result, ptr addrspace(1) %out
   ret void
 }
@@ -256,7 +256,7 @@ define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX11GISEL:       ; %bb.0: ; %entry
 ; GFX11GISEL-NEXT:    s_endpgm
 entry:
-  %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 poison, i32 1)
+  %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 poison, i32 poison, i32 1)
   store i32 %result, ptr addrspace(1) %out
   ret void
 }
@@ -499,7 +499,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_endpgm
 entry:
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 1)
+  %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 %in, i32 1)
   store i32 %result, ptr addrspace(1) %out
   ret void
 }
@@ -937,11 +937,11 @@ entry:
   br i1 %d_cmp, label %if, label %else
 
 if:
-  %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %tid, i32 1)
+  %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %tid, i32 %in, i32 1)
   br label %endif
 
 else:
-  %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
+  %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 %in, i32 1)
   br label %endif
 
 endif:
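As a reading aid for the BuildMI sequence in lowerWaveReduce above, here is a scalar model of what the emitted ComputeLoop plus the final DS_BPERMUTE_B32 compute for one wave. It is a sketch written against the diff, assuming wave32 and the unsigned-max opcode; the helper name and the plain-array representation of VGPRs are illustrative inventions, not code from the patch.

    #include <algorithm>
    #include <cstdint>

    // One wave32: val[l] and mask[l] are lane l's operands, exec is the active-lane
    // mask. FF1 over exec visits each active lane; FF1 over that lane's divergent
    // mask names its group "leader"; the accumulator is read, combined, and written
    // back at the leader lane; ds_bpermute then hands each lane its leader's result.
    // Assumes mask[lane] is non-zero for every active lane.
    void model_wave_reduce_mask_umax(const uint32_t val[32], const uint32_t mask[32],
                                     uint32_t exec, uint32_t out[32]) {
      uint32_t acc[32] = {0};        // accumulator VGPR, identity 0 for umax
      uint32_t bperm_addr[32] = {0}; // byte addresses fed to DS_BPERMUTE_B32
      for (uint32_t live = exec; live != 0; live &= live - 1) { // S_BITSET0 each trip
        int lane = __builtin_ctz(live);         // S_FF1 over remaining active lanes
        int leader = __builtin_ctz(mask[lane]); // S_FF1 over this lane's mask
        acc[leader] = std::max(acc[leader], val[lane]); // v_readlane / max / v_writelane
        bperm_addr[lane] = 4u * leader;         // this lane will fetch from its leader
      }
      for (int l = 0; l < 32; ++l)              // DS_BPERMUTE_B32: out[l] = acc[addr/4]
        out[l] = acc[bperm_addr[l] / 4];
    }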