================ @@ -5189,10 +5196,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, .addReg(NewAccumulator->getOperand(0).getReg()) .addImm(1) .setOperandDead(3); // Dead scc - BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg) - .addReg(SrcReg) - .addReg(ParityRegister); - break; + if (is32BitOpc) { + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg) + .addReg(SrcReg) + .addReg(ParityRegister); + break; + } else { + Register DestSub0 = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register DestSub1 = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register Op1H_Op0L_Reg = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register CarryReg = ---------------- jmmartinez wrote:
> Re: s_bfe_i32 : either ways I will be emitting 2 instrs. I can try this tho. Don't worry. It if was a `v_mul` there could be a difference according to the sched-model, but I just checked and it seems be the same for all scalar operations. https://github.com/llvm/llvm-project/pull/151310 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits