https://gcc.gnu.org/g:9bc916bc062063b69889fe1b8e304d500183edf0
commit r17-683-g9bc916bc062063b69889fe1b8e304d500183edf0 Author: Dimitar Dimitrov <[email protected]> Date: Wed Mar 25 22:34:05 2026 +0200 pru: Inline muldi3 when optimizing for speed When optimizing for speed, it is faster to inline the 32-bit multiplication sub-operations, instead of calling a library function. This saves instruction cycles spent preparing a call to the multi64 library function, at the expense of code duplication in the text section. The inlined muldi3 operation uses only a few temporary registers, so there should be no negative effects due to increased register pressure. Moreover, register pressure may even decrease with inlining because the number of temporary registers is much lower than the number of caller-saved registers for PRU. gcc/ChangeLog: * config/pru/constraints.md: Prevent allocating r27 as SImode destination for mulsi3 pattern. * config/pru/pru.h (enum reg_class): Expand MULDST_REGS to allow fitting DImode. * config/pru/pru.md (umulsidi3): New pattern. (muldi3): Ditto. Signed-off-by: Dimitar Dimitrov <[email protected]> Diff: --- gcc/config/pru/constraints.md | 3 +- gcc/config/pru/pru.h | 2 +- gcc/config/pru/pru.md | 76 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 2 deletions(-) diff --git a/gcc/config/pru/constraints.md b/gcc/config/pru/constraints.md index 07d82284a548..265a3c8f91e5 100644 --- a/gcc/config/pru/constraints.md +++ b/gcc/config/pru/constraints.md @@ -53,7 +53,8 @@ (define_register_constraint "Rmd0" "MULDST_REGS" "@internal - The multiply destination register.") + The multiply destination register." 
+ "regno == MULDST_REGNUM") (define_register_constraint "Rms0" "MULSRC0_REGS" "@internal diff --git a/gcc/config/pru/pru.h b/gcc/config/pru/pru.h index f469122beea6..a370d08c3854 100644 --- a/gcc/config/pru/pru.h +++ b/gcc/config/pru/pru.h @@ -246,7 +246,7 @@ enum reg_class /* NO_REGS */ { 0, 0, 0, 0, 0}, \ /* SIB_REGS */ { 0xf, 0xff000000u, ~0u, 0xffffffu, 0},\ /* LOOPCNTR_REGS */ { 0, 0, 0, 0, 0xf}, \ - /* MULDST_REGS */ { 0, 0, 0, 0x00000f00u, 0}, \ + /* MULDST_REGS */ { 0, 0, 0, 0x0000ff00u, 0}, \ /* MULSRC0_REGS */ { 0, 0, 0, 0x000f0000u, 0}, \ /* MULSRC1_REGS */ { 0, 0, 0, 0x00f00000u, 0}, \ /* REGIO_REGS */ { 0, 0, 0, 0xff000000u, 0}, \ diff --git a/gcc/config/pru/pru.md b/gcc/config/pru/pru.md index e62fcf27a863..93638963d29e 100644 --- a/gcc/config/pru/pru.md +++ b/gcc/config/pru/pru.md @@ -1220,6 +1220,82 @@ "nop\;xin\\t0, %0, 4" [(set_attr "type" "alu") (set_attr "length" "8")]) + +(define_insn "umulsidi3" + [(set (match_operand:DI 0 "pru_muldst_operand" "=Rmd0") + (mult:DI + (zero_extend:DI + (match_operand:SI 1 "pru_mulsrc0_operand" "%Rms0")) + (zero_extend:DI + (match_operand:SI 2 "pru_mulsrc1_operand" "Rms1"))))] + "TARGET_OPT_MUL" + "nop\;xin\\t0, %0, 8" + [(set_attr "type" "alu") + (set_attr "length" "8")]) + + +;; If optimizing for speed, prefer to inline 64-bit multiplication, +;; in order to avoid the overhead of calling a library function. 
+(define_expand "muldi3" + [(parallel [(set (match_operand:DI 0 "register_operand") + (mult:DI (match_operand:DI 1 "register_operand") + (match_operand:DI 2 "register_operand"))) + (clobber (reg:DI MULDST_REGNUM)) + (clobber (reg:SI MULSRC0_REGNUM)) + (clobber (reg:SI MULSRC1_REGNUM)) + ])] + "TARGET_OPT_MUL && !optimize_size && can_create_pseudo_p ()" +{ + rtx op0_lo = simplify_gen_subreg (SImode, operands[0], DImode, 0); + rtx op0_hi = simplify_gen_subreg (SImode, operands[0], DImode, 4); + rtx op1_lo = simplify_gen_subreg (SImode, operands[1], DImode, 0); + rtx op1_hi = simplify_gen_subreg (SImode, operands[1], DImode, 4); + rtx op2_lo = simplify_gen_subreg (SImode, operands[2], DImode, 0); + rtx op2_hi = simplify_gen_subreg (SImode, operands[2], DImode, 4); + + gcc_assert (!reload_completed); + + rtx cross_product1 = gen_reg_rtx (SImode); + rtx cross_product2 = gen_reg_rtx (SImode); + rtx low_product = gen_reg_rtx (DImode); + rtx cross_scratch1_hi = gen_reg_rtx (SImode); + + rtx reg_mulsrc0_si = gen_reg_rtx (SImode); + rtx reg_mulsrc1_si = gen_reg_rtx (SImode); + rtx reg_muldst_di = gen_reg_rtx (DImode); + + /* (al + C * ah) * (bl + C * bh) = al * bl + + C * ah * bl + + C * al * bh + + C * C * ah * bh -> discard, overflow + Where C=(1 << 32). */ + + emit_move_insn (cross_product1, + gen_rtx_MULT (SImode, op1_hi, op2_lo)); + emit_move_insn (cross_product2, + gen_rtx_MULT (SImode, op1_lo, op2_hi)); + + /* Calculate "al * bl". 
*/ + emit_move_insn (reg_mulsrc0_si, op1_lo); + emit_move_insn (reg_mulsrc1_si, op2_lo); + emit_insn (gen_umulsidi3 (reg_muldst_di, reg_mulsrc0_si, reg_mulsrc1_si)); + emit_move_insn (low_product, reg_muldst_di); + + emit_move_insn (cross_scratch1_hi, + gen_rtx_PLUS (SImode, + cross_product1, + cross_product2)); + emit_move_insn (op0_lo, + simplify_gen_subreg (SImode, low_product, DImode, 0)); + emit_move_insn (op0_hi, + gen_rtx_PLUS (SImode, + simplify_gen_subreg (SImode, + low_product, + DImode, + 4), + cross_scratch1_hi)); + DONE; +}) ;; Prologue, Epilogue and Return
