[gcc r17-683] pru: Inline muldi3 when optimizing for speed

Dimitar Dimitrov via Gcc-cvs Fri, 22 May 2026 23:57:01 -0700

https://gcc.gnu.org/g:9bc916bc062063b69889fe1b8e304d500183edf0


commit r17-683-g9bc916bc062063b69889fe1b8e304d500183edf0
Author: Dimitar Dimitrov <[email protected]>
Date:   Wed Mar 25 22:34:05 2026 +0200

    pru: Inline muldi3 when optimizing for speed
    
    When optimizing for speed, it is faster to inline the 32-bit
    multiplication sub-operations than to call a library function.
    This saves the instruction cycles spent preparing a call to the multi64
    library function, at the expense of code duplication in the text section.
    
    The inlined muldi3 operation uses only a few temporary registers,
    so there should be no negative effects due to increased register
    pressure.  Moreover, register pressure may even decrease with
    inlining, because the number of temporary registers is much lower
    than the number of caller-saved registers for PRU.
    
    gcc/ChangeLog:
    
            * config/pru/constraints.md: Prevent allocating r27 as
            SImode destination for mulsi3 pattern.
            * config/pru/pru.h (enum reg_class): Expand MULDST_REGS
            to allow fitting DImode.
            * config/pru/pru.md (umulsidi3): New pattern.
            (muldi3): Ditto.
    
    Signed-off-by: Dimitar Dimitrov <[email protected]>

Diff:
---
 gcc/config/pru/constraints.md |  3 +-
 gcc/config/pru/pru.h          |  2 +-
 gcc/config/pru/pru.md         | 76 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/gcc/config/pru/constraints.md b/gcc/config/pru/constraints.md
index 07d82284a548..265a3c8f91e5 100644
--- a/gcc/config/pru/constraints.md
+++ b/gcc/config/pru/constraints.md
@@ -53,7 +53,8 @@
 
 (define_register_constraint "Rmd0" "MULDST_REGS"
   "@internal
-  The multiply destination register.")
+  The multiply destination register."
+  "regno == MULDST_REGNUM")
 
 (define_register_constraint "Rms0" "MULSRC0_REGS"
   "@internal
diff --git a/gcc/config/pru/pru.h b/gcc/config/pru/pru.h
index f469122beea6..a370d08c3854 100644
--- a/gcc/config/pru/pru.h
+++ b/gcc/config/pru/pru.h
@@ -246,7 +246,7 @@ enum reg_class
     /* NO_REGS       */ { 0, 0, 0, 0, 0},                      \
     /* SIB_REGS              */ { 0xf, 0xff000000u, ~0u, 0xffffffu, 0},\
     /* LOOPCNTR_REGS  */ { 0, 0, 0, 0, 0xf},                   \
-    /* MULDST_REGS    */ { 0, 0, 0, 0x00000f00u, 0},           \
+    /* MULDST_REGS    */ { 0, 0, 0, 0x0000ff00u, 0},           \
     /* MULSRC0_REGS   */ { 0, 0, 0, 0x000f0000u, 0},           \
     /* MULSRC1_REGS   */ { 0, 0, 0, 0x00f00000u, 0},           \
     /* REGIO_REGS     */ { 0, 0, 0, 0xff000000u, 0},           \
diff --git a/gcc/config/pru/pru.md b/gcc/config/pru/pru.md
index e62fcf27a863..93638963d29e 100644
--- a/gcc/config/pru/pru.md
+++ b/gcc/config/pru/pru.md
@@ -1220,6 +1220,82 @@
   "nop\;xin\\t0, %0, 4"
   [(set_attr "type" "alu")
    (set_attr "length" "8")])
+
+(define_insn "umulsidi3"
+  [(set (match_operand:DI 0 "pru_muldst_operand"          "=Rmd0")
+       (mult:DI
+         (zero_extend:DI
+           (match_operand:SI 1 "pru_mulsrc0_operand"      "%Rms0"))
+         (zero_extend:DI
+           (match_operand:SI 2 "pru_mulsrc1_operand"      "Rms1"))))]
+  "TARGET_OPT_MUL"
+  "nop\;xin\\t0, %0, 8"
+  [(set_attr "type" "alu")
+   (set_attr "length" "8")])
+
+
+;; If optimizing for speed, prefer to inline 64-bit multiplication,
+;; in order to avoid the overhead of calling a library function.
+(define_expand "muldi3"
+  [(parallel [(set (match_operand:DI 0 "register_operand")
+       (mult:DI (match_operand:DI 1 "register_operand")
+                (match_operand:DI 2 "register_operand")))
+                (clobber (reg:DI MULDST_REGNUM))
+                (clobber (reg:SI MULSRC0_REGNUM))
+                (clobber (reg:SI MULSRC1_REGNUM))
+                ])]
+  "TARGET_OPT_MUL && !optimize_size && can_create_pseudo_p ()"
+{
+  rtx op0_lo = simplify_gen_subreg (SImode, operands[0], DImode, 0);
+  rtx op0_hi = simplify_gen_subreg (SImode, operands[0], DImode, 4);
+  rtx op1_lo = simplify_gen_subreg (SImode, operands[1], DImode, 0);
+  rtx op1_hi = simplify_gen_subreg (SImode, operands[1], DImode, 4);
+  rtx op2_lo = simplify_gen_subreg (SImode, operands[2], DImode, 0);
+  rtx op2_hi = simplify_gen_subreg (SImode, operands[2], DImode, 4);
+
+  gcc_assert (!reload_completed);
+
+  rtx cross_product1 = gen_reg_rtx (SImode);
+  rtx cross_product2 = gen_reg_rtx (SImode);
+  rtx low_product = gen_reg_rtx (DImode);
+  rtx cross_scratch1_hi = gen_reg_rtx (SImode);
+
+  rtx reg_mulsrc0_si = gen_reg_rtx (SImode);
+  rtx reg_mulsrc1_si = gen_reg_rtx (SImode);
+  rtx reg_muldst_di = gen_reg_rtx (DImode);
+
+  /* (al + C * ah) * (bl + C * bh) =   al * bl
+                                       + C * ah * bl
+                                       + C * al * bh
+                                       + C * C * ah * bh  -> discard, overflow
+      Where C=(1 << 32).  */
+
+  emit_move_insn (cross_product1,
+                 gen_rtx_MULT (SImode, op1_hi, op2_lo));
+  emit_move_insn (cross_product2,
+                 gen_rtx_MULT (SImode, op1_lo, op2_hi));
+
+  /* Calculate "al * bl".  */
+  emit_move_insn (reg_mulsrc0_si, op1_lo);
+  emit_move_insn (reg_mulsrc1_si, op2_lo);
+  emit_insn (gen_umulsidi3 (reg_muldst_di, reg_mulsrc0_si, reg_mulsrc1_si));
+  emit_move_insn (low_product, reg_muldst_di);
+
+  emit_move_insn (cross_scratch1_hi,
+                 gen_rtx_PLUS (SImode,
+                               cross_product1,
+                               cross_product2));
+  emit_move_insn (op0_lo,
+                 simplify_gen_subreg (SImode, low_product, DImode, 0));
+  emit_move_insn (op0_hi,
+                 gen_rtx_PLUS (SImode,
+                               simplify_gen_subreg (SImode,
+                                                    low_product,
+                                                    DImode,
+                                                    4),
+                               cross_scratch1_hi));
+  DONE;
+})
 
 ;; Prologue, Epilogue and Return

[gcc r17-683] pru: Inline muldi3 when optimizing for speed

Reply via email to