https://gcc.gnu.org/g:bf6f77edd625cfe2f2f164e90437df318b96527f

commit r15-5938-gbf6f77edd625cfe2f2f164e90437df318b96527f
Author: Georg-Johann Lay <a...@gjlay.de>
Date:   Thu Dec 5 11:24:30 2024 +0100

    AVR: target/107957 - Propagate zero_reg to store sources.
    
    When -msplit-ldst is on, it may be possible to propagate __zero_reg__
    to the sources of the new stores.  For example, without this patch,
    
    unsigned long lx;
    
    void store_lsr17 (void)
    {
       lx >>= 17;
    }
    
    compiles to:
    
    store_lsr17:
       lds r26,lx+2           ;  movqi_insn
       lds r27,lx+3           ;  movqi_insn
       movw r24,r26           ;  *movhi
       lsr r25                ;  *lshrhi3_const
       ror r24
       ldi r26,0              ;  movqi_insn
       ldi r27,0              ;  movqi_insn
       sts lx,r24             ;  movqi_insn
       sts lx+1,r25           ;  movqi_insn
       sts lx+2,r26           ;  movqi_insn
       sts lx+3,r27           ;  movqi_insn
       ret
    
    but with this patch it becomes:
    
    store_lsr17:
       lds r26,lx+2           ;  movqi_insn
       lds r27,lx+3           ;  movqi_insn
       movw r24,r26           ;  *movhi
       lsr r25                ;  *lshrhi3_const
       ror r24
       sts lx,r24             ;  movqi_insn
       sts lx+1,r25           ;  movqi_insn
       sts lx+2,__zero_reg__  ;  movqi_insn
       sts lx+3,__zero_reg__  ;  movqi_insn
       ret
    
    gcc/
            PR target/107957
            * config/avr/avr-passes-fuse-move.h (bbinfo_t) <try_mem0_p>:
            Add static property.
            * config/avr/avr-passes.cc (bbinfo_t::try_mem0_p): Define it.
            (optimize_data_t::try_mem0): New method.
            (bbinfo_t::optimize_one_block) [bbinfo_t::try_mem0_p]: Run try_mem0.
            (bbinfo_t::optimize_one_function): Set bbinfo_t::try_mem0_p.
            * config/avr/avr.md (pushhi1_insn): Also allow zero as source.
            (define_split) [avropt_split_ldst]: Only run avr_split_ldst()
            when avr-fuse-add has been run at least once.
            * doc/invoke.texi (AVR Options) <-msplit-ldst>: Document it.

Diff:
---
 gcc/config/avr/avr-passes-fuse-move.h |  1 +
 gcc/config/avr/avr-passes.cc          | 49 ++++++++++++++++++++++++++++++++++-
 gcc/config/avr/avr.md                 |  9 +++++--
 gcc/doc/invoke.texi                   |  9 +++++--
 4 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/gcc/config/avr/avr-passes-fuse-move.h b/gcc/config/avr/avr-passes-fuse-move.h
index dbed1a636f3d..432f9ca4670f 100644
--- a/gcc/config/avr/avr-passes-fuse-move.h
+++ b/gcc/config/avr/avr-passes-fuse-move.h
@@ -1172,6 +1172,7 @@ struct bbinfo_t
 
   static find_plies_data_t *fpd;
   static bool try_fuse_p;
+  static bool try_mem0_p;
   static bool try_bin_arg1_p;
   static bool try_simplify_p;
   static bool try_split_ldi_p;
diff --git a/gcc/config/avr/avr-passes.cc b/gcc/config/avr/avr-passes.cc
index de8de1cd2e8a..fad64b1b3454 100644
--- a/gcc/config/avr/avr-passes.cc
+++ b/gcc/config/avr/avr-passes.cc
@@ -434,6 +434,11 @@ static machine_mode size_to_mode (int size)
       Split all insns where the operation can be performed on individual
       bytes, like andsi3.  In example (4) the andhi3 can be optimized
       to an andqi3.
+
+   bbinfo_t::try_mem0_p
+      Try to fuse a mem = reg insn to mem = __zero_reg__.
+      This should only occur when -msplit-ldst is on, but may
+      also occur with pushes since push<mode>1 splits them.
 */
 
 
@@ -514,6 +519,7 @@ bool bbinfo_t::try_split_any_p;
 bool bbinfo_t::try_simplify_p;
 bool bbinfo_t::use_arith_p;
 bool bbinfo_t::use_set_some_p;
+bool bbinfo_t::try_mem0_p;
 
 
 // Abstract Interpretation of expressions.
@@ -1087,6 +1093,7 @@ struct optimize_data_t
   {}
 
   bool try_fuse (bbinfo_t *);
+  bool try_mem0 (bbinfo_t *);
   bool try_bin_arg1 (bbinfo_t *);
   bool try_simplify (bbinfo_t *);
   bool try_split_ldi (bbinfo_t *);
@@ -2509,6 +2516,44 @@ bbinfo_t::run_find_plies (const insninfo_t &ii, const memento_t &memo) const
 }
 
 
+// Try to propagate __zero_reg__ to a mem = reg insn's source.
+// Returns true on success and sets .n_new_insns.
+bool
+optimize_data_t::try_mem0 (bbinfo_t *)
+{
+  rtx_insn *insn = curr.ii.m_insn;
+  rtx set, mem, reg;
+  machine_mode mode;
+
+  if (insn
+      && (set = single_set (insn))
+      && MEM_P (mem = SET_DEST (set))
+      && REG_P (reg = SET_SRC (set))
+      && GET_MODE_SIZE (mode = GET_MODE (mem)) <= 4
+      && END_REGNO (reg) <= REG_32
+      && ! (regmask (reg) & memento_t::fixed_regs_mask)
+      && curr.regs.have_value (REGNO (reg), GET_MODE_SIZE (mode), 0x0))
+    {
+      avr_dump (";; Found insn %d: mem:%m = 0 = r%d\n", INSN_UID (insn),
+               mode, REGNO (reg));
+
+      // Some insns like PUSHes don't clobber REG_CC.
+      bool clobbers_cc = GET_CODE (PATTERN (insn)) == PARALLEL;
+
+      if (clobbers_cc)
+       emit_valid_move_clobbercc (mem, CONST0_RTX (mode));
+      else
+       emit_valid_insn (gen_rtx_SET (mem, CONST0_RTX (mode)));
+
+      n_new_insns = 1;
+
+      return true;
+    }
+
+  return false;
+}
+
+
 // Try to fuse two 1-byte insns .prev and .curr to one 2-byte insn (MOVW).
 // Returns true on success, and sets .n_new_insns, .ignore_mask etc.
 bool
@@ -3108,7 +3153,8 @@ bbinfo_t::optimize_one_block (bool &changed)
                    || (bbinfo_t::try_bin_arg1_p && od.try_bin_arg1 (this))
                    || (bbinfo_t::try_simplify_p && od.try_simplify (this))
                    || (bbinfo_t::try_split_ldi_p && od.try_split_ldi (this))
-                   || (bbinfo_t::try_split_any_p && od.try_split_any (this)));
+                   || (bbinfo_t::try_split_any_p && od.try_split_any (this))
+                   || (bbinfo_t::try_mem0_p && od.try_mem0 (this)));
 
       rtx_insn *new_insns = get_insns ();
       end_sequence ();
@@ -3193,6 +3239,7 @@ bbinfo_t::optimize_one_function (function *func)
 
   // Which optimization(s) to perform.
   bbinfo_t::try_fuse_p = avropt_fuse_move & 0x1;      // Digit 0 in [0, 1].
+  bbinfo_t::try_mem0_p = avropt_fuse_move & 0x1;      // Digit 0 in [0, 1].
   bbinfo_t::try_bin_arg1_p = avropt_fuse_move & 0x2;  // Digit 1 in [0, 1].
   bbinfo_t::try_split_any_p = avropt_fuse_move & 0x4; // Digit 2 in [0, 1].
   bbinfo_t::try_split_ldi_p = avropt_fuse_move >> 3;    // Digit 3 in [0, 2].
diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index 3683001200da..9c348be7c48a 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -450,9 +450,11 @@
 
 (define_insn "pushhi1_insn"
   [(set (mem:HI (post_dec:HI (reg:HI REG_SP)))
-        (match_operand:HI 0 "register_operand" "r"))]
+        (match_operand:HI 0 "reg_or_0_operand" "r,Y00"))]
   ""
-  "push %B0\;push %A0"
+  "@
+       push %B0\;push %A0
+       push __zero_reg__\;push __zero_reg__"
   [(set_attr "length" "2")])
 
 ;; All modes for a multi-byte push.  We must include complex modes here too,
@@ -1029,6 +1031,9 @@
     // provided non-volatile, addr-space = generic, no reg-overlap
     // and the resulting addressings are natively supported.
     if (avropt_split_ldst
+        // Splitting too early may obfuscate some PRE_DEC / POST_INC
+        // opportunities, thus only split after avr-fuse-add.
+        && n_avr_fuse_add_executed > 0
         && GET_MODE_SIZE (<MODE>mode) > 1
         && avr_split_ldst (operands))
       DONE;
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 4b1acf9b79c1..78ead0e494e1 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -904,9 +904,9 @@ Objective-C and Objective-C++ Dialects}.
 -mbranch-cost=@var{cost}  -mfuse-add=@var{level}  -mfuse-move=@var{level}
 -mcall-prologues  -mgas-isr-prologues  -mint8  -mflmap
 -mdouble=@var{bits}  -mlong-double=@var{bits}
--mn_flash=@var{size}  -mno-interrupts
+-mn_flash=@var{size}  -mfract-convert-truncate  -mno-interrupts
 -mmain-is-OS_task  -mrelax  -mrmw  -mstrict-X  -mtiny-stack
--mrodata-in-ram  -mfract-convert-truncate  -msplit-bit-shift
+-mrodata-in-ram  -msplit-bit-shift  -msplit-ldst
 -mshort-calls  -mskip-bug  -nodevicelib  -nodevicespecs
 -Waddr-space-convert  -Wmisspelled-isr}
 
@@ -24374,6 +24374,11 @@ This optimization is turned on per default for @option{-O2} and higher,
 including @option{-Os} but excluding @option{-Oz}.
 Splitting of shifts with a constant offset that is
 a multiple of 8 is controlled by @option{-mfuse-move}.
+@opindex msplit-ldst
+
+@item -msplit-ldst
+Split multi-byte loads and stores into several byte loads and stores.
+This optimization is turned on per default for @option{-O2} and higher.
 
 @opindex mtiny-stack
 @item -mtiny-stack

Reply via email to