When a shift is performed by a shift-loop, then there are cases
where the runtime can be improved. For example, uint32_t R22 >> 5
is currently

        ldi scratch, 5
    1:  lsr r25
        ror r24
        ror r23
        ror r22
        dec scratch
        brne 1b

but can be done as:

        andi r22,-32   ; Set lower 5 bits to 0.
        ori r22,16     ; Set bit 4 to 1.
        ;; Now r22 = 0b***10000
    1:  lsr r25
        ror r24
        ror r23
        ror r22
        brcc 1b        ; Carry will be 0, 0, 0, 0, 1.

this is count-1 cycles faster where count is the shift offset.
In the example that's 4 cycles.

Part 1 of the patch refactors the shift output function so
it gets a shift rtx_code instead of an asm template.

Part 2 implements the optimization itself.

This is for trunk and passes without new regressions.
Ok to apply?

Johann

--

    AVR: Refactor avr.cc::out_shift_with_cnt().

    This is a no-op refactoring of out_shift_with_cnt() that passes the
    shift rtx_code instead of a template asm string.

    gcc/
            * config/avr/avr-protos.h (out_shift_with_cnt): Remove.
            * config/avr/avr.cc (avr_out_shift_with_cnt): New static
            function from out_shift_with_cnt: Pass shift rtx_code instead
            of asm template.
            (avr_out_shift_1): New static helper function.
            (ashlqi3_out, ashlhi3_out, avr_out_ashlpsi3, ashlsi3_out)
            (ashrqi3_out, ashrhi3_out, avr_out_ashrpsi3, ashrsi3_out)
            (lshrqi3_out, lshrhi3_out, avr_out_lshrpsi3, lshrsi3_out):
            Adjust avr_out_shift_with_cnt to new interface.
---

    AVR: Tweak shift execution times in some cases.

    When the tail reg (last register) in a shift is an upper register,
    then inserting a sequence of 0s and a 1 into the tail register
    only takes 2 instructions.  The preparation will be one instruction
    longer, but the loop body will be one instruction shorter, saving
    count-1 cycles.

    For example uint32_t R22 >> 5 will turn from:

        ldi scratch, 5
    1:  lsr r25
        ror r24
        ror r23
        ror r22
        dec scratch
        brne 1b

    to:

        andi r22,-32   ; Set lower 5 bits to 0.
        ori r22,16     ; Set bit 4 to 1.
        ;; Now r22 = 0b***10000
    1:  lsr r25
        ror r24
        ror r23
        ror r22
        brcc 1b        ; Carry will be 0, 0, 0, 0, 1.

    gcc/
            * config/avr/avr.cc (avr_out_shift_with_cnt): Tweak
            execution time by count-1 cycles in some cases.
    AVR: Refactor avr.cc::out_shift_with_cnt().
    
    This is a no-op refactoring of out_shift_with_cnt() that passes the
    shift rtx_code instead of a template asm string.
    
    gcc/
            * config/avr/avr-protos.h (out_shift_with_cnt): Remove.
            * config/avr/avr.cc (avr_out_shift_with_cnt): New static
            function from out_shift_with_cnt: Pass shift rtx_code instead
            of asm template.
            (avr_out_shift_1): New static helper function.
            (ashlqi3_out, ashlhi3_out, avr_out_ashlpsi3, ashlsi3_out)
            (ashrqi3_out, ashrhi3_out, avr_out_ashrpsi3, ashrsi3_out)
            (lshrqi3_out, lshrhi3_out, avr_out_lshrpsi3, lshrsi3_out):
            Adjust avr_out_shift_with_cnt to new interface.

diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h
index 8ba1945cff7..86ad24bac20 100644
--- a/gcc/config/avr/avr-protos.h
+++ b/gcc/config/avr/avr-protos.h
@@ -141,8 +141,6 @@ extern bool avr_nonzero_bits_lsr_operands_p (rtx_code, rtx *);
 extern void avr_final_prescan_insn (rtx_insn *insn, rtx *operand,
 				    int num_operands);
 extern rtx_code avr_normalize_condition (rtx_code condition);
-extern void out_shift_with_cnt (const char *templ, rtx_insn *insn,
-				rtx operands[], int *len, int t_len);
 extern enum reg_class avr_mode_code_base_reg_class (machine_mode, addr_space_t, rtx_code, rtx_code);
 extern bool avr_regno_mode_code_ok_for_base_p (int, machine_mode, addr_space_t, rtx_code, rtx_code);
 extern rtx avr_incoming_return_addr_rtx (void);
diff --git a/gcc/config/avr/avr.cc b/gcc/config/avr/avr.cc
index 775be800be0..dd1bfbcdfcb 100644
--- a/gcc/config/avr/avr.cc
+++ b/gcc/config/avr/avr.cc
@@ -6917,26 +6917,51 @@ avr_out_cmp_ext (rtx xop[], rtx_code code, int *plen)
 }
 
 
-/* Generate asm equivalent for various shifts.  This only handles cases
-   that are not already carefully hand-optimized in ?sh<mode>3_out.
+/* Helper for the next function:  Shift register REG by 1 bit position
+   according to the shift CODE of ASHIFT, LSHIFTRT or ASHIFTRT.
+   PLEN == 0: Asm output respective shift instruction(s).
+   PLEN != 0: Add the length of the sequence in words to *PLEN.  */
+
+static void
+avr_out_shift_1 (rtx_code code, rtx reg, int *plen)
+{
+  const int n_bytes = GET_MODE_SIZE (GET_MODE (reg));
+  const int dir = code == ASHIFT ? 1 : -1;
+  const int regno = REGNO (reg);
+  const int first = code == ASHIFT ? 0 : n_bytes - 1;
+  for (int i = 0; i < n_bytes; ++i)
+    {
+      rtx reg8 = all_regs_rtx[regno + first + i * dir];
+      if (code == ASHIFT)
+	avr_asm_len (i == 0 ? "lsl %0" : "rol %0", &reg8, plen, 1);
+      else if (code == LSHIFTRT)
+	avr_asm_len (i == 0 ? "lsr %0" : "ror %0", &reg8, plen, 1);
+      else if (code == ASHIFTRT)
+	avr_asm_len (i == 0 ? "asr %0" : "ror %0", &reg8, plen, 1);
+      else
+	gcc_unreachable ();
+    }
+}
+
+
+/* Generate asm code to perform a shift of code CODE on register OPERANDS[0].
+   This only handles cases that are not already carefully hand-optimized
+   in ?sh<mode>3_out.  Always returns "".
 
-   OPERANDS[0] resp. %0 in TEMPL is the operand to be shifted.
    OPERANDS[2] is the shift count as CONST_INT, MEM or REG.
    OPERANDS[3] is a QImode scratch register from LD regs if
-               available and SCRATCH, otherwise (no scratch available)
-
-   TEMPL is an assembler template that shifts by one position.
-   T_LEN is the length of this template.
+	       available and SCRATCH, otherwise (no scratch available).
    PLEN != 0: Set *PLEN to the length of the sequence in words.
    PLEN == 0: Output instructions.  */
 
-void
-out_shift_with_cnt (const char *templ, rtx_insn *insn, rtx operands[],
-		    int *plen, int t_len)
+static const char*
+avr_out_shift_with_cnt (rtx_code code, rtx_insn *insn, rtx operands[],
+			int *plen)
 {
   bool second_label = true;
   bool saved_in_tmp = false;
   bool use_zero_reg = false;
+  const int t_len = GET_MODE_SIZE (GET_MODE (operands[0]));
   rtx op[5];
 
   op[0] = operands[0];
@@ -6961,7 +6986,7 @@ out_shift_with_cnt (const char *templ, rtx_insn *insn, rtx operands[],
       int max_len = 10;  /* If larger than this, always use a loop.  */
 
       if (count <= 0)
-	return;
+	return "";
 
       if (count < 8 && !scratch)
 	use_zero_reg = true;
@@ -6974,9 +6999,9 @@ out_shift_with_cnt (const char *templ, rtx_insn *insn, rtx operands[],
 	  /* Output shifts inline with no loop - faster.  */
 
 	  while (count-- > 0)
-	    avr_asm_len (templ, op, plen, t_len);
+	    avr_out_shift_1 (code, op[0], plen);
 
-	  return;
+	  return "";
 	}
 
       if (scratch)
@@ -7035,7 +7060,7 @@ out_shift_with_cnt (const char *templ, rtx_insn *insn, rtx operands[],
     avr_asm_len ("rjmp 2f", op, plen, 1);
 
   avr_asm_len ("1:", op, plen, 0);
-  avr_asm_len (templ, op, plen, t_len);
+  avr_out_shift_1 (code, op[0], plen);
 
   if (second_label)
     avr_asm_len ("2:", op, plen, 0);
@@ -7045,6 +7070,8 @@ out_shift_with_cnt (const char *templ, rtx_insn *insn, rtx operands[],
 
   if (saved_in_tmp)
     avr_asm_len ("mov %3,%4", op, plen, 1);
+
+  return "";
 }
 
 
@@ -7122,9 +7149,7 @@ ashlqi3_out (rtx_insn *insn, rtx operands[], int *plen)
   else if (CONSTANT_P (operands[2]))
     fatal_insn ("internal compiler error.  Incorrect shift:", insn);
 
-  out_shift_with_cnt ("lsl %0",
-		      insn, operands, plen, 1);
-  return "";
+  return avr_out_shift_with_cnt (ASHIFT, insn, operands, plen);
 }
 
 
@@ -7389,9 +7414,7 @@ ashlhi3_out (rtx_insn *insn, rtx operands[], int *plen)
 	} // switch
     }
 
-  out_shift_with_cnt ("lsl %A0" CR_TAB
-		      "rol %B0", insn, operands, plen, 2);
-  return "";
+  return avr_out_shift_with_cnt (ASHIFT, insn, operands, plen);
 }
 
 
@@ -7455,10 +7478,7 @@ avr_out_ashlpsi3 (rtx_insn *insn, rtx *op, int *plen)
 	}
     }
 
-  out_shift_with_cnt ("lsl %A0" CR_TAB
-		      "rol %B0" CR_TAB
-		      "rol %C0", insn, op, plen, 3);
-  return "";
+  return avr_out_shift_with_cnt (ASHIFT, insn, op, plen);
 }
 
 
@@ -7604,11 +7624,7 @@ ashlsi3_out (rtx_insn *insn, rtx operands[], int *plen)
 	}
     }
 
-  out_shift_with_cnt ("lsl %A0" CR_TAB
-		      "rol %B0" CR_TAB
-		      "rol %C0" CR_TAB
-		      "rol %D0", insn, operands, plen, 4);
-  return "";
+  return avr_out_shift_with_cnt (ASHIFT, insn, operands, plen);
 }
 
 
@@ -7659,9 +7675,7 @@ ashrqi3_out (rtx_insn *insn, rtx operands[], int *plen)
   else if (CONSTANT_P (operands[2]))
     fatal_insn ("internal compiler error.  Incorrect shift:", insn);
 
-  out_shift_with_cnt ("asr %0",
-		      insn, operands, plen, 1);
-  return "";
+  return avr_out_shift_with_cnt (ASHIFTRT, insn, operands, plen);
 }
 
 
@@ -7818,9 +7832,7 @@ ashrhi3_out (rtx_insn *insn, rtx operands[], int *plen)
 	} // switch
     }
 
-  out_shift_with_cnt ("asr %B0" CR_TAB
-		      "ror %A0", insn, operands, plen, 2);
-  return "";
+  return avr_out_shift_with_cnt (ASHIFTRT, insn, operands, plen);
 }
 
 
@@ -7915,10 +7927,7 @@ avr_out_ashrpsi3 (rtx_insn *insn, rtx *op, int *plen)
 	} /* switch */
     }
 
-  out_shift_with_cnt ("asr %C0" CR_TAB
-		      "ror %B0" CR_TAB
-		      "ror %A0", insn, op, plen, 3);
-  return "";
+  return avr_out_shift_with_cnt (ASHIFTRT, insn, op, plen);
 }
 
 
@@ -8052,11 +8061,7 @@ ashrsi3_out (rtx_insn *insn, rtx operands[], int *plen)
 	} // switch
     }
 
-  out_shift_with_cnt ("asr %D0" CR_TAB
-		      "ror %C0" CR_TAB
-		      "ror %B0" CR_TAB
-		      "ror %A0", insn, operands, plen, 4);
-  return "";
+  return avr_out_shift_with_cnt (ASHIFTRT, insn, operands, plen);
 }
 
 /* 8-bit logic shift right ((unsigned char)x >> i) */
@@ -8133,9 +8138,7 @@ lshrqi3_out (rtx_insn *insn, rtx operands[], int *plen)
   else if (CONSTANT_P (operands[2]))
     fatal_insn ("internal compiler error.  Incorrect shift:", insn);
 
-  out_shift_with_cnt ("lsr %0",
-		      insn, operands, plen, 1);
-  return "";
+  return avr_out_shift_with_cnt (LSHIFTRT, insn, operands, plen);
 }
 
 
@@ -8339,9 +8342,7 @@ lshrhi3_out (rtx_insn *insn, rtx operands[], int *plen)
 	}
     }
 
-  out_shift_with_cnt ("lsr %B0" CR_TAB
-		      "ror %A0", insn, operands, plen, 2);
-  return "";
+  return avr_out_shift_with_cnt (LSHIFTRT, insn, operands, plen);
 }
 
 
@@ -8406,10 +8407,7 @@ avr_out_lshrpsi3 (rtx_insn *insn, rtx *op, int *plen)
 	} /* switch */
     }
 
-  out_shift_with_cnt ("lsr %C0" CR_TAB
-		      "ror %B0" CR_TAB
-		      "ror %A0", insn, op, plen, 3);
-  return "";
+  return avr_out_shift_with_cnt (LSHIFTRT, insn, op, plen);
 }
 
 
@@ -8555,11 +8553,7 @@ lshrsi3_out (rtx_insn *insn, rtx operands[], int *plen)
 	} // switch
     }
 
-  out_shift_with_cnt ("lsr %D0" CR_TAB
-		      "ror %C0" CR_TAB
-		      "ror %B0" CR_TAB
-		      "ror %A0", insn, operands, plen, 4);
-  return "";
+  return avr_out_shift_with_cnt (LSHIFTRT, insn, operands, plen);
 }
 
 
    AVR: Tweak shift execution times in some cases.
    
    When the tail reg (last register) in a shift is an upper register,
    then inserting a sequence of 0s and a 1 into the tail register
    only takes 2 instructions.  The preparation will be one instruction
    longer, but the loop body will be one instruction shorter, saving
    count-1 cycles.
    
    For example uint32_t R22 >> 5 will turn from:
    
        ldi scratch, 5
    1:  lsr r25
        ror r24
        ror r23
        ror r22
        dec scratch
        brne 1b
    
    to:
    
        andi r22,-32   ; Set lower 5 bits to 0.
        ori r22,16     ; Set bit 4 to 1.
        ;; Now r22 = 0b***10000
    1:  lsr r25
        ror r24
        ror r23
        ror r22
        brcc 1b        ; Carry will be 0, 0, 0, 0, 1.
    
    gcc/
            * config/avr/avr.cc (avr_out_shift_with_cnt): Tweak
            execution time by count-1 cycles in some cases.

diff --git a/gcc/config/avr/avr.cc b/gcc/config/avr/avr.cc
index dd1bfbcdfcb..88e9b24f15f 100644
--- a/gcc/config/avr/avr.cc
+++ b/gcc/config/avr/avr.cc
@@ -6961,8 +6961,11 @@ avr_out_shift_with_cnt (rtx_code code, rtx_insn *insn, rtx operands[],
   bool second_label = true;
   bool saved_in_tmp = false;
   bool use_zero_reg = false;
+  bool tail_bits = false;
   const int t_len = GET_MODE_SIZE (GET_MODE (operands[0]));
-  rtx op[5];
+  const int regno = REGNO (operands[0]);
+  const int tail_regno = regno + (code == ASHIFT ? t_len - 1 : 0);
+  rtx op[6];
 
   op[0] = operands[0];
   op[1] = operands[1];
@@ -6988,11 +6991,13 @@ avr_out_shift_with_cnt (rtx_code code, rtx_insn *insn, rtx operands[],
       if (count <= 0)
 	return "";
 
-      if (count < 8 && !scratch)
+      if (count < 8 && tail_regno >= REG_16)
+	tail_bits = true;
+      else if (count < 8 && !scratch)
 	use_zero_reg = true;
 
       if (optimize_size)
-	max_len = t_len + (scratch ? 3 : (use_zero_reg ? 4 : 5));
+	max_len = t_len + (scratch || tail_bits ? 3 : (use_zero_reg ? 4 : 5));
 
       if (t_len * count <= max_len)
 	{
@@ -7004,7 +7009,27 @@ avr_out_shift_with_cnt (rtx_code code, rtx_insn *insn, rtx operands[],
 	  return "";
 	}
 
-      if (scratch)
+      if (tail_bits)
+	{
+	  /* The tail register (the last one in a multi-byte shift) is
+	     an upper register, so we can insert a stop mask into it.
+	     This will cost 2 instructions, but the loop body is one
+	     instruction shorter.  That yields the same code size as
+	     the "scratch" case but saves count-1 cycles.
+	     The loop branch is a BRCC that sees count-1 zeros and
+	     then a one to drop out of the loop.  */
+
+	  op[3] = all_regs_rtx[tail_regno];
+	  op[4] = gen_int_mode (code == ASHIFT
+				? 0xff >> count
+				: 0xff << count, QImode);
+	  op[5] = gen_int_mode (code == ASHIFT
+				? 0x80 >> (count - 1)
+				: 0x01 << (count - 1), QImode);
+	  avr_asm_len ("andi %3,%4" CR_TAB
+		       "ori %3,%5", op, plen, 2);
+	}
+      else if (scratch)
 	{
 	  avr_asm_len ("ldi %3,%2", op, plen, 1);
 	}
@@ -7065,8 +7090,15 @@ avr_out_shift_with_cnt (rtx_code code, rtx_insn *insn, rtx operands[],
   if (second_label)
     avr_asm_len ("2:", op, plen, 0);
 
-  avr_asm_len (use_zero_reg ? "lsr %3" : "dec %3", op, plen, 1);
-  avr_asm_len (second_label ? "brpl 1b" : "brne 1b", op, plen, 1);
+  if (tail_bits)
+    {
+      avr_asm_len ("brcc 1b", op, plen, 1);
+    }
+  else
+    {
+      avr_asm_len (use_zero_reg ? "lsr %3" : "dec %3", op, plen, 1);
+      avr_asm_len (second_label ? "brpl 1b" : "brne 1b", op, plen, 1);
+    }
 
   if (saved_in_tmp)
     avr_asm_len ("mov %3,%4", op, plen, 1);

Reply via email to