Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-05-18 Thread Andrew Stubbs

On 16/05/12 17:43, Ramana Radhakrishnan wrote:

OK (if no regressions).


Cross-tested with no regressions, and committed.

Thanks

Andrew


Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-05-16 Thread Ramana Radhakrishnan

  extern const struct tune_params *current_tune;
  extern int vfp3_const_double_for_fract_bits (rtx);
 +
 +extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
 +rtx);
  #endif /* RTX_CODE */

  #endif /* ! GCC_ARM_PROTOS_H */
 diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
 index c3a19e4..02dc6ca 100644
 --- a/gcc/config/arm/arm.c
 +++ b/gcc/config/arm/arm.c
 @@ -25213,5 +25213,206 @@ vfp3_const_double_for_fract_bits (rtx operand)
return 0;
  }

 +/* The default expansion of general 64-bit shifts in core-regs is suboptimal
 +   on ARM, since we know that shifts by negative amounts are no-ops.
 +
 +   It's safe for the input and output to be the same register, but
 +   early-clobber rules apply for the shift amount and scratch registers.
 +
 +   Shift by register requires both scratch registers.  Shift by a constant
 +   less than 32 in Thumb2 mode requires SCRATCH1 only.  In all other cases
 +   the scratch registers may be NULL.
 +
 +   Additionally, ashiftrt by a register also clobbers the CC register.  */
 +void
 +arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in,
 +rtx amount, rtx scratch1, rtx scratch2)
 +{
 +  rtx out_high = gen_highpart (SImode, out);
 +  rtx out_low = gen_lowpart (SImode, out);
 +  rtx in_high = gen_highpart (SImode, in);
 +  rtx in_low = gen_lowpart (SImode, in);
 +
 +  /* Bits flow from up-stream to down-stream.  */

Something more about upstream and downstream here would be nice :)

 +  rtx out_up   = code == ASHIFT ? out_low : out_high;
 +  rtx out_down = code == ASHIFT ? out_high : out_low;
 +  rtx in_up   = code == ASHIFT ? in_low : in_high;
 +  rtx in_down = code == ASHIFT ? in_high : in_low;
 +
 +  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
 +  gcc_assert (out
 +              && (REG_P (out) || GET_CODE (out) == SUBREG)
 +              && GET_MODE (out) == DImode);
 +  gcc_assert (in
 +              && (REG_P (in) || GET_CODE (in) == SUBREG)
 +              && GET_MODE (in) == DImode);
 +  gcc_assert (amount
 +              && (((REG_P (amount) || GET_CODE (amount) == SUBREG)
 +                   && GET_MODE (amount) == SImode)
 +                  || CONST_INT_P (amount)));
 +  gcc_assert (scratch1 == NULL
 +              || (GET_CODE (scratch1) == SCRATCH)
 +              || (GET_MODE (scratch1) == SImode
 +                  && REG_P (scratch1)));
 +  gcc_assert (scratch2 == NULL
 +              || (GET_CODE (scratch2) == SCRATCH)
 +              || (GET_MODE (scratch2) == SImode
 +                  && REG_P (scratch2)));
 +  gcc_assert (!REG_P (out) || !REG_P (amount)
 +              || !HARD_REGISTER_P (out)
 +              || (REGNO (out) != REGNO (amount)
 +                  && REGNO (out) + 1 != REGNO (amount)));
 +
 +  /* Macros to make following code more readable.  */
 +  #define SUB_32(DEST,SRC) \
 + gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32))
 +  #define RSB_32(DEST,SRC) \
 + gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC))
 +  #define SUB_S_32(DEST,SRC) \
 + gen_addsi3_compare0 ((DEST), (SRC), \
 +  gen_rtx_CONST_INT (VOIDmode, -32))
 +  #define SET(DEST,SRC) \
 + gen_rtx_SET (SImode, (DEST), (SRC))
 +  #define SHIFT(CODE,SRC,AMOUNT) \
 + gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT))
 +  #define LSHIFT(CODE,SRC,AMOUNT) \
 + gen_rtx_fmt_ee ((CODE) == ASHIFT ? ASHIFT : LSHIFTRT, \
 + SImode, (SRC), (AMOUNT))
 +  #define REV_LSHIFT(CODE,SRC,AMOUNT) \
 + gen_rtx_fmt_ee ((CODE) == ASHIFT ? LSHIFTRT : ASHIFT, \
 + SImode, (SRC), (AMOUNT))
 +  #define ORR(A,B) \
 + gen_rtx_IOR (SImode, (A), (B))
 +  #define BRANCH(COND,LABEL) \
 + gen_arm_cond_branch ((LABEL), \
 +  gen_rtx_ ## COND (CCmode, cc_reg, \
 +const0_rtx), \
 +  cc_reg)
 +
 +  if (CONST_INT_P (amount))
 +{
 +  /* Shifts by a constant amount.  */
 +  if (INTVAL (amount) < 0)
 + /* Match what shift-by-register would do.  */
 + emit_insn (gen_movdi (out, in));
 +  else if (INTVAL (amount) >= 64)
 + {
 +   /* Match what shift-by-register would do.  */
 +   if (code == ASHIFTRT)
 + {
 +   rtx const31_rtx = gen_rtx_CONST_INT (VOIDmode, 31);
 +   emit_insn (SET (out_down, SHIFT (code, in_up, const31_rtx)));
 +   emit_insn (SET (out_up, SHIFT (code, in_up, const31_rtx)));
 + }
 +   else
 + emit_insn (gen_movdi (out, const0_rtx));
 + }
 +  else if (INTVAL (amount) < 32)
 + {
 +   /* Shifts by a constant less than 32.  */
 +   rtx reverse_amount = gen_rtx_CONST_INT (VOIDmode,
 +   32 - INTVAL (amount));
 +
 +   emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
 +   emit_insn (SET 

Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-05-16 Thread Andrew Stubbs

On 16/05/12 11:25, Ramana Radhakrishnan wrote:

Ok with those changes.


Hi Ramana,

Here's an update rebased and modified as requested.

Can you please confirm that the comments explain what you wanted to 
know, and then I will commit it.


Thanks

Andrew
2012-05-16  Andrew Stubbs  a...@codesourcery.com

	gcc/
	* config/arm/arm-protos.h (arm_emit_coreregs_64bit_shift): New
	prototype.
	* config/arm/arm.c (arm_emit_coreregs_64bit_shift): New function.
	* config/arm/arm.md (ashldi3): Use arm_emit_coreregs_64bit_shift.
	(ashrdi3,lshrdi3): Likewise.
	(arm_cond_branch): Remove '*' to enable gen_arm_cond_branch.

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index cb74d70..b338470 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -245,6 +245,9 @@ struct tune_params
 
 extern const struct tune_params *current_tune;
 extern int vfp3_const_double_for_fract_bits (rtx);
+
+extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
+	   rtx);
 #endif /* RTX_CODE */
 
 extern void arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel);
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 2c62c51..3ad4c75 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -25933,4 +25933,256 @@ arm_autoinc_modes_ok_p (enum machine_mode mode, enum arm_auto_incmodes code)
   return false;
 }
 
+/* The default expansion of general 64-bit shifts in core-regs is suboptimal,
+   on ARM, since we know that shifts by negative amounts are no-ops.
+   Additionally, the default expansion code is not available or suitable
+   for post-reload insn splits (this can occur when the register allocator
+   chooses not to do a shift in NEON).
+   
+   This function is used in both initial expand and post-reload splits, and
+   handles all kinds of 64-bit shifts.
+
+   Input requirements:
+- It is safe for the input and output to be the same register, but
+  early-clobber rules apply for the shift amount and scratch registers.
+- Shift by register requires both scratch registers.  Shift by a constant
+  less than 32 in Thumb2 mode requires SCRATCH1 only.  In all other cases
+  the scratch registers may be NULL.
+- Ashiftrt by a register also clobbers the CC register.  */
+void
+arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in,
+			   rtx amount, rtx scratch1, rtx scratch2)
+{
+  rtx out_high = gen_highpart (SImode, out);
+  rtx out_low = gen_lowpart (SImode, out);
+  rtx in_high = gen_highpart (SImode, in);
+  rtx in_low = gen_lowpart (SImode, in);
+
+  /* Terminology:
+	in = the register pair containing the input value.
+	out = the destination register pair.
+	up = the high- or low-part of each pair.
+	down = the opposite part to up.
+ In a shift, we can consider bits to shift from up-stream to
+ down-stream, so in a left-shift up is the low-part and down
+ is the high-part of each register pair.  */
+
+  rtx out_up   = code == ASHIFT ? out_low : out_high;
+  rtx out_down = code == ASHIFT ? out_high : out_low;
+  rtx in_up   = code == ASHIFT ? in_low : in_high;
+  rtx in_down = code == ASHIFT ? in_high : in_low;
+
+  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+  gcc_assert (out
+              && (REG_P (out) || GET_CODE (out) == SUBREG)
+              && GET_MODE (out) == DImode);
+  gcc_assert (in
+              && (REG_P (in) || GET_CODE (in) == SUBREG)
+              && GET_MODE (in) == DImode);
+  gcc_assert (amount
+              && (((REG_P (amount) || GET_CODE (amount) == SUBREG)
+                   && GET_MODE (amount) == SImode)
+                  || CONST_INT_P (amount)));
+  gcc_assert (scratch1 == NULL
+              || (GET_CODE (scratch1) == SCRATCH)
+              || (GET_MODE (scratch1) == SImode
+                  && REG_P (scratch1)));
+  gcc_assert (scratch2 == NULL
+              || (GET_CODE (scratch2) == SCRATCH)
+              || (GET_MODE (scratch2) == SImode
+                  && REG_P (scratch2)));
+  gcc_assert (!REG_P (out) || !REG_P (amount)
+              || !HARD_REGISTER_P (out)
+              || (REGNO (out) != REGNO (amount)
+                  && REGNO (out) + 1 != REGNO (amount)));
+
+  /* Macros to make following code more readable.  */
+  #define SUB_32(DEST,SRC) \
+	gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32))
+  #define RSB_32(DEST,SRC) \
+	gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC))
+  #define SUB_S_32(DEST,SRC) \
+	gen_addsi3_compare0 ((DEST), (SRC), \
+ gen_rtx_CONST_INT (VOIDmode, -32))
+  #define SET(DEST,SRC) \
+	gen_rtx_SET (SImode, (DEST), (SRC))
+  #define SHIFT(CODE,SRC,AMOUNT) \
+	gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT))
+  #define LSHIFT(CODE,SRC,AMOUNT) \
+	gen_rtx_fmt_ee ((CODE) == ASHIFT ? ASHIFT : LSHIFTRT, \
+			SImode, (SRC), (AMOUNT))
+  #define REV_LSHIFT(CODE,SRC,AMOUNT) \
+	gen_rtx_fmt_ee ((CODE) == ASHIFT ? LSHIFTRT : ASHIFT, \
+			SImode, (SRC), (AMOUNT))
+  #define ORR(A,B) \
+	gen_rtx_IOR (SImode, (A), (B))
+  #define BRANCH(COND,LABEL) \
+	gen_arm_cond_branch ((LABEL), \
+ 

Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-05-16 Thread Ramana Radhakrishnan
On 16 May 2012 17:09, Andrew Stubbs a...@codesourcery.com wrote:
 On 16/05/12 11:25, Ramana Radhakrishnan wrote:

 Ok with those changes.


 Hi Ramana,

 Here's an update rebased and modified as requested.

 Can you please confirm that the comments explain what you wanted to know,
 and then I will commit it.

OK (if no regressions).

Ramana


Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-02-17 Thread Andrew Stubbs

On 16/02/12 15:33, Andrew Stubbs wrote:

OK for 4.8?


I forgot to address Ramana's comment about optimize_size.

This update fixes that and leaves everything else untouched.

OK?

Andrew
2012-02-17  Andrew Stubbs  a...@codesourcery.com

	gcc/
	* config/arm/arm-protos.h (arm_emit_coreregs_64bit_shift): New
	prototype.
	* config/arm/arm.c (arm_emit_coreregs_64bit_shift): New function.
	* config/arm/arm.md (ashldi3): Use arm_emit_coreregs_64bit_shift.
	(ashrdi3,lshrdi3): Likewise.
	(arm_cond_branch): Remove '*' to enable gen_arm_cond_branch.

---
 gcc/config/arm/arm-protos.h |3 +
 gcc/config/arm/arm.c|  201 +++
 gcc/config/arm/arm.md   |  104 --
 3 files changed, 280 insertions(+), 28 deletions(-)

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 296550a..df8d7a9 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -242,6 +242,9 @@ struct tune_params
 
 extern const struct tune_params *current_tune;
 extern int vfp3_const_double_for_fract_bits (rtx);
+
+extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
+	   rtx);
 #endif /* RTX_CODE */
 
 #endif /* ! GCC_ARM_PROTOS_H */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index c3a19e4..02dc6ca 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -25213,5 +25213,206 @@ vfp3_const_double_for_fract_bits (rtx operand)
   return 0;
 }
 
+/* The default expansion of general 64-bit shifts in core-regs is suboptimal
+   on ARM, since we know that shifts by negative amounts are no-ops.
+
+   It's safe for the input and output to be the same register, but
+   early-clobber rules apply for the shift amount and scratch registers.
+
+   Shift by register requires both scratch registers.  Shift by a constant
+   less than 32 in Thumb2 mode requires SCRATCH1 only.  In all other cases
+   the scratch registers may be NULL.
+
+   Additionally, ashiftrt by a register also clobbers the CC register.  */
+void
+arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in,
+			   rtx amount, rtx scratch1, rtx scratch2)
+{
+  rtx out_high = gen_highpart (SImode, out);
+  rtx out_low = gen_lowpart (SImode, out);
+  rtx in_high = gen_highpart (SImode, in);
+  rtx in_low = gen_lowpart (SImode, in);
+
+  /* Bits flow from up-stream to down-stream.  */
+  rtx out_up   = code == ASHIFT ? out_low : out_high;
+  rtx out_down = code == ASHIFT ? out_high : out_low;
+  rtx in_up   = code == ASHIFT ? in_low : in_high;
+  rtx in_down = code == ASHIFT ? in_high : in_low;
+
+  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+  gcc_assert (out
+              && (REG_P (out) || GET_CODE (out) == SUBREG)
+              && GET_MODE (out) == DImode);
+  gcc_assert (in
+              && (REG_P (in) || GET_CODE (in) == SUBREG)
+              && GET_MODE (in) == DImode);
+  gcc_assert (amount
+              && (((REG_P (amount) || GET_CODE (amount) == SUBREG)
+                   && GET_MODE (amount) == SImode)
+                  || CONST_INT_P (amount)));
+  gcc_assert (scratch1 == NULL
+              || (GET_CODE (scratch1) == SCRATCH)
+              || (GET_MODE (scratch1) == SImode
+                  && REG_P (scratch1)));
+  gcc_assert (scratch2 == NULL
+              || (GET_CODE (scratch2) == SCRATCH)
+              || (GET_MODE (scratch2) == SImode
+                  && REG_P (scratch2)));
+  gcc_assert (!REG_P (out) || !REG_P (amount)
+              || !HARD_REGISTER_P (out)
+              || (REGNO (out) != REGNO (amount)
+                  && REGNO (out) + 1 != REGNO (amount)));
+
+  /* Macros to make following code more readable.  */
+  #define SUB_32(DEST,SRC) \
+	gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32))
+  #define RSB_32(DEST,SRC) \
+	gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC))
+  #define SUB_S_32(DEST,SRC) \
+	gen_addsi3_compare0 ((DEST), (SRC), \
+ gen_rtx_CONST_INT (VOIDmode, -32))
+  #define SET(DEST,SRC) \
+	gen_rtx_SET (SImode, (DEST), (SRC))
+  #define SHIFT(CODE,SRC,AMOUNT) \
+	gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT))
+  #define LSHIFT(CODE,SRC,AMOUNT) \
+	gen_rtx_fmt_ee ((CODE) == ASHIFT ? ASHIFT : LSHIFTRT, \
+			SImode, (SRC), (AMOUNT))
+  #define REV_LSHIFT(CODE,SRC,AMOUNT) \
+	gen_rtx_fmt_ee ((CODE) == ASHIFT ? LSHIFTRT : ASHIFT, \
+			SImode, (SRC), (AMOUNT))
+  #define ORR(A,B) \
+	gen_rtx_IOR (SImode, (A), (B))
+  #define BRANCH(COND,LABEL) \
+	gen_arm_cond_branch ((LABEL), \
+ gen_rtx_ ## COND (CCmode, cc_reg, \
+		   const0_rtx), \
+ cc_reg)
+
+  if (CONST_INT_P (amount))
+{
+  /* Shifts by a constant amount.  */
+  if (INTVAL (amount) < 0)
+	/* Match what shift-by-register would do.  */
+	emit_insn (gen_movdi (out, in));
+  else if (INTVAL (amount) >= 64)
+	{
+	  /* Match what shift-by-register would do.  */
+	  if (code == ASHIFTRT)
+	{
+	  rtx const31_rtx = gen_rtx_CONST_INT (VOIDmode, 31);
+	  emit_insn (SET (out_down, SHIFT (code, in_up, const31_rtx)));
+	  emit_insn (SET (out_up, SHIFT 

Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-02-16 Thread Andrew Stubbs

On 11/02/12 01:11, Richard Henderson wrote:

On 02/08/2012 08:28 AM, Andrew Stubbs wrote:

Unfortunately, these don't work in Thumb mode (no IT block), and I'd have to 
add arith-shift variants, I think, for ARM mode to work.

Hmmm ... I'll try again.


Does it work to simply use branches initially, and rely on post-reload
ifcvt to transform to cond_exec when possible?


It did work in ARM mode, but did not work in Thumb2 mode. My recent 
patch has fixed that:

http://gcc.gnu.org/ml/gcc-patches/2012-02/msg00734.html

With that patch applied, the attached patch has almost the right effect. 
The only problem now is that the scheduling gives multiple IT blocks, 
but that's a different problem.


Here's the new thumb code for ashiftrt:

ldrd    r2, [r0]
push    {r4, r5, r6}
mov r4, r1
rsb r5, r1, #32
subs    r6, r4, #32
lsr r0, r2, r4
lsl r5, r3, r5
it  ge
asrge   r6, r3, r6
asr r1, r3, r4
orr r0, r0, r5
it  ge
orrge   r0, r0, r6
pop {r4, r5, r6}
bx  lr
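
For reference, this listing presumably corresponds to the arithmetic-shift
variant of the earlier test case, i.e. something like (hypothetical source,
shown only to make the listing easier to follow):

long long
f (long long *a, int b)
{
  return *a >> b;
}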

OK for 4.8?

Andrew
2012-02-16  Andrew Stubbs  a...@codesourcery.com

	gcc/
	* config/arm/arm-protos.h (arm_emit_coreregs_64bit_shift): New
	prototype.
	* config/arm/arm.c (arm_emit_coreregs_64bit_shift): New function.
	* config/arm/arm.md (ashldi3): Use arm_emit_coreregs_64bit_shift.
	(ashrdi3,lshrdi3): Likewise.
	(arm_cond_branch): Remove '*' to enable gen_arm_cond_branch.

---
 gcc/config/arm/arm-protos.h |3 +
 gcc/config/arm/arm.c|  201 +++
 gcc/config/arm/arm.md   |  104 --
 3 files changed, 280 insertions(+), 28 deletions(-)

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 296550a..df8d7a9 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -242,6 +242,9 @@ struct tune_params
 
 extern const struct tune_params *current_tune;
 extern int vfp3_const_double_for_fract_bits (rtx);
+
+extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
+	   rtx);
 #endif /* RTX_CODE */
 
 #endif /* ! GCC_ARM_PROTOS_H */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 0bded8d..f3accc8 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -25139,5 +25139,206 @@ vfp3_const_double_for_fract_bits (rtx operand)
   return 0;
 }
 
+/* The default expansion of general 64-bit shifts in core-regs is suboptimal
+   on ARM, since we know that shifts by negative amounts are no-ops.
+
+   It's safe for the input and output to be the same register, but
+   early-clobber rules apply for the shift amount and scratch registers.
+
+   Shift by register requires both scratch registers.  Shift by a constant
+   less than 32 in Thumb2 mode requires SCRATCH1 only.  In all other cases
+   the scratch registers may be NULL.
+
+   Additionally, ashiftrt by a register also clobbers the CC register.  */
+void
+arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in,
+			   rtx amount, rtx scratch1, rtx scratch2)
+{
+  rtx out_high = gen_highpart (SImode, out);
+  rtx out_low = gen_lowpart (SImode, out);
+  rtx in_high = gen_highpart (SImode, in);
+  rtx in_low = gen_lowpart (SImode, in);
+
+  /* Bits flow from up-stream to down-stream.  */
+  rtx out_up   = code == ASHIFT ? out_low : out_high;
+  rtx out_down = code == ASHIFT ? out_high : out_low;
+  rtx in_up   = code == ASHIFT ? in_low : in_high;
+  rtx in_down = code == ASHIFT ? in_high : in_low;
+
+  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+  gcc_assert (out
+              && (REG_P (out) || GET_CODE (out) == SUBREG)
+              && GET_MODE (out) == DImode);
+  gcc_assert (in
+              && (REG_P (in) || GET_CODE (in) == SUBREG)
+              && GET_MODE (in) == DImode);
+  gcc_assert (amount
+              && (((REG_P (amount) || GET_CODE (amount) == SUBREG)
+                   && GET_MODE (amount) == SImode)
+                  || CONST_INT_P (amount)));
+  gcc_assert (scratch1 == NULL
+              || (GET_CODE (scratch1) == SCRATCH)
+              || (GET_MODE (scratch1) == SImode
+                  && REG_P (scratch1)));
+  gcc_assert (scratch2 == NULL
+              || (GET_CODE (scratch2) == SCRATCH)
+              || (GET_MODE (scratch2) == SImode
+                  && REG_P (scratch2)));
+  gcc_assert (!REG_P (out) || !REG_P (amount)
+              || !HARD_REGISTER_P (out)
+              || (REGNO (out) != REGNO (amount)
+                  && REGNO (out) + 1 != REGNO (amount)));
+
+  /* Macros to make following code more readable.  */
+  #define SUB_32(DEST,SRC) \
+	gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32))
+  #define RSB_32(DEST,SRC) \
+	gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC))
+  #define SUB_S_32(DEST,SRC) \
+	gen_addsi3_compare0 ((DEST), (SRC), \
+ gen_rtx_CONST_INT (VOIDmode, -32))
+  #define SET(DEST,SRC) \
+	gen_rtx_SET (SImode, (DEST), (SRC))
+  #define SHIFT(CODE,SRC,AMOUNT) \
+	gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT))
+  #define 

Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-02-10 Thread Richard Henderson
On 02/08/2012 08:28 AM, Andrew Stubbs wrote:
 Unfortunately, these don't work in Thumb mode (no IT block), and I'd have to 
 add arith-shift variants, I think, for ARM mode to work.
 
 Hmmm ... I'll try again.

Does it work to simply use branches initially, and rely on post-reload
ifcvt to transform to cond_exec when possible?


r~


Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-02-08 Thread Andrew Stubbs

On 07/02/12 22:19, Ramana Radhakrishnan wrote:

I find it interesting that cond_exec's in this form survive all the
way till reload and work.  AFAIK we could never have cond_exec's
before reload.  The documentation doesn't appear to mention this,
therefore I would like to see if the cond_exec's can be recast as
if_then_else forms from expand rather than these forms. Has this
survived bootstrap and testing?


I've tried to do this, but it can't be done by a straight translation 
because we don't have the insns available to do it. As I understand it, 
all predicable instructions automatically get a cond_exec variant, but 
the only if_then_else I could find (it's hard to grep because 
if_then_else occurs many times in attributes) is in the conditional move 
instruction. Have I missed something?


Anyway, I've tried to redo it using conditional move, and that works, 
but it's not as efficient.


Hopefully Steven Bosscher is correct, and there's no problem with 
cond_exec on ARM.



+  /* If we're optimizing for size, we prefer the libgcc calls.  */
+  if (optimize_size)
+   FAIL;


In addition you want to replace optimize_size with
optimize_function_for_size_p in these cases.


Ok, I've fixed this. I'll post an update soon.
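
For the archives, the fix presumably amounts to swapping the predicate in
the guard, along these lines (a sketch, not the literal committed hunk):

  /* If we're optimizing for size, we prefer the libgcc calls.  */
  if (optimize_function_for_size_p (cfun))
    FAIL;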

Unfortunately, my testing has found a problem in the native bootstrap, 
so I'm going to need to track that down first.


Andrew



Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-02-08 Thread Bernd Schmidt
On 02/07/2012 11:33 PM, Steven Bosscher wrote:
 On Tue, Feb 7, 2012 at 11:19 PM, Ramana Radhakrishnan
 ramana.radhakrish...@linaro.org wrote:
 Hi Andrew

 I find it interesting that cond_exec's in this form survive all the
 way till reload and work.  AFAIK we could never have cond_exec's
 before reload.
 
 There is nothing wrong per-se with cond_execs before reload, as long
 as you don't have to reload a predicate pseudo-reg.

I thought the problem was that we'd have to emit conditional reload
insns and inheritance wouldn't work.


Bernd


Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-02-08 Thread Richard Guenther
On Wed, Feb 8, 2012 at 1:02 PM, Bernd Schmidt ber...@codesourcery.com wrote:
 On 02/07/2012 11:33 PM, Steven Bosscher wrote:
 On Tue, Feb 7, 2012 at 11:19 PM, Ramana Radhakrishnan
 ramana.radhakrish...@linaro.org wrote:
 Hi Andrew

 I find it interesting that cond_exec's in this form survive all the
 way till reload and work.  AFAIK we could never have cond_exec's
 before reload.

 There is nothing wrong per-se with cond_execs before reload, as long
 as you don't have to reload a predicate pseudo-reg.

 I thought the problem was that we'd have to emit conditional reload
 insns and inheritance wouldn't work.

It probably depends on how DF sees conditional uses / defs.  If they
look like regular uses / defs then I suppose un-conditional spills/reloads
are fine - otherwise of course you'd corrupt one of the two register set
states.  But that also means it's probably safe if the sequence of conditional
insns is of length 1.

Not sure we want to open that possible can of worms though ;)

Richard.


 Bernd


Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-02-08 Thread Bernd Schmidt
On 02/08/2012 01:12 PM, Richard Guenther wrote:
 On Wed, Feb 8, 2012 at 1:02 PM, Bernd Schmidt ber...@codesourcery.com wrote:
 On 02/07/2012 11:33 PM, Steven Bosscher wrote:
 On Tue, Feb 7, 2012 at 11:19 PM, Ramana Radhakrishnan
 ramana.radhakrish...@linaro.org wrote:
 Hi Andrew

 I find it interesting that cond_exec's in this form survive all the
 way till reload and work.  AFAIK we could never have cond_exec's
 before reload.

 There is nothing wrong per-se with cond_execs before reload, as long
 as you don't have to reload a predicate pseudo-reg.

 I thought the problem was that we'd have to emit conditional reload
 insns and inheritance wouldn't work.
 
 It probably depends on how DF sees conditional uses / defs.  If they
 look like regular uses / defs then I suppose un-conditional spills/reloads
 are fine - otherwise of course you'd corrupt one of the two register set
 states.

I'm pretty sure conditional defs are always RMW, but I'd have to go
look. Can't imagine it working otherwise though.


Bernd


Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-02-08 Thread Richard Guenther
On Wed, Feb 8, 2012 at 1:41 PM, Bernd Schmidt ber...@codesourcery.com wrote:
 On 02/08/2012 01:12 PM, Richard Guenther wrote:
 On Wed, Feb 8, 2012 at 1:02 PM, Bernd Schmidt ber...@codesourcery.com 
 wrote:
 On 02/07/2012 11:33 PM, Steven Bosscher wrote:
 On Tue, Feb 7, 2012 at 11:19 PM, Ramana Radhakrishnan
 ramana.radhakrish...@linaro.org wrote:
 Hi Andrew

 I find it interesting that cond_exec's in this form survive all the
 way till reload and work.  AFAIK we could never have cond_exec's
 before reload.

 There is nothing wrong per-se with cond_execs before reload, as long
 as you don't have to reload a predicate pseudo-reg.

 I thought the problem was that we'd have to emit conditional reload
 insns and inheritance wouldn't work.

 It probably depends on how DF sees conditional uses / defs.  If they
 look like regular uses / defs then I suppose un-conditional spills/reloads
 are fine - otherwise of course you'd corrupt one of the two register set
 states.

 I'm pretty sure conditional defs are always RMW, but I'd have to go
 look. Can't imagine it working otherwise though.

I was thinking about

  (cond_exec 1 (set (reg:SI 30) (...)))
  (cond_exec 0 (set (reg:SI 30) (...)))
  (cond_exec 1 (use (reg:SI 30) (...)))

where if we spill/reload 30 with non-cond_exec stmts then we might
clobber the shared register set of the conditional execution paths.
Similar of course shared stack space if we happen to spill/reload in
both paths.  If of course defs/uses of both paths conflict and get different
hard registers assigned the register issue doesn't exist, still the
shared spill stack space issue may.

Richard.


Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-02-08 Thread Andrew Stubbs

On 08/02/12 11:18, Andrew Stubbs wrote:

I've tried to do this, but it can't be done by a straight translation
because we don't have the insns available to do it. As I understand it,
all predicable instructions automatically get a cond_exec variant, but
the only if_then_else I could find (it's hard to grep because
if_then_else occurs many times in attributes) is in the conditional move
instruction. Have I missed something?


Ok, I missed the various patterns like if_arith_move.

Unfortunately, these don't work in Thumb mode (no IT block), and I'd 
have to add arith-shift variants, I think, for ARM mode to work.


Hmmm ... I'll try again.

Andrew


Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-02-07 Thread Ramana Radhakrishnan
Hi Andrew

I find it interesting that cond_exec's in this form survive all the
way till reload and work.  AFAIK we could never have cond_exec's
before reload.  The documentation doesn't appear to mention this,
therefore I would like to see if the cond_exec's can be recast as
if_then_else forms from expand rather than these forms. Has this
survived bootstrap and testing?


 +  /* If we're optimizing for size, we prefer the libgcc calls.  */
 +  if (optimize_size)
 + FAIL;

In addition you want to replace optimize_size with
optimize_function_for_size_p in these cases.

cheers
Ramana


Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-02-07 Thread Steven Bosscher
On Tue, Feb 7, 2012 at 11:19 PM, Ramana Radhakrishnan
ramana.radhakrish...@linaro.org wrote:
 Hi Andrew

 I find it interesting that cond_exec's in this form survive all the
 way till reload and work.  AFAIK we could never have cond_exec's
 before reload.

There is nothing wrong per-se with cond_execs before reload, as long
as you don't have to reload a predicate pseudo-reg. For ia64, with its
(IIRC) 64 predicate registers, it was not practical, or even possible,
to assign the predicate registers to hard regs during expand.

AFAIU this isn't an issue on ARM because there is just one condition
code register.

Ciao!
Steven


Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-01-31 Thread Andrew Stubbs

On 30/01/12 15:25, Richard Earnshaw wrote:

What's the impact of this on -Os?  At present we fall back to the
libcalls, but I can't immediately see how the new code would do that.

Gut feeling is that shift by a constant is always worth inlining at -Os,
but shift by a register isn't.


Ah, I hadn't considered that. Good point!

This updated patch causes it to fall back to the old behaviour in 
optimize_size mode. This should do what you want.
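
A minimal sketch of how such a guard can sit at the top of a DImode shift
expander (the placement and the constant-shift exception are assumptions
based on Richard's suggestion, not the literal hunk):

  /* At -Os, keep inlining shifts by a constant, but let shifts by a
     register fall back to the libgcc calls.  */
  if (optimize_size && !CONST_INT_P (operands[2]))
    FAIL;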


OK?

Andrew
2012-01-31  Andrew Stubbs  a...@codesourcery.com

	* config/arm/arm-protos.h (arm_emit_coreregs_64bit_shift): New
	prototype.
	* config/arm/arm.c (arm_emit_coreregs_64bit_shift): New function.
	* config/arm/arm.md (ashldi3): Use arm_emit_coreregs_64bit_shift.
	(ashrdi3,lshrdi3): Likewise.

---
 gcc/config/arm/arm-protos.h |3 +
 gcc/config/arm/arm.c|  198 +++
 gcc/config/arm/arm.md   |  102 --
 3 files changed, 276 insertions(+), 27 deletions(-)

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 296550a..df8d7a9 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -242,6 +242,9 @@ struct tune_params
 
 extern const struct tune_params *current_tune;
 extern int vfp3_const_double_for_fract_bits (rtx);
+
+extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
+	   rtx);
 #endif /* RTX_CODE */
 
 #endif /* ! GCC_ARM_PROTOS_H */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 0bded8d..eefc45c 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -25139,5 +25139,203 @@ vfp3_const_double_for_fract_bits (rtx operand)
   return 0;
 }
 
+/* The default expansion of general 64-bit shifts in core-regs is suboptimal
+   on ARM, since we know that shifts by negative amounts are no-ops.
+
+   It's safe for the input and output to be the same register, but
+   early-clobber rules apply for the shift amount and scratch registers.
+
+   Shift by register requires both scratch registers.  Shift by a constant
+   less than 32 in Thumb2 mode requires SCRATCH1 only.  In all other cases
+   the scratch registers may be NULL.
+   
+   Additionally, ashiftrt by a register also clobbers the CC register.  */
+void
+arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in,
+			   rtx amount, rtx scratch1, rtx scratch2)
+{
+  rtx out_high = gen_highpart (SImode, out);
+  rtx out_low = gen_lowpart (SImode, out);
+  rtx in_high = gen_highpart (SImode, in);
+  rtx in_low = gen_lowpart (SImode, in);
+
+  /* Bits flow from up-stream to down-stream.  */
+  rtx out_up   = code == ASHIFT ? out_low : out_high;
+  rtx out_down = code == ASHIFT ? out_high : out_low;
+  rtx in_up   = code == ASHIFT ? in_low : in_high;
+  rtx in_down = code == ASHIFT ? in_high : in_low;
+
+  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+  gcc_assert (out
+              && (REG_P (out) || GET_CODE (out) == SUBREG)
+              && GET_MODE (out) == DImode);
+  gcc_assert (in
+              && (REG_P (in) || GET_CODE (in) == SUBREG)
+              && GET_MODE (in) == DImode);
+  gcc_assert (amount
+              && (((REG_P (amount) || GET_CODE (amount) == SUBREG)
+                   && GET_MODE (amount) == SImode)
+                  || CONST_INT_P (amount)));
+  gcc_assert (scratch1 == NULL
+              || (GET_CODE (scratch1) == SCRATCH)
+              || (GET_MODE (scratch1) == SImode
+                  && REG_P (scratch1)));
+  gcc_assert (scratch2 == NULL
+              || (GET_CODE (scratch2) == SCRATCH)
+              || (GET_MODE (scratch2) == SImode
+                  && REG_P (scratch2)));
+  gcc_assert (!REG_P (out) || !REG_P (amount)
+              || !HARD_REGISTER_P (out)
+              || (REGNO (out) != REGNO (amount)
+                  && REGNO (out) + 1 != REGNO (amount)));
+
+  /* Macros to make following code more readable.  */
+  #define SUB_32(DEST,SRC) \
+	gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32))
+  #define RSB_32(DEST,SRC) \
+	gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC))
+  #define SUB_S_32(DEST,SRC) \
+	gen_addsi3_compare0 ((DEST), (SRC), \
+ gen_rtx_CONST_INT (VOIDmode, -32))
+  #define SET(DEST,SRC) \
+	gen_rtx_SET (SImode, (DEST), (SRC))
+  #define SHIFT(CODE,SRC,AMOUNT) \
+	gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT))
+  #define LSHIFT(CODE,SRC,AMOUNT) \
+	gen_rtx_fmt_ee ((CODE) == ASHIFT ? ASHIFT : LSHIFTRT, \
+			SImode, (SRC), (AMOUNT))
+  #define REV_LSHIFT(CODE,SRC,AMOUNT) \
+	gen_rtx_fmt_ee ((CODE) == ASHIFT ? LSHIFTRT : ASHIFT, \
+			SImode, (SRC), (AMOUNT))
+  #define ORR(A,B) \
+	gen_rtx_IOR (SImode, (A), (B))
+  #define IF(COND,RTX) \
+	gen_rtx_COND_EXEC (VOIDmode, \
+			   gen_rtx_ ## COND (CCmode, cc_reg, \
+		 const0_rtx), \
+			   (RTX))
+
+  if (CONST_INT_P (amount))
+{
+  /* Shifts by a constant amount.  */
+  if (INTVAL (amount) < 0)
+	/* Match what shift-by-register would do.  */
+	emit_insn (gen_movdi (out, in));
+  else if (INTVAL (amount) >= 64)
+	{
+	  /* Match what shift-by-register would do.  

Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-01-30 Thread Richard Earnshaw
On 27/01/12 16:07, Andrew Stubbs wrote:
 Hi all,
 
 This patch introduces a new, more efficient set of DImode shift 
 sequences for values stored in core-registers (as opposed to VFP/NEON 
 registers).
 
 The new sequences take advantage of knowledge of what the ARM 
 instructions do with out-of-range shift amounts.
 
 The following are examples of a simple test case, like this one:
 
 long long
 f (long long *a, int b)
 {
 return *a << b;
 }
 
 
 In ARM mode, old left-shift vs. the new one:
 
 stmfd   sp!, {r4, r5}| ldrd    r2, [r0]
 rsb r4, r1, #32  | mov ip, r1
 ldr r5, [r0, #4] | stmfd   sp!, {r4, r5}
 subs    ip, r1, #32  | sub r5, ip, #32
 ldr r0, [r0, #0] | rsb r4, ip, #32
 mov r3, r5, asl r1   | mov r1, r3, asl ip
 orr r3, r3, r0, lsr r4   | mov r0, r2, asl ip
 mov r2, r0, asl r1   | orr r1, r1, r2, asl r5
 movpl   r3, r0, asl ip   | orr r1, r1, r2, lsr r4
 mov r0, r2   | ldmfd   sp!, {r4, r5}
 mov r1, r3   | bx  lr
 ldmfd   sp!, {r4, r5}|
 bx  lr   |
 
 In Thumb mode, old left-shift vs. new:
 
 ldr r2, [r0, #0] | ldrd    r2, [r0]
 ldr r3, [r0, #4] | push    {r4, r5, r6}
 push    {r4, r5, r6} | sub r6, r1, #32
 rsb r6, r1, #32  | mov r4, r1
 sub r4, r1, #32  | rsb r5, r1, #32
 lsls    r3, r3, r1   | lsls    r6, r2, r6
 lsrs    r6, r2, r6   | lsls    r1, r3, r1
 lsls    r5, r2, r4   | lsrs    r5, r2, r5
 orrs    r3, r3, r6   | lsls    r0, r2, r4
 lsls    r0, r2, r1   | orrs    r1, r1, r6
 bics    r1, r5, r4, asr #32  | orrs    r1, r1, r5
 it  cs   | pop {r4, r5, r6}
 movcs   r1, r3   | bx  lr
 pop {r4, r5, r6} |
 bx  lr   |
 
 Logical right shift is essentially the same sequence as the left shift 
 above. However, arithmetic right shift requires something slightly 
 different. Here it is in ARM mode, old vs. new:
 
 stmfd   sp!, {r4, r5}| ldrd    r2, [r0]
 rsb r4, r1, #32  | mov ip, r1
 ldr r5, [r0, #0] | stmfd   sp!, {r4, r5}
 subs    ip, r1, #32  | rsb r5, ip, #32
 ldr r0, [r0, #4] | subs    r4, ip, #32
 mov r2, r5, lsr r1   | mov r0, r2, lsr ip
 orr r2, r2, r0, asl r4   | mov r1, r3, asr ip
 mov r3, r0, asr r1   | orr r0, r0, r3, asl r5
 movpl   r2, r0, asr ip   | orrge   r0, r0, r3, asr r4
 mov r1, r3   | ldmfd   sp!, {r4, r5}
 mov r0, r2   | bx  lr
 ldmfd   sp!, {r4, r5}|
 bx  lr   |
 
 I won't bore you with the Thumb mode comparison.
 
 The shift-by-constant cases have also been reimplemented, although the 
 resultant sequences are much the same as before. (Doing this isn't 
 strictly necessary just yet, but when I post my next patch to do 64-bit 
 shifts in NEON, this feature will be required by the fall-back 
 alternatives.)
 
 I've run a regression test on a cross-compiler, and I should have native 
 test results next week sometime. Also some benchmark results.
 
 Is this OK for stage 1?
 

What's the impact of this on -Os?  At present we fall back to the
libcalls, but I can't immediately see how the new code would do that.

Gut feeling is that shift by a constant is always worth inlining at -Os,
but shift by a register isn't.

R.


 



[PATCH][ARM] Improve 64-bit shifts (non-NEON)

2012-01-27 Thread Andrew Stubbs

Hi all,

This patch introduces a new, more efficient set of DImode shift 
sequences for values stored in core-registers (as opposed to VFP/NEON 
registers).


The new sequences take advantage of knowledge of what the ARM 
instructions do with out-of-range shift amounts.
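
To make the trick concrete, here is a small C model of the semantics being
relied on (an illustration only: the helper names are invented, and the
behaviour modelled is the usual ARM rule that a register-specified shift
amount is taken from the bottom byte of the register, with LSL/LSR amounts
of 32 or more producing zero):

#include <stdint.h>

static uint32_t arm_lsl (uint32_t x, int32_t n)
{
  uint32_t amt = (uint32_t) n & 0xff;  /* bottom byte only */
  return amt < 32 ? x << amt : 0;      /* 32..255 shifts out everything */
}

static uint32_t arm_lsr (uint32_t x, int32_t n)
{
  uint32_t amt = (uint32_t) n & 0xff;
  return amt < 32 ? x >> amt : 0;
}

/* Branch-free 64-bit logical right shift for 0 <= n < 64: exactly one of
   the two cross-terms survives, because the other sees a "negative" amount
   that wraps to 224..255 and therefore contributes nothing.  */
static uint64_t lshr64 (uint32_t lo, uint32_t hi, int n)
{
  uint32_t out_lo = arm_lsr (lo, n)
                    | arm_lsl (hi, 32 - n)   /* vanishes when n >= 32 */
                    | arm_lsr (hi, n - 32);  /* vanishes when n < 32 */
  uint32_t out_hi = arm_lsr (hi, n);
  return ((uint64_t) out_hi << 32) | out_lo;
}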


The following are examples of a simple test case, like this one:

long long
f (long long *a, int b)
{
  return *a << b;
}
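
(The listings below were presumably produced by compiling that function at
-O2 in each instruction set, e.g. with a cross toolchain along the lines of:

  arm-none-eabi-gcc -O2 -marm -S test.c
  arm-none-eabi-gcc -O2 -mthumb -S test.c

where the target triplet and any -mcpu/-march options are assumptions, not
taken from the original post.)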


In ARM mode, old left-shift vs. the new one:

stmfd   sp!, {r4, r5}| ldrd    r2, [r0]
rsb r4, r1, #32  | mov ip, r1
ldr r5, [r0, #4] | stmfd   sp!, {r4, r5}
subs    ip, r1, #32  | sub r5, ip, #32
ldr r0, [r0, #0] | rsb r4, ip, #32
mov r3, r5, asl r1   | mov r1, r3, asl ip
orr r3, r3, r0, lsr r4   | mov r0, r2, asl ip
mov r2, r0, asl r1   | orr r1, r1, r2, asl r5
movpl   r3, r0, asl ip   | orr r1, r1, r2, lsr r4
mov r0, r2   | ldmfd   sp!, {r4, r5}
mov r1, r3   | bx  lr
ldmfd   sp!, {r4, r5}|
bx  lr   |

In Thumb mode, old left-shift vs. new:

ldr r2, [r0, #0] | ldrd    r2, [r0]
ldr r3, [r0, #4] | push    {r4, r5, r6}
push    {r4, r5, r6} | sub r6, r1, #32
rsb r6, r1, #32  | mov r4, r1
sub r4, r1, #32  | rsb r5, r1, #32
lsls    r3, r3, r1   | lsls    r6, r2, r6
lsrs    r6, r2, r6   | lsls    r1, r3, r1
lsls    r5, r2, r4   | lsrs    r5, r2, r5
orrs    r3, r3, r6   | lsls    r0, r2, r4
lsls    r0, r2, r1   | orrs    r1, r1, r6
bics    r1, r5, r4, asr #32  | orrs    r1, r1, r5
it  cs   | pop {r4, r5, r6}
movcs   r1, r3   | bx  lr
pop {r4, r5, r6} |
bx  lr   |

Logical right shift is essentially the same sequence as the left shift 
above. However, arithmetic right shift requires something slightly 
different. Here it is in ARM mode, old vs. new:


stmfd   sp!, {r4, r5}| ldrd    r2, [r0]
rsb r4, r1, #32  | mov ip, r1
ldr r5, [r0, #0] | stmfd   sp!, {r4, r5}
subs    ip, r1, #32  | rsb r5, ip, #32
ldr r0, [r0, #4] | subs    r4, ip, #32
mov r2, r5, lsr r1   | mov r0, r2, lsr ip
orr r2, r2, r0, asl r4   | mov r1, r3, asr ip
mov r3, r0, asr r1   | orr r0, r0, r3, asl r5
movpl   r2, r0, asr ip   | orrge   r0, r0, r3, asr r4
mov r1, r3   | ldmfd   sp!, {r4, r5}
mov r0, r2   | bx  lr
ldmfd   sp!, {r4, r5}|
bx  lr   |
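
The arithmetic variant cannot be made fully branch-free the same way
because ASR treats large amounts differently: it fills with copies of the
sign bit rather than with zeros, so the (amount - 32) cross-term no longer
vanishes on its own when the amount is below 32; hence the flag-setting
subs and the predicated orrge above.  Extending the model sketched earlier
(same caveats, invented name, and assuming GCC's arithmetic >> on signed
values):

static uint32_t arm_asr (uint32_t x, int32_t n)
{
  uint32_t amt = (uint32_t) n & 0xff;
  return amt < 32 ? (uint32_t) ((int32_t) x >> amt)   /* normal shift */
                  : (uint32_t) ((int32_t) x >> 31);   /* sign fill */
}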

I won't bore you with the Thumb mode comparison.

The shift-by-constant cases have also been reimplemented, although the 
resultant sequences are much the same as before. (Doing this isn't 
strictly necessary just yet, but when I post my next patch to do 64-bit 
shifts in NEON, this feature will be required by the fall-back 
alternatives.)


I've run a regression test on a cross-compiler, and I should have native 
test results next week sometime. Also some benchmark results.


Is this OK for stage 1?

Andrew
2012-01-27  Andrew Stubbs  a...@codesourcery.com

	* config/arm/arm-protos.h (arm_emit_coreregs_64bit_shift): New
	prototype.
	* config/arm/arm.c (arm_emit_coreregs_64bit_shift): New function.
	* config/arm/arm.md (ashldi3): Use arm_emit_coreregs_64bit_shift.
	(ashrdi3,lshrdi3): Likewise.

---
 gcc/config/arm/arm-protos.h |3 +
 gcc/config/arm/arm.c|  198 +++
 gcc/config/arm/arm.md   |   90 ++--
 3 files changed, 264 insertions(+), 27 deletions(-)

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 296550a..df8d7a9 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -242,6 +242,9 @@ struct tune_params
 
 extern const struct tune_params *current_tune;
 extern int vfp3_const_double_for_fract_bits (rtx);
+
+extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
+	   rtx);
 #endif /* RTX_CODE */
 
 #endif /* ! GCC_ARM_PROTOS_H */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 0bded8d..eefc45c 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -25139,5 +25139,203 @@ vfp3_const_double_for_fract_bits (rtx operand)
   return 0;
 }
 
+/* The default expansion of general 64-bit shifts in core-regs is suboptimal
+   on ARM, since we know that shifts by negative amounts are no-ops.
+
+   It's safe for the input and output to be the same register, but
+   early-clobber rules apply for the shift amount and scratch registers.
+
+   Shift by register requires both scratch registers.  Shift by a constant
+   less than 32 in Thumb2 mode requires SCRATCH1 only.  In all other cases
+   the scratch