Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
On 16/05/12 17:43, Ramana Radhakrishnan wrote: OK (if no regressions). Cross tested with no regressions, and committed. Thanks Andrew
Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
extern const struct tune_params *current_tune; extern int vfp3_const_double_for_fract_bits (rtx); + +extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx, +rtx); #endif /* RTX_CODE */ #endif /* ! GCC_ARM_PROTOS_H */ diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index c3a19e4..02dc6ca 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -25213,5 +25213,206 @@ vfp3_const_double_for_fract_bits (rtx operand) return 0; } +/* The default expansion of general 64-bit shifts in core-regs is suboptimal + on ARM, since we know that shifts by negative amounts are no-ops. + + It's safe for the input and output to be the same register, but + early-clobber rules apply for the shift amount and scratch registers. + + Shift by register requires both scratch registers. Shift by a constant + less than 32 in Thumb2 mode requires SCRATCH1 only. In all other cases + the scratch registers may be NULL. + + Additionally, ashiftrt by a register also clobbers the CC register. */ +void +arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in, +rtx amount, rtx scratch1, rtx scratch2) +{ + rtx out_high = gen_highpart (SImode, out); + rtx out_low = gen_lowpart (SImode, out); + rtx in_high = gen_highpart (SImode, in); + rtx in_low = gen_lowpart (SImode, in); + + /* Bits flow from up-stream to down-stream. */ Some thing more about upstream and downstream here would be nice :) + rtx out_up = code == ASHIFT ? out_low : out_high; + rtx out_down = code == ASHIFT ? out_high : out_low; + rtx in_up = code == ASHIFT ? in_low : in_high; + rtx in_down = code == ASHIFT ? 
in_high : in_low; + + gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT); + gcc_assert (out +(REG_P (out) || GET_CODE (out) == SUBREG) +GET_MODE (out) == DImode); + gcc_assert (in +(REG_P (in) || GET_CODE (in) == SUBREG) +GET_MODE (in) == DImode); + gcc_assert (amount +(((REG_P (amount) || GET_CODE (amount) == SUBREG) + GET_MODE (amount) == SImode) + || CONST_INT_P (amount))); + gcc_assert (scratch1 == NULL + || (GET_CODE (scratch1) == SCRATCH) + || (GET_MODE (scratch1) == SImode +REG_P (scratch1))); + gcc_assert (scratch2 == NULL + || (GET_CODE (scratch2) == SCRATCH) + || (GET_MODE (scratch2) == SImode +REG_P (scratch2))); + gcc_assert (!REG_P (out) || !REG_P (amount) + || !HARD_REGISTER_P (out) + || (REGNO (out) != REGNO (amount) +REGNO (out) + 1 != REGNO (amount))); + + /* Macros to make following code more readable. */ + #define SUB_32(DEST,SRC) \ + gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32)) + #define RSB_32(DEST,SRC) \ + gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC)) + #define SUB_S_32(DEST,SRC) \ + gen_addsi3_compare0 ((DEST), (SRC), \ + gen_rtx_CONST_INT (VOIDmode, -32)) + #define SET(DEST,SRC) \ + gen_rtx_SET (SImode, (DEST), (SRC)) + #define SHIFT(CODE,SRC,AMOUNT) \ + gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT)) + #define LSHIFT(CODE,SRC,AMOUNT) \ + gen_rtx_fmt_ee ((CODE) == ASHIFT ? ASHIFT : LSHIFTRT, \ + SImode, (SRC), (AMOUNT)) + #define REV_LSHIFT(CODE,SRC,AMOUNT) \ + gen_rtx_fmt_ee ((CODE) == ASHIFT ? LSHIFTRT : ASHIFT, \ + SImode, (SRC), (AMOUNT)) + #define ORR(A,B) \ + gen_rtx_IOR (SImode, (A), (B)) + #define BRANCH(COND,LABEL) \ + gen_arm_cond_branch ((LABEL), \ + gen_rtx_ ## COND (CCmode, cc_reg, \ +const0_rtx), \ + cc_reg) + + if (CONST_INT_P (amount)) +{ + /* Shifts by a constant amount. */ + if (INTVAL (amount) = 0) + /* Match what shift-by-register would do. */ + emit_insn (gen_movdi (out, in)); + else if (INTVAL (amount) = 64) + { + /* Match what shift-by-register would do. 
*/ + if (code == ASHIFTRT) + { + rtx const31_rtx = gen_rtx_CONST_INT (VOIDmode, 31); + emit_insn (SET (out_down, SHIFT (code, in_up, const31_rtx))); + emit_insn (SET (out_up, SHIFT (code, in_up, const31_rtx))); + } + else + emit_insn (gen_movdi (out, const0_rtx)); + } + else if (INTVAL (amount) < 32) + { + /* Shifts by a constant less than 32. */ + rtx reverse_amount = gen_rtx_CONST_INT (VOIDmode, + 32 - INTVAL (amount)); + + emit_insn (SET (out_down, LSHIFT (code, in_down, amount))); + emit_insn (SET
Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
On 16/05/12 11:25, Ramana Radhakrishnan wrote: Ok with those changes. Hi Ramana, Here's an update rebased and modified as requested. Can you please confirm that the comments explain what you wanted to know, and then I will commit it. Thanks Andrew 2012-05-16 Andrew Stubbs a...@codesourcery.com gcc/ * config/arm/arm-protos.h (arm_emit_coreregs_64bit_shift): New prototype. * config/arm/arm.c (arm_emit_coreregs_64bit_shift): New function. * config/arm/arm.md (ashldi3): Use arm_emit_coreregs_64bit_shift. (ashrdi3,lshrdi3): Likewise. (arm_cond_branch): Remove '*' to enable gen_arm_cond_branch. diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index cb74d70..b338470 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -245,6 +245,9 @@ struct tune_params extern const struct tune_params *current_tune; extern int vfp3_const_double_for_fract_bits (rtx); + +extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx, + rtx); #endif /* RTX_CODE */ extern void arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel); diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 2c62c51..3ad4c75 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -25933,4 +25933,256 @@ arm_autoinc_modes_ok_p (enum machine_mode mode, enum arm_auto_incmodes code) return false; } +/* The default expansion of general 64-bit shifts in core-regs is suboptimal, + on ARM, since we know that shifts by negative amounts are no-ops. + Additionally, the default expansion code is not available or suitable + for post-reload insn splits (this can occur when the register allocator + chooses not to do a shift in NEON). + + This function is used in both initial expand and post-reload splits, and + handles all kinds of 64-bit shifts. + + Input requirements: +- It is safe for the input and output to be the same register, but + early-clobber rules apply for the shift amount and scratch registers. 
+- Shift by register requires both scratch registers. Shift by a constant + less than 32 in Thumb2 mode requires SCRATCH1 only. In all other cases + the scratch registers may be NULL. +- Ashiftrt by a register also clobbers the CC register. */ +void +arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in, + rtx amount, rtx scratch1, rtx scratch2) +{ + rtx out_high = gen_highpart (SImode, out); + rtx out_low = gen_lowpart (SImode, out); + rtx in_high = gen_highpart (SImode, in); + rtx in_low = gen_lowpart (SImode, in); + + /* Terminology: + in = the register pair containing the input value. + out = the destination register pair. + up = the high- or low-part of each pair. + down = the opposite part to up. + In a shift, we can consider bits to shift from up-stream to + down-stream, so in a left-shift up is the low-part and down + is the high-part of each register pair. */ + + rtx out_up = code == ASHIFT ? out_low : out_high; + rtx out_down = code == ASHIFT ? out_high : out_low; + rtx in_up = code == ASHIFT ? in_low : in_high; + rtx in_down = code == ASHIFT ? in_high : in_low; + + gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT); + gcc_assert (out + (REG_P (out) || GET_CODE (out) == SUBREG) + GET_MODE (out) == DImode); + gcc_assert (in + (REG_P (in) || GET_CODE (in) == SUBREG) + GET_MODE (in) == DImode); + gcc_assert (amount + (((REG_P (amount) || GET_CODE (amount) == SUBREG) + GET_MODE (amount) == SImode) + || CONST_INT_P (amount))); + gcc_assert (scratch1 == NULL + || (GET_CODE (scratch1) == SCRATCH) + || (GET_MODE (scratch1) == SImode + REG_P (scratch1))); + gcc_assert (scratch2 == NULL + || (GET_CODE (scratch2) == SCRATCH) + || (GET_MODE (scratch2) == SImode + REG_P (scratch2))); + gcc_assert (!REG_P (out) || !REG_P (amount) + || !HARD_REGISTER_P (out) + || (REGNO (out) != REGNO (amount) + REGNO (out) + 1 != REGNO (amount))); + + /* Macros to make following code more readable. 
*/ + #define SUB_32(DEST,SRC) \ + gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32)) + #define RSB_32(DEST,SRC) \ + gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC)) + #define SUB_S_32(DEST,SRC) \ + gen_addsi3_compare0 ((DEST), (SRC), \ + gen_rtx_CONST_INT (VOIDmode, -32)) + #define SET(DEST,SRC) \ + gen_rtx_SET (SImode, (DEST), (SRC)) + #define SHIFT(CODE,SRC,AMOUNT) \ + gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT)) + #define LSHIFT(CODE,SRC,AMOUNT) \ + gen_rtx_fmt_ee ((CODE) == ASHIFT ? ASHIFT : LSHIFTRT, \ + SImode, (SRC), (AMOUNT)) + #define REV_LSHIFT(CODE,SRC,AMOUNT) \ + gen_rtx_fmt_ee ((CODE) == ASHIFT ? LSHIFTRT : ASHIFT, \ + SImode, (SRC), (AMOUNT)) + #define ORR(A,B) \ + gen_rtx_IOR (SImode, (A), (B)) + #define BRANCH(COND,LABEL) \ + gen_arm_cond_branch ((LABEL), \ +
Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
On 16 May 2012 17:09, Andrew Stubbs a...@codesourcery.com wrote: On 16/05/12 11:25, Ramana Radhakrishnan wrote: Ok with those changes. Hi Ramana, Here's an update rebased and modified as requested. Can you please confirm that the comments explain what you wanted to know, and then I will commit it. OK (if no regressions). Ramana
Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
On 16/02/12 15:33, Andrew Stubbs wrote: OK for 4.8? I forgot to address Ramana's comment about optimize_size. This update fixes that and leaves everything else untouched. OK? Andrew 2012-02-17 Andrew Stubbs a...@codesourcery.com gcc/ * config/arm/arm-protos.h (arm_emit_coreregs_64bit_shift): New prototype. * config/arm/arm.c (arm_emit_coreregs_64bit_shift): New function. * config/arm/arm.md (ashldi3): Use arm_emit_coreregs_64bit_shift. (ashrdi3,lshrdi3): Likewise. (arm_cond_branch): Remove '*' to enable gen_arm_cond_branch. --- gcc/config/arm/arm-protos.h |3 + gcc/config/arm/arm.c| 201 +++ gcc/config/arm/arm.md | 104 -- 3 files changed, 280 insertions(+), 28 deletions(-) diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 296550a..df8d7a9 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -242,6 +242,9 @@ struct tune_params extern const struct tune_params *current_tune; extern int vfp3_const_double_for_fract_bits (rtx); + +extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx, + rtx); #endif /* RTX_CODE */ #endif /* ! GCC_ARM_PROTOS_H */ diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index c3a19e4..02dc6ca 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -25213,5 +25213,206 @@ vfp3_const_double_for_fract_bits (rtx operand) return 0; } +/* The default expansion of general 64-bit shifts in core-regs is suboptimal + on ARM, since we know that shifts by negative amounts are no-ops. + + It's safe for the input and output to be the same register, but + early-clobber rules apply for the shift amount and scratch registers. + + Shift by register requires both scratch registers. Shift by a constant + less than 32 in Thumb2 mode requires SCRATCH1 only. In all other cases + the scratch registers may be NULL. + + Additionally, ashiftrt by a register also clobbers the CC register. 
*/ +void +arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in, + rtx amount, rtx scratch1, rtx scratch2) +{ + rtx out_high = gen_highpart (SImode, out); + rtx out_low = gen_lowpart (SImode, out); + rtx in_high = gen_highpart (SImode, in); + rtx in_low = gen_lowpart (SImode, in); + + /* Bits flow from up-stream to down-stream. */ + rtx out_up = code == ASHIFT ? out_low : out_high; + rtx out_down = code == ASHIFT ? out_high : out_low; + rtx in_up = code == ASHIFT ? in_low : in_high; + rtx in_down = code == ASHIFT ? in_high : in_low; + + gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT); + gcc_assert (out + (REG_P (out) || GET_CODE (out) == SUBREG) + GET_MODE (out) == DImode); + gcc_assert (in + (REG_P (in) || GET_CODE (in) == SUBREG) + GET_MODE (in) == DImode); + gcc_assert (amount + (((REG_P (amount) || GET_CODE (amount) == SUBREG) + GET_MODE (amount) == SImode) + || CONST_INT_P (amount))); + gcc_assert (scratch1 == NULL + || (GET_CODE (scratch1) == SCRATCH) + || (GET_MODE (scratch1) == SImode + REG_P (scratch1))); + gcc_assert (scratch2 == NULL + || (GET_CODE (scratch2) == SCRATCH) + || (GET_MODE (scratch2) == SImode + REG_P (scratch2))); + gcc_assert (!REG_P (out) || !REG_P (amount) + || !HARD_REGISTER_P (out) + || (REGNO (out) != REGNO (amount) + REGNO (out) + 1 != REGNO (amount))); + + /* Macros to make following code more readable. */ + #define SUB_32(DEST,SRC) \ + gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32)) + #define RSB_32(DEST,SRC) \ + gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC)) + #define SUB_S_32(DEST,SRC) \ + gen_addsi3_compare0 ((DEST), (SRC), \ + gen_rtx_CONST_INT (VOIDmode, -32)) + #define SET(DEST,SRC) \ + gen_rtx_SET (SImode, (DEST), (SRC)) + #define SHIFT(CODE,SRC,AMOUNT) \ + gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT)) + #define LSHIFT(CODE,SRC,AMOUNT) \ + gen_rtx_fmt_ee ((CODE) == ASHIFT ? 
ASHIFT : LSHIFTRT, \ + SImode, (SRC), (AMOUNT)) + #define REV_LSHIFT(CODE,SRC,AMOUNT) \ + gen_rtx_fmt_ee ((CODE) == ASHIFT ? LSHIFTRT : ASHIFT, \ + SImode, (SRC), (AMOUNT)) + #define ORR(A,B) \ + gen_rtx_IOR (SImode, (A), (B)) + #define BRANCH(COND,LABEL) \ + gen_arm_cond_branch ((LABEL), \ + gen_rtx_ ## COND (CCmode, cc_reg, \ + const0_rtx), \ + cc_reg) + + if (CONST_INT_P (amount)) +{ + /* Shifts by a constant amount. */ + if (INTVAL (amount) <= 0) + /* Match what shift-by-register would do. */ + emit_insn (gen_movdi (out, in)); + else if (INTVAL (amount) >= 64) + { + /* Match what shift-by-register would do. */ + if (code == ASHIFTRT) + { + rtx const31_rtx = gen_rtx_CONST_INT (VOIDmode, 31); + emit_insn (SET (out_down, SHIFT (code, in_up, const31_rtx))); + emit_insn (SET (out_up, SHIFT
Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
On 11/02/12 01:11, Richard Henderson wrote: On 02/08/2012 08:28 AM, Andrew Stubbs wrote: Unfortunately, these don't work in Thumb mode (no IT block), and I'd have to add arith-shift variants, I think, for ARM mode to work. H ... I'll try again. Does it work to simply use branches initially, and rely on post-reload ifcvt to transform to cond_exec when possible? It did work in ARM mode, but did not work in Thumb2 mode. My recent patch has fixed that: http://gcc.gnu.org/ml/gcc-patches/2012-02/msg00734.html With that patch applied, the attached patch has almost the right effect. The only problem now is that the scheduling gives multiple IT blocks, but that's a different problem. Here's the new thumb code for ashiftrt: ldrdr2, [r0] push{r4, r5, r6} mov r4, r1 rsb r5, r1, #32 subsr6, r4, #32 lsr r0, r2, r4 lsl r5, r3, r5 it ge asrge r6, r3, r6 asr r1, r3, r4 orr r0, r0, r5 it ge orrge r0, r0, r6 pop {r4, r5, r6} bx lr OK for 4.8? Andrew 2012-02-16 Andrew Stubbs a...@codesourcery.com gcc/ * config/arm/arm-protos.h (arm_emit_coreregs_64bit_shift): New prototype. * config/arm/arm.c (arm_emit_coreregs_64bit_shift): New function. * config/arm/arm.md (ashldi3): Use arm_emit_coreregs_64bit_shift. (ashrdi3,lshrdi3): Likewise. (arm_cond_branch): Remove '*' to enable gen_arm_cond_branch. --- gcc/config/arm/arm-protos.h |3 + gcc/config/arm/arm.c| 201 +++ gcc/config/arm/arm.md | 104 -- 3 files changed, 280 insertions(+), 28 deletions(-) diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 296550a..df8d7a9 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -242,6 +242,9 @@ struct tune_params extern const struct tune_params *current_tune; extern int vfp3_const_double_for_fract_bits (rtx); + +extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx, + rtx); #endif /* RTX_CODE */ #endif /* ! 
GCC_ARM_PROTOS_H */ diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 0bded8d..f3accc8 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -25139,5 +25139,206 @@ vfp3_const_double_for_fract_bits (rtx operand) return 0; } +/* The default expansion of general 64-bit shifts in core-regs is suboptimal + on ARM, since we know that shifts by negative amounts are no-ops. + + It's safe for the input and output to be the same register, but + early-clobber rules apply for the shift amount and scratch registers. + + Shift by register requires both scratch registers. Shift by a constant + less than 32 in Thumb2 mode requires SCRATCH1 only. In all other cases + the scratch registers may be NULL. + + Additionally, ashiftrt by a register also clobbers the CC register. */ +void +arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in, + rtx amount, rtx scratch1, rtx scratch2) +{ + rtx out_high = gen_highpart (SImode, out); + rtx out_low = gen_lowpart (SImode, out); + rtx in_high = gen_highpart (SImode, in); + rtx in_low = gen_lowpart (SImode, in); + + /* Bits flow from up-stream to down-stream. */ + rtx out_up = code == ASHIFT ? out_low : out_high; + rtx out_down = code == ASHIFT ? out_high : out_low; + rtx in_up = code == ASHIFT ? in_low : in_high; + rtx in_down = code == ASHIFT ? 
in_high : in_low; + + gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT); + gcc_assert (out + (REG_P (out) || GET_CODE (out) == SUBREG) + GET_MODE (out) == DImode); + gcc_assert (in + (REG_P (in) || GET_CODE (in) == SUBREG) + GET_MODE (in) == DImode); + gcc_assert (amount + (((REG_P (amount) || GET_CODE (amount) == SUBREG) + GET_MODE (amount) == SImode) + || CONST_INT_P (amount))); + gcc_assert (scratch1 == NULL + || (GET_CODE (scratch1) == SCRATCH) + || (GET_MODE (scratch1) == SImode + REG_P (scratch1))); + gcc_assert (scratch2 == NULL + || (GET_CODE (scratch2) == SCRATCH) + || (GET_MODE (scratch2) == SImode + REG_P (scratch2))); + gcc_assert (!REG_P (out) || !REG_P (amount) + || !HARD_REGISTER_P (out) + || (REGNO (out) != REGNO (amount) + REGNO (out) + 1 != REGNO (amount))); + + /* Macros to make following code more readable. */ + #define SUB_32(DEST,SRC) \ + gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32)) + #define RSB_32(DEST,SRC) \ + gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC)) + #define SUB_S_32(DEST,SRC) \ + gen_addsi3_compare0 ((DEST), (SRC), \ + gen_rtx_CONST_INT (VOIDmode, -32)) + #define SET(DEST,SRC) \ + gen_rtx_SET (SImode, (DEST), (SRC)) + #define SHIFT(CODE,SRC,AMOUNT) \ + gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT)) + #define
Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
On 02/08/2012 08:28 AM, Andrew Stubbs wrote: Unfortunately, these don't work in Thumb mode (no IT block), and I'd have to add arith-shift variants, I think, for ARM mode to work. H ... I'll try again. Does it work to simply use branches initially, and rely on post-reload ifcvt to transform to cond_exec when possible? r~
Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
On 07/02/12 22:19, Ramana Radhakrishnan wrote: I find it interesting that cond_exec's in this form survive all the way till reload and work. AFAIK we could never have cond_exec's before reload . The documentation doesn't appear to mention this, therefore I would like to see if the cond_exec's can be recast as if_then_else forms from expand rather than these forms. Has this survived bootstrap and testing ? I've tried to do this, but it can't be done by a straight translation because we don't have the insns available to do it. As I understand it, all predicable instructions automatically get a cond_exec variant, but the only if_then_else I could find (it's hard to grep because if_then_else occurs many times in attributes) is in the conditional move instruction. Have I missed something? Anyway, I've tried to redo it using conditional move, and that works, but it's not as efficient. Hopefully Steven Bosscher is correct, and there's no problem with cond_exec on ARM. + /* If we're optimizing for size, we prefer the libgcc calls. */ + if (optimize_size) + FAIL; In addition you want to replace optimize_size with optimize_function_for_size_p in these cases. Ok, I've fixed this. I'll post an update soon. Unfortunately, my testing has found a problem in the native bootstrap, so I'm going to need to track that down first. Andrew
Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
On 02/07/2012 11:33 PM, Steven Bosscher wrote: On Tue, Feb 7, 2012 at 11:19 PM, Ramana Radhakrishnan ramana.radhakrish...@linaro.org wrote: Hi Andrew I find it interesting that cond_exec's in this form survive all the way till reload and work. AFAIK we could never have cond_exec's before reload . There is nothing wrong per-se with cond_execs before reload, as long as you don't have to reload a predicate pseudo-reg. I thought the problem was that we'd have to emit conditional reload insns and inheritance wouldn't work. Bernd
Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
On Wed, Feb 8, 2012 at 1:02 PM, Bernd Schmidt ber...@codesourcery.com wrote: On 02/07/2012 11:33 PM, Steven Bosscher wrote: On Tue, Feb 7, 2012 at 11:19 PM, Ramana Radhakrishnan ramana.radhakrish...@linaro.org wrote: Hi Andrew I find it interesting that cond_exec's in this form survive all the way till reload and work. AFAIK we could never have cond_exec's before reload . There is nothing wrong per-se with cond_execs before reload, as long as you don't have to reload a predicate pseudo-reg. I thought the problem was that we'd have to emit conditional reload insns and inheritance wouldn't work. It probably depends on how DF sees conditional uses / defs. If they look like regular uses / defs then I suppose un-conditional spills/reloads are fine - otherwise of course you'd corrupt one of the two register set states. But that also means it's probably safe if the sequence of conditional insns is of length 1. Not sure we want to open that possible can of worms though ;) Richard. Bernd
Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
On 02/08/2012 01:12 PM, Richard Guenther wrote: On Wed, Feb 8, 2012 at 1:02 PM, Bernd Schmidt ber...@codesourcery.com wrote: On 02/07/2012 11:33 PM, Steven Bosscher wrote: On Tue, Feb 7, 2012 at 11:19 PM, Ramana Radhakrishnan ramana.radhakrish...@linaro.org wrote: Hi Andrew I find it interesting that cond_exec's in this form survive all the way till reload and work. AFAIK we could never have cond_exec's before reload . There is nothing wrong per-se with cond_execs before reload, as long as you don't have to reload a predicate pseudo-reg. I thought the problem was that we'd have to emit conditional reload insns and inheritance wouldn't work. It probably depends on how DF sees conditional uses / defs. If they look like regular uses / defs then I suppose un-conditional spills/reloads are fine - otherwise of course you'd corrupt one of the two register set states. I'm pretty sure conditional defs are always RMW, but I'd have to go look. Can't imagine it working otherwise though. Bernd
Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
On Wed, Feb 8, 2012 at 1:41 PM, Bernd Schmidt ber...@codesourcery.com wrote: On 02/08/2012 01:12 PM, Richard Guenther wrote: On Wed, Feb 8, 2012 at 1:02 PM, Bernd Schmidt ber...@codesourcery.com wrote: On 02/07/2012 11:33 PM, Steven Bosscher wrote: On Tue, Feb 7, 2012 at 11:19 PM, Ramana Radhakrishnan ramana.radhakrish...@linaro.org wrote: Hi Andrew I find it interesting that cond_exec's in this form survive all the way till reload and work. AFAIK we could never have cond_exec's before reload . There is nothing wrong per-se with cond_execs before reload, as long as you don't have to reload a predicate pseudo-reg. I thought the problem was that we'd have to emit conditional reload insns and inheritance wouldn't work. It probably depends on how DF sees conditional uses / defs. If they look like regular uses / defs then I suppose un-conditional spills/reloads are fine - otherwise of course you'd corrupt one of the two register set states. I'm pretty sure conditional defs are always RMW, but I'd have to go look. Can't imagine it working otherwise though. I was thinking about (cond_exec 1 (set (reg:SI 30) (...))) (cond_exec 0 (set (reg:SI 30) (...))) (cond_exec 1 (use (reg:SI 30) (...))) where if we spill/reload 30 with non-cond_exec stmts then we might clobber the shared register set of the conditional execution paths. Similar of course shared stack space if we happen to spill/reload in both paths. If of course defs/uses of both paths conflict and get different hard registers assigned the register issue doesn't exist, still the shared spill stack space issue may. Richard.
Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
On 08/02/12 11:18, Andrew Stubbs wrote: I've tried to do this, but it can't be done by a straight translation because we don't have the insns available to do it. As I understand it, all predicable instructions automatically get a cond_exec variant, but the only if_then_else I could find (it's hard to grep because if_then_else occurs many times in attributes) is in the conditional move instruction. Have I missed something? Ok, I missed the various patterns like if_arith_move. Unfortunately, these don't work in Thumb mode (no IT block), and I'd have to add arith-shift variants, I think, for ARM mode to work. H ... I'll try again. Andrew
Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
Hi Andrew I find it interesting that cond_exec's in this form survive all the way till reload and work. AFAIK we could never have cond_exec's before reload . The documentation doesn't appear to mention this, therefore I would like to see if the cond_exec's can be recast as if_then_else forms from expand rather than these forms. Has this survived bootstrap and testing ? + /* If we're optimizing for size, we prefer the libgcc calls. */ + if (optimize_size) + FAIL; In addition you want to replace optimize_size with optimize_function_for_size_p in these cases. cheers Ramana
Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
On Tue, Feb 7, 2012 at 11:19 PM, Ramana Radhakrishnan ramana.radhakrish...@linaro.org wrote: Hi Andrew I find it interesting that cond_exec's in this form survive all the way till reload and work. AFAIK we could never have cond_exec's before reload . There is nothing wrong per-se with cond_execs before reload, as long as you don't have to reload a predicate pseudo-reg. For ia64, with its (IIRC) 64 predicate registers, it was not practical, or even possible, to assign the predicate registers to hard regs during expand. AFAIU this isn't an issue on ARM because there is just one condition code register. Ciao! Steven
Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
On 30/01/12 15:25, Richard Earnshaw wrote: What's the impact of this on -Os? At present we fall back to the libcalls, but I can't immediately see how the new code would do that. Gut feeling is that shift by a constant is always worth inlining at -Os, but shift by a register isn't. Ah, I hadn't considered that. Good point! This updated patch causes it to fall back to the old behaviour in optimize_size mode. This should do what you want. OK? Andrew 2012-01-31 Andrew Stubbs a...@codesourcery.com * config/arm/arm-protos.h (arm_emit_coreregs_64bit_shift): New prototype. * config/arm/arm.c (arm_emit_coreregs_64bit_shift): New function. * config/arm/arm.md (ashldi3): Use arm_emit_coreregs_64bit_shift. (ashrdi3,lshrdi3): Likewise. --- gcc/config/arm/arm-protos.h |3 + gcc/config/arm/arm.c| 198 +++ gcc/config/arm/arm.md | 102 -- 3 files changed, 276 insertions(+), 27 deletions(-) diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 296550a..df8d7a9 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -242,6 +242,9 @@ struct tune_params extern const struct tune_params *current_tune; extern int vfp3_const_double_for_fract_bits (rtx); + +extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx, + rtx); #endif /* RTX_CODE */ #endif /* ! GCC_ARM_PROTOS_H */ diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 0bded8d..eefc45c 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -25139,5 +25139,203 @@ vfp3_const_double_for_fract_bits (rtx operand) return 0; } +/* The default expansion of general 64-bit shifts in core-regs is suboptimal + on ARM, since we know that shifts by negative amounts are no-ops. + + It's safe for the input and output to be the same register, but + early-clobber rules apply for the shift amount and scratch registers. + + Shift by register requires both scratch registers. Shift by a constant + less than 32 in Thumb2 mode requires SCRATCH1 only. 
In all other cases + the scratch registers may be NULL. + + Additionally, ashiftrt by a register also clobbers the CC register. */ +void +arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in, + rtx amount, rtx scratch1, rtx scratch2) +{ + rtx out_high = gen_highpart (SImode, out); + rtx out_low = gen_lowpart (SImode, out); + rtx in_high = gen_highpart (SImode, in); + rtx in_low = gen_lowpart (SImode, in); + + /* Bits flow from up-stream to down-stream. */ + rtx out_up = code == ASHIFT ? out_low : out_high; + rtx out_down = code == ASHIFT ? out_high : out_low; + rtx in_up = code == ASHIFT ? in_low : in_high; + rtx in_down = code == ASHIFT ? in_high : in_low; + + gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT); + gcc_assert (out + (REG_P (out) || GET_CODE (out) == SUBREG) + GET_MODE (out) == DImode); + gcc_assert (in + (REG_P (in) || GET_CODE (in) == SUBREG) + GET_MODE (in) == DImode); + gcc_assert (amount + (((REG_P (amount) || GET_CODE (amount) == SUBREG) + GET_MODE (amount) == SImode) + || CONST_INT_P (amount))); + gcc_assert (scratch1 == NULL + || (GET_CODE (scratch1) == SCRATCH) + || (GET_MODE (scratch1) == SImode + REG_P (scratch1))); + gcc_assert (scratch2 == NULL + || (GET_CODE (scratch2) == SCRATCH) + || (GET_MODE (scratch2) == SImode + REG_P (scratch2))); + gcc_assert (!REG_P (out) || !REG_P (amount) + || !HARD_REGISTER_P (out) + || (REGNO (out) != REGNO (amount) + REGNO (out) + 1 != REGNO (amount))); + + /* Macros to make following code more readable. 
*/ + #define SUB_32(DEST,SRC) \ + gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32)) + #define RSB_32(DEST,SRC) \ + gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC)) + #define SUB_S_32(DEST,SRC) \ + gen_addsi3_compare0 ((DEST), (SRC), \ + gen_rtx_CONST_INT (VOIDmode, -32)) + #define SET(DEST,SRC) \ + gen_rtx_SET (SImode, (DEST), (SRC)) + #define SHIFT(CODE,SRC,AMOUNT) \ + gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT)) + #define LSHIFT(CODE,SRC,AMOUNT) \ + gen_rtx_fmt_ee ((CODE) == ASHIFT ? ASHIFT : LSHIFTRT, \ + SImode, (SRC), (AMOUNT)) + #define REV_LSHIFT(CODE,SRC,AMOUNT) \ + gen_rtx_fmt_ee ((CODE) == ASHIFT ? LSHIFTRT : ASHIFT, \ + SImode, (SRC), (AMOUNT)) + #define ORR(A,B) \ + gen_rtx_IOR (SImode, (A), (B)) + #define IF(COND,RTX) \ + gen_rtx_COND_EXEC (VOIDmode, \ + gen_rtx_ ## COND (CCmode, cc_reg, \ + const0_rtx), \ + (RTX)) + + if (CONST_INT_P (amount)) +{ + /* Shifts by a constant amount. */ + if (INTVAL (amount) <= 0) + /* Match what shift-by-register would do. */ + emit_insn (gen_movdi (out, in)); + else if (INTVAL (amount) >= 64) + { + /* Match what shift-by-register would do.
Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
On 27/01/12 16:07, Andrew Stubbs wrote: Hi all, This patch introduces a new, more efficient set of DImode shift sequences for values stored in core-registers (as opposed to VFP/NEON registers). The new sequences take advantage of knowledge of what the ARM instructions do with out-of-range shift amounts. The following are examples or a simple test case, like this one: long long f (long long *a, int b) { return *a b; } In ARM mode, old left-shift vs. the new one: stmfd sp!, {r4, r5}| ldrdr2, [r0] rsb r4, r1, #32 | mov ip, r1 ldr r5, [r0, #4] | stmfd sp!, {r4, r5} subsip, r1, #32 | sub r5, ip, #32 ldr r0, [r0, #0] | rsb r4, ip, #32 mov r3, r5, asl r1 | mov r1, r3, asl ip orr r3, r3, r0, lsr r4 | mov r0, r2, asl ip mov r2, r0, asl r1 | orr r1, r1, r2, asl r5 movpl r3, r0, asl ip | orr r1, r1, r2, lsr r4 mov r0, r2 | ldmfd sp!, {r4, r5} mov r1, r3 | bx lr ldmfd sp!, {r4, r5}| bx lr | In Thumb mode, old left-shift vs. new: ldr r2, [r0, #0] | ldrdr2, [r0] ldr r3, [r0, #4] | push{r4, r5, r6} push{r4, r5, r6} | sub r6, r1, #32 rsb r6, r1, #32 | mov r4, r1 sub r4, r1, #32 | rsb r5, r1, #32 lslsr3, r3, r1 | lslsr6, r2, r6 lsrsr6, r2, r6 | lslsr1, r3, r1 lslsr5, r2, r4 | lsrsr5, r2, r5 orrsr3, r3, r6 | lslsr0, r2, r4 lslsr0, r2, r1 | orrsr1, r1, r6 bicsr1, r5, r4, asr #32 | orrsr1, r1, r5 it cs | pop {r4, r5, r6} movcs r1, r3 | bx lr pop {r4, r5, r6} | bx lr | Logical right shift is essentially the same sequence as the left shift above. However, arithmetic right shift requires something slightly different. Here it is in ARM mode, old vs. 
new: stmfd sp!, {r4, r5}| ldrdr2, [r0] rsb r4, r1, #32 | mov ip, r1 ldr r5, [r0, #0] | stmfd sp!, {r4, r5} subsip, r1, #32 | rsb r5, ip, #32 ldr r0, [r0, #4] | subsr4, ip, #32 mov r2, r5, lsr r1 | mov r0, r2, lsr ip orr r2, r2, r0, asl r4 | mov r1, r3, asr ip mov r3, r0, asr r1 | orr r0, r0, r3, asl r5 movpl r2, r0, asr ip | orrge r0, r0, r3, asr r4 mov r1, r3 | ldmfd sp!, {r4, r5} mov r0, r2 | bx lr ldmfd sp!, {r4, r5}| bx lr | I won't bore you with the Thumb mode comparison. The shift-by-constant cases have also been reimplemented, although the resultant sequences are much the same as before. (Doing this isn't strictly necessary just yet, but when I post my next patch to do 64-bit shifts in NEON, this feature will be required by the fall-back alternatives.) I've run a regression test on a cross-compiler, and I should have native test results next week sometime. Also some benchmark results. Is this OK for stage 1? What's the impact of this on -Os? At present we fall back to the libcalls, but I can't immediately see how the new code would do that. Gut feeling is that shift by a constant is always worth inlining at -Os, but shift by a register isn't. R.
[PATCH][ARM] Improve 64-bit shifts (non-NEON)
Hi all, This patch introduces a new, more efficient set of DImode shift sequences for values stored in core-registers (as opposed to VFP/NEON registers). The new sequences take advantage of knowledge of what the ARM instructions do with out-of-range shift amounts. The following are examples or a simple test case, like this one: long long f (long long *a, int b) { return *a b; } In ARM mode, old left-shift vs. the new one: stmfd sp!, {r4, r5}| ldrdr2, [r0] rsb r4, r1, #32 | mov ip, r1 ldr r5, [r0, #4] | stmfd sp!, {r4, r5} subsip, r1, #32 | sub r5, ip, #32 ldr r0, [r0, #0] | rsb r4, ip, #32 mov r3, r5, asl r1 | mov r1, r3, asl ip orr r3, r3, r0, lsr r4 | mov r0, r2, asl ip mov r2, r0, asl r1 | orr r1, r1, r2, asl r5 movpl r3, r0, asl ip | orr r1, r1, r2, lsr r4 mov r0, r2 | ldmfd sp!, {r4, r5} mov r1, r3 | bx lr ldmfd sp!, {r4, r5}| bx lr | In Thumb mode, old left-shift vs. new: ldr r2, [r0, #0] | ldrdr2, [r0] ldr r3, [r0, #4] | push{r4, r5, r6} push{r4, r5, r6} | sub r6, r1, #32 rsb r6, r1, #32 | mov r4, r1 sub r4, r1, #32 | rsb r5, r1, #32 lslsr3, r3, r1 | lslsr6, r2, r6 lsrsr6, r2, r6 | lslsr1, r3, r1 lslsr5, r2, r4 | lsrsr5, r2, r5 orrsr3, r3, r6 | lslsr0, r2, r4 lslsr0, r2, r1 | orrsr1, r1, r6 bicsr1, r5, r4, asr #32 | orrsr1, r1, r5 it cs | pop {r4, r5, r6} movcs r1, r3 | bx lr pop {r4, r5, r6} | bx lr | Logical right shift is essentially the same sequence as the left shift above. However, arithmetic right shift requires something slightly different. Here it is in ARM mode, old vs. new: stmfd sp!, {r4, r5}| ldrdr2, [r0] rsb r4, r1, #32 | mov ip, r1 ldr r5, [r0, #0] | stmfd sp!, {r4, r5} subsip, r1, #32 | rsb r5, ip, #32 ldr r0, [r0, #4] | subsr4, ip, #32 mov r2, r5, lsr r1 | mov r0, r2, lsr ip orr r2, r2, r0, asl r4 | mov r1, r3, asr ip mov r3, r0, asr r1 | orr r0, r0, r3, asl r5 movpl r2, r0, asr ip | orrge r0, r0, r3, asr r4 mov r1, r3 | ldmfd sp!, {r4, r5} mov r0, r2 | bx lr ldmfd sp!, {r4, r5}| bx lr | I won't bore you with the Thumb mode comparison. 
The shift-by-constant cases have also been reimplemented, although the resultant sequences are much the same as before. (Doing this isn't strictly necessary just yet, but when I post my next patch to do 64-bit shifts in NEON, this feature will be required by the fall-back alternatives.) I've run a regression test on a cross-compiler, and I should have native test results next week sometime. Also some benchmark results. Is this OK for stage 1? Andrew 2012-01-27 Andrew Stubbs a...@codesourcery.com * config/arm/arm-protos.h (arm_emit_coreregs_64bit_shift): New prototype. * config/arm/arm.c (arm_emit_coreregs_64bit_shift): New function. * config/arm/arm.md (ashldi3): Use arm_emit_coreregs_64bit_shift. (ashrdi3,lshrdi3): Likewise. --- gcc/config/arm/arm-protos.h |3 + gcc/config/arm/arm.c| 198 +++ gcc/config/arm/arm.md | 90 ++-- 3 files changed, 264 insertions(+), 27 deletions(-) diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 296550a..df8d7a9 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -242,6 +242,9 @@ struct tune_params extern const struct tune_params *current_tune; extern int vfp3_const_double_for_fract_bits (rtx); + +extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx, + rtx); #endif /* RTX_CODE */ #endif /* ! GCC_ARM_PROTOS_H */ diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 0bded8d..eefc45c 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -25139,5 +25139,203 @@ vfp3_const_double_for_fract_bits (rtx operand) return 0; } +/* The default expansion of general 64-bit shifts in core-regs is suboptimal + on ARM, since we know that shifts by negative amounts are no-ops. + + It's safe for the input and output to be the same register, but + early-clobber rules apply for the shift amount and scratch registers. + + Shift by register requires both scratch registers. Shift by a constant + less than 32 in Thumb2 mode requires SCRATCH1 only. 
In all other cases + the scratch