Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

Ramana Radhakrishnan Wed, 16 May 2012 03:26:19 -0700

>
>  extern const struct tune_params *current_tune;
>  extern int vfp3_const_double_for_fract_bits (rtx);
> +
> +extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
> +                                        rtx);
>  #endif /* RTX_CODE */


>  #endif /* ! GCC_ARM_PROTOS_H */
> diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
> index c3a19e4..02dc6ca 100644
> --- a/gcc/config/arm/arm.c
> +++ b/gcc/config/arm/arm.c
> @@ -25213,5 +25213,206 @@ vfp3_const_double_for_fract_bits (rtx operand)
>    return 0;
>  }

> +/* The default expansion of general 64-bit shifts in core-regs is suboptimal
> +   on ARM, since we know that shifts by negative amounts are no-ops.
> +
> +   It's safe for the input and output to be the same register, but
> +   early-clobber rules apply for the shift amount and scratch registers.
> +
> +   Shift by register requires both scratch registers.  Shift by a constant
> +   less than 32 in Thumb2 mode requires SCRATCH1 only.  In all other cases
> +   the scratch registers may be NULL.
> +
> +   Additionally, ashiftrt by a register also clobbers the CC register.  */
> +void
> +arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in,
> +                            rtx amount, rtx scratch1, rtx scratch2)
> +{
> +  rtx out_high = gen_highpart (SImode, out);
> +  rtx out_low = gen_lowpart (SImode, out);
> +  rtx in_high = gen_highpart (SImode, in);
> +  rtx in_low = gen_lowpart (SImode, in);
> +
> +  /* Bits flow from up-stream to down-stream.  */

Something more about "upstream" and "downstream" here would be nice :)

> +  rtx out_up   = code == ASHIFT ? out_low : out_high;
> +  rtx out_down = code == ASHIFT ? out_high : out_low;
> +  rtx in_up   = code == ASHIFT ? in_low : in_high;
> +  rtx in_down = code == ASHIFT ? in_high : in_low;
> +
> +  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
> +  gcc_assert (out
> +           && (REG_P (out) || GET_CODE (out) == SUBREG)


> +           && GET_MODE (out) == DImode);
> +  gcc_assert (in
> +           && (REG_P (in) || GET_CODE (in) == SUBREG)
> +           && GET_MODE (in) == DImode);
> +  gcc_assert (amount
> +           && (((REG_P (amount) || GET_CODE (amount) == SUBREG)
> +                && GET_MODE (amount) == SImode)
> +               || CONST_INT_P (amount)));
> +  gcc_assert (scratch1 == NULL
> +           || (GET_CODE (scratch1) == SCRATCH)
> +           || (GET_MODE (scratch1) == SImode
> +               && REG_P (scratch1)));
> +  gcc_assert (scratch2 == NULL
> +           || (GET_CODE (scratch2) == SCRATCH)
> +           || (GET_MODE (scratch2) == SImode
> +               && REG_P (scratch2)));
> +  gcc_assert (!REG_P (out) || !REG_P (amount)
> +           || !HARD_REGISTER_P (out)
> +           || (REGNO (out) != REGNO (amount)
> +               && REGNO (out) + 1 != REGNO (amount)));
> +
> +  /* Macros to make following code more readable.  */
> +  #define SUB_32(DEST,SRC) \
> +         gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32))
> +  #define RSB_32(DEST,SRC) \
> +         gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC))
> +  #define SUB_S_32(DEST,SRC) \
> +         gen_addsi3_compare0 ((DEST), (SRC), \
> +                              gen_rtx_CONST_INT (VOIDmode, -32))
> +  #define SET(DEST,SRC) \
> +         gen_rtx_SET (SImode, (DEST), (SRC))
> +  #define SHIFT(CODE,SRC,AMOUNT) \
> +         gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT))
> +  #define LSHIFT(CODE,SRC,AMOUNT) \
> +         gen_rtx_fmt_ee ((CODE) == ASHIFT ? ASHIFT : LSHIFTRT, \
> +                         SImode, (SRC), (AMOUNT))
> +  #define REV_LSHIFT(CODE,SRC,AMOUNT) \
> +         gen_rtx_fmt_ee ((CODE) == ASHIFT ? LSHIFTRT : ASHIFT, \
> +                         SImode, (SRC), (AMOUNT))
> +  #define ORR(A,B) \
> +         gen_rtx_IOR (SImode, (A), (B))
> +  #define BRANCH(COND,LABEL) \
> +         gen_arm_cond_branch ((LABEL), \
> +                              gen_rtx_ ## COND (CCmode, cc_reg, \
> +                                                const0_rtx), \
> +                              cc_reg)
> +
> +  if (CONST_INT_P (amount))
> +    {
> +      /* Shifts by a constant amount.  */
> +      if (INTVAL (amount) <= 0)
> +     /* Match what shift-by-register would do.  */
> +     emit_insn (gen_movdi (out, in));
> +      else if (INTVAL (amount) >= 64)
> +     {
> +       /* Match what shift-by-register would do.  */
> +       if (code == ASHIFTRT)
> +         {
> +           rtx const31_rtx = gen_rtx_CONST_INT (VOIDmode, 31);
> +           emit_insn (SET (out_down, SHIFT (code, in_up, const31_rtx)));
> +           emit_insn (SET (out_up, SHIFT (code, in_up, const31_rtx)));
> +         }
> +       else
> +         emit_insn (gen_movdi (out, const0_rtx));
> +     }
> +      else if (INTVAL (amount) < 32)
> +     {
> +       /* Shifts by a constant less than 32.  */
> +       rtx reverse_amount = gen_rtx_CONST_INT (VOIDmode,
> +                                               32 - INTVAL (amount));
> +
> +       emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
> +       emit_insn (SET (out_down,
> +                       ORR (REV_LSHIFT (code, in_up, reverse_amount),
> +                            out_down)));
> +       emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
> +     }
> +      else
> +     {
> +       /* Shifts by a constant greater than 31.  */
> +       rtx adj_amount = gen_rtx_CONST_INT (VOIDmode, INTVAL (amount) - 32);
> +
> +       emit_insn (SET (out_down, SHIFT (code, in_up, adj_amount)));
> +       if (code == ASHIFTRT)
> +         emit_insn (gen_ashrsi3 (out_up, in_up,
> +                                 gen_rtx_CONST_INT (VOIDmode, 31)));
> +       else
> +         emit_insn (SET (out_up, const0_rtx));
> +     }
> +    }
> +  else
> +    {
> +      /* Shifts by a variable amount.  */
> +      rtx cc_reg = gen_rtx_REG (CC_NCVmode, CC_REGNUM);

This isn't something I'm terribly confident about. I think I'd rather
use CC_NOOVmode or CCmode in this case (here you only care whether the
result of subs r0, r1, 32 is positive or zero), so it's possibly OK to
do so. GE with CC_NCVmode really doesn't make sense, as that mode
expects only the N, C and V flags to be set, but GE would require the
Z bit as well if you went for it.

> +      gcc_assert (scratch1 && REG_P (scratch1));
> +      gcc_assert (scratch2 && REG_P (scratch2));
> +
> +      switch (code)
> +     {
> +     case ASHIFT:
> +       emit_insn (SUB_32 (scratch1, amount));
> +       emit_insn (RSB_32 (scratch2, amount));
> +       break;
> +     case ASHIFTRT:
> +       emit_insn (RSB_32 (scratch1, amount));
> +       emit_insn (SUB_S_32 (scratch2, amount));
> +       break;
> +     case LSHIFTRT:
> +       emit_insn (RSB_32 (scratch1, amount));
> +       emit_insn (SUB_32 (scratch2, amount));
> +       break;
> +     default:
> +       gcc_unreachable ();
> +     }
> +
> +      emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
> +
> +      if (!TARGET_THUMB2)
> +     {
> +       /* If this were only called during expand we could just use the else
> +          case and let combine deal with it, but this can also be called
> +          from post-reload splitters.  */
> +       emit_insn (SET (out_down,
> +                       ORR (SHIFT (ASHIFT, in_up, scratch1), out_down)));
> +       if (code == ASHIFTRT)
> +         {
> +           rtx done_label = gen_label_rtx ();
> +           emit_jump_insn (BRANCH (LT, done_label));
> +           emit_insn (SET (out_down, ORR (SHIFT (ASHIFTRT, in_up, scratch2),
> +                                          out_down)));
> +           emit_label (done_label);
> +         }
> +       else
> +         emit_insn (SET (out_down, ORR (SHIFT (LSHIFTRT, in_up, scratch2),
> +                                        out_down)));
> +     }
> +      else
> +     {
> +       /* Thumb2 can't do shift and or in one insn.  */
> +       emit_insn (SET (scratch1, SHIFT (ASHIFT, in_up, scratch1)));
> +       emit_insn (gen_iorsi3 (out_down, out_down, scratch1));
> +
> +       if (code == ASHIFTRT)
> +         {
> +           rtx done_label = gen_label_rtx ();
> +           emit_jump_insn (BRANCH (LT, done_label));
> +           emit_insn (SET (scratch2, SHIFT (ASHIFTRT, in_up, scratch2)));
> +           emit_insn (SET (out_down, ORR (out_down, scratch2)));
> +           emit_label (done_label);
> +         }
> +       else
> +         {
> +           emit_insn (SET (scratch2, SHIFT (LSHIFTRT, in_up, scratch2)));
> +           emit_insn (gen_iorsi3 (out_down, out_down, scratch2));
> +         }
> +     }
> +
> +      emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
> +    }
> +
> +  #undef SUB_32
> +  #undef RSB_32
> +  #undef SUB_S_32
> +  #undef SET
> +  #undef SHIFT
> +  #undef LSHIFT
> +  #undef REV_LSHIFT
> +  #undef ORR
> +  #undef BRANCH
> +}
> +
>  #include "gt-arm.h"

> diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
> index 751997f..7910bae 100644
> --- a/gcc/config/arm/arm.md
> +++ b/gcc/config/arm/arm.md
> @@ -3466,21 +3466,37 @@
>                     (match_operand:SI 2 "reg_or_int_operand" "")))]
>    "TARGET_32BIT"
>    "
> -  if (GET_CODE (operands[2]) == CONST_INT)
> +  if (!CONST_INT_P (operands[2])
> +      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
> +    ; /* No special preparation statements; expand pattern as above.  */
> +  else
>      {
> -      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
> +      rtx scratch1, scratch2;
> +
> +      if (GET_CODE (operands[2]) == CONST_INT
> +       && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
>          {
>            emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
>            DONE;
>          }
> -        /* Ideally we shouldn't fail here if we could know that operands[1]
> -           ends up already living in an iwmmxt register. Otherwise it's
> -           cheaper to have the alternate code being generated than moving
> -           values to iwmmxt regs and back.  */
> -        FAIL;
> +
> +      /* Ideally we should use iwmmxt here if we could know that operands[1]
> +         ends up already living in an iwmmxt register. Otherwise it's
> +         cheaper to have the alternate code being generated than moving
> +         values to iwmmxt regs and back.  */
> +
> +      /* If we're optimizing for size, we prefer the libgcc calls.  */
> +      if (optimize_function_for_size_p (cfun))
> +     FAIL;
> +
> +      /* Expand operation using core-registers.
> +      'FAIL' would achieve the same thing, but this is a bit smarter.  */
> +      scratch1 = gen_reg_rtx (SImode);
> +      scratch2 = gen_reg_rtx (SImode);
> +      arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
> +                                  operands[2], scratch1, scratch2);
> +      DONE;
>      }
> -  else if (!TARGET_REALLY_IWMMXT && !(TARGET_HARD_FLOAT && TARGET_MAVERICK))
> -    FAIL;
>    "
>  )

> @@ -3525,21 +3541,37 @@
>                       (match_operand:SI 2 "reg_or_int_operand" "")))]
>    "TARGET_32BIT"
>    "
> -  if (GET_CODE (operands[2]) == CONST_INT)
> +  if (!CONST_INT_P (operands[2])
> +      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
> +    ; /* No special preparation statements; expand pattern as above.  */
> +  else
>      {
> -      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
> +      rtx scratch1, scratch2;
> +
> +      if (GET_CODE (operands[2]) == CONST_INT
> +       && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
>          {
>            emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1]));
>            DONE;
>          }
> -        /* Ideally we shouldn't fail here if we could know that operands[1]
> -           ends up already living in an iwmmxt register. Otherwise it's
> -           cheaper to have the alternate code being generated than moving
> -           values to iwmmxt regs and back.  */
> -        FAIL;
> +
> +      /* Ideally we should use iwmmxt here if we could know that operands[1]
> +         ends up already living in an iwmmxt register. Otherwise it's
> +         cheaper to have the alternate code being generated than moving
> +         values to iwmmxt regs and back.  */
> +
> +      /* If we're optimizing for size, we prefer the libgcc calls.  */
> +      if (optimize_function_for_size_p (cfun))
> +     FAIL;
> +
> +      /* Expand operation using core-registers.
> +      'FAIL' would achieve the same thing, but this is a bit smarter.  */
> +      scratch1 = gen_reg_rtx (SImode);
> +      scratch2 = gen_reg_rtx (SImode);
> +      arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
> +                                  operands[2], scratch1, scratch2);
> +      DONE;
>      }
> -  else if (!TARGET_REALLY_IWMMXT)
> -    FAIL;
>    "
>  )

> @@ -3582,21 +3614,37 @@
>                       (match_operand:SI 2 "reg_or_int_operand" "")))]
>    "TARGET_32BIT"
>    "
> -  if (GET_CODE (operands[2]) == CONST_INT)
> +  if (!CONST_INT_P (operands[2])
> +      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
> +    ; /* No special preparation statements; expand pattern as above.  */
> +  else
>      {
> -      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
> +      rtx scratch1, scratch2;
> +
> +      if (GET_CODE (operands[2]) == CONST_INT

Use CONST_INT_P (operands[2]) instead.

Ok with those changes.

regards
Ramana

Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)

Reply via email to