On 14/09/12 19:02, Ulrich Weigand wrote:
> Hello,
> 
> this patch changes the ARM back-end to use vld1.64/vst1.64 instructions
> instead of vldm/vstm  -where possible-  to implement double-word moves.
> 
> The main benefit of this is that it allows the compiler to provide
> appropriate alignment hints, which may improve performance.
> 
> The patch is based on an earlier version by Ramana.  This version has
> now successfully passed regression testing and benchmarking (no
> performance regressions found, improvements of up to 2.5% on certain
> benchmarks).
> 
> Tested on arm-linux-gnueabi.
> OK for mainline?
> 
> Bye,
> Ulrich
> 
> 
> 2012-09-14  Ramana Radhakrishnan  <ramana.radhakrish...@linaro.org>
>           Ulrich Weigand  <ulrich.weig...@linaro.org>
> 
>       * config/arm/arm.c (output_move_neon): Update comment.
>       Use vld1.64/vst1.64 instead of vldm/vstm where possible.
>       (neon_vector_mem_operand): Support double-word modes.
>       * config/arm/neon.md (*neon_mov VD): Call output_move_neon
>       instead of output_move_vfp.  Change constraint from Uv to Un.
> 

You should use HARD_REGNO_NREGS rather than ARM_NUM_REGS (which is
really intended only for core registers).

OK with that change.

R.
> Index: gcc-head/gcc/config/arm/arm.c
> ===================================================================
> --- gcc-head.orig/gcc/config/arm/arm.c        2012-09-14 19:38:20.000000000 
> +0200
> +++ gcc-head/gcc/config/arm/arm.c     2012-09-14 19:40:51.000000000 +0200
> @@ -9629,7 +9629,11 @@ neon_vector_mem_operand (rtx op, int typ
>        && REG_MODE_OK_FOR_BASE_P (XEXP (ind, 0), VOIDmode)
>        && CONST_INT_P (XEXP (ind, 1))
>        && INTVAL (XEXP (ind, 1)) > -1024
> -      && INTVAL (XEXP (ind, 1)) < 1016
> +      /* For quad modes, we restrict the constant offset to be slightly less
> +      than what the instruction format permits.  We have no such constraint
> +      on double mode offsets.  (This must match arm_legitimate_index_p.)  */
> +      && (INTVAL (XEXP (ind, 1))
> +       < (VALID_NEON_QREG_MODE (GET_MODE (op))? 1016 : 1024))
>        && (INTVAL (XEXP (ind, 1)) & 3) == 0)
>      return TRUE;
>  
> @@ -14573,15 +14577,16 @@ output_move_vfp (rtx *operands)
>    return "";
>  }
>  
> -/* Output a Neon quad-word load or store, or a load or store for
> -   larger structure modes.
> +/* Output a Neon double-word or quad-word load or store, or a load
> +   or store for larger structure modes.
>  
>     WARNING: The ordering of elements is weird in big-endian mode,
> -   because we use VSTM, as required by the EABI.  GCC RTL defines
> -   element ordering based on in-memory order.  This can be differ
> -   from the architectural ordering of elements within a NEON register.
> -   The intrinsics defined in arm_neon.h use the NEON register element
> -   ordering, not the GCC RTL element ordering.
> +   because the EABI requires that vectors stored in memory appear
> +   as though they were stored by a VSTM, as required by the EABI.
> +   GCC RTL defines element ordering based on in-memory order.
> +   This can be different from the architectural ordering of elements
> +   within a NEON register. The intrinsics defined in arm_neon.h use the
> +   NEON register element ordering, not the GCC RTL element ordering.
>  
>     For example, the in-memory ordering of a big-endian a quadword
>     vector with 16-bit elements when stored from register pair {d0,d1}
> @@ -14595,7 +14600,22 @@ output_move_vfp (rtx *operands)
>       dN -> (rN+1, rN), dN+1 -> (rN+3, rN+2)
>  
>     So that STM/LDM can be used on vectors in ARM registers, and the
> -   same memory layout will result as if VSTM/VLDM were used.  */
> +   same memory layout will result as if VSTM/VLDM were used.
> +
> +   Instead of VSTM/VLDM we prefer to use VST1.64/VLD1.64 where
> +   possible, which allows use of appropriate alignment tags.
> +   Note that the choice of "64" is independent of the actual vector
> +   element size; this size simply ensures that the behavior is
> +   equivalent to VSTM/VLDM in both little-endian and big-endian mode.
> +
> +   Due to limitations of those instructions, use of VST1.64/VLD1.64
> +   is not possible if:
> +    - the address contains PRE_DEC, or
> +    - the mode refers to more than 4 double-word registers
> +
> +   In those cases, it would be possible to replace VSTM/VLDM by a
> +   sequence of instructions; this is not currently implemented since
> +   this is not certain to actually improve performance.  */
>  
>  const char *
>  output_move_neon (rtx *operands)
> @@ -14629,13 +14649,23 @@ output_move_neon (rtx *operands)
>    switch (GET_CODE (addr))
>      {
>      case POST_INC:
> -      templ = "v%smia%%?\t%%0!, %%h1";
> -      ops[0] = XEXP (addr, 0);
> +      /* We have to use vldm / vstm for too-large modes.  */
> +      if (ARM_NUM_REGS (mode) / 2 > 4)
> +     {
> +       templ = "v%smia%%?\t%%0!, %%h1";
> +       ops[0] = XEXP (addr, 0);
> +     }
> +      else
> +     {
> +       templ = "v%s1.64\t%%h1, %%A0";
> +       ops[0] = mem;
> +     }
>        ops[1] = reg;
>        break;
>  
>      case PRE_DEC:
> -      /* FIXME: We should be using vld1/vst1 here in BE mode?  */
> +      /* We have to use vldm / vstm in this case, since there is no
> +      pre-decrement form of the vld1 / vst1 instructions.  */
>        templ = "v%smdb%%?\t%%0!, %%h1";
>        ops[0] = XEXP (addr, 0);
>        ops[1] = reg;
> @@ -14679,7 +14709,12 @@ output_move_neon (rtx *operands)
>        }
>  
>      default:
> -      templ = "v%smia%%?\t%%m0, %%h1";
> +      /* We have to use vldm / vstm for too-large modes.  */
> +      if (ARM_NUM_REGS (mode) / 2 > 4)
> +     templ = "v%smia%%?\t%%m0, %%h1";
> +      else
> +     templ = "v%s1.64\t%%h1, %%A0";
> +
>        ops[0] = mem;
>        ops[1] = reg;
>      }
> Index: gcc-head/gcc/config/arm/neon.md
> ===================================================================
> --- gcc-head.orig/gcc/config/arm/neon.md      2012-09-14 19:38:20.000000000 
> +0200
> +++ gcc-head/gcc/config/arm/neon.md   2012-09-14 19:40:51.000000000 +0200
> @@ -156,9 +156,9 @@
>  
>  (define_insn "*neon_mov<mode>"
>    [(set (match_operand:VDX 0 "nonimmediate_operand"
> -       "=w,Uv,w, w,  ?r,?w,?r,?r, ?Us")
> +       "=w,Un,w, w,  ?r,?w,?r,?r, ?Us")
>       (match_operand:VDX 1 "general_operand"
> -       " w,w, Dn,Uvi, w, r, r, Usi,r"))]
> +       " w,w, Dn,Uni, w, r, r, Usi,r"))]
>    "TARGET_NEON
>     && (register_operand (operands[0], <MODE>mode)
>         || register_operand (operands[1], <MODE>mode))"
> @@ -181,15 +181,10 @@
>        return templ;
>      }
>  
> -  /* FIXME: If the memory layout is changed in big-endian mode, 
> output_move_vfp
> -     below must be changed to output_move_neon (which will use the
> -     element/structure loads/stores), and the constraint changed to 'Um' 
> instead
> -     of 'Uv'.  */
> -
>    switch (which_alternative)
>      {
>      case 0: return "vmov\t%P0, %P1  @ <mode>";
> -    case 1: case 3: return output_move_vfp (operands);
> +    case 1: case 3: return output_move_neon (operands);
>      case 2: gcc_unreachable ();
>      case 4: return "vmov\t%Q0, %R0, %P1  @ <mode>";
>      case 5: return "vmov\t%P0, %Q1, %R1  @ <mode>";
> 




Reply via email to