Wilco Dijkstra <wilco.dijks...@arm.com> writes:
> A MOPS memmove may corrupt registers since there is no copy of the
> input operands to temporary registers.  Fix this by calling
> aarch64_expand_cpymem, which does this.  Also fix an issue with
> STRICT_ALIGNMENT being ignored if TARGET_MOPS is true, and avoid
> crashing or generating a huge expansion if
> aarch64_mops_memcpy_size_threshold is large.
>
> Passes regress/bootstrap, OK for commit?
>
> gcc/ChangeLog:
>         PR target/111121
>         * config/aarch64/aarch64.md (cpymemdi): Remove STRICT_ALIGNMENT,
>         add param for memmove.
>         (aarch64_movmemdi): Add new expander similar to aarch64_cpymemdi.
>         (movmemdi): Like cpymemdi, call aarch64_expand_cpymem for correct
>         expansion.
>         * config/aarch64/aarch64.cc (aarch64_expand_cpymem_mops): Add
>         support for memmove.
>         (aarch64_expand_cpymem): Add support for memmove.  Handle
>         STRICT_ALIGNMENT correctly.  Handle TARGET_MOPS size selection
>         correctly.
>         * config/aarch64/aarch64-protos.h (aarch64_expand_cpymem): Update
>         prototype.
>
> gcc/testsuite/ChangeLog:
>         PR target/111121
>         * gcc.target/aarch64/mops_4.c: Add memmove testcases.
>
> ---
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 70303d6fd953e0c397b9138ede8858c2db2e53db..97375e81cbda078847af83bf5dd4e0d7673d6af4 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -765,7 +765,7 @@ bool aarch64_emit_approx_div (rtx, rtx, rtx);
>  bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
>  tree aarch64_vector_load_decl (tree);
>  void aarch64_expand_call (rtx, rtx, rtx, bool);
> -bool aarch64_expand_cpymem (rtx *);
> +bool aarch64_expand_cpymem (rtx *, bool);
>  bool aarch64_expand_setmem (rtx *);
>  bool aarch64_float_const_zero_rtx_p (rtx);
>  bool aarch64_float_const_rtx_p (rtx);
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index eba5d4a7e04b7af82437453a691d5607d98133c9..5e8d0a0c91bc7719de2a8c5627b354cf905a4db0 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25135,10 +25135,11 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
>    *dst = aarch64_progress_pointer (*dst);
>  }
>
> -/* Expand a cpymem using the MOPS extension.  OPERANDS are taken
> -   from the cpymem pattern.  Return true iff we succeeded.  */
> +/* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
> +   from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
> +   rather than memcpy.  Return true iff we succeeded.  */
>  static bool
> -aarch64_expand_cpymem_mops (rtx *operands)
> +aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
>  {
>    if (!TARGET_MOPS)
>      return false;
> @@ -25150,17 +25151,19 @@ aarch64_expand_cpymem_mops (rtx *operands)
>    rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
>    rtx src_mem = replace_equiv_address (operands[1], src_addr);
>    rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
> -  emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
> -
> +  if (is_memmove)
> +    emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
> +  else
> +    emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
>    return true;
>  }
>
> -/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
> -   we succeed, otherwise return false, indicating that a libcall to
> -   memcpy should be emitted.  */
> -
> +/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
> +   OPERANDS are taken from the cpymem/movmem pattern.  IS_MEMMOVE is true
> +   if this is a memmove rather than memcpy.  Return true if we succeed,
> +   otherwise return false, indicating that a libcall should be emitted.  */
>  bool
> -aarch64_expand_cpymem (rtx *operands)
> +aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>  {
>    int mode_bits;
>    rtx dst = operands[0];
> @@ -25168,25 +25171,23 @@ aarch64_expand_cpymem (rtx *operands)
>    rtx base;
>    machine_mode cur_mode = BLKmode;
>
> -  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
> -  if (!CONST_INT_P (operands[2]))
> -    return aarch64_expand_cpymem_mops (operands);
> +  /* Variable-sized or strict align copies may use the MOPS expansion.  */
> +  if (!CONST_INT_P (operands[2]) || STRICT_ALIGNMENT)
> +    return aarch64_expand_cpymem_mops (operands, is_memmove);
>
>    unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
>
> -  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
> -  unsigned HOST_WIDE_INT max_copy_size
> -    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
> +  /* Set inline limits for memmove/memcpy.  MOPS has a separate threshold.  */
> +  unsigned HOST_WIDE_INT max_copy_size = is_memmove ? 0 : 256;
> +  unsigned HOST_WIDE_INT max_mops_size = max_copy_size;
>
> -  bool size_p = optimize_function_for_size_p (cfun);
> +  if (TARGET_MOPS)
> +    max_mops_size = is_memmove ? aarch64_mops_memmove_size_threshold
> +                              : aarch64_mops_memcpy_size_threshold;
>
> -  /* Large constant-sized cpymem should go through MOPS when possible.
> -     It should be a win even for size optimization in the general case.
> -     For speed optimization the choice between MOPS and the SIMD sequence
> -     depends on the size of the copy, rather than number of instructions,
> -     alignment etc.  */
> -  if (size > max_copy_size)
> -    return aarch64_expand_cpymem_mops (operands);
> +  /* Large copies use library call or MOPS when available.  */
> +  if (size > max_copy_size || size > max_mops_size)
> +    return aarch64_expand_cpymem_mops (operands, is_memmove);

Could you explain this a bit more?  If I've followed the logic correctly,
max_copy_size will always be 0 for movmem, so this "if" condition will
always be true for movmem (given that the caller can be relied on to
optimise away zero-length copies).  So doesn't this function reduce to:

  aarch64_expand_cpymem_mops (operands, true);

when is_memmove is true?  If so, I think it would be clearer to do that
directly, rather than go through aarch64_expand_cpymem.  max_copy_size
is really an optimisation threshold, whereas the above seems to be
leaning on it for correctness.
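
For illustration only (a rough, untested sketch), the movmemdi expander
could make that explicit along the lines of:

  if (aarch64_expand_cpymem_mops (operands, /*is_memmove=*/true))
    DONE;
  FAIL;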

If we make that change, then...

>    int copy_bits = 256;
>
> @@ -25254,6 +25255,8 @@ aarch64_expand_cpymem (rtx *operands)
>       the constant size into a register.  */
>    unsigned mops_cost = 3 + 1;
>
> +  bool size_p = optimize_function_for_size_p (cfun);
> +
>    /* If MOPS is available at this point we don't consider the libcall as it's
>       not a win even on code size.  At this point only consider MOPS if
>       optimizing for size.  For speed optimizations we will have chosen between
> @@ -25261,7 +25264,7 @@ aarch64_expand_cpymem (rtx *operands)
>    if (TARGET_MOPS)
>      {
>        if (size_p && mops_cost < nops)
> -       return aarch64_expand_cpymem_mops (operands);
> +       return aarch64_expand_cpymem_mops (operands, is_memmove);
>        emit_insn (seq);
>        return true;
>      }
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 01cf989641fce8e6c3828f6cfef62e101c4142df..97f70d39cc0ddeb330e044bae0544d85a695567d 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1609,15 +1609,30 @@ (define_expand "cpymemdi"
>     (match_operand:BLK 1 "memory_operand")
>     (match_operand:DI 2 "general_operand")
>     (match_operand:DI 3 "immediate_operand")]
> -   "!STRICT_ALIGNMENT || TARGET_MOPS"
> +   ""
>  {
> -  if (aarch64_expand_cpymem (operands))
> +  if (aarch64_expand_cpymem (operands, false))
>      DONE;
>    FAIL;
>  }
>  )
>
> -(define_insn "aarch64_movmemdi"
> +(define_expand "aarch64_movmemdi"
> +  [(parallel
> +     [(set (match_operand 2) (const_int 0))
> +      (clobber (match_dup 3))
> +      (clobber (match_dup 4))
> +      (clobber (reg:CC CC_REGNUM))
> +      (set (match_operand 0)
> +          (unspec:BLK [(match_operand 1) (match_dup 2)] UNSPEC_MOVMEM))])]
> +  "TARGET_MOPS"
> +  {
> +    operands[3] = XEXP (operands[0], 0);
> +    operands[4] = XEXP (operands[1], 0);
> +  }
> +)
> +
> +(define_insn "*aarch64_movmemdi"
>    [(parallel [
>     (set (match_operand:DI 2 "register_operand" "+&r") (const_int 0))
>     (clobber (match_operand:DI 0 "register_operand" "+&r"))
> @@ -1640,27 +1655,11 @@ (define_expand "movmemdi"
>     (match_operand:BLK 1 "memory_operand")
>     (match_operand:DI 2 "general_operand")
>     (match_operand:DI 3 "immediate_operand")]
> -   "TARGET_MOPS"
> +   ""
>  {
> -   rtx sz_reg = operands[2];
> -   /* For constant-sized memmoves check the threshold.
> -      FIXME: We should add a non-MOPS memmove expansion for smaller,
> -      constant-sized memmove to avoid going to a libcall.  */
> -   if (CONST_INT_P (sz_reg)
> -       && INTVAL (sz_reg) < aarch64_mops_memmove_size_threshold)
> -     FAIL;
> -
> -   rtx addr_dst = XEXP (operands[0], 0);
> -   rtx addr_src = XEXP (operands[1], 0);
> -
> -   if (!REG_P (sz_reg))
> -     sz_reg = force_reg (DImode, sz_reg);
> -   if (!REG_P (addr_dst))
> -     addr_dst = force_reg (DImode, addr_dst);
> -   if (!REG_P (addr_src))
> -     addr_src = force_reg (DImode, addr_src);
> -   emit_insn (gen_aarch64_movmemdi (addr_dst, addr_src, sz_reg));
> -   DONE;
> +  if (aarch64_expand_cpymem (operands, true))
> +    DONE;
> +  FAIL;
>  }

...I think we might as well keep this pattern conditional on TARGET_MOPS.

I think we can then also split:

  /* All three registers are changed by the instruction, so each one
     must be a fresh pseudo.  */
  rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
  rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
  rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
  rtx src_mem = replace_equiv_address (operands[1], src_addr);
  rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);

out of aarch64_expand_cpymem_mops into a new function (say
aarch64_prepare_mops_operands) and call it from the movmemdi
expander.  There should then be no need for the extra staging
expander (aarch64_movmemdi).
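
Something like this, as a rough sketch (the helper name and its exact
interface are just illustrative, and it assumes the MOPS patterns keep
the mem/mem/size-register interface from the patch):

  /* Prepare the operands of a MOPS cpymem/movmem pattern: all three
     registers are changed by the instruction, so each one must be a
     fresh pseudo.  (Would also need a declaration in aarch64-protos.h
     so that the expander can call it.)  */
  void
  aarch64_prepare_mops_operands (rtx *operands, rtx *dst_mem,
                                 rtx *src_mem, rtx *sz_reg)
  {
    rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
    rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
    *dst_mem = replace_equiv_address (operands[0], dst_addr);
    *src_mem = replace_equiv_address (operands[1], src_addr);
    *sz_reg = copy_to_mode_reg (DImode, operands[2]);
  }

with the movmemdi expander then doing something like:

  rtx dst_mem, src_mem, sz_reg;
  aarch64_prepare_mops_operands (operands, &dst_mem, &src_mem, &sz_reg);
  emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
  DONE;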

IMO the STRICT_ALIGNMENT stuff should be a separate patch,
with its own testcases.

Thanks,
Richard


>  )
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/mops_4.c b/gcc/testsuite/gcc.target/aarch64/mops_4.c
> index 1b87759cb5e8bbcbb58cf63404d1d579d44b2818..dd796115cb4093251964d881e93bf4b98ade0c32 100644
> --- a/gcc/testsuite/gcc.target/aarch64/mops_4.c
> +++ b/gcc/testsuite/gcc.target/aarch64/mops_4.c
> @@ -50,6 +50,54 @@ copy3 (int *x, int *y, long z, long *res)
>    *res = z;
>  }
>
> +/*
> +** move1:
> +**     mov     (x[0-9]+), x0
> +**     cpyp    \[\1\]!, \[x1\]!, x2!
> +**     cpym    \[\1\]!, \[x1\]!, x2!
> +**     cpye    \[\1\]!, \[x1\]!, x2!
> +**     str     x0, \[x3\]
> +**     ret
> +*/
> +void
> +move1 (int *x, int *y, long z, int **res)
> +{
> +  __builtin_memmove (x, y, z);
> +  *res = x;
> +}
> +
> +/*
> +** move2:
> +**     mov     (x[0-9]+), x1
> +**     cpyp    \[x0\]!, \[\1\]!, x2!
> +**     cpym    \[x0\]!, \[\1\]!, x2!
> +**     cpye    \[x0\]!, \[\1\]!, x2!
> +**     str     x1, \[x3\]
> +**     ret
> +*/
> +void
> +move2 (int *x, int *y, long z, int **res)
> +{
> +  __builtin_memmove (x, y, z);
> +  *res = y;
> +}
> +
> +/*
> +** move3:
> +**     mov     (x[0-9]+), x2
> +**     cpyp    \[x0\]!, \[x1\]!, \1!
> +**     cpym    \[x0\]!, \[x1\]!, \1!
> +**     cpye    \[x0\]!, \[x1\]!, \1!
> +**     str     x2, \[x3\]
> +**     ret
> +*/
> +void
> +move3 (int *x, int *y, long z, long *res)
> +{
> +  __builtin_memmove (x, y, z);
> +  *res = z;
> +}
> +
>  /*
>  ** set1:
>  **     mov     (x[0-9]+), x0
