Wilco Dijkstra <wilco.dijks...@arm.com> writes:
> A MOPS memmove may corrupt registers since there is no copy of the
> input operands to temporary registers.  Fix this by calling
> aarch64_expand_cpymem, which does this.  Also fix an issue with
> STRICT_ALIGNMENT being ignored if TARGET_MOPS is true, and avoid
> crashing or generating a huge expansion if
> aarch64_mops_memcpy_size_threshold is large.
>
> Passes regress/bootstrap, OK for commit?
>
> gcc/ChangeLog/
>         PR target/111121
>         * config/aarch64/aarch64.md (cpymemdi): Remove STRICT_ALIGNMENT,
>         add param for memmove.
>         (aarch64_movmemdi): Add new expander similar to aarch64_cpymemdi.
>         (movmemdi): Like cpymemdi, call aarch64_expand_cpymem for correct
>         expansion.
>         * config/aarch64/aarch64.cc (aarch64_expand_cpymem_mops): Add
>         support for memmove.
>         (aarch64_expand_cpymem): Add support for memmove.  Handle
>         STRICT_ALIGNMENT correctly.  Handle TARGET_MOPS size selection
>         correctly.
>         * config/aarch64/aarch64-protos.h (aarch64_expand_cpymem): Update
>         prototype.
>
> gcc/testsuite/ChangeLog/
>         PR target/111121
>         * gcc.target/aarch64/mops_4.c: Add memmove testcases.
>
> ---
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 70303d6fd953e0c397b9138ede8858c2db2e53db..97375e81cbda078847af83bf5dd4e0d7673d6af4 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -765,7 +765,7 @@ bool aarch64_emit_approx_div (rtx, rtx, rtx);
>  bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
>  tree aarch64_vector_load_decl (tree);
>  void aarch64_expand_call (rtx, rtx, rtx, bool);
> -bool aarch64_expand_cpymem (rtx *);
> +bool aarch64_expand_cpymem (rtx *, bool);
>  bool aarch64_expand_setmem (rtx *);
>  bool aarch64_float_const_zero_rtx_p (rtx);
>  bool aarch64_float_const_rtx_p (rtx);
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index eba5d4a7e04b7af82437453a691d5607d98133c9..5e8d0a0c91bc7719de2a8c5627b354cf905a4db0 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25135,10 +25135,11 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
>    *dst = aarch64_progress_pointer (*dst);
>  }
>
> -/* Expand a cpymem using the MOPS extension.  OPERANDS are taken
> -   from the cpymem pattern.  Return true iff we succeeded.  */
> +/* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
> +   from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
> +   rather than memcpy.  Return true iff we succeeded.  */
>  static bool
> -aarch64_expand_cpymem_mops (rtx *operands)
> +aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
>  {
>    if (!TARGET_MOPS)
>      return false;
> @@ -25150,17 +25151,19 @@ aarch64_expand_cpymem_mops (rtx *operands)
>    rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
>    rtx src_mem = replace_equiv_address (operands[1], src_addr);
>    rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
> -  emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
> -
> +  if (is_memmove)
> +    emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
> +  else
> +    emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
>    return true;
>  }
>
> -/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
> -   we succeed, otherwise return false, indicating that a libcall to
> -   memcpy should be emitted.  */
> -
> +/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
> +   OPERANDS are taken from the cpymem/movmem pattern.  IS_MEMMOVE is true
> +   if this is a memmove rather than memcpy.  Return true if we succeed,
> +   otherwise return false, indicating that a libcall should be emitted.  */
>  bool
> -aarch64_expand_cpymem (rtx *operands)
> +aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>  {
>    int mode_bits;
>    rtx dst = operands[0];
> @@ -25168,25 +25171,23 @@ aarch64_expand_cpymem (rtx *operands)
>    rtx base;
>    machine_mode cur_mode = BLKmode;
>
> -  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
> -  if (!CONST_INT_P (operands[2]))
> -    return aarch64_expand_cpymem_mops (operands);
> +  /* Variable-sized or strict align copies may use the MOPS expansion.  */
> +  if (!CONST_INT_P (operands[2]) || STRICT_ALIGNMENT)
> +    return aarch64_expand_cpymem_mops (operands, is_memmove);
>
>    unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
>
> -  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
> -  unsigned HOST_WIDE_INT max_copy_size
> -    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
> +  /* Set inline limits for memmove/memcpy.  MOPS has a separate threshold.  */
> +  unsigned HOST_WIDE_INT max_copy_size = is_memmove ? 0 : 256;
> +  unsigned HOST_WIDE_INT max_mops_size = max_copy_size;
>
> -  bool size_p = optimize_function_for_size_p (cfun);
> +  if (TARGET_MOPS)
> +    max_mops_size = is_memmove ? aarch64_mops_memmove_size_threshold
> +                               : aarch64_mops_memcpy_size_threshold;
>
> -  /* Large constant-sized cpymem should go through MOPS when possible.
> -     It should be a win even for size optimization in the general case.
> -     For speed optimization the choice between MOPS and the SIMD sequence
> -     depends on the size of the copy, rather than number of instructions,
> -     alignment etc.  */
> -  if (size > max_copy_size)
> -    return aarch64_expand_cpymem_mops (operands);
> +  /* Large copies use library call or MOPS when available.  */
> +  if (size > max_copy_size || size > max_mops_size)
> +    return aarch64_expand_cpymem_mops (operands, is_memmove);
Could you explain this a bit more?  If I've followed the logic correctly,
max_copy_size will always be 0 for movmem, so this "if" condition will
always be true for movmem (given that the caller can be relied on to
optimise away zero-length copies).  So doesn't this function reduce to:

  aarch64_expand_cpymem_mops (operands, true);

when is_memmove is true?  If so, I think it would be clearer to do that
directly, rather than go through aarch64_expand_cpymem.  max_copy_size is
really an optimisation threshold, whereas the above seems to be leaning on
it for correctness.
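For illustration, under that change the movmemdi expander body would be
something like the below.  This is an untested sketch, and it assumes
aarch64_expand_cpymem_mops is made non-static and declared in
aarch64-protos.h so that the expander can call it:

  (define_expand "movmemdi"
    [...]
    "TARGET_MOPS"
  {
    /* MOPS handles variable and constant sizes and any alignment,
       so there is nothing to check before emitting the copy.  */
    if (aarch64_expand_cpymem_mops (operands, true))
      DONE;
    FAIL;
  }
  )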
If we make that change, then...

>    int copy_bits = 256;
>
> @@ -25254,6 +25255,8 @@ aarch64_expand_cpymem (rtx *operands)
>       the constant size into a register.  */
>    unsigned mops_cost = 3 + 1;
>
> +  bool size_p = optimize_function_for_size_p (cfun);
> +
>    /* If MOPS is available at this point we don't consider the libcall as it's
>       not a win even on code size.  At this point only consider MOPS if
>       optimizing for size.  For speed optimizations we will have chosen between
>       the two based on copy size already.  */
>    if (TARGET_MOPS)
>      {
>        if (size_p && mops_cost < nops)
> -        return aarch64_expand_cpymem_mops (operands);
> +        return aarch64_expand_cpymem_mops (operands, is_memmove);
>        emit_insn (seq);
>        return true;
>      }
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 01cf989641fce8e6c3828f6cfef62e101c4142df..97f70d39cc0ddeb330e044bae0544d85a695567d 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1609,15 +1609,30 @@ (define_expand "cpymemdi"
>     (match_operand:BLK 1 "memory_operand")
>     (match_operand:DI 2 "general_operand")
>     (match_operand:DI 3 "immediate_operand")]
> -  "!STRICT_ALIGNMENT || TARGET_MOPS"
> +  ""
>  {
> -  if (aarch64_expand_cpymem (operands))
> +  if (aarch64_expand_cpymem (operands, false))
>      DONE;
>    FAIL;
>  }
> )
>
> -(define_insn "aarch64_movmemdi"
> +(define_expand "aarch64_movmemdi"
> +  [(parallel
> +     [(set (match_operand 2) (const_int 0))
> +      (clobber (match_dup 3))
> +      (clobber (match_dup 4))
> +      (clobber (reg:CC CC_REGNUM))
> +      (set (match_operand 0)
> +           (unspec:BLK [(match_operand 1) (match_dup 2)] UNSPEC_MOVMEM))])]
> +  "TARGET_MOPS"
> +  {
> +    operands[3] = XEXP (operands[0], 0);
> +    operands[4] = XEXP (operands[1], 0);
> +  }
> +)
> +
> +(define_insn "*aarch64_movmemdi"
>    [(parallel [
>     (set (match_operand:DI 2 "register_operand" "+&r") (const_int 0))
>     (clobber (match_operand:DI 0 "register_operand" "+&r"))
> @@ -1640,27 +1655,11 @@ (define_expand "movmemdi"
>     (match_operand:BLK 1 "memory_operand")
>     (match_operand:DI 2 "general_operand")
>     (match_operand:DI 3 "immediate_operand")]
> -  "TARGET_MOPS"
> +  ""
>  {
> -  rtx sz_reg = operands[2];
> -  /* For constant-sized memmoves check the threshold.
> -     FIXME: We should add a non-MOPS memmove expansion for smaller,
> -     constant-sized memmove to avoid going to a libcall.  */
> -  if (CONST_INT_P (sz_reg)
> -      && INTVAL (sz_reg) < aarch64_mops_memmove_size_threshold)
> -    FAIL;
> -
> -  rtx addr_dst = XEXP (operands[0], 0);
> -  rtx addr_src = XEXP (operands[1], 0);
> -
> -  if (!REG_P (sz_reg))
> -    sz_reg = force_reg (DImode, sz_reg);
> -  if (!REG_P (addr_dst))
> -    addr_dst = force_reg (DImode, addr_dst);
> -  if (!REG_P (addr_src))
> -    addr_src = force_reg (DImode, addr_src);
> -  emit_insn (gen_aarch64_movmemdi (addr_dst, addr_src, sz_reg));
> -  DONE;
> +  if (aarch64_expand_cpymem (operands, true))
> +    DONE;
> +  FAIL;
>  }

...I think we might as well keep this pattern conditional on TARGET_MOPS.

I think we can then also split:

  /* All three registers are changed by the instruction, so each one
     must be a fresh pseudo.  */
  rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
  rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
  rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
  rtx src_mem = replace_equiv_address (operands[1], src_addr);
  rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);

out of aarch64_expand_cpymem_mops into a new function (say
aarch64_prepare_mops_operands) and call it from the movmemdi expander;
a sketch of what this could look like follows after the quoted testsuite
changes below.  There should then be no need for the extra staging
expander (aarch64_movmemdi).

IMO the STRICT_ALIGNMENT stuff should be a separate patch, with its own
testcases.

Thanks,
Richard

> )
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/mops_4.c b/gcc/testsuite/gcc.target/aarch64/mops_4.c
> index 1b87759cb5e8bbcbb58cf63404d1d579d44b2818..dd796115cb4093251964d881e93bf4b98ade0c32 100644
> --- a/gcc/testsuite/gcc.target/aarch64/mops_4.c
> +++ b/gcc/testsuite/gcc.target/aarch64/mops_4.c
> @@ -50,6 +50,54 @@ copy3 (int *x, int *y, long z, long *res)
>    *res = z;
>  }
>
> +/*
> +** move1:
> +** mov (x[0-9]+), x0
> +** cpyp \[\1\]!, \[x1\]!, x2!
> +** cpym \[\1\]!, \[x1\]!, x2!
> +** cpye \[\1\]!, \[x1\]!, x2!
> +** str x0, \[x3\]
> +** ret
> +*/
> +void
> +move1 (int *x, int *y, long z, int **res)
> +{
> +  __builtin_memmove (x, y, z);
> +  *res = x;
> +}
> +
> +/*
> +** move2:
> +** mov (x[0-9]+), x1
> +** cpyp \[x0\]!, \[\1\]!, x2!
> +** cpym \[x0\]!, \[\1\]!, x2!
> +** cpye \[x0\]!, \[\1\]!, x2!
> +** str x1, \[x3\]
> +** ret
> +*/
> +void
> +move2 (int *x, int *y, long z, int **res)
> +{
> +  __builtin_memmove (x, y, z);
> +  *res = y;
> +}
> +
> +/*
> +** move3:
> +** mov (x[0-9]+), x2
> +** cpyp \[x0\]!, \[x1\]!, \1!
> +** cpym \[x0\]!, \[x1\]!, \1!
> +** cpye \[x0\]!, \[x1\]!, \1!
> +** str x2, \[x3\]
> +** ret
> +*/
> +void
> +move3 (int *x, int *y, long z, long *res)
> +{
> +  __builtin_memmove (x, y, z);
> +  *res = z;
> +}
> +
>  /*
>  ** set1:
>  ** mov (x[0-9]+), x0
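Here is the sketch promised above of splitting the operand set-up out of
aarch64_expand_cpymem_mops.  It is untested; the name
aarch64_prepare_mops_operands is only a suggestion, and it assumes the
helper is declared in aarch64-protos.h and that the define_insn keeps the
plain "aarch64_movmemdi" name (i.e. no staging expander):

  /* Prepare OPERANDS for a MOPS instruction, rewriting them in place.
     All three registers are changed by the instruction, so each one
     must be a fresh pseudo.  */
  void
  aarch64_prepare_mops_operands (rtx *operands)
  {
    rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
    rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
    operands[0] = replace_equiv_address (operands[0], dst_addr);
    operands[1] = replace_equiv_address (operands[1], src_addr);
    operands[2] = copy_to_mode_reg (DImode, operands[2]);
  }

The movmemdi expander would then reduce to:

  {
    aarch64_prepare_mops_operands (operands);
    emit_insn (gen_aarch64_movmemdi (operands[0], operands[1], operands[2]));
    DONE;
  }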