Wilco Dijkstra <wilco.dijks...@arm.com> writes:
> v3: rebased to latest trunk
>
> Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
> bytes throughout.  Simplify the complex calculations when optimizing for size
> by using a fixed limit.
>
> Passes regress & bootstrap.
>
> gcc/ChangeLog:
>         * config/aarch64/aarch64.h (MAX_SET_SIZE): New define.
>         * config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove 
> function.
>         (aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
>         (aarch64_expand_setmem): Clean up implementation, use byte offsets,
>         simplify size calculation.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index 
> 3ae42be770400da96ea3d9d25d6e1b2d393d034d..dd3b7988d585277181c478cd022fd7b6285929d0
>  100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -1178,6 +1178,10 @@ typedef struct
>     mode that should actually be used.  We allow pairs of registers.  */
>  #define MAX_FIXED_MODE_SIZE GET_MODE_BITSIZE (TImode)
>
> +/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
> +   and 1 MOVI/DUP (same size as a call).  */
> +#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
> +

Since this isn't (AFAIK) a standard macro, there doesn't seem to be
any need to put it in the header file.  It could just go at the head
of aarch64.cc instead.

>  /* Maximum bytes moved by a single instruction (load/store pair).  */
>  #define MOVE_MAX (UNITS_PER_WORD * 2)
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 
> f9850320f61c5ddccf47e6583d304e5f405a484f..0909b319d16b9a1587314bcfda0a8112b42a663f
>  100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -26294,15 +26294,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
>                                     next, amount);
>  }
>
> -/* Return a new RTX holding the result of moving POINTER forward by the
> -   size of the mode it points to.  */
> -
> -static rtx
> -aarch64_progress_pointer (rtx pointer)
> -{
> -  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
> -}
> -
>  typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
>
>  /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
> @@ -26457,45 +26448,21 @@ aarch64_expand_cpymem (rtx *operands, bool 
> is_memmove)
>    return true;
>  }
>
> -/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
> -   SRC is a register we have created with the duplicated value to be set.  */
> +/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
>  static void
> -aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
> -                                           machine_mode mode)
> +aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
>  {
> -  /* If we are copying 128bits or 256bits, we can do that straight from
> -     the SIMD register we prepared.  */
> -  if (known_eq (GET_MODE_BITSIZE (mode), 256))
> -    {
> -      mode = GET_MODE (src);
> -      /* "Cast" the *dst to the correct mode.  */
> -      *dst = adjust_address (*dst, mode, 0);
> -      /* Emit the memset.  */
> -      emit_insn (aarch64_gen_store_pair (*dst, src, src));
> -
> -      /* Move the pointers forward.  */
> -      *dst = aarch64_move_pointer (*dst, 32);
> -      return;
> -    }
> -  if (known_eq (GET_MODE_BITSIZE (mode), 128))
> +  /* Emit explicit store pair instructions for 32-byte writes.  */
> +  if (known_eq (GET_MODE_SIZE (mode), 32))
>      {
> -      /* "Cast" the *dst to the correct mode.  */
> -      *dst = adjust_address (*dst, GET_MODE (src), 0);
> -      /* Emit the memset.  */
> -      emit_move_insn (*dst, src);
> -      /* Move the pointers forward.  */
> -      *dst = aarch64_move_pointer (*dst, 16);
> +      mode = V16QImode;
> +      rtx dst1 = adjust_address (dst, mode, offset);
> +      emit_insn (aarch64_gen_store_pair (dst1, src, src));
>        return;
>      }
> -  /* For copying less, we have to extract the right amount from src.  */
> -  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
> -
> -  /* "Cast" the *dst to the correct mode.  */
> -  *dst = adjust_address (*dst, mode, 0);
> -  /* Emit the memset.  */
> -  emit_move_insn (*dst, reg);
> -  /* Move the pointer forward.  */
> -  *dst = aarch64_progress_pointer (*dst);
> +  if (known_lt (GET_MODE_SIZE (mode), 16))
> +    src = lowpart_subreg (mode, src, GET_MODE (src));
> +  emit_move_insn (adjust_address (dst, mode, offset), src);
>  }
>
>  /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
> @@ -26524,7 +26491,7 @@ aarch64_expand_setmem_mops (rtx *operands)
>  bool
>  aarch64_expand_setmem (rtx *operands)
>  {
> -  int n, mode_bits;
> +  int mode_bytes;
>    unsigned HOST_WIDE_INT len;
>    rtx dst = operands[0];
>    rtx val = operands[2], src;
> @@ -26537,11 +26504,9 @@ aarch64_expand_setmem (rtx *operands)
>        || (STRICT_ALIGNMENT && align < 16))
>      return aarch64_expand_setmem_mops (operands);
>
> -  bool size_p = optimize_function_for_size_p (cfun);
> -
>    /* Default the maximum to 256-bytes when considering only libcall vs
>       SIMD broadcast sequence.  */
> -  unsigned max_set_size = 256;
> +  unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p 
> (cfun));
>    unsigned mops_threshold = aarch64_mops_memset_size_threshold;
>
>    len = UINTVAL (operands[1]);
> @@ -26550,91 +26515,55 @@ aarch64_expand_setmem (rtx *operands)
>    if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
>      return aarch64_expand_setmem_mops (operands);
>
> -  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
> -  /* The MOPS sequence takes:
> -     3 instructions for the memory storing
> -     + 1 to move the constant size into a reg
> -     + 1 if VAL is a non-zero constant to move into a reg
> -    (zero constants can use XZR directly).  */
> -  unsigned mops_cost = 3 + 1 + cst_val;
> -  /* A libcall to memset in the worst case takes 3 instructions to prepare
> -     the arguments + 1 for the call.  */
> -  unsigned libcall_cost = 4;
> -
> -  /* Attempt a sequence with a vector broadcast followed by stores.
> -     Count the number of operations involved to see if it's worth it
> -     against the alternatives.  A simple counter simd_ops on the
> -     algorithmically-relevant operations is used rather than an rtx_insn 
> count
> -     as all the pointer adjusmtents and mode reinterprets will be optimized
> -     away later.  */
> -  start_sequence ();
> -  unsigned simd_ops = 0;
> -
>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
>    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
>
>    /* Prepare the val using a DUP/MOVI v0.16B, val.  */
>    src = expand_vector_broadcast (V16QImode, val);
>    src = force_reg (V16QImode, src);
> -  simd_ops++;
> -  /* Convert len to bits to make the rest of the code simpler.  */
> -  n = len * BITS_PER_UNIT;
>
> -  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
> -     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
> -  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
> -                         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
> -                         ? GET_MODE_BITSIZE (TImode) : 256;
> +  /* Set maximum amount to write in one go.  We allow 32-byte chunks based
> +     on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
> +  unsigned set_max = 32;
> +
> +  if (len <= 24 || (aarch64_tune_params.extra_tuning_flags
> +                   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
> +    set_max = 16;

I think we should take the tuning parameter into account when applying
the MAX_SET_SIZE limit for -Os.  Shouldn't it be 48 rather than 96 when
AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS is set, since only 16-byte chunks
are available?  (Alternatively, I suppose it would make sense to ignore
the tuning parameter for -Os, although we don't seem to do that elsewhere.)

Otherwise it looks like a nice improvement, thanks.  I agree a simple
limit like this is enough for the -Os case, since it's impossible
to account accurately for the register-clobbering effect of a call.

Richard

> -  while (n > 0)
> +  int offset = 0;
> +  while (len > 0)
>      {
>        /* Find the largest mode in which to do the copy without
>          over writing.  */
>        opt_scalar_int_mode mode_iter;
>        FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
> -       if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
> +       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, set_max))
>           cur_mode = mode_iter.require ();
>
>        gcc_assert (cur_mode != BLKmode);
>
> -      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
> -      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
> -      simd_ops++;
> -      n -= mode_bits;
> +      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
> +
> +      /* Prefer Q-register accesses for the last bytes.  */
> +      if (mode_bytes == 16)
> +       cur_mode = V16QImode;
> +
> +      aarch64_set_one_block (src, dst, offset, cur_mode);
> +      len -= mode_bytes;
> +      offset += mode_bytes;
>
>        /* Emit trailing writes using overlapping unaligned accesses
> -       (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> -      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
> +        (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> +      if (len > 0 && len < set_max / 2 && !STRICT_ALIGNMENT)
>         {
> -         next_mode = smallest_mode_for_size (n, MODE_INT);
> -         int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
> -         gcc_assert (n_bits <= mode_bits);
> -         dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
> -         n = n_bits;
> +         next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
> +         int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
> +         gcc_assert (n_bytes <= mode_bytes);
> +         offset -= n_bytes - len;
> +         len = n_bytes;
>         }
>      }
> -  rtx_insn *seq = get_insns ();
> -  end_sequence ();
> -
> -  if (size_p)
> -    {
> -      /* When optimizing for size we have 3 options: the SIMD broadcast 
> sequence,
> -        call to memset or the MOPS expansion.  */
> -      if (TARGET_MOPS
> -         && mops_cost <= libcall_cost
> -         && mops_cost <= simd_ops)
> -       return aarch64_expand_setmem_mops (operands);
> -      /* If MOPS is not available or not shorter pick a libcall if the SIMD
> -        sequence is too long.  */
> -      else if (libcall_cost < simd_ops)
> -       return false;
> -      emit_insn (seq);
> -      return true;
> -    }
>
> -  /* At this point the SIMD broadcast sequence is the best choice when
> -     optimizing for speed.  */
> -  emit_insn (seq);
>    return true;
>  }

Reply via email to