Re: [PATCH v3] AArch64: Cleanup memset expansion

2024-01-05 Thread Richard Sandiford
Wilco Dijkstra  writes:
> v3: rebased to latest trunk
>
> Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
> bytes throughout.  Simplify the complex calculations when optimizing for size
> by using a fixed limit.
>
> Passes regress & bootstrap.
>
> gcc/ChangeLog:
> * config/aarch64/aarch64.h (MAX_SET_SIZE): New define.
> * config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove 
> function.
> (aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
> (aarch64_expand_setmem): Clean up implementation, use byte offsets,
> simplify size calculation.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index 
> 3ae42be770400da96ea3d9d25d6e1b2d393d034d..dd3b7988d585277181c478cd022fd7b6285929d0
>  100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -1178,6 +1178,10 @@ typedef struct
> mode that should actually be used.  We allow pairs of registers.  */
>  #define MAX_FIXED_MODE_SIZE GET_MODE_BITSIZE (TImode)
>
> +/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
> +   and 1 MOVI/DUP (same size as a call).  */
> +#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
> +

Since this isn't (AFAIK) a standard macro, there doesn't seem to be
any need to put it in the header file.  It could just go at the head
of aarch64.cc instead.

>  /* Maximum bytes moved by a single instruction (load/store pair).  */
>  #define MOVE_MAX (UNITS_PER_WORD * 2)
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 
> f9850320f61c5ddccf47e6583d304e5f405a484f..0909b319d16b9a1587314bcfda0a8112b42a663f
>  100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -26294,15 +26294,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
> next, amount);
>  }
>
> -/* Return a new RTX holding the result of moving POINTER forward by the
> -   size of the mode it points to.  */
> -
> -static rtx
> -aarch64_progress_pointer (rtx pointer)
> -{
> -  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
> -}
> -
>  typedef auto_vec, 12> copy_ops;
>
>  /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
> @@ -26457,45 +26448,21 @@ aarch64_expand_cpymem (rtx *operands, bool 
> is_memmove)
>return true;
>  }
>
> -/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
> -   SRC is a register we have created with the duplicated value to be set.  */
> +/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
>  static void
> -aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
> -   machine_mode mode)
> +aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
>  {
> -  /* If we are copying 128bits or 256bits, we can do that straight from
> - the SIMD register we prepared.  */
> -  if (known_eq (GET_MODE_BITSIZE (mode), 256))
> -{
> -  mode = GET_MODE (src);
> -  /* "Cast" the *dst to the correct mode.  */
> -  *dst = adjust_address (*dst, mode, 0);
> -  /* Emit the memset.  */
> -  emit_insn (aarch64_gen_store_pair (*dst, src, src));
> -
> -  /* Move the pointers forward.  */
> -  *dst = aarch64_move_pointer (*dst, 32);
> -  return;
> -}
> -  if (known_eq (GET_MODE_BITSIZE (mode), 128))
> +  /* Emit explict store pair instructions for 32-byte writes.  */
> +  if (known_eq (GET_MODE_SIZE (mode), 32))
>  {
> -  /* "Cast" the *dst to the correct mode.  */
> -  *dst = adjust_address (*dst, GET_MODE (src), 0);
> -  /* Emit the memset.  */
> -  emit_move_insn (*dst, src);
> -  /* Move the pointers forward.  */
> -  *dst = aarch64_move_pointer (*dst, 16);
> +  mode = V16QImode;
> +  rtx dst1 = adjust_address (dst, mode, offset);
> +  emit_insn (aarch64_gen_store_pair (dst1, src, src));
>return;
>  }
> -  /* For copying less, we have to extract the right amount from src.  */
> -  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
> -
> -  /* "Cast" the *dst to the correct mode.  */
> -  *dst = adjust_address (*dst, mode, 0);
> -  /* Emit the memset.  */
> -  emit_move_insn (*dst, reg);
> -  /* Move the pointer forward.  */
> -  *dst = aarch64_progress_pointer (*dst);
> +  if (known_lt (GET_MODE_SIZE (mode), 16))
> +src = lowpart_subreg (mode, src, GET_MODE (src));
> +  emit_move_insn (adjust_address (dst, mode, offset), src);
>  }
>
>  /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
> @@ -26524,7 +26491,7 @@ aarch64_expand_setmem_mops (rtx *operands)
>  bool
>  aarch64_expand_setmem (rtx *operands)
>  {
> -  int n, mode_bits;
> +  int mode_bytes;
>unsigned HOST_WIDE_INT len;
>rtx dst = operands[0];
>rtx val = operands[2], src;
> @@ -26537,11 +26504,9 @@ aarch64_expand_setmem (rtx *operands)
>|| (STRICT_

Re: [PATCH v3] AArch64: Cleanup memset expansion

2023-12-22 Thread Wilco Dijkstra
v3: rebased to latest trunk

Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
bytes throughout.  Simplify the complex calculations when optimizing for size
by using a fixed limit.

Passes regress & bootstrap.

gcc/ChangeLog:
* config/aarch64/aarch64.h (MAX_SET_SIZE): New define.
* config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove function.
(aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
(aarch64_expand_setmem): Clean up implementation, use byte offsets,
simplify size calculation.

---

diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 
3ae42be770400da96ea3d9d25d6e1b2d393d034d..dd3b7988d585277181c478cd022fd7b6285929d0
 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -1178,6 +1178,10 @@ typedef struct
mode that should actually be used.  We allow pairs of registers.  */
 #define MAX_FIXED_MODE_SIZE GET_MODE_BITSIZE (TImode)
 
+/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
+   and 1 MOVI/DUP (same size as a call).  */
+#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
+
 /* Maximum bytes moved by a single instruction (load/store pair).  */
 #define MOVE_MAX (UNITS_PER_WORD * 2)
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
f9850320f61c5ddccf47e6583d304e5f405a484f..0909b319d16b9a1587314bcfda0a8112b42a663f
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26294,15 +26294,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
next, amount);
 }
 
-/* Return a new RTX holding the result of moving POINTER forward by the
-   size of the mode it points to.  */
-
-static rtx
-aarch64_progress_pointer (rtx pointer)
-{
-  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
-}
-
 typedef auto_vec, 12> copy_ops;
 
 /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
@@ -26457,45 +26448,21 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   return true;
 }
 
-/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
-   SRC is a register we have created with the duplicated value to be set.  */
+/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
 static void
-aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-   machine_mode mode)
+aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
 {
-  /* If we are copying 128bits or 256bits, we can do that straight from
- the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-{
-  mode = GET_MODE (src);
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_insn (aarch64_gen_store_pair (*dst, src, src));
-
-  /* Move the pointers forward.  */
-  *dst = aarch64_move_pointer (*dst, 32);
-  return;
-}
-  if (known_eq (GET_MODE_BITSIZE (mode), 128))
+  /* Emit explict store pair instructions for 32-byte writes.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
 {
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, GET_MODE (src), 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, src);
-  /* Move the pointers forward.  */
-  *dst = aarch64_move_pointer (*dst, 16);
+  mode = V16QImode;
+  rtx dst1 = adjust_address (dst, mode, offset);
+  emit_insn (aarch64_gen_store_pair (dst1, src, src));
   return;
 }
-  /* For copying less, we have to extract the right amount from src.  */
-  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
-
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, reg);
-  /* Move the pointer forward.  */
-  *dst = aarch64_progress_pointer (*dst);
+  if (known_lt (GET_MODE_SIZE (mode), 16))
+src = lowpart_subreg (mode, src, GET_MODE (src));
+  emit_move_insn (adjust_address (dst, mode, offset), src);
 }
 
 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
@@ -26524,7 +26491,7 @@ aarch64_expand_setmem_mops (rtx *operands)
 bool
 aarch64_expand_setmem (rtx *operands)
 {
-  int n, mode_bits;
+  int mode_bytes;
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
@@ -26537,11 +26504,9 @@ aarch64_expand_setmem (rtx *operands)
   || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_setmem_mops (operands);
 
-  bool size_p = optimize_function_for_size_p (cfun);
-
   /* Default the maximum to 256-bytes when considering only libcall vs
  SIMD broadcast sequence.  */
-  unsigned max_set_size = 256;
+  unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
   unsigned mops_threshold = aarch64_mops_memset_size_threshold;