Re: [PATCH v3] AArch64: Cleanup memset expansion
Wilco Dijkstra writes: > v3: rebased to latest trunk > > Cleanup memset implementation. Similar to memcpy/memmove, use an offset and > bytes throughout. Simplify the complex calculations when optimizing for size > by using a fixed limit. > > Passes regress & bootstrap. > > gcc/ChangeLog: > * config/aarch64/aarch64.h (MAX_SET_SIZE): New define. > * config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove > function. > (aarch64_set_one_block_and_progress_pointer): Simplify and clean up. > (aarch64_expand_setmem): Clean up implementation, use byte offsets, > simplify size calculation. > > --- > > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > index > 3ae42be770400da96ea3d9d25d6e1b2d393d034d..dd3b7988d585277181c478cd022fd7b6285929d0 > 100644 > --- a/gcc/config/aarch64/aarch64.h > +++ b/gcc/config/aarch64/aarch64.h > @@ -1178,6 +1178,10 @@ typedef struct > mode that should actually be used. We allow pairs of registers. */ > #define MAX_FIXED_MODE_SIZE GET_MODE_BITSIZE (TImode) > > +/* Maximum bytes set for an inline memset expansion. With -Os use 3 STP > + and 1 MOVI/DUP (same size as a call). */ > +#define MAX_SET_SIZE(speed) (speed ? 256 : 96) > + Since this isn't (AFAIK) a standard macro, there doesn't seem to be any need to put it in the header file. It could just go at the head of aarch64.cc instead. > /* Maximum bytes moved by a single instruction (load/store pair). */ > #define MOVE_MAX (UNITS_PER_WORD * 2) > > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index > f9850320f61c5ddccf47e6583d304e5f405a484f..0909b319d16b9a1587314bcfda0a8112b42a663f > 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -26294,15 +26294,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount) > next, amount); > } > > -/* Return a new RTX holding the result of moving POINTER forward by the > - size of the mode it points to. */ > - > -static rtx > -aarch64_progress_pointer (rtx pointer) > -{ > - return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer))); > -} > - > typedef auto_vec, 12> copy_ops; > > /* Copy one block of size MODE from SRC to DST at offset OFFSET. */ > @@ -26457,45 +26448,21 @@ aarch64_expand_cpymem (rtx *operands, bool > is_memmove) >return true; > } > > -/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where > - SRC is a register we have created with the duplicated value to be set. */ > +/* Set one block of size MODE at DST at offset OFFSET to value in SRC. */ > static void > -aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst, > - machine_mode mode) > +aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode) > { > - /* If we are copying 128bits or 256bits, we can do that straight from > - the SIMD register we prepared. */ > - if (known_eq (GET_MODE_BITSIZE (mode), 256)) > -{ > - mode = GET_MODE (src); > - /* "Cast" the *dst to the correct mode. */ > - *dst = adjust_address (*dst, mode, 0); > - /* Emit the memset. */ > - emit_insn (aarch64_gen_store_pair (*dst, src, src)); > - > - /* Move the pointers forward. */ > - *dst = aarch64_move_pointer (*dst, 32); > - return; > -} > - if (known_eq (GET_MODE_BITSIZE (mode), 128)) > + /* Emit explict store pair instructions for 32-byte writes. */ > + if (known_eq (GET_MODE_SIZE (mode), 32)) > { > - /* "Cast" the *dst to the correct mode. */ > - *dst = adjust_address (*dst, GET_MODE (src), 0); > - /* Emit the memset. */ > - emit_move_insn (*dst, src); > - /* Move the pointers forward. */ > - *dst = aarch64_move_pointer (*dst, 16); > + mode = V16QImode; > + rtx dst1 = adjust_address (dst, mode, offset); > + emit_insn (aarch64_gen_store_pair (dst1, src, src)); >return; > } > - /* For copying less, we have to extract the right amount from src. */ > - rtx reg = lowpart_subreg (mode, src, GET_MODE (src)); > - > - /* "Cast" the *dst to the correct mode. */ > - *dst = adjust_address (*dst, mode, 0); > - /* Emit the memset. */ > - emit_move_insn (*dst, reg); > - /* Move the pointer forward. */ > - *dst = aarch64_progress_pointer (*dst); > + if (known_lt (GET_MODE_SIZE (mode), 16)) > +src = lowpart_subreg (mode, src, GET_MODE (src)); > + emit_move_insn (adjust_address (dst, mode, offset), src); > } > > /* Expand a setmem using the MOPS instructions. OPERANDS are the same > @@ -26524,7 +26491,7 @@ aarch64_expand_setmem_mops (rtx *operands) > bool > aarch64_expand_setmem (rtx *operands) > { > - int n, mode_bits; > + int mode_bytes; >unsigned HOST_WIDE_INT len; >rtx dst = operands[0]; >rtx val = operands[2], src; > @@ -26537,11 +26504,9 @@ aarch64_expand_setmem (rtx *operands) >|| (STRICT_
Re: [PATCH v3] AArch64: Cleanup memset expansion
v3: rebased to latest trunk Cleanup memset implementation. Similar to memcpy/memmove, use an offset and bytes throughout. Simplify the complex calculations when optimizing for size by using a fixed limit. Passes regress & bootstrap. gcc/ChangeLog: * config/aarch64/aarch64.h (MAX_SET_SIZE): New define. * config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove function. (aarch64_set_one_block_and_progress_pointer): Simplify and clean up. (aarch64_expand_setmem): Clean up implementation, use byte offsets, simplify size calculation. --- diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 3ae42be770400da96ea3d9d25d6e1b2d393d034d..dd3b7988d585277181c478cd022fd7b6285929d0 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -1178,6 +1178,10 @@ typedef struct mode that should actually be used. We allow pairs of registers. */ #define MAX_FIXED_MODE_SIZE GET_MODE_BITSIZE (TImode) +/* Maximum bytes set for an inline memset expansion. With -Os use 3 STP + and 1 MOVI/DUP (same size as a call). */ +#define MAX_SET_SIZE(speed) (speed ? 256 : 96) + /* Maximum bytes moved by a single instruction (load/store pair). */ #define MOVE_MAX (UNITS_PER_WORD * 2) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index f9850320f61c5ddccf47e6583d304e5f405a484f..0909b319d16b9a1587314bcfda0a8112b42a663f 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -26294,15 +26294,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount) next, amount); } -/* Return a new RTX holding the result of moving POINTER forward by the - size of the mode it points to. */ - -static rtx -aarch64_progress_pointer (rtx pointer) -{ - return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer))); -} - typedef auto_vec, 12> copy_ops; /* Copy one block of size MODE from SRC to DST at offset OFFSET. */ @@ -26457,45 +26448,21 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove) return true; } -/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where - SRC is a register we have created with the duplicated value to be set. */ +/* Set one block of size MODE at DST at offset OFFSET to value in SRC. */ static void -aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst, - machine_mode mode) +aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode) { - /* If we are copying 128bits or 256bits, we can do that straight from - the SIMD register we prepared. */ - if (known_eq (GET_MODE_BITSIZE (mode), 256)) -{ - mode = GET_MODE (src); - /* "Cast" the *dst to the correct mode. */ - *dst = adjust_address (*dst, mode, 0); - /* Emit the memset. */ - emit_insn (aarch64_gen_store_pair (*dst, src, src)); - - /* Move the pointers forward. */ - *dst = aarch64_move_pointer (*dst, 32); - return; -} - if (known_eq (GET_MODE_BITSIZE (mode), 128)) + /* Emit explict store pair instructions for 32-byte writes. */ + if (known_eq (GET_MODE_SIZE (mode), 32)) { - /* "Cast" the *dst to the correct mode. */ - *dst = adjust_address (*dst, GET_MODE (src), 0); - /* Emit the memset. */ - emit_move_insn (*dst, src); - /* Move the pointers forward. */ - *dst = aarch64_move_pointer (*dst, 16); + mode = V16QImode; + rtx dst1 = adjust_address (dst, mode, offset); + emit_insn (aarch64_gen_store_pair (dst1, src, src)); return; } - /* For copying less, we have to extract the right amount from src. */ - rtx reg = lowpart_subreg (mode, src, GET_MODE (src)); - - /* "Cast" the *dst to the correct mode. */ - *dst = adjust_address (*dst, mode, 0); - /* Emit the memset. */ - emit_move_insn (*dst, reg); - /* Move the pointer forward. */ - *dst = aarch64_progress_pointer (*dst); + if (known_lt (GET_MODE_SIZE (mode), 16)) +src = lowpart_subreg (mode, src, GET_MODE (src)); + emit_move_insn (adjust_address (dst, mode, offset), src); } /* Expand a setmem using the MOPS instructions. OPERANDS are the same @@ -26524,7 +26491,7 @@ aarch64_expand_setmem_mops (rtx *operands) bool aarch64_expand_setmem (rtx *operands) { - int n, mode_bits; + int mode_bytes; unsigned HOST_WIDE_INT len; rtx dst = operands[0]; rtx val = operands[2], src; @@ -26537,11 +26504,9 @@ aarch64_expand_setmem (rtx *operands) || (STRICT_ALIGNMENT && align < 16)) return aarch64_expand_setmem_mops (operands); - bool size_p = optimize_function_for_size_p (cfun); - /* Default the maximum to 256-bytes when considering only libcall vs SIMD broadcast sequence. */ - unsigned max_set_size = 256; + unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun)); unsigned mops_threshold = aarch64_mops_memset_size_threshold;