Hi, lets start with your unification code and then I will update my patch. > > The patch is bootstrapped and tested on i386/x86_64 (make check, and stability > testing on Spec2k, Spec2k6).
Please do not forget to check also -minline-all-stringop. > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 21fc531..9d5654f 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -22219,13 +22219,16 @@ expand_set_or_movmem_via_loop (rtx destmem, rtx > srcmem, > emit_label (out_label); > } > > -/* Output "rep; mov" instruction. > - Arguments have same meaning as for previous function */ > +/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM > argument. > + When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored. > + When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored. > + Other arguments have same meaning as for previous function. */ You need to document ORIG_VALUE > + > static void > -expand_movmem_via_rep_mov (rtx destmem, rtx srcmem, > - rtx destptr, rtx srcptr, > +expand_movmem_or_setmem_via_rep (rtx destmem, rtx srcmem, lets go with the same name as in first function, expand_set_or_movmem_via_rep. Either I would add issetmem into expand_set_or_movmem_via_loop or I would go for the same scheme - if srcptr/srcmem is NULL, use value. This part of patch is OK with those changes. > @@ -22233,82 +22236,65 @@ expand_movmem_via_rep_mov (rtx destmem, rtx srcmem, > HOST_WIDE_INT rounded_count; > > /* If the size is known, it is shorter to use rep movs. */ > - if (mode == QImode && CONST_INT_P (count) > + if (!issetmem && mode == QImode && CONST_INT_P (count) Instead of !issetmem allow this also for value being 0. Add comment that for memset we would need to promote value and also that the logic should be moved to decide_alg (I believe I have it there in my proposed patch). > @@ -22496,6 +22482,57 @@ expand_movmem_epilogue (rtx destmem, rtx srcmem, > } > } > > +/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM > + with value PROMOTED_VAL. > + SRC is passed by pointer to be updated on return. > + Return value is updated DST. */ So it is same as store_by_pieces only taking promoted_val instead of doing promotion itself? > +static rtx > +emit_memset (rtx destmem, rtx destptr, rtx promoted_val, > + HOST_WIDE_INT size_to_move) > +{ > + rtx dst = destmem, adjust; > + enum insn_code code; > + enum machine_mode move_mode; > + int piece_size, i; > + > + /* Find the widest mode in which we could perform moves. > + Start with the biggest power of 2 less than SIZE_TO_MOVE and half > + it until move of such size is supported. */ > + move_mode = GET_MODE (promoted_val); Won't this be VOIDmode for CONST_INT? > + if (size_to_move < GET_MODE_SIZE (move_mode)) > + { > + move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0); > + promoted_val = gen_lowpart (move_mode, promoted_val); > + } > + piece_size = GET_MODE_SIZE (move_mode); > + code = optab_handler (mov_optab, move_mode); > + gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX); > + > + dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0); > + > + /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */ > + gcc_assert (size_to_move % piece_size == 0); > + adjust = GEN_INT (piece_size); > + for (i = 0; i < size_to_move; i += piece_size) > + { > + if (piece_size <= GET_MODE_SIZE (word_mode)) > + { > + emit_insn (gen_strset (destptr, dst, promoted_val)); > + continue; > + } Don't we want to go for strmov here, too? > + > + emit_insn (GEN_FCN (code) (dst, promoted_val)); > + > + emit_move_insn (destptr, > + gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust)); > + > + dst = adjust_automodify_address_nv (dst, move_mode, destptr, > + piece_size); > + } > + > + /* Update DST rtx. */ > + return dst; > +} This is also OK if you either add mode parameter or convince me that we never pass CONST_INT there (and add it into block commnet + assert). > @@ -22511,61 +22548,30 @@ expand_setmem_epilogue_via_loop (rtx destmem, rtx > destptr, rtx value, > > /* Output code to set at most count & (max_size - 1) bytes starting by DEST. > */ > static void > -expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int > max_size) > +expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value, > + rtx count, int max_size) Also OK. > @@ -22637,13 +22643,16 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, > rtx value, rtx count, int max_ > } > } > > -/* Copy enough from DEST to SRC to align DEST known to by aligned by ALIGN to > - DESIRED_ALIGNMENT. > +/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to > + DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN. > + Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE > are > + ignored. > Return value is updated DESTMEM. */ > static rtx > -expand_movmem_prologue (rtx destmem, rtx srcmem, > - rtx destptr, rtx srcptr, rtx count, > - int align, int desired_alignment) > +expand_movmem_or_setmem_prologue (rtx destmem, rtx srcmem, > + rtx destptr, rtx srcptr, rtx value, > + rtx vec_value, rtx count, int align, > + int desired_alignment, bool issetmem) Again, I would go with consistent names. Otherwise OK. How do we ensure that value is sufficiently promoted? (for example for pentiumpro that requires bigger alignment than promotion)? Also don't we need to handle vec_value for AVX? > @@ -22661,22 +22678,28 @@ expand_movmem_prologue (rtx destmem, rtx srcmem, > return destmem; > } > > -/* Copy enough from DST to SRC to align DST known to DESIRED_ALIGN. > - ALIGN_BYTES is how many bytes need to be copied. > - The function updates DST and SRC, namely, it sets proper alignment. > - DST is returned via return value, SRC is updated via pointer SRCP. */ > +/* This function is like the previous one, except here we know how many bytes > + need to be copied. That allows us to update alignment not only of DST, > which > + is returned, but also of SRC, which is passed as a pointer for that > + reason. */ > static rtx > -expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg, > - int desired_align, int align_bytes) > +expand_constant_movmem_or_setmem_prologue (rtx dst, rtx *srcp, rtx destreg, Again I would make names consistent > + rtx srcreg, rtx value, rtx vec_value, > + int desired_align, int align_bytes, > + bool issetmem) > { > - rtx src = *srcp; > + rtx src = NULL; > rtx orig_dst = dst; > - rtx orig_src = src; > + rtx orig_src = NULL; > int piece_size = 1; > int copied_bytes = 0; > - int src_align_bytes = get_mem_align_offset (src, desired_align * > BITS_PER_UNIT); > - if (src_align_bytes >= 0) > - src_align_bytes = desired_align - src_align_bytes; > + > + if (!issetmem) > + { > + gcc_assert (srcp != NULL); > + src = *srcp; > + orig_src = src; > + } > > for (piece_size = 1; > piece_size <= desired_align && copied_bytes < align_bytes; > @@ -22684,109 +22707,48 @@ expand_constant_movmem_prologue (rtx dst, rtx > *srcp, rtx destreg, rtx srcreg, > { > if (align_bytes & piece_size) > { > - dst = emit_memmov (dst, &src, destreg, srcreg, piece_size); > + if (issetmem) > + { > + if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value))) > + dst = emit_memset (dst, destreg, vec_value, piece_size); > + else > + dst = emit_memset (dst, destreg, value, piece_size); Perhaps would be easier if emit_memset simply took value and vec_value and made choice on its own? > @@ -22974,24 +23030,24 @@ decide_alignment (int align, > Optional dynamic check for size and libcall for large > blocks is emitted here too, with -minline-stringops-dynamically. > > - 2) Prologue: copy first few bytes in order to get destination > + 2) Prologue: copy/set first few bytes in order to get destination > aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less > than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be > - copied. We emit either a jump tree on power of two sized > + copied/set. We emit either a jump tree on power of two sized > blocks, or a byte loop. > > - 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks > - with specified algorithm. > + 3) Main body: the copying/storing loop itself, copying/storing in > SIZE_NEEDED > + chunks with specified algorithm. > > - 4) Epilogue: code copying tail of the block that is too small to be > + 4) Epilogue: code copying/storing tail of the block that is too small to > be > handled by main body (or up to size guarded by prologue guard). */ > - > -bool > -ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, > - rtx expected_align_exp, rtx expected_size_exp) > +static bool > +ix86_expand_movmem_or_setmem (rtx dst, rtx src, rtx count_exp, rtx val_exp, Name ;) > + rtx align_exp, rtx expected_align_exp, > + rtx expected_size_exp, bool issetmem) > { > rtx destreg; > - rtx srcreg; > + rtx srcreg = NULL; > rtx label = NULL; > rtx tmp; > rtx jump_around_label = NULL; > @@ -23001,6 +23057,9 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, > rtx align_exp, > int size_needed = 0, epilogue_size_needed; > int desired_align = 0, align_bytes = 0; > enum stringop_alg alg; > + rtx promoted_val = NULL; > + rtx vec_promoted_val = NULL; > + bool force_loopy_epilogue = false; > int dynamic_check; > bool need_zero_guard = false; > bool noalign; > @@ -23015,7 +23074,8 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, > rtx align_exp, > align = INTVAL (expected_align_exp); > /* ALIGN is the minimum of destination and source alignment, but we care > here > just about destination alignment. */ > - else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT) > + else if (!issetmem > + && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT) > align = MEM_ALIGN (dst) / BITS_PER_UNIT; Hmm, this really should be pushed up into expansion and the alignment lookup should be improved... > + /* For now vector-version of memset is generated only for memory zeroing, > as > + creating of promoted vector value is very cheap in this case. */ > + if (issetmem && alg == vector_loop && val_exp != const0_rtx) > + alg = unrolled_loop; I believe we had vector version of promote_reg once. Load of constant is easy, too. Broadcasting a value is bit different story. Thanks, the patch is OK with those changes. It really seems like an improvement in maintainability. Thanks for working on it! Honza