Hi Haochen,
on 2024/1/10 09:35, HAO CHEN GUI wrote:
> Hi,
> This patch refactors function expand_compare_loop and split it to two
> functions. One is for fixed length and another is for variable length.
> These two functions share some low level common help functions.
I'd expect a refactoring not to introduce any functional changes, but
this patch includes some enhancements as described below, so I think the
subject line is off; it's more like a rework.
>
> Besides above changes, the patch also does:
> 1. Don't generate load and compare loop when max_bytes is less than
> loop bytes.
> 2. Remove do_load_mask_compare as it's no needed. All sub-targets
> entering the function should support efficient overlapping load and
> compare.
> 3. Implement an variable length overlapping load and compare for the
> case which remain bytes is less than the loop bytes in variable length
> compare. The 4k boundary test and one-byte load and compare loop are
> removed as they're no need now.
> 4. Remove the codes for "bytes > max_bytes" with fixed length as the
> case is already excluded by pre-checking.
> 5. Remove running time codes for "bytes > max_bytes" with variable length
> as it should jump to call library at the beginning.
> 6. Enhance do_overlap_load_compare to avoid overlapping load and compare
> when the remain bytes can be loaded and compared by a smaller unit.
Considering it's stage 4 now and the impact of this patch, let's defer
this to the next stage 1. If possible, could you organize the above
changes into patches:
1) Refactor expand_compare_loop by splitting into two functions without
any functional changes.
2) Remove some useless code, as in items 2, 4 and 5 above.
3) Some more enhancements, as in items 1, 3 and 6 above.
? It would be helpful for the review. Thanks!
BR,
Kewen
>
> Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no
> regressions. Is this OK for trunk?
>
> Thanks
> Gui Haochen
>
>
> ChangeLog
> rs6000: Refactor expand_compare_loop and split it to two functions
>
> The original expand_compare_loop has a complicated logical as it's
> designed for both fixed and variable length. This patch splits it to
> two functions and make these two functions share common help functions.
> Also the 4K boundary test and corresponding one byte load and compare
> are replaced by variable length overlapping load and compare. The
> do_load_mask_compare is removed as all sub-targets entering the function
> has efficient overlapping load and compare so that mask load is no needed.
>
> gcc/
> * config/rs6000/rs6000-string.cc (do_isel): Remove.
> (do_load_mask_compare): Remove.
> (do_reg_compare): New.
> (do_load_and_compare): New.
> (do_overlap_load_compare): Do load and compare with a small unit
> other than overlapping load and compare when the remain bytes can
> be done by one instruction.
> (expand_compare_loop): Remove.
> (get_max_inline_loop_bytes): New.
> (do_load_compare_rest_of_loop): New.
> (generate_6432_conversion): Set it to a static function and move
> ahead of gen_diff_handle.
> (gen_diff_handle): New.
> (gen_load_compare_loop): New.
> (gen_library_call): New.
> (expand_compare_with_fixed_length): New.
> (expand_compare_with_variable_length): New.
> (expand_block_compare): Call expand_compare_with_variable_length
> to expand block compare for variable length. Call
> expand_compare_with_fixed_length to expand block compare loop for
> fixed length.
>
> gcc/testsuite/
> * gcc.target/powerpc/block-cmp-5.c: New.
> * gcc.target/powerpc/block-cmp-6.c: New.
> * gcc.target/powerpc/block-cmp-7.c: New.
>
> patch.diff
> diff --git a/gcc/config/rs6000/rs6000-string.cc
> b/gcc/config/rs6000/rs6000-string.cc
> index f707bb2727e..018b87f2501 100644
> --- a/gcc/config/rs6000/rs6000-string.cc
> +++ b/gcc/config/rs6000/rs6000-string.cc
> @@ -404,21 +404,6 @@ do_ifelse (machine_mode cmpmode, rtx_code comparison,
> LABEL_NUSES (true_label) += 1;
> }
>
> -/* Emit an isel of the proper mode for DEST.
> -
> - DEST is the isel destination register.
> - SRC1 is the isel source if CR is true.
> - SRC2 is the isel source if CR is false.
> - CR is the condition for the isel. */
> -static void
> -do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
> -{
> - if (GET_MODE (dest) == DImode)
> - emit_insn (gen_isel_cc_di (dest, cmp, src_t, src_f, cr));
> - else
> - emit_insn (gen_isel_cc_si (dest, cmp, src_t, src_f, cr));
> -}
> -
> /* Emit a subtract of the proper mode for DEST.
>
> DEST is the destination register for the subtract.
> @@ -499,65 +484,61 @@ do_rotl3 (rtx dest, rtx src1, rtx src2)
> emit_insn (gen_rotlsi3 (dest, src1, src2));
> }
>
> -/* Generate rtl for a load, shift, and compare of less than a full word.
> -
> - LOAD_MODE is the machine mode for the loads.
> - DIFF is the reg for the difference.
> - CMP_REM is the reg containing the remaining bytes to compare.
> - DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
> - SRC1_ADDR is the first source address.
> - SRC2_ADDR is the second source address.
> - ORIG_SRC1 is the original first source block's address rtx.
> - ORIG_SRC2 is the original second source block's address rtx. */
> +/* Do the compare for two registers. */
> static void
> -do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem,
> rtx dcond,
> - rtx src1_addr, rtx src2_addr, rtx orig_src1, rtx
> orig_src2)
> +do_reg_compare (bool use_vec, rtx vec_result, rtx diff, rtx *dcond, rtx d1,
> + rtx d2)
> {
> - HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
> - rtx shift_amount = gen_reg_rtx (word_mode);
> - rtx d1 = gen_reg_rtx (word_mode);
> - rtx d2 = gen_reg_rtx (word_mode);
> -
> - do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
> - do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
> - do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);
> -
> - if (word_mode == DImode)
> - {
> - emit_insn (gen_ashldi3 (shift_amount, shift_amount,
> - GEN_INT (LOG2_BITS_PER_UNIT)));
> - emit_insn (gen_lshrdi3 (d1, d1,
> - gen_lowpart (SImode, shift_amount)));
> - emit_insn (gen_lshrdi3 (d2, d2,
> - gen_lowpart (SImode, shift_amount)));
> - }
> - else
> - {
> - emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
> - GEN_INT (LOG2_BITS_PER_UNIT)));
> - emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
> - emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
> - }
> + gcc_assert (!use_vec || vec_result != NULL_RTX);
> + gcc_assert (REG_P (d1) && REG_P (d2));
> + gcc_assert (GET_MODE (d1) == GET_MODE (d2));
>
> - if (TARGET_P9_MISC)
> + if (use_vec)
> + emit_insn (gen_altivec_vcmpequb_p (vec_result, d1, d2));
> + else if (TARGET_P9_MISC)
> {
> /* Generate a compare, and convert with a setb later. */
> rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
> - emit_insn (gen_rtx_SET (dcond, cmp));
> + emit_insn (gen_rtx_SET (*dcond, cmp));
> }
> else
> {
> + *dcond = gen_reg_rtx (CCmode);
> if (word_mode == DImode)
> - emit_insn (gen_subfdi3_carry (diff, d2, d1));
> + emit_insn (gen_subfdi3_carry_dot2 (diff, d2, d1, *dcond));
> else
> - emit_insn (gen_subfsi3_carry (diff, d2, d1));
> + emit_insn (gen_subfsi3_carry_dot2 (diff, d2, d1, *dcond));
> }
> }
>
> +/* Load the memory to register and do the compare. */
> +static void
> +do_load_and_compare (machine_mode load_mode, rtx addr1, rtx addr2, rtx
> *dcond,
> + rtx diff, rtx orig_src1, rtx orig_src2)
> +{
> + rtx d1 = gen_reg_rtx (word_mode);
> + rtx d2 = gen_reg_rtx (word_mode);
> +
> + if (MEM_P (addr1))
> + do_load_for_compare (d1, addr1, load_mode);
> + else
> + do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
> +
> + if (MEM_P (addr2))
> + do_load_for_compare (d2, addr2, load_mode);
> + else
> + do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);
> +
> + do_reg_compare (false, NULL_RTX, diff, dcond, d1, d2);
> +}
> +
> /* Generate rtl for an overlapping load and compare of less than a
> full load_mode. This assumes that the previous word is part of the
> block being compared so it's ok to back up part of a word so we can
> compare the last unaligned full word that ends at the end of the block.
> + If the remain bytes can be loaded and compared by a small unit with
> + only one instruction, just do the load and compare by the small unit
> + other than the full word overlapping load and compare.
>
> LOAD_MODE is the machine mode for the loads.
> ISCONST tells whether the remaining length is a constant or in a register.
> @@ -569,55 +550,41 @@ do_load_mask_compare (const machine_mode load_mode, rtx
> diff, rtx cmp_rem, rtx d
> SRC2_ADDR is the second source address.
> ORIG_SRC1 is the original first source block's address rtx.
> ORIG_SRC2 is the original second source block's address rtx. */
> +
> static void
> -do_overlap_load_compare (machine_mode load_mode, bool isConst,
> - HOST_WIDE_INT bytes_rem, rtx diff,
> - rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
> - rtx orig_src1, rtx orig_src2)
> +do_overlap_load_compare (machine_mode load_mode, HOST_WIDE_INT bytes_rem,
> + rtx diff, rtx *dcond, rtx orig_src1, rtx orig_src2,
> + HOST_WIDE_INT length)
> {
> HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
> - HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
> - rtx d1 = gen_reg_rtx (word_mode);
> - rtx d2 = gen_reg_rtx (word_mode);
> + gcc_assert (IN_RANGE (bytes_rem, 0, load_mode_size - 1));
>
> rtx addr1, addr2;
> - if (!isConst || addr_adj)
> - {
> - rtx adj_reg = gen_reg_rtx (word_mode);
> - if (isConst)
> - emit_move_insn (adj_reg, GEN_INT (-addr_adj));
> - else
> - {
> - rtx reg_lms = gen_reg_rtx (word_mode);
> - emit_move_insn (reg_lms, GEN_INT (load_mode_size));
> - do_sub3 (adj_reg, cmp_rem, reg_lms);
> - }
>
> - addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
> - addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
> - }
> - else
> + switch (bytes_rem)
> {
> - addr1 = src1_addr;
> - addr2 = src2_addr;
> + case 0:
> + return;
> + case 1:
> + load_mode = QImode;
> + break;
> + case 2:
> + load_mode = HImode;
> + break;
> + case 4:
> + load_mode = SImode;
> + break;
> + case 8:
> + if (TARGET_POWERPC64)
> + load_mode = DImode;
> + break;
> }
>
> - do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
> - do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);
> -
> - if (TARGET_P9_MISC)
> - {
> - /* Generate a compare, and convert with a setb later. */
> - rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
> - emit_insn (gen_rtx_SET (dcond, cmp));
> - }
> - else
> - {
> - if (word_mode == DImode)
> - emit_insn (gen_subfdi3_carry (diff, d2, d1));
> - else
> - emit_insn (gen_subfsi3_carry (diff, d2, d1));
> - }
> + load_mode_size = GET_MODE_SIZE (load_mode);
> + addr1 = adjust_address (orig_src1, load_mode, length - load_mode_size);
> + addr2 = adjust_address (orig_src2, load_mode, length - load_mode_size);
> + do_load_and_compare (load_mode, addr1, addr2, dcond, diff,
> + orig_src1, orig_src2);
> }
>
> /* Generate the sequence of compares for strcmp/strncmp using vec/vsx
> @@ -889,790 +856,550 @@ emit_final_compare_vec (rtx str1, rtx str2, rtx
> result,
> return;
> }
>
> -/* Expand a block compare operation using loop code, and return true
> - if successful. Return false if we should let the compiler generate
> - normal code, probably a memcmp call.
> -
> - OPERANDS[0] is the target (result).
> - OPERANDS[1] is the first source.
> - OPERANDS[2] is the second source.
> - OPERANDS[3] is the length.
> - OPERANDS[4] is the alignment. */
> -bool
> -expand_compare_loop (rtx operands[])
> +static HOST_WIDE_INT
> +get_max_inline_loop_bytes (bool bytes_is_const, int align)
> {
> - rtx target = operands[0];
> - rtx orig_src1 = operands[1];
> - rtx orig_src2 = operands[2];
> - rtx bytes_rtx = operands[3];
> - rtx align_rtx = operands[4];
> -
> - /* This case is complicated to handle because the subtract
> - with carry instructions do not generate the 64-bit
> - carry and so we must emit code to calculate it ourselves.
> - We choose not to implement this yet. */
> - if (TARGET_32BIT && TARGET_POWERPC64)
> - return false;
> -
> - /* Allow non-const length. */
> - int bytes_is_const = CONST_INT_P (bytes_rtx);
> -
> - /* This must be a fixed size alignment. */
> - if (!CONST_INT_P (align_rtx))
> - return false;
> -
> - HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
> - HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
> - HOST_WIDE_INT minalign = MIN (align1, align2);
> -
> - bool isP7 = (rs6000_tune == PROCESSOR_POWER7);
> -
> - gcc_assert (GET_MODE (target) == SImode);
> -
> - /* Anything to move? */
> - HOST_WIDE_INT bytes = 0;
> - if (bytes_is_const)
> - bytes = INTVAL (bytes_rtx);
> -
> - if (bytes_is_const && bytes == 0)
> - return true;
> -
> - /* Limit the amount we compare, if known statically. */
> - HOST_WIDE_INT max_bytes;
> switch (rs6000_tune)
> {
> case PROCESSOR_POWER7:
> if (!bytes_is_const)
> - if (minalign < 8)
> - max_bytes = 0;
> + if (align < 8)
> + return 0;
> else
> - max_bytes = 128;
> + return 128;
> else
> - if (minalign < 8)
> - max_bytes = 32;
> + if (align < 8)
> + return 32;
> else
> - max_bytes = 128;
> + return 128;
> break;
> case PROCESSOR_POWER8:
> if (!bytes_is_const)
> - max_bytes = 0;
> + return 0;
> else
> - if (minalign < 8)
> - max_bytes = 128;
> + if (align < 8)
> + return 128;
> else
> - max_bytes = 64;
> + return 64;
> break;
> case PROCESSOR_POWER9:
> case PROCESSOR_POWER10:
> if (bytes_is_const)
> - max_bytes = 191;
> + return 191;
> else
> - max_bytes = 0;
> + return 0;
> break;
> default:
> - max_bytes = 128;
> + return 128;
> }
> +}
>
> - /* Allow the option to override the default. */
> - if (rs6000_block_compare_inline_loop_limit >= 0)
> - max_bytes = (unsigned HOST_WIDE_INT)
> rs6000_block_compare_inline_loop_limit;
> -
> - if (max_bytes == 0)
> - return false;
> +/* Do the load and compare when remain bytes is less than loop bytes
> + and it's a variable length compare. expand_bytes indicates the
> + maximum bytes needed to be expanded. */
> +static void
> +do_load_compare_rest_of_loop (machine_mode load_mode, rtx src1_addr,
> + rtx src2_addr, rtx cmp_rem, rtx diff,
> + rtx diff_label, rtx *dcond, rtx final_label,
> + rtx orig_src1, rtx orig_src2,
> + HOST_WIDE_INT loop_bytes,
> + HOST_WIDE_INT expand_bytes)
> +{
> + gcc_assert ((TARGET_POWERPC64 && load_mode == DImode)
> + || (!TARGET_POWERPC64 && load_mode == SImode));
> + HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
> + gcc_assert (loop_bytes = 2 * load_mode_size);
> + gcc_assert (expand_bytes < loop_bytes);
>
> - rtx cmp_rem = gen_reg_rtx (word_mode); /* Remainder for library call. */
> - rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop.
> */
> - HOST_WIDE_INT niter;
> - rtx iter = gen_reg_rtx (word_mode);
> - rtx iv1 = gen_reg_rtx (word_mode);
> - rtx iv2 = gen_reg_rtx (word_mode);
> - rtx d1_1 = gen_reg_rtx (word_mode); /* Addr expression src1+iv1 */
> - rtx d1_2 = gen_reg_rtx (word_mode); /* Addr expression src1+iv2 */
> - rtx d2_1 = gen_reg_rtx (word_mode); /* Addr expression src2+iv1 */
> - rtx d2_2 = gen_reg_rtx (word_mode); /* Addr expression src2+iv2 */
> + rtx adj_reg = gen_reg_rtx (word_mode);
> + rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
> + rtx j;
> + rtx cmp;
> + rtx ccreg = gen_reg_rtx (CCmode);
>
> - /* Strip unneeded subreg from length if there is one. */
> - if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
> - bytes_rtx = SUBREG_REG (bytes_rtx);
> - /* Extend bytes_rtx to word_mode if needed. But, we expect only to
> - maybe have to deal with the case were bytes_rtx is SImode and
> - word_mode is DImode. */
> - if (!bytes_is_const)
> + if (TARGET_POWERPC64 && expand_bytes >= 8)
> {
> - if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
> - /* Do not expect length longer than word_mode. */
> - return false;
> - else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE
> (word_mode))
> + /* Compare with 8 bytes. */
> + rtx cmp_4 = gen_label_rtx ();
> + cmp = gen_rtx_COMPARE (CCmode, cmp_rem, GEN_INT (8));
> + emit_insn (gen_rtx_SET (ccreg, cmp));
> + do_ifelse (CCmode, LT, NULL_RTX, NULL_RTX, ccreg, cmp_4,
> + profile_probability::even ());
> + do_load_and_compare (DImode, src1_addr, src2_addr, dcond, diff,
> + orig_src1, orig_src2);
> + do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX,
> + *dcond, diff_label, profile_probability::unlikely ());
> +
> + if (expand_bytes > 8)
> {
> - bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
> - bytes_rtx = force_reg (word_mode,
> - gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
> - bytes_rtx));
> + do_ifelse (CCmode, EQ, NULL_RTX, NULL_RTX, ccreg, final_label,
> + profile_probability::unlikely ());
> +
> + /* cmp_rem is great than 8 bytes. Do 8 bytes overlap compare. */
> + do_add3 (adj_reg, cmp_rem, GEN_INT (-8));
> + do_add3 (src1_addr, src1_addr, adj_reg);
> + do_add3 (src2_addr, src2_addr, adj_reg);
> + do_load_and_compare (DImode, src1_addr, src2_addr, dcond, diff,
> + orig_src1, orig_src2);
> + do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX,
> + *dcond, diff_label, profile_probability::likely ());
> }
> - else
> - /* Make sure it's in a register before we get started. */
> - bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
> - }
> -
> - machine_mode load_mode = word_mode;
> - HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
> -
> - /* Number of bytes per iteration of the unrolled loop. */
> - HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
> - /* max iters and bytes compared in the loop. */
> - HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
> - HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
> - int l2lb = floor_log2 (loop_bytes);
>
> - if (bytes_is_const && (max_bytes < load_mode_size
> - || !IN_RANGE (bytes, load_mode_size, max_bytes)))
> - return false;
> + j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
> + JUMP_LABEL (j) = final_label;
> + LABEL_NUSES (final_label) += 1;
> + emit_barrier ();
>
> - bool no_remainder_code = false;
> - rtx final_label = gen_label_rtx ();
> - rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
> - rtx diff_label = gen_label_rtx ();
> - rtx library_call_label = NULL;
> - rtx cleanup_label = gen_label_rtx ();
> + emit_label (cmp_4);
> + }
>
> - rtx cr;
> + if (expand_bytes >= 4)
> + {
> + /* Compare with 4 bytes. */
> + rtx cmp_2 = gen_label_rtx ();
> + cmp = gen_rtx_COMPARE (CCmode, cmp_rem, GEN_INT (4));
> + emit_insn (gen_rtx_SET (ccreg, cmp));
> + do_ifelse (CCmode, LT, NULL_RTX, NULL_RTX, ccreg, cmp_2,
> + profile_probability::even ());
> + do_load_and_compare (SImode, src1_addr, src2_addr, dcond, diff,
> + orig_src1, orig_src2);
> + do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX,
> + *dcond, diff_label, profile_probability::unlikely ());
> +
> + if (expand_bytes > 4)
> + {
> + do_ifelse (CCmode, EQ, NULL_RTX, NULL_RTX, ccreg, final_label,
> + profile_probability::unlikely ());
> +
> + /* cmp_rem is great than 4 bytes. Do 4 bytes overlap compare. */
> + do_add3 (adj_reg, cmp_rem, GEN_INT (-4));
> + do_add3 (src1_addr, src1_addr, adj_reg);
> + do_add3 (src2_addr, src2_addr, adj_reg);
> + do_load_and_compare (SImode, src1_addr, src2_addr, dcond, diff,
> + orig_src1, orig_src2);
> + do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX,
> + *dcond, diff_label, profile_probability::likely ());
> + }
>
> - rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
> - rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));
> + j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
> + JUMP_LABEL (j) = final_label;
> + LABEL_NUSES (final_label) += 1;
> + emit_barrier ();
>
> - /* Difference found is stored here before jump to diff_label. */
> - rtx diff = gen_reg_rtx (word_mode);
> - rtx_insn *j;
> + emit_label (cmp_2);
> + }
>
> - /* Example of generated code for 35 bytes aligned 1 byte.
> -
> - mtctr 8
> - li 6,0
> - li 5,8
> - .L13:
> - ldbrx 7,3,6
> - ldbrx 9,10,6
> - ldbrx 0,3,5
> - ldbrx 4,10,5
> - addi 6,6,16
> - addi 5,5,16
> - subfc. 9,9,7
> - bne 0,.L10
> - subfc. 9,4,0
> - bdnzt 2,.L13
> - bne 0,.L10
> - add 3,3,6
> - add 10,10,6
> - addi 9,3,-5
> - ldbrx 7,0,9
> - addi 9,10,-5
> - ldbrx 9,0,9
> - subfc 9,9,7
> - .p2align 4,,15
> - .L10:
> - popcntd 9,9
> - subfe 10,10,10
> - or 9,9,10
> -
> - Compiled with -fno-reorder-blocks for clarity. */
> -
> - /* Structure of what we're going to do:
> - Two separate lengths: what we will compare before bailing to library
> - call (max_bytes), and the total length to be checked.
> - if length <= 16, branch to linear cleanup code starting with
> - remainder length check (length not known at compile time)
> - set up 2 iv's and load count reg, compute remainder length
> - unrollx2 compare loop
> - if loop exit due to a difference, branch to difference handling code
> - if remainder length < 8, branch to final cleanup compare
> - load and compare 8B
> - final cleanup comparison (depends on alignment and length)
> - load 8B, shift off bytes past length, compare
> - load 8B ending at last byte and compare
> - load/compare 1 byte at a time (short block abutting 4k boundary)
> - difference handling, 64->32 conversion
> - final result
> - branch around memcmp call
> - memcmp library call
> - */
> -
> - /* If bytes is not const, compare length and branch directly
> - to the cleanup code that can handle 0-16 bytes if length
> - is >= 16. Stash away bytes-max_bytes for the library call. */
> - if (bytes_is_const)
> + if (expand_bytes >= 2)
> {
> - /* These need to be set for some of the places we may jump to. */
> - if (bytes > max_bytes)
> - {
> - no_remainder_code = true;
> - niter = max_loop_iter;
> - library_call_label = gen_label_rtx ();
> - }
> - else
> + /* Compare with 2 bytes. */
> + rtx cmp_1 = gen_label_rtx ();
> + cmp = gen_rtx_COMPARE (CCmode, cmp_rem, GEN_INT (2));
> + emit_insn (gen_rtx_SET (ccreg, cmp));
> + do_ifelse (CCmode, LT, NULL_RTX, NULL_RTX, ccreg, cmp_1,
> + profile_probability::even ());
> + do_load_and_compare (HImode, src1_addr, src2_addr, dcond, diff,
> + orig_src1, orig_src2);
> + do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX,
> + *dcond, diff_label, profile_probability::unlikely ());
> +
> + if (expand_bytes > 2)
> {
> - niter = bytes / loop_bytes;
> + do_ifelse (CCmode, EQ, NULL_RTX, NULL_RTX, ccreg, final_label,
> + profile_probability::unlikely ());
> +
> + /* cmp_rem equals to 3 bytes and leave 1 byte to load and
> + compare. */
> + do_add3 (src1_addr, src1_addr, GEN_INT (2));
> + do_add3 (src2_addr, src2_addr, GEN_INT (2));
> }
> - emit_move_insn (iter, GEN_INT (niter));
> - emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
> - emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
> - }
> - else
> - {
> - library_call_label = gen_label_rtx ();
>
> - /* If we go to the cleanup code, it expects length to be in cmp_rem.
> */
> - emit_move_insn (cmp_rem, bytes_rtx);
> + emit_label (cmp_1);
> + }
>
> - /* Check for > max_bytes bytes. We want to bail out as quickly as
> - possible if we have to go over to memcmp. */
> - do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
> - NULL_RTX, library_call_label, profile_probability::even ());
> + /* Do 1 byte load and compare. */
> + do_load_and_compare (QImode, src1_addr, src2_addr, dcond, diff,
> + orig_src1, orig_src2);
> + do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX,
> + *dcond, diff_label, profile_probability::likely ());
> + j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
> + JUMP_LABEL (j) = final_label;
> + LABEL_NUSES (final_label) += 1;
> + emit_barrier ();
> +}
>
> - /* Check for < loop_bytes bytes. */
> - do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
> - NULL_RTX, cleanup_label, profile_probability::even ());
> +/* Generate code to convert a DImode-plus-carry subtract result into
> + a SImode result that has the same <0 / ==0 / >0 properties to
> + produce the final result from memcmp.
>
> - /* Loop compare bytes and iterations if bytes>max_bytes. */
> - rtx mb_reg = gen_reg_rtx (word_mode);
> - emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
> - rtx mi_reg = gen_reg_rtx (word_mode);
> - emit_move_insn (mi_reg, GEN_INT (max_loop_iter));
> + TARGET is the rtx for the register to receive the memcmp result.
> + SUB_RESULT is the rtx for the register contining the subtract result. */
>
> - /* Compute number of loop iterations if bytes <= max_bytes. */
> - if (word_mode == DImode)
> - emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
> - else
> - emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));
> +static void
> +generate_6432_conversion (rtx target, rtx sub_result)
> +{
> + /* We need to produce DI result from sub, then convert to target SI
> + while maintaining <0 / ==0 / >0 properties. This sequence works:
> + subfc L,A,B
> + subfe H,H,H
> + popcntd L,L
> + rldimi L,H,6,0
>
> - /* Compute bytes to compare in loop if bytes <= max_bytes. */
> - rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
> - if (word_mode == DImode)
> - {
> - emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
> - }
> - else
> - {
> - emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));
> - }
> + This is an alternate one Segher cooked up if somebody
> + wants to expand this for something that doesn't have popcntd:
> + subfc L,a,b
> + subfe H,x,x
> + addic t,L,-1
> + subfe v,t,L
> + or z,v,H
>
> - /* Check for bytes <= max_bytes. */
> - if (TARGET_ISEL)
> - {
> - /* P9 has fast isel so we use one compare and two isel. */
> - cr = gen_reg_rtx (CCmode);
> - rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
> - GEN_INT (max_bytes));
> - emit_move_insn (cr, compare_rtx);
> - rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
> - do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
> - do_isel (iter, cmp_rtx, iter, mi_reg, cr);
> - }
> - else
> - {
> - rtx lab_after = gen_label_rtx ();
> - do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
> - NULL_RTX, lab_after, profile_probability::even ());
> - emit_move_insn (loop_cmp, mb_reg);
> - emit_move_insn (iter, mi_reg);
> - emit_label (lab_after);
> - }
> + And finally, p9 can just do this:
> + cmpld A,B
> + setb r
> + . */
>
> - /* Now compute remainder bytes which isn't used until after the loop.
> */
> - do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
> + if (TARGET_64BIT)
> + {
> + rtx tmp_reg_ca = gen_reg_rtx (DImode);
> + emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
> + rtx popcnt = gen_reg_rtx (DImode);
> + emit_insn (gen_popcntddi2 (popcnt, sub_result));
> + rtx tmp2 = gen_reg_rtx (DImode);
> + emit_insn (gen_iordi3 (tmp2, popcnt, tmp_reg_ca));
> + emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp2)));
> + }
> + else
> + {
> + rtx tmp_reg_ca = gen_reg_rtx (SImode);
> + emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
> + rtx popcnt = gen_reg_rtx (SImode);
> + emit_insn (gen_popcntdsi2 (popcnt, sub_result));
> + emit_insn (gen_iorsi3 (target, popcnt, tmp_reg_ca));
> }
> +}
>
> - rtx dcond = NULL_RTX; /* Used for when we jump to diff_label. */
> - /* For p9 we need to have just one of these as multiple places define
> - it and it gets used by the setb at the end. */
> +/* Generate the return value when memcmp finds a difference from
> + compare. */
> +static void
> +gen_diff_handle (rtx target, rtx dcond, rtx diff, rtx diff_label,
> + rtx final_label)
> +{
> + emit_label (diff_label);
> if (TARGET_P9_MISC)
> - dcond = gen_reg_rtx (CCUNSmode);
> + emit_insn (gen_setb_unsigned (target, dcond));
> + else
> + generate_6432_conversion (target, diff);
>
> - if (!bytes_is_const || bytes >= loop_bytes)
> - {
> - /* It should not be possible to come here if remaining bytes is
> - < 16 in the runtime case either. Compute number of loop
> - iterations. We compare 2*word_mode per iteration so 16B for
> - 64-bit code and 8B for 32-bit. Set up two induction
> - variables and load count register. */
> -
> - /* HACK ALERT: create hard reg for CTR here. If we just use a
> - pseudo, cse will get rid of it and then the allocator will
> - see it used in the lshr above and won't give us ctr. */
> - rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
> - emit_move_insn (ctr, iter);
> - emit_move_insn (diff, GEN_INT (0));
> - emit_move_insn (iv1, GEN_INT (0));
> - emit_move_insn (iv2, GEN_INT (load_mode_size));
> -
> - /* inner loop to compare 2*word_mode */
> - rtx loop_top_label = gen_label_rtx ();
> - emit_label (loop_top_label);
> -
> - rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
> - rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);
> -
> - do_load_for_compare_from_addr (load_mode, d1_1,
> - src1_ix1, orig_src1);
> - do_load_for_compare_from_addr (load_mode, d2_1,
> - src2_ix1, orig_src2);
> - do_add3 (iv1, iv1, GEN_INT (loop_bytes));
> -
> - rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
> - rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);
> -
> - do_load_for_compare_from_addr (load_mode, d1_2,
> - src1_ix2, orig_src1);
> - do_load_for_compare_from_addr (load_mode, d2_2,
> - src2_ix2, orig_src2);
> - do_add3 (iv2, iv2, GEN_INT (loop_bytes));
> -
> - if (TARGET_P9_MISC)
> - {
> - /* Generate a compare, and convert with a setb later. */
> - rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
> - emit_insn (gen_rtx_SET (dcond, cmp));
> - }
> - else
> - {
> - dcond = gen_reg_rtx (CCmode);
> - if (word_mode == DImode)
> - emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
> - else
> - emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
> - }
> + rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
> + rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
> + JUMP_LABEL (j) = final_label;
> + LABEL_NUSES (final_label) += 1;
> + emit_barrier ();
> +}
>
> - do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
> - dcond, diff_label, profile_probability::unlikely ());
> +static void
> +gen_load_compare_loop (machine_mode load_mode, rtx src1_addr, rtx src2_addr,
> + rtx orig_src1, rtx orig_src2, rtx diff, rtx diff_label,
> + rtx cmp_rem, rtx *dcond, HOST_WIDE_INT loop_bytes,
> + rtx final_label, bool bytes_is_const)
> +{
> + HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
> + gcc_assert (loop_bytes == 2 * load_mode_size);
>
> - if (TARGET_P9_MISC)
> - {
> - /* Generate a compare, and convert with a setb later. */
> - rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
> - emit_insn (gen_rtx_SET (dcond, cmp));
> - }
> - else
> - {
> - dcond = gen_reg_rtx (CCmode);
> - if (word_mode == DImode)
> - emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
> - else
> - emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
> - }
> + rtx iter = gen_reg_rtx (word_mode);
>
> - rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
> - if (TARGET_64BIT)
> - j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
> - eqrtx, dcond));
> - else
> - j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
> - eqrtx, dcond));
> - add_reg_br_prob_note (j, profile_probability::likely ());
> - JUMP_LABEL (j) = loop_top_label;
> - LABEL_NUSES (loop_top_label) += 1;
> - }
> + int l2lb = floor_log2 (loop_bytes);
> + if (word_mode == DImode)
> + emit_insn (gen_lshrdi3 (iter, cmp_rem, GEN_INT (l2lb)));
> + else
> + emit_insn (gen_lshrsi3 (iter, cmp_rem, GEN_INT (l2lb)));
>
> - HOST_WIDE_INT bytes_remaining = 0;
> - if (bytes_is_const)
> - bytes_remaining = (bytes % loop_bytes);
> + rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
> + emit_move_insn (ctr, iter);
>
> - /* If diff is nonzero, branch to difference handling
> - code. If we exit here with a nonzero diff, it is
> - because the second word differed. */
> - if (TARGET_P9_MISC)
> - do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond,
> - diff_label, profile_probability::unlikely ());
> + rtx iv1 = gen_reg_rtx (word_mode);
> + rtx iv2 = gen_reg_rtx (word_mode);
> + rtx d1_1 = gen_reg_rtx (word_mode); /* Addr expression src1+iv1. */
> + rtx d1_2 = gen_reg_rtx (word_mode); /* Addr expression src1+iv2. */
> + rtx d2_1 = gen_reg_rtx (word_mode); /* Addr expression src2+iv1. */
> + rtx d2_2 = gen_reg_rtx (word_mode); /* Addr expression src2+iv2. */
> +
> + emit_move_insn (iv1, GEN_INT (0));
> + emit_move_insn (iv2, GEN_INT (load_mode_size));
> +
> + rtx loop_top_label = gen_label_rtx ();
> + emit_label (loop_top_label);
> +
> + /* Manually put two pairs of loads together. */
> + rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
> + rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);
> + do_load_for_compare_from_addr (load_mode, d1_1, src1_ix1, orig_src1);
> + do_load_for_compare_from_addr (load_mode, d2_1, src2_ix1, orig_src2);
> + rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
> + rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);
> + do_load_for_compare_from_addr (load_mode, d1_2, src1_ix2, orig_src1);
> + do_load_for_compare_from_addr (load_mode, d2_2, src2_ix2, orig_src2);
> +
> + do_reg_compare (false, NULL_RTX, diff, dcond, d1_1, d2_1);
> + do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX,
> + *dcond, diff_label, profile_probability::unlikely ());
> + do_reg_compare (false, NULL_RTX, diff, dcond, d1_2, d2_2);
> +
> + do_add3 (iv1, iv1, GEN_INT (loop_bytes));
> + do_add3 (iv2, iv2, GEN_INT (loop_bytes));
> +
> + rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
> + rtx_insn *j;
> + if (TARGET_64BIT)
> + j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
> + eqrtx, *dcond));
> else
> - do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX,
> - diff_label, profile_probability::unlikely ());
> + j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
> + eqrtx, *dcond));
> + add_reg_br_prob_note (j, profile_probability::likely ());
> + JUMP_LABEL (j) = loop_top_label;
> + LABEL_NUSES (loop_top_label) += 1;
>
> - if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
> - {
> - /* If the length is known at compile time, then we will always
> - have a remainder to go to the library call with. */
> - rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode,
> library_call_label);
> - j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
> - JUMP_LABEL (j) = library_call_label;
> - LABEL_NUSES (library_call_label) += 1;
> - emit_barrier ();
> - }
> + do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX, *dcond,
> + diff_label, profile_probability::unlikely ());
>
> - if (bytes_is_const && bytes_remaining == 0)
> + /* If length is fixed, we know how many bytes are left. So skip the
> + remaining bytes test. */
> + if (!bytes_is_const)
> {
> - /* No remainder and if we are here then diff is 0 so just return 0 */
> - if (TARGET_64BIT)
> - emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
> - else
> - emit_move_insn (target, diff);
> - j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
> - JUMP_LABEL (j) = final_label;
> - LABEL_NUSES (final_label) += 1;
> - emit_barrier ();
> + do_sub3 (cmp_rem, cmp_rem, iv1);
> + do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX,
> + final_label, profile_probability::unlikely ());
> }
> - else if (!no_remainder_code)
> - {
> - /* Update addresses to point to the next word to examine. */
> - do_add3 (src1_addr, src1_addr, iv1);
> - do_add3 (src2_addr, src2_addr, iv1);
> -
> - emit_label (cleanup_label);
>
> - if (!bytes_is_const)
> - {
> - /* If we're dealing with runtime length, we have to check if
> - it's zero after the loop. When length is known at compile
> - time the no-remainder condition is dealt with above. By
> - doing this after cleanup_label, we also deal with the
> - case where length is 0 at the start and we bypass the
> - loop with a branch to cleanup_label. */
> - emit_move_insn (target, const0_rtx);
> - do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
> - NULL_RTX, final_label, profile_probability::unlikely ());
> - }
> -
> - rtx final_cleanup = gen_label_rtx ();
> - rtx cmp_rem_before = gen_reg_rtx (word_mode);
> - /* Compare one more word_mode chunk if needed. */
> - if (!bytes_is_const || bytes_remaining >= load_mode_size)
> - {
> - /* If remainder length < word length, branch to final
> - cleanup compare. */
> -
> - if (!bytes_is_const)
> - {
> - do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
> - NULL_RTX, final_cleanup, profile_probability::even ());
> - }
> + do_add3 (src1_addr, src1_addr, iv1);
> + do_add3 (src2_addr, src2_addr, iv1);
> +}
>
> - /* load and compare 8B */
> - do_load_for_compare_from_addr (load_mode, d1_1,
> - src1_addr, orig_src1);
> - do_load_for_compare_from_addr (load_mode, d2_1,
> - src2_addr, orig_src2);
> +/* Generate memcmp library call. */
> +static void
> +gen_library_call (rtx target, rtx src1_addr, rtx src2_addr, rtx bytes_rtx,
> + rtx library_label)
> +{
> + emit_label (library_label);
> +
> + rtx len_rtx = gen_reg_rtx (word_mode);
> + emit_move_insn (len_rtx, bytes_rtx);
> + tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
> + emit_library_call_value (XEXP (DECL_RTL (fun), 0),
> + target, LCT_NORMAL, GET_MODE (target),
> + src1_addr, Pmode, src2_addr, Pmode,
> + len_rtx, GET_MODE (len_rtx));
> +}
>
> - /* Compare the word, see if we need to do the last partial. */
> - if (TARGET_P9_MISC)
> - {
> - /* Generate a compare, and convert with a setb later. */
> - rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
> - emit_insn (gen_rtx_SET (dcond, cmp));
> - }
> - else
> - {
> - dcond = gen_reg_rtx (CCmode);
> - if (word_mode == DImode)
> - emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
> - else
> - emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
> - }
> +static bool
> +expand_compare_with_fixed_length (rtx operands[])
> +{
> + rtx target = operands[0];
> + rtx orig_src1 = operands[1];
> + rtx orig_src2 = operands[2];
> + rtx bytes_rtx = operands[3];
> + rtx align_rtx = operands[4];
>
> - do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
> - dcond, diff_label, profile_probability::even ());
> + gcc_assert (CONST_INT_P (bytes_rtx));
> + gcc_assert (GET_MODE (target) == SImode);
>
> - do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
> - do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
> - emit_move_insn (cmp_rem_before, cmp_rem);
> - do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
> - if (bytes_is_const)
> - bytes_remaining -= load_mode_size;
> - else
> - /* See if remaining length is now zero. We previously set
> - target to 0 so we can just jump to the end. */
> - do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX,
> - final_label, profile_probability::unlikely ());
> - }
> + if (TARGET_32BIT && TARGET_POWERPC64)
> + return false;
>
> - /* Cases:
> - bytes_is_const
> - We can always shift back to do an overlapping compare
> - of the last chunk because we know length >= 8.
> -
> - !bytes_is_const
> - align>=load_mode_size
> - Read word_mode and mask
> - align<load_mode_size
> - avoid stepping past end
> -
> - Three strategies:
> - * decrement address and do overlapping compare
> - * read word_mode and mask
> - * carefully avoid crossing 4k boundary
> - */
> -
> - if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
> - && align1 >= load_mode_size && align2 >= load_mode_size)
> - {
> - /* Alignment is larger than word_mode so we do not need to be
> - concerned with extra page crossings. But, we do not know
> - that the length is larger than load_mode_size so we might
> - end up compareing against data before the block if we try
> - an overlapping compare. Also we use this on P7 for fixed length
> - remainder because P7 doesn't like overlapping unaligned.
> - Strategy: load 8B, shift off bytes past length, and compare. */
> - emit_label (final_cleanup);
> - do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
> - src1_addr, src2_addr, orig_src1, orig_src2);
> - }
> - else if (bytes_remaining && bytes_is_const)
> - {
> - /* We do not do loop expand if length < 32 so we know at the
> - end we can do an overlapping compare.
> - Strategy: shift address back and do word_mode load that
> - ends at the end of the block. */
> - emit_label (final_cleanup);
> - do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
> - cmp_rem, dcond, src1_addr, src2_addr,
> - orig_src1, orig_src2);
> - }
> - else if (!bytes_is_const)
> - {
> - rtx handle4k_label = gen_label_rtx ();
> - rtx nonconst_overlap = gen_label_rtx ();
> - emit_label (nonconst_overlap);
> -
> - /* Here we have to handle the case where whe have runtime
> - length which may be too short for overlap compare, and
> - alignment is not at least load_mode_size so we have to
> - tread carefully to avoid stepping across 4k boundaries. */
> -
> - /* If the length after the loop was larger than word_mode
> - size, we can just do an overlapping compare and we're
> - done. We fall through to this code from the word_mode
> - compare that preceeds this. */
> - do_overlap_load_compare (load_mode, false, 0, diff,
> - cmp_rem, dcond, src1_addr, src2_addr,
> - orig_src1, orig_src2);
> -
> - rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
> - j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
> - JUMP_LABEL (j) = diff_label;
> - LABEL_NUSES (diff_label) += 1;
> - emit_barrier ();
> -
> - /* If we couldn't do the overlap compare we have to be more
> - careful of the 4k boundary. Test to see if either
> - address is less than word_mode_size away from a 4k
> - boundary. If not, then we can do a load/shift/compare
> - and we are done. We come to this code if length was less
> - than word_mode_size. */
> -
> - emit_label (final_cleanup);
> -
> - /* We can still avoid the slow case if the length was larger
> - than one loop iteration, in which case go do the overlap
> - load compare path. */
> - do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
> - NULL_RTX, nonconst_overlap, profile_probability::even ());
> -
> - rtx rem4k = gen_reg_rtx (word_mode);
> - rtx dist1 = gen_reg_rtx (word_mode);
> - rtx dist2 = gen_reg_rtx (word_mode);
> - do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
> - if (word_mode == SImode)
> - emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
> - else
> - emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
> - do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX,
> - handle4k_label, profile_probability::very_unlikely ());
> - if (word_mode == SImode)
> - emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
> - else
> - emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
> - do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX,
> - handle4k_label, profile_probability::very_unlikely ());
> + /* This must be a fixed size alignment. */
> + if (!CONST_INT_P (align_rtx))
> + return false;
>
> - /* We don't have a 4k boundary to deal with, so do
> - a load/shift/compare and jump to diff. */
> + HOST_WIDE_INT align = INTVAL (align_rtx) / BITS_PER_UNIT;
> + HOST_WIDE_INT bytes = INTVAL (bytes_rtx);
>
> - do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
> - src1_addr, src2_addr, orig_src1, orig_src2);
> + if (bytes == 0)
> + return true;
>
> - j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
> - JUMP_LABEL (j) = diff_label;
> - LABEL_NUSES (diff_label) += 1;
> - emit_barrier ();
> + /* Limit the amount we compare, if known statically. */
> + HOST_WIDE_INT max_bytes = get_max_inline_loop_bytes (true, align);
>
> - /* Finally in the unlikely case we are inching up to a
> - 4k boundary we use a compact lbzx/compare loop to do
> - it a byte at a time. */
> + /* Allow the option to override the default. */
> + if (rs6000_block_compare_inline_loop_limit >= 0)
> + max_bytes = (unsigned HOST_WIDE_INT)
> rs6000_block_compare_inline_loop_limit;
>
> - emit_label (handle4k_label);
> + if (max_bytes == 0)
> + return false;
>
> - rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
> - emit_move_insn (ctr, cmp_rem);
> - rtx ixreg = gen_reg_rtx (Pmode);
> - emit_move_insn (ixreg, const0_rtx);
> + machine_mode load_mode = word_mode;
> + HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
>
> - rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
> - rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
> - rtx d1 = gen_reg_rtx (word_mode);
> - rtx d2 = gen_reg_rtx (word_mode);
> + if (max_bytes < load_mode_size
> + || !IN_RANGE (bytes, load_mode_size, max_bytes))
> + return false;
>
> - rtx fc_loop = gen_label_rtx ();
> - emit_label (fc_loop);
> + /* Remainder bytes for compare. */
> + rtx cmp_rem = gen_reg_rtx (word_mode);
> + /* Number of bytes per iteration of the unrolled loop. */
> + HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
>
> - do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
> - do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);
> + rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
> + rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));
>
> - do_add3 (ixreg, ixreg, const1_rtx);
> + /* Label for set target when finding a diff. */
> + rtx diff_label = gen_label_rtx ();
> + rtx final_label = gen_label_rtx ();
>
> - rtx cond = gen_reg_rtx (CCmode);
> - rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
> - rs6000_emit_dot_insn (diff, subexpr, 2, cond);
> + /* CC used for when we jump to diff_label. */
> + rtx dcond = NULL_RTX;
> + /* For p9 we need to have just one of these as multiple places define
> + it and it gets used by the setb at the end. */
> + if (TARGET_P9_MISC)
> + dcond = gen_reg_rtx (CCUNSmode);
>
> - rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
> - if (TARGET_64BIT)
> - j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
> - eqrtx, cond));
> - else
> - j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
> - eqrtx, cond));
> - add_reg_br_prob_note (j, profile_probability::likely ());
> - JUMP_LABEL (j) = fc_loop;
> - LABEL_NUSES (fc_loop) += 1;
> -
> - if (TARGET_64BIT)
> - emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
> - else
> - emit_move_insn (target, diff);
> -
> - /* Since we are comparing bytes, the difference can be used
> - as the final result and we are done here. */
> - j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
> - JUMP_LABEL (j) = final_label;
> - LABEL_NUSES (final_label) += 1;
> - emit_barrier ();
> - }
> - }
> + /* Difference found is stored here before jump to diff_label. */
> + rtx diff = gen_reg_rtx (word_mode);
>
> - emit_label (diff_label);
> - /* difference handling, 64->32 conversion */
> + emit_move_insn (cmp_rem, GEN_INT (bytes));
> + emit_move_insn (target, const0_rtx);
>
> - /* We need to produce DI result from sub, then convert to target SI
> - while maintaining <0 / ==0 / >0 properties. This sequence works:
> - subfc L,A,B
> - subfe H,H,H
> - popcntd L,L
> - rldimi L,H,6,0
> + gen_load_compare_loop (load_mode, src1_addr, src2_addr, orig_src1,
> + orig_src2, diff, diff_label, cmp_rem, &dcond,
> + loop_bytes, final_label, true);
>
> - This is an alternate one Segher cooked up if somebody
> - wants to expand this for something that doesn't have popcntd:
> - subfc L,a,b
> - subfe H,x,x
> - addic t,L,-1
> - subfe v,t,L
> - or z,v,H
> + HOST_WIDE_INT rem_bytes = bytes % loop_bytes;
>
> - And finally, p9 can just do this:
> - cmpld A,B
> - setb r */
> -
> - if (TARGET_P9_MISC)
> - emit_insn (gen_setb_unsigned (target, dcond));
> - else
> + if (rem_bytes >= load_mode_size)
> {
> - if (TARGET_64BIT)
> - {
> - rtx tmp_reg_ca = gen_reg_rtx (DImode);
> - emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
> - emit_insn (gen_popcntddi2 (diff, diff));
> - emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
> - emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
> - }
> - else
> - {
> - rtx tmp_reg_ca = gen_reg_rtx (SImode);
> - emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
> - emit_insn (gen_popcntdsi2 (diff, diff));
> - emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
> - }
> + do_load_and_compare (load_mode, src1_addr, src2_addr, &dcond, diff,
> + orig_src1, orig_src2);
> + do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
> + dcond, diff_label, profile_probability::unlikely ());
> + rem_bytes -= load_mode_size;
> }
>
> - if (library_call_label != NULL)
> + if (rem_bytes > 0)
> {
> - /* Branch around memcmp call. */
> - j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
> - JUMP_LABEL (j) = final_label;
> - LABEL_NUSES (final_label) += 1;
> - emit_barrier ();
> -
> - /* Make memcmp library call. cmp_rem is the remaining bytes that
> - were compared and cmp_rem is the expected amount to be compared
> - by memcmp. If we don't find a difference in the loop compare, do
> - the library call directly instead of doing a small compare just
> - to get to an arbitrary boundary before calling it anyway.
> - Also, update addresses to point to the next word to examine. */
> - emit_label (library_call_label);
> -
> - rtx len_rtx = gen_reg_rtx (word_mode);
> - if (bytes_is_const)
> - {
> - emit_move_insn (len_rtx, cmp_rem);
> - do_add3 (src1_addr, src1_addr, iv1);
> - do_add3 (src2_addr, src2_addr, iv1);
> - }
> - else
> - emit_move_insn (len_rtx, bytes_rtx);
> -
> - tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
> - emit_library_call_value (XEXP (DECL_RTL (fun), 0),
> - target, LCT_NORMAL, GET_MODE (target),
> - src1_addr, Pmode,
> - src2_addr, Pmode,
> - len_rtx, GET_MODE (len_rtx));
> + do_overlap_load_compare (load_mode, rem_bytes, diff, &dcond,
> + orig_src1, orig_src2, bytes);
> + do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
> + dcond, diff_label, profile_probability::unlikely ());
> }
>
> - /* emit final_label */
> + rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
> + rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
> + JUMP_LABEL (j) = final_label;
> + LABEL_NUSES (final_label) += 1;
> + emit_barrier ();
> +
> + gen_diff_handle (target, dcond, diff, diff_label, final_label);
> emit_label (final_label);
> +
> return true;
> }
>
> -/* Generate code to convert a DImode-plus-carry subtract result into
> - a SImode result that has the same <0 / ==0 / >0 properties to
> - produce the final result from memcmp.
> +static bool
> +expand_compare_with_variable_length (rtx operands[])
> +{
> + rtx target = operands[0];
> + rtx orig_src1 = operands[1];
> + rtx orig_src2 = operands[2];
> + rtx bytes_rtx = operands[3];
> + rtx align_rtx = operands[4];
>
> - TARGET is the rtx for the register to receive the memcmp result.
> - SUB_RESULT is the rtx for the register contining the subtract result. */
> + gcc_assert (!CONST_INT_P (bytes_rtx));
>
> -void
> -generate_6432_conversion(rtx target, rtx sub_result)
> -{
> - /* We need to produce DI result from sub, then convert to target SI
> - while maintaining <0 / ==0 / >0 properties. This sequence works:
> - subfc L,A,B
> - subfe H,H,H
> - popcntd L,L
> - rldimi L,H,6,0
> + if (TARGET_32BIT && TARGET_POWERPC64)
> + return false;
>
> - This is an alternate one Segher cooked up if somebody
> - wants to expand this for something that doesn't have popcntd:
> - subfc L,a,b
> - subfe H,x,x
> - addic t,L,-1
> - subfe v,t,L
> - or z,v,H
> + /* This must be a fixed size alignment. */
> + if (!CONST_INT_P (align_rtx))
> + return false;
>
> - And finally, p9 can just do this:
> - cmpld A,B
> - setb r */
> + HOST_WIDE_INT align = INTVAL (align_rtx) / BITS_PER_UNIT;
>
> - if (TARGET_64BIT)
> + /* Limit the amount we compare, if known statically. */
> + HOST_WIDE_INT max_bytes = get_max_inline_loop_bytes (false, align);
> +
> + /* Allow the option to override the default. */
> + if (rs6000_block_compare_inline_loop_limit >= 0)
> + max_bytes = rs6000_block_compare_inline_loop_limit;
> + if (max_bytes == 0)
> + return false;
> +
> + /* Remainder bytes for compare. */
> + rtx cmp_rem = gen_reg_rtx (word_mode);
> +
> + /* Strip unneeded subreg from length if there is one. */
> + if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
> + bytes_rtx = SUBREG_REG (bytes_rtx);
> +
> + if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
> + /* Do not expect length longer than word_mode. */
> + return false;
> + else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
> {
> - rtx tmp_reg_ca = gen_reg_rtx (DImode);
> - emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
> - rtx popcnt = gen_reg_rtx (DImode);
> - emit_insn (gen_popcntddi2 (popcnt, sub_result));
> - rtx tmp2 = gen_reg_rtx (DImode);
> - emit_insn (gen_iordi3 (tmp2, popcnt, tmp_reg_ca));
> - emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp2)));
> + bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
> + bytes_rtx = force_reg (word_mode,
> + gen_rtx_fmt_e (ZERO_EXTEND, word_mode, bytes_rtx));
> }
> else
> + /* Make sure it's in a register before we get started. */
> + bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
> +
> + machine_mode load_mode = word_mode;
> + HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
> + rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
> + rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));
> +
> + rtx library_call_label = gen_label_rtx ();
> +
> + /* Call library if length > max_bytes. */
> + do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
> + NULL_RTX, library_call_label, profile_probability::unlikely ());
> +
> + /* Number of bytes per iteration of the unrolled loop. */
> + HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
> +
> + /* Label for set target when finding a diff. */
> + rtx diff_label = gen_label_rtx ();
> + rtx final_label = gen_label_rtx ();
> +
> + /* CC used for when we jump to diff_label. */
> + rtx dcond = NULL_RTX;
> + /* For p9 we need to have just one of these as multiple places define
> + it and it gets used by the setb at the end. */
> + if (TARGET_P9_MISC)
> + dcond = gen_reg_rtx (CCUNSmode);
> +
> + /* Difference found is stored here before jump to diff_label. */
> + rtx diff = gen_reg_rtx (word_mode);
> +
> + emit_move_insn (target, const0_rtx);
> + emit_move_insn (cmp_rem, bytes_rtx);
> +
> + /* Number of bytes to be handled by the expanded rest-of-loop compare. */
> + HOST_WIDE_INT expand_bytes = max_bytes % loop_bytes;
> + if (max_bytes >= loop_bytes)
> {
> - rtx tmp_reg_ca = gen_reg_rtx (SImode);
> - emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
> - rtx popcnt = gen_reg_rtx (SImode);
> - emit_insn (gen_popcntdsi2 (popcnt, sub_result));
> - emit_insn (gen_iorsi3 (target, popcnt, tmp_reg_ca));
> + rtx compare_rest_label = gen_label_rtx ();
> + do_ifelse (CCmode, LT, cmp_rem, GEN_INT (loop_bytes), NULL_RTX,
> + compare_rest_label, profile_probability::unlikely ());
> + /* max_bytes >= cmp_rem >= loop_bytes. */
> + gen_load_compare_loop (load_mode, src1_addr, src2_addr, orig_src1,
> + orig_src2, diff, diff_label, cmp_rem, &dcond,
> + loop_bytes, final_label, false);
> + emit_label (compare_rest_label);
> + expand_bytes = loop_bytes - 1;
> }
> +
> + /* cmp_rem < loop_bytes. */
> + do_load_compare_rest_of_loop (load_mode, src1_addr, src2_addr, cmp_rem,
> + diff, diff_label, &dcond, final_label,
> + orig_src1, orig_src2, loop_bytes,
> + expand_bytes);
> +
> + gen_diff_handle (target, dcond, diff, diff_label, final_label);
> + gen_library_call (target, src1_addr, src2_addr, bytes_rtx,
> + library_call_label);
> + emit_label (final_label);
> +
> + return true;
> }
>
> /* Generate memcmp expansion using in-line non-loop GPR instructions.
> @@ -1975,7 +1702,7 @@ expand_block_compare (rtx operands[])
> /* If this is not a fixed size compare, try generating loop code and
> if that fails just call memcmp. */
> if (!CONST_INT_P (bytes_rtx))
> - return expand_compare_loop (operands);
> + return expand_compare_with_variable_length (operands);
>
> /* This must be a fixed size alignment. */
> if (!CONST_INT_P (align_rtx))
> @@ -2016,7 +1743,7 @@ expand_block_compare (rtx operands[])
> max_bytes = ((max_bytes + 1) / 2) - 1;
>
> if (!IN_RANGE (bytes, 1, max_bytes))
> - return expand_compare_loop (operands);
> + return expand_compare_with_fixed_length (operands);
>
> rtx final_label = NULL;
>
> @@ -2069,7 +1796,7 @@ expand_block_compare (rtx operands[])
> if (TARGET_P9_MISC)
> emit_insn (gen_setb_unsigned (target, cond));
> else
> - generate_6432_conversion(target, sub_result);
> + generate_6432_conversion (target, sub_result);
> }
> }
>
> diff --git a/gcc/testsuite/gcc.target/powerpc/block-cmp-5.c
> b/gcc/testsuite/gcc.target/powerpc/block-cmp-5.c
> new file mode 100644
> index 00000000000..60a38030784
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/block-cmp-5.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mblock-compare-inline-loop-limit=1" } */
> +/* { dg-final { scan-assembler-not {\mbdnzt\M} } } */
> +
> +/* Test that no loop will be generated when the inline loop limit is less
> + than the loop bytes (2 * word_mode_size). */
> +
> +int foo (const char* s1, const char* s2, int l)
> +{
> + return __builtin_memcmp (s1, s2, l);
> +}
> diff --git a/gcc/testsuite/gcc.target/powerpc/block-cmp-6.c
> b/gcc/testsuite/gcc.target/powerpc/block-cmp-6.c
> new file mode 100644
> index 00000000000..0e03f2af943
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/block-cmp-6.c
> @@ -0,0 +1,5 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mblock-compare-inline-loop-limit=4" } */
> +/* { dg-timeout-factor 2 } */
> +
> +#include "../../gcc.dg/memcmp-1.c"
> diff --git a/gcc/testsuite/gcc.target/powerpc/block-cmp-7.c
> b/gcc/testsuite/gcc.target/powerpc/block-cmp-7.c
> new file mode 100644
> index 00000000000..499f5faee17
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/block-cmp-7.c
> @@ -0,0 +1,5 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mblock-compare-inline-loop-limit=32" } */
> +/* { dg-timeout-factor 2 } */
> +
> +#include "../../gcc.dg/memcmp-1.c"
BR,
Kewen