Just so there is some record of what I did here -- in order to backport the gpr strncmp expansion improvement patch to gcc 8 I had to pull in some pieces of an earlier cleanup patch from June of this year.
/* Emit a cmpb of the proper mode for DEST.
*/ +static void +do_rotl3 (rtx dest, rtx src1, rtx src2) +{ + if (GET_MODE (dest) == DImode) + emit_insn (gen_rotldi3 (dest, src1, src2)); + else + emit_insn (gen_rotlsi3 (dest, src1, src2)); +} + /* Generate rtl for a load, shift, and compare of less than a full word. LOAD_MODE is the machine mode for the loads. @@ -640,7 +688,7 @@ { if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode)) /* Do not expect length longer than word_mode. */ - return false; + return false; else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode)) { bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx); @@ -684,7 +732,7 @@ rtx j; /* Example of generated code for 35 bytes aligned 1 byte. - + mtctr 8 li 6,0 li 5,8 @@ -712,7 +760,7 @@ popcntd 9,9 subfe 10,10,10 or 9,9,10 - + Compiled with -fno-reorder-blocks for clarity. */ /* Structure of what we're going to do: @@ -955,7 +1003,7 @@ if (!bytes_is_const) { /* If we're dealing with runtime length, we have to check if - it's zero after the loop. When length is known at compile + it's zero after the loop. When length is known at compile time the no-remainder condition is dealt with above. By doing this after cleanup_label, we also deal with the case where length is 0 at the start and we bypass the @@ -1325,7 +1373,7 @@ rtx tmp_reg_src1 = gen_reg_rtx (word_mode); rtx tmp_reg_src2 = gen_reg_rtx (word_mode); /* P7/P8 code uses cond for subfc. but P9 uses - it for cmpld which needs CCUNSmode. */ + it for cmpld which needs CCUNSmode. */ rtx cond; if (TARGET_P9_MISC) cond = gen_reg_rtx (CCUNSmode); @@ -1578,7 +1626,7 @@ emit_label (convert_label); /* We need to produce DI result from sub, then convert to target SI - while maintaining <0 / ==0 / >0 properties. This sequence works: + while maintaining <0 / ==0 / >0 properties. 
This sequence works: subfc L,A,B subfe H,H,H popcntd L,L @@ -1847,6 +1895,9 @@ rtx tmp_reg_src1 = gen_reg_rtx (word_mode); rtx tmp_reg_src2 = gen_reg_rtx (word_mode); + rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0)); + rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0)); + /* Generate sequence of ld/ldbrx, cmpb to compare out to the length specified. */ unsigned HOST_WIDE_INT bytes_to_compare = compare_length; @@ -1853,12 +1904,9 @@ while (bytes_to_compare > 0) { /* Compare sequence: - check each 8B with: ld/ld cmpd bne - If equal, use rldicr/cmpb to check for zero byte. + check each 8B with: ld/ld/cmpb/cmpb/orc./bne + cleanup code at end: - cmpb get byte that differs - cmpb look for zero byte - orc combine cntlzd get bit of first zero/diff byte subfic convert for rldcl use rldcl rldcl extract diff/zero byte @@ -1895,64 +1943,54 @@ rid of the extra bytes. */ cmp_bytes = bytes_to_compare; - src1 = adjust_address (orig_src1, load_mode, offset); - src2 = adjust_address (orig_src2, load_mode, offset); + rtx offset_rtx; + if (BYTES_BIG_ENDIAN || TARGET_AVOID_XFORM) + offset_rtx = GEN_INT (offset); + else + { + offset_rtx = gen_reg_rtx (Pmode); + emit_move_insn (offset_rtx, GEN_INT (offset)); + } + rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_rtx); + rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_rtx); + do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, orig_src1); + do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, orig_src2); - if (!REG_P (XEXP (src1, 0))) - { - rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0)); - src1 = replace_equiv_address (src1, src1_reg); - } - set_mem_size (src1, load_mode_size); - - if (!REG_P (XEXP (src2, 0))) - { - rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0)); - src2 = replace_equiv_address (src2, src2_reg); - } - set_mem_size (src2, load_mode_size); - - do_load_for_compare (tmp_reg_src1, src1, load_mode); - do_load_for_compare (tmp_reg_src2, src2, load_mode); - /* We must always left-align the 
data we read, and clear any bytes to the right that are beyond the string. Otherwise the cmpb sequence won't produce the correct - results. The beginning of the compare will be done - with word_mode so will not have any extra shifts or - clear rights. */ + results. However if there is only one byte left, we + can just subtract to get the final result so the shifts + and clears are not needed. */ - if (load_mode_size < word_mode_size) - { - /* Rotate left first. */ - rtx sh = GEN_INT (BITS_PER_UNIT * (word_mode_size - load_mode_size)); - if (word_mode == DImode) - { - emit_insn (gen_rotldi3 (tmp_reg_src1, tmp_reg_src1, sh)); - emit_insn (gen_rotldi3 (tmp_reg_src2, tmp_reg_src2, sh)); - } - else - { - emit_insn (gen_rotlsi3 (tmp_reg_src1, tmp_reg_src1, sh)); - emit_insn (gen_rotlsi3 (tmp_reg_src2, tmp_reg_src2, sh)); - } - } + unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes; - if (cmp_bytes < word_mode_size) + /* Loading just a single byte is a special case. If we are + loading more than that, we have to check whether we are + looking at the entire chunk of data. If not, rotate left and + clear right so that bytes we aren't supposed to look at are + zeroed, and the first byte we are supposed to compare is + leftmost. */ + + if (load_mode_size != 1) { - /* Now clear right. This plus the rotate can be - turned into a rldicr instruction. */ - HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes); - rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb); - if (word_mode == DImode) + if (load_mode_size < word_mode_size) { - emit_insn (gen_anddi3_mask (tmp_reg_src1, tmp_reg_src1, mask)); - emit_insn (gen_anddi3_mask (tmp_reg_src2, tmp_reg_src2, mask)); + /* Rotate left first. 
*/ + rtx sh = GEN_INT (BITS_PER_UNIT + * (word_mode_size - load_mode_size)); + do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh); + do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh); } - else + + if (cmp_bytes < word_mode_size) { - emit_insn (gen_andsi3_mask (tmp_reg_src1, tmp_reg_src1, mask)); - emit_insn (gen_andsi3_mask (tmp_reg_src2, tmp_reg_src2, mask)); + /* Now clear right. This plus the rotate can be + turned into a rldicr instruction. */ + HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes); + rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb); + do_and3 (tmp_reg_src1, tmp_reg_src1, mask); + do_and3 (tmp_reg_src2, tmp_reg_src2, mask); } } @@ -1967,8 +2005,6 @@ A == B: branch to result 0. A != B: cleanup code to compute result. */ - unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes; - rtx dst_label; if (remain > 0 || equality_compare_rest) { @@ -1982,69 +2018,89 @@ /* Branch to end and produce result of 0. */ dst_label = final_move_label; - rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label); - rtx cond = gen_reg_rtx (CCmode); + if (load_mode_size == 1) + { + /* Special case for comparing just single byte. */ + if (equality_compare_rest) + { + /* Use subf./bne to branch to final_move_label if the + byte differs, otherwise fall through to the strncmp + call. We must also check for a zero byte here as we + must not make the library call if this is the end of + the string. */ - /* Always produce the 0 result, it is needed if - cmpb finds a 0 byte in this chunk. 
*/ - rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2); - rs6000_emit_dot_insn (result_reg, tmp, 1, cond); + rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label); + rtx cond = gen_reg_rtx (CCmode); + rtx diff_rtx = gen_rtx_MINUS (word_mode, + tmp_reg_src1, tmp_reg_src2); + rs6000_emit_dot_insn (result_reg, diff_rtx, 2, cond); + rtx cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx); - rtx cmp_rtx; - if (remain == 0 && !equality_compare_rest) - cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx); - else - cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx); + rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, + lab_ref, pc_rtx); + rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); + JUMP_LABEL (j) = final_move_label; + LABEL_NUSES (final_move_label) += 1; - rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, - lab_ref, pc_rtx); - rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); - JUMP_LABEL (j) = dst_label; - LABEL_NUSES (dst_label) += 1; + /* Check for zero byte here before fall through to + library call. This catches the case where the + strings are equal and end in a zero byte at this + position. */ - if (remain > 0 || equality_compare_rest) - { - /* Generate a cmpb to test for a 0 byte and branch - to final result if found. */ - rtx cmpb_zero = gen_reg_rtx (word_mode); - rtx lab_ref_fin = gen_rtx_LABEL_REF (VOIDmode, final_move_label); - rtx condz = gen_reg_rtx (CCmode); - rtx zero_reg = gen_reg_rtx (word_mode); - if (word_mode == SImode) - { - emit_insn (gen_movsi (zero_reg, GEN_INT (0))); - emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg)); - if (cmp_bytes < word_mode_size) - { - /* Don't want to look at zero bytes past end. 
*/ - HOST_WIDE_INT mb = - BITS_PER_UNIT * (word_mode_size - cmp_bytes); - rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb); - emit_insn (gen_andsi3_mask (cmpb_zero, cmpb_zero, mask)); - } + rtx cond0 = gen_reg_rtx (CCmode); + emit_move_insn (cond0, gen_rtx_COMPARE (CCmode, tmp_reg_src1, + const0_rtx)); + + rtx cmp0eq_rtx = gen_rtx_EQ (VOIDmode, cond0, const0_rtx); + + rtx ifelse0 = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp0eq_rtx, + lab_ref, pc_rtx); + rtx j0 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse0)); + JUMP_LABEL (j0) = final_move_label; + LABEL_NUSES (final_move_label) += 1; } else { - emit_insn (gen_movdi (zero_reg, GEN_INT (0))); - emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg)); - if (cmp_bytes < word_mode_size) - { - /* Don't want to look at zero bytes past end. */ - HOST_WIDE_INT mb = - BITS_PER_UNIT * (word_mode_size - cmp_bytes); - rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb); - emit_insn (gen_anddi3_mask (cmpb_zero, cmpb_zero, mask)); - } + /* This is the last byte to be compared so we can use + subf to compute the final result and branch + unconditionally to final_move_label. 
*/ + + do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2); + + rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label); + rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref)); + JUMP_LABEL (j) = final_move_label; + LABEL_NUSES (final_move_label) += 1; + emit_barrier (); } + } + else + { + rtx cmpb_zero = gen_reg_rtx (word_mode); + rtx cmpb_diff = gen_reg_rtx (word_mode); + rtx zero_reg = gen_reg_rtx (word_mode); + rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label); + rtx cond = gen_reg_rtx (CCmode); - emit_move_insn (condz, gen_rtx_COMPARE (CCmode, cmpb_zero, zero_reg)); - rtx cmpnz_rtx = gen_rtx_NE (VOIDmode, condz, const0_rtx); - rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmpnz_rtx, - lab_ref_fin, pc_rtx); - rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); - JUMP_LABEL (j2) = final_move_label; - LABEL_NUSES (final_move_label) += 1; + emit_move_insn (zero_reg, GEN_INT (0)); + do_cmpb3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2); + do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg); + rtx not_diff = gen_rtx_NOT (word_mode, cmpb_diff); + rtx orc_rtx = gen_rtx_IOR (word_mode, not_diff, cmpb_zero); + rs6000_emit_dot_insn (result_reg, orc_rtx, 2, cond); + + rtx cmp_rtx; + if (remain == 0 && !equality_compare_rest) + cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx); + else + cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx); + + rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, + lab_ref, pc_rtx); + rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); + JUMP_LABEL (j) = dst_label; + LABEL_NUSES (dst_label) += 1; } offset += cmp_bytes; @@ -2106,9 +2162,6 @@ byte and generates the final result, taking into account zero bytes: - cmpb cmpb_result1, src1, src2 - cmpb cmpb_result2, src1, zero - orc cmpb_result1, cmp_result1, cmpb_result2 cntlzd get bit of first zero/diff byte addi convert for rldcl use rldcl rldcl extract diff/zero byte @@ -2115,10 +2168,7 @@ subf subtract for final result */ - rtx cmpb_diff = gen_reg_rtx (word_mode); - rtx cmpb_zero = 
gen_reg_rtx (word_mode); rtx rot_amt = gen_reg_rtx (word_mode); - rtx zero_reg = gen_reg_rtx (word_mode); rtx rot1_1 = gen_reg_rtx (word_mode); rtx rot1_2 = gen_reg_rtx (word_mode); @@ -2127,12 +2177,7 @@ if (word_mode == SImode) { - emit_insn (gen_cmpbsi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2)); - emit_insn (gen_movsi (zero_reg, GEN_INT (0))); - emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg)); - emit_insn (gen_one_cmplsi2 (cmpb_diff,cmpb_diff)); - emit_insn (gen_iorsi3 (cmpb_diff, cmpb_diff, cmpb_zero)); - emit_insn (gen_clzsi2 (rot_amt, cmpb_diff)); + emit_insn (gen_clzsi2 (rot_amt, result_reg)); emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8))); emit_insn (gen_rotlsi3 (rot1_1, tmp_reg_src1, gen_lowpart (SImode, rot_amt))); @@ -2144,12 +2189,7 @@ } else { - emit_insn (gen_cmpbdi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2)); - emit_insn (gen_movdi (zero_reg, GEN_INT (0))); - emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg)); - emit_insn (gen_one_cmpldi2 (cmpb_diff,cmpb_diff)); - emit_insn (gen_iordi3 (cmpb_diff, cmpb_diff, cmpb_zero)); - emit_insn (gen_clzdi2 (rot_amt, cmpb_diff)); + emit_insn (gen_clzdi2 (rot_amt, result_reg)); emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8))); emit_insn (gen_rotldi3 (rot1_1, tmp_reg_src1, gen_lowpart (SImode, rot_amt))); -- Aaron Sawdey, Ph.D. acsaw...@linux.vnet.ibm.com 050-2/C113 (507) 253-7520 home: 507/263-0782 IBM Linux Technology Center - PPC Toolchain