This patch builds on the support added in the patch I posted last week to actually do inline expansion of memmove(). I've added a might_overlap parameter to expand_block_move() to tell it when it must handle overlapping moves correctly. I changed the code to save up the generated rtx for both loads and stores instead of just the stores, so that all of the loads are emitted before any of the stores. In the might_overlap==true case, if we reach MAX_MOVE_REG and the move is not done yet, we bail out and return 0 (failure). The result is that we can now inline expand any memmove() that can be done in at most 4 loads followed by at most 4 stores. It will use lxv/stxv if size/alignment allows; otherwise it will use unaligned integer loads/stores. So it can expand most memmove() calls up to 32 bytes, and some that are 33-64 bytes if the arguments are 16-byte aligned.
I've also removed the code in expand_block_move() that dealt with mode==BLKmode, because I don't believe that case can happen. The big if construct that figures out which size we are going to use ends with a plain else, and every clause in it sets mode to something other than BLKmode. So I removed that code to simplify things and just left a gcc_assert(mode != BLKmode). Regtest is in progress on ppc64le (power9). If the tests are OK, is this OK for trunk after the movmem optab patch posted last week is approved? Thanks! Aaron 2019-09-30 Aaron Sawdey <acsaw...@linux.ibm.com> * config/rs6000/rs6000-protos.h (expand_block_move): Change prototype. * config/rs6000/rs6000-string.c (expand_block_move): Add might_overlap parm. * config/rs6000/rs6000.md (movmemsi): Add new pattern. (cpymemsi): Add might_overlap parm to expand_block_move() call. Index: gcc/config/rs6000/rs6000-protos.h =================================================================== --- gcc/config/rs6000/rs6000-protos.h (revision 276131) +++ gcc/config/rs6000/rs6000-protos.h (working copy) @@ -69,7 +69,7 @@ extern void rs6000_generate_float2_double_code (rtx, rtx, rtx); extern void rs6000_generate_vsigned2_code (bool, rtx, rtx, rtx); extern int expand_block_clear (rtx[]); -extern int expand_block_move (rtx[]); +extern int expand_block_move (rtx[], bool); extern bool expand_block_compare (rtx[]); extern bool expand_strn_compare (rtx[], int); extern bool rs6000_is_valid_mask (rtx, int *, int *, machine_mode); Index: gcc/config/rs6000/rs6000-string.c =================================================================== --- gcc/config/rs6000/rs6000-string.c (revision 276131) +++ gcc/config/rs6000/rs6000-string.c (working copy) @@ -2719,7 +2719,7 @@ #define MAX_MOVE_REG 4 int -expand_block_move (rtx operands[]) +expand_block_move (rtx operands[], bool might_overlap) { rtx orig_dest = operands[0]; rtx orig_src = operands[1]; @@ -2730,6 +2730,7 @@ int bytes; int offset; int move_bytes; + rtx loads[MAX_MOVE_REG]; rtx 
stores[MAX_MOVE_REG]; int num_reg = 0; @@ -2817,47 +2818,35 @@ gen_func.mov = gen_movqi; } + /* Mode is always set to something other than BLKmode by one of the + cases of the if statement above. */ + gcc_assert (mode != BLKmode); + src = adjust_address (orig_src, mode, offset); dest = adjust_address (orig_dest, mode, offset); - if (mode != BLKmode) - { - rtx tmp_reg = gen_reg_rtx (mode); + rtx tmp_reg = gen_reg_rtx (mode); + + loads[num_reg] = (*gen_func.mov) (tmp_reg, src); + stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg); - emit_insn ((*gen_func.mov) (tmp_reg, src)); - stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg); - } + /* If we didn't succeed in doing it in one pass, we can't do it in the + might_overlap case. Bail out and return failure. */ + if (might_overlap && num_reg >= MAX_MOVE_REG + && bytes > move_bytes) + return 0; - if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes) + /* Emit loads and stores saved up. */ + if (num_reg >= MAX_MOVE_REG || bytes == move_bytes) { int i; for (i = 0; i < num_reg; i++) + emit_insn (loads[i]); + for (i = 0; i < num_reg; i++) emit_insn (stores[i]); num_reg = 0; } - - if (mode == BLKmode) - { - /* Move the address into scratch registers. The movmemsi - patterns require zero offset. */ - if (!REG_P (XEXP (src, 0))) - { - rtx src_reg = copy_addr_to_reg (XEXP (src, 0)); - src = replace_equiv_address (src, src_reg); - } - set_mem_size (src, move_bytes); - - if (!REG_P (XEXP (dest, 0))) - { - rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0)); - dest = replace_equiv_address (dest, dest_reg); - } - set_mem_size (dest, move_bytes); - - emit_insn ((*gen_func.movmemsi) (dest, src, - GEN_INT (move_bytes & 31), - align_rtx)); - } + } return 1; Index: gcc/config/rs6000/rs6000.md =================================================================== --- gcc/config/rs6000/rs6000.md (revision 276131) +++ gcc/config/rs6000/rs6000.md (working copy) @@ -9057,7 +9057,7 @@ FAIL; }) -;; String/block move insn. 
+;; String/block copy insn (source and destination must not overlap). ;; Argument 0 is the destination ;; Argument 1 is the source ;; Argument 2 is the length @@ -9070,11 +9070,31 @@ (use (match_operand:SI 3 ""))])] "" { - if (expand_block_move (operands)) + if (expand_block_move (operands, false)) DONE; else FAIL; }) + +;; String/block move insn (source and destination may overlap). +;; Argument 0 is the destination +;; Argument 1 is the source +;; Argument 2 is the length +;; Argument 3 is the alignment + +(define_expand "movmemsi" + [(parallel [(set (match_operand:BLK 0 "") + (match_operand:BLK 1 "")) + (use (match_operand:SI 2 "")) + (use (match_operand:SI 3 ""))])] + "" +{ + if (expand_block_move (operands, true)) + DONE; + else + FAIL; +}) + ;; Define insns that do load or store with update. Some of these we can ;; get by using pre-decrement or pre-increment, but the hardware can also -- Aaron Sawdey, Ph.D. acsaw...@linux.vnet.ibm.com 050-2/C113 (507) 253-7520 home: 507/263-0782 IBM Linux Technology Center - PPC Toolchain