This patch uses the support added in the patch I posted last week for actually 
doing
inline expansion of memmove(). I've added a might_overlap parameter to 
expand_block_move()
to tell it when it must make sure to handle overlapping moves. I changed the 
code to
save up the generated rtx for both loads and stores instead of just stores. In 
the
might_overlap==true case, if we get to MAX_MOVE_REG and the move is not done 
yet, then
we bail out and return false. So what this can now do is inline expand any 
memmove()
that can be done in 4 loads followed by 4 stores. It will use lxv/stxv if 
size/alignment
allows, otherwise it will use unaligned integer loads/stores. So it can expand 
most
memmove() up to 32 bytes, and some that are 33-64 bytes if the arguments are 16 
byte
aligned.

I've also removed the code from expand_block_move() for dealing with
mode==BLKmode because I don't believe that can happen. The big if construct that
figures out which size we are going to use has a plain else on it, and every
clause in it sets mode to something other than BLKmode. So I removed that code
to simplify things and just left a gcc_assert(mode != BLKmode).

Regtest in progress on ppc64le (power9), if tests are ok, is this ok for trunk
after the movmem optab patch posted last week is approved?

Thanks!
   Aaron

2019-09-30  Aaron Sawdey <acsaw...@linux.ibm.com>

        * config/rs6000/rs6000-protos.h (expand_block_move): Change prototype.
        * config/rs6000/rs6000-string.c (expand_block_move): Add might_overlap 
parm.
        * config/rs6000/rs6000.md (movmemsi): Add new pattern.
        (cpymemsi): Add might_overlap parm to expand_block_move() call.



Index: gcc/config/rs6000/rs6000-protos.h
===================================================================
--- gcc/config/rs6000/rs6000-protos.h   (revision 276131)
+++ gcc/config/rs6000/rs6000-protos.h   (working copy)
@@ -69,7 +69,7 @@
 extern void rs6000_generate_float2_double_code (rtx, rtx, rtx);
 extern void rs6000_generate_vsigned2_code (bool, rtx, rtx, rtx);
 extern int expand_block_clear (rtx[]);
-extern int expand_block_move (rtx[]);
+extern int expand_block_move (rtx[], bool);
 extern bool expand_block_compare (rtx[]);
 extern bool expand_strn_compare (rtx[], int);
 extern bool rs6000_is_valid_mask (rtx, int *, int *, machine_mode);
Index: gcc/config/rs6000/rs6000-string.c
===================================================================
--- gcc/config/rs6000/rs6000-string.c   (revision 276131)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -2719,7 +2719,7 @@
 #define MAX_MOVE_REG 4

 int
-expand_block_move (rtx operands[])
+expand_block_move (rtx operands[], bool might_overlap)
 {
   rtx orig_dest = operands[0];
   rtx orig_src = operands[1];
@@ -2730,6 +2730,7 @@
   int bytes;
   int offset;
   int move_bytes;
+  rtx loads[MAX_MOVE_REG];
   rtx stores[MAX_MOVE_REG];
   int num_reg = 0;

@@ -2817,47 +2818,35 @@
          gen_func.mov = gen_movqi;
        }

+      /* Mode is always set to something other than BLKmode by one of the
+        cases of the if statement above.  */
+      gcc_assert (mode != BLKmode);
+
       src = adjust_address (orig_src, mode, offset);
       dest = adjust_address (orig_dest, mode, offset);

-      if (mode != BLKmode)
-       {
-         rtx tmp_reg = gen_reg_rtx (mode);
+      rtx tmp_reg = gen_reg_rtx (mode);
+
+      loads[num_reg]    = (*gen_func.mov) (tmp_reg, src);
+      stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);

-         emit_insn ((*gen_func.mov) (tmp_reg, src));
-         stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
-       }
+      /* If we didn't succeed in doing it in one pass, we can't do it in the
+        might_overlap case.  Bail out and return failure.  */
+      if (might_overlap && num_reg >= MAX_MOVE_REG
+         && bytes > move_bytes)
+       return 0;

-      if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
+      /* Emit loads and stores saved up.  */
+      if (num_reg >= MAX_MOVE_REG || bytes == move_bytes)
        {
          int i;
          for (i = 0; i < num_reg; i++)
+           emit_insn (loads[i]);
+         for (i = 0; i < num_reg; i++)
            emit_insn (stores[i]);
          num_reg = 0;
        }
-
-      if (mode == BLKmode)
-       {
-         /* Move the address into scratch registers.  The movmemsi
-            patterns require zero offset.  */
-         if (!REG_P (XEXP (src, 0)))
-           {
-             rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
-             src = replace_equiv_address (src, src_reg);
-           }
-         set_mem_size (src, move_bytes);
-
-         if (!REG_P (XEXP (dest, 0)))
-           {
-             rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
-             dest = replace_equiv_address (dest, dest_reg);
-           }
-         set_mem_size (dest, move_bytes);
-
-         emit_insn ((*gen_func.movmemsi) (dest, src,
-                                          GEN_INT (move_bytes & 31),
-                                          align_rtx));
-       }
+       
     }

   return 1;
Index: gcc/config/rs6000/rs6000.md
===================================================================
--- gcc/config/rs6000/rs6000.md (revision 276131)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -9057,7 +9057,7 @@
     FAIL;
 })

-;; String/block move insn.
+;; String/block copy insn (source and destination must not overlap).
 ;; Argument 0 is the destination
 ;; Argument 1 is the source
 ;; Argument 2 is the length
@@ -9070,11 +9070,31 @@
              (use (match_operand:SI 3 ""))])]
   ""
 {
-  if (expand_block_move (operands))
+  if (expand_block_move (operands, false))
     DONE;
   else
     FAIL;
 })
+
+;; String/block move insn (source and destination may overlap).
+;; Argument 0 is the destination
+;; Argument 1 is the source
+;; Argument 2 is the length
+;; Argument 3 is the alignment
+
+(define_expand "movmemsi"
+  [(parallel [(set (match_operand:BLK 0 "")
+                  (match_operand:BLK 1 ""))
+             (use (match_operand:SI 2 ""))
+             (use (match_operand:SI 3 ""))])]
+  ""
+{
+  if (expand_block_move (operands, true))
+    DONE;
+  else
+    FAIL;
+})
+
 
 ;; Define insns that do load or store with update.  Some of these we can
 ;; get by using pre-decrement or pre-increment, but the hardware can also


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Reply via email to