Hi.

This is an updated version of the patch I sent here:
https://gcc.gnu.org/ml/gcc-patches/2017-08/msg00149.html

The patch introduces a new target macro that controls how we expand
mempcpy. On a target with TARGET_HAS_FAST_MEMPCPY_ROUTINE == 1,
we do not expand mempcpy via a memcpy call, but emit a call to mempcpy.

The patch bootstraps on x86_64-linux-gnu and survives regression tests.

@Wilco: Can you please come up with a test-case for aarch64?

Ready to be installed?
Thanks,
Martin

gcc/ChangeLog:

2019-05-09  Martin Liska  <mli...@suse.cz>

        PR middle-end/90263
        * builtins.c (expand_builtin_memory_copy_args): When having a
        target with fast mempcpy implementation do not use memcpy.
        * config/i386/i386.h (TARGET_HAS_FAST_MEMPCPY_ROUTINE): New.
        * defaults.h (TARGET_HAS_FAST_MEMPCPY_ROUTINE): By default
        target does not have fast mempcpy routine.
        * doc/tm.texi: Document the new macro.
        * doc/tm.texi.in: Likewise.
        * expr.h (emit_block_move_hints): Add 2 new arguments.
        * expr.c (emit_block_move_hints): Bail out when libcall
        to memcpy would be used.

gcc/testsuite/ChangeLog:

2019-05-09  Martin Liska  <mli...@suse.cz>

        * gcc.c-torture/execute/builtins/mempcpy.c: Use mempcpy
        without LHS.
---
 gcc/builtins.c                                 | 18 ++++++++++++++++--
 gcc/config/i386/i386.h                         |  3 +++
 gcc/defaults.h                                 |  7 +++++++
 gcc/doc/tm.texi                                |  5 +++++
 gcc/doc/tm.texi.in                             |  5 +++++
 gcc/expr.c                                     | 13 ++++++++++++-
 gcc/expr.h                                     |  4 +++-
 .../gcc.c-torture/execute/builtins/mempcpy.c   |  5 ++---
 8 files changed, 53 insertions(+), 7 deletions(-)


diff --git a/gcc/builtins.c b/gcc/builtins.c
index d37d73fc4a0..09d5b540ae8 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -3839,6 +3839,8 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
   unsigned HOST_WIDE_INT max_size;
   unsigned HOST_WIDE_INT probable_max_size;
 
+  bool is_move_done;
+
   /* If DEST is not a pointer type, call the normal function.  */
   if (dest_align == 0)
     return NULL_RTX;
@@ -3888,11 +3890,23 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
   if (CALL_EXPR_TAILCALL (exp)
       && (retmode == RETURN_BEGIN || target == const0_rtx))
     method = BLOCK_OP_TAILCALL;
-  if (retmode == RETURN_END && target != const0_rtx)
+  if (TARGET_HAS_FAST_MEMPCPY_ROUTINE
+      && retmode == RETURN_END
+      && target != const0_rtx)
     method = BLOCK_OP_NO_LIBCALL_RET;
   dest_addr = emit_block_move_hints (dest_mem, src_mem, len_rtx, method,
 				     expected_align, expected_size,
-				     min_size, max_size, probable_max_size);
+				     min_size, max_size, probable_max_size,
+				     TARGET_HAS_FAST_MEMPCPY_ROUTINE
+				     && retmode == RETURN_END,
+				     &is_move_done);
+
+  /* Bail out when a mempcpy call would be expanded as libcall and when
+     we have a target that provides a fast implementation
+     of mempcpy routine.  */
+  if (!is_move_done)
+    return NULL_RTX;
+
   if (dest_addr == pc_rtx)
     return NULL_RTX;
 
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 3fee779296f..7d20178f432 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1906,6 +1906,9 @@ typedef struct ix86_args {
 
 #define CLEAR_RATIO(speed) ((speed) ? MIN (6, ix86_cost->move_ratio) : 2)
 
+/* C library provides fast implementation of mempcpy function.  */
+#define TARGET_HAS_FAST_MEMPCPY_ROUTINE 1
+
 /* Define if shifts truncate the shift count which implies one can
    omit a sign-extension or zero-extension of a shift count.
 
diff --git a/gcc/defaults.h b/gcc/defaults.h
index b7534256119..eca19d1977f 100644
--- a/gcc/defaults.h
+++ b/gcc/defaults.h
@@ -1348,6 +1348,13 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define SET_RATIO(speed) MOVE_RATIO (speed)
 #endif
 
+/* By default do not generate libcall to mempcpy and rather use
+   libcall to memcpy and adjustment of return value.  */
+
+#ifndef TARGET_HAS_FAST_MEMPCPY_ROUTINE
+#define TARGET_HAS_FAST_MEMPCPY_ROUTINE 0
+#endif
+
 /* Supply a default definition of STACK_SAVEAREA_MODE for emit_stack_save.
    Normally move_insn, so Pmode stack pointer.  */
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 8c8978bb13a..cf709dfb843 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6735,6 +6735,11 @@ optimized for speed rather than size.
 If you don't define this, it defaults to the value of @code{MOVE_RATIO}.
 @end defmac
 
+@defmac TARGET_HAS_FAST_MEMPCPY_ROUTINE
+By default do not generate libcall to mempcpy and rather use
+libcall to memcpy and adjustment of return value.
+@end defmac
+
 @defmac USE_LOAD_POST_INCREMENT (@var{mode})
 A C expression used to determine whether a load postincrement is a good
 thing to use for a given mode.  Defaults to the value of
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index fe1194ef91a..d05c52a36f9 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4619,6 +4619,11 @@ optimized for speed rather than size.
 If you don't define this, it defaults to the value of @code{MOVE_RATIO}.
 @end defmac
 
+@defmac TARGET_HAS_FAST_MEMPCPY_ROUTINE
+By default do not generate libcall to mempcpy and rather use
+libcall to memcpy and adjustment of return value.
+@end defmac
+
 @defmac USE_LOAD_POST_INCREMENT (@var{mode})
 A C expression used to determine whether a load postincrement is a good
 thing to use for a given mode.  Defaults to the value of
diff --git a/gcc/expr.c b/gcc/expr.c
index fa15b7eceae..c78bc74c0d9 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -1561,12 +1561,16 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned int expected_align, HOST_WIDE_INT expected_size,
 		       unsigned HOST_WIDE_INT min_size,
 		       unsigned HOST_WIDE_INT max_size,
-		       unsigned HOST_WIDE_INT probable_max_size)
+		       unsigned HOST_WIDE_INT probable_max_size,
+		       bool bail_out_libcall, bool *is_move_done)
 {
   int may_use_call;
   rtx retval = 0;
   unsigned int align;
 
+  if (is_move_done)
+    *is_move_done = true;
+
   gcc_assert (size);
   if (CONST_INT_P (size) && INTVAL (size) == 0)
     return 0;
@@ -1628,6 +1632,13 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	   && ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x))
 	   && ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (y)))
     {
+      if (bail_out_libcall)
+	{
+	  if (is_move_done)
+	    *is_move_done = false;
+	  return retval;
+	}
+
       if (may_use_call < 0)
 	return pc_rtx;
 
diff --git a/gcc/expr.h b/gcc/expr.h
index 17c3962436a..6eb70bf12f1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -114,7 +114,9 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
-				  unsigned HOST_WIDE_INT);
+				  unsigned HOST_WIDE_INT,
+				  bool bail_out_libcall = false,
+				  bool *is_move_done = NULL);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
 				 by_pieces_constfn, void *);
 extern bool emit_storent_insn (rtx to, rtx from);
diff --git a/gcc/testsuite/gcc.c-torture/execute/builtins/mempcpy.c b/gcc/testsuite/gcc.c-torture/execute/builtins/mempcpy.c
index d82e2232d7b..0b84d229cef 100644
--- a/gcc/testsuite/gcc.c-torture/execute/builtins/mempcpy.c
+++ b/gcc/testsuite/gcc.c-torture/execute/builtins/mempcpy.c
@@ -56,9 +56,8 @@ main_test (void)
   if (__builtin_mempcpy (p, "ABCDE", 6) != p + 6 || memcmp (p, "ABCDE", 6))
     abort ();
 
-  /* If the result of mempcpy is ignored, gcc should use memcpy.
-     This should be optimized always, so set inside_main again.  */
-  inside_main = 1;
+  /* Clear inside_main in order not to abort because of usage of mempcpy.  */
+  inside_main = 0;
   mempcpy (p + 5, s3, 1);
   if (memcmp (p, "ABCDEFg", 8))
     abort ();

Reply via email to