On Tue, May 19, 2020 at 1:48 AM Uros Bizjak <ubiz...@gmail.com> wrote:
>
> On Sun, May 17, 2020 at 7:06 PM H.J. Lu <hjl.to...@gmail.com> wrote:
> >
> > Duplicate the cmpstrn pattern for cmpmem.  The only difference is that
> > the length argument of cmpmem is guaranteed to be less than or equal to
> > lengths of 2 memory areas.  Since "repz cmpsb" can be much slower than
> > memcmp function implemented with vector instruction, see
> >
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
> >
> > expand cmpmem to "repz cmpsb" only with -mgeneral-regs-only.
>
> If there is no benefit compared to the library implementation, then
> enable these patterns only when -minline-all-stringops is used.

Fixed.

> Eventually these should be reimplemented with SSE4 string instructions.
>
> Honza is the author of the block handling x86 system, I'll leave the
> review to him.

We used to expand memcmp to "repz cmpsb" via cmpstrnsi.  It was changed
by

commit 9b0f6f5e511ca512e4faeabc81d2fd3abad9b02f
Author: Nick Clifton <ni...@redhat.com>
Date:   Fri Aug 12 16:26:11 2011 +0000

    builtins.c (expand_builtin_memcmp): Do not use cmpstrnsi pattern.

            * builtins.c (expand_builtin_memcmp): Do not use cmpstrnsi
            pattern.
            * doc/md.texi (cmpstrn): Note that the comparison stops if both
            fetched bytes are zero.
            (cmpstr): Likewise.
            (cmpmem): Note that the comparison does not stop if both of the
            fetched bytes are zero.

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95151

is a regression.

Honza, can you take a look at this?

Thanks.

--
H.J.
From ec91a57f3f91168034f7f5eb391949c301741680 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.to...@gmail.com>
Date: Thu, 14 May 2020 13:06:23 -0700
Subject: [PATCH] x86: Add cmpmemsi for -minline-all-stringops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We used to expand memcmp to "repz cmpsb" via cmpstrnsi.  It was changed
by

commit 9b0f6f5e511ca512e4faeabc81d2fd3abad9b02f
Author: Nick Clifton <ni...@redhat.com>
Date:   Fri Aug 12 16:26:11 2011 +0000

    builtins.c (expand_builtin_memcmp): Do not use cmpstrnsi pattern.

            * builtins.c (expand_builtin_memcmp): Do not use cmpstrnsi
            pattern.
            * doc/md.texi (cmpstrn): Note that the comparison stops if both
            fetched bytes are zero.
            (cmpstr): Likewise.
            (cmpmem): Note that the comparison does not stop if both of the
            fetched bytes are zero.

Duplicate the cmpstrn pattern for cmpmem.  The only difference is that
the length argument of cmpmem is guaranteed to be less than or equal to
lengths of 2 memory areas.  Since "repz cmpsb" can be much slower than
memcmp function implemented with vector instruction, see

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052

expand cmpmem to "repz cmpsb" only for -minline-all-stringops.

gcc/

	PR target/95151
	* config/i386/i386-expand.c (ix86_expand_cmpstrn_or_cmpmem): New
	function.
	* config/i386/i386-protos.h (ix86_expand_cmpstrn_or_cmpmem): New
	prototype.
	* config/i386/i386.md (cmpmemsi): New pattern.

gcc/testsuite/

	PR target/95151
	* gcc.target/i386/pr95151-1.c: New test.
	* gcc.target/i386/pr95151-2.c: Likewise.
	* gcc.target/i386/pr95151-3.c: Likewise.
	* gcc.target/i386/pr95151-4.c: Likewise.
---
 gcc/config/i386/i386-expand.c             | 80 +++++++++++++++++++++++
 gcc/config/i386/i386-protos.h             |  1 +
 gcc/config/i386/i386.md                   | 80 ++++++-----------------
 gcc/testsuite/gcc.target/i386/pr95151-1.c | 17 +++++
 gcc/testsuite/gcc.target/i386/pr95151-2.c | 10 +++
 gcc/testsuite/gcc.target/i386/pr95151-3.c | 18 +++++
 gcc/testsuite/gcc.target/i386/pr95151-4.c | 11 ++++
 7 files changed, 158 insertions(+), 59 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95151-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95151-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95151-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95151-4.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 79f827fd653..d3f4280ad58 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -7656,6 +7656,86 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
   return true;
 }
 
+/* Expand cmpstrn or memcmp.  */
+
+bool
+ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
+			       rtx length, rtx align, bool is_cmpstrn)
+{
+  if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS)
+    return false;
+
+  /* Can't use this if the user has appropriated ecx, esi or edi.  */
+  if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
+    return false;
+
+  if (is_cmpstrn)
+    {
+      /* For strncmp, length is the maximum length, which can be larger
+	 than actual string lengths.  We can expand the cmpstrn pattern
+	 to "repz cmpsb" only if one of the strings is a constant so
+	 that expand_builtin_strncmp() can write the length argument to
+	 be the minimum of the const string length and the actual length
+	 argument.  Otherwise, "repz cmpsb" may pass the 0 byte.  */
+      tree t1 = MEM_EXPR (src1);
+      tree t2 = MEM_EXPR (src2);
+      if (!((t1 && TREE_CODE (t1) == MEM_REF
+	     && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
+	     && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
+		 == STRING_CST))
+	    || (t2 && TREE_CODE (t2) == MEM_REF
+		&& TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
+		&& (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
+		    == STRING_CST))))
+	return false;
+    }
+  else
+    {
+      /* Expand memcmp to "repz cmpsb" only for -minline-all-stringops
+	 since "repz cmpsb" can be much slower than memcmp function
+	 implemented with vector instructions, see
+
+	 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
+       */
+      if (!TARGET_INLINE_ALL_STRINGOPS)
+	return false;
+    }
+
+  rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
+  rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
+  if (addr1 != XEXP (src1, 0))
+    src1 = replace_equiv_address_nv (src1, addr1);
+  if (addr2 != XEXP (src2, 0))
+    src2 = replace_equiv_address_nv (src2, addr2);
+
+  rtx lengthreg = ix86_zero_extend_to_Pmode (length);
+
+  /* If we are testing strict equality, we can use known alignment to
+     good advantage.  This may be possible with combine, particularly
+     once cc0 is dead.  */
+  if (CONST_INT_P (length))
+    {
+      if (length == const0_rtx)
+	{
+	  emit_move_insn (result, const0_rtx);
+	  return true;
+	}
+      emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
+				     src1, src2));
+    }
+  else
+    {
+      emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
+      emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
+				  src1, src2));
+    }
+
+  rtx out = gen_lowpart (QImode, result);
+  emit_insn (gen_cmpintqi (out));
+  emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
+
+  return true;
+}
 
 /* Expand the appropriate insns for doing strlen if not just doing
    repnz; scasb
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 39fcaa0ad5f..238aa650b61 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -71,6 +71,7 @@ extern int avx_vperm2f128_parallel (rtx par, machine_mode mode);
 extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx);
 extern bool ix86_expand_set_or_cpymem (rtx, rtx, rtx, rtx, rtx, rtx,
 				       rtx, rtx, rtx, rtx, bool);
+extern bool ix86_expand_cmpstrn_or_cmpmem (rtx, rtx, rtx, rtx, rtx, bool);
 
 extern bool constant_address_p (rtx);
 extern bool legitimate_pic_operand_p (rtx);
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index aa4f25b7065..9ab41b8ad01 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -17761,6 +17761,22 @@
 	  (const_string "*")))
    (set_attr "mode" "QI")])
 
+(define_expand "cmpmemsi"
+  [(set (match_operand:SI 0 "register_operand" "")
+        (compare:SI (match_operand:BLK 1 "memory_operand" "")
+                    (match_operand:BLK 2 "memory_operand" "") ) )
+   (use (match_operand 3 "general_operand"))
+   (use (match_operand 4 "immediate_operand"))]
+  ""
+{
+  if (ix86_expand_cmpstrn_or_cmpmem (operands[0], operands[1],
+				     operands[2], operands[3],
+				     operands[4], false))
+    DONE;
+  else
+    FAIL;
+})
+
 (define_expand "cmpstrnsi"
   [(set (match_operand:SI 0 "register_operand")
 	(compare:SI (match_operand:BLK 1 "general_operand")
@@ -17769,66 +17785,12 @@
    (use (match_operand 4 "immediate_operand"))]
   ""
 {
-  rtx addr1, addr2, countreg, align, out;
-
-  if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS)
-    FAIL;
-
-  /* Can't use this if the user has appropriated ecx, esi or edi.  */
-  if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
-    FAIL;
-
-  /* One of the strings must be a constant.  If so, expand_builtin_strncmp()
-     will have rewritten the length arg to be the minimum of the const string
-     length and the actual length arg.  If both strings are the same and
-     shorter than the length arg, repz cmpsb will not stop at the 0 byte and
-     will incorrectly base the results on chars past the 0 byte.  */
-  tree t1 = MEM_EXPR (operands[1]);
-  tree t2 = MEM_EXPR (operands[2]);
-  if (!((t1 && TREE_CODE (t1) == MEM_REF
-         && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
-         && TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0)) == STRING_CST)
-      || (t2 && TREE_CODE (t2) == MEM_REF
-          && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
-          && TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0)) == STRING_CST)))
-    FAIL;
-
-  addr1 = copy_addr_to_reg (XEXP (operands[1], 0));
-  addr2 = copy_addr_to_reg (XEXP (operands[2], 0));
-  if (addr1 != XEXP (operands[1], 0))
-    operands[1] = replace_equiv_address_nv (operands[1], addr1);
-  if (addr2 != XEXP (operands[2], 0))
-    operands[2] = replace_equiv_address_nv (operands[2], addr2);
-
-  countreg = ix86_zero_extend_to_Pmode (operands[3]);
-
-  /* %%% Iff we are testing strict equality, we can use known alignment
-     to good advantage.  This may be possible with combine, particularly
-     once cc0 is dead.  */
-  align = operands[4];
-
-  if (CONST_INT_P (operands[3]))
-    {
-      if (operands[3] == const0_rtx)
-	{
-	  emit_move_insn (operands[0], const0_rtx);
-	  DONE;
-	}
-      emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, countreg, align,
-				     operands[1], operands[2]));
-    }
+  if (ix86_expand_cmpstrn_or_cmpmem (operands[0], operands[1],
+				     operands[2], operands[3],
+				     operands[4], true))
+    DONE;
   else
-    {
-      emit_insn (gen_cmp_1 (Pmode, countreg, countreg));
-      emit_insn (gen_cmpstrnqi_1 (addr1, addr2, countreg, align,
-				  operands[1], operands[2]));
-    }
-
-  out = gen_lowpart (QImode, operands[0]);
-  emit_insn (gen_cmpintqi (out));
-  emit_move_insn (operands[0], gen_rtx_SIGN_EXTEND (SImode, out));
-
-  DONE;
+    FAIL;
 })
 
 ;; Produce a tri-state integer (-1, 0, 1) from condition codes.
diff --git a/gcc/testsuite/gcc.target/i386/pr95151-1.c b/gcc/testsuite/gcc.target/i386/pr95151-1.c
new file mode 100644
index 00000000000..54a7510042a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95151-1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -minline-all-stringops" } */
+
+struct foo
+{
+  char array[257];
+};
+
+extern struct foo x;
+
+int
+func (struct foo i)
+{
+  return __builtin_memcmp (&x, &i, sizeof (x)) ? 1 : 2;
+}
+
+/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcmp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr95151-2.c b/gcc/testsuite/gcc.target/i386/pr95151-2.c
new file mode 100644
index 00000000000..8f9d8ee1bf4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95151-2.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -minline-all-stringops" } */
+
+int
+func (void *d, void *s, unsigned int l)
+{
+  return __builtin_memcmp (d, s, l) ? 1 : 2;
+}
+
+/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcmp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr95151-3.c b/gcc/testsuite/gcc.target/i386/pr95151-3.c
new file mode 100644
index 00000000000..14cbdec4c93
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95151-3.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-inline-all-stringops" } */
+
+struct foo
+{
+  char array[257];
+};
+
+extern struct foo x;
+
+int
+func (struct foo i)
+{
+  return __builtin_memcmp (&x, &i, sizeof (x)) ? 1 : 2;
+}
+
+/* { dg-final { scan-assembler "call\[\\t \]*_?memcmp" } } */
+/* { dg-final { scan-assembler-not "cmpsb" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr95151-4.c b/gcc/testsuite/gcc.target/i386/pr95151-4.c
new file mode 100644
index 00000000000..c93b2b60c5b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95151-4.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-inline-all-stringops" } */
+
+int
+func (void *d, void *s, unsigned int l)
+{
+  return __builtin_memcmp (d, s, l) ? 1 : 2;
+}
+
+/* { dg-final { scan-assembler "call\[\\t \]*_?memcmp" } } */
+/* { dg-final { scan-assembler-not "cmpsb" } } */
-- 
2.26.2

Reply via email to