https://gcc.gnu.org/g:a87cdfd2ca3260126d3c75ddfb5cdea6e721d8d0

commit r17-597-ga87cdfd2ca3260126d3c75ddfb5cdea6e721d8d0
Author: Roger Sayle <[email protected]>
Date:   Tue May 19 07:29:08 2026 -0400

    i386: Optimize ptestz(x,-1) as ptestz(x,x) on x86
    
    This patch, inspired by PR target/90483 and libstdc++/118416, implements
    some RTL expansion-time simplifications of ptest. A common idiom for
    testing a vector against zero is to use ptestz(mask,-1).  Alas, the code
    generated for this is suboptimal, requiring materialization of an all_ones
    vector.  Given that ptestz(x,y) is defined as (x & y) == 0, an equivalent
    form is ptestz(mask,mask), saving an instruction (if ~0 isn't available).
    
    Consider the function:
    
    typedef long long v2di __attribute__ ((__vector_size__ (16)));
    
    int foo (v2di x)
    {
      return __builtin_ia32_ptestz128(x,~(v2di){0,0});
    }
    
    with -O2 -mavx2, GCC currently generates:
    
    foo:    vpcmpeqd        %xmm1, %xmm1, %xmm1
            xorl    %eax, %eax
            vptest  %xmm1, %xmm0
            sete    %al
            ret
    
    with this patch, it now generates:
    
    foo:    xorl    %eax, %eax
            vptest  %xmm0, %xmm0
            sete    %al
            ret
    
    2026-05-19  Roger Sayle  <[email protected]>
    
    gcc/ChangeLog
            PR target/90483
            PR libstdc++/118416
            * config/i386/i386-expand.cc (ix86_expand_sse_ptest):  Refactor
            with optimizations for PTESTZ*, PTESTC* and PTESTNZC*, including
            transforming ptestz(x,-1) into ptestz(x,x).
    
    gcc/testsuite/ChangeLog
            PR target/90483
            PR libstdc++/118416
            * gcc.target/i386/sse4_1-ptest-8.c: New test case.
            * gcc.target/i386/sse4_1-ptest-9.c: Likewise.

Diff:
---
 gcc/config/i386/i386-expand.cc                 | 78 ++++++++++++++++++++------
 gcc/testsuite/gcc.target/i386/sse4_1-ptest-8.c | 21 +++++++
 gcc/testsuite/gcc.target/i386/sse4_1-ptest-9.c | 21 +++++++
 3 files changed, 103 insertions(+), 17 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 01cff86d20aa..e278b02ffefd 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -11999,35 +11999,79 @@ ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
   machine_mode mode0 = insn_data[d->icode].operand[0].mode;
   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
   enum rtx_code comparison = d->comparison;
-
-  /* ptest reg, reg sets the carry flag.  */
-  if (comparison == LTU
-      && (d->code == IX86_BUILTIN_PTESTC
-         || d->code == IX86_BUILTIN_PTESTC256)
-      && rtx_equal_p (op0, op1))
-    {
-      if (!target)
-       target = gen_reg_rtx (SImode);
-      emit_move_insn (target, const1_rtx);
-      return target;
-    }
+  rtx result = NULL_RTX;
 
   if (VECTOR_MODE_P (mode0))
     op0 = safe_vector_operand (op0, mode0);
   if (VECTOR_MODE_P (mode1))
     op1 = safe_vector_operand (op1, mode1);
 
-  target = gen_reg_rtx (SImode);
-  emit_move_insn (target, const0_rtx);
-  target = gen_rtx_SUBREG (QImode, target, 0);
+  switch (d->code)
+    {
+    case IX86_BUILTIN_PTESTZ:
+    case IX86_BUILTIN_PTESTZ256:
+      // Returns (OP0 & OP1) == 0
+      if (rtx_equal_p (op0, CONST0_RTX (mode0))
+         || rtx_equal_p (op1, CONST0_RTX (mode1)))
+       result = const1_rtx;
+      else if (rtx_equal_p (op0, CONSTM1_RTX (mode0)))
+       {
+         op1 = force_reg (mode1, op1);
+         op0 = op1;
+       }
+      else if (rtx_equal_p (op1, CONSTM1_RTX (mode1)))
+       {
+         op0 = force_reg (mode0, op0);
+         op1 = op0;
+       }
+      else if (MEM_P (op0) && !MEM_P (op1))
+       std::swap (op0, op1);
+      break;
+
+    case IX86_BUILTIN_PTESTC:
+    case IX86_BUILTIN_PTESTC256:
+      // Returns (~OP0 & OP1) == 0
+      if (rtx_equal_p (op0, CONSTM1_RTX (mode0))
+         || rtx_equal_p (op1, CONST0_RTX (mode1))
+         || rtx_equal_p (op0, op1))
+       result = const1_rtx;
+      break;
+
+    case IX86_BUILTIN_PTESTNZC:
+    case IX86_BUILTIN_PTESTNZC256:
+      // Returns ((OP0 & OP1) != 0) && ((~OP0 & OP1) != 0)
+      if (rtx_equal_p (op0, CONST0_RTX (mode0))
+         || rtx_equal_p (op0, CONSTM1_RTX (mode0))
+         || rtx_equal_p (op1, CONST0_RTX (mode1))
+         || rtx_equal_p (op0, op1))
+       result = const0_rtx;
+      break;
+
+    default:
+      break;
+    }
 
   if ((optimize && !register_operand (op0, mode0))
-      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
+      || !insn_data[d->icode].operand[0].predicate (op0, mode0)
+      || result)
     op0 = copy_to_mode_reg (mode0, op0);
   if ((optimize && !register_operand (op1, mode1))
-      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
+      || !insn_data[d->icode].operand[1].predicate (op1, mode1)
+      || result)
     op1 = copy_to_mode_reg (mode1, op1);
 
+  if (result)
+    {
+      if (!target)
+       target = gen_reg_rtx (SImode);
+      emit_move_insn (target, result);
+      return target;
+    }
+
+  target = gen_reg_rtx (SImode);
+  emit_move_insn (target, const0_rtx);
+  target = gen_rtx_SUBREG (QImode, target, 0);
+
   pat = GEN_FCN (d->icode) (op0, op1);
   if (! pat)
     return 0;
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-ptest-8.c b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-8.c
new file mode 100644
index 000000000000..600b807defaa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-8.c
@@ -0,0 +1,21 @@
+/* PR target/90483 */
+/* PR libstdc++/118416 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+
+typedef long long v2di __attribute__ ((__vector_size__ (16)));
+
+v2di x;
+
+int test1 ()
+{
+  return __builtin_ia32_ptestz128((v2di){-1,-1}, x == 0);
+}
+
+int test2 ()
+{
+  return __builtin_ia32_ptestz128(x == 0, (v2di){-1,-1});
+}
+
+/* { dg-final { scan-assembler-times "ptest\[ \\t\]+%" 2 } } */
+/* { dg-final { scan-assembler-not "pcmpeqd" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-ptest-9.c b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-9.c
new file mode 100644
index 000000000000..fe7e710c6fb7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-9.c
@@ -0,0 +1,21 @@
+/* PR target/90483 */
+/* PR libstdc++/118416 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+
+typedef long long v2di __attribute__ ((__vector_size__ (16)));
+
+v2di x;
+
+int test1 ()
+{
+  return __builtin_ia32_ptestz128((v2di){0,0}, x == 0);
+}
+
+int test2 ()
+{
+  return __builtin_ia32_ptestz128(x == 0, (v2di){0,0});
+}
+
+/* { dg-final { scan-assembler-not "ptest\[ \\t\]+%" } } */
+/* { dg-final { scan-assembler-not "pcmpeq" } } */

Reply via email to