This patch, inspired by PR target/90483 and libstdc++/118416, implements
some RTL expansion-time simplifications of ptest. A common idiom for
testing a vector against zero is to use ptestz(mask,-1).  Alas the code
generated for this is suboptimal, requiring materialization of an all_ones
vector.  Given that ptestz(x,y) is defined as (x & y) == 0, an equivalent
form is ptestz(mask,mask), saving an instruction (if an all-ones vector
isn't already available in a register).

Consider the function:

typedef long long v2di __attribute__ ((__vector_size__ (16)));

int foo (v2di x)
{
  return __builtin_ia32_ptestz128(x,~(v2di){0,0});
}

with -O2 -mavx2, GCC currently generates:

foo:    vpcmpeqd        %xmm1, %xmm1, %xmm1
        xorl    %eax, %eax
        vptest  %xmm1, %xmm0
        sete    %al
        ret

with this patch, it now generates:

foo:    xorl    %eax, %eax
        vptest  %xmm0, %xmm0
        sete    %al
        ret


This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  Ok for mainline?


2026-05-18  Roger Sayle  <[email protected]>

gcc/ChangeLog
        PR target/90483
        PR libstdc++/118416
        * config/i386/i386-expand.cc (ix86_expand_sse_ptest): Refactor
        with optimizations for PTESTZ*, PTESTC* and PTESTNZC*, including
        transforming ptestz(x,-1) into ptestz(x,x).

gcc/testsuite/ChangeLog
        PR target/90483
        PR libstdc++/118416
        * gcc.target/i386/sse4_1-ptest-8.c: New test case.
        * gcc.target/i386/sse4_1-ptest-9.c: Likewise.


diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index df44a4eb99d..de86d2d24b4 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -11979,35 +11979,79 @@ ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
   machine_mode mode0 = insn_data[d->icode].operand[0].mode;
   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
   enum rtx_code comparison = d->comparison;
-
-  /* ptest reg, reg sets the carry flag.  */
-  if (comparison == LTU
-      && (d->code == IX86_BUILTIN_PTESTC
-         || d->code == IX86_BUILTIN_PTESTC256)
-      && rtx_equal_p (op0, op1))
-    {
-      if (!target)
-       target = gen_reg_rtx (SImode);
-      emit_move_insn (target, const1_rtx);
-      return target;
-    }
+  rtx result = NULL_RTX;
 
   if (VECTOR_MODE_P (mode0))
     op0 = safe_vector_operand (op0, mode0);
   if (VECTOR_MODE_P (mode1))
     op1 = safe_vector_operand (op1, mode1);
 
-  target = gen_reg_rtx (SImode);
-  emit_move_insn (target, const0_rtx);
-  target = gen_rtx_SUBREG (QImode, target, 0);
+  switch (d->code)
+    {
+    case IX86_BUILTIN_PTESTZ:
+    case IX86_BUILTIN_PTESTZ256:
+      // Returns (OP0 & OP1) == 0
+      if (rtx_equal_p (op0, CONST0_RTX (mode0))
+         || rtx_equal_p (op1, CONST0_RTX (mode1)))
+       result = const1_rtx;
+      else if (rtx_equal_p (op0, CONSTM1_RTX (mode0)))
+       {
+         op1 = force_reg (mode1, op1);
+         op0 = op1;
+       }
+      else if (rtx_equal_p (op1, CONSTM1_RTX (mode1)))
+       {
+         op0 = force_reg (mode0, op0);
+         op1 = op0;
+       }
+      else if (MEM_P (op0) && !MEM_P (op1))
+       std::swap (op0, op1);
+      break;
+
+    case IX86_BUILTIN_PTESTC:
+    case IX86_BUILTIN_PTESTC256:
+      // Returns (~OP0 & OP1) == 0
+      if (rtx_equal_p (op0, CONSTM1_RTX (mode0))
+         || rtx_equal_p (op1, CONST0_RTX (mode1))
+         || rtx_equal_p (op0, op1))
+       result = const1_rtx;
+      break;
+
+    case IX86_BUILTIN_PTESTNZC:
+    case IX86_BUILTIN_PTESTNZC256:
+      // Returns ((OP0 & OP1) != 0) && ((~OP0 & OP1) != 0)
+      if (rtx_equal_p (op0, CONST0_RTX (mode0))
+         || rtx_equal_p (op0, CONSTM1_RTX (mode0))
+         || rtx_equal_p (op1, CONST0_RTX (mode1))
+         || rtx_equal_p (op0, op1))
+       result = const0_rtx;
+      break;
+
+    default:
+      break;
+    }
 
   if ((optimize && !register_operand (op0, mode0))
-      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
+      || !insn_data[d->icode].operand[0].predicate (op0, mode0)
+      || result)
     op0 = copy_to_mode_reg (mode0, op0);
   if ((optimize && !register_operand (op1, mode1))
-      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
+      || !insn_data[d->icode].operand[1].predicate (op1, mode1)
+      || result)
     op1 = copy_to_mode_reg (mode1, op1);
 
+  if (result)
+    {
+      if (!target)
+       target = gen_reg_rtx (SImode);
+      emit_move_insn (target, result);
+      return target;
+    }
+
+  target = gen_reg_rtx (SImode);
+  emit_move_insn (target, const0_rtx);
+  target = gen_rtx_SUBREG (QImode, target, 0);
+
   pat = GEN_FCN (d->icode) (op0, op1);
   if (! pat)
     return 0;
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-ptest-8.c b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-8.c
new file mode 100644
index 00000000000..600b807defa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-8.c
@@ -0,0 +1,21 @@
+/* PR target/90483 */
+/* PR libstdc++/118416 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+
+typedef long long v2di __attribute__ ((__vector_size__ (16)));
+
+v2di x;
+
+int test1 ()
+{
+  return __builtin_ia32_ptestz128((v2di){-1,-1}, x == 0);
+}
+
+int test2 ()
+{
+  return __builtin_ia32_ptestz128(x == 0, (v2di){-1,-1});
+}
+
+/* { dg-final { scan-assembler-times "ptest\[ \\t\]+%" 2 } } */
+/* { dg-final { scan-assembler-not "pcmpeqd" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-ptest-9.c b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-9.c
new file mode 100644
index 00000000000..fe7e710c6fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-9.c
@@ -0,0 +1,21 @@
+/* PR target/90483 */
+/* PR libstdc++/118416 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+
+typedef long long v2di __attribute__ ((__vector_size__ (16)));
+
+v2di x;
+
+int test1 ()
+{
+  return __builtin_ia32_ptestz128((v2di){0,0}, x == 0);
+}
+
+int test2 ()
+{
+  return __builtin_ia32_ptestz128(x == 0, (v2di){0,0});
+}
+
+/* { dg-final { scan-assembler-not "ptest\[ \\t\]+%" } } */
+/* { dg-final { scan-assembler-not "pcmpeq" } } */

Reply via email to