This patch, inspired by PR target/90483 and libstdc++/118416, implements
some RTL expansion-time simplifications of ptest. A common idiom for
testing a vector against zero is to use ptestz(mask,-1). Alas the code
generated for this is suboptimal, requiring materialization of an all_ones
vector. Given that ptestz(x,y) is defined as (x & y) == 0, an equivalent
form is ptestz(mask,mask), saving an instruction (if ~0 isn't available).
Consider the function:
typedef long long v2di __attribute__ ((__vector_size__ (16)));
int foo (v2di x)
{
return __builtin_ia32_ptestz128(x,~(v2di){0,0});
}
with -O2 -mavx2, GCC currently generates:
foo: vpcmpeqd %xmm1, %xmm1, %xmm1
xorl %eax, %eax
vptest %xmm1, %xmm0
sete %al
ret
with this patch, it now generates:
foo: xorl %eax, %eax
vptest %xmm0, %xmm0
sete %al
ret
This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures. Ok for mainline?
2026-05-18 Roger Sayle <[email protected]>
gcc/ChangeLog
PR target/90483
PR libstdc++/118416
* config/i386/i386-expand.cc (ix86_expand_sse_ptest): Refactor
with optimizations for PTESTZ*, PTESTC* and PTESTNZC*, including
transforming ptestz(x,-1) into ptestz(x,x).
gcc/testsuite/ChangeLog
PR target/90483
PR libstdc++/118416
* gcc.target/i386/sse4_1-ptest-8.c: New test case.
* gcc.target/i386/sse4_1-ptest-9.c: Likewise.
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index df44a4eb99d..de86d2d24b4 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -11979,35 +11979,79 @@ ix86_expand_sse_ptest (const struct
builtin_description *d, tree exp,
machine_mode mode0 = insn_data[d->icode].operand[0].mode;
machine_mode mode1 = insn_data[d->icode].operand[1].mode;
enum rtx_code comparison = d->comparison;
-
- /* ptest reg, reg sets the carry flag. */
- if (comparison == LTU
- && (d->code == IX86_BUILTIN_PTESTC
- || d->code == IX86_BUILTIN_PTESTC256)
- && rtx_equal_p (op0, op1))
- {
- if (!target)
- target = gen_reg_rtx (SImode);
- emit_move_insn (target, const1_rtx);
- return target;
- }
+ rtx result = NULL_RTX;
if (VECTOR_MODE_P (mode0))
op0 = safe_vector_operand (op0, mode0);
if (VECTOR_MODE_P (mode1))
op1 = safe_vector_operand (op1, mode1);
- target = gen_reg_rtx (SImode);
- emit_move_insn (target, const0_rtx);
- target = gen_rtx_SUBREG (QImode, target, 0);
+ switch (d->code)
+ {
+ case IX86_BUILTIN_PTESTZ:
+ case IX86_BUILTIN_PTESTZ256:
+ // Returns (OP0 & OP1) == 0
+ if (rtx_equal_p (op0, CONST0_RTX (mode0))
+ || rtx_equal_p (op1, CONST0_RTX (mode1)))
+ result = const1_rtx;
+ else if (rtx_equal_p (op0, CONSTM1_RTX (mode0)))
+ {
+ op1 = force_reg (mode1, op1);
+ op0 = op1;
+ }
+ else if (rtx_equal_p (op1, CONSTM1_RTX (mode1)))
+ {
+ op0 = force_reg (mode0, op0);
+ op1 = op0;
+ }
+ else if (MEM_P (op0) && !MEM_P (op1))
+ std::swap (op0, op1);
+ break;
+
+ case IX86_BUILTIN_PTESTC:
+ case IX86_BUILTIN_PTESTC256:
+ // Returns (~OP0 & OP1) == 0
+ if (rtx_equal_p (op0, CONSTM1_RTX (mode0))
+ || rtx_equal_p (op1, CONST0_RTX (mode1))
+ || rtx_equal_p (op0, op1))
+ result = const1_rtx;
+ break;
+
+ case IX86_BUILTIN_PTESTNZC:
+ case IX86_BUILTIN_PTESTNZC256:
+ // Returns ((OP0 & OP1) != 0) && ((~OP0 & OP1) != 0)
+ if (rtx_equal_p (op0, CONST0_RTX (mode0))
+ || rtx_equal_p (op0, CONSTM1_RTX (mode0))
+ || rtx_equal_p (op1, CONST0_RTX (mode1))
+ || rtx_equal_p (op0, op1))
+ result = const0_rtx;
+ break;
+
+ default:
+ break;
+ }
if ((optimize && !register_operand (op0, mode0))
- || !insn_data[d->icode].operand[0].predicate (op0, mode0))
+ || !insn_data[d->icode].operand[0].predicate (op0, mode0)
+ || result)
op0 = copy_to_mode_reg (mode0, op0);
if ((optimize && !register_operand (op1, mode1))
- || !insn_data[d->icode].operand[1].predicate (op1, mode1))
+ || !insn_data[d->icode].operand[1].predicate (op1, mode1)
+ || result)
op1 = copy_to_mode_reg (mode1, op1);
+ if (result)
+ {
+ if (!target)
+ target = gen_reg_rtx (SImode);
+ emit_move_insn (target, result);
+ return target;
+ }
+
+ target = gen_reg_rtx (SImode);
+ emit_move_insn (target, const0_rtx);
+ target = gen_rtx_SUBREG (QImode, target, 0);
+
pat = GEN_FCN (d->icode) (op0, op1);
if (! pat)
return 0;
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-ptest-8.c
b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-8.c
new file mode 100644
index 00000000000..600b807defa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-8.c
@@ -0,0 +1,21 @@
+/* PR target/90483 */
+/* PR libstdc++/118416 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+
+typedef long long v2di __attribute__ ((__vector_size__ (16)));
+
+v2di x;
+
+int test1 ()
+{
+ return __builtin_ia32_ptestz128((v2di){-1,-1}, x == 0);
+}
+
+int test2 ()
+{
+ return __builtin_ia32_ptestz128(x == 0, (v2di){-1,-1});
+}
+
+/* { dg-final { scan-assembler-times "ptest\[ \\t\]+%" 2 } } */
+/* { dg-final { scan-assembler-not "pcmpeqd" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-ptest-9.c
b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-9.c
new file mode 100644
index 00000000000..fe7e710c6fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-9.c
@@ -0,0 +1,21 @@
+/* PR target/90483 */
+/* PR libstdc++/118416 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+
+typedef long long v2di __attribute__ ((__vector_size__ (16)));
+
+v2di x;
+
+int test1 ()
+{
+ return __builtin_ia32_ptestz128((v2di){0,0}, x == 0);
+}
+
+int test2 ()
+{
+ return __builtin_ia32_ptestz128(x == 0, (v2di){0,0});
+}
+
+/* { dg-final { scan-assembler-not "ptest\[ \\t\]+%" } } */
+/* { dg-final { scan-assembler-not "pcmpeq" } } */