https://gcc.gnu.org/g:a87cdfd2ca3260126d3c75ddfb5cdea6e721d8d0
commit r17-597-ga87cdfd2ca3260126d3c75ddfb5cdea6e721d8d0 Author: Roger Sayle <[email protected]> Date: Tue May 19 07:29:08 2026 -0400 i386: Optimize ptestz(x,-1) as ptestz(x,x) on x86 This patch, inspired by PR target/90483 and libstdc++/118416, implements some RTL expansion-time simplifications of ptest. A common idiom for testing a vector against zero is to use ptestz(mask,-1). Alas the code generated for this is suboptimal, requiring materialization of an all_ones vector. Given that ptestz(x,y) is defined as (x & y) == 0, an equivalent form is ptestz(mask,mask), saving an instruction (if ~0 isn't available). Consider the function: typedef long long v2di __attribute__ ((__vector_size__ (16))); int foo (v2di x) { return __builtin_ia32_ptestz128(x,~(v2di){0,0}); } with -O2 -mavx2, GCC currently generates: foo: vpcmpeqd %xmm1, %xmm1, %xmm1 xorl %eax, %eax vptest %xmm1, %xmm0 sete %al ret with this patch, it now generates: foo: xorl %eax, %eax vptest %xmm0, %xmm0 sete %al ret 2026-05-19 Roger Sayle <[email protected]> gcc/ChangeLog PR target/90483 PR libstdc++/118416 * config/i386/i386-expand.cc (ix86_expand_sse_ptest): Refactor with optimizations for PTESTZ*, PTESTC* and PTESTNZC*, including transforming ptestz(x,-1) into ptestz(x,x). gcc/testsuite/ChangeLog PR target/90483 PR libstdc++/118416 * gcc.target/i386/sse4_1-ptest-8.c: New test case. * gcc.target/i386/sse4_1-ptest-9.c: Likewise. 
Diff: --- gcc/config/i386/i386-expand.cc | 78 ++++++++++++++++++++------ gcc/testsuite/gcc.target/i386/sse4_1-ptest-8.c | 21 +++++++ gcc/testsuite/gcc.target/i386/sse4_1-ptest-9.c | 21 +++++++ 3 files changed, 103 insertions(+), 17 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 01cff86d20aa..e278b02ffefd 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -11999,35 +11999,79 @@ ix86_expand_sse_ptest (const struct builtin_description *d, tree exp, machine_mode mode0 = insn_data[d->icode].operand[0].mode; machine_mode mode1 = insn_data[d->icode].operand[1].mode; enum rtx_code comparison = d->comparison; - - /* ptest reg, reg sets the carry flag. */ - if (comparison == LTU - && (d->code == IX86_BUILTIN_PTESTC - || d->code == IX86_BUILTIN_PTESTC256) - && rtx_equal_p (op0, op1)) - { - if (!target) - target = gen_reg_rtx (SImode); - emit_move_insn (target, const1_rtx); - return target; - } + rtx result = NULL_RTX; if (VECTOR_MODE_P (mode0)) op0 = safe_vector_operand (op0, mode0); if (VECTOR_MODE_P (mode1)) op1 = safe_vector_operand (op1, mode1); - target = gen_reg_rtx (SImode); - emit_move_insn (target, const0_rtx); - target = gen_rtx_SUBREG (QImode, target, 0); + switch (d->code) + { + case IX86_BUILTIN_PTESTZ: + case IX86_BUILTIN_PTESTZ256: + // Returns (OP0 & OP1) == 0 + if (rtx_equal_p (op0, CONST0_RTX (mode0)) + || rtx_equal_p (op1, CONST0_RTX (mode1))) + result = const1_rtx; + else if (rtx_equal_p (op0, CONSTM1_RTX (mode0))) + { + op1 = force_reg (mode1, op1); + op0 = op1; + } + else if (rtx_equal_p (op1, CONSTM1_RTX (mode1))) + { + op0 = force_reg (mode0, op0); + op1 = op0; + } + else if (MEM_P (op0) && !MEM_P (op1)) + std::swap (op0, op1); + break; + + case IX86_BUILTIN_PTESTC: + case IX86_BUILTIN_PTESTC256: + // Returns (~OP0 & OP1) == 0 + if (rtx_equal_p (op0, CONSTM1_RTX (mode0)) + || rtx_equal_p (op1, CONST0_RTX (mode1)) + || rtx_equal_p (op0, op1)) + result = const1_rtx; + 
break; + + case IX86_BUILTIN_PTESTNZC: + case IX86_BUILTIN_PTESTNZC256: + // Returns ((OP0 & OP1) != 0) && ((~OP0 & OP1) != 0) + if (rtx_equal_p (op0, CONST0_RTX (mode0)) + || rtx_equal_p (op0, CONSTM1_RTX (mode0)) + || rtx_equal_p (op1, CONST0_RTX (mode1)) + || rtx_equal_p (op0, op1)) + result = const0_rtx; + break; + + default: + break; + } if ((optimize && !register_operand (op0, mode0)) - || !insn_data[d->icode].operand[0].predicate (op0, mode0)) + || !insn_data[d->icode].operand[0].predicate (op0, mode0) + || result) op0 = copy_to_mode_reg (mode0, op0); if ((optimize && !register_operand (op1, mode1)) - || !insn_data[d->icode].operand[1].predicate (op1, mode1)) + || !insn_data[d->icode].operand[1].predicate (op1, mode1) + || result) op1 = copy_to_mode_reg (mode1, op1); + if (result) + { + if (!target) + target = gen_reg_rtx (SImode); + emit_move_insn (target, result); + return target; + } + + target = gen_reg_rtx (SImode); + emit_move_insn (target, const0_rtx); + target = gen_rtx_SUBREG (QImode, target, 0); + pat = GEN_FCN (d->icode) (op0, op1); if (! 
pat) return 0; diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-ptest-8.c b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-8.c new file mode 100644 index 000000000000..600b807defaa --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-8.c @@ -0,0 +1,21 @@ +/* PR target/90483 */ +/* PR libstdc++/118416 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse4.1" } */ + +typedef long long v2di __attribute__ ((__vector_size__ (16))); + +v2di x; + +int test1 () +{ + return __builtin_ia32_ptestz128((v2di){-1,-1}, x == 0); +} + +int test2 () +{ + return __builtin_ia32_ptestz128(x == 0, (v2di){-1,-1}); +} + +/* { dg-final { scan-assembler-times "ptest\[ \\t\]+%" 2 } } */ +/* { dg-final { scan-assembler-not "pcmpeqd" } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-ptest-9.c b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-9.c new file mode 100644 index 000000000000..fe7e710c6fb7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-9.c @@ -0,0 +1,21 @@ +/* PR target/90483 */ +/* PR libstdc++/118416 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse4.1" } */ + +typedef long long v2di __attribute__ ((__vector_size__ (16))); + +v2di x; + +int test1 () +{ + return __builtin_ia32_ptestz128((v2di){0,0}, x == 0); +} + +int test2 () +{ + return __builtin_ia32_ptestz128(x == 0, (v2di){0,0}); +} + +/* { dg-final { scan-assembler-not "ptest\[ \\t\]+%" } } */ +/* { dg-final { scan-assembler-not "pcmpeq" } } */
