Perhaps the easiest way to demonstrate that tree-ssa's isel pass isn't a
replacement for GCC's RTL expansion pass is with a concrete example where
ISEL hurts performance (on x86_64), which should be unsurprising given
that gimple-isel.cc doesn't mention rtx_costs (or cost) even once.
Consider the example:
void foo(char c[])
{
  for (int i = 0; i < 16; i++)
    c[i] = c[i] != 'a';
}
Currently, when compiled with -O2 -mavx2, this generates:
foo:
        movl    $1633771873, %eax
        vpxor   %xmm1, %xmm1, %xmm1
        vmovd   %eax, %xmm0
        vpbroadcastd    %xmm0, %xmm0
        vpcmpeqb        (%rdi), %xmm0, %xmm0
        vpcmpeqb        %xmm1, %xmm0, %xmm0
        vpcmpeqd        %xmm1, %xmm1, %xmm1
        vpabsb  %xmm1, %xmm1
        vpand   %xmm1, %xmm0, %xmm0
        vmovdqu %xmm0, (%rdi)
        ret
With the attached patch, applied on top of the previously posted
https://gcc.gnu.org/pipermail/gcc-patches/2026-February/708351.html,
we generate the improved sequence:
        movl    $1633771873, %eax
        vpxor   %xmm1, %xmm1, %xmm1
        vmovd   %eax, %xmm0
        vpbroadcastd    %xmm0, %xmm0
        vpcmpeqb        (%rdi), %xmm0, %xmm0
        vpcmpeqb        %xmm1, %xmm0, %xmm0
        vpabsb  %xmm0, %xmm0
        vmovdqu %xmm0, (%rdi)
        ret
The difference is in how a vector of 0 and -1 mask values is converted
to a vector of 0 and 1 values: instead of ANDing with a constant, as in
"cond & {1,1,1,1...}", we can use (in this case) ABS, or a vector logical
right shift where one is available. Using vpabsb directly is clearly
faster: in the original sequence, merely materializing the constant
"{1,1,1,1,1...}" already costs a vpcmpeqd and a vpabsb before the vpand.
Unfortunately, the i386-expand.cc change (which understands the various
instruction availabilities and their implicit costs) is insufficient on
its own, because isel's gimple_expand_vec_cond_expr open-codes these
special cases of VEC_COND_EXPR itself, rather than lowering them to
IFN_VCOND_MASK and letting expand or the target backend decide on the
best possible implementation. The patch removes these premature
optimizations (the root of all evil); a schematic GIMPLE before/after is
sketched below.

As an aside, I suspect that one cause for confusion is the poor naming;
the "isel" pass has little to do with "instruction selection", so perhaps
internal-fn-lowering or something similar would be a better name. Even
the comment at the top of gimple-isel.cc describes it as "Schedule GIMPLE
vector statements". Perhaps things will improve once tree-ssa has a way
of querying the backend for instruction costs, but until then RTL
expansion makes far more sense.
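To make the lowering problem concrete, here is roughly what happens for
the example above at the GIMPLE level. This is purely schematic (the SSA
names and exact constants are invented for illustration; real dumps will
differ):

  /* Today isel itself commits to the AND before expand ever sees the
     statement:  */
  mask_1 = vect_c_2 != { 97, 97, 97, ... };
  tmp_3 = VIEW_CONVERT_EXPR<vector(16) signed char>(mask_1);
  vect_4 = tmp_3 & { 1, 1, 1, ... };

  /* With the patch, isel always lowers the VEC_COND_EXPR to the mask
     internal function, and the AND vs. ABS vs. shift decision is left
     to RTL expansion and the backend:  */
  vect_4 = .VCOND_MASK (mask_1, { 1, 1, 1, ... }, { 0, 0, 0, ... });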
This patch has been tested (on top of the patch mentioned above) on
x86_64-pc-linux-gnu with make bootstrap and make -k check, both with
and without --target_board=unix{-m32}, with no new failures.

Thoughts? Are both patches OK for stage1?
2026-02-17  Roger Sayle  <[email protected]>

gcc/ChangeLog
	* config/i386/i386-expand.cc (ix86_expand_sse_movcc): Optimize
	the case where op_false is a vector of zeros and op_true is a
	vector of ones, using either vector logical right shifts or
	vector ABS.
	* gimple-isel.cc (gimple_expand_vec_cond_expr): Always lower
	VEC_COND_EXPR to IFN_VCOND_MASK.  Remove the "optimization" of
	special cases, as these are best performed (by the backend)
	during RTL expansion.
Roger
--
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index a82bb4399c9..94d2e76c162 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -4423,8 +4423,103 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
}
else if (op_false == CONST0_RTX (mode))
{
- x = expand_simple_binop (mode, AND, cmp, op_true,
- dest, 1, OPTAB_DIRECT);
+ x = NULL_RTX;
+ if (op_true == CONST1_RTX (mode))
+ {
+ switch (mode)
+ {
+ case E_V16QImode:
+ if (TARGET_SSE2)
+ {
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_absv16qi2 (x, cmp));
+ }
+ break;
+ case E_V32QImode:
+ if (TARGET_AVX2)
+ {
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_absv32qi2 (x, cmp));
+ }
+ break;
+ case E_V64QImode:
+ if (TARGET_AVX512BW)
+ {
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_absv64qi2 (x, cmp));
+ }
+ break;
+ case E_V8HImode:
+ if (TARGET_SSE2)
+ {
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_lshrv8hi3 (x, cmp, GEN_INT (15)));
+ }
+ break;
+ case E_V16HImode:
+ if (TARGET_AVX2)
+ {
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_lshrv16hi3 (x, cmp, GEN_INT (15)));
+ }
+ break;
+ case E_V32HImode:
+ if (TARGET_AVX512BW)
+ {
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_lshrv32hi3 (x, cmp, GEN_INT (15)));
+ }
+ break;
+ case E_V4SImode:
+ if (TARGET_SSE2)
+ {
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_lshrv4si3 (x, cmp, GEN_INT (31)));
+ }
+ break;
+ case E_V8SImode:
+ if (TARGET_AVX2)
+ {
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_lshrv8si3 (x, cmp, GEN_INT (31)));
+ }
+ break;
+ case E_V16SImode:
+ if (TARGET_AVX512F)
+ {
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_lshrv16si3 (x, cmp, GEN_INT (31)));
+ }
+ break;
+ case E_V2DImode:
+ if (TARGET_SSE2)
+ {
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_lshrv2di3 (x, cmp, GEN_INT (63)));
+ }
+ break;
+ case E_V4DImode:
+ if (TARGET_AVX2)
+ {
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_lshrv4di3 (x, cmp, GEN_INT (63)));
+ }
+ break;
+ case E_V8DImode:
+ if (TARGET_AVX512F)
+ {
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_lshrv8di3 (x, cmp, GEN_INT (63)));
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!x)
+ x = expand_simple_binop (mode, AND, cmp, op_true,
+ dest, 1, OPTAB_DIRECT);
if (x != dest)
emit_move_insn (dest, x);
return;
diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc
index b193e27b183..c3aa26b1280 100644
--- a/gcc/gimple-isel.cc
+++ b/gcc/gimple-isel.cc
@@ -173,23 +173,19 @@ gimple_expand_vec_set_extract_expr (struct function *fun,
static gimple *
gimple_expand_vec_cond_expr (gimple_stmt_iterator *gsi)
{
- tree lhs, op0a = NULL_TREE;
- enum tree_code code;
- enum tree_code tcode;
-
/* Only consider code == GIMPLE_ASSIGN. */
gassign *stmt = dyn_cast<gassign *> (gsi_stmt (*gsi));
if (!stmt)
return NULL;
- code = gimple_assign_rhs_code (stmt);
+ enum tree_code code = gimple_assign_rhs_code (stmt);
if (code != VEC_COND_EXPR)
return NULL;
tree op0 = gimple_assign_rhs1 (stmt);
tree op1 = gimple_assign_rhs2 (stmt);
tree op2 = gimple_assign_rhs3 (stmt);
- lhs = gimple_assign_lhs (stmt);
+ tree lhs = gimple_assign_lhs (stmt);
machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
/* Lower mask typed, non-vector mode VEC_COND_EXPRs to bitwise operations.
@@ -211,97 +207,7 @@ gimple_expand_vec_cond_expr (gimple_stmt_iterator *gsi)
return gimple_build_assign (lhs, tem3);
}
- bool can_compute_op0 = true;
gcc_assert (!COMPARISON_CLASS_P (op0));
- if (TREE_CODE (op0) == SSA_NAME)
- {
- gassign *def_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (op0));
- if (def_stmt)
- {
- tcode = gimple_assign_rhs_code (def_stmt);
- op0a = gimple_assign_rhs1 (def_stmt);
-
- tree op0_type = TREE_TYPE (op0);
- tree op0a_type = TREE_TYPE (op0a);
- if (TREE_CODE_CLASS (tcode) == tcc_comparison)
- can_compute_op0 = expand_vec_cmp_expr_p (op0a_type, op0_type,
- tcode);
- gcc_assert (can_compute_op0);
-
- if (can_compute_op0
- && TYPE_MODE (TREE_TYPE (lhs)) == TYPE_MODE (TREE_TYPE (op0)))
- {
- /* Assuming c = x CMP y. */
- bool op1_minus_onep = integer_minus_onep (op1);
- bool op2_zerop = integer_zerop (op2);
- tree vtype = TREE_TYPE (lhs);
- machine_mode vmode = TYPE_MODE (vtype);
- /* Try to fold r = c ? -1 : 0 to r = c. */
- if (op1_minus_onep && op2_zerop)
- {
- tree conv_op = build1 (VIEW_CONVERT_EXPR, vtype, op0);
- return gimple_build_assign (lhs, conv_op);
- }
- /* Try to fold r = c ? -1 : z to r = c | z, or
- r = c ? c : z. */
- if (op1_minus_onep)
- {
- tree conv_op = build1 (VIEW_CONVERT_EXPR, vtype, op0);
- tree new_op1 = make_ssa_name (vtype);
- gassign *new_stmt = gimple_build_assign (new_op1, conv_op);
- gsi_insert_seq_before (gsi, new_stmt, GSI_SAME_STMT);
- if (optab_handler (ior_optab, vmode) != CODE_FOR_nothing)
- /* r = c | z */
- return gimple_build_assign (lhs, BIT_IOR_EXPR, new_op1,
- op2);
- /* r = c ? c : z */
- op1 = new_op1;
- }
- /* Try to fold r = c ? z : 0 to r = c & z, or
- r = c ? z : c. */
- else if (op2_zerop)
- {
- tree conv_op = build1 (VIEW_CONVERT_EXPR, vtype, op0);
- tree new_op2 = make_ssa_name (vtype);
- gassign *new_stmt = gimple_build_assign (new_op2, conv_op);
- gsi_insert_seq_before (gsi, new_stmt, GSI_SAME_STMT);
- if (optab_handler (and_optab, vmode) != CODE_FOR_nothing)
- /* r = c | z */
- return gimple_build_assign (lhs, BIT_AND_EXPR, new_op2,
- op1);
- /* r = c ? z : c */
- op2 = new_op2;
- }
- bool op1_zerop = integer_zerop (op1);
- bool op2_minus_onep = integer_minus_onep (op2);
- /* Try to fold r = c ? 0 : z to r = .BIT_ANDN (z, c). */
- if (op1_zerop
- && (direct_internal_fn_supported_p (IFN_BIT_ANDN, vtype,
- OPTIMIZE_FOR_BOTH)))
- {
- tree conv_op = build1 (VIEW_CONVERT_EXPR, vtype, op0);
- tree new_op = make_ssa_name (vtype);
- gassign *new_stmt = gimple_build_assign (new_op, conv_op);
- gsi_insert_seq_before (gsi, new_stmt, GSI_SAME_STMT);
- return gimple_build_call_internal (IFN_BIT_ANDN, 2, op2,
- new_op);
- }
- /* Try to fold r = c ? z : -1 to r = .BIT_IORN (z, c). */
- else if (op2_minus_onep
- && (direct_internal_fn_supported_p (IFN_BIT_IORN, vtype,
- OPTIMIZE_FOR_BOTH)))
- {
- tree conv_op = build1 (VIEW_CONVERT_EXPR, vtype, op0);
- tree new_op = make_ssa_name (vtype);
- gassign *new_stmt = gimple_build_assign (new_op, conv_op);
- gsi_insert_seq_before (gsi, new_stmt, GSI_SAME_STMT);
- return gimple_build_call_internal (IFN_BIT_IORN, 2, op1,
- new_op);
- }
- }
- }
- }
-
gcc_assert (VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (op0)));
gcc_assert (get_vcond_mask_icode (mode, TYPE_MODE (TREE_TYPE (op0)))
!= CODE_FOR_nothing);