In vect_build_slp_tree_1, when checking whether reduction operands at
different positions can be swapped, only tree_code operations (e.g.
PLUS_EXPR) were recognized as commutative. Internal functions produced
by if-conversion (e.g. .COND_ADD, .COND_SUB) were not handled, causing
"different reduc_idx" failures when the reduction operand appeared at
different commutative positions across SLP lanes.
This patch extends the commutative swap recognition to internal
functions using first_commutative_argument() to identify the swappable
operand pair.
This improves 731.astcenc_r by 7.1% on EMR and 2.48% on Znver5.
gcc/ChangeLog:
* tree-vect-slp.cc (vect_build_slp_tree_1): Allow commutative
operand swap for internal functions (e.g. .COND_ADD) in SLP
reduction matching, not just tree codes.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/slp-reduc-15.c: New test.
Co-authored-by: Hongtao Liu <[email protected]>
---
gcc/testsuite/gcc.dg/vect/slp-reduc-15.c | 81 ++++++++++++++++++++++++
gcc/tree-vect-slp.cc | 9 ++-
2 files changed, 89 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/slp-reduc-15.c
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-15.c
b/gcc/testsuite/gcc.dg/vect/slp-reduc-15.c
new file mode 100644
index 00000000000..4745f85511b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-15.c
@@ -0,0 +1,81 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-additional-options "-fgimple -march=x86-64-v3" { target x86_64-*-* } }
*/
+
+/* Test that SLP reduction vectorization handles commutative operand swap
+ for .COND_ADD in multi-lane SLP where the reduction operand appears
+ at arg1 in some lanes and arg2 in others. If-conversion can generate
+ COND_ADD with mixed reduction operand positions.
+
+ Without the IFN commutative swap fix, the multi-lane SLP group
+ would be rejected with "different reduc_idx N instead of M". */
+
+float cond_arr[1024];
+float data_a[1024];
+float data_b[1024];
+float data_c[1024];
+float data_d[1024];
+
+void __GIMPLE (ssa, guessed_local(16535624), startwith("loop"))
+foo (float * restrict p0, float * restrict p1,
+ float * restrict p2, float * restrict p3)
+{
+ float s0;
+ float s1;
+ float s2;
+ float s3;
+ int i;
+ float _a;
+ float _b;
+ float _c;
+ float _d;
+ float _5;
+ _Bool _cond;
+ unsigned int _30;
+ unsigned int _31;
+
+ __BB(2, guessed_local(16535624)):
+ goto __BB3(precise(134217728));
+
+ __BB(3, loop_header(1), guessed_local(1057206200)):
+ s0_22 = __PHI (__BB5: s0_10, __BB2: 0.0f);
+ s1_24 = __PHI (__BB5: s1_11, __BB2: 0.0f);
+ s2_26 = __PHI (__BB5: s2_12, __BB2: 0.0f);
+ s3_28 = __PHI (__BB5: s3_13, __BB2: 0.0f);
+ i_27 = __PHI (__BB5: i_19, __BB2: 0);
+ _31 = __PHI (__BB5: _30, __BB2: 1024u);
+ _5 = cond_arr[i_27];
+ _cond = _5 > 0.0f;
+ _a = data_a[i_27];
+ _b = data_b[i_27];
+ _c = data_c[i_27];
+ _d = data_d[i_27];
+ /* s0, s1: reduction operand at position 1 (arg1) → reduc_idx=1 */
+ s0_10 = .COND_ADD (_cond, s0_22, _a, s0_22);
+ s1_11 = .COND_ADD (_cond, s1_24, _b, s1_24);
+ /* s2, s3: reduction operand at position 2 (arg2) → reduc_idx=2 */
+ s2_12 = .COND_ADD (_cond, _c, s2_26, s2_26);
+ s3_13 = .COND_ADD (_cond, _d, s3_28, s3_28);
+ i_19 = i_27 + 1;
+ _30 = _31 - 1u;
+ if (_30 != 0u)
+ goto __BB5(guessed(132118446));
+ else
+ goto __BB4(guessed(2099282));
+
+ __BB(5, guessed_local(1040670576)):
+ goto __BB3(precise(134217728));
+
+ __BB(4, guessed_local(16535624)):
+ *p0_40(D) = s0_10;
+ *p1_41(D) = s1_11;
+ *p2_42(D) = s2_12;
+ *p3_43(D) = s3_13;
+ return;
+}
+
+/* With the IFN commutative swap fix, these 4 reductions should be
+ vectorized using SLP despite different reduc_idx values (1 vs 2). */
+/* { dg-final { scan-tree-dump "swapped operands to match def types in" "vect"
{ target x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" {
target x86_64-*-* } } } */
+
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 7be8ca03763..e6f79ce1422 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1401,7 +1401,14 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char
*swap,
&& STMT_VINFO_REDUC_IDX (stmt_info) != -1
&& rhs_code.is_tree_code ()
&& commutative_tree_code (tree_code (rhs_code))
- && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info)))
+ && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info))
+ && !(first_reduc_idx != -1
+ && STMT_VINFO_REDUC_IDX (stmt_info) != -1
+ && rhs_code.is_internal_fn ()
+ && first_commutative_argument (internal_fn (rhs_code)) >= 0
+ && (first_reduc_idx + STMT_VINFO_REDUC_IDX (stmt_info)
+ == 2 * first_commutative_argument (
+ internal_fn (rhs_code)) + 1)))
{
if (dump_enabled_p ())
{
--
2.34.1