On Mon, Sep 8, 2025 at 10:15 AM liuhongt <hongtao....@intel.com> wrote: > > SLP may take a broadcast as a kind of vec_perm, the patch checks the > permutation index to exclude those false positives. > > > > > so the vectorizer costs something with count == 0? I'll see to fix that, > > > > but this also > > > > means the code should have used m_num_avx256_vec_perm[where] += count. > Changed. > > > > > && (is_a <bb_vec_info> (m_vinfo) > > || SLP_TREE_LANES (node) % nunits == 0) > For the case mentioned in the comments, it's also > SLP_TREE_LANES (node) % nunits == 0 (avx256_avoid_vec_perm-5.c), hence it > can't distinguish avx256_avoid_vec_perm-5.c from avx256_avoid_vec_perm-3.c. > Both of them are "legacy" load permutation in loop vectorization. > > So I just handled is_a <bb_vec_info> (m_vinfo) in the patch, > leaving loop vectorization for a follow-up patch. > > > > the case of SLP_TREE_PERMUTE_P would need to be added separately, > > but those are also costed as kind == vec_perm. A common use-case were > > blends but now that we lower most load permutations to explicit > > SLP permute nodes there are also those when vectorizing loops. > > > > I guess it's reasonable to first handle SLP_TREE_LOAD_PERMUTATION, > > the other case could be done as followup. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ok for trunk?
OK. Thanks, Richard. > gcc/ChangeLog: > > * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): > Check permutation index for vec_perm, don't count it if we > know it's not a cross-lane permutation. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/avx256_avoid_vec_perm.c: Adjust testcase. > * gcc.target/i386/avx256_avoid_vec_perm-2.c: New test. > * gcc.target/i386/avx256_avoid_vec_perm-5.c: New test. > --- > gcc/config/i386/i386.cc | 59 ++++++++++++++++++- > .../gcc.target/i386/avx256_avoid_vec_perm-2.c | 21 +++++++ > .../gcc.target/i386/avx256_avoid_vec_perm-5.c | 24 ++++++++ > .../gcc.target/i386/avx256_avoid_vec_perm.c | 2 +- > 4 files changed, 103 insertions(+), 3 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > index 55c9b16dd38..932e3feedc3 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -26237,8 +26237,63 @@ ix86_vector_costs::add_stmt_cost (int count, > vect_cost_for_stmt kind, > stmt_cost = ix86_default_vector_cost (kind, mode); > > if (kind == vec_perm && vectype > - && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32) > - m_num_avx256_vec_perm[where]++; > + && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32 > + /* BIT_FIELD_REF <vect_**, 64, 0> 0 times vec_perm costs 0 in body. */ > + && count != 0) > + { > + bool real_perm = true; > + unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype); > + > + if (node > + && SLP_TREE_LOAD_PERMUTATION (node).exists () > + /* Loop vectorization will have 4 times vec_perm > + with index as {0, 0, 0, 0}. > + But it actually generates > + vec_perm_expr <vect, vect, 0, 0, 0, 0> > + vec_perm_expr <vect, vect, 1, 1, 1, 1> > + vec_perm_expr <vect, vect, 2, 2, 2, 2> > + Need to be handled separately. 
*/ > + && is_a <bb_vec_info> (m_vinfo)) > + { > + unsigned half = nunits / 2; > + unsigned i = 0; > + bool allsame = true; > + unsigned first = SLP_TREE_LOAD_PERMUTATION (node)[0]; > + bool cross_lane_p = false; > + for (i = 0 ; i != SLP_TREE_LANES (node); i++) > + { > + unsigned tmp = SLP_TREE_LOAD_PERMUTATION (node)[i]; > + /* allsame is just a broadcast. */ > + if (tmp != first) > + allsame = false; > + > + /* 4 times vec_perm with number of lanes multiple of nunits. */ > + tmp = tmp & (nunits - 1); > + unsigned index = i & (nunits - 1); > + if ((index < half && tmp >= half) > + || (index >= half && tmp < half)) > + cross_lane_p = true; > + > + if (!allsame && cross_lane_p) > + break; > + } > + > + if (i == SLP_TREE_LANES (node)) > + real_perm = false; > + } > + > + if (real_perm) > + { > + m_num_avx256_vec_perm[where] += count; > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf (dump_file, "Detected avx256 cross-lane permutation: "); > + if (stmt_info) > + print_gimple_expr (dump_file, stmt_info->stmt, 0, TDF_SLIM); > + fprintf (dump_file, " \n"); > + } > + } > + } > > /* Penalize DFmode vector operations for Bonnell. 
*/ > if (TARGET_CPU_P (BONNELL) && kind == vector_stmt > diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c > b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c > new file mode 100644 > index 00000000000..8d4e641444d > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c > @@ -0,0 +1,21 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=sierraforest -O2 -fdump-tree-slp-details" } */ > +/* { dg-final { scan-tree-dump-times {(?n)Detected avx256 cross-lane > permutation} 1 "slp2" } } */ > + > +void > +foo (double* a, double* __restrict b, int c, int n) > +{ > + a[0] = b[100] * b[2]; > + a[1] = b[100] * b[3]; > + a[2] = b[100] * b[0]; > + a[3] = b[100] * b[1]; > +} > + > +void > +foo1 (double* a, double* __restrict b, int c, int n) > +{ > + a[0] = b[100] * b[0]; > + a[1] = b[100] * b[1]; > + a[2] = b[100] * b[3]; > + a[3] = b[100] * b[2]; > +} > diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c > b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c > new file mode 100644 > index 00000000000..c11bea8c7b3 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c > @@ -0,0 +1,24 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=sierraforest -Ofast" } */ > +/* { dg-final { scan-assembler-not {(?n)vpermpd.*%ymm} } } */ > + > +typedef struct { > + unsigned short m1, m2, m3, m4; > +} the_struct_t; > +typedef struct { > + double m1, m2, m3, m4, m5; > +} the_struct2_t; > + > +double bar1 (the_struct2_t*); > + > +double foo (double* k, unsigned int n, the_struct_t* the_struct) { > + unsigned int u; > + the_struct2_t result; > + for (u=0; u < n; u++, k--) { > + result.m1 += (*k)*the_struct[u].m1; > + result.m2 += (*k)*the_struct[u].m2; > + result.m3 += (*k)*the_struct[u].m3; > + result.m4 += (*k)*the_struct[u].m4; > + } > + return bar1 (&result); > +} > diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c > 
b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c > index d4f00b3fb52..e0399041ad9 100644 > --- a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c > +++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c > @@ -13,7 +13,7 @@ foo (void) > b[i*8+0] = a[i*8+0]; > b[i*8+1] = a[i*8+0]; > b[i*8+2] = a[i*8+3]; > - b[i*8+3] = a[i*8+3]; > + b[i*8+3] = a[i*8+5]; > b[i*8+4] = a[i*8+4]; > b[i*8+5] = a[i*8+6]; > b[i*8+6] = a[i*8+4]; > -- > 2.34.1 >