https://gcc.gnu.org/g:65ee4bb8ec3d6af1a45b6cc93912608f0d124562
commit r16-5329-g65ee4bb8ec3d6af1a45b6cc93912608f0d124562
Author: Richard Biener <[email protected]>
Date: Thu Nov 6 13:19:35 2025 +0100

    [x86] avoid using masked vector epilogues when no scalar epilog is needed

    The following arranges for avoiding masked vector epilogues when we'll
    eventually arrive at a vector epilogue with VF == 1 which implies no
    scalar epilog will be necessary. This avoids regressing performance in
    OpenColorIO when the avx512_masked_epilogues tuning is enabled. A
    testcase for one example case is shown in PR122573.

            PR tree-optimization/122573
            * config/i386/i386.cc (ix86_vector_costs::finish_cost): Avoid
            using masked epilogues when an SSE epilogue would have a VF of one.
            * gcc.dg/vect/costmodel/x86_64/costmodel-pr122573.c: New testcase.

Diff:
---
 gcc/config/i386/i386.cc                        |  5 ++++
 .../vect/costmodel/x86_64/costmodel-pr122573.c | 30 ++++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 6b6febc88709..8aac0820bc22 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -26609,6 +26609,11 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
   if (loop_vinfo
       && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
       && LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () > 2
+      /* Avoid a masked epilog if cascaded epilogues eventually get us
+         to one with VF 1 as that means no scalar epilog at all.  */
+      && !((GET_MODE_SIZE (loop_vinfo->vector_mode)
+            / LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () == 16)
+           && ix86_tune_features[X86_TUNE_AVX512_TWO_EPILOGUES])
       && ix86_tune_features[X86_TUNE_AVX512_MASKED_EPILOGUES]
       && !OPTION_SET_P (param_vect_partial_vector_usage))
     {
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr122573.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr122573.c
new file mode 100644
index 000000000000..ca3294dca7a4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr122573.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=znver5" } */
+
+struct S {
+  float m_col1[4];
+  float m_col2[4];
+  float m_col3[4];
+  float m_col4[4];
+};
+
+void apply(struct S *s, const float *in, float *out, long numPixels)
+{
+  for (long idx = 0; idx < numPixels; ++idx)
+    {
+      const float r = in[0];
+      const float g = in[1];
+      const float b = in[2];
+      const float a = in[3];
+      out[0] = r*s->m_col1[0] + g*s->m_col2[0] + b*s->m_col3[0] + a*s->m_col4[0];
+      out[1] = r*s->m_col1[1] + g*s->m_col2[1] + b*s->m_col3[1] + a*s->m_col4[1];
+      out[2] = r*s->m_col1[2] + g*s->m_col2[2] + b*s->m_col3[2] + a*s->m_col4[2];
+      out[3] = r*s->m_col1[3] + g*s->m_col2[3] + b*s->m_col3[3] + a*s->m_col4[3];
+      in += 4;
+      out += 4;
+    }
+}
+
+/* Check that we do not use a masked epilog but a SSE one with VF 1
+   (and possibly a AVX2 one as well).  */
+/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using 16 byte vectors and unroll factor 1" "vect" } } */
