https://gcc.gnu.org/g:523b01b022f74f73f0e20a40950536c06f538910
commit r16-6988-g523b01b022f74f73f0e20a40950536c06f538910
Author: Richard Biener <[email protected]>
Date:   Thu Jan 22 14:06:50 2026 +0100

    Avoid selecting masked epilogs for in-order reduction vectorization
    
    When masking an in-order reduction we are applying the mask with a
    COND_EXPR followed by an in-order accumulation of all elements,
    including the masked ones.  That makes loop masking not profitable.
    
    Ideally we'd apply this logic to all loops, even when masking is
    selected via --param vect-partial-vector-usage=2, but the current way
    we iterate over modes (and opt out of cost compares) does not allow
    us to iterate over masked vs. non-masked, so that does not work.  I
    plan to fix that for GCC 17; for now this fixes a regression for
    targets opting in to avx512_masked_epilogues.
    
            * config/i386/i386.cc (ix86_vector_costs::finish_cost): Avoid
            selecting masked epilogs for in-order reductions.
    
            * gcc.dg/vect/costmodel/x86_64/costmodel-vect-epil-1.c: New testcase.

Diff:
---
 gcc/config/i386/i386.cc                            | 20 ++++++++
 .../vect/costmodel/x86_64/costmodel-vect-epil-1.c  | 58 ++++++++++++++++++++++
 2 files changed, 78 insertions(+)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index a3d0f7cb6496..42ae9ccb0518 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -26572,6 +26572,12 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
 	  > ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo))))
     m_costs[vect_body] = INT_MAX;
 
+  /* We'd like to avoid using masking if there's an in-order reduction
+     to vectorize because that will also perform in-order adds of
+     masked elements (as neutral value, of course) here, but there
+     is currently no way to indicate to try un-masked with the same
+     mode.  */
+
   bool any_reduc_p = false;
   for (int i = 0; i != X86_REDUC_LAST; i++)
     if (m_num_reduc[i])
@@ -26687,6 +26693,20 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
 	    }
 	}
     }
+  /* Avoid using masking if there's an in-order reduction
+     to vectorize because that will also perform in-order adds of
+     masked elements (as neutral value, of course).  */
+  if (!avoid)
+    {
+      for (auto inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
+	if (SLP_INSTANCE_KIND (inst) == slp_inst_kind_reduc_group
+	    && (vect_reduc_type (loop_vinfo, SLP_INSTANCE_TREE (inst))
+		== FOLD_LEFT_REDUCTION))
+	  {
+	    avoid = true;
+	    break;
+	  }
+    }
   if (!avoid)
     {
       m_suggested_epilogue_mode = loop_vinfo->vector_mode;

diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-vect-epil-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-vect-epil-1.c
new file mode 100644
index 000000000000..5b8c358b2a4a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-vect-epil-1.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-mavx512bw -mtune-ctrl=avx512_masked_epilogues" } */
+
+void test (const unsigned char * __restrict__ pi,
+           const float * __restrict__ blk,
+           int texel_count,
+           float *pp_avg_rgb)
+{
+    float pp_avg_rgb_0 = 0.0f;
+    float pp_avg_rgb_1 = 0.0f;
+    float pp_avg_rgb_2 = 0.0f;
+    float pp_avg_rgb_3 = 0.0f;
+    for (int lane_id = 0; lane_id < texel_count; lane_id++) {
+        unsigned char r_byte = pi[lane_id * 4 + 0];
+        unsigned char g_byte = pi[lane_id * 4 + 1];
+        unsigned char b_byte = pi[lane_id * 4 + 2];
+        unsigned char a_byte = pi[lane_id * 4 + 3];
+
+        float r_float = blk[lane_id * 4 + 0];
+        float g_float = blk[lane_id * 4 + 1];
+        float b_float = blk[lane_id * 4 + 2];
+        float a_float = blk[lane_id * 4 + 3];
+
+        int r_is_zero = (r_byte == 0) ? 1 : 0;
+        int r_in_bounds = (texel_count > lane_id) ? 1 : 0;
+        int r_mask = r_is_zero * (-r_in_bounds);
+        if (r_mask != 0) {
+            pp_avg_rgb_0 += r_float;
+        }
+        int g_is_zero = (g_byte == 0) ? 1 : 0;
+        int g_in_bounds = (texel_count > lane_id) ? 1 : 0;
+        int g_mask = g_is_zero * (-g_in_bounds);
+        if (g_mask != 0) {
+            pp_avg_rgb_1 += g_float;
+        }
+        int b_is_zero = (b_byte == 0) ? 1 : 0;
+        int b_in_bounds = (texel_count > lane_id) ? 1 : 0;
+        int b_mask = b_is_zero * (-b_in_bounds);
+        if (b_mask != 0) {
+            pp_avg_rgb_2 += b_float;
+        }
+        int a_is_zero = (a_byte == 0) ? 1 : 0;
+        int a_in_bounds = (texel_count > lane_id) ? 1 : 0;
+        int a_mask = a_is_zero * (-a_in_bounds);
+        if (a_mask != 0) {
+            pp_avg_rgb_3 += a_float;
+        }
+    }
+    pp_avg_rgb[0] = pp_avg_rgb_0;
+    pp_avg_rgb[1] = pp_avg_rgb_1;
+    pp_avg_rgb[2] = pp_avg_rgb_2;
+    pp_avg_rgb[3] = pp_avg_rgb_3;
+}
+
+/* Even though there's an SLP opportunity, in-order reductions should never
+   use masked epilogs.  */
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using 32 byte vectors" "vect" } } */
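
For illustration (not part of the commit): the commit message's point is that
masking a fold-left reduction only swaps masked-out elements for the neutral
value via a COND_EXPR; the in-order accumulation still issues one dependent
add per lane.  Below is a minimal C sketch of the effective per-vector work.
The names (masked_fold_left_sum, vf, etc.) are illustrative, not GCC
internals.

/* Illustrative sketch only, not GCC code: the effective lowering of one
   masked fold-left float-sum step.  All vf lanes are accumulated in
   order; masked-out lanes contribute the neutral value 0.0f via the
   COND_EXPR, but each lane still costs one dependent scalar add.  */
static float
masked_fold_left_sum (float acc, const float *vals,
                      const unsigned char *mask, int vf)
{
  for (int lane = 0; lane < vf; lane++)
    {
      /* COND_EXPR: select the element or the neutral value.  */
      float elt = mask[lane] ? vals[lane] : 0.0f;
      /* In-order accumulation, executed for every lane, masked or not.  */
      acc += elt;
    }
  return acc;
}

Because every one of the vf adds stays on the dependency chain whether or not
its lane is masked, a masked epilogue cannot be cheaper than an unmasked
narrower-vector one here, which is what the finish_cost change above encodes
by refusing to suggest a masked epilogue mode when a FOLD_LEFT_REDUCTION is
present.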
