https://gcc.gnu.org/g:9a62c1495891032922af5bf9bd1906999cf63605

commit r15-5120-g9a62c1495891032922af5bf9bd1906999cf63605
Author: Richard Biener <rguent...@suse.de>
Date:   Fri Nov 8 11:17:22 2024 +0100

    Add X86_TUNE_AVX512_TWO_EPILOGUES, enable for Zen4 and Zen5
    
    The following adds X86_TUNE_AVX512_TWO_EPILOGUES tuning and directs the
    vectorizer to produce both a vector AVX2 and SSE epilogue for AVX512
    vectorized loops when set.  The tuning is enabled by default for Zen4
    and Zen5 where I benchmarked it to be overall positive on SPEC CPU 2017 both
    in performance and overall code size.  In particular it speeds up
    525.x264_r which with only an AVX2 epilogue ends up in unvectorized code
    at the moment.
    
            * config/i386/i386.cc (ix86_vector_costs::finish_cost): Set
            m_suggested_epilogue_mode according to
            X86_TUNE_AVX512_TWO_EPILOGUES.
            * config/i386/x86-tune.def (X86_TUNE_AVX512_TWO_EPILOGUES): Add.
            Enable for znver4 and znver5.

Diff:
---
 gcc/config/i386/i386.cc      | 12 ++++++++++++
 gcc/config/i386/x86-tune.def |  5 +++++
 2 files changed, 17 insertions(+)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 6ac3a5d55f29..526c9df7618d 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25353,6 +25353,18 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
        && TARGET_AVX256_AVOID_VEC_PERM)
       m_costs[i] = INT_MAX;
 
+  /* When X86_TUNE_AVX512_TWO_EPILOGUES is enabled arrange for both
+     a AVX2 and a SSE epilogue for AVX512 vectorized loops.  */
+  if (loop_vinfo
+      && ix86_tune_features[X86_TUNE_AVX512_TWO_EPILOGUES])
+    {
+      if (GET_MODE_SIZE (loop_vinfo->vector_mode) == 64)
+       m_suggested_epilogue_mode = V32QImode;
+      else if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+              && GET_MODE_SIZE (loop_vinfo->vector_mode) == 32)
+       m_suggested_epilogue_mode = V16QImode;
+    }
+
   vector_costs::finish_cost (scalar_costs);
 }
 
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 6ebb2fd3414e..81dd895ac819 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -597,6 +597,11 @@ DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces",
 DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
          m_SAPPHIRERAPIDS | m_ZNVER4 | m_ZNVER5)
 
+/* X86_TUNE_AVX512_TWO_EPILOGUES: Use two vector epilogues for 512-bit
+   vectorized loops.  */
+DEF_TUNE (X86_TUNE_AVX512_TWO_EPILOGUES, "avx512_two_epilogues",
+         m_ZNVER4 | m_ZNVER5)
+
 /*****************************************************************************/
 /*****************************************************************************/
 /* Historical relics: tuning flags that helps a specific old CPU designs     */

Reply via email to