[PATCH v2] Remove SPR/GNR/DMR from avx512_move_by_pieces tune.

liuhongt Tue, 16 Sep 2025 01:04:45 -0700

From: "hongtao.liu" <hongtao....@intel.com>

Update in V2:
Only remove SPR/GNR/DMR from avx512_move_by_pieces.


Align move_max with prefer_vector_width for SPR/GNR/DMR, similar to the
commit below.

commit 6ea25c041964bf63014fcf7bb68fb1f5a0a4e123
Author: liuhongt <hongtao....@intel.com>
Date:   Thu Aug 15 12:54:07 2024 +0800

    Align ix86_{move_max,store_max} with vectorizer.

    When none of mprefer-vector-width, avx256_optimal/avx128_optimal,
    avx256_store_by_pieces/avx512_store_by_pieces is specified, GCC will
    set ix86_{move_max,store_max} as max available vector length except
    for AVX part.

                  if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)
                      && TARGET_EVEX512_P (opts->x_ix86_isa_flags2))
                    opts->x_ix86_move_max = PVW_AVX512;
                  else
                    opts->x_ix86_move_max = PVW_AVX128;

    So for -mavx2, vectorizer will choose 256-bit for vectorization, but
    128-bit is used for struct copy, there could be a potential STLF issue
    due to this "misalign".

gcc/ChangeLog:

        * config/i386/x86-tune.def (X86_TUNE_AVX512_MOVE_BY_PIECES):
        Remove SPR/GNR/DMR.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/pieces-memcpy-18.c: Use -march=znver5
        instead of -march=sapphirerapids.
        * gcc.target/i386/pieces-memcpy-21.c: Use -mtune=znver5
        instead of -mtune=sapphirerapids.
---
 gcc/config/i386/x86-tune.def                     | 5 +++--
 gcc/testsuite/gcc.target/i386/pieces-memcpy-18.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memcpy-21.c | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index a86cbad281c..1debc824afc 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -612,6 +612,8 @@ DEF_TUNE (X86_TUNE_AVX256_AVOID_VEC_PERM,
 /* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX512 ops are split into two AVX256 ops.  */
 DEF_TUNE (X86_TUNE_AVX512_SPLIT_REGS, "avx512_split_regs", m_ZNVER4)
 
+/* It's better to align MOVE_MAX with prefer_vector_width to reduce the
+   risk of STLF stalls (a small store followed by a big load).  */
 /* X86_TUNE_AVX256_MOVE_BY_PIECES: Optimize move_by_pieces with 256-bit
    AVX instructions.  */
 DEF_TUNE (X86_TUNE_AVX256_MOVE_BY_PIECES, "avx256_move_by_pieces",
@@ -625,8 +627,7 @@ DEF_TUNE (X86_TUNE_AVX256_STORE_BY_PIECES, "avx256_store_by_pieces",
 /* X86_TUNE_AVX512_MOVE_BY_PIECES: Optimize move_by_pieces with 512-bit
    AVX instructions.  */
 DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces",
-         m_SAPPHIRERAPIDS | m_GRANITERAPIDS | m_GRANITERAPIDS_D
-         | m_DIAMONDRAPIDS | m_ZNVER4 | m_ZNVER5)
+          m_ZNVER4 | m_ZNVER5)
 
 /* X86_TUNE_AVX512_STORE_BY_PIECES: Optimize store_by_pieces with 512-bit
    AVX instructions.  */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-18.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-18.c
index b15a0db9ff0..b4995ac0598 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-18.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-18.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -march=sapphirerapids" } */
+/* { dg-options "-O2 -march=znver5" } */
 
 extern char *dst, *src;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-21.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-21.c
index ef439f20f74..804a2989d64 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-21.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-21.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mtune=sapphirerapids -march=x86-64 -mavx2" } */
+/* { dg-options "-O2 -mtune=znver5 -march=x86-64 -mavx2" } */
 
 extern char *dst, *src;
 
-- 
2.34.1

[PATCH v2] Remove SPR/GNR/DMR from avx512_move_by_pieces tune.

Reply via email to