https://gcc.gnu.org/g:5080d98a383de244a7b78ae50456fd41881268c2
commit r16-953-g5080d98a383de244a7b78ae50456fd41881268c2 Author: Pranav Gorantla <pranav.goran...@amd.com> Date: Thu May 29 15:02:24 2025 +0200 i386: Use Shuffles instead of shifts for Reduction in AMD znver4/5 On AMD znver4 and znver5 targets, vpshufd and vpsrldq have latencies of 1 and 2 and throughputs of 4 (2 on znver4) and 2, respectively. It is therefore better to generate shuffles instead of shifts wherever possible. In this patch we try to generate an appropriate shuffle instruction to copy the higher half to the lower half instead of a simple right shift during horizontal vector reduction. gcc/ChangeLog: * config/i386/i386-expand.cc (emit_reduc_half): Use shuffles to generate reduc half for V4SI, similar modes. * config/i386/i386.h (TARGET_SSE_REDUCTION_PREFER_PSHUF): New Macro. * config/i386/x86-tune.def (X86_TUNE_SSE_REDUCTION_PREFER_PSHUF): New tuning. gcc/testsuite/ChangeLog: * gcc.target/i386/reduc-pshuf.c: New test. Diff: --- gcc/config/i386/i386-expand.cc | 27 +++++++++++++++++++++++++++ gcc/config/i386/i386.h | 4 +++- gcc/config/i386/x86-tune.def | 5 +++++ gcc/testsuite/gcc.target/i386/reduc-pshuf.c | 16 ++++++++++++++++ 4 files changed, 51 insertions(+), 1 deletion(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 7fd03c88630f..181e64a86bf6 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -18724,6 +18724,33 @@ emit_reduc_half (rtx dest, rtx src, int i) case E_V8HFmode: case E_V4SImode: case E_V2DImode: + if (TARGET_SSE_REDUCTION_PREFER_PSHUF) + { + if (i == 128) + { + d = gen_reg_rtx (V4SImode); + tem = gen_sse2_pshufd_1 ( + d, force_reg (V4SImode, gen_lowpart (V4SImode, src)), + GEN_INT (2), GEN_INT (3), GEN_INT (2), GEN_INT (3)); + break; + } + else if (i == 64) + { + d = gen_reg_rtx (V4SImode); + tem = gen_sse2_pshufd_1 ( + d, force_reg (V4SImode, gen_lowpart (V4SImode, src)), + GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1)); + break; + } + else if (i == 32) + { + d = gen_reg_rtx (V8HImode); + tem = 
gen_sse2_pshuflw_1 ( + d, force_reg (V8HImode, gen_lowpart (V8HImode, src)), + GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1)); + break; + } + } d = gen_reg_rtx (V1TImode); tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src), GEN_INT (i / 2)); diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index ccc62fc3e7ca..d32d9ad997e6 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -490,7 +490,9 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; #define TARGET_SSE_MOVCC_USE_BLENDV \ ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV] #define TARGET_ALIGN_TIGHT_LOOPS \ - ix86_tune_features[X86_TUNE_ALIGN_TIGHT_LOOPS] + ix86_tune_features[X86_TUNE_ALIGN_TIGHT_LOOPS] +#define TARGET_SSE_REDUCTION_PREFER_PSHUF \ + ix86_tune_features[X86_TUNE_SSE_REDUCTION_PREFER_PSHUF] /* Feature tests against the various architecture variations. */ diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index e6044c6032e4..91cdca7fbfc2 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -572,6 +572,11 @@ DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD, DEF_TUNE (X86_TUNE_SSE_MOVCC_USE_BLENDV, "sse_movcc_use_blendv", ~m_CORE_ATOM) +/* X86_TUNE_SSE_REDUCTION_PREFER_PSHUF: Prefer pshuf to reduce V16QI, + V8HI, V8HF, V4SI, V4FI, V2DI modes when lshr are costlier. 
*/ +DEF_TUNE (X86_TUNE_SSE_REDUCTION_PREFER_PSHUF, + "sse_reduction_prefer_pshuf", m_ZNVER4 | m_ZNVER5) + /*****************************************************************************/ /* AVX instruction selection tuning (some of SSE flags affects AVX, too) */ /*****************************************************************************/ diff --git a/gcc/testsuite/gcc.target/i386/reduc-pshuf.c b/gcc/testsuite/gcc.target/i386/reduc-pshuf.c new file mode 100644 index 000000000000..e46d2bab9c5f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/reduc-pshuf.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=znver5 " } */ + +#define N 32 +#define T short +T +foo (T *a) +{ + T sum = 0; + for (int i = 0; i < N; i++) + sum += a[i]; + return sum; +} + +/* { dg-final { scan-assembler-times "vpsrl" 0 } } */ +/* { dg-final { scan-assembler-times "vpshuf" 3 } } */