On Mon, Mar 28, 2022 at 12:50 AM Jan Hubicka via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > Hi, > as seen on TSVC and Spec2017, the Zen3 gather instruction is a win only for > vectors with 8 elements. At the time I was implementing the tuning, the vectorizer > did not know how to open-code gather and thus it was still a win to enable it > for shorter vectors, but this has changed. > > The following are results on a Zen3 machine: > > | Benchmark | Master | Rate | Patch | Rate | % | > |-----------------+--------+------+-------+------+-------| > | 500.perlbench_r | 246 | 6.47 | 250 | 6.36 | 1.63 | > | 502.gcc_r | 215 | 6.59 | 215 | 6.59 | 0.00 | > | 505.mcf_r | 299 | 5.40 | 299 | 5.41 | 0.00 | > | 520.omnetpp_r | 250 | 5.25 | 249 | 5.27 | -0.40 | > | 523.xalancbmk_r | 197 | 5.37 | 195 | 5.43 | -1.02 | > | 525.x264_r | 160 | 11.0 | 160 | 11.0 | 0.00 | > | 531.deepsjeng_r | 242 | 4.73 | 240 | 4.78 | -0.83 | > | 541.leela_r | 353 | 4.70 | 355 | 4.67 | 0.57 | > | 548.exchange2_r | 146 | 17.9 | 146 | 17.9 | 0.00 | > | 557.xz_r | 290 | 3.72 | 291 | 3.71 | 0.34 | > |-----------------+--------+------+-------+------+-------| > | Geomean | | 6.34 | | 6.34 | | > > | Benchmark | Master | Rate | Patch | Rate | % | > |-----------------+--------+------+-------+------+--------| > | 503.bwaves_r | 130 | 77.2 | 130 | 77.1 | 0.00 | > | 507.cactuBSSN_r | 246 | 5.16 | 245 | 5.17 | -0.41 | > | 508.namd_r | 163 | 5.84 | 162 | 5.85 | -0.61 | > | 510.parest_r | 277 | 9.45 | 218 | 12.0 | -21.30 | > | 511.povray_r | 286 | 8.17 | 281 | 8.31 | -1.75 | > | 519.lbm_r | 138 | 7.62 | 137 | 7.67 | -0.72 | > | 521.wrf_r | 166 | 13.5 | 167 | 13.5 | 0.60 | > | 526.blender_r | 214 | 7.13 | 215 | 7.10 | 0.47 | > | 527.cam4_r | 176 | 9.92 | 173 | 10.1 | -1.70 | > | 538.imagick_r | 306 | 8.13 | 315 | 7.90 | 2.94 | > | 544.nab_r | 199 | 8.46 | 199 | 8.44 | 0.00 | > | 549.fotonik3d_r | 254 | 15.4 | 243 | 16.1 | -4.33 | > | 554.roms_r | 210 | 7.57 | 210 | 7.58 | 0.00 | > 
|-----------------+--------+------+-------+------+--------| > | Geomean | | 10.0 | | 10.3 | | > > So the main wins are on parest and fotonik. I looked into imagemagick and it > looks > like noise - the benchmark was run by Martin and it did not reproduce for me on > my zen box. > > Bootstrapped/regtested x86_64-linux. I plan to commit tomorrow if there are no > complaints. > > Honza > > gcc/ChangeLog: > > 2022-03-28 Jan Hubicka <hubi...@ucw.cz> > > * config/i386/i386-builtins.cc (ix86_vectorize_builtin_gather): Test > TARGET_USE_GATHER_2PARTS and TARGET_USE_GATHER_4PARTS. > * config/i386/i386.h (TARGET_USE_GATHER_2PARTS): New macro. > (TARGET_USE_GATHER_4PARTS): New macro. > * config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): New tune > (X86_TUNE_USE_GATHER_4PARTS): New tune > > diff --git a/gcc/config/i386/i386-builtins.cc > b/gcc/config/i386/i386-builtins.cc > index 2570501ae7e..4a222c9f2c7 100644 > --- a/gcc/config/i386/i386-builtins.cc > +++ b/gcc/config/i386/i386-builtins.cc > @@ -1785,7 +1785,12 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype, > bool si; > enum ix86_builtins code; > > - if (! TARGET_AVX2 || !TARGET_USE_GATHER) > + if (! TARGET_AVX2 > + || (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), (unsigned)2)
You can write '2u' instead of '(unsigned)2'. > + ? !TARGET_USE_GATHER_2PARTS > + : (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), (unsigned)4) > + ? !TARGET_USE_GATHER_4PARTS > + : !TARGET_USE_GATHER))) > return NULL_TREE; > > if ((TREE_CODE (index_type) != INTEGER_TYPE > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > index b92955177fe..363082ba47b 100644 > --- a/gcc/config/i386/i386.h > +++ b/gcc/config/i386/i386.h > @@ -390,6 +390,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; > ix86_tune_features[X86_TUNE_SLOW_PSHUFB] > #define TARGET_AVOID_4BYTE_PREFIXES \ > ix86_tune_features[X86_TUNE_AVOID_4BYTE_PREFIXES] > +#define TARGET_USE_GATHER_2PARTS \ > + ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS] > +#define TARGET_USE_GATHER_4PARTS \ > + ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS] > #define TARGET_USE_GATHER \ > ix86_tune_features[X86_TUNE_USE_GATHER] > #define TARGET_FUSE_CMP_AND_BRANCH_32 \ > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def > index 82ca0ae63ac..09e3cf794db 100644 > --- a/gcc/config/i386/x86-tune.def > +++ b/gcc/config/i386/x86-tune.def > @@ -464,7 +464,18 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, > "avoid_4byte_prefixes", > m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | > m_ALDERLAKE > | m_INTEL) > > -/* X86_TUNE_USE_GATHER: Use gather instructions. */ > +/* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2 > + elements. */ > +DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts", > + ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ALDERLAKE | m_GENERIC)) > + > +/* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4 > + elements. */ > +DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts", > + ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ALDERLAKE | m_GENERIC)) > + > +/* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 4 or more Shouldn't this say "8 or more elements"? Otherwise looks OK to me. > + elements. 
*/ > DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather", > ~(m_ZNVER1 | m_ZNVER2 | m_ALDERLAKE | m_GENERIC)) >