On Mon, Mar 28, 2022 at 12:50 AM Jan Hubicka via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
> as seen on TSVC and SPEC2017, the Zen3 gather instruction is a win only for
> vectors with 8 elements.  At the time I was implementing the tuning, the
> vectorizer did not know how to open-code gather, and thus it was still a win
> to enable it for shorter vectors, but this has changed.
>
> The following are results on Zen3 machine:
>
> | Benchmark       | Master | Rate | Patch | Rate |     % |
> |-----------------+--------+------+-------+------+-------|
> | 500.perlbench_r |    246 | 6.47 |   250 | 6.36 |  1.63 |
> | 502.gcc_r       |    215 | 6.59 |   215 | 6.59 |  0.00 |
> | 505.mcf_r       |    299 | 5.40 |   299 | 5.41 |  0.00 |
> | 520.omnetpp_r   |    250 | 5.25 |   249 | 5.27 | -0.40 |
> | 523.xalancbmk_r |    197 | 5.37 |   195 | 5.43 | -1.02 |
> | 525.x264_r      |    160 | 11.0 |   160 | 11.0 |  0.00 |
> | 531.deepsjeng_r |    242 | 4.73 |   240 | 4.78 | -0.83 |
> | 541.leela_r     |    353 | 4.70 |   355 | 4.67 |  0.57 |
> | 548.exchange2_r |    146 | 17.9 |   146 | 17.9 |  0.00 |
> | 557.xz_r        |    290 | 3.72 |   291 | 3.71 |  0.34 |
> |-----------------+--------+------+-------+------+-------|
> | Geomean         |        | 6.34 |       | 6.34 |       |
>
> | Benchmark       | Master | Rate | Patch | Rate |      % |
> |-----------------+--------+------+-------+------+--------|
> | 503.bwaves_r    |    130 | 77.2 |   130 | 77.1 |   0.00 |
> | 507.cactuBSSN_r |    246 | 5.16 |   245 | 5.17 |  -0.41 |
> | 508.namd_r      |    163 | 5.84 |   162 | 5.85 |  -0.61 |
> | 510.parest_r    |    277 | 9.45 |   218 | 12.0 | -21.30 |
> | 511.povray_r    |    286 | 8.17 |   281 | 8.31 |  -1.75 |
> | 519.lbm_r       |    138 | 7.62 |   137 | 7.67 |  -0.72 |
> | 521.wrf_r       |    166 | 13.5 |   167 | 13.5 |   0.60 |
> | 526.blender_r   |    214 | 7.13 |   215 | 7.10 |   0.47 |
> | 527.cam4_r      |    176 | 9.92 |   173 | 10.1 |  -1.70 |
> | 538.imagick_r   |    306 | 8.13 |   315 | 7.90 |   2.94 |
> | 544.nab_r       |    199 | 8.46 |   199 | 8.44 |   0.00 |
> | 549.fotonik3d_r |    254 | 15.4 |   243 | 16.1 |  -4.33 |
> | 554.roms_r      |    210 | 7.57 |   210 | 7.58 |   0.00 |
> |-----------------+--------+------+-------+------+--------|
> | Geomean         |        | 10.0 |       | 10.3 |        |
>
> So the main wins are on parest and fotonik.  I looked into imagemagick and it
> looks like noise - the benchmarks were run by Martin and the regression did
> not reproduce for me on my zen box.
>
> Bootstrapped/regtested x86_64-linux.  I plan to commit tomorrow if there are
> no complaints.
>
> Honza
>
> gcc/ChangeLog:
>
> 2022-03-28  Jan Hubicka  <hubi...@ucw.cz>
>
>         * config/i386/i386-builtins.cc (ix86_vectorize_builtin_gather): Test
>         TARGET_USE_GATHER_2PARTS and TARGET_USE_GATHER_4PARTS.
>         * config/i386/i386.h (TARGET_USE_GATHER_2PARTS): New macro.
>         (TARGET_USE_GATHER_4PARTS): New macro.
>         * config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): New tune.
>         (X86_TUNE_USE_GATHER_4PARTS): New tune.
>
> diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc
> index 2570501ae7e..4a222c9f2c7 100644
> --- a/gcc/config/i386/i386-builtins.cc
> +++ b/gcc/config/i386/i386-builtins.cc
> @@ -1785,7 +1785,12 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype,
>    bool si;
>    enum ix86_builtins code;
>
> -  if (! TARGET_AVX2 || !TARGET_USE_GATHER)
> +  if (! TARGET_AVX2
> +      || (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), (unsigned)2)

You can write '2u' instead of '(unsigned)2'

> +         ? !TARGET_USE_GATHER_2PARTS
> +         : (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), (unsigned)4)
> +            ? !TARGET_USE_GATHER_4PARTS
> +            : !TARGET_USE_GATHER)))
>      return NULL_TREE;
>
>    if ((TREE_CODE (index_type) != INTEGER_TYPE
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index b92955177fe..363082ba47b 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -390,6 +390,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
>         ix86_tune_features[X86_TUNE_SLOW_PSHUFB]
>  #define TARGET_AVOID_4BYTE_PREFIXES \
>         ix86_tune_features[X86_TUNE_AVOID_4BYTE_PREFIXES]
> +#define TARGET_USE_GATHER_2PARTS \
> +       ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS]
> +#define TARGET_USE_GATHER_4PARTS \
> +       ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS]
>  #define TARGET_USE_GATHER \
>         ix86_tune_features[X86_TUNE_USE_GATHER]
>  #define TARGET_FUSE_CMP_AND_BRANCH_32 \
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index 82ca0ae63ac..09e3cf794db 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -464,7 +464,18 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
>           m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE
>           | m_INTEL)
>
> -/* X86_TUNE_USE_GATHER: Use gather instructions.  */
> +/* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
> +   elements.  */
> +DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
> +         ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ALDERLAKE | m_GENERIC))
> +
> +/* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
> +   elements.  */
> +DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
> +         ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ALDERLAKE | m_GENERIC))
> +
> +/* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 4 or more

8 or more elements?

Otherwise looks OK to me.

> +   elements.  */
>  DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
>           ~(m_ZNVER1 | m_ZNVER2 | m_ALDERLAKE | m_GENERIC))
>

Reply via email to