https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108346

            Bug ID: 108346
           Summary: gather/scatter loops optimized too often for znver4
                    (and other zens)
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

The following two benchmarks test gather/scatter codegen:
s4113.c:

#include <math.h>
#include <malloc.h>

// real_t is not defined anywhere in this file; the typedef below is disabled.
// NOTE(review): presumably the element type is supplied by the build (e.g.
// -Dreal_t=float) so the same source can be timed for several types -- confirm.
//typedef float real_t;
// Base repetition count; main runs the kernel 2*iterations times.
#define iterations 1000000
// Number of elements in each 1-D array.
#define LEN_1D 32000
// Side length of each square 2-D array.
#define LEN_2D 256
// File-scope arrays, so they start out zero-initialized.  Only a, b and c are
// referenced by main in this test case.
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
real_t aa[LEN_2D][LEN_2D];
real_t bb[LEN_2D][LEN_2D];
real_t cc[LEN_2D][LEN_2D];
real_t qq;
/*
 * s4113: indirect load + indirect store kernel.
 *
 * Builds an index table ip and then repeatedly executes
 *     a[ip[i]] = b[ip[i]] + c[i]
 * for the odd indices i of the 1-D arrays.
 *
 * Returns 0 on completion, or 1 if the index table cannot be allocated.
 */
int
main(void)
{
    /* Not used by this kernel; initialized so the value returned below is
       defined instead of indeterminate. */
    real_t x = 0;

    /* ip holds ints, so size the allocation by its own element type.  Sizing
       it by real_t under-allocates whenever real_t is narrower than int. */
    int * __restrict__ ip = malloc(LEN_1D * sizeof *ip);
    if (!ip)
        return 1;

    /* Each group of five consecutive entries receives a permutation of its
       own five indices.  LEN_1D is a multiple of 5, so the last group ends
       exactly at LEN_1D-1. */
    for (int i = 0; i < LEN_1D; i = i+5){
        (ip)[i]   = (i+4);
        (ip)[i+1] = (i+2);
        (ip)[i+2] = (i);
        (ip)[i+3] = (i+3);
        (ip)[i+4] = (i+1);
    }
    for (int nl = 0; nl < 2*iterations; nl++) {
        /* Kernel under test: load from b and store to a through ip. */
        for (int i = 1; i < LEN_1D; i += 2) {
            a[ip[i]] = b[ip[i]] + c[i];
        }
        /* Empty asm with a "memory" clobber: a compiler-level barrier, so
           memory contents are treated as changed between repetitions. */
        asm("":::"memory");
    }

    free(ip);
    return x;
}


s4115.c:
#include <math.h>
#include <malloc.h>

// NOTE(review): real_t is not defined anywhere in this file; presumably the
// element type is supplied by the build (e.g. -Dreal_t=float) -- confirm.
// Base repetition count; main runs the kernel 2*iterations times.
#define iterations 1000000
// Number of elements in each 1-D array.
#define LEN_1D 32000
// Side length of each square 2-D array.
#define LEN_2D 256
// File-scope arrays, so they start out zero-initialized.  Only a and b are
// referenced by main in this test case.
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
real_t aa[LEN_2D][LEN_2D];
real_t bb[LEN_2D][LEN_2D];
real_t cc[LEN_2D][LEN_2D];
real_t qq;
/*
 * s4115: indirect load feeding a sum reduction.
 *
 * Builds an index table ip and then repeatedly accumulates
 *     x += a[i] * b[ip[i]]
 * for the odd indices i of the 1-D arrays.
 *
 * Returns the accumulated sum converted to int, or 1 if the index table
 * cannot be allocated.
 */
int
main(void)
{
    /* Reduction accumulator.  It must start from a defined value: the kernel
       only ever updates it with +=. */
    real_t x = 0;

    /* ip holds ints, so size the allocation by its own element type.  Sizing
       it by real_t under-allocates whenever real_t is narrower than int. */
    int * __restrict__ ip = malloc(LEN_1D * sizeof *ip);
    if (!ip)
        return 1;

    /* Each group of five consecutive entries receives a permutation of its
       own five indices.  LEN_1D is a multiple of 5, so the last group ends
       exactly at LEN_1D-1. */
    for (int i = 0; i < LEN_1D; i = i+5){
        (ip)[i]   = (i+4);
        (ip)[i+1] = (i+2);
        (ip)[i+2] = (i);
        (ip)[i+3] = (i+3);
        (ip)[i+4] = (i+1);
    }
    for (int nl = 0; nl < 2*iterations; nl++) {
        /* Kernel under test: load from b through ip, accumulate into x. */
        for (int i = 1; i < LEN_1D; i += 2) {
            x += a[i] * b[ip[i]];
        }
        /* Empty asm with a "memory" clobber: a compiler-level barrier, so
           memory contents are treated as changed between repetitions. */
        asm("":::"memory");
    }

    free(ip);
    /* Returning x uses the reduction result, so the accumulation is not
       dead code. */
    return x;
}

On znver4 I get the following times when disabling/enabling vectorization and
disabling/enabling gather/scatter use:

                                         runtime
type      optimization    operation  scalar nogather gather parts instruction
char      avx256_optimal  load+store 14.23  N/A      N/A
char      avx256_optimal  load       14.25  N/A      N/A
char      ^avx256_optimal load+store 14.02  N/A      N/A
char      ^avx256_optimal load       14.25  N/A      N/A
short     avx256_optimal  load+store*14.23  N/A      N/A
short     avx256_optimal  load      *14.23  N/A      N/A
short     ^avx256_optimal load+store 15.22  N/A      N/A
short     ^avx256_optimal load       14.23  N/A      N/A
int       avx256_optimal  load+store*16.51  27.66    25.96  8     vpgatherdd ymm,vpscatterdd ymm
int       avx256_optimal  load       14.13  13.17   *12.71  8     vpgatherdd ymm
int       ^avx256_optimal load+store*16.57  33.25    26.06  16    vpgatherdd zmm,vpscatterdd zmm
int       ^avx256_optimal load       14.14  16.81   *13.63  16    vpgatherdd zmm
long      avx256_optimal  load+store*20.59  20.66    32.03  4     vpgatherdq zmm,vpscatterdq zmm
long      avx256_optimal  load       15.36 *15.36    15.82  4     vpgatherdq zmm
long      ^avx256_optimal load+store 22.42 *20.96    30.54  8     vpgatherdq zmm,vpscatterdq zmm
long      ^avx256_optimal load      *15.87  16.40    18.68  8     vpgatherdq zmm
float     avx256_optimal  load+store 16.88  27.78    26.08  8     vgatherdps ymm, vscatterdps ymm
float     avx256_optimal  load       26.01 *13.19    13.30  8     vgatherdps ymm
float     ^avx256_optimal load+store*16.89  33.22    26.19  16    vgatherdps zmm, vscatterdps zmm
float     ^avx256_optimal load       26.01  16.61   *13.85  16    vgatherdps zmm
double    avx256_optimal  load+store 21.94 *20.81    31.43  4     vgatherdpd ymm, vscatterdpd ymm
double    avx256_optimal  load       26.01  26.01   *15.20  4     vgatherdpd ymm
double    ^avx256_optimal load+store 21.44 *21.65    30.73  8     vgatherdpd zmm, vscatterdpd zmm
double    ^avx256_optimal load       26.01  26.01   *18.24  8     vgatherdpd zmm


We incorrectly vectorize the int load+store loop, causing a 60% regression.
Vectorizing the avx512 long load loop also seems to be a slight loss, but not
that important.  I will post a patch to disable scatter instructions since they
do not seem to be a win.

Reply via email to