https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101637

            Bug ID: 101637
           Summary: #pragma omp for simd defeats VECT_COMPARE_COSTS
                    optimisations
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: rsandifo at gcc dot gnu.org
  Target Milestone: ---
            Target: aarch64*-*-*

Compiling this with -O3 -march=armv8.2-a+sve:

/* Reduced test case for the vectorizer: for each of the first 100
   elements, store the sum of b[i] and c[i] into a[i].  The three arrays
   deliberately use different element types (__INT64_TYPE__,
   __INT32_TYPE__ and __INT16_TYPE__), so the loop mixes element widths.  */
void
foo (__INT64_TYPE__ *a, __INT32_TYPE__ *b, __INT16_TYPE__ *c)
{
/* The pragma is left commented out for the first assembly listing;
   uncomment it to reproduce the second one.
   NOTE(review): presumably -fopenmp or -fopenmp-simd is also needed for
   the pragma to be honoured -- confirm against the original report.  */
//#pragma omp for simd
  for (int i = 0; i < 100; ++i)
    a[i] = b[i] + c[i];
}

gives:

.L2:
        ld1w    z1.s, p0/z, [x1, x3, lsl 2]
        ld1sh   z0.s, p0/z, [x2, x3, lsl 1]
        punpklo p1.h, p0.b
        add     z0.s, z0.s, z1.s
        punpkhi p0.h, p0.b
        sunpklo z1.d, z0.s
        sunpkhi z0.d, z0.s
        st1d    z1.d, p1, [x0, x3, lsl 3]
        st1d    z0.d, p0, [x5, x3, lsl 3]
        add     x3, x3, x6
        whilelo p0.s, w3, w4
        b.any   .L2

whereas uncommenting the pragma gives the considerably uglier loop:

.L2:
        ld1h    z0.h, p0/z, [x2, x3, lsl 1]
        punpklo p1.h, p0.b
        punpkhi p0.h, p0.b
        ld1w    z2.s, p1/z, [x1, x3, lsl 2]
        ld1w    z3.s, p0/z, [x7, x3, lsl 2]
        punpklo p2.h, p1.b
        punpkhi p1.h, p1.b
        sunpklo z1.s, z0.h
        sunpkhi z0.s, z0.h
        add     z1.s, z1.s, z2.s
        add     z0.s, z0.s, z3.s
        sunpklo z2.d, z1.s
        sunpklo z3.d, z0.s
        sunpkhi z1.d, z1.s
        sunpkhi z0.d, z0.s
        st1d    z1.d, p1, [x0, #1, mul vl]
        punpklo p1.h, p0.b
        punpkhi p0.h, p0.b
        st1d    z3.d, p1, [x0, #2, mul vl]
        st1d    z0.d, p0, [x0, #3, mul vl]
        st1d    z2.d, p2, [x0]
        add     x3, x3, x6
        add     x0, x0, x5
        whilelo p0.h, w3, w4
        b.any   .L2

For VECT_COMPARE_COSTS targets, we should probably still consider all
possibilities and pick the “best” vector implementation (ignoring the
comparison with scalar code).

Reply via email to