https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106343

            Bug ID: 106343
           Summary: Addition with constants is not vectorized by SLP when
                    it includes zero
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: manolis.tsamis at vrull dot eu
  Target Milestone: ---
            Target: aarch64

Created attachment 53316
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=53316&action=edit
Does not vectorize

The following test case:

  /* Reproducer: straight-line (fully unrolled) code over 8 lanes.
     For each lane i it computes
       dst[i] = (uint16_t) (src1[i] - src2[i]) + C[i]
     where C[i] is a per-lane integer constant.  */
  void foo (uint32_t dst[8], uint8_t src1[8], uint8_t src2[8])
  {
    /* The uint8_t operands are promoted to int before the subtraction, so
       the difference may be negative; storing it in a uint16_t wraps it
       modulo 65536.  */
    uint16_t diff_e0 = src1[0] - src2[0];
    uint16_t diff_e1 = src1[1] - src2[1];
    uint16_t diff_e2 = src1[2] - src2[2];
    uint16_t diff_e3 = src1[3] - src2[3];
    uint16_t diff_e4 = src1[4] - src2[4];
    uint16_t diff_e5 = src1[5] - src2[5];
    uint16_t diff_e6 = src1[6] - src2[6];
    uint16_t diff_e7 = src1[7] - src2[7];

    /* Add a constant to every lane and widen the result to 32 bits.
       Every constant here is nonzero; this is the variant that gets
       vectorized.  */
    uint32_t a0 = diff_e0 + 1;
    uint32_t a1 = diff_e1 + 3;
    uint32_t a2 = diff_e2 + 4;
    uint32_t a3 = diff_e3 + 2;
    uint32_t a4 = diff_e4 + 12;
    uint32_t a5 = diff_e5 + 11;
    uint32_t a6 = diff_e6 + 9;
    uint32_t a7 = diff_e7 + 3;

    /* Eight consecutive 32-bit stores to dst[0..7].  */
    dst[0] = a0;
    dst[1] = a1;
    dst[2] = a2;
    dst[3] = a3;
    dst[4] = a4;
    dst[5] = a5;
    dst[6] = a6;
    dst[7] = a7;
  }

produces nicely vectorized code on aarch64:

  ldr     d2, [x2]
  adrp    x3, .LC0
  ldr     d0, [x1]
  ldr     q1, [x3, #:lo12:.LC0]
  usubl   v0.8h, v0.8b, v2.8b
  uaddl   v2.4s, v0.4h, v1.4h
  uaddl2  v0.4s, v0.8h, v1.8h
  stp     q2, q0, [x0]
  ret

But if any of the constants is replaced with zero (in the listing below, the 4
in a2 was replaced, i.e. a2 = diff_e2 + 0), scalar code is produced instead:

  ldrb    w4, [x2, 1]
  ldrb    w8, [x1, 1]
  ldrb    w3, [x2, 3]
  ldrb    w7, [x1, 3]
  sub     w8, w8, w4
  ldrb    w6, [x1, 4]
  and     w8, w8, 65535
  ldrb    w4, [x2, 4]
  sub     w7, w7, w3
  ldrb    w5, [x1, 5]
  and     w7, w7, 65535
  ldrb    w3, [x2, 5]
  sub     w6, w6, w4
  ldrb    w9, [x2, 6]
  and     w6, w6, 65535
  ldrb    w4, [x1, 6]
  sub     w5, w5, w3
  ldrb    w10, [x2, 7]
  and     w5, w5, 65535
  ldrb    w3, [x1, 7]
  sub     w4, w4, w9
  ldrb    w11, [x2]
  and     w4, w4, 65535
  ldrb    w9, [x1]
  sub     w3, w3, w10
  ldrb    w2, [x2, 2]
  add     w8, w8, 3
  ldrb    w10, [x1, 2]
  sub     w9, w9, w11
  and     w1, w3, 65535
  and     w9, w9, 65535
  sub     w10, w10, w2
  add     w3, w5, 11
  add     w2, w4, 9
  add     w7, w7, 2
  add     w6, w6, 12
  add     w1, w1, 3
  add     w4, w9, 1
  and     w5, w10, 65535
  stp     w4, w8, [x0]
  stp     w5, w7, [x0, 8]
  stp     w6, w3, [x0, 16]
  stp     w2, w1, [x0, 24]
  ret

It should be possible to produce the same vectorized code as above, just with a
zero in the corresponding lane of the constant vector. If I understand
correctly, the SLP vectorizer does not take the identity element of addition
into account: x + 0 is simplified to x, so that lane no longer contains an
addition matching the other lanes. This could be improved. The same happens
with subtraction.

I can reproduce this in any recent version of GCC (e.g. >= 10).

Vectorized case: https://godbolt.org/z/5sbb1an89
Scalar case:     https://godbolt.org/z/v8jPT9jEe

Reply via email to