https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99149

            Bug ID: 99149
           Summary: [11 Regression] ICE during vectorization when shared
                    trees contain different complex patterns
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: tnfchris at gcc dot gnu.org
          Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---
            Target: aarch64-*

The following testcase

class a {
  float b;
  float c;

public:
  a(float d, float e) : b(d), c(e) {}
  a operator+(a d) { return a(b + d.b, c + d.c); }
  a operator*(a d) { return a(b * b - c * c, b * c + c * d.b); }
};
int f, g;
class {
  a *h;
  a *i;

public:
  void j() {
    a k = h[0], l = i[g], m = k * i[f];
    i[g] = l + m;
    i[f] = m;
  }
} n;
main() { n.j(); }

crashes with aarch64-none-elf-g++ -w -march=armv8.3-a -O3 -S main.cpp

At -O3 two SLP trees are created, but they partially share a
complex pattern:

   Final SLP tree for instance 0x559a7e0:
   node 0x56f5520 (max_nunits=2, refcnt=2)
   op template: MEM <float> [(struct a *)_16] = _20;
           stmt 0 MEM <float> [(struct a *)_16] = _20;
           stmt 1 MEM <float> [(struct a *)_16 + 4B] = _23;
           children 0x56f5058
   node 0x56f5058 (max_nunits=2, refcnt=3)
   op: VEC_PERM_EXPR
           stmt 0 _20 = _18 - _19;
           stmt 1 _23 = _21 + _22;
           lane permutation { 0[0] 1[1] }
           children 0x56f5410 0x56f5498
   node 0x56f5410 (max_nunits=1, refcnt=1)
   op template: _20 = _18 - _19;
           { }
           children 0x56f50e0 0x56f5278
   node 0x56f50e0 (max_nunits=2, refcnt=3)
   op template: _18 = k$b_4 * k$b_4;
           stmt 0 _18 = k$b_4 * k$b_4;
           stmt 1 _21 = k$b_4 * k$c_5;
           children 0x56f5168 0x56f51f0
   node 0x56f5168 (max_nunits=2, refcnt=2)
   op template: k$b_4 = MEM[(const struct a &)_3].b;
           stmt 0 k$b_4 = MEM[(const struct a &)_3].b;
           stmt 1 k$b_4 = MEM[(const struct a &)_3].b;
           load permutation { 0 0 }
   node 0x56f51f0 (max_nunits=2, refcnt=2)
   op template: k$b_4 = MEM[(const struct a &)_3].b;
           stmt 0 k$b_4 = MEM[(const struct a &)_3].b;
           stmt 1 k$c_5 = MEM[(const struct a &)_3].c;
           load permutation { 0 1 }
   node 0x56f5278 (max_nunits=2, refcnt=3)
   op template: _19 = k$c_5 * k$c_5;
           stmt 0 _19 = k$c_5 * k$c_5;
           stmt 1 _22 = k$c_5 * d$b_17;
           children 0x56f5300 0x56f5388
   node 0x56f5300 (max_nunits=2, refcnt=2)
   op template: k$c_5 = MEM[(const struct a &)_3].c;
           stmt 0 k$c_5 = MEM[(const struct a &)_3].c;
           stmt 1 k$c_5 = MEM[(const struct a &)_3].c;
           load permutation { 1 1 }
   node (external) 0x56f5388 (max_nunits=1, refcnt=1)
           { k$c_5, d$b_17 }
   node 0x56f5498 (max_nunits=1, refcnt=1)
   op template: _23 = _21 + _22;
           { }
           children 0x56f50e0 0x56f5278

The dumps contain two roots, one at node 0x56f5520 (instance 0x559a7e0) and one at node 0x56f4ec0 (instance 0x5763380):

   Final SLP tree for instance 0x5763380:
   node 0x56f4ec0 (max_nunits=2, refcnt=2)
   op template: MEM <float> [(struct a *)_10] = _24;
           stmt 0 MEM <float> [(struct a *)_10] = _24;
           stmt 1 MEM <float> [(struct a *)_10 + 4B] = _25;
           children 0x56f4f48
   node 0x56f4f48 (max_nunits=2, refcnt=2)
   op template: _24 = l$b_11 + _20;
           stmt 0 _24 = l$b_11 + _20;
           stmt 1 _25 = l$c_12 + _23;
           children 0x56f4fd0 0x56f5058
   node 0x56f4fd0 (max_nunits=2, refcnt=2)
   op template: l$b_11 = MEM[(const struct a &)_10].b;
           stmt 0 l$b_11 = MEM[(const struct a &)_10].b;
           stmt 1 l$c_12 = MEM[(const struct a &)_10].c;
           load permutation { 0 1 }
   node 0x56f5058 (max_nunits=2, refcnt=2)
   op: VEC_PERM_EXPR
           stmt 0 _20 = _18 - _19;
           stmt 1 _23 = _21 + _22;
           lane permutation { 0[0] 1[1] }
           children 0x56f5410 0x56f5498
   node 0x56f5410 (max_nunits=1, refcnt=1)
   op template: _20 = _18 - _19;
           { }
           children 0x56f50e0 0x56f5278
   node 0x56f50e0 (max_nunits=2, refcnt=3)
   op template: _18 = k$b_4 * k$b_4;
           stmt 0 _18 = k$b_4 * k$b_4;
           stmt 1 _21 = k$b_4 * k$c_5;
           children 0x56f5168 0x56f51f0
   node 0x56f5168 (max_nunits=2, refcnt=2)
   op template: k$b_4 = MEM[(const struct a &)_3].b;
           stmt 0 k$b_4 = MEM[(const struct a &)_3].b;
           stmt 1 k$b_4 = MEM[(const struct a &)_3].b;
           load permutation { 0 0 }
   node 0x56f51f0 (max_nunits=2, refcnt=2)
   op template: k$b_4 = MEM[(const struct a &)_3].b;
           stmt 0 k$b_4 = MEM[(const struct a &)_3].b;
           stmt 1 k$c_5 = MEM[(const struct a &)_3].c;
           load permutation { 0 1 }
   node 0x56f5278 (max_nunits=2, refcnt=3)
   op template: _19 = k$c_5 * k$c_5;
           stmt 0 _19 = k$c_5 * k$c_5;
           stmt 1 _22 = k$c_5 * d$b_17;
           children 0x56f5300 0x56f5388
   node 0x56f5300 (max_nunits=2, refcnt=2)
   op template: k$c_5 = MEM[(const struct a &)_3].c;
           stmt 0 k$c_5 = MEM[(const struct a &)_3].c;
           stmt 1 k$c_5 = MEM[(const struct a &)_3].c;
           load permutation { 1 1 }
   node (external) 0x56f5388 (max_nunits=1, refcnt=1)
           { k$c_5, d$b_17 }
   node 0x56f5498 (max_nunits=1, refcnt=1)
   op template: _23 = _21 + _22;
           { }
           children 0x56f50e0 0x56f5278


It's a bit hard to see, so let's render the graphs:
https://sketchviz.com/@Mistuke/3f10160785531d484f9eba0de186bf65/5cedb98ad6ee1e083c3bf9574dba1c679851412e

The graph rooted at the store to *_10 matches COMPLEX_FMA, but the one rooted at
the store to *_16 only matches COMPLEX_MUL.

In this case COMPLEX_FMA shouldn't be recognized, because both trees share the
VEC_PERM_EXPR node.

Disabling the COMPLEX_FMA pattern shows that this does then generate the right
code:

main:
        adrp    x1, .LANCHOR0
        add     x2, x1, :lo12:.LANCHOR0
        movi    v0.2s, 0
        mov     w0, 0
        ldr     x4, [x1, #:lo12:.LANCHOR0]
        ldrsw   x3, [x2, 16]
        ldr     x1, [x2, 8]
        ldr     d1, [x4]
        fcmla   v0.2s, v1.2s, v1.2s, #0
        fcmla   v0.2s, v1.2s, v1.2s, #90
        ldrsw   x2, [x2, 20]
        ldr     d1, [x1, x3, lsl 3]
        fadd    v1.2s, v1.2s, v0.2s
        str     d1, [x1, x3, lsl 3]
        str     d0, [x1, x2, lsl 3]
        ret

So the pattern matcher should probably check that all children it's going to
replace belong to the same instance. But I am not sure how.

Reply via email to