https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71361

            Bug ID: 71361
           Summary: [7 Regression] Changes in ivopts caused perf
                    regression on x86
           Product: gcc
           Version: 7.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: iverbin at gcc dot gnu.org
                CC: amker.cheng at gmail dot com, izamyatin at gmail dot com,
                    kyukhin at gcc dot gnu.org
  Target Milestone: ---

r235805 leads to a performance regression on x86.

Reduced testcase:

/* Global arrays that the innermost loop both reads and writes.  */
int arr_1[512];
int arr_2[512];

/* Reduced test case: repeatedly walks arr_1/arr_2 with a power-of-two
   stride, reading the elements at i + n2 and storing the combined
   results at both i and i + n2.  Always returns 0.  */
int main ()
{
    /* Per-j coefficient tables; only indices below n2 (at most 256) are
       read by the loops below.  */
    int c1[512];
    int c2[512];
    /* Declared but never referenced in this function.  */
    int res[512];

    /* Give every array element a defined starting value (its own index).  */
    for (int i = 0; i < 512; i++)
      arr_1[i] = arr_2[i] = c1[i] = c2[i] = i;

    /* Outer repetition loop: reruns the same work 1000000 times.  */
    for (int l = 0; l < 1000000; l++)
      for (int k = 1; k <= 9; k++)
        {
          /* n1 is the stride of the innermost loop (2, 4, ..., 512);
             n2 is half of it (1, 2, ..., 256).  */
          int n1 = 1 << k;
          int n2 = n1 >> 1;

          for (int j = 0; j < n2; j++)
            /* Innermost loop (the one shown in the dumps): i starts at j
               and advances by n1; c1[j] and c2[j] do not change here.  */
            for (int i = j; i < 512; i += n1)
              {
                /* idx stays below 512: 512 is a multiple of n1 and
                   j < n2, so i + n2 never crosses the array end.  */
                int idx = i + n2;
                int x1 = arr_1[idx] * c1[j] + arr_2[idx] * c2[j];
                int x2 = arr_2[idx] * c1[j] + arr_1[idx] * c2[j];

                /* Four stores per iteration: the same value goes to both
                   element i and element idx of each array.  */
                arr_1[i] = x1;
                arr_2[i] = x2;
                arr_1[idx] = x1;
                arr_2[idx] = x2;
              }
        }

    return 0;
}

Compilation options: -Ofast -m32 -fPIE
GCC is configured --with-arch=corei7 --with-cpu=corei7 --with-fpmath=sse
Run time on Sandy Bridge increased by ~20%
Run time on Atom increased by ~60%

Below are the dumps of the innermost loop after ivopts pass.

Before the regression there are 2 induction variables, which are used as bases for
all 6 memory accesses:

  # i_66 = PHI <i_37(7), j_65(11)>
  # ivtmp.19_63 = PHI <ivtmp.19_95(7), ivtmp.19_76(11)>
  # ivtmp.20_17 = PHI <ivtmp.20_15(7), ivtmp.20_73(11)>
  _59 = (void *) ivtmp.19_63;
  _58 = (sizetype) n2_20;
  _22 = MEM[base: _59, index: _58, step: 4, offset: 0B];
  _24 = _22 * pretmp_105;
  _55 = (void *) ivtmp.20_17;
  _54 = (sizetype) n2_20;
  _25 = MEM[base: _55, index: _54, step: 4, offset: 0B];
  _27 = _25 * pretmp_107;
  x1_28 = _24 + _27;
  _30 = _25 * pretmp_105;
  _31 = _22 * pretmp_107;
  x2_32 = _30 + _31;
  _51 = (void *) ivtmp.19_63;
  MEM[base: _51, offset: 0B] = x1_28;
  _50 = (void *) ivtmp.20_17;
  MEM[base: _50, offset: 0B] = x2_32;
  _57 = (void *) ivtmp.19_63;
  _56 = (sizetype) n2_20;
  MEM[base: _57, index: _56, step: 4, offset: 0B] = x1_28;
  _53 = (void *) ivtmp.20_17;
  _52 = (sizetype) n2_20;
  MEM[base: _53, index: _52, step: 4, offset: 0B] = x2_32;
  i_37 = n1_19 + i_66;
  ivtmp.19_95 = ivtmp.19_63 + _77;
  ivtmp.20_15 = ivtmp.20_17 + _12;
  if (i_37 <= 511)
    goto <bb 7>;
  else
    goto <bb 9>;

After the regression there is only one induction variable, which is used as the index
for 4 memory accesses:

  # i_66 = PHI <i_37(7), j_65(11)>
  # ivtmp.22_63 = PHI <ivtmp.22_95(7), ivtmp.22_76(11)>
  _22 = MEM[symbol: arr_1, index: ivtmp.22_63, offset: 0B];
  _24 = _22 * pretmp_105;
  _25 = MEM[symbol: arr_2, index: ivtmp.22_63, offset: 0B];
  _27 = _25 * pretmp_107;
  x1_28 = _24 + _27;
  _30 = _25 * pretmp_105;
  _31 = _22 * pretmp_107;
  x2_32 = _30 + _31;
  _17 = (sizetype) i_66;
  _15 = _17 * 4;
  MEM[symbol: arr_1, index: _15, offset: 0B] = x1_28;
  _14 = (sizetype) i_66;
  _12 = _14 * 4;
  MEM[symbol: arr_2, index: _12, offset: 0B] = x2_32;
  MEM[symbol: arr_1, index: ivtmp.22_63, offset: 0B] = x1_28;
  MEM[symbol: arr_2, index: ivtmp.22_63, offset: 0B] = x2_32;
  i_37 = n1_19 + i_66;
  ivtmp.22_95 = ivtmp.22_63 + _77;
  if (i_37 <= 511)
    goto <bb 7>;
  else
    goto <bb 9>;

As a result, the final assembly contains 13% more instructions.

Before regression:

  .L5:
        movl    (%edi,%ebx,4), %eax
        movd    %xmm1, %edx
        movd    %xmm0, %ecx
        imull   (%esi,%ebx,4), %ecx
        imull   %eax, %edx
        addl    %ecx, %edx
        movd    %xmm0, %ecx
        imull   %ecx, %eax
        movd    %xmm1, %ecx
        imull   (%esi,%ebx,4), %ecx
        movl    %edx, (%esi)
        movl    %edx, (%esi,%ebx,4)
        movd    %xmm5, %edx
        addl    %edx, %esi
        addl    %ecx, %eax
        movl    %eax, (%edi)
        movl    %eax, (%edi,%ebx,4)
        movd    %xmm4, %eax
        addl    %edx, %edi
        addl    %eax, -4124(%ebp)
        movl    -4124(%ebp), %ecx
        cmpl    $511, %ecx
        jle     .L5

After regression:

  .L5:
        movd    %xmm5, %edi
        movd    %xmm3, %edx
        movd    %xmm1, %ebx
        imull   (%eax,%edx), %ebx
        movd    %xmm4, %ecx
        movd    %xmm4, %edx
        imull   (%eax,%edi), %ecx
        addl    %ecx, %ebx
        movd    %xmm1, %ecx
        imull   (%eax,%edi), %ecx
        movd    %ecx, %xmm0
        movd    %xmm3, %ecx
        imull   (%eax,%ecx), %edx
        movd    %xmm0, %ecx
        addl    %edx, %ecx
        movd    %xmm3, %edx
        movl    %ebx, (%edx,%esi,4)
        movd    %xmm3, %edx
        movl    %ecx, (%edi,%esi,4)
        addl    -4124(%ebp), %esi
        movd    %xmm5, %edi
        movl    %ebx, (%eax,%edx)
        movl    %ecx, (%eax,%edi)
        addl    -4128(%ebp), %eax
        cmpl    $511, %esi
        jle     .L5

Reply via email to