https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71361
Bug ID: 71361
Summary: [7 Regression] Changes in ivopts caused perf regression on x86
Product: gcc
Version: 7.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: iverbin at gcc dot gnu.org
CC: amker.cheng at gmail dot com, izamyatin at gmail dot com, kyukhin at gcc dot gnu.org
Target Milestone: ---

r235805 leads to a performance regression on x86.

Reduced testcase:

int arr_1[512];
int arr_2[512];

int main ()
{
  int c1[512];
  int c2[512];
  int res[512];

  for (int i = 0; i < 512; i++)
    arr_1[i] = arr_2[i] = c1[i] = c2[i] = i;

  for (int l = 0; l < 1000000; l++)
    for (int k = 1; k <= 9; k++)
      {
        int n1 = 1 << k;
        int n2 = n1 >> 1;
        for (int j = 0; j < n2; j++)
          for (int i = j; i < 512; i += n1)
            {
              int idx = i + n2;
              int x1 = arr_1[idx] * c1[j] + arr_2[idx] * c2[j];
              int x2 = arr_2[idx] * c1[j] + arr_1[idx] * c2[j];
              arr_1[i] = x1;
              arr_2[i] = x2;
              arr_1[idx] = x1;
              arr_2[idx] = x2;
            }
      }

  return 0;
}

Compilation options: -Ofast -m32 -fPIE
GCC is configured with --with-arch=corei7 --with-cpu=corei7 --with-fpmath=sse

Run time on Sandy Bridge increased by ~20%.
Run time on Atom increased by ~60%.

Below are the dumps of the innermost loop after the ivopts pass.
Before the regression there are 2 induction variables created by ivopts (ivtmp.19 and ivtmp.20), which are used as bases for all 6 memory accesses:

  # i_66 = PHI <i_37(7), j_65(11)>
  # ivtmp.19_63 = PHI <ivtmp.19_95(7), ivtmp.19_76(11)>
  # ivtmp.20_17 = PHI <ivtmp.20_15(7), ivtmp.20_73(11)>
  _59 = (void *) ivtmp.19_63;
  _58 = (sizetype) n2_20;
  _22 = MEM[base: _59, index: _58, step: 4, offset: 0B];
  _24 = _22 * pretmp_105;
  _55 = (void *) ivtmp.20_17;
  _54 = (sizetype) n2_20;
  _25 = MEM[base: _55, index: _54, step: 4, offset: 0B];
  _27 = _25 * pretmp_107;
  x1_28 = _24 + _27;
  _30 = _25 * pretmp_105;
  _31 = _22 * pretmp_107;
  x2_32 = _30 + _31;
  _51 = (void *) ivtmp.19_63;
  MEM[base: _51, offset: 0B] = x1_28;
  _50 = (void *) ivtmp.20_17;
  MEM[base: _50, offset: 0B] = x2_32;
  _57 = (void *) ivtmp.19_63;
  _56 = (sizetype) n2_20;
  MEM[base: _57, index: _56, step: 4, offset: 0B] = x1_28;
  _53 = (void *) ivtmp.20_17;
  _52 = (sizetype) n2_20;
  MEM[base: _53, index: _52, step: 4, offset: 0B] = x2_32;
  i_37 = n1_19 + i_66;
  ivtmp.19_95 = ivtmp.19_63 + _77;
  ivtmp.20_15 = ivtmp.20_17 + _12;
  if (i_37 <= 511)
    goto <bb 7>;
  else
    goto <bb 9>;

After the regression there is only one induction variable created by ivopts (ivtmp.22), which is used as the index for 4 memory accesses. The other 2 stores compute their index from i_66 (a cast and a multiplication by 4) inside the loop:

  # i_66 = PHI <i_37(7), j_65(11)>
  # ivtmp.22_63 = PHI <ivtmp.22_95(7), ivtmp.22_76(11)>
  _22 = MEM[symbol: arr_1, index: ivtmp.22_63, offset: 0B];
  _24 = _22 * pretmp_105;
  _25 = MEM[symbol: arr_2, index: ivtmp.22_63, offset: 0B];
  _27 = _25 * pretmp_107;
  x1_28 = _24 + _27;
  _30 = _25 * pretmp_105;
  _31 = _22 * pretmp_107;
  x2_32 = _30 + _31;
  _17 = (sizetype) i_66;
  _15 = _17 * 4;
  MEM[symbol: arr_1, index: _15, offset: 0B] = x1_28;
  _14 = (sizetype) i_66;
  _12 = _14 * 4;
  MEM[symbol: arr_2, index: _12, offset: 0B] = x2_32;
  MEM[symbol: arr_1, index: ivtmp.22_63, offset: 0B] = x1_28;
  MEM[symbol: arr_2, index: ivtmp.22_63, offset: 0B] = x2_32;
  i_37 = n1_19 + i_66;
  ivtmp.22_95 = ivtmp.22_63 + _77;
  if (i_37 <= 511)
    goto <bb 7>;
  else
    goto <bb 9>;

As a result, the final assembly contains 13% more instructions (26 vs. 23 in the innermost loop shown below).
Before the regression:

.L5:
        movl    (%edi,%ebx,4), %eax
        movd    %xmm1, %edx
        movd    %xmm0, %ecx
        imull   (%esi,%ebx,4), %ecx
        imull   %eax, %edx
        addl    %ecx, %edx
        movd    %xmm0, %ecx
        imull   %ecx, %eax
        movd    %xmm1, %ecx
        imull   (%esi,%ebx,4), %ecx
        movl    %edx, (%esi)
        movl    %edx, (%esi,%ebx,4)
        movd    %xmm5, %edx
        addl    %edx, %esi
        addl    %ecx, %eax
        movl    %eax, (%edi)
        movl    %eax, (%edi,%ebx,4)
        movd    %xmm4, %eax
        addl    %edx, %edi
        addl    %eax, -4124(%ebp)
        movl    -4124(%ebp), %ecx
        cmpl    $511, %ecx
        jle     .L5

After the regression:

.L5:
        movd    %xmm5, %edi
        movd    %xmm3, %edx
        movd    %xmm1, %ebx
        imull   (%eax,%edx), %ebx
        movd    %xmm4, %ecx
        movd    %xmm4, %edx
        imull   (%eax,%edi), %ecx
        addl    %ecx, %ebx
        movd    %xmm1, %ecx
        imull   (%eax,%edi), %ecx
        movd    %ecx, %xmm0
        movd    %xmm3, %ecx
        imull   (%eax,%ecx), %edx
        movd    %xmm0, %ecx
        addl    %edx, %ecx
        movd    %xmm3, %edx
        movl    %ebx, (%edx,%esi,4)
        movd    %xmm3, %edx
        movl    %ecx, (%edi,%esi,4)
        addl    -4124(%ebp), %esi
        movd    %xmm5, %edi
        movl    %ebx, (%eax,%edx)
        movl    %ecx, (%eax,%edi)
        addl    -4128(%ebp), %eax
        cmpl    $511, %esi
        jle     .L5