https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80283
Bug ID: 80283
Summary: bad SIMD register allocation
Product: gcc
Version: 7.0.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: trippels at gcc dot gnu.org
Target Milestone: ---
Michael S reported the following testcase on
http://www.realworldtech.com/forum/?threadid=166719&curpostid=166719
markus@x4 tmp % cat vec_reg.c
#include <immintrin.h>
/* Multiply-add on 8-float AVX vectors: evaluates (a * b) + c using a separate
   _mm256_mul_ps followed by _mm256_add_ps.  Despite the name, no fused
   multiply-add intrinsic is used here. */
#define MM_FMADD(a, b, c) _mm256_add_ps(_mm256_mul_ps((a), (b)), (c))
/*
 * Reproducer kernel: accumulates products of broadcast scalars from A with
 * vectors from B into ten __m256 accumulators and stores them to C.
 *
 * Each loop iteration is hand-unrolled into four sub-steps j = 0..3.  A
 * sub-step loads two vectors (b0, b1) from B and, for i = 0..4, broadcasts
 * the scalar A[4 * i + j] and updates
 *     acc0i += broadcast * b0;   acc1i += broadcast * b1;
 *
 * A      - float input; one iteration reads A[0..19] and then advances A by
 *          20, so 20 * kSteps floats are read in total.
 * B      - vector input; one iteration reads 8 vectors (2 per sub-step), so
 *          8 * kSteps vectors are read in total.
 * C      - output; C[0..9] are always written.
 * kSteps - number of loop iterations; if it is <= 0 the loop body never runs
 *          and C[0..9] receive the zero-initialised accumulators.
 *
 * Inside the loop the ten accumulators plus a, b0 and b1 are all live at
 * once; presumably this is what stresses SIMD register allocation, the
 * subject of this report.
 */
void foo(const float *A, const __m256 *B, __m256 *C, int kSteps) {
/* Accumulators accXY: X selects the B vector (0 -> b0, 1 -> b1) and Y
   selects the A row index i (0..4) used in A[4 * i + j]. */
__m256 acc00 = _mm256_setzero_ps();
__m256 acc10 = _mm256_setzero_ps();
__m256 acc01 = _mm256_setzero_ps();
__m256 acc11 = _mm256_setzero_ps();
__m256 acc02 = _mm256_setzero_ps();
__m256 acc12 = _mm256_setzero_ps();
__m256 acc03 = _mm256_setzero_ps();
__m256 acc13 = _mm256_setzero_ps();
__m256 acc04 = _mm256_setzero_ps();
__m256 acc14 = _mm256_setzero_ps();
for (int k = 0; k < kSteps; ++k) {
__m256 a, b0, b1;
/* Sub-step j = 0: next two B vectors, scalars A[4 * i + 0]. */
b0 = B[0];
b1 = B[1];
B += 2;
a = _mm256_broadcast_ss(&A[4 * 0 + 0]);
acc00 = MM_FMADD(a, b0, acc00);
acc10 = MM_FMADD(a, b1, acc10);
a = _mm256_broadcast_ss(&A[4 * 1 + 0]);
acc01 = MM_FMADD(a, b0, acc01);
acc11 = MM_FMADD(a, b1, acc11);
a = _mm256_broadcast_ss(&A[4 * 2 + 0]);
acc02 = MM_FMADD(a, b0, acc02);
acc12 = MM_FMADD(a, b1, acc12);
a = _mm256_broadcast_ss(&A[4 * 3 + 0]);
acc03 = MM_FMADD(a, b0, acc03);
acc13 = MM_FMADD(a, b1, acc13);
a = _mm256_broadcast_ss(&A[4 * 4 + 0]);
acc04 = MM_FMADD(a, b0, acc04);
acc14 = MM_FMADD(a, b1, acc14);
/* Sub-step j = 1: next two B vectors, scalars A[4 * i + 1]. */
b0 = B[0];
b1 = B[1];
B += 2;
a = _mm256_broadcast_ss(&A[4 * 0 + 1]);
acc00 = MM_FMADD(a, b0, acc00);
acc10 = MM_FMADD(a, b1, acc10);
a = _mm256_broadcast_ss(&A[4 * 1 + 1]);
acc01 = MM_FMADD(a, b0, acc01);
acc11 = MM_FMADD(a, b1, acc11);
a = _mm256_broadcast_ss(&A[4 * 2 + 1]);
acc02 = MM_FMADD(a, b0, acc02);
acc12 = MM_FMADD(a, b1, acc12);
a = _mm256_broadcast_ss(&A[4 * 3 + 1]);
acc03 = MM_FMADD(a, b0, acc03);
acc13 = MM_FMADD(a, b1, acc13);
a = _mm256_broadcast_ss(&A[4 * 4 + 1]);
acc04 = MM_FMADD(a, b0, acc04);
acc14 = MM_FMADD(a, b1, acc14);
/* Sub-step j = 2: next two B vectors, scalars A[4 * i + 2]. */
b0 = B[0];
b1 = B[1];
B += 2;
a = _mm256_broadcast_ss(&A[4 * 0 + 2]);
acc00 = MM_FMADD(a, b0, acc00);
acc10 = MM_FMADD(a, b1, acc10);
a = _mm256_broadcast_ss(&A[4 * 1 + 2]);
acc01 = MM_FMADD(a, b0, acc01);
acc11 = MM_FMADD(a, b1, acc11);
a = _mm256_broadcast_ss(&A[4 * 2 + 2]);
acc02 = MM_FMADD(a, b0, acc02);
acc12 = MM_FMADD(a, b1, acc12);
a = _mm256_broadcast_ss(&A[4 * 3 + 2]);
acc03 = MM_FMADD(a, b0, acc03);
acc13 = MM_FMADD(a, b1, acc13);
a = _mm256_broadcast_ss(&A[4 * 4 + 2]);
acc04 = MM_FMADD(a, b0, acc04);
acc14 = MM_FMADD(a, b1, acc14);
/* Sub-step j = 3: next two B vectors, scalars A[4 * i + 3]. */
b0 = B[0];
b1 = B[1];
B += 2;
a = _mm256_broadcast_ss(&A[4 * 0 + 3]);
acc00 = MM_FMADD(a, b0, acc00);
acc10 = MM_FMADD(a, b1, acc10);
a = _mm256_broadcast_ss(&A[4 * 1 + 3]);
acc01 = MM_FMADD(a, b0, acc01);
acc11 = MM_FMADD(a, b1, acc11);
a = _mm256_broadcast_ss(&A[4 * 2 + 3]);
acc02 = MM_FMADD(a, b0, acc02);
acc12 = MM_FMADD(a, b1, acc12);
a = _mm256_broadcast_ss(&A[4 * 3 + 3]);
acc03 = MM_FMADD(a, b0, acc03);
acc13 = MM_FMADD(a, b1, acc13);
a = _mm256_broadcast_ss(&A[4 * 4 + 3]);
acc04 = MM_FMADD(a, b0, acc04);
acc14 = MM_FMADD(a, b1, acc14);
/* Advance A past the 20 floats (indices 0..19) consumed by this iteration. */
A += 4 * 5;
}
/* Write the results interleaved: for each i, the b0 accumulator (acc0i) goes
   to C[2 * i] and the b1 accumulator (acc1i) to C[2 * i + 1]. */
C[0] = acc00;
C[1] = acc10;
C[2] = acc01;
C[3] = acc11;
C[4] = acc02;
C[5] = acc12;
C[6] = acc03;
C[7] = acc13;
C[8] = acc04;
C[9] = acc14;
}
With -O3 -mavx, the code GCC generates for this loop looks bad: the SIMD register allocation is poor.
Clang generates a much nicer, tight loop without extra spills.
The issue looks related to PR79826.