https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80283
Bug ID: 80283 Summary: bad SIMD register allocation Product: gcc Version: 7.0.1 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: trippels at gcc dot gnu.org Target Milestone: --- Michael S reported the following testcase on http://www.realworldtech.com/forum/?threadid=166719&curpostid=166719 markus@x4 tmp % cat vec_reg.c
#include <immintrin.h>

/*
 * Computes a * b + c on __m256 vectors.  The expansion is a separate
 * _mm256_mul_ps followed by _mm256_add_ps; no fused multiply-add
 * intrinsic is used.
 */
#define MM_FMADD(a, b, c) _mm256_add_ps(_mm256_mul_ps((a), (b)), (c))

/*
 * Reproducer for the register-allocation problem described in this report.
 *
 * Accumulates products of broadcast scalars from A with vectors from B into
 * ten __m256 accumulators and stores them to C[0..9].
 *
 * A       Scalar inputs.  Each loop iteration reads A[0..19] (indices
 *         4 * r + j for r = 0..4, j = 0..3) and then advances A by 20.
 * B       Vector inputs.  Each loop iteration reads eight consecutive
 *         __m256 values, two per sub-step, advancing B by 2 each time.
 * C       Output; C[0]..C[9] are written once, after the loop.
 * kSteps  Number of loop iterations.  If it is <= 0 the loop body never
 *         runs and C[0..9] receive the zero-initialised accumulators.
 *
 * Accumulator naming as used below: accXY is updated from bX (X = 0 or 1)
 * and from the scalar A[4 * Y + j] (Y = 0..4).
 *
 * Inside the loop 13 __m256 values are live at once: the ten accumulators
 * plus a, b0 and b1.
 */
void foo(const float *A, const __m256 *B, __m256 *C, int kSteps) {
  __m256 acc00 = _mm256_setzero_ps();
  __m256 acc10 = _mm256_setzero_ps();
  __m256 acc01 = _mm256_setzero_ps();
  __m256 acc11 = _mm256_setzero_ps();
  __m256 acc02 = _mm256_setzero_ps();
  __m256 acc12 = _mm256_setzero_ps();
  __m256 acc03 = _mm256_setzero_ps();
  __m256 acc13 = _mm256_setzero_ps();
  __m256 acc04 = _mm256_setzero_ps();
  __m256 acc14 = _mm256_setzero_ps();
  for (int k = 0; k < kSteps; ++k) {
    __m256 a, b0, b1;

    /* Sub-step j = 0: next two B vectors, scalars A[4 * r + 0]. */
    b0 = B[0];
    b1 = B[1];
    B += 2;
    a = _mm256_broadcast_ss(&A[4 * 0 + 0]);
    acc00 = MM_FMADD(a, b0, acc00);
    acc10 = MM_FMADD(a, b1, acc10);
    a = _mm256_broadcast_ss(&A[4 * 1 + 0]);
    acc01 = MM_FMADD(a, b0, acc01);
    acc11 = MM_FMADD(a, b1, acc11);
    a = _mm256_broadcast_ss(&A[4 * 2 + 0]);
    acc02 = MM_FMADD(a, b0, acc02);
    acc12 = MM_FMADD(a, b1, acc12);
    a = _mm256_broadcast_ss(&A[4 * 3 + 0]);
    acc03 = MM_FMADD(a, b0, acc03);
    acc13 = MM_FMADD(a, b1, acc13);
    a = _mm256_broadcast_ss(&A[4 * 4 + 0]);
    acc04 = MM_FMADD(a, b0, acc04);
    acc14 = MM_FMADD(a, b1, acc14);

    /* Sub-step j = 1: next two B vectors, scalars A[4 * r + 1]. */
    b0 = B[0];
    b1 = B[1];
    B += 2;
    a = _mm256_broadcast_ss(&A[4 * 0 + 1]);
    acc00 = MM_FMADD(a, b0, acc00);
    acc10 = MM_FMADD(a, b1, acc10);
    a = _mm256_broadcast_ss(&A[4 * 1 + 1]);
    acc01 = MM_FMADD(a, b0, acc01);
    acc11 = MM_FMADD(a, b1, acc11);
    a = _mm256_broadcast_ss(&A[4 * 2 + 1]);
    acc02 = MM_FMADD(a, b0, acc02);
    acc12 = MM_FMADD(a, b1, acc12);
    a = _mm256_broadcast_ss(&A[4 * 3 + 1]);
    acc03 = MM_FMADD(a, b0, acc03);
    acc13 = MM_FMADD(a, b1, acc13);
    a = _mm256_broadcast_ss(&A[4 * 4 + 1]);
    acc04 = MM_FMADD(a, b0, acc04);
    acc14 = MM_FMADD(a, b1, acc14);

    /* Sub-step j = 2: next two B vectors, scalars A[4 * r + 2]. */
    b0 = B[0];
    b1 = B[1];
    B += 2;
    a = _mm256_broadcast_ss(&A[4 * 0 + 2]);
    acc00 = MM_FMADD(a, b0, acc00);
    acc10 = MM_FMADD(a, b1, acc10);
    a = _mm256_broadcast_ss(&A[4 * 1 + 2]);
    acc01 = MM_FMADD(a, b0, acc01);
    acc11 = MM_FMADD(a, b1, acc11);
    a = _mm256_broadcast_ss(&A[4 * 2 + 2]);
    acc02 = MM_FMADD(a, b0, acc02);
    acc12 = MM_FMADD(a, b1, acc12);
    a = _mm256_broadcast_ss(&A[4 * 3 + 2]);
    acc03 = MM_FMADD(a, b0, acc03);
    acc13 = MM_FMADD(a, b1, acc13);
    a = _mm256_broadcast_ss(&A[4 * 4 + 2]);
    acc04 = MM_FMADD(a, b0, acc04);
    acc14 = MM_FMADD(a, b1, acc14);

    /* Sub-step j = 3: next two B vectors, scalars A[4 * r + 3]. */
    b0 = B[0];
    b1 = B[1];
    B += 2;
    a = _mm256_broadcast_ss(&A[4 * 0 + 3]);
    acc00 = MM_FMADD(a, b0, acc00);
    acc10 = MM_FMADD(a, b1, acc10);
    a = _mm256_broadcast_ss(&A[4 * 1 + 3]);
    acc01 = MM_FMADD(a, b0, acc01);
    acc11 = MM_FMADD(a, b1, acc11);
    a = _mm256_broadcast_ss(&A[4 * 2 + 3]);
    acc02 = MM_FMADD(a, b0, acc02);
    acc12 = MM_FMADD(a, b1, acc12);
    a = _mm256_broadcast_ss(&A[4 * 3 + 3]);
    acc03 = MM_FMADD(a, b0, acc03);
    acc13 = MM_FMADD(a, b1, acc13);
    a = _mm256_broadcast_ss(&A[4 * 4 + 3]);
    acc04 = MM_FMADD(a, b0, acc04);
    acc14 = MM_FMADD(a, b1, acc14);

    /* Step past the 20 floats consumed by the four sub-steps above. */
    A += 4 * 5;
  }

  /* Stores interleave the b0- and b1-derived accumulators: acc0Y, acc1Y. */
  C[0] = acc00;
  C[1] = acc10;
  C[2] = acc01;
  C[3] = acc11;
  C[4] = acc02;
  C[5] = acc12;
  C[6] = acc03;
  C[7] = acc13;
  C[8] = acc04;
  C[9] = acc14;
}
with -O3 -mavx the resulting code looks bad. clang generated a much nicer tight loop without extra spills. The issue looks related to PR79826.