https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80283

            Bug ID: 80283
           Summary: bad SIMD register allocation
           Product: gcc
           Version: 7.0.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: trippels at gcc dot gnu.org
  Target Milestone: ---

Michael S reported the following testcase on
http://www.realworldtech.com/forum/?threadid=166719&curpostid=166719

markus@x4 tmp % cat vec_reg.c
#include <immintrin.h>

/* Computes (a * b) + c on __m256 values.  Despite the name, this expands to a
   separate _mm256_mul_ps followed by _mm256_add_ps, not to a fused
   multiply-add intrinsic. */
#define MM_FMADD(a, b, c) _mm256_add_ps(_mm256_mul_ps((a), (b)), (c))

/*
 * Accumulates a 5 x 2 tile of __m256 sums over kSteps iterations and stores
 * the ten results to C.
 *
 * Each loop iteration is manually unrolled into four sub-steps j = 0..3.
 * Sub-step j:
 *   - loads two vectors b0 = B[0], b1 = B[1] and advances B by 2;
 *   - for each row r = 0..4, broadcasts the scalar A[4 * r + j] to all lanes
 *     and adds a * b0 into acc0<r> and a * b1 into acc1<r>.
 * After the four sub-steps A is advanced by 4 * 5 = 20 floats.
 *
 * Parameters:
 *   A      - scalar inputs; 20 floats are read per iteration
 *            (20 * kSteps in total).
 *   B      - vector inputs; 8 __m256 values are read per iteration
 *            (8 * kSteps in total).
 *   C      - output; C[0]..C[9] are written, in the order
 *            acc00, acc10, acc01, acc11, ..., acc04, acc14.
 *   kSteps - number of loop iterations; if it is <= 0 the loop body never
 *            runs and C[0]..C[9] receive all-zero vectors.
 *
 * NOTE(review): B and C are dereferenced as __m256 objects, so they are
 * presumably expected to be 32-byte aligned -- confirm against callers.
 *
 * Ten accumulators plus b0, b1 and a are live at the same time inside the
 * loop; that is what makes this function a register-allocation test case.
 */
void foo(const float *A, const __m256 *B, __m256 *C, int kSteps) {
  /* accXY: X selects the B vector (0 -> b0, 1 -> b1), Y selects the A row. */
  __m256 acc00 = _mm256_setzero_ps();
  __m256 acc10 = _mm256_setzero_ps();
  __m256 acc01 = _mm256_setzero_ps();
  __m256 acc11 = _mm256_setzero_ps();
  __m256 acc02 = _mm256_setzero_ps();
  __m256 acc12 = _mm256_setzero_ps();
  __m256 acc03 = _mm256_setzero_ps();
  __m256 acc13 = _mm256_setzero_ps();
  __m256 acc04 = _mm256_setzero_ps();
  __m256 acc14 = _mm256_setzero_ps();

  for (int k = 0; k < kSteps; ++k) {
    __m256 a, b0, b1;
    /* Sub-step 0: next pair of B vectors, A elements at offset 0 of each row. */
    b0 = B[0];
    b1 = B[1];
    B += 2;

    a = _mm256_broadcast_ss(&A[4 * 0 + 0]);
    acc00 = MM_FMADD(a, b0, acc00);
    acc10 = MM_FMADD(a, b1, acc10);

    a = _mm256_broadcast_ss(&A[4 * 1 + 0]);
    acc01 = MM_FMADD(a, b0, acc01);
    acc11 = MM_FMADD(a, b1, acc11);

    a = _mm256_broadcast_ss(&A[4 * 2 + 0]);
    acc02 = MM_FMADD(a, b0, acc02);
    acc12 = MM_FMADD(a, b1, acc12);

    a = _mm256_broadcast_ss(&A[4 * 3 + 0]);
    acc03 = MM_FMADD(a, b0, acc03);
    acc13 = MM_FMADD(a, b1, acc13);

    a = _mm256_broadcast_ss(&A[4 * 4 + 0]);
    acc04 = MM_FMADD(a, b0, acc04);
    acc14 = MM_FMADD(a, b1, acc14);

    /* Sub-step 1: next pair of B vectors, A elements at offset 1 of each row. */
    b0 = B[0];
    b1 = B[1];
    B += 2;

    a = _mm256_broadcast_ss(&A[4 * 0 + 1]);
    acc00 = MM_FMADD(a, b0, acc00);
    acc10 = MM_FMADD(a, b1, acc10);

    a = _mm256_broadcast_ss(&A[4 * 1 + 1]);
    acc01 = MM_FMADD(a, b0, acc01);
    acc11 = MM_FMADD(a, b1, acc11);

    a = _mm256_broadcast_ss(&A[4 * 2 + 1]);
    acc02 = MM_FMADD(a, b0, acc02);
    acc12 = MM_FMADD(a, b1, acc12);

    a = _mm256_broadcast_ss(&A[4 * 3 + 1]);
    acc03 = MM_FMADD(a, b0, acc03);
    acc13 = MM_FMADD(a, b1, acc13);

    a = _mm256_broadcast_ss(&A[4 * 4 + 1]);
    acc04 = MM_FMADD(a, b0, acc04);
    acc14 = MM_FMADD(a, b1, acc14);

    /* Sub-step 2: next pair of B vectors, A elements at offset 2 of each row. */
    b0 = B[0];
    b1 = B[1];
    B += 2;

    a = _mm256_broadcast_ss(&A[4 * 0 + 2]);
    acc00 = MM_FMADD(a, b0, acc00);
    acc10 = MM_FMADD(a, b1, acc10);

    a = _mm256_broadcast_ss(&A[4 * 1 + 2]);
    acc01 = MM_FMADD(a, b0, acc01);
    acc11 = MM_FMADD(a, b1, acc11);

    a = _mm256_broadcast_ss(&A[4 * 2 + 2]);
    acc02 = MM_FMADD(a, b0, acc02);
    acc12 = MM_FMADD(a, b1, acc12);

    a = _mm256_broadcast_ss(&A[4 * 3 + 2]);
    acc03 = MM_FMADD(a, b0, acc03);
    acc13 = MM_FMADD(a, b1, acc13);

    a = _mm256_broadcast_ss(&A[4 * 4 + 2]);
    acc04 = MM_FMADD(a, b0, acc04);
    acc14 = MM_FMADD(a, b1, acc14);

    /* Sub-step 3: next pair of B vectors, A elements at offset 3 of each row. */
    b0 = B[0];
    b1 = B[1];
    B += 2;

    a = _mm256_broadcast_ss(&A[4 * 0 + 3]);
    acc00 = MM_FMADD(a, b0, acc00);
    acc10 = MM_FMADD(a, b1, acc10);

    a = _mm256_broadcast_ss(&A[4 * 1 + 3]);
    acc01 = MM_FMADD(a, b0, acc01);
    acc11 = MM_FMADD(a, b1, acc11);

    a = _mm256_broadcast_ss(&A[4 * 2 + 3]);
    acc02 = MM_FMADD(a, b0, acc02);
    acc12 = MM_FMADD(a, b1, acc12);

    a = _mm256_broadcast_ss(&A[4 * 3 + 3]);
    acc03 = MM_FMADD(a, b0, acc03);
    acc13 = MM_FMADD(a, b1, acc13);

    a = _mm256_broadcast_ss(&A[4 * 4 + 3]);
    acc04 = MM_FMADD(a, b0, acc04);
    acc14 = MM_FMADD(a, b1, acc14);

    /* Move past the 5 rows x 4 floats consumed by this iteration. */
    A += 4 * 5;
  }

  /* Store the tile with the b0/b1 results of each row interleaved. */
  C[0] = acc00;
  C[1] = acc10;
  C[2] = acc01;
  C[3] = acc11;
  C[4] = acc02;
  C[5] = acc12;
  C[6] = acc03;
  C[7] = acc13;
  C[8] = acc04;
  C[9] = acc14;
}

With -O3 -mavx, the code GCC generates for this loop looks bad.
Clang generates a much nicer, tight loop without extra spills.

The issue looks related to PR79826.

Reply via email to