https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104188

--- Comment #4 from Zbynek Vyskovsky <kvr000 at gmail dot com> ---
Sure, the code:

#include <stdio.h>

#ifndef NO_VECTORIZE
#ifdef __x86_64__
#include <immintrin.h>
#include <x86intrin.h>
#endif
#ifdef __aarch64__
#include <arm_neon.h>
#endif
#endif

/*
 * 4x4 matrix of floats whose storage can be accessed either element by
 * element or, when vector support is compiled in, as SIMD registers.
 *
 * All members overlay the same storage. The vector members exist only when
 * NO_VECTORIZE is not defined and the matching target macro is set.
 */
typedef union Mat44 {
        /* Scalar view, indexed as m[row][column]. */
        float m[4][4];
#ifndef NO_VECTORIZE
#ifdef __x86_64__
        /* One 128-bit vector per matrix row. */
        __m128 row[4];
        /* Two rows per 256-bit vector (not referenced by the code shown here). */
        __m256 rowDuet[2];
        /* All four rows in one 512-bit vector (not referenced by the code shown here). */
        __m512 rowQuad;
#endif
#ifdef __aarch64__
        /* One 128-bit NEON vector per matrix row. */
        float32x4_t row[4];
#endif
#endif
} Mat44;

/*
 * Computes the 4x4 product *out = (*a) * (*b) with AVX-512 intrinsics,
 * producing all four result rows in a single 512-bit register.
 *
 * For every result row i: out[i] = sum over k of a->m[i][k] * (row k of b).
 * Each 128-bit lane of the accumulator holds one result row.
 *
 * Uses the x86-64-only `row` member of Mat44 and is not itself guarded by
 * NO_VECTORIZE / __x86_64__, so it only compiles where that member exists.
 * NOTE(review): presumably also needs AVX-512 enabled on the command line
 * (e.g. -mavx512f) -- confirm against the build flags used in the report.
 *
 * NOTE(review): noipa presumably keeps the call from being inlined or
 * constant-folded into main so the generated code can be inspected -- confirm.
 *
 * The surrounding report attributes the wrong output of this function to a
 * single element being duplicated instead of four.
 */
__attribute__((noipa)) void matmult_avx512(union Mat44 *out, const Mat44 *a,
const Mat44 *b)
{
        /* Unaligned load of 16 floats starting at a->m[0]: the whole matrix a. */
        __m512 a0123 = _mm512_loadu_ps(a->m[0]);
        /* Replicate each 128-bit row of b into all four 128-bit lanes. */
        __m512 b0000 = _mm512_broadcast_f32x4(b->row[0]);
        __m512 b1111 = _mm512_broadcast_f32x4(b->row[1]);
        __m512 b2222 = _mm512_broadcast_f32x4(b->row[2]);
        __m512 b3333 = _mm512_broadcast_f32x4(b->row[3]);

        /*
         * _mm512_permute_ps shuffles within each 128-bit lane; the immediates
         * 0x00, 0x55, 0xaa and 0xff splat element 0, 1, 2 and 3 of the lane,
         * i.e. a->m[i][k] for the row i held in that lane.
         */
        __m512 result = _mm512_mul_ps(_mm512_permute_ps(a0123, 0x00), b0000);
        result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0x55), b1111,
result);
        result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0xaa), b2222,
result);
        result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0xff), b3333,
result);

        /* Unaligned store of all 16 result floats starting at out->m[0]. */
        _mm512_storeu_ps(out->m[0], result);
}

/*
 * Scalar reference implementation of the 4x4 product *out = (*a) * (*b).
 *
 * The product is assembled in a local matrix and copied to *out only at the
 * end, so out may refer to the same object as a or b.
 */
__attribute__((noipa)) void matmult_ref(Mat44 *out, const Mat44 *a,
                                        const Mat44 *b)
{
        Mat44 product;

        for (int row = 0; row < 4; row++) {
                /* Row of the left operand used for every column of this result row. */
                const float *lhs = a->m[row];

                for (int col = 0; col < 4; col++) {
                        /* Same left-to-right summation order as a plain dot product. */
                        product.m[row][col] = lhs[0] * b->m[0][col]
                                            + lhs[1] * b->m[1][col]
                                            + lhs[2] * b->m[2][col]
                                            + lhs[3] * b->m[3][col];
                }
        }

        *out = product;
}

int main(void)
{
        Mat44 in = { m: { { 1, 2, 3, 4 }, { 5, 6, 7, 8 }, { 9, 10, 11, 12 }, {
13, 14, 15, 16 } } };
        Mat44 avx512_out;
        Mat44 ref_out;
        matmult_ref(&ref_out, &in, &in);
        matmult_avx512(&avx512_out, &in, &in);
        for (int r = 0; r < 4; ++r) {
                printf("%5.0f %5.0f %5.0f %5.0f      %5.0f %5.0f %5.0f
%5.0f\n", avx512_out.m[r][0], avx512_out.m[r][1], avx512_out.m[r][2],
avx512_out.m[r][3], ref_out.m[r][0], ref_out.m[r][1], ref_out.m[r][2],
ref_out.m[r][3]);
        }
        return 0;
}


Output (AVX-512 result on the left, reference result on the right; note the
repeated first column on the left side, caused by a single element being
duplicated instead of four):

  90    90    90    90         90   100   110   120
 202   202   202   202        202   228   254   280
 314   314   314   314        314   356   398   440
 426   426   426   426        426   484   542   600

Reply via email to