Issue 76727
Summary AVX512 constants not cached in a register
Labels new issue
Assignees
Reporter piotr-topnotch
    When the same AVX512 constant is used several times in a function, it is usually not cached in a register; instead, the compiler emits a memory reference at each use. This wastes DCache bandwidth and produces longer code than a variant that keeps the constant in a register. Consider this function (compile with `-march=tigerlake -O2` for x64 targets, e.g. on Godbolt).

```
#include <immintrin.h>
// 128-bit increment that fn_vaes adds to the round key between AES rounds:
// high 64 bits = 0xbb67ae8584caa73b, low 64 bits = 0x9e3779b97f4a7c15.
static const __m128i ars_weyl_increment = _mm_set_epi64x(0xbb67ae8584caa73b, 0x9e3779b97f4a7c15);

// Reproducer: runs five chained AES encryption rounds (_mm512_aesenc_epi128)
// over `counter`. The round key starts as `key` broadcast to all four 128-bit
// lanes and is advanced by `ars_increment` (64-bit element-wise add) before
// each of the four later rounds. Returns the output of the last round.
__m512i fn_vaes(__m512i counter, __m128i key) {

    // Alternative definition that reads the constant through a volatile
    // pointer; the report uses it (in place of the definition below) to make
    // the compiler keep the increment in a register.
   // const __m512i ars_increment = _mm512_broadcast_i32x4(*(const volatile __m128i*) &ars_weyl_increment);
    // Broadcast the 128-bit increment to all four 128-bit lanes.
    const __m512i ars_increment = _mm512_broadcast_i32x4(ars_weyl_increment);

    // Round 0 uses the caller's key, replicated across all lanes.
    const __m512i key0 = _mm512_broadcast_i32x4(key);
    const __m512i r0 = _mm512_aesenc_epi128(counter, key0);

    // Rounds 1-4: each round key is the previous one plus the same increment,
    // so `ars_increment` is used four times in this function.
    const __m512i key1 = _mm512_add_epi64(key0, ars_increment); 
    const __m512i r1 = _mm512_aesenc_epi128(r0, key1);

    const __m512i key2 = _mm512_add_epi64(key1, ars_increment); 
    const __m512i r2 = _mm512_aesenc_epi128(r1, key2);

    const __m512i key3 = _mm512_add_epi64(key2, ars_increment); 
    const __m512i r3 = _mm512_aesenc_epi128(r2, key3);

    const __m512i key4 = _mm512_add_epi64(key3, ars_increment); 
    const __m512i r4 = _mm512_aesenc_epi128(r3, key4);

    return r4;
}
```

Then you'll see

```
        vinserti128     $1, %xmm1, %ymm1, %ymm1
        vinserti64x4    $1, %ymm1, %zmm1, %zmm1
        vaesenc %zmm1, %zmm0, %zmm0
        vpaddq  .LCPI0_0(%rip), %zmm1, %zmm2
        vaesenc %zmm2, %zmm0, %zmm0
        vpaddq  .LCPI0_1(%rip), %zmm1, %zmm2
        vaesenc %zmm2, %zmm0, %zmm0
        vpaddq  .LCPI0_2(%rip), %zmm1, %zmm2
        vaesenc %zmm2, %zmm0, %zmm0
        vpaddq  .LCPI0_3(%rip), %zmm1, %zmm1
        vaesenc %zmm1, %zmm0, %zmm0
        retq
```

The compiler can be forced to emit better code by using the first (commented-out) definition of `const __m512i ars_increment` in place of the second:

```
        vmovdqa ars_weyl_increment(%rip), %xmm2
        vinserti128     $1, %xmm2, %ymm2, %ymm2
        vinserti64x4    $1, %ymm2, %zmm2, %zmm2
        vinserti128     $1, %xmm1, %ymm1, %ymm1
        vinserti64x4    $1, %ymm1, %zmm1, %zmm1
        vaesenc %zmm1, %zmm0, %zmm0
        vpaddq  %zmm1, %zmm2, %zmm1
        vaesenc %zmm1, %zmm0, %zmm0
        vpaddq  %zmm2, %zmm1, %zmm1
        vaesenc %zmm1, %zmm0, %zmm0
        vpaddq  %zmm2, %zmm1, %zmm1
        vaesenc %zmm1, %zmm0, %zmm0
        vpaddq  %zmm2, %zmm1, %zmm1
        vaesenc %zmm1, %zmm0, %zmm0
        retq

```

But then the compiler builds `ars_increment` in an unnecessarily complicated way, with a sequence of inserts (`vinserti128` followed by `vinserti64x4`) instead of a single `vbroadcasti32x4`.
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to