https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113025
--- Comment #6 from Xi Ruoyao <xry111 at gcc dot gnu.org> ---
Works for me:
#include <xmmintrin.h>
#include <stdint.h>
/* Load 16 bytes from ptr as an __m128i: use the aligned load when the
   address is a multiple of 16, and the unaligned load otherwise.
   ptr appears more than once in the expansion, so it must not have
   side effects.  The whole expansion is parenthesized so the conditional
   cannot bind to neighbouring operators at the use site. */
#define LOAD_SI128(ptr) \
  ( ( ((uintptr_t)(ptr) & 15) == 0 ) \
      ? _mm_load_si128((__m128i*)(ptr)) \
      : _mm_loadu_si128((__m128i*)(ptr)) )
extern char x[16];
__m128i y;
/* Reproducer entry point: read the 16 bytes of x through LOAD_SI128 and
   store the resulting vector in the global y. */
void
test ()
{
  __m128i loaded = LOAD_SI128 (&x);

  y = loaded;
}
This is compiled to:
# GCC output for test() when x is only declared extern: the load from x
# uses the unaligned instruction form (movdqu).
test:
.LFB532:
.cfi_startproc
# Unaligned 16-byte load of x (RIP-relative) into %xmm0.
movdqu x(%rip), %xmm0
# Aligned 16-byte store of %xmm0 into y.
movaps %xmm0, y(%rip)
ret
.cfi_endproc
Note that if x is not extern, GCC will generate:
# GCC output for test() when x is not extern: the load from x uses the
# aligned instruction form (movdqa), which requires x to be 16-byte aligned.
test:
.LFB532:
.cfi_startproc
# Aligned 16-byte load of x (RIP-relative) into %xmm0.
movdqa x(%rip), %xmm0
# Aligned 16-byte store of %xmm0 into y.
movaps %xmm0, y(%rip)
ret
.cfi_endproc
but it's legal because GCC places x at a 16-byte boundary:
# Definition emitted for x: a 16-byte object, zero-filled, placed on a
# 16-byte boundary.
.align 16
.type x, @object
.size x, 16
x:
# 16 bytes of zeros make up the object's contents.
.zero 16