https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114908
--- Comment #5 from Matthias Kretz (Vir) <mkretz at gcc dot gnu.org> --- https://godbolt.org/z/P6cfbjT9f
/*
 * Test case posted with the comment above.
 * NOTE(review): presumably a reduced reproducer for the linked report, so the
 * exact source form matters -- keep the code itself unchanged.
 */
#include <stdint.h>

/* Element type shared by all of the types below. */
typedef uint64_t T;

/* GNU vector type of 32 bytes, i.e. four T elements. */
typedef T V [[gnu::vector_size(32)]];

/* Wrapper around one four-element vector. */
typedef struct simd4 { V data; } simd4;

/* Wrapper around one scalar element. */
typedef struct simd1 { T data; } simd1;

/* Pair returned by split3_1(): a four-wide part `a` and a one-wide part `b`. */
typedef struct tup3_1 { simd4 a; simd1 b; } tup3_1;

/* Returns a simd1 holding the single element read from ptr[0]. */
simd1 load1(const T* ptr)
{
    simd1 ret = {ptr[0]};
    return ret;
}

/*
 * Returns a simd4 whose first three elements are copied from ptr[0..2].
 * The result is zero-initialised first and only 3 * sizeof(T) bytes are
 * overwritten, so the last element stays zero.
 */
simd4 load3(const T* ptr)
{
    simd4 ret = {};
    __builtin_memcpy(&ret, ptr, 3 * sizeof(T));
    return ret;
}

/*
 * Splits x into its first three elements and its fourth element.
 * The by-value copy of x is addressed as an array of T: `a` is built by
 * load3() from elements 0..2 and `b` by load1() from element 3.
 */
tup3_1 split3_1(simd4 x)
{
    const T* ptr = (T*)&x;
    tup3_1 ret = {load3(ptr), load1(ptr + 3)};
    return ret;
}

/*
 * Builds a simd4 from one element of `a` followed by the first three
 * elements of `b`. Only the first 3 * sizeof(T) bytes of `b` are read; its
 * last element is not copied.
 */
simd4 concat1_3(simd1 a, simd4 b)
{
    simd4 ret = {};
    char* ptr = (char*)&ret;
    /* Element 0 of the result comes from a. */
    __builtin_memcpy(ptr, &a, sizeof(T));
    /* Elements 1..3 of the result come from elements 0..2 of b. */
    __builtin_memcpy(ptr + sizeof(T), &b, 3 * sizeof(T));
    return ret;
}

/*
 * Returns { 0, data[0], data[1], data[2] }: data is split 3+1, the one-wide
 * part (carry.b) is not used, and a zero element is placed in front of the
 * three-wide part.
 */
simd4 perm(simd4 data)
{
    tup3_1 carry = split3_1(data);
    simd1 zero = {};
    return concat1_3(zero, carry.a);
}