On Wed, Oct 20, 2021 at 7:31 AM dianhong.xu--- via Gcc-patches
<[email protected]> wrote:
>
> From: dianhong xu <[email protected]>
>
> Add -muse-unaligned-vector-move option to emit unaligned vector move
> instructions.
Why would you ever want to have such an option?! Should the documentation
at least read "emit unaligned vector moves even for aligned storage or when
using aligned move intrinsics"?
Richard.
> gcc/ChangeLog:
>
> * config/i386/i386-options.c (ix86_target_string): Add
> -muse-unaligned-vector-move.
> * config/i386/i386.c (ix86_get_ssemov): Emit unaligned vector moves if
> the new option is used.
> * config/i386/i386.opt (muse-unaligned-vector-move): New.
> * config/i386/sse.md: Emit unaligned vector moves if the new option
> is used.
> * doc/invoke.texi: Document -muse-unaligned-vector-move.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/avx2-vector-unaligned-load-store-1.c: New test.
> * gcc.target/i386/avx2-vector-unaligned-load-store-2.c: New test.
> * gcc.target/i386/avx2-vector-unaligned-load-store-3.c: New test.
> * gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c: New test.
> ---
> gcc/config/i386/i386-options.c | 3 +-
> gcc/config/i386/i386.c | 41 +++----
> gcc/config/i386/i386.opt | 4 +
> gcc/config/i386/sse.md | 30 +++--
> gcc/doc/invoke.texi | 7 ++
> .../i386/avx2-vector-unaligned-load-store-1.c | 102 +++++++++++++++++
> .../i386/avx2-vector-unaligned-load-store-2.c | 107 ++++++++++++++++++
> .../i386/avx2-vector-unaligned-load-store-3.c | 11 ++
> .../avx512vl-vector-unaligned-load-store-1.c | 13 +++
> 9 files changed, 287 insertions(+), 31 deletions(-)
> create mode 100644
> gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> create mode 100644
> gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> create mode 100644
> gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> create mode 100644
> gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
>
> diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
> index c9523b26f49..eacbd0f5451 100644
> --- a/gcc/config/i386/i386-options.c
> +++ b/gcc/config/i386/i386-options.c
> @@ -397,7 +397,8 @@ ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
> { "-mstv", MASK_STV },
> { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
> { "-mavx256-split-unaligned-store",
> MASK_AVX256_SPLIT_UNALIGNED_STORE },
> - { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
> + { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES },
> + { "-muse-unaligned-vector-move", MASK_USE_UNALIGNED_VECTOR_MOVE }
> };
>
> /* Additional flag options. */
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index f111411e599..7581e854021 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -5323,8 +5323,9 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> enum attr_mode insn_mode, machine_mode mode)
> {
> char buf[128];
> - bool misaligned_p = (misaligned_operand (operands[0], mode)
> - || misaligned_operand (operands[1], mode));
> + bool need_unaligned_p = (TARGET_USE_UNALIGNED_VECTOR_MOVE
> + || misaligned_operand (operands[0], mode)
> + || misaligned_operand (operands[1], mode));
> bool evex_reg_p = (size == 64
> || EXT_REX_SSE_REG_P (operands[0])
> || EXT_REX_SSE_REG_P (operands[1]));
> @@ -5380,17 +5381,17 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> {
> case opcode_int:
> if (scalar_mode == E_HFmode)
> - opcode = (misaligned_p
> + opcode = (need_unaligned_p
> ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
> : "vmovdqa64");
> else
> - opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> + opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
> break;
> case opcode_float:
> - opcode = misaligned_p ? "vmovups" : "vmovaps";
> + opcode = need_unaligned_p ? "vmovups" : "vmovaps";
> break;
> case opcode_double:
> - opcode = misaligned_p ? "vmovupd" : "vmovapd";
> + opcode = need_unaligned_p ? "vmovupd" : "vmovapd";
> break;
> }
> }
> @@ -5399,21 +5400,21 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> switch (scalar_mode)
> {
> case E_HFmode:
> - opcode = (misaligned_p
> + opcode = (need_unaligned_p
> ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
> : "vmovdqa64");
> break;
> case E_SFmode:
> - opcode = misaligned_p ? "%vmovups" : "%vmovaps";
> + opcode = need_unaligned_p ? "%vmovups" : "%vmovaps";
> break;
> case E_DFmode:
> - opcode = misaligned_p ? "%vmovupd" : "%vmovapd";
> + opcode = need_unaligned_p ? "%vmovupd" : "%vmovapd";
> break;
> case E_TFmode:
> if (evex_reg_p)
> - opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> + opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
> else
> - opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> + opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
> break;
> default:
> gcc_unreachable ();
> @@ -5425,13 +5426,13 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> {
> case E_QImode:
> if (evex_reg_p)
> - opcode = (misaligned_p
> + opcode = (need_unaligned_p
> ? (TARGET_AVX512BW
> ? "vmovdqu8"
> : "vmovdqu64")
> : "vmovdqa64");
> else
> - opcode = (misaligned_p
> + opcode = (need_unaligned_p
> ? (TARGET_AVX512BW
> ? "vmovdqu8"
> : "%vmovdqu")
> @@ -5439,13 +5440,13 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> break;
> case E_HImode:
> if (evex_reg_p)
> - opcode = (misaligned_p
> + opcode = (need_unaligned_p
> ? (TARGET_AVX512BW
> ? "vmovdqu16"
> : "vmovdqu64")
> : "vmovdqa64");
> else
> - opcode = (misaligned_p
> + opcode = (need_unaligned_p
> ? (TARGET_AVX512BW
> ? "vmovdqu16"
> : "%vmovdqu")
> @@ -5453,20 +5454,20 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> break;
> case E_SImode:
> if (evex_reg_p)
> - opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> + opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
> else
> - opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> + opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
> break;
> case E_DImode:
> case E_TImode:
> case E_OImode:
> if (evex_reg_p)
> - opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> + opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
> else
> - opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> + opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
> break;
> case E_XImode:
> - opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> + opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
> break;
> default:
> gcc_unreachable ();
> diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> index ad366974b5b..2162d10925a 100644
> --- a/gcc/config/i386/i386.opt
> +++ b/gcc/config/i386/i386.opt
> @@ -1170,3 +1170,7 @@ Support MWAIT and MONITOR built-in functions and code
> generation.
> mavx512fp16
> Target Mask(ISA2_AVX512FP16) Var(ix86_isa_flags2) Save
> Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and
> AVX512FP16 built-in functions and code generation.
> +
> +muse-unaligned-vector-move
> +Target Mask(USE_UNALIGNED_VECTOR_MOVE) Save
> +Emit unaligned vector move instructions.
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index fbf056bf9e6..dc99597f195 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -17059,24 +17059,28 @@
> switch (<MODE>mode)
> {
> case E_V8DFmode:
> - if (misaligned_operand (operands[2], <ssequartermode>mode))
> + if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> + || misaligned_operand (operands[2], <ssequartermode>mode))
> return "vmovupd\t{%2, %x0|%x0, %2}";
> else
> return "vmovapd\t{%2, %x0|%x0, %2}";
> case E_V16SFmode:
> - if (misaligned_operand (operands[2], <ssequartermode>mode))
> + if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> + || misaligned_operand (operands[2], <ssequartermode>mode))
> return "vmovups\t{%2, %x0|%x0, %2}";
> else
> return "vmovaps\t{%2, %x0|%x0, %2}";
> case E_V8DImode:
> - if (misaligned_operand (operands[2], <ssequartermode>mode))
> + if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> + || misaligned_operand (operands[2], <ssequartermode>mode))
> return which_alternative == 2 ? "vmovdqu64\t{%2, %x0|%x0, %2}"
> : "vmovdqu\t{%2, %x0|%x0, %2}";
> else
> return which_alternative == 2 ? "vmovdqa64\t{%2, %x0|%x0, %2}"
> : "vmovdqa\t{%2, %x0|%x0, %2}";
> case E_V16SImode:
> - if (misaligned_operand (operands[2], <ssequartermode>mode))
> + if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> + || misaligned_operand (operands[2], <ssequartermode>mode))
> return which_alternative == 2 ? "vmovdqu32\t{%2, %x0|%x0, %2}"
> : "vmovdqu\t{%2, %x0|%x0, %2}";
> else
> @@ -25238,27 +25242,32 @@
> switch (get_attr_mode (insn))
> {
> case MODE_V16SF:
> - if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> + if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> + || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> return "vmovups\t{%1, %t0|%t0, %1}";
> else
> return "vmovaps\t{%1, %t0|%t0, %1}";
> case MODE_V8DF:
> - if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> + if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> + || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> return "vmovupd\t{%1, %t0|%t0, %1}";
> else
> return "vmovapd\t{%1, %t0|%t0, %1}";
> case MODE_V8SF:
> - if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> + if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> + || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> return "vmovups\t{%1, %x0|%x0, %1}";
> else
> return "vmovaps\t{%1, %x0|%x0, %1}";
> case MODE_V4DF:
> - if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> + if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> + || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> return "vmovupd\t{%1, %x0|%x0, %1}";
> else
> return "vmovapd\t{%1, %x0|%x0, %1}";
> case MODE_XI:
> - if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> + if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> + || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> {
> if (which_alternative == 2)
> return "vmovdqu\t{%1, %t0|%t0, %1}";
> @@ -25277,7 +25286,8 @@
> return "vmovdqa32\t{%1, %t0|%t0, %1}";
> }
> case MODE_OI:
> - if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> + if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> + || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> {
> if (which_alternative == 2)
> return "vmovdqu\t{%1, %x0|%x0, %1}";
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 0cc8a8edd05..13777d62437 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -1418,6 +1418,7 @@ See RS/6000 and PowerPC Options.
> -mstack-protector-guard-offset=@var{offset} @gol
> -mstack-protector-guard-symbol=@var{symbol} @gol
> -mgeneral-regs-only -mcall-ms2sysv-xlogues @gol
> +-muse-unaligned-vector-move @gol
> -mindirect-branch=@var{choice} -mfunction-return=@var{choice} @gol
> -mindirect-branch-register -mneeded}
>
> @@ -31808,6 +31809,12 @@ resulting in fairly lengthy prologues and epilogues.
> Using
> use stubs in the static portion of libgcc to perform these saves and
> restores,
> thus reducing function size at the cost of a few extra instructions.
>
> +@item -muse-unaligned-vector-move
> +@opindex muse-unaligned-vector-move
> +@opindex mno-use-unaligned-vector-move
> +Use @option{-muse-unaligned-vector-move} to emit unaligned vector move
> +instructions like vmovdqu, vmovups, vmovupd.
> +
> @item -mtls-dialect=@var{type}
> @opindex mtls-dialect
> Generate code to access thread-local storage using the @samp{gnu} or
> diff --git
> a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> new file mode 100644
> index 00000000000..d21eee562ac
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> @@ -0,0 +1,102 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
> +
> +#define N 1024
> +
> +char **cp;
> +char **ep;
> +char **fp;
> +
> +void
> +test_char ()
> +{
> + int i;
> + char **ap = __builtin_assume_aligned (ep, 32);
> + char **zp;
> + for (i = 128; i > 0; i--)
> + {
> + *ap++ = *cp++;
> + *zp++ = *fp++;
> + }
> +}
> +
> +float f1[N], f2[N], f3[N];
> +
> +void
> +test_float (void)
> +{
> + for (int i = 0; i < N; i++)
> + {
> + f3[i] = f1[i] * f2[i];
> + }
> +}
> +
> +double d1[N], d2[N], d3[N];
> +
> +void
> +test_double_load (void)
> +{
> + for (int i = 0; i < N; i++)
> + {
> + d3[i] = d1[i] * d2[i];
> +
> + }
> +}
> +
> +unsigned char uc1[N], uc2[N], uc3[N];
> +void
> +test_unchar ()
> +{
> + for (int i=0;i<N;i++) {
> + uc3[i] = uc1[i] * uc2[i];
> + }
> +}
> +
> +short st1[N], st2[N], st3[N];
> +void
> +test_short ()
> +{
> + for (int i=0;i<N;i++) {
> + st3[i] = st1[i] * st2[i];
> + }
> +}
> +
> +int n1[N], n2[N], n3[N];
> +void
> +test_int ()
> +{
> + for (int i=0;i<N;i++) {
> + n3[i] = n1[i] * n2[i];
> + }
> +}
> +
> +long l1[N], l2[N], l3[N];
> +
> +void
> +test_long ()
> +{
> + for (int i=0; i<N; i++)
> + {
> + l3[i] = l1[i] *l2[i];
> + }
> +}
> +
> +long long ll1[N], ll2[N], ll3[N];
> +
> +void
> +test_long_long()
> +{
> + for (int i=0;i<N;i++)
> + {
> + ll3[i] = ll1[i]*ll2[i];
> + }
> +}
> +
> +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovapd" } } */
> +/* { dg-final { scan-assembler-times "vmovdqu" 19 { target lp64 } } } */
> +/* { dg-final { scan-assembler-times "vmovdqu" 46 { target x32 } } } */
> +/* { dg-final { scan-assembler-times "vmovdqu" 47 { target ia32 } } } */
> +/* { dg-final { scan-assembler-times "vmovups" 2 } } */
> +/* { dg-final { scan-assembler-times "vmovupd" 2 } } */
> diff --git
> a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> new file mode 100644
> index 00000000000..65c81105ebd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> @@ -0,0 +1,107 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
> +
> +#include <immintrin.h>
> +__m128 value128;
> +char src128[16];
> +
> +__m256 value256;
> +float src256[8];
> +
> +void add128(__m128* pointer) {
> + value128 = _mm_add_ps(value128, *pointer);
> +}
> +
> +void add256(__m256* pointer) {
> + value256 = _mm256_add_ps(value256, *pointer);
> +}
> +
> +__m128d value128d;
> +__m128d aux128d;
> +float src128f[4];
> +float res128f[4];
> +double src128d[2];
> +double res128d[2];
> +
> +void add128d(__m128d* pointer, __m128d aux, __m128d* res128d) {
> + value128d = _mm_add_pd(value128d, *pointer);
> + __m128d s1 = _mm_add_pd(aux, *pointer);
> + *res128d = _mm_add_pd(s1, value128d);
> +}
> +
> +__m256d value256d;
> +__m256d aux256d;
> +float src256f[8];
> +float res256f[8];
> +double src256d[4];
> +double res256d[4];
> +
> +void add256d(__m256d* pointer, __m256d aux, __m256d* res) {
> + value256d = _mm256_add_pd(value256d, *pointer);
> + __m256d s1 = _mm256_add_pd(aux, *pointer);
> + *res = _mm256_add_pd(s1, value256d);
> +}
> +
> +__m256i value256i;
> +__m256i aux256i;
> +char src256c[32];
> +char res256c[32];
> +short src256s[16];
> +short res256s[16];
> +int src256i[8];
> +int res256i[8];
> +long long src256l[4];
> +long long res256l[4];
> +
> +void add256i(__m256i* pointer, __m256i aux, __m256i* res) {
> + value256i = _mm256_add_epi32(value256i, *pointer);
> + __m256i s1 = _mm256_add_epi32(aux, *pointer);
> + *res = _mm256_add_epi32(s1, value256i);
> +}
> +
> +void foo1() {
> + add128((__m128*)src128);
> +}
> +
> +void foo2() {
> + add256((__m256*)src256);
> +}
> +
> +void foo3() {
> + add128d((__m128d*)src128d, aux128d, (__m128d*)res128d);
> +}
> +
> +void foo4() {
> + add128d((__m128d*)src128f, aux128d, (__m128d*)res128f);
> +}
> +
> +void foo5() {
> + add256d((__m256d*)src256f, aux256d, (__m256d*)res256f);
> +}
> +
> +void foo6() {
> + add256d((__m256d*)src256d, aux256d, (__m256d*)res256d);
> +}
> +
> +void foo7() {
> + add256i((__m256i*)src256c, aux256i, (__m256i*)res256c);
> +}
> +
> +void foo8() {
> + add256i((__m256i*)src256s, aux256i, (__m256i*)res256s);
> +}
> +
> +void foo9() {
> + add256i((__m256i*)src256i, aux256i, (__m256i*)res256i);
> +}
> +
> +void foo11() {
> + add256i((__m256i*)src256l, aux256i, (__m256i*)res256l);
> +}
> +
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovapd" } } */
> +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> +/* { dg-final { scan-assembler "vmovups" } } */
> +/* { dg-final { scan-assembler "vmovupd" } } */
> +/* { dg-final { scan-assembler "vmovdqu" } } */
> diff --git
> a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> new file mode 100644
> index 00000000000..59924304bae
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2 -mno-use-unaligned-vector-move" } */
> +
> +#include "avx2-vector-unaligned-load-store-2.c"
> +
> +/* { dg-final { scan-assembler-not "vmovups" { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-not "vmovupd" { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-not "vmovdqu" } } */
> +/* { dg-final { scan-assembler "vmovaps" } } */
> +/* { dg-final { scan-assembler "vmovapd" } } */
> +/* { dg-final { scan-assembler "vmovdqa" } } */
> diff --git
> a/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> new file mode 100644
> index 00000000000..3759fd9f2f4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx512vl -muse-unaligned-vector-move" } */
> +
> +#include "avx2-vector-unaligned-load-store-1.c"
> +
> +/* { dg-final { scan-assembler-not "vmovdqa32" } } */
> +/* { dg-final { scan-assembler-not "vmovdqa64" } } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovapd" } } */
> +/* { dg-final { scan-assembler "vmovdqu32" } } */
> +/* { dg-final { scan-assembler "vmovdqu64" } } */
> +/* { dg-final { scan-assembler "vmovups" } } */
> +/* { dg-final { scan-assembler "vmovupd" } } */
> --
> 2.18.1
>