On Wed, May 13, 2026 at 09:24:39AM +0100, Tamar Christina wrote:
> Hi Artemiy,
>
> > -----Original Message-----
> > From: Artemiy Volkov <[email protected]>
> > Sent: 27 April 2026 09:06
> > To: [email protected]
> > Cc: Tamar Christina <[email protected]>; Wilco Dijkstra
> > <[email protected]>; [email protected]; Richard
> > Earnshaw <[email protected]>; [email protected]; Alice
> > Carlotti <[email protected]>; Alex Coplan <[email protected]>;
> > Artemiy Volkov <[email protected]>
> > Subject: [PATCH 4/4] aarch64/sve: combine AdvSIMD and SVE vec_duplicates
> >
> > Currently, to duplicate a 64-bit or narrower value into an SVE register, we
> > choose to go via an intermediate 128-bit AdvSIMD register, viz.:
> >
> > svfloat32_t foo(float x) {
> > return svdupq_n_f32(x, x, x, x);
> > }
> >
> > which will produce the following code:
> >
> > dup v0.4s, v0.s[0]
> > dup z0.q, z0.q[0]
> > ret
> >
> > when compiled with -O2 -march=armv9-a+sve.
> >
> > This can be simplified into a single dup instruction going to an SVE
> > register directly from a scalar (or a smaller vector) value:
> >
> > mov z0.s, s0
> > ret
> >
> > To facilitate this, this patch adds a pattern that combine can use to
> > merge two vec_duplicate instructions (scalar -> AdvSIMD and AdvSIMD ->
> > SVE) into a single one (scalar -> SVE).
> >
> > To demonstrate the effect of this patch, the vec-init-23.c test from
> > AdvSIMD was reused as a new SVE test (vec_init_5.c).
> >
>
> Awesome!
>
> Patch is OK.
FWIW, it looks like the new SVE pattern suffers from the same issue as the
AdvSIMD patterns from patch #2: many of the mode combinations result in
invalid RTL. I'll fix this in v2 by switching to a single-iterator-driven
approach.
Thanks,
Artemiy
>
> Thanks,
> Tamar
>
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-sve.md
> > (*aarch64_vec_duplicate_subvector<SVE_FULL:mode><VQ:mode><V
> > DUP:mode>):
> > New pattern.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/sve/vec_init_5.c: New test.
> > ---
> > gcc/config/aarch64/aarch64-sve.md | 14 +
> > .../gcc.target/aarch64/sve/vec_init_5.c | 380 ++++++++++++++++++
> > 2 files changed, 394 insertions(+)
> > create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-sve.md
> > b/gcc/config/aarch64/aarch64-sve.md
> > index 019630eb8d2..46bb8eb82ad 100644
> > --- a/gcc/config/aarch64/aarch64-sve.md
> > +++ b/gcc/config/aarch64/aarch64-sve.md
> > @@ -2890,6 +2890,20 @@
> > [(set_attr "sve_type" "sve_int_general")]
> > )
> >
> > +;; Initialize an SVE vector by duplicating a 128-bit AdvSIMD register that
> > +;; itself contains a duplicated scalar or subvector value.
> > +(define_insn
> > "*aarch64_vec_duplicate_subvector<SVE_FULL:mode><VQ:mode><VDUP:mo
> > de>"
> > + [(set (match_operand:SVE_FULL 0 "register_operand")
> > + (vec_duplicate:SVE_FULL
> > + (vec_duplicate:VQ
> > + (match_operand:VDUP 1 "register_operand"))))]
> > + "TARGET_SVE"
> > + {@ [ cons: =0 , 1 ]
> > + [ w , r ] mov\t%0.<VDUP:single_type>, %<VDUP:single_wx>1
> > + [ w , w ] mov\t%0.<VDUP:single_type>, %<VDUP:single_type>1
> > + }
> > +)
> > +
> > ;; This is used for vec_duplicate<mode>s from memory, but can also
> > ;; be used by combine to optimize selects of a vec_duplicate<mode>
> > ;; with zero.
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c
> > new file mode 100644
> > index 00000000000..3decc9ab1b6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c
> > @@ -0,0 +1,380 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +#include <arm_sve.h>
> > +
> > +#define TESTCASE(TYPE, ETYPE, T, SZ, NUM, MULT, ...)\
> > + sv##TYPE##SZ##_t test_##TYPE##SZ##_##NUM (ETYPE x0, ETYPE x1, ETYPE
> > x2, ETYPE x3,\
> > + ETYPE x4, ETYPE x5, ETYPE x6, ETYPE
> > x7)\
> > + {\
> > + return svdupq_n_##T##SZ (__VA_ARGS__);\
> > + }
> > +
> > +#include "../vec-init-23.c"
> > +
> > +/*
> > +** test_int8_1:
> > +** mov z0\.b, w0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int8_2:
> > +** bfi w0, w1, 8, 8
> > +** mov z0\.h, w0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int8_3:
> > +** bfi w0, w1, 8, 8
> > +** bfi w0, w2, 16, 8
> > +** bfi w0, w3, 24, 8
> > +** mov z0\.s, w0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int8_4:
> > +** bfi w0, w2, 8, 8
> > +** bfi w1, w3, 8, 8
> > +** bfi w0, w4, 16, 8
> > +** bfi w1, w5, 16, 8
> > +** bfi w0, w6, 24, 8
> > +** bfi w1, w7, 24, 8
> > +** dup v31\.2s, w0
> > +** dup v30\.2s, w1
> > +** zip1 v31\.16b, v31\.16b, v30\.16b
> > +** dup z0\.q, z31\.q\[0\]
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int8_5:
> > +** uxtb w0, w0
> > +** mov z0\.h, w0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int8_6:
> > +** mov w1, 0
> > +** bfi w1, w0, 8, 8
> > +** mov z0\.h, w1
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int8_7:
> > +** mov w2, 16777472
> > +** bfi w2, w0, 0, 8
> > +** bfi w2, w1, 8, 8
> > +** mov z0\.s, w2
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int8_8:
> > +** mov w2, 16777472
> > +** bfi w2, w0, 16, 8
> > +** bfi w2, w1, 24, 8
> > +** mov z0\.s, w2
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int8_9:
> > +** mov w2, 16842752
> > +** bfi w2, w0, 0, 8
> > +** bfi w2, w1, 16, 8
> > +** mov z0.s, w2
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int8_10:
> > +** bfi w0, w1, 8, 8
> > +** bfi w0, w2, 16, 8
> > +** bfi w0, w3, 24, 8
> > +** dup v31\.2s, w0
> > +** adrp x0, .LANCHOR[0-9]+
> > +** ldr d30, \[x0, #:lo12:.LANCHOR[0-9]+\]
> > +** zip1 v31\.16b, v31\.16b, v30\.16b
> > +** dup z0\.q, z31\.q\[0\]
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int8_11:
> > +** bfi w0, w1, 8, 8
> > +** adrp x4, .LANCHOR[0-9]+
> > +** bfi w0, w2, 16, 8
> > +** ldr d31, \[x4, #:lo12:\.LANCHOR[0-9]+\]
> > +** bfi w0, w3, 24, 8
> > +** dup v30\.2s, w0
> > +** zip1 v31\.16b, v31\.16b, v30\.16b
> > +** dup z0\.q, z31\.q\[0\]
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int8_12:
> > +** mov w4, 33685504
> > +** bfi w4, w0, 0, 8
> > +** mov w0, 257
> > +** movk w0, 0x303, lsl 16
> > +** bfi w0, w1, 0, 8
> > +** bfi w4, w2, 16, 8
> > +** bfi w0, w3, 16, 8
> > +** dup v31\.2s, w4
> > +** dup v30\.2s, w0
> > +** zip1 v31\.16b, v31\.16b, v30\.16b
> > +** dup z0\.q, z31\.q\[0\]
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int8_13:
> > +** mov w4, 33685504
> > +** bfi w4, w0, 8, 8
> > +** mov w0, 257
> > +** movk w0, 0x303, lsl 16
> > +** bfi w0, w1, 8, 8
> > +** bfi w4, w2, 24, 8
> > +** bfi w0, w3, 24, 8
> > +** dup v31\.2s, w4
> > +** dup v30\.2s, w0
> > +** zip1 v31\.16b, v31\.16b, v30\.16b
> > +** dup z0\.q, z31\.q\[0\]
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_float16_1:
> > +** fcvt h0, s0
> > +** mov z0\.h, h0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_float16_2:
> > +** fcvt h1, s1
> > +** fcvt h0, s0
> > +** uzp1 v0\.4h, v0\.4h, v1\.4h
> > +** mov z0\.s, s0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_float16_3:
> > +** uzp1 v2\.2s, v0\.2s, v2\.2s
> > +** uzp1 v3\.2s, v1\.2s, v3\.2s
> > +** zip1 v3\.4s, v2\.4s, v3\.4s
> > +** fcvtn v3\.4h, v3\.4s
> > +** mov z0\.d, d3
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_float16_4:
> > +** fcvt h0, s0
> > +** fmov h0, h0
> > +** mov z0\.s, s0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_float16_5:
> > +** movi v31\.4h, #0
> > +** fcvt h0, s0
> > +** uzp1 v0\.4h, v31\.4h, v0\.4h
> > +** mov z0\.s, s0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_float16_6:
> > +** fcvt h2, s0
> > +** fcvt h1, s1
> > +** fmov h31, 1.0e\+0
> > +** fmov h2, h2
> > +** uzp1 v1\.4h, v1\.4h, v31\.4h
> > +** dup v0\.2s, v2\.s\[0\]
> > +** dup v1\.2s, v1\.s\[0\]
> > +** zip1 v0\.8h, v0\.8h, v1\.8h
> > +** dup z0\.q, z0\.q\[0\]
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_float16_7:
> > +** fcvt h3, s0
> > +** fcvt h2, s1
> > +** movi v0\.4h, #0
> > +** fmov h1, 1.0e\+0
> > +** uzp1 v1\.4h, v1\.4h, v2\.4h
> > +** uzp1 v0\.4h, v0\.4h, v3\.4h
> > +** dup v1\.2s, v1\.s\[0\]
> > +** dup v0\.2s, v0\.s\[0\]
> > +** zip1 v0\.8h, v0\.8h, v1\.8h
> > +** dup z0\.q, z0\.q\[0\]
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_float16_8:
> > +** fcvt h1, s1
> > +** fcvt h0, s0
> > +** movi v31\.2s, 0x3c, lsl 24
> > +** uzp1 v0\.4h, v0\.4h, v1.4h
> > +** dup v0\.2s, v0\.s\[0\]
> > +** zip1 v0\.8h, v31\.8h, v0\.8h
> > +** dup z0\.q, z0\.q\[0\]
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int16_1:
> > +** mov z0\.h, w0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int16_2:
> > +** bfi w0, w1, 16, 16
> > +** mov z0\.s, w0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int16_3:
> > +** bfi w0, w2, 16, 16
> > +** bfi w1, w3, 16, 16
> > +** dup v31\.2s, w0
> > +** dup v30\.2s, w1
> > +** zip1 v31\.8h, v31\.8h, v30\.8h
> > +** dup z0\.q, z31\.q\[0\]
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int16_4:
> > +** uxth w0, w0
> > +** mov z0\.s, w0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int16_5:
> > +** mov w1, 0
> > +** bfi w1, w0, 16, 16
> > +** mov z0\.s, w1
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int16_6:
> > +** uxth w0, w0
> > +** dup v31\.2s, w0
> > +** mov w0, 1
> > +** bfi w1, w0, 16, 16
> > +** dup v30\.2s, w1
> > +** zip1 v31\.8h, v31\.8h, v30\.8h
> > +** dup z0\.q, z31\.q\[0\]
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int16_7:
> > +** mov w2, 0
> > +** bfi w2, w0, 16, 16
> > +** mov w0, 65537
> > +** bfi w0, w1, 16, 16
> > +** dup v31\.2s, w2
> > +** dup v30\.2s, w0
> > +** zip1 v31\.8h, v31\.8h, v30\.8h
> > +** dup z0\.q, z31\.q\[0\]
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int16_8:
> > +** bfi w0, w1, 16, 16
> > +** movi v31\.2s, 0x1, lsl 16
> > +** dup v30\.2s, w0
> > +** zip1 v31\.8h, v31\.8h, v30\.8h
> > +** dup z0\.q, z31\.q\[0\]
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_float32_1:
> > +** mov z0\.s, s0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_float32_2:
> > +** uzp1 v0\.2s, v0\.2s, v1\.2s
> > +** mov z0\.d, d0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_float32_3:
> > +** fmov s0, s0
> > +** mov z0\.d, d0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_float32_4:
> > +** movi v31\.2s, #0
> > +** uzp1 v0\.2s, v31\.2s, v0\.2s
> > +** mov z0\.d, d0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int32_1:
> > +** mov z0\.s, w0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int32_2:
> > +** fmov s0, w0
> > +** ins v0\.s\[1\], w1
> > +** mov z0\.d, d0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int32_3:
> > +** fmov s0, w0
> > +** mov z0\.d, d0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int32_4:
> > +** dup v30\.2s, w0
> > +** movi v31\.2s, 0
> > +** zip1 v31\.4s, v31\.4s, v30\.4s
> > +** dup z0\.q, z31\.q\[0\]
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_int64_1:
> > +** mov z0\.d, x0
> > +** ret
> > +*/
> > +
> > +/*
> > +** test_float64_1:
> > +** mov z0\.d, d0
> > +** ret
> > +*/
> > +
> > --
> > 2.43.0
>