Hi Artemiy,
> -----Original Message-----
> From: Artemiy Volkov <[email protected]>
> Sent: 27 April 2026 09:06
> To: [email protected]
> Cc: Tamar Christina <[email protected]>; Wilco Dijkstra
> <[email protected]>; [email protected]; Richard
> Earnshaw <[email protected]>; [email protected]; Alice
> Carlotti <[email protected]>; Alex Coplan <[email protected]>;
> Artemiy Volkov <[email protected]>
> Subject: [PATCH 4/4] aarch64/sve: combine AdvSIMD and SVE vec_duplicates
>
> Currently, to duplicate a 64-bit or narrower value into a SVE register, we
> choose to go via an intermediate 128-bit AdvSIMD register, viz.:
>
> svfloat32_t foo(float x) {
> return svdupq_n_f32(x, x, x, x);
> }
>
> which will produce the following code:
>
> dup v0.4s, v0.s[0]
> dup z0.q, z0.q[0]
> ret
>
> when compiled with -O2 -march=armv9-a+sve.
>
> This can be simplified into a single dup instruction going to an SVE
> register directly from a scalar (or a smaller vector) value:
>
> mov z0.s, s0
> ret
>
> To facilitate this, this patch adds a pattern that combine can use to
> merge two vec_duplicate instructions (scalar -> AdvSIMD and AdvSIMD ->
> SVE) into a single one (scalar -> SVE).
>
> To demonstrate the effect of this patch, the vec-init-23.c test from
> AdvSIMD was reused as a new SVE test (vec_init_5.c).
>
Awesome!
Patch is OK.
Thanks,
Tamar
> gcc/ChangeLog:
>
> * config/aarch64/aarch64-sve.md
> (*aarch64_vec_duplicate_subvector<SVE_FULL:mode><VQ:mode><VDUP:mode>):
> New pattern.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/sve/vec_init_5.c: New test.
> ---
> gcc/config/aarch64/aarch64-sve.md | 14 +
> .../gcc.target/aarch64/sve/vec_init_5.c | 380 ++++++++++++++++++
> 2 files changed, 394 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c
>
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 019630eb8d2..46bb8eb82ad 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -2890,6 +2890,20 @@
> [(set_attr "sve_type" "sve_int_general")]
> )
>
> +;; Initialize an SVE vector by duplicating a 128-bit AdvSIMD register that
> +;; itself contains a duplicated scalar or subvector value.
> +(define_insn "*aarch64_vec_duplicate_subvector<SVE_FULL:mode><VQ:mode><VDUP:mode>"
> + [(set (match_operand:SVE_FULL 0 "register_operand")
> + (vec_duplicate:SVE_FULL
> + (vec_duplicate:VQ
> + (match_operand:VDUP 1 "register_operand"))))]
> + "TARGET_SVE"
> + {@ [ cons: =0 , 1 ]
> + [ w , r ] mov\t%0.<VDUP:single_type>, %<VDUP:single_wx>1
> + [ w , w ] mov\t%0.<VDUP:single_type>, %<VDUP:single_type>1
> + }
> +)
> +
> ;; This is used for vec_duplicate<mode>s from memory, but can also
> ;; be used by combine to optimize selects of a vec_duplicate<mode>
> ;; with zero.
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c
> new file mode 100644
> index 00000000000..3decc9ab1b6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c
> @@ -0,0 +1,380 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <arm_sve.h>
> +
> +#define TESTCASE(TYPE, ETYPE, T, SZ, NUM, MULT, ...)\
> + sv##TYPE##SZ##_t test_##TYPE##SZ##_##NUM (ETYPE x0, ETYPE x1, ETYPE x2, ETYPE x3,\
> + ETYPE x4, ETYPE x5, ETYPE x6, ETYPE x7)\
> + {\
> + return svdupq_n_##T##SZ (__VA_ARGS__);\
> + }
> +
> +#include "../vec-init-23.c"
> +
> +/*
> +** test_int8_1:
> +** mov z0\.b, w0
> +** ret
> +*/
> +
> +/*
> +** test_int8_2:
> +** bfi w0, w1, 8, 8
> +** mov z0\.h, w0
> +** ret
> +*/
> +
> +/*
> +** test_int8_3:
> +** bfi w0, w1, 8, 8
> +** bfi w0, w2, 16, 8
> +** bfi w0, w3, 24, 8
> +** mov z0\.s, w0
> +** ret
> +*/
> +
> +/*
> +** test_int8_4:
> +** bfi w0, w2, 8, 8
> +** bfi w1, w3, 8, 8
> +** bfi w0, w4, 16, 8
> +** bfi w1, w5, 16, 8
> +** bfi w0, w6, 24, 8
> +** bfi w1, w7, 24, 8
> +** dup v31\.2s, w0
> +** dup v30\.2s, w1
> +** zip1 v31\.16b, v31\.16b, v30\.16b
> +** dup z0\.q, z31\.q\[0\]
> +** ret
> +*/
> +
> +/*
> +** test_int8_5:
> +** uxtb w0, w0
> +** mov z0\.h, w0
> +** ret
> +*/
> +
> +/*
> +** test_int8_6:
> +** mov w1, 0
> +** bfi w1, w0, 8, 8
> +** mov z0\.h, w1
> +** ret
> +*/
> +
> +/*
> +** test_int8_7:
> +** mov w2, 16777472
> +** bfi w2, w0, 0, 8
> +** bfi w2, w1, 8, 8
> +** mov z0\.s, w2
> +** ret
> +*/
> +
> +/*
> +** test_int8_8:
> +** mov w2, 16777472
> +** bfi w2, w0, 16, 8
> +** bfi w2, w1, 24, 8
> +** mov z0\.s, w2
> +** ret
> +*/
> +
> +/*
> +** test_int8_9:
> +** mov w2, 16842752
> +** bfi w2, w0, 0, 8
> +** bfi w2, w1, 16, 8
> +** mov z0\.s, w2
> +** ret
> +*/
> +
> +/*
> +** test_int8_10:
> +** bfi w0, w1, 8, 8
> +** bfi w0, w2, 16, 8
> +** bfi w0, w3, 24, 8
> +** dup v31\.2s, w0
> +** adrp x0, \.LANCHOR[0-9]+
> +** ldr d30, \[x0, #:lo12:\.LANCHOR[0-9]+\]
> +** zip1 v31\.16b, v31\.16b, v30\.16b
> +** dup z0\.q, z31\.q\[0\]
> +** ret
> +*/
> +
> +/*
> +** test_int8_11:
> +** bfi w0, w1, 8, 8
> +** adrp x4, \.LANCHOR[0-9]+
> +** bfi w0, w2, 16, 8
> +** ldr d31, \[x4, #:lo12:\.LANCHOR[0-9]+\]
> +** bfi w0, w3, 24, 8
> +** dup v30\.2s, w0
> +** zip1 v31\.16b, v31\.16b, v30\.16b
> +** dup z0\.q, z31\.q\[0\]
> +** ret
> +*/
> +
> +/*
> +** test_int8_12:
> +** mov w4, 33685504
> +** bfi w4, w0, 0, 8
> +** mov w0, 257
> +** movk w0, 0x303, lsl 16
> +** bfi w0, w1, 0, 8
> +** bfi w4, w2, 16, 8
> +** bfi w0, w3, 16, 8
> +** dup v31\.2s, w4
> +** dup v30\.2s, w0
> +** zip1 v31\.16b, v31\.16b, v30\.16b
> +** dup z0\.q, z31\.q\[0\]
> +** ret
> +*/
> +
> +/*
> +** test_int8_13:
> +** mov w4, 33685504
> +** bfi w4, w0, 8, 8
> +** mov w0, 257
> +** movk w0, 0x303, lsl 16
> +** bfi w0, w1, 8, 8
> +** bfi w4, w2, 24, 8
> +** bfi w0, w3, 24, 8
> +** dup v31\.2s, w4
> +** dup v30\.2s, w0
> +** zip1 v31\.16b, v31\.16b, v30\.16b
> +** dup z0\.q, z31\.q\[0\]
> +** ret
> +*/
> +
> +/*
> +** test_float16_1:
> +** fcvt h0, s0
> +** mov z0\.h, h0
> +** ret
> +*/
> +
> +/*
> +** test_float16_2:
> +** fcvt h1, s1
> +** fcvt h0, s0
> +** uzp1 v0\.4h, v0\.4h, v1\.4h
> +** mov z0\.s, s0
> +** ret
> +*/
> +
> +/*
> +** test_float16_3:
> +** uzp1 v2\.2s, v0\.2s, v2\.2s
> +** uzp1 v3\.2s, v1\.2s, v3\.2s
> +** zip1 v3\.4s, v2\.4s, v3\.4s
> +** fcvtn v3\.4h, v3\.4s
> +** mov z0\.d, d3
> +** ret
> +*/
> +
> +/*
> +** test_float16_4:
> +** fcvt h0, s0
> +** fmov h0, h0
> +** mov z0\.s, s0
> +** ret
> +*/
> +
> +/*
> +** test_float16_5:
> +** movi v31\.4h, #0
> +** fcvt h0, s0
> +** uzp1 v0\.4h, v31\.4h, v0\.4h
> +** mov z0\.s, s0
> +** ret
> +*/
> +
> +/*
> +** test_float16_6:
> +** fcvt h2, s0
> +** fcvt h1, s1
> +** fmov h31, 1.0e\+0
> +** fmov h2, h2
> +** uzp1 v1\.4h, v1\.4h, v31\.4h
> +** dup v0\.2s, v2\.s\[0\]
> +** dup v1\.2s, v1\.s\[0\]
> +** zip1 v0\.8h, v0\.8h, v1\.8h
> +** dup z0\.q, z0\.q\[0\]
> +** ret
> +*/
> +
> +/*
> +** test_float16_7:
> +** fcvt h3, s0
> +** fcvt h2, s1
> +** movi v0\.4h, #0
> +** fmov h1, 1.0e\+0
> +** uzp1 v1\.4h, v1\.4h, v2\.4h
> +** uzp1 v0\.4h, v0\.4h, v3\.4h
> +** dup v1\.2s, v1\.s\[0\]
> +** dup v0\.2s, v0\.s\[0\]
> +** zip1 v0\.8h, v0\.8h, v1\.8h
> +** dup z0\.q, z0\.q\[0\]
> +** ret
> +*/
> +
> +/*
> +** test_float16_8:
> +** fcvt h1, s1
> +** fcvt h0, s0
> +** movi v31\.2s, 0x3c, lsl 24
> +** uzp1 v0\.4h, v0\.4h, v1\.4h
> +** dup v0\.2s, v0\.s\[0\]
> +** zip1 v0\.8h, v31\.8h, v0\.8h
> +** dup z0\.q, z0\.q\[0\]
> +** ret
> +*/
> +
> +/*
> +** test_int16_1:
> +** mov z0\.h, w0
> +** ret
> +*/
> +
> +/*
> +** test_int16_2:
> +** bfi w0, w1, 16, 16
> +** mov z0\.s, w0
> +** ret
> +*/
> +
> +/*
> +** test_int16_3:
> +** bfi w0, w2, 16, 16
> +** bfi w1, w3, 16, 16
> +** dup v31\.2s, w0
> +** dup v30\.2s, w1
> +** zip1 v31\.8h, v31\.8h, v30\.8h
> +** dup z0\.q, z31\.q\[0\]
> +** ret
> +*/
> +
> +/*
> +** test_int16_4:
> +** uxth w0, w0
> +** mov z0\.s, w0
> +** ret
> +*/
> +
> +/*
> +** test_int16_5:
> +** mov w1, 0
> +** bfi w1, w0, 16, 16
> +** mov z0\.s, w1
> +** ret
> +*/
> +
> +/*
> +** test_int16_6:
> +** uxth w0, w0
> +** dup v31\.2s, w0
> +** mov w0, 1
> +** bfi w1, w0, 16, 16
> +** dup v30\.2s, w1
> +** zip1 v31\.8h, v31\.8h, v30\.8h
> +** dup z0\.q, z31\.q\[0\]
> +** ret
> +*/
> +
> +/*
> +** test_int16_7:
> +** mov w2, 0
> +** bfi w2, w0, 16, 16
> +** mov w0, 65537
> +** bfi w0, w1, 16, 16
> +** dup v31\.2s, w2
> +** dup v30\.2s, w0
> +** zip1 v31\.8h, v31\.8h, v30\.8h
> +** dup z0\.q, z31\.q\[0\]
> +** ret
> +*/
> +
> +/*
> +** test_int16_8:
> +** bfi w0, w1, 16, 16
> +** movi v31\.2s, 0x1, lsl 16
> +** dup v30\.2s, w0
> +** zip1 v31\.8h, v31\.8h, v30\.8h
> +** dup z0\.q, z31\.q\[0\]
> +** ret
> +*/
> +
> +/*
> +** test_float32_1:
> +** mov z0\.s, s0
> +** ret
> +*/
> +
> +/*
> +** test_float32_2:
> +** uzp1 v0\.2s, v0\.2s, v1\.2s
> +** mov z0\.d, d0
> +** ret
> +*/
> +
> +/*
> +** test_float32_3:
> +** fmov s0, s0
> +** mov z0\.d, d0
> +** ret
> +*/
> +
> +/*
> +** test_float32_4:
> +** movi v31\.2s, #0
> +** uzp1 v0\.2s, v31\.2s, v0\.2s
> +** mov z0\.d, d0
> +** ret
> +*/
> +
> +/*
> +** test_int32_1:
> +** mov z0\.s, w0
> +** ret
> +*/
> +
> +/*
> +** test_int32_2:
> +** fmov s0, w0
> +** ins v0\.s\[1\], w1
> +** mov z0\.d, d0
> +** ret
> +*/
> +
> +/*
> +** test_int32_3:
> +** fmov s0, w0
> +** mov z0\.d, d0
> +** ret
> +*/
> +
> +/*
> +** test_int32_4:
> +** dup v30\.2s, w0
> +** movi v31\.2s, 0
> +** zip1 v31\.4s, v31\.4s, v30\.4s
> +** dup z0\.q, z31\.q\[0\]
> +** ret
> +*/
> +
> +/*
> +** test_int64_1:
> +** mov z0\.d, x0
> +** ret
> +*/
> +
> +/*
> +** test_float64_1:
> +** mov z0\.d, d0
> +** ret
> +*/
> +
> --
> 2.43.0