https://gcc.gnu.org/g:3027010d8bcc854eb43425cb1da573ff7345a5ac
commit r16-5420-g3027010d8bcc854eb43425cb1da573ff7345a5ac
Author: Tamar Christina <[email protected]>
Date:   Wed Nov 19 14:27:55 2025 +0000

    AArch64: expand extractions of Adv.SIMD registers from SVE as separate insn.

    For this example using the Adv.SIMD/SVE bridge:

        #include <arm_neon.h>
        #include <arm_neon_sve_bridge.h>
        #include <stdint.h>

        svint16_t sub_neon_i16_sve_bridged(svint8_t a, svint8_t b) {
            return svset_neonq_s16(svundef_s16(),
                        vsubq_s16(vmovl_high_s8(svget_neonq(a)),
                                  vmovl_high_s8(svget_neonq(b))));
        }

    we generate:

        sub_neon_i16_sve_bridged(__SVInt8_t, __SVInt8_t):
                sxtl2   v0.8h, v0.16b
                ssubw2  v0.8h, v0.8h, v1.16b
                ret

    instead of just:

        sub_neon_i16_sve_bridged(__SVInt8_t, __SVInt8_t):
                ssubl2  v0.8h, v0.16b, v1.16b
                ret

    Commit g:abf865732a7313cf79ffa325faed3467ed28d8b8 added a framework to
    fold uses of intrinsics combined with lo/hi extractions into the
    appropriate low- or high-part instructions.  However this does not
    trigger here because the Adv.SIMD-from-SVE extraction in
    vmovl_high_s8(svget_neonq(a)) does not have a constant argument, and the
    framework only supports folding 2 insns into 1, not 3.

    The above generates the following RTL:

        (insn 7 4 8 2 (set (reg:V8QI 103 [ _6 ])
                (vec_select:V8QI (subreg:V16QI (reg/v:VNx16QI 109 [ a ]) 0)
                    (parallel:V16QI [
                            (const_int 8 [0x8])
                            (const_int 9 [0x9])
                            (const_int 10 [0xa])
                            (const_int 11 [0xb])
                            (const_int 12 [0xc])
                            (const_int 13 [0xd])
                            (const_int 14 [0xe])
                            (const_int 15 [0xf])
                        ]))) "":3174:43 -1
             (nil))

    Since the SVE and Adv.SIMD modes are tieable this is a valid instruction
    to form, but it is suboptimal in that we cannot fold it into the existing
    instruction patterns.  Eventually early-ra will split the SVE register
    off from the patterns, but by then we are past combine and the insn
    foldings, so we miss all of those optimizations.

    This patch introduces vec_extract optabs for 128-bit and 64-bit Adv.SIMD
    vector extractions from SVE registers and emits an explicit separate
    instruction for the subregs.  This gives combine and RTL folding the
    opportunity to form the combined instructions; if they do not, we arrive
    at the same RTL after early-ra as before.
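
    As a small sketch of the 64-bit path (the function below is illustrative
    and not part of the patch; the equivalent low-half case is exercised by
    the new fold_to_highpart_2.c test), extracting the low halves through the
    bridge now also folds into a single widening instruction:

        svint16_t sub_low_sketch(svint8_t a, svint8_t b) {
            /* Illustrative sketch: vget_low_s8 of the bridged value is
               effectively a 64-bit extraction at offset 0 of the SVE
               register, which the new expander emits as a separate insn
               so combine can fold it.  */
            return svset_neonq_s16(svundef_s16(),
                        vsubq_s16(vmovl_s8(vget_low_s8(svget_neonq(a))),
                                  vmovl_s8(vget_low_s8(svget_neonq(b)))));
        }

    which, per the expected assembly in that test, compiles to:

        ssubl   v0.8h, v0.8b, v1.8b
        ret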

    gcc/ChangeLog:

            * config/aarch64/aarch64-sve.md (vec_extract<mode><v128>,
            vec_extract<mode><v64>): New.
            * config/aarch64/iterators.md (V64, v64): New.
            * config/aarch64/predicates.md (const0_to_1_operand): New.

    gcc/testsuite/ChangeLog:

            * gcc.target/aarch64/simd/fold_to_highpart_6.c: Update codegen.
            * gcc.target/aarch64/sve/fold_to_highpart_1.c: New test.
            * gcc.target/aarch64/sve/fold_to_highpart_2.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve.md                  |  42 +++
 gcc/config/aarch64/iterators.md                    |  12 +
 gcc/config/aarch64/predicates.md                   |   4 +
 .../gcc.target/aarch64/simd/fold_to_highpart_6.c   |   9 +-
 .../gcc.target/aarch64/sve/fold_to_highpart_1.c    |  19 ++
 .../gcc.target/aarch64/sve/fold_to_highpart_2.c    | 295 +++++++++++++++++++++
 6 files changed, 380 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 4648aa67e0c3..26c08dbd9208 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3112,6 +3112,48 @@
 }
 )
 
+;; Don't allow expansions of SVE to Adv. SIMD registers immediately as subregs.
+;; Doing so prevents combine from matching instructions generated by the
+;; SVE/Adv. SIMD bridge as the SVE modes are not valid inside the instructions.
+;; Eventually early-ra or reload will split them but by then we've lost the
+;; combinations.  Instead split them early and allow fwprop or combine to
+;; push them into instructions where they are actually supported as part of the
+;; instruction.
+(define_expand "vec_extract<mode><v128>"
+  [(match_operand:<V128> 0 "register_operand")
+   (match_operand:SVE_FULL 1 "register_operand")
+   (match_operand:SI 2 "const0_operand")]
+  "TARGET_SVE"
+{
+  emit_move_insn (operands[0],
+                  force_lowpart_subreg (<V128>mode, operands[1], <MODE>mode));
+  DONE;
+})
+
+;; Similarly for extractions of 64-bit Adv. SIMD vectors from SVE vectors.  For
+;; these extractions we can support offsets 0 and 1 by first extracting a
+;; 128-bit vector and then selecting the appropriate half.
+(define_expand "vec_extract<mode><v64>"
+  [(match_operand:<V64> 0 "register_operand")
+   (match_operand:SVE_FULL_BHS 1 "register_operand")
+   (match_operand:SI 2 "const0_to_1_operand")]
+  "TARGET_SVE"
+{
+  if (CONST0_RTX (SImode) == operands[2])
+    emit_move_insn (operands[0],
+                    force_lowpart_subreg (<V64>mode, operands[1],
+                                          <MODE>mode));
+  else
+    {
+      rtx tmp = gen_reg_rtx (<V128>mode);
+      emit_move_insn (tmp,
+                      force_lowpart_subreg (<V128>mode, operands[1],
+                                            <MODE>mode));
+      emit_insn (gen_vec_extract<v128><v64> (operands[0], tmp, operands[2]));
+    }
+  DONE;
+})
+
 ;; Extract element zero.  This is a special case because we want to force
 ;; the registers to be the same for the second alternative, and then
 ;; split the instruction into nothing after RA.
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 32e5009a1f96..0c80b7adeaef 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1820,6 +1820,18 @@
                          (VNx4SI "v4si") (VNx4SF "v4sf")
                          (VNx2DI "v2di") (VNx2DF "v2df")])
 
+;; Gives the mode of the 64-bit lowpart of an SVE vector.
+(define_mode_attr V64 [(VNx16QI "V8QI")
+                       (VNx8HI "V4HI") (VNx8HF "V4HF") (VNx8BF "V4BF")
+                       (VNx4SI "V2SI") (VNx4SF "V2SF")
+                       (VNx2DI "DI") (VNx2DF "DF")])
+
+;; ...and again in lower case.
+(define_mode_attr v64 [(VNx16QI "v8qi")
+                       (VNx8HI "v4hi") (VNx8HF "v4hf") (VNx8BF "v4bf")
+                       (VNx4SI "v2si") (VNx4SF "v2sf")
+                       (VNx2DI "di") (VNx2DF "df")])
+
 (define_mode_attr vnx [(V4SI "vnx4si") (V2DI "vnx2di")])
 
 ;; 64-bit container modes the inner or scalar source mode.
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 3214476497c6..f53591d4b045 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -46,6 +46,10 @@
   (and (match_code "const_int")
        (match_test "op == CONST0_RTX (mode)")))
 
+(define_predicate "const0_to_1_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 1)")))
+
 (define_predicate "const_0_to_7_operand"
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), 0, 7)")))
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
index 3570d4da34b5..83ef2148fd84 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
@@ -1,6 +1,7 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target aarch64_little_endian } */
 /* { dg-options "-O2 -march=armv8-a+sve" } */
+/* { dg-final { check-function-bodies "**" "" } } */
 
 #include <arm_neon_sve_bridge.h>
 
@@ -16,6 +17,11 @@ test_addressable ()
   return vmovl_s8 (vget_high_s8 (z));
 }
 
+/*
+** test_scalable_type:
+**	sxtl2	v0.8h, v0.16b
+**	ret
+*/
 int16x8_t
 test_scalable_type (svint8_t scalable)
 {
@@ -34,4 +40,5 @@ test_256b_type (int16x16_t foo)
   return vmovl_s16 ((int16x4_t) { foo[4], foo[5], foo[6], foo[7] });
 }
 
-/* { dg-final { scan-assembler-not {sxtl2\t} } } */
+/* { dg-final { scan-assembler-times {sxtl2\t} 1 } } */
+/* { dg-final { scan-assembler-times {sxtl\t} 3 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c
new file mode 100644
index 000000000000..a3d59a498bf6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+#include <arm_neon_sve_bridge.h>
+#include <stdint.h>
+
+/*
+** sub_neon_i16_sve_bridged:
+**	ssubl2	v0.8h, v0.16b, v1.16b
+**	ret
+*/
+svint16_t sub_neon_i16_sve_bridged(svint8_t a, svint8_t b) {
+  return svset_neonq_s16(svundef_s16(),
+                         vsubq_s16(vmovl_high_s8(svget_neonq(a)),
+                                   vmovl_high_s8(svget_neonq(b))));
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c
new file mode 100644
index 000000000000..6cca4adb8651
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c
@@ -0,0 +1,295 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+
+// ============================================================================
+// 8 -> 16 : SIGNED
+// ============================================================================
+
+/*
+** add_neon_i16_from_i8_low_sve_bridged:
+**	saddl	v0.8h, v0.8b, v1.8b
+**	ret
+*/
+svint16_t add_neon_i16_from_i8_low_sve_bridged(svint8_t a, svint8_t b) {
+  int16x8_t ar = vmovl_s8(vget_low_s8(svget_neonq(a)));
+  int16x8_t br = vmovl_s8(vget_low_s8(svget_neonq(b)));
+  return svset_neonq_s16(svundef_s16(), vaddq_s16(ar, br));
+}
+
+/*
+** add_neon_i16_from_i8_high_sve_bridged:
+**	saddl2	v0.8h, v0.16b, v1.16b
+**	ret
+*/
+svint16_t add_neon_i16_from_i8_high_sve_bridged(svint8_t a, svint8_t b) {
+  int16x8_t ar = vmovl_s8(vget_high_s8(svget_neonq(a)));
+  int16x8_t br = vmovl_s8(vget_high_s8(svget_neonq(b)));
+  return svset_neonq_s16(svundef_s16(), vaddq_s16(ar, br));
+}
+
+/*
+** sub_neon_i16_from_i8_low_sve_bridged:
+**	ssubl	v0.8h, v0.8b, v1.8b
+**	ret
+*/
+svint16_t sub_neon_i16_from_i8_low_sve_bridged(svint8_t a, svint8_t b) {
+  int16x8_t ar = vmovl_s8(vget_low_s8(svget_neonq(a)));
+  int16x8_t br = vmovl_s8(vget_low_s8(svget_neonq(b)));
+  return svset_neonq_s16(svundef_s16(), vsubq_s16(ar, br));
+}
+
+/*
+** sub_neon_i16_from_i8_high_sve_bridged:
+**	ssubl2	v0.8h, v0.16b, v1.16b
+**	ret
+*/
+svint16_t sub_neon_i16_from_i8_high_sve_bridged(svint8_t a, svint8_t b) {
+  int16x8_t ar = vmovl_s8(vget_high_s8(svget_neonq(a)));
+  int16x8_t br = vmovl_s8(vget_high_s8(svget_neonq(b)));
+  return svset_neonq_s16(svundef_s16(), vsubq_s16(ar, br));
+}
+
+// ============================================================================
+// 8 -> 16 : UNSIGNED
+// ============================================================================
+
+/*
+** add_neon_u16_from_u8_low_sve_bridged:
+**	uaddl	v0.8h, v0.8b, v1.8b
+**	ret
+*/
+svuint16_t add_neon_u16_from_u8_low_sve_bridged(svuint8_t a, svuint8_t b) {
+  uint16x8_t ar = vmovl_u8(vget_low_u8(svget_neonq(a)));
+  uint16x8_t br = vmovl_u8(vget_low_u8(svget_neonq(b)));
+  return svset_neonq_u16(svundef_u16(), vaddq_u16(ar, br));
+}
+
+/*
+** add_neon_u16_from_u8_high_sve_bridged:
+**	uaddl2	v0.8h, v0.16b, v1.16b
+**	ret
+*/
+svuint16_t add_neon_u16_from_u8_high_sve_bridged(svuint8_t a, svuint8_t b) {
+  uint16x8_t ar = vmovl_u8(vget_high_u8(svget_neonq(a)));
+  uint16x8_t br = vmovl_u8(vget_high_u8(svget_neonq(b)));
+  return svset_neonq_u16(svundef_u16(), vaddq_u16(ar, br));
+}
+
+/*
+** sub_neon_u16_from_u8_low_sve_bridged:
+**	usubl	v0.8h, v0.8b, v1.8b
+**	ret
+*/
+svuint16_t sub_neon_u16_from_u8_low_sve_bridged(svuint8_t a, svuint8_t b) {
+  uint16x8_t ar = vmovl_u8(vget_low_u8(svget_neonq(a)));
+  uint16x8_t br = vmovl_u8(vget_low_u8(svget_neonq(b)));
+  return svset_neonq_u16(svundef_u16(), vsubq_u16(ar, br));
+}
+
+/*
+** sub_neon_u16_from_u8_high_sve_bridged:
+**	usubl2	v0.8h, v0.16b, v1.16b
+**	ret
+*/
+svuint16_t sub_neon_u16_from_u8_high_sve_bridged(svuint8_t a, svuint8_t b) {
+  uint16x8_t ar = vmovl_u8(vget_high_u8(svget_neonq(a)));
+  uint16x8_t br = vmovl_u8(vget_high_u8(svget_neonq(b)));
+  return svset_neonq_u16(svundef_u16(), vsubq_u16(ar, br));
+}
+
+// ============================================================================
+// 16 -> 32 : SIGNED
+// ============================================================================
+
+/*
+** add_neon_i32_from_i16_low_sve_bridged:
+**	saddl	v0.4s, v0.4h, v1.4h
+**	ret
+*/
+svint32_t add_neon_i32_from_i16_low_sve_bridged(svint16_t a, svint16_t b) {
+  int32x4_t ar = vmovl_s16(vget_low_s16(svget_neonq(a)));
+  int32x4_t br = vmovl_s16(vget_low_s16(svget_neonq(b)));
+  return svset_neonq_s32(svundef_s32(), vaddq_s32(ar, br));
+}
+
+/*
+** add_neon_i32_from_i16_high_sve_bridged:
+**	saddl2	v0.4s, v0.8h, v1.8h
+**	ret
+*/
+svint32_t add_neon_i32_from_i16_high_sve_bridged(svint16_t a, svint16_t b) {
+  int32x4_t ar = vmovl_s16(vget_high_s16(svget_neonq(a)));
+  int32x4_t br = vmovl_s16(vget_high_s16(svget_neonq(b)));
+  return svset_neonq_s32(svundef_s32(), vaddq_s32(ar, br));
+}
+
+/*
+** sub_neon_i32_from_i16_low_sve_bridged:
+**	ssubl	v0.4s, v0.4h, v1.4h
+**	ret
+*/
+svint32_t sub_neon_i32_from_i16_low_sve_bridged(svint16_t a, svint16_t b) {
+  int32x4_t ar = vmovl_s16(vget_low_s16(svget_neonq(a)));
+  int32x4_t br = vmovl_s16(vget_low_s16(svget_neonq(b)));
+  return svset_neonq_s32(svundef_s32(), vsubq_s32(ar, br));
+}
+
+/*
+** sub_neon_i32_from_i16_high_sve_bridged:
+**	ssubl2	v0.4s, v0.8h, v1.8h
+**	ret
+*/
+svint32_t sub_neon_i32_from_i16_high_sve_bridged(svint16_t a, svint16_t b) {
+  int32x4_t ar = vmovl_s16(vget_high_s16(svget_neonq(a)));
+  int32x4_t br = vmovl_s16(vget_high_s16(svget_neonq(b)));
+  return svset_neonq_s32(svundef_s32(), vsubq_s32(ar, br));
+}
+
+// ============================================================================
+// 16 -> 32 : UNSIGNED
+// ============================================================================
+
+/*
+** add_neon_u32_from_u16_low_sve_bridged:
+**	uaddl	v0.4s, v0.4h, v1.4h
+**	ret
+*/
+svuint32_t add_neon_u32_from_u16_low_sve_bridged(svuint16_t a, svuint16_t b) {
+  uint32x4_t ar = vmovl_u16(vget_low_u16(svget_neonq(a)));
+  uint32x4_t br = vmovl_u16(vget_low_u16(svget_neonq(b)));
+  return svset_neonq_u32(svundef_u32(), vaddq_u32(ar, br));
+}
+
+/*
+** add_neon_u32_from_u16_high_sve_bridged:
+**	uaddl2	v0.4s, v0.8h, v1.8h
+**	ret
+*/
+svuint32_t add_neon_u32_from_u16_high_sve_bridged(svuint16_t a, svuint16_t b) {
+  uint32x4_t ar = vmovl_u16(vget_high_u16(svget_neonq(a)));
+  uint32x4_t br = vmovl_u16(vget_high_u16(svget_neonq(b)));
+  return svset_neonq_u32(svundef_u32(), vaddq_u32(ar, br));
+}
+
+/*
+** sub_neon_u32_from_u16_low_sve_bridged:
+**	usubl	v0.4s, v0.4h, v1.4h
+**	ret
+*/
+svuint32_t sub_neon_u32_from_u16_low_sve_bridged(svuint16_t a, svuint16_t b) {
+  uint32x4_t ar = vmovl_u16(vget_low_u16(svget_neonq(a)));
+  uint32x4_t br = vmovl_u16(vget_low_u16(svget_neonq(b)));
+  return svset_neonq_u32(svundef_u32(), vsubq_u32(ar, br));
+}
+
+/*
+** sub_neon_u32_from_u16_high_sve_bridged:
+**	usubl2	v0.4s, v0.8h, v1.8h
+**	ret
+*/
+svuint32_t sub_neon_u32_from_u16_high_sve_bridged(svuint16_t a, svuint16_t b) {
+  uint32x4_t ar = vmovl_u16(vget_high_u16(svget_neonq(a)));
+  uint32x4_t br = vmovl_u16(vget_high_u16(svget_neonq(b)));
+  return svset_neonq_u32(svundef_u32(), vsubq_u32(ar, br));
+}
+
+// ============================================================================
+// 32 -> 64 : SIGNED
+// ============================================================================
+
+/*
+** add_neon_i64_from_i32_low_sve_bridged:
+**	saddl	v0.2d, v0.2s, v1.2s
+**	ret
+*/
+svint64_t add_neon_i64_from_i32_low_sve_bridged(svint32_t a, svint32_t b) {
+  int64x2_t ar = vmovl_s32(vget_low_s32(svget_neonq(a)));
+  int64x2_t br = vmovl_s32(vget_low_s32(svget_neonq(b)));
+  return svset_neonq_s64(svundef_s64(), vaddq_s64(ar, br));
+}
+
+/*
+** add_neon_i64_from_i32_high_sve_bridged:
+**	saddl2	v0.2d, v0.4s, v1.4s
+**	ret
+*/
+svint64_t add_neon_i64_from_i32_high_sve_bridged(svint32_t a, svint32_t b) {
+  int64x2_t ar = vmovl_s32(vget_high_s32(svget_neonq(a)));
+  int64x2_t br = vmovl_s32(vget_high_s32(svget_neonq(b)));
+  return svset_neonq_s64(svundef_s64(), vaddq_s64(ar, br));
+}
+
+/*
+** sub_neon_i64_from_i32_low_sve_bridged:
+**	ssubl	v0.2d, v0.2s, v1.2s
+**	ret
+*/
+svint64_t sub_neon_i64_from_i32_low_sve_bridged(svint32_t a, svint32_t b) {
+  int64x2_t ar = vmovl_s32(vget_low_s32(svget_neonq(a)));
+  int64x2_t br = vmovl_s32(vget_low_s32(svget_neonq(b)));
+  return svset_neonq_s64(svundef_s64(), vsubq_s64(ar, br));
+}
+
+/*
+** sub_neon_i64_from_i32_high_sve_bridged:
+**	ssubl2	v0.2d, v0.4s, v1.4s
+**	ret
+*/
+svint64_t sub_neon_i64_from_i32_high_sve_bridged(svint32_t a, svint32_t b) {
+  int64x2_t ar = vmovl_s32(vget_high_s32(svget_neonq(a)));
+  int64x2_t br = vmovl_s32(vget_high_s32(svget_neonq(b)));
+  return svset_neonq_s64(svundef_s64(), vsubq_s64(ar, br));
+}
+
+// ============================================================================
+// 32 -> 64 : UNSIGNED
+// ============================================================================
+
+/*
+** add_neon_u64_from_u32_low_sve_bridged:
+**	uaddl	v0.2d, v0.2s, v1.2s
+**	ret
+*/
+svuint64_t add_neon_u64_from_u32_low_sve_bridged(svuint32_t a, svuint32_t b) {
+  uint64x2_t ar = vmovl_u32(vget_low_u32(svget_neonq(a)));
+  uint64x2_t br = vmovl_u32(vget_low_u32(svget_neonq(b)));
+  return svset_neonq_u64(svundef_u64(), vaddq_u64(ar, br));
+}
+
+/*
+** add_neon_u64_from_u32_high_sve_bridged:
+**	uaddl2	v0.2d, v0.4s, v1.4s
+**	ret
+*/
+svuint64_t add_neon_u64_from_u32_high_sve_bridged(svuint32_t a, svuint32_t b) {
+  uint64x2_t ar = vmovl_u32(vget_high_u32(svget_neonq(a)));
+  uint64x2_t br = vmovl_u32(vget_high_u32(svget_neonq(b)));
+  return svset_neonq_u64(svundef_u64(), vaddq_u64(ar, br));
+}
+
+/*
+** sub_neon_u64_from_u32_low_sve_bridged:
+**	usubl	v0.2d, v0.2s, v1.2s
+**	ret
+*/
+svuint64_t sub_neon_u64_from_u32_low_sve_bridged(svuint32_t a, svuint32_t b) {
+  uint64x2_t ar = vmovl_u32(vget_low_u32(svget_neonq(a)));
+  uint64x2_t br = vmovl_u32(vget_low_u32(svget_neonq(b)));
+  return svset_neonq_u64(svundef_u64(), vsubq_u64(ar, br));
+}
+
+/*
+** sub_neon_u64_from_u32_high_sve_bridged:
+**	usubl2	v0.2d, v0.4s, v1.4s
+**	ret
+*/
+svuint64_t sub_neon_u64_from_u32_high_sve_bridged(svuint32_t a, svuint32_t b) {
+  uint64x2_t ar = vmovl_u32(vget_high_u32(svget_neonq(a)));
+  uint64x2_t br = vmovl_u32(vget_high_u32(svget_neonq(b)));
+  return svset_neonq_u64(svundef_u64(), vsubq_u64(ar, br));
+}
