Any comments? Thanks.
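To make the review easier to skim: the patch replaces per-intrinsic inline assembly in arm_neon.h with calls to the corresponding compiler builtins, so the midend can actually see through these intrinsics. A minimal before/after, lifted from the vrecpe_u32 hunks in the quoted patch below:

    /* Before: inline assembly, opaque to the optimizers.  */
    __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
    vrecpe_u32 (uint32x2_t a)
    {
      uint32x2_t result;
      __asm__ ("urecpe %0.2s,%1.2s"
               : "=w"(result)
               : "w"(a)
               : /* No clobbers */);
      return result;
    }

    /* After: a builtin the compiler understands and can optimize.  */
    __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
    vrecpe_u32 (uint32x2_t __a)
    {
      return (uint32x2_t) __builtin_aarch64_urecpev2si ((int32x2_t) __a);
    }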
> Hi,
>
> This patch converts more intrinsics to use builtin functions instead of the
> previous inline assembly syntax.
> Passed Christophe Lyon's AdvSIMD intrinsics testsuite.
>
> Three testcases are added for intrinsics which are not covered by that
> testsuite:
>   gcc.target/aarch64/advsimd-intrinsics/vfma.c
>   gcc.target/aarch64/advsimd-intrinsics/vfma_n.c
>   gcc.target/aarch64/advsimd-intrinsics/vfms.c
>
> Regtested with aarch64-linux-gnu on QEMU.  OK for the trunk?
>
> Index: gcc/ChangeLog
> ===================================================================
> --- gcc/ChangeLog	(revision 217394)
> +++ gcc/ChangeLog	(working copy)
> @@ -1,3 +1,26 @@
> +2014-11-18  Felix Yang  <felix.y...@huawei.com>
> +	    Haijian Zhang  <z.zhanghaij...@huawei.com>
> +	    Jiji Jiang  <jiangj...@huawei.com>
> +	    Pengfei Sui  <suipeng...@huawei.com>
> +
> +	* config/aarch64/arm_neon.h (vrecpe_u32, vrecpeq_u32): Rewrite using
> +	builtin functions.
> +	(vfma_f32, vfmaq_f32, vfmaq_f64, vfma_n_f32, vfmaq_n_f32, vfmaq_n_f64,
> +	vfms_f32, vfmsq_f32, vfmsq_f64): Likewise.
> +	(vhsub_s8, vhsub_u8, vhsub_s16, vhsub_u16, vhsub_s32, vhsub_u32,
> +	vhsubq_s8, vhsubq_u8, vhsubq_s16, vhsubq_u16, vhsubq_s32, vhsubq_u32,
> +	vsubhn_s16, vsubhn_u16, vsubhn_s32, vsubhn_u32, vsubhn_s64, vsubhn_u64,
> +	vrsubhn_s16, vrsubhn_u16, vrsubhn_s32, vrsubhn_u32, vrsubhn_s64,
> +	vrsubhn_u64, vsubhn_high_s16, vsubhn_high_u16, vsubhn_high_s32,
> +	vsubhn_high_u32, vsubhn_high_s64, vsubhn_high_u64, vrsubhn_high_s16,
> +	vrsubhn_high_u16, vrsubhn_high_s32, vrsubhn_high_u32, vrsubhn_high_s64,
> +	vrsubhn_high_u64): Likewise.
> +	* config/aarch64/iterators.md (VDQ_SI): New mode iterator.
> +	* config/aarch64/aarch64.md (define_c_enum "unspec"): Add UNSPEC_URECPE.
> +	* config/aarch64/aarch64-simd.md (aarch64_urecpe<mode>): New pattern.
> +	* config/aarch64/aarch64-simd-builtins.def (shsub, uhsub, subhn, rsubhn,
> +	subhn2, rsubhn2, urecpe): New builtins.
> +
>  2014-11-11  Andrew Pinski  <apin...@cavium.com>
>
>  	Bug target/61997
> Index: gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c
> ===================================================================
> --- gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c	(revision 217394)
> +++ gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c	(working copy)
> @@ -107,9 +107,9 @@ ONE (vmovn_high, uint16x8_t, uint16x4_t, uint32x4_
>  ONE (vmovn_high, uint32x4_t, uint32x2_t, uint64x2_t, u64)
>
>
> -/* { dg-final { scan-assembler-times "\\tsubhn2 v" 6} } */
> +/* { dg-final { scan-assembler-times "\\tsubhn2\\tv" 6} } */
>  /* { dg-final { scan-assembler-times "\\taddhn2\\tv" 6} } */
> -/* { dg-final { scan-assembler-times "rsubhn2 v" 6} } */
> +/* { dg-final { scan-assembler-times "rsubhn2\\tv" 6} } */
>  /* { dg-final { scan-assembler-times "raddhn2\\tv" 6} } */
>  /* { dg-final { scan-assembler-times "\\trshrn2 v" 6} } */
>  /* { dg-final { scan-assembler-times "\\tshrn2 v" 6} } */
> Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma_n.c
> ===================================================================
> --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma_n.c	(revision 0)
> +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma_n.c	(revision 0)
> @@ -0,0 +1,69 @@
> +#include <arm_neon.h>
> +#include "arm-neon-ref.h"
> +#include "compute-ref-data.h"
> +
> +/* Expected results.  */
> +VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x4438ca3d, 0x44390a3d };
> +VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x44869eb8, 0x4486beb8,
> +					   0x4486deb8, 0x4486feb8 };
> +VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0x408906e1532b8520,
> +					   0x40890ee1532b8520 };
> +
> +#define VECT_VAR_ASSIGN(S,Q,T1,W) S##Q##_##T1##W
> +#define ASSIGN(S, Q, T, W, V) T##W##_t S##Q##_##T##W = V
> +#define TEST_MSG "VFMA_N/VFMAQ_N"
> +void exec_vfma_n (void)
> +{
> +  /* Basic test: v4 = vfma_n (v1, v2, scalar), then store the result.  */
> +#define TEST_VFMA(Q, T1, T2, W, N)				\
> +  VECT_VAR(vector_res, T1, W, N) =				\
> +    vfma##Q##_n_##T2##W(VECT_VAR(vector1, T1, W, N),		\
> +			VECT_VAR(vector2, T1, W, N),		\
> +			VECT_VAR_ASSIGN(Scalar, Q, T1, W));	\
> +  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
> +
> +#define CHECK_VFMA_RESULTS(test_name,comment)				\
> +  {									\
> +    CHECK_FP(test_name, float, 32, 2, PRIx32, expected, comment);	\
> +    CHECK_FP(test_name, float, 32, 4, PRIx32, expected, comment);	\
> +    CHECK_FP(test_name, float, 64, 2, PRIx64, expected, comment);	\
> +  }
> +
> +#define DECL_VABD_VAR(VAR)		\
> +  DECL_VARIABLE(VAR, float, 32, 2);	\
> +  DECL_VARIABLE(VAR, float, 32, 4);	\
> +  DECL_VARIABLE(VAR, float, 64, 2);
> +
> +  DECL_VABD_VAR(vector1);
> +  DECL_VABD_VAR(vector2);
> +  DECL_VABD_VAR(vector3);
> +  DECL_VABD_VAR(vector_res);
> +
> +  clean_results ();
> +
> +  /* Initialize input "vector1" from "buffer".  */
> +  VLOAD(vector1, buffer, , float, f, 32, 2);
> +  VLOAD(vector1, buffer, q, float, f, 32, 4);
> +  VLOAD(vector1, buffer, q, float, f, 64, 2);
> +
> +  /* Choose init value arbitrarily.  */
> +  VDUP(vector2, , float, f, 32, 2, 9.3f);
> +  VDUP(vector2, q, float, f, 32, 4, 29.7f);
> +  VDUP(vector2, q, float, f, 64, 2, 15.8f);
> +
> +  /* Choose init value arbitrarily.  */
> +  ASSIGN(Scalar, , float, 32, 81.2f);
> +  ASSIGN(Scalar, q, float, 32, 36.8f);
> +  ASSIGN(Scalar, q, float, 64, 51.7f);
> +
> +  /* Execute the tests.  */
> +  TEST_VFMA(, float, f, 32, 2);
> +  TEST_VFMA(q, float, f, 32, 4);
> +  TEST_VFMA(q, float, f, 64, 2);
> +
> +  CHECK_VFMA_RESULTS (TEST_MSG, "");
> +}
> +
> +int main (void)
> +{
> +  exec_vfma_n ();
> +  return 0;
> +}
> Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c
> ===================================================================
> --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c	(revision 0)
> +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c	(revision 0)
> @@ -0,0 +1,67 @@
> +#include <arm_neon.h>
> +#include "arm-neon-ref.h"
> +#include "compute-ref-data.h"
> +
> +/* Expected results.  */
> +VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x4438ca3d, 0x44390a3d };
> +VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x44869eb8, 0x4486beb8,
> +					   0x4486deb8, 0x4486feb8 };
> +VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0x408906e1532b8520,
> +					   0x40890ee1532b8520 };
> +
> +#define TEST_MSG "VFMA/VFMAQ"
> +void exec_vfma (void)
> +{
> +  /* Basic test: v4 = vfma (v1, v2, v3), then store the result.  */
> +#define TEST_VFMA(Q, T1, T2, W, N)			\
> +  VECT_VAR(vector_res, T1, W, N) =			\
> +    vfma##Q##_##T2##W(VECT_VAR(vector1, T1, W, N),	\
> +		      VECT_VAR(vector2, T1, W, N),	\
> +		      VECT_VAR(vector3, T1, W, N));	\
> +  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
> +
> +#define CHECK_VFMA_RESULTS(test_name,comment)				\
> +  {									\
> +    CHECK_FP(test_name, float, 32, 2, PRIx32, expected, comment);	\
> +    CHECK_FP(test_name, float, 32, 4, PRIx32, expected, comment);	\
> +    CHECK_FP(test_name, float, 64, 2, PRIx64, expected, comment);	\
> +  }
> +
> +#define DECL_VABD_VAR(VAR)		\
> +  DECL_VARIABLE(VAR, float, 32, 2);	\
> +  DECL_VARIABLE(VAR, float, 32, 4);	\
> +  DECL_VARIABLE(VAR, float, 64, 2);
> +
> +  DECL_VABD_VAR(vector1);
> +  DECL_VABD_VAR(vector2);
> +  DECL_VABD_VAR(vector3);
> +  DECL_VABD_VAR(vector_res);
> +
> +  clean_results ();
> +
> +  /* Initialize input "vector1" from "buffer".  */
> +  VLOAD(vector1, buffer, , float, f, 32, 2);
> +  VLOAD(vector1, buffer, q, float, f, 32, 4);
> +  VLOAD(vector1, buffer, q, float, f, 64, 2);
> +
> +  /* Choose init value arbitrarily.  */
> +  VDUP(vector2, , float, f, 32, 2, 9.3f);
> +  VDUP(vector2, q, float, f, 32, 4, 29.7f);
> +  VDUP(vector2, q, float, f, 64, 2, 15.8f);
> +
> +  /* Choose init value arbitrarily.  */
> +  VDUP(vector3, , float, f, 32, 2, 81.2f);
> +  VDUP(vector3, q, float, f, 32, 4, 36.8f);
> +  VDUP(vector3, q, float, f, 64, 2, 51.7f);
> +
> +  /* Execute the tests.  */
> +  TEST_VFMA(, float, f, 32, 2);
> +  TEST_VFMA(q, float, f, 32, 4);
> +  TEST_VFMA(q, float, f, 64, 2);
> +
> +  CHECK_VFMA_RESULTS (TEST_MSG, "");
> +}
> +
> +int main (void)
> +{
> +  exec_vfma ();
> +  return 0;
> +}
> Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c
> ===================================================================
> --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c	(revision 0)
> +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c	(revision 0)
> @@ -0,0 +1,67 @@
> +#include <arm_neon.h>
> +#include "arm-neon-ref.h"
> +#include "compute-ref-data.h"
> +
> +/* Expected results.  */
> +VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc440ca3d, 0xc4408a3d };
> +VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc48a9eb8, 0xc48a7eb8,
> +					   0xc48a5eb8, 0xc48a3eb8 };
> +VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0xc08a06e1532b8520,
> +					   0xc089fee1532b8520 };
> +
> +#define TEST_MSG "VFMS/VFMSQ"
> +void exec_vfms (void)
> +{
> +  /* Basic test: v4 = vfms (v1, v2, v3), then store the result.  */
> +#define TEST_VFMA(Q, T1, T2, W, N)			\
> +  VECT_VAR(vector_res, T1, W, N) =			\
> +    vfms##Q##_##T2##W(VECT_VAR(vector1, T1, W, N),	\
> +		      VECT_VAR(vector2, T1, W, N),	\
> +		      VECT_VAR(vector3, T1, W, N));	\
> +  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
> +
> +#define CHECK_VFMA_RESULTS(test_name,comment)				\
> +  {									\
> +    CHECK_FP(test_name, float, 32, 2, PRIx32, expected, comment);	\
> +    CHECK_FP(test_name, float, 32, 4, PRIx32, expected, comment);	\
> +    CHECK_FP(test_name, float, 64, 2, PRIx64, expected, comment);	\
> +  }
> +
> +#define DECL_VABD_VAR(VAR)		\
> +  DECL_VARIABLE(VAR, float, 32, 2);	\
> +  DECL_VARIABLE(VAR, float, 32, 4);	\
> +  DECL_VARIABLE(VAR, float, 64, 2);
> +
> +  DECL_VABD_VAR(vector1);
> +  DECL_VABD_VAR(vector2);
> +  DECL_VABD_VAR(vector3);
> +  DECL_VABD_VAR(vector_res);
> +
> +  clean_results ();
> +
> +  /* Initialize input "vector1" from "buffer".  */
> +  VLOAD(vector1, buffer, , float, f, 32, 2);
> +  VLOAD(vector1, buffer, q, float, f, 32, 4);
> +  VLOAD(vector1, buffer, q, float, f, 64, 2);
> +
> +  /* Choose init value arbitrarily.  */
> +  VDUP(vector2, , float, f, 32, 2, 9.3f);
> +  VDUP(vector2, q, float, f, 32, 4, 29.7f);
> +  VDUP(vector2, q, float, f, 64, 2, 15.8f);
> +
> +  /* Choose init value arbitrarily.  */
> +  VDUP(vector3, , float, f, 32, 2, 81.2f);
> +  VDUP(vector3, q, float, f, 32, 4, 36.8f);
> +  VDUP(vector3, q, float, f, 64, 2, 51.7f);
> +
> +  /* Execute the tests.  */
> +  TEST_VFMA(, float, f, 32, 2);
> +  TEST_VFMA(q, float, f, 32, 4);
> +  TEST_VFMA(q, float, f, 64, 2);
> +
> +  CHECK_VFMA_RESULTS (TEST_MSG, "");
> +}
> +
> +int main (void)
> +{
> +  exec_vfms ();
> +  return 0;
> +}
> Index: gcc/testsuite/ChangeLog
> ===================================================================
> --- gcc/testsuite/ChangeLog	(revision 217394)
> +++ gcc/testsuite/ChangeLog	(working copy)
> @@ -1,3 +1,14 @@
> +2014-11-18  Felix Yang  <felix.y...@huawei.com>
> +	    Haijian Zhang  <z.zhanghaij...@huawei.com>
> +	    Jiji Jiang  <jiangj...@huawei.com>
> +	    Pengfei Sui  <suipeng...@huawei.com>
> +
> +	* gcc.target/aarch64/advsimd-intrinsics/vfma.c: New test.
> +	* gcc.target/aarch64/advsimd-intrinsics/vfma_n.c: New test.
> +	* gcc.target/aarch64/advsimd-intrinsics/vfms.c: New test.
> +	* gcc.target/aarch64/narrow_high-intrinsics.c: Fix expected assembler
> +	for rsubhn2 & subhn2.
> +
>  2014-11-11  Anthony Brandon  <anthony.bran...@gmail.com>
>  	    Manuel López-Ibáñez  <m...@gcc.gnu.org>
>
> Index: gcc/config/aarch64/arm_neon.h
> ===================================================================
> --- gcc/config/aarch64/arm_neon.h	(revision 217394)
> +++ gcc/config/aarch64/arm_neon.h	(working copy)
> @@ -2287,7 +2287,247 @@ vqadd_u8 (uint8x8_t __a, uint8x8_t __b)
>    return __builtin_aarch64_uqaddv8qi_uuu (__a, __b);
>  }
>
> +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
> +vhsub_s8 (int8x8_t __a, int8x8_t __b)
> +{
> +  return (int8x8_t) __builtin_aarch64_shsubv8qi (__a, __b);
> +}
> +
> +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
> +vhsub_s16 (int16x4_t __a, int16x4_t __b)
> +{
> +  return (int16x4_t) __builtin_aarch64_shsubv4hi (__a, __b);
> +}
> +
> +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
> +vhsub_s32 (int32x2_t __a, int32x2_t __b)
> +{
> +  return (int32x2_t) __builtin_aarch64_shsubv2si (__a, __b);
> +}
> +
> +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
> +vhsub_u8 (uint8x8_t __a, uint8x8_t __b)
> +{
> +  return (uint8x8_t) __builtin_aarch64_uhsubv8qi ((int8x8_t) __a,
> +						  (int8x8_t) __b);
> +}
> +
>  __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
> +vhsub_u16 (uint16x4_t __a, uint16x4_t __b)
> +{
> +  return (uint16x4_t) __builtin_aarch64_uhsubv4hi ((int16x4_t) __a,
> +						   (int16x4_t) __b);
> +}
> +
> +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
> +vhsub_u32 (uint32x2_t __a, uint32x2_t __b)
> +{
> +  return (uint32x2_t) __builtin_aarch64_uhsubv2si ((int32x2_t) __a,
> +						   (int32x2_t) __b);
> +}
> +
> +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
> +vhsubq_s8 (int8x16_t __a, int8x16_t __b)
> +{
> +  return (int8x16_t) __builtin_aarch64_shsubv16qi (__a, __b);
> +}
> +
> +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
> +vhsubq_s16 (int16x8_t __a, int16x8_t __b)
> +{
> +  return (int16x8_t) __builtin_aarch64_shsubv8hi (__a, __b);
> +}
> +
> +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> +vhsubq_s32 (int32x4_t __a, int32x4_t __b)
> +{
> +  return (int32x4_t) __builtin_aarch64_shsubv4si (__a, __b);
> +}
> +
> +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
> +vhsubq_u8 (uint8x16_t __a, uint8x16_t __b)
> +{
> +  return (uint8x16_t) __builtin_aarch64_uhsubv16qi ((int8x16_t) __a,
> +						    (int8x16_t) __b);
> +}
> +
> +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
> +vhsubq_u16 (uint16x8_t __a, uint16x8_t __b)
> +{
> +  return (uint16x8_t) __builtin_aarch64_uhsubv8hi ((int16x8_t) __a,
> +						   (int16x8_t) __b);
> +}
> +
> +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> +vhsubq_u32 (uint32x4_t __a, uint32x4_t __b)
> +{
> +  return (uint32x4_t) __builtin_aarch64_uhsubv4si ((int32x4_t) __a,
> +						   (int32x4_t) __b);
> +}
> +
> +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
> +vsubhn_s16 (int16x8_t __a, int16x8_t __b)
> +{
> +  return (int8x8_t) __builtin_aarch64_subhnv8hi (__a, __b);
> +}
> +
> +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
> +vsubhn_s32 (int32x4_t __a, int32x4_t __b)
> +{
> +  return (int16x4_t) __builtin_aarch64_subhnv4si (__a, __b);
> +}
> +
> +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
> +vsubhn_s64 (int64x2_t __a, int64x2_t __b)
> +{
> +  return (int32x2_t) __builtin_aarch64_subhnv2di (__a, __b);
> +}
> +
> +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
> +vsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
> +{
> +  return (uint8x8_t) __builtin_aarch64_subhnv8hi ((int16x8_t) __a,
> +						  (int16x8_t) __b);
> +}
> +
> +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
> +vsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
> +{
> +  return (uint16x4_t) __builtin_aarch64_subhnv4si ((int32x4_t) __a,
> +						   (int32x4_t) __b);
> +}
> +
> +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
> +vsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
> +{
> +  return (uint32x2_t) __builtin_aarch64_subhnv2di ((int64x2_t) __a,
> +						   (int64x2_t) __b);
> +}
> +
> +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
> +vrsubhn_s16 (int16x8_t __a, int16x8_t __b)
> +{
> +  return (int8x8_t) __builtin_aarch64_rsubhnv8hi (__a, __b);
> +}
> +
> +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
> +vrsubhn_s32 (int32x4_t __a, int32x4_t __b)
> +{
> +  return (int16x4_t) __builtin_aarch64_rsubhnv4si (__a, __b);
> +}
> +
> +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
> +vrsubhn_s64 (int64x2_t __a, int64x2_t __b)
> +{
> +  return (int32x2_t) __builtin_aarch64_rsubhnv2di (__a, __b);
> +}
> +
> +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
> +vrsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
> +{
> +  return (uint8x8_t) __builtin_aarch64_rsubhnv8hi ((int16x8_t) __a,
> +						   (int16x8_t) __b);
> +}
> +
> +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
> +vrsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
> +{
> +  return (uint16x4_t) __builtin_aarch64_rsubhnv4si ((int32x4_t) __a,
> +						    (int32x4_t) __b);
> +}
> +
> +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
> +vrsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
> +{
> +  return (uint32x2_t) __builtin_aarch64_rsubhnv2di ((int64x2_t) __a,
> +						    (int64x2_t) __b);
> +}
> +
> +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
> +vrsubhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c)
> +{
> +  return (int8x16_t) __builtin_aarch64_rsubhn2v8hi (__a, __b, __c);
> +}
> +
> +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
> +vrsubhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c)
> +{
> +  return (int16x8_t) __builtin_aarch64_rsubhn2v4si (__a, __b, __c);
> +}
> +
> +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> +vrsubhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c)
> +{
> +  return (int32x4_t) __builtin_aarch64_rsubhn2v2di (__a, __b, __c);
> +}
> +
> +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
> +vrsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
> +{
> +  return (uint8x16_t) __builtin_aarch64_rsubhn2v8hi ((int8x8_t) __a,
> +						     (int16x8_t) __b,
> +						     (int16x8_t) __c);
> +}
> +
> +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
> +vrsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
> +{
> +  return (uint16x8_t) __builtin_aarch64_rsubhn2v4si ((int16x4_t) __a,
> +						     (int32x4_t) __b,
> +						     (int32x4_t) __c);
> +}
> +
> +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> +vrsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
> +{
> +  return (uint32x4_t) __builtin_aarch64_rsubhn2v2di ((int32x2_t) __a,
> +						     (int64x2_t) __b,
> +						     (int64x2_t) __c);
> +}
> +
> +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
> +vsubhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c)
> +{
> +  return (int8x16_t) __builtin_aarch64_subhn2v8hi (__a, __b, __c);
> +}
> +
> +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
> +vsubhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c)
> +{
> +  return (int16x8_t) __builtin_aarch64_subhn2v4si (__a, __b, __c);
> +}
> +
> +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> +vsubhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c)
> +{
> +  return (int32x4_t) __builtin_aarch64_subhn2v2di (__a, __b, __c);
> +}
> +
> +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
> +vsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
> +{
> +  return (uint8x16_t) __builtin_aarch64_subhn2v8hi ((int8x8_t) __a,
> +						    (int16x8_t) __b,
> +						    (int16x8_t) __c);
> +}
> +
> +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
> +vsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
> +{
> +  return (uint16x8_t) __builtin_aarch64_subhn2v4si ((int16x4_t) __a,
> +						    (int32x4_t) __b,
> +						    (int32x4_t) __c);
> +}
> +
> +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> +vsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
> +{
> +  return (uint32x4_t) __builtin_aarch64_subhn2v2di ((int32x2_t) __a,
> +						    (int64x2_t) __b,
> +						    (int64x2_t) __c);
> +}
> +
> +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
>  vqadd_u16 (uint16x4_t __a, uint16x4_t __b)
>  {
>    return __builtin_aarch64_uqaddv4hi_uuu (__a, __b);
> @@ -5889,237 +6129,6 @@ vcvtxd_f32_f64 (float64_t a)
>  }
>
>  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> -vfma_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
> -{
> -  float32x2_t result;
> -  __asm__ ("fmla %0.2s,%2.2s,%3.2s"
> -           : "=w"(result)
> -           : "0"(a), "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
> -vfmaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
> -{
> -  float32x4_t result;
> -  __asm__ ("fmla %0.4s,%2.4s,%3.4s"
> -           : "=w"(result)
> -           : "0"(a), "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
> -vfmaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
> -{
> -  float64x2_t result;
> -  __asm__ ("fmla %0.2d,%2.2d,%3.2d"
> -           : "=w"(result)
> -           : "0"(a), "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> -vfma_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
> -{
> -  float32x2_t result;
> -  __asm__ ("fmla %0.2s, %2.2s, %3.s[0]"
> -           : "=w"(result)
> -           : "0"(a), "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
> -vfmaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
> -{
> -  float32x4_t result;
> -  __asm__ ("fmla %0.4s, %2.4s, %3.s[0]"
> -           : "=w"(result)
> -           : "0"(a), "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
> -vfmaq_n_f64 (float64x2_t a, float64x2_t b, float64_t c)
> -{
> -  float64x2_t result;
> -  __asm__ ("fmla %0.2d, %2.2d, %3.d[0]"
> -           : "=w"(result)
> -           : "0"(a), "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> -vfms_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
> -{
> -  float32x2_t result;
> -  __asm__ ("fmls %0.2s,%2.2s,%3.2s"
> -           : "=w"(result)
> -           : "0"(a), "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
> -vfmsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
> -{
> -  float32x4_t result;
> -  __asm__ ("fmls %0.4s,%2.4s,%3.4s"
> -           : "=w"(result)
> -           : "0"(a), "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
> -vfmsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
> -{
> -  float64x2_t result;
> -  __asm__ ("fmls %0.2d,%2.2d,%3.2d"
> -           : "=w"(result)
> -           : "0"(a), "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
> -vhsub_s8 (int8x8_t a, int8x8_t b)
> -{
> -  int8x8_t result;
> -  __asm__ ("shsub %0.8b, %1.8b, %2.8b"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
> -vhsub_s16 (int16x4_t a, int16x4_t b)
> -{
> -  int16x4_t result;
> -  __asm__ ("shsub %0.4h, %1.4h, %2.4h"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
> -vhsub_s32 (int32x2_t a, int32x2_t b)
> -{
> -  int32x2_t result;
> -  __asm__ ("shsub %0.2s, %1.2s, %2.2s"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
> -vhsub_u8 (uint8x8_t a, uint8x8_t b)
> -{
> -  uint8x8_t result;
> -  __asm__ ("uhsub %0.8b, %1.8b, %2.8b"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
> -vhsub_u16 (uint16x4_t a, uint16x4_t b)
> -{
> -  uint16x4_t result;
> -  __asm__ ("uhsub %0.4h, %1.4h, %2.4h"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
> -vhsub_u32 (uint32x2_t a, uint32x2_t b)
> -{
> -  uint32x2_t result;
> -  __asm__ ("uhsub %0.2s, %1.2s, %2.2s"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
> -vhsubq_s8 (int8x16_t a, int8x16_t b)
> -{
> -  int8x16_t result;
> -  __asm__ ("shsub %0.16b, %1.16b, %2.16b"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
> -vhsubq_s16 (int16x8_t a, int16x8_t b)
> -{
> -  int16x8_t result;
> -  __asm__ ("shsub %0.8h, %1.8h, %2.8h"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> -vhsubq_s32 (int32x4_t a, int32x4_t b)
> -{
> -  int32x4_t result;
> -  __asm__ ("shsub %0.4s, %1.4s, %2.4s"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
> -vhsubq_u8 (uint8x16_t a, uint8x16_t b)
> -{
> -  uint8x16_t result;
> -  __asm__ ("uhsub %0.16b, %1.16b, %2.16b"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
> -vhsubq_u16 (uint16x8_t a, uint16x8_t b)
> -{
> -  uint16x8_t result;
> -  __asm__ ("uhsub %0.8h, %1.8h, %2.8h"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> -vhsubq_u32 (uint32x4_t a, uint32x4_t b)
> -{
> -  uint32x4_t result;
> -  __asm__ ("uhsub %0.4s, %1.4s, %2.4s"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
>  vld1_dup_f32 (const float32_t * a)
>  {
>    float32x2_t result;
> @@ -10492,28 +10501,6 @@ vqrdmulhq_n_s32 (int32x4_t a, int32_t b)
>       result;                                                          \
>     })
>
> -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
> -vrecpe_u32 (uint32x2_t a)
> -{
> -  uint32x2_t result;
> -  __asm__ ("urecpe %0.2s,%1.2s"
> -           : "=w"(result)
> -           : "w"(a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> -vrecpeq_u32 (uint32x4_t a)
> -{
> -  uint32x4_t result;
> -  __asm__ ("urecpe %0.4s,%1.4s"
> -           : "=w"(result)
> -           : "w"(a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
>  #define vrshrn_high_n_s16(a, b, c)                                      \
>    __extension__                                                         \
>      ({                                                                  \
> @@ -10819,138 +10806,6 @@ vrsqrtss_f32 (float32_t a, float32_t b)
>    return result;
>  }
>
> -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
> -vrsubhn_high_s16 (int8x8_t a, int16x8_t b, int16x8_t c)
> -{
> -  int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
> -  __asm__ ("rsubhn2 %0.16b, %1.8h, %2.8h"
> -           : "+w"(result)
> -           : "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
> -vrsubhn_high_s32 (int16x4_t a, int32x4_t b, int32x4_t c)
> -{
> -  int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
> -  __asm__ ("rsubhn2 %0.8h, %1.4s, %2.4s"
> -           : "+w"(result)
> -           : "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> -vrsubhn_high_s64 (int32x2_t a, int64x2_t b, int64x2_t c)
> -{
> -  int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
> -  __asm__ ("rsubhn2 %0.4s, %1.2d, %2.2d"
> -           : "+w"(result)
> -           : "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
> -vrsubhn_high_u16 (uint8x8_t a, uint16x8_t b, uint16x8_t c)
> -{
> -  uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
> -  __asm__ ("rsubhn2 %0.16b, %1.8h, %2.8h"
> -           : "+w"(result)
> -           : "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
> -vrsubhn_high_u32 (uint16x4_t a, uint32x4_t b, uint32x4_t c)
> -{
> -  uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
> -  __asm__ ("rsubhn2 %0.8h, %1.4s, %2.4s"
> -           : "+w"(result)
> -           : "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> -vrsubhn_high_u64 (uint32x2_t a, uint64x2_t b, uint64x2_t c)
> -{
> -  uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
> -  __asm__ ("rsubhn2 %0.4s, %1.2d, %2.2d"
> -           : "+w"(result)
> -           : "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
> -vrsubhn_s16 (int16x8_t a, int16x8_t b)
> -{
> -  int8x8_t result;
> -  __asm__ ("rsubhn %0.8b, %1.8h, %2.8h"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
> -vrsubhn_s32 (int32x4_t a, int32x4_t b)
> -{
> -  int16x4_t result;
> -  __asm__ ("rsubhn %0.4h, %1.4s, %2.4s"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
> -vrsubhn_s64 (int64x2_t a, int64x2_t b)
> -{
> -  int32x2_t result;
> -  __asm__ ("rsubhn %0.2s, %1.2d, %2.2d"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
> -vrsubhn_u16 (uint16x8_t a, uint16x8_t b)
> -{
> -  uint8x8_t result;
> -  __asm__ ("rsubhn %0.8b, %1.8h, %2.8h"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
> -vrsubhn_u32 (uint32x4_t a, uint32x4_t b)
> -{
> -  uint16x4_t result;
> -  __asm__ ("rsubhn %0.4h, %1.4s, %2.4s"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
> -vrsubhn_u64 (uint64x2_t a, uint64x2_t b)
> -{
> -  uint32x2_t result;
> -  __asm__ ("rsubhn %0.2s, %1.2d, %2.2d"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
>  #define vshrn_high_n_s16(a, b, c)                                       \
>    __extension__                                                         \
>      ({                                                                  \
> @@ -11482,139 +11337,8 @@ vrsqrtss_f32 (float32_t a, float32_t b)
>             : "memory");                                               \
>     })
>
> -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
> -vsubhn_high_s16 (int8x8_t a, int16x8_t b, int16x8_t c)
> -{
> -  int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
> -  __asm__ ("subhn2 %0.16b, %1.8h, %2.8h"
> -           : "+w"(result)
> -           : "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
> -vsubhn_high_s32 (int16x4_t a, int32x4_t b, int32x4_t c)
> -{
> -  int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
> -  __asm__ ("subhn2 %0.8h, %1.4s, %2.4s"
> -           : "+w"(result)
> -           : "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> -vsubhn_high_s64 (int32x2_t a, int64x2_t b, int64x2_t c)
> -{
> -  int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
> -  __asm__ ("subhn2 %0.4s, %1.2d, %2.2d"
> -           : "+w"(result)
> -           : "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
> -vsubhn_high_u16 (uint8x8_t a, uint16x8_t b, uint16x8_t c)
> -{
> -  uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
> -  __asm__ ("subhn2 %0.16b, %1.8h, %2.8h"
> -           : "+w"(result)
> -           : "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
> -vsubhn_high_u32 (uint16x4_t a, uint32x4_t b, uint32x4_t c)
> -{
> -  uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
> -  __asm__ ("subhn2 %0.8h, %1.4s, %2.4s"
> -           : "+w"(result)
> -           : "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> -vsubhn_high_u64 (uint32x2_t a, uint64x2_t b, uint64x2_t c)
> -{
> -  uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
> -  __asm__ ("subhn2 %0.4s, %1.2d, %2.2d"
> -           : "+w"(result)
> -           : "w"(b), "w"(c)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
> -vsubhn_s16 (int16x8_t a, int16x8_t b)
> -{
> -  int8x8_t result;
> -  __asm__ ("subhn %0.8b, %1.8h, %2.8h"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
> -vsubhn_s32 (int32x4_t a, int32x4_t b)
> -{
> -  int16x4_t result;
> -  __asm__ ("subhn %0.4h, %1.4s, %2.4s"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
> -vsubhn_s64 (int64x2_t a, int64x2_t b)
> -{
> -  int32x2_t result;
> -  __asm__ ("subhn %0.2s, %1.2d, %2.2d"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
>  __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
> -vsubhn_u16 (uint16x8_t a, uint16x8_t b)
> -{
> -  uint8x8_t result;
> -  __asm__ ("subhn %0.8b, %1.8h, %2.8h"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
> -vsubhn_u32 (uint32x4_t a, uint32x4_t b)
> -{
> -  uint16x4_t result;
> -  __asm__ ("subhn %0.4h, %1.4s, %2.4s"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
"w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) > -vsubhn_u64 (uint64x2_t a, uint64x2_t b) -{ > - uint32x2_t result; > - __asm__ ("subhn %0.2s, %1.2d, %2.2d" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) > vtst_p8 (poly8x8_t a, poly8x8_t b) > { > uint8x8_t result; > @@ -16057,6 +15781,42 @@ vfma_f64 (float64x1_t __a, float64x1_t __b, float6 > return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])}; } > > +__extension__ static __inline float32x2_t __attribute__ > +((__always_inline__)) > +vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) { > + return __builtin_aarch64_fmav2sf (__b, __c, __a); } > + > +__extension__ static __inline float32x4_t __attribute__ > +((__always_inline__)) > +vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) { > + return __builtin_aarch64_fmav4sf (__b, __c, __a); } > + > +__extension__ static __inline float64x2_t __attribute__ > +((__always_inline__)) > +vfmaq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) { > + return __builtin_aarch64_fmav2df (__b, __c, __a); } > + > +__extension__ static __inline float32x2_t __attribute__ > +((__always_inline__)) > +vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) { > + return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a); } > + > +__extension__ static __inline float32x4_t __attribute__ > +((__always_inline__)) > +vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) { > + return __builtin_aarch64_fmav4sf (__b, vdupq_n_f32 (__c), __a); } > + > +__extension__ static __inline float64x2_t __attribute__ > +((__always_inline__)) > +vfmaq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c) { > + return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c), __a); } > + > /* vfma_lane */ > > __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) > @@ -16168,6 +15928,25 @@ vfms_f64 (float64x1_t __a, float64x1_t __b, float6 > return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])}; } > > +__extension__ static __inline float32x2_t __attribute__ > +((__always_inline__)) > +vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) { > + return __builtin_aarch64_fmav2sf (-__b, __c, __a); } > + > +__extension__ static __inline float32x4_t __attribute__ > +((__always_inline__)) > +vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) { > + return __builtin_aarch64_fmav4sf (-__b, __c, __a); } > + > +__extension__ static __inline float64x2_t __attribute__ > +((__always_inline__)) > +vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) { > + return __builtin_aarch64_fmav2df (-__b, __c, __a); } > + > + > /* vfms_lane */ > > __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) > @@ -21302,6 +21081,18 @@ vrbitq_u8 (uint8x16_t __a) > > /* vrecpe */ > > +__extension__ static __inline uint32x2_t __attribute__ > +((__always_inline__)) > +vrecpe_u32 (uint32x2_t __a) > +{ > + return (uint32x2_t)__builtin_aarch64_urecpev2si ((int32x2_t)__a); } > + > +__extension__ static __inline uint32x4_t __attribute__ > +((__always_inline__)) > +vrecpeq_u32 (uint32x4_t __a) > +{ > + return (uint32x4_t)__builtin_aarch64_urecpev4si ((int32x4_t)__a); } > + > __extension__ static __inline float32_t __attribute__ ((__always_inline__)) > vrecpes_f32 (float32_t __a) > { > Index: gcc/config/aarch64/iterators.md > 
> ===================================================================
> --- gcc/config/aarch64/iterators.md	(revision 217394)
> +++ gcc/config/aarch64/iterators.md	(working copy)
> @@ -153,6 +153,9 @@
>  ;; Vector modes except double int.
>  (define_mode_iterator VDQIF [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DF])
>
> +;; Vector modes for S type.
> +(define_mode_iterator VDQ_SI [V2SI V4SI])
> +
>  ;; Vector modes for Q and H types.
>  (define_mode_iterator VDQQH [V8QI V16QI V4HI V8HI])
>
> Index: gcc/config/aarch64/aarch64-simd.md
> ===================================================================
> --- gcc/config/aarch64/aarch64-simd.md	(revision 217394)
> +++ gcc/config/aarch64/aarch64-simd.md	(working copy)
> @@ -4972,6 +4972,14 @@
>    [(set_attr "type" "neon_fp_recps_<Vetype><q>")]
>  )
>
> +(define_insn "aarch64_urecpe<mode>"
> +  [(set (match_operand:VDQ_SI 0 "register_operand" "=w")
> +	(unspec:VDQ_SI [(match_operand:VDQ_SI 1 "register_operand" "w")]
> +		       UNSPEC_URECPE))]
> +  "TARGET_SIMD"
> +  "urecpe\\t%0.<Vtype>, %1.<Vtype>"
> +  [(set_attr "type" "neon_fp_recpe_<Vetype><q>")])
> +
>  ;; Standard pattern name vec_extract<mode>.
>
>  (define_expand "vec_extract<mode>"
> Index: gcc/config/aarch64/aarch64.md
> ===================================================================
> --- gcc/config/aarch64/aarch64.md	(revision 217394)
> +++ gcc/config/aarch64/aarch64.md	(working copy)
> @@ -75,6 +75,7 @@
>      UNSPEC_CRC32H
>      UNSPEC_CRC32W
>      UNSPEC_CRC32X
> +    UNSPEC_URECPE
>      UNSPEC_FRECPE
>      UNSPEC_FRECPS
>      UNSPEC_FRECPX
> Index: gcc/config/aarch64/aarch64-simd-builtins.def
> ===================================================================
> --- gcc/config/aarch64/aarch64-simd-builtins.def	(revision 217394)
> +++ gcc/config/aarch64/aarch64-simd-builtins.def	(working copy)
> @@ -124,15 +124,21 @@
>    BUILTIN_VDW (BINOP, usubw, 0)
>    /* Implemented by aarch64_<sur>h<addsub><mode>.  */
>    BUILTIN_VQ_S (BINOP, shadd, 0)
> +  BUILTIN_VQ_S (BINOP, shsub, 0)
>    BUILTIN_VQ_S (BINOP, uhadd, 0)
> +  BUILTIN_VQ_S (BINOP, uhsub, 0)
>    BUILTIN_VQ_S (BINOP, srhadd, 0)
>    BUILTIN_VQ_S (BINOP, urhadd, 0)
>    /* Implemented by aarch64_<sur><addsub>hn<mode>.  */
>    BUILTIN_VQN (BINOP, addhn, 0)
> +  BUILTIN_VQN (BINOP, subhn, 0)
>    BUILTIN_VQN (BINOP, raddhn, 0)
> +  BUILTIN_VQN (BINOP, rsubhn, 0)
>    /* Implemented by aarch64_<sur><addsub>hn2<mode>.  */
>    BUILTIN_VQN (TERNOP, addhn2, 0)
> +  BUILTIN_VQN (TERNOP, subhn2, 0)
>    BUILTIN_VQN (TERNOP, raddhn2, 0)
> +  BUILTIN_VQN (TERNOP, rsubhn2, 0)
>
>    BUILTIN_VSQN_HSDI (UNOP, sqmovun, 0)
>    /* Implemented by aarch64_<sur>qmovn<mode>.  */
> @@ -334,6 +340,8 @@
>    BUILTIN_GPF (BINOP, frecps, 0)
>    BUILTIN_GPF (UNOP, frecpx, 0)
>
> +  BUILTIN_VDQ_SI (UNOP, urecpe, 0)
> +
>    BUILTIN_VDQF (UNOP, frecpe, 0)
>    BUILTIN_VDQF (BINOP, frecps, 0)
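For anyone who wants a quick smoke test without running the full testsuite, a scratch file along these lines (hypothetical, not part of the patch) should compile straight to urecpe/fmla/shsub with an aarch64 cross compiler, e.g. "aarch64-linux-gnu-gcc -O2 -S test.c", and the generated assembly can then be inspected by eye:

    #include <arm_neon.h>

    /* Should emit "urecpe v0.2s, v0.2s".  */
    uint32x2_t test_vrecpe (uint32x2_t a) { return vrecpe_u32 (a); }

    /* Should emit "fmla v0.2s, v1.2s, v2.2s".  */
    float32x2_t test_vfma (float32x2_t a, float32x2_t b, float32x2_t c)
    { return vfma_f32 (a, b, c); }

    /* Should emit "shsub v0.8b, v0.8b, v1.8b".  */
    int8x8_t test_vhsub (int8x8_t a, int8x8_t b) { return vhsub_s8 (a, b); }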