Ping?  BTW, it seems that Alan's way of improving the vld1(q?)_dup intrinsics is more elegant, so is the vcls(q?)/vcnt(q?) part of the improvement OK for trunk?  Thanks.
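For reviewers, here is a quick standalone sanity check of the per-lane semantics the new clrsb<mode>2/popcount<mode>2 patterns are expected to implement.  This is illustrative only, not part of the patch, and assumes an aarch64 target with arm_neon.h:

/* Sketch: CLS counts redundant sign bits per lane (clrsb), CNT is a
   per-byte popcount.  Exercises vcls_s8/vcnt_u8 on known values.  */
#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  int8x8_t s = vdup_n_s8 (1);      /* 00000001b in every lane.  */
  uint8x8_t u = vdup_n_u8 (0xf0);  /* 11110000b in every lane.  */

  /* For an 8-bit value of 1, bits 6..1 repeat the sign bit: CLS = 6.  */
  assert (vget_lane_s8 (vcls_s8 (s), 0) == 6);
  /* 0xf0 has four set bits: CNT = 4.  */
  assert (vget_lane_u8 (vcnt_u8 (u), 0) == 4);
  return 0;
}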
> Hi,
>   This patch converts vcls(q?) vcnt(q?) and vld1(q?)_dup intrinsics to use
> builtin functions instead of the previous inline assembly syntax.
>   Regtested with aarch64-linux-gnu on QEMU.  Also passed the glorious
> testsuite of Christophe Lyon.
>   OK for the trunk?
>
> Index: gcc/ChangeLog
> ===================================================================
> --- gcc/ChangeLog	(revision 217394)
> +++ gcc/ChangeLog	(working copy)
> @@ -1,3 +1,21 @@
> +2014-11-13  Felix Yang  <felix.y...@huawei.com>
> +	    Jiji Jiang  <jiangj...@huawei.com>
> +	    Shanyao Chen  <chenshan...@huawei.com>
> +
> +	* config/aarch64/aarch64-simd-builtins.def (clrsb, popcount, ld1r): New
> +	builtins.
> +	* config/aarch64/aarch64-simd.md (aarch64_ld1r<mode>): New expand.
> +	(clrsb<mode>2, popcount<mode>2): New patterns.
> +	(*aarch64_simd_ld1r<mode>): Renamed to aarch64_simd_ld1r<mode>.
> +	* config/aarch64/arm_neon.h (vcls_s8, vcls_s16, vcls_s32, vclsq_s8,
> +	vclsq_s16, vclsq_s32, vcnt_p8, vcnt_s8, vcnt_u8, vcntq_p8, vcntq_s8,
> +	vcntq_u8, vld1_dup_f32, vld1_dup_f64, vld1_dup_p8, vld1_dup_p16,
> +	vld1_dup_s8, vld1_dup_s16, vld1_dup_s32, vld1_dup_s64, vld1_dup_u8,
> +	vld1_dup_u16, vld1_dup_u32, vld1_dup_u64, vld1q_dup_f32, vld1q_dup_f64,
> +	vld1q_dup_p8, vld1q_dup_p16, vld1q_dup_s8, vld1q_dup_s16, vld1q_dup_s32,
> +	vld1q_dup_s64, vld1q_dup_u8, vld1q_dup_u16, vld1q_dup_u32,
> +	vld1q_dup_u64): Rewrite using builtin functions.
> +
>  2014-11-11  Andrew Pinski  <apin...@cavium.com>
>
>  	Bug target/61997
> Index: gcc/config/aarch64/arm_neon.h
> ===================================================================
> --- gcc/config/aarch64/arm_neon.h	(revision 217394)
> +++ gcc/config/aarch64/arm_neon.h	(working copy)
> @@ -5317,138 +5317,6 @@ vaddlvq_u32 (uint32x4_t a)
>    return result;
>  }
>  
> -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
> -vcls_s8 (int8x8_t a)
> -{
> -  int8x8_t result;
> -  __asm__ ("cls %0.8b,%1.8b"
> -           : "=w"(result)
> -           : "w"(a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
> -vcls_s16 (int16x4_t a)
> -{
> -  int16x4_t result;
> -  __asm__ ("cls %0.4h,%1.4h"
> -           : "=w"(result)
> -           : "w"(a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
> -vcls_s32 (int32x2_t a)
> -{
> -  int32x2_t result;
> -  __asm__ ("cls %0.2s,%1.2s"
> -           : "=w"(result)
> -           : "w"(a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
> -vclsq_s8 (int8x16_t a)
> -{
> -  int8x16_t result;
> -  __asm__ ("cls %0.16b,%1.16b"
> -           : "=w"(result)
> -           : "w"(a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
> -vclsq_s16 (int16x8_t a)
> -{
> -  int16x8_t result;
> -  __asm__ ("cls %0.8h,%1.8h"
> -           : "=w"(result)
> -           : "w"(a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> -vclsq_s32 (int32x4_t a)
> -{
> -  int32x4_t result;
> -  __asm__ ("cls %0.4s,%1.4s"
> -           : "=w"(result)
> -           : "w"(a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
> -vcnt_p8 (poly8x8_t a)
> -{
> -  poly8x8_t result;
> -  __asm__ ("cnt %0.8b,%1.8b"
> -           : "=w"(result)
> -           : "w"(a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
> -vcnt_s8 (int8x8_t a)
> -{
> -  int8x8_t result;
> -  __asm__ ("cnt %0.8b,%1.8b"
> -           : "=w"(result)
> -           : "w"(a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
> -vcnt_u8 (uint8x8_t a)
> -{
> -  uint8x8_t result;
> -  __asm__ ("cnt %0.8b,%1.8b"
> -           : "=w"(result)
> -           : "w"(a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
> -vcntq_p8 (poly8x16_t a)
> -{
> -  poly8x16_t result;
> -  __asm__ ("cnt %0.16b,%1.16b"
> -           : "=w"(result)
> -           : "w"(a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
> -vcntq_s8 (int8x16_t a)
> -{
> -  int8x16_t result;
> -  __asm__ ("cnt %0.16b,%1.16b"
> -           : "=w"(result)
> -           : "w"(a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
> -vcntq_u8 (uint8x16_t a)
> -{
> -  uint8x16_t result;
> -  __asm__ ("cnt %0.16b,%1.16b"
> -           : "=w"(result)
> -           : "w"(a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
>  #define vcopyq_lane_f32(a, b, c, d)				\
>    __extension__						\
>      ({								\
> @@ -6119,138 +5987,6 @@ vhsubq_u32 (uint32x4_t a, uint32x4_t b)
>    return result;
>  }
>  
> -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> -vld1_dup_f32 (const float32_t * a)
> -{
> -  float32x2_t result;
> -  __asm__ ("ld1r {%0.2s}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
> -vld1_dup_f64 (const float64_t * a)
> -{
> -  float64x1_t result;
> -  __asm__ ("ld1r {%0.1d}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
> -vld1_dup_p8 (const poly8_t * a)
> -{
> -  poly8x8_t result;
> -  __asm__ ("ld1r {%0.8b}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
> -vld1_dup_p16 (const poly16_t * a)
> -{
> -  poly16x4_t result;
> -  __asm__ ("ld1r {%0.4h}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
> -vld1_dup_s8 (const int8_t * a)
> -{
> -  int8x8_t result;
> -  __asm__ ("ld1r {%0.8b}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
> -vld1_dup_s16 (const int16_t * a)
> -{
> -  int16x4_t result;
> -  __asm__ ("ld1r {%0.4h}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
> -vld1_dup_s32 (const int32_t * a)
> -{
> -  int32x2_t result;
> -  __asm__ ("ld1r {%0.2s}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
> -vld1_dup_s64 (const int64_t * a)
> -{
> -  int64x1_t result;
> -  __asm__ ("ld1r {%0.1d}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
> -vld1_dup_u8 (const uint8_t * a)
> -{
> -  uint8x8_t result;
> -  __asm__ ("ld1r {%0.8b}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
> -vld1_dup_u16 (const uint16_t * a)
> -{
> -  uint16x4_t result;
> -  __asm__ ("ld1r {%0.4h}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
> -vld1_dup_u32 (const uint32_t * a)
> -{
> -  uint32x2_t result;
> -  __asm__ ("ld1r {%0.2s}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
> -vld1_dup_u64 (const uint64_t * a)
> -{
> -  uint64x1_t result;
> -  __asm__ ("ld1r {%0.1d}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
>  #define vld1_lane_f32(a, b, c)				\
>    __extension__						\
>      ({								\
> @@ -6407,138 +6143,7 @@ vhsubq_u32 (uint32x4_t a, uint32x4_t b)
>        result;							\
>      })
>  
> -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
> -vld1q_dup_f32 (const float32_t * a)
> -{
> -  float32x4_t result;
> -  __asm__ ("ld1r {%0.4s}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
> -vld1q_dup_f64 (const float64_t * a)
> -{
> -  float64x2_t result;
> -  __asm__ ("ld1r {%0.2d}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
> -vld1q_dup_p8 (const poly8_t * a)
> -{
> -  poly8x16_t result;
> -  __asm__ ("ld1r {%0.16b}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
> -vld1q_dup_p16 (const poly16_t * a)
> -{
> -  poly16x8_t result;
> -  __asm__ ("ld1r {%0.8h}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
> -vld1q_dup_s8 (const int8_t * a)
> -{
> -  int8x16_t result;
> -  __asm__ ("ld1r {%0.16b}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
> -vld1q_dup_s16 (const int16_t * a)
> -{
> -  int16x8_t result;
> -  __asm__ ("ld1r {%0.8h}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> -vld1q_dup_s32 (const int32_t * a)
> -{
> -  int32x4_t result;
> -  __asm__ ("ld1r {%0.4s}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
> -vld1q_dup_s64 (const int64_t * a)
> -{
> -  int64x2_t result;
> -  __asm__ ("ld1r {%0.2d}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
> -vld1q_dup_u8 (const uint8_t * a)
> -{
> -  uint8x16_t result;
> -  __asm__ ("ld1r {%0.16b}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
> -vld1q_dup_u16 (const uint16_t * a)
> -{
> -  uint16x8_t result;
> -  __asm__ ("ld1r {%0.8h}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> -vld1q_dup_u32 (const uint32_t * a)
> -{
> -  uint32x4_t result;
> -  __asm__ ("ld1r {%0.4s}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
> -vld1q_dup_u64 (const uint64_t * a)
> -{
> -  uint64x2_t result;
> -  __asm__ ("ld1r {%0.2d}, %1"
> -           : "=w"(result)
> -           : "Utv"(*a)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
>  #define vld1q_lane_f32(a, b, c)				\
>    __extension__						\
>      ({								\
> @@ -14658,6 +14263,44 @@ vcltzd_f64 (float64_t __a)
>    return __a < 0.0 ? -1ll : 0ll;
>  }
>  
> +/* vcls.  */
> +
> +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
> +vcls_s8 (int8x8_t __a)
> +{
> +  return __builtin_aarch64_clrsbv8qi (__a);
> +}
> +
> +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
> +vcls_s16 (int16x4_t __a)
> +{
> +  return __builtin_aarch64_clrsbv4hi (__a);
> +}
> +
> +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
> +vcls_s32 (int32x2_t __a)
> +{
> +  return __builtin_aarch64_clrsbv2si (__a);
> +}
> +
> +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
> +vclsq_s8 (int8x16_t __a)
> +{
> +  return __builtin_aarch64_clrsbv16qi (__a);
> +}
> +
> +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
> +vclsq_s16 (int16x8_t __a)
> +{
> +  return __builtin_aarch64_clrsbv8hi (__a);
> +}
> +
> +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> +vclsq_s32 (int32x4_t __a)
> +{
> +  return __builtin_aarch64_clrsbv4si (__a);
> +}
> +
>  /* vclz.  */
>  
>  __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
> @@ -14732,6 +14375,44 @@ vclzq_u32 (uint32x4_t __a)
>    return (uint32x4_t)__builtin_aarch64_clzv4si ((int32x4_t)__a);
>  }
>  
> +/* vcnt.  */
> +
> +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
> +vcnt_p8 (poly8x8_t __a)
> +{
> +  return (poly8x8_t)__builtin_aarch64_popcountv8qi ((int8x8_t)__a);
> +}
> +
> +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
> +vcnt_s8 (int8x8_t __a)
> +{
> +  return __builtin_aarch64_popcountv8qi (__a);
> +}
> +
> +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
> +vcnt_u8 (uint8x8_t __a)
> +{
> +  return (uint8x8_t)__builtin_aarch64_popcountv8qi ((int8x8_t)__a);
> +}
> +
> +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
> +vcntq_p8 (poly8x16_t __a)
> +{
> +  return (poly8x16_t)__builtin_aarch64_popcountv16qi ((int8x16_t)__a);
> +}
> +
> +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
> +vcntq_s8 (int8x16_t __a)
> +{
> +  return __builtin_aarch64_popcountv16qi (__a);
> +}
> +
> +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
> +vcntq_u8 (uint8x16_t __a)
> +{
> +  return (uint8x16_t)__builtin_aarch64_popcountv16qi ((int8x16_t)__a);
> +}
> +
>  /* vcvt (double -> float).  */
>  
>  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> @@ -16430,6 +16111,166 @@ vld1q_u64 (const uint64_t *a)
>    return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
>  }
>  
> +/* vld1_dup.  */
> +
> +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> +vld1_dup_f32 (const float32_t *a)
> +{
> +  return __builtin_aarch64_ld1rv2sf ((const __builtin_aarch64_simd_sf *) a);
> +}
> +
> +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
> +vld1_dup_f64 (const float64_t *a)
> +{
> +  return (float64x1_t) {*a};
> +}
> +
> +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
> +vld1_dup_p8 (const poly8_t *a)
> +{
> +  return (poly8x8_t)
> +    __builtin_aarch64_ld1rv8qi ((const __builtin_aarch64_simd_qi *) a);
> +}
> +
> +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
> +vld1_dup_p16 (const poly16_t *a)
> +{
> +  return (poly16x4_t)
> +    __builtin_aarch64_ld1rv4hi ((const __builtin_aarch64_simd_hi *) a);
> +}
> +
> +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
> +vld1_dup_s8 (const int8_t *a)
> +{
> +  return __builtin_aarch64_ld1rv8qi ((const __builtin_aarch64_simd_qi *) a);
> +}
> +
> +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
> +vld1_dup_s16 (const int16_t *a)
> +{
> +  return __builtin_aarch64_ld1rv4hi ((const __builtin_aarch64_simd_hi *) a);
> +}
> +
> +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
> +vld1_dup_s32 (const int32_t *a)
> +{
> +  return __builtin_aarch64_ld1rv2si ((const __builtin_aarch64_simd_si *) a);
> +}
> +
> +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
> +vld1_dup_s64 (const int64_t *a)
> +{
> +  return (int64x1_t) {*a};
> +}
> +
> +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
> +vld1_dup_u8 (const uint8_t *a)
> +{
> +  return (uint8x8_t)
> +    __builtin_aarch64_ld1rv8qi ((const __builtin_aarch64_simd_qi *) a);
> +}
> +
> +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
> +vld1_dup_u16 (const uint16_t *a)
> +{
> +  return (uint16x4_t)
> +    __builtin_aarch64_ld1rv4hi ((const __builtin_aarch64_simd_hi *) a);
> +}
> +
> +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
> +vld1_dup_u32 (const uint32_t *a)
> +{
> +  return (uint32x2_t)
> +    __builtin_aarch64_ld1rv2si ((const __builtin_aarch64_simd_si *) a);
> +}
> +
> +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
> +vld1_dup_u64 (const uint64_t *a)
> +{
> +  return (uint64x1_t) {*a};
> +}
> +
> +/* vld1q_dup.  */
> +
> +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
> +vld1q_dup_f32 (const float32_t *a)
> +{
> +  return __builtin_aarch64_ld1rv4sf ((const __builtin_aarch64_simd_sf *) a);
> +}
> +
> +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
> +vld1q_dup_f64 (const float64_t *a)
> +{
> +  return __builtin_aarch64_ld1rv2df ((const __builtin_aarch64_simd_df *) a);
> +}
> +
> +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
> +vld1q_dup_p8 (const poly8_t *a)
> +{
> +  return (poly8x16_t)
> +    __builtin_aarch64_ld1rv16qi ((const __builtin_aarch64_simd_qi *) a);
> +}
> +
> +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
> +vld1q_dup_p16 (const poly16_t *a)
> +{
> +  return (poly16x8_t)
> +    __builtin_aarch64_ld1rv8hi ((const __builtin_aarch64_simd_hi *) a);
> +}
> +
> +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
> +vld1q_dup_s8 (const int8_t *a)
> +{
> +  return __builtin_aarch64_ld1rv16qi ((const __builtin_aarch64_simd_qi *) a);
> +}
> +
> +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
> +vld1q_dup_s16 (const int16_t *a)
> +{
> +  return __builtin_aarch64_ld1rv8hi ((const __builtin_aarch64_simd_hi *) a);
> +}
> +
> +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> +vld1q_dup_s32 (const int32_t *a)
> +{
> +  return __builtin_aarch64_ld1rv4si ((const __builtin_aarch64_simd_si *) a);
> +}
> +
> +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
> +vld1q_dup_s64 (const int64_t *a)
> +{
> +  return __builtin_aarch64_ld1rv2di ((const __builtin_aarch64_simd_di *) a);
> +}
> +
> +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
> +vld1q_dup_u8 (const uint8_t *a)
> +{
> +  return (uint8x16_t)
> +    __builtin_aarch64_ld1rv16qi ((const __builtin_aarch64_simd_qi *) a);
> +}
> +
> +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
> +vld1q_dup_u16 (const uint16_t *a)
> +{
> +  return (uint16x8_t)
> +    __builtin_aarch64_ld1rv8hi ((const __builtin_aarch64_simd_hi *) a);
> +}
> +
> +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> +vld1q_dup_u32 (const uint32_t *a)
> +{
> +  return (uint32x4_t)
> +    __builtin_aarch64_ld1rv4si ((const __builtin_aarch64_simd_si *) a);
> +}
> +
> +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
> +vld1q_dup_u64 (const uint64_t *a)
> +{
> +  return (uint64x2_t)
> +    __builtin_aarch64_ld1rv2di ((const __builtin_aarch64_simd_di *) a);
> +}
> +
> +
>  /* vldn */
>  
>  __extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
> Index: gcc/config/aarch64/aarch64-simd.md
> ===================================================================
> --- gcc/config/aarch64/aarch64-simd.md	(revision 217394)
> +++ gcc/config/aarch64/aarch64-simd.md	(working copy)
> @@ -1837,6 +1837,14 @@
>    DONE;
>  })
>  
> +(define_insn "clrsb<mode>2"
> +  [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
> +	(clrsb:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "w")))]
> +  "TARGET_SIMD"
> +  "cls\\t%0.<Vtype>, %1.<Vtype>"
> +  [(set_attr "type" "neon_cls<q>")]
> +)
> +
>  (define_insn "clz<mode>2"
>    [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
>  	(clz:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "w")))]
> @@ -1845,6 +1853,14 @@
>    [(set_attr "type" "neon_cls<q>")]
>  )
>  
> +(define_insn "popcount<mode>2"
> +  [(set (match_operand:VB 0 "register_operand" "=w")
> +	(popcount:VB (match_operand:VB 1 "register_operand" "w")))]
> +  "TARGET_SIMD"
> +  "cnt\\t%0.<Vbtype>, %1.<Vbtype>"
> +  [(set_attr "type" "neon_cnt<q>")]
> +)
> +
>  ;; 'across lanes' max and min ops.
>  
>  ;; Template for outputting a scalar, so we can create __builtins which can be
> @@ -4325,6 +4341,18 @@
>    aarch64_simd_disambiguate_copy (operands, dest, src, 4);
>  })
>  
> +(define_expand "aarch64_ld1r<mode>"
> +  [(match_operand:VALL 0 "register_operand")
> +   (match_operand:DI 1 "register_operand")]
> +  "TARGET_SIMD"
> +{
> +  enum machine_mode mode = <VEL>mode;
> +  rtx mem = gen_rtx_MEM (mode, operands[1]);
> +
> +  emit_insn (gen_aarch64_simd_ld1r<mode> (operands[0], mem));
> +  DONE;
> +})
> +
>  (define_expand "aarch64_ld2r<mode>"
>    [(match_operand:OI 0 "register_operand" "=w")
>     (match_operand:DI 1 "register_operand" "w")
> @@ -4935,7 +4963,7 @@
>    DONE;
>  })
>  
> -(define_insn "*aarch64_simd_ld1r<mode>"
> +(define_insn "aarch64_simd_ld1r<mode>"
>    [(set (match_operand:VALLDI 0 "register_operand" "=w")
>  	(vec_duplicate:VALLDI
>  	  (match_operand:<VEL> 1 "aarch64_simd_struct_operand" "Utv")))]
> Index: gcc/config/aarch64/aarch64-simd-builtins.def
> ===================================================================
> --- gcc/config/aarch64/aarch64-simd-builtins.def	(revision 217394)
> +++ gcc/config/aarch64/aarch64-simd-builtins.def	(working copy)
> @@ -45,7 +45,9 @@
>    BUILTIN_VDQF (UNOP, sqrt, 2)
>    BUILTIN_VD_BHSI (BINOP, addp, 0)
>    VAR1 (UNOP, addp, 0, di)
> +  BUILTIN_VDQ_BHSI (UNOP, clrsb, 2)
>    BUILTIN_VDQ_BHSI (UNOP, clz, 2)
> +  BUILTIN_VB (UNOP, popcount, 2)
>  
>    BUILTIN_VALL (GETLANE, be_checked_get_lane, 0)
>  
> @@ -350,6 +352,9 @@
>    /* Implemented by aarch64_ld1<VALL:mode>.  */
>    BUILTIN_VALL (LOAD1, ld1, 0)
>  
> +  /* Implemented by aarch64_ld1r<VALL:mode>.  */
> +  BUILTIN_VALL (LOAD1, ld1r, 0)
> +
>    /* Implemented by aarch64_st1<VALL:mode>.  */
>    BUILTIN_VALL (STORE1, st1, 0)
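
For completeness, a rough usage sketch (again illustrative only, not part of the patch) of the dup-load path: with the new aarch64_ld1r<mode> expand backing the ld1r builtin, a broadcast load like the one below should compile to a single "ld1r {v0.4s}, [x0]" on aarch64 rather than going through inline asm.

/* Sketch: vld1q_dup_s32 now expands via __builtin_aarch64_ld1rv4si.  */
#include <arm_neon.h>

int32x4_t
broadcast_s32 (const int32_t *p)
{
  return vld1q_dup_s32 (p);  /* Load *p and replicate it into all 4 lanes.  */
}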