Re: [PING][PATCH] [AARCH64, NEON] Improve vcls(q?) vcnt(q?) and vld1(q?)_dup intrinsics
On 19 November 2014 at 06:14, Yangfei (Felix) felix.y...@huawei.com wrote:
> Index: gcc/ChangeLog
> ===
> --- gcc/ChangeLog (revision 217717)
> +++ gcc/ChangeLog (working copy)
> @@ -1,3 +1,14 @@
> +2014-11-13 Felix Yang felix.y...@huawei.com
> + Shanyao Chen chenshan...@huawei.com
> +
> + * config/aarch64/aarch64-simd.md (clrsb<mode>2, popcount<mode>2): New
> + patterns.
> + * config/aarch64/aarch64-simd-builtins.def (clrsb, popcount): New
> + builtins.
> + * config/aarch64/arm_neon.h (vcls_s8, vcls_s16, vcls_s32, vclsq_s8,
> + vclsq_s16, vclsq_s32, vcnt_p8, vcnt_s8, vcnt_u8, vcntq_p8, vcntq_s8,
> + vcntq_u8): Rewrite using builtin functions.
> +

OK

Thanks
/Marcus
Re: [PING][PATCH] [AARCH64, NEON] Improve vcls(q?) vcnt(q?) and vld1(q?)_dup intrinsics
On 17 November 2014 06:58, Yangfei (Felix) felix.y...@huawei.com wrote:
> PING? BTW: It seems that Alan's way of improving vld1(q?)_dup intrinsic is more elegant. So is the improvement of vcls(q?) vcnt(q?) OK for trunk? Thanks.

Please rebase over Alan's patch and repost, thank you
/Marcus
Re: [PING][PATCH] [AARCH64, NEON] Improve vcls(q?) vcnt(q?) and vld1(q?)_dup intrinsics
> On 17 November 2014 06:58, Yangfei (Felix) felix.y...@huawei.com wrote:
>> PING? BTW: It seems that Alan's way of improving vld1(q?)_dup intrinsic is more elegant. So is the improvement of vcls(q?) vcnt(q?) OK for trunk? Thanks.
>
> Please rebase over Alan's patch and repost, thank you
> /Marcus

I rebased the patch on the latest trunk. Regtested for aarch64-linux-gnu with qemu. OK for the trunk?

Index: gcc/ChangeLog
===
--- gcc/ChangeLog (revision 217717)
+++ gcc/ChangeLog (working copy)
@@ -1,3 +1,14 @@
+2014-11-13 Felix Yang felix.y...@huawei.com
+ Shanyao Chen chenshan...@huawei.com
+
+ * config/aarch64/aarch64-simd.md (clrsb<mode>2, popcount<mode>2): New
+ patterns.
+ * config/aarch64/aarch64-simd-builtins.def (clrsb, popcount): New
+ builtins.
+ * config/aarch64/arm_neon.h (vcls_s8, vcls_s16, vcls_s32, vclsq_s8,
+ vclsq_s16, vclsq_s32, vcnt_p8, vcnt_s8, vcnt_u8, vcntq_p8, vcntq_s8,
+ vcntq_u8): Rewrite using builtin functions.
+
 2014-11-18 Felix Yang felix.y...@huawei.com

 * config/aarch64/aarch64.c (doloop_end): New pattern.
Index: gcc/config/aarch64/arm_neon.h === --- gcc/config/aarch64/arm_neon.h (revision 217717) +++ gcc/config/aarch64/arm_neon.h (working copy) @@ -5317,138 +5317,6 @@ vaddlvq_u32 (uint32x4_t a) return result; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vcls_s8 (int8x8_t a) -{ - int8x8_t result; - __asm__ (cls %0.8b,%1.8b - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vcls_s16 (int16x4_t a) -{ - int16x4_t result; - __asm__ (cls %0.4h,%1.4h - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vcls_s32 (int32x2_t a) -{ - int32x2_t result; - __asm__ (cls %0.2s,%1.2s - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vclsq_s8 (int8x16_t a) -{ - int8x16_t result; - __asm__ (cls %0.16b,%1.16b - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vclsq_s16 (int16x8_t a) -{ - int16x8_t result; - __asm__ (cls %0.8h,%1.8h - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vclsq_s32 (int32x4_t a) -{ - int32x4_t result; - __asm__ (cls %0.4s,%1.4s - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vcnt_p8 (poly8x8_t a) -{ - poly8x8_t result; - __asm__ (cnt %0.8b,%1.8b - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vcnt_s8 (int8x8_t a) -{ - int8x8_t result; - __asm__ (cnt %0.8b,%1.8b - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static 
__inline uint8x8_t __attribute__ ((__always_inline__)) -vcnt_u8 (uint8x8_t a) -{ - uint8x8_t result; - __asm__ (cnt %0.8b,%1.8b - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vcntq_p8 (poly8x16_t a) -{ - poly8x16_t result; - __asm__ (cnt %0.16b,%1.16b - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vcntq_s8 (int8x16_t a) -{ - int8x16_t result; - __asm__ (cnt %0.16b,%1.16b - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcntq_u8 (uint8x16_t a) -{ - uint8x16_t result; - __asm__ (cnt %0.16b,%1.16b - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - #define vcopyq_lane_f32(a, b, c, d) \ __extension__ \ ({ \ @@ -14082,6 +13950,44 @@ vcltzd_f64 (float64_t __a) return __a 0.0 ? -1ll : 0ll; } +/* vcls. */ + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vcls_s8 (int8x8_t __a) +{ + return
[PING][PATCH] [AARCH64, NEON] Improve vcls(q?) vcnt(q?) and vld1(q?)_dup intrinsics
PING? BTW: It seems that Alan's way of improving vld1(q?)_dup intrinsic is more elegant. So is the improvement of vcls(q?) vcnt(q?) OK for trunk? Thanks.

Hi,

This patch converts vcls(q?) vcnt(q?) and vld1(q?)_dup intrinsics to use builtin functions instead of the previous inline assembly syntax. Regtested with aarch64-linux-gnu on QEMU. Also passed the glorious testsuite of Christophe Lyon. OK for the trunk?

Index: gcc/ChangeLog
===
--- gcc/ChangeLog (revision 217394)
+++ gcc/ChangeLog (working copy)
@@ -1,3 +1,21 @@
+2014-11-13 Felix Yang felix.y...@huawei.com
+ Jiji Jiang jiangj...@huawei.com
+ Shanyao Chen chenshan...@huawei.com
+
+ * config/aarch64/aarch64-simd-builtins.def (clrsb, popcount, ld1r): New
+ builtins.
+ * config/aarch64/aarch64-simd.md (aarch64_ld1r<mode>): New expand.
+ (clrsb<mode>2, popcount<mode>2): New patterns.
+ (*aarch64_simd_ld1r<mode>): Renamed to aarch64_simd_ld1r<mode>.
+ * config/aarch64/arm_neon.h (vcls_s8, vcls_s16, vcls_s32, vclsq_s8,
+ vclsq_s16, vclsq_s32, vcnt_p8, vcnt_s8, vcnt_u8, vcntq_p8, vcntq_s8,
+ vcntq_u8, vld1_dup_f32, vld1_dup_f64, vld1_dup_p8, vld1_dup_p16,
+ vld1_dup_s8, vld1_dup_s16, vld1_dup_s32, vld1_dup_s64, vld1_dup_u8,
+ vld1_dup_u16, vld1_dup_u32, vld1_dup_u64, vld1q_dup_f32, vld1q_dup_f64,
+ vld1q_dup_p8, vld1q_dup_p16, vld1q_dup_s8, vld1q_dup_s16, vld1q_dup_s32,
+ vld1q_dup_s64, vld1q_dup_u8, vld1q_dup_u16, vld1q_dup_u32,
+ vld1q_dup_u64): Rewrite using builtin functions.
+ 2014-11-11 Andrew Pinski apin...@cavium.com Bug target/61997 Index: gcc/config/aarch64/arm_neon.h = == --- gcc/config/aarch64/arm_neon.h (revision 217394) +++ gcc/config/aarch64/arm_neon.h (working copy) @@ -5317,138 +5317,6 @@ vaddlvq_u32 (uint32x4_t a) return result; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vcls_s8 (int8x8_t a) -{ - int8x8_t result; - __asm__ (cls %0.8b,%1.8b - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vcls_s16 (int16x4_t a) -{ - int16x4_t result; - __asm__ (cls %0.4h,%1.4h - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vcls_s32 (int32x2_t a) -{ - int32x2_t result; - __asm__ (cls %0.2s,%1.2s - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vclsq_s8 (int8x16_t a) -{ - int8x16_t result; - __asm__ (cls %0.16b,%1.16b - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vclsq_s16 (int16x8_t a) -{ - int16x8_t result; - __asm__ (cls %0.8h,%1.8h - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vclsq_s32 (int32x4_t a) -{ - int32x4_t result; - __asm__ (cls %0.4s,%1.4s - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vcnt_p8 (poly8x8_t a) -{ - poly8x8_t result; - __asm__ (cnt %0.8b,%1.8b - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vcnt_s8 (int8x8_t a) -{ - int8x8_t result; - __asm__ (cnt %0.8b,%1.8b - : =w(result) - : w(a) - : /* 
No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcnt_u8 (uint8x8_t a) -{ - uint8x8_t result; - __asm__ (cnt %0.8b,%1.8b - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vcntq_p8 (poly8x16_t a) -{ - poly8x16_t result; - __asm__ (cnt %0.16b,%1.16b - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vcntq_s8 (int8x16_t a) -{ - int8x16_t result; - __asm__ (cnt %0.16b,%1.16b - : =w(result) - : w(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__