Re: [PING][PATCH] [AARCH64, NEON] Improve vcls(q?) vcnt(q?) and vld1(q?)_dup intrinsics

2014-12-05 Thread Marcus Shawcroft
On 19 November 2014 at 06:14, Yangfei (Felix) felix.y...@huawei.com wrote:

 Index: gcc/ChangeLog
 ===
 --- gcc/ChangeLog   (revision 217717)
 +++ gcc/ChangeLog   (working copy)
 @@ -1,3 +1,14 @@
 +2014-11-13  Felix Yang  felix.y...@huawei.com
 +   Shanyao Chen  chenshan...@huawei.com
 +
 +   * config/aarch64/aarch64-simd.md (clrsb<mode>2, popcount<mode>2): New
 +   patterns.
 +   * config/aarch64/aarch64-simd-builtins.def (clrsb, popcount): New
 +   builtins.
 +   * config/aarch64/arm_neon.h (vcls_s8, vcls_s16, vcls_s32, vclsq_s8,
 +   vclsq_s16, vclsq_s32, vcnt_p8, vcnt_s8, vcnt_u8, vcntq_p8, vcntq_s8,
 +   vcntq_u8): Rewrite using builtin functions.
 +

OK Thanks /Marcus


Re: [PING][PATCH] [AARCH64, NEON] Improve vcls(q?) vcnt(q?) and vld1(q?)_dup intrinsics

2014-11-18 Thread Marcus Shawcroft
On 17 November 2014 06:58, Yangfei (Felix) felix.y...@huawei.com wrote:
 PING?
 BTW: It seems that Alan's way of improving vld1(q?)_dup intrinsic is more 
 elegant.
 So is the improvement of vcls(q?) vcnt(q?) OK for trunk?  Thanks.

Please rebase over Alan's patch and repost, thank you /Marcus


Re: [PING][PATCH] [AARCH64, NEON] Improve vcls(q?) vcnt(q?) and vld1(q?)_dup intrinsics

2014-11-18 Thread Yangfei (Felix)
 On 17 November 2014 06:58, Yangfei (Felix) felix.y...@huawei.com wrote:
  PING?
  BTW: It seems that Alan's way of improving vld1(q?)_dup intrinsic is more
 elegant.
  So is the improvement of vcls(q?) vcnt(q?) OK for trunk?  Thanks.
 
 Please rebase over Alan's patch and repost, thank you /Marcus


I rebased the patch on the latest trunk. 
Regtested for aarch64-linux-gnu with qemu. 
OK for the trunk? 
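For reference, the rewrite replaces each inline-asm body with a call to the
corresponding compiler builtin generated from the new aarch64-simd-builtins.def
entries. Below is a minimal sketch of the intended shape for one D-register
variant of each intrinsic, assuming the builtins follow GCC's usual
<name><mode> naming for the new clrsb/popcount entries; the exact spellings are
an assumption, not taken from the (truncated) hunks of this patch.

/* Sketch only: expected shape after the rewrite.  Builtin names assume
   GCC's <name><mode> scheme (clrsbv8qi, popcountv8qi, ...).  */
__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
vcls_s8 (int8x8_t __a)
{
  return __builtin_aarch64_clrsbv8qi (__a);     /* was: cls %0.8b,%1.8b  */
}

__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
vcnt_s8 (int8x8_t __a)
{
  return __builtin_aarch64_popcountv8qi (__a);  /* was: cnt %0.8b,%1.8b  */
}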


Index: gcc/ChangeLog
===
--- gcc/ChangeLog   (revision 217717)
+++ gcc/ChangeLog   (working copy)
@@ -1,3 +1,14 @@
+2014-11-13  Felix Yang  felix.y...@huawei.com
+   Shanyao Chen  chenshan...@huawei.com
+
+   * config/aarch64/aarch64-simd.md (clrsb<mode>2, popcount<mode>2): New
+   patterns.
+   * config/aarch64/aarch64-simd-builtins.def (clrsb, popcount): New
+   builtins.
+   * config/aarch64/arm_neon.h (vcls_s8, vcls_s16, vcls_s32, vclsq_s8,
+   vclsq_s16, vclsq_s32, vcnt_p8, vcnt_s8, vcnt_u8, vcntq_p8, vcntq_s8,
+   vcntq_u8): Rewrite using builtin functions.
+
 2014-11-18  Felix Yang  felix.y...@huawei.com
 
* config/aarch64/aarch64.c (doloop_end): New pattern.
Index: gcc/config/aarch64/arm_neon.h
===
--- gcc/config/aarch64/arm_neon.h   (revision 217717)
+++ gcc/config/aarch64/arm_neon.h   (working copy)
@@ -5317,138 +5317,6 @@ vaddlvq_u32 (uint32x4_t a)
   return result;
 }
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vcls_s8 (int8x8_t a)
-{
-  int8x8_t result;
-  __asm__ ("cls %0.8b,%1.8b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vcls_s16 (int16x4_t a)
-{
-  int16x4_t result;
-  __asm__ ("cls %0.4h,%1.4h"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vcls_s32 (int32x2_t a)
-{
-  int32x2_t result;
-  __asm__ ("cls %0.2s,%1.2s"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vclsq_s8 (int8x16_t a)
-{
-  int8x16_t result;
-  __asm__ ("cls %0.16b,%1.16b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vclsq_s16 (int16x8_t a)
-{
-  int16x8_t result;
-  __asm__ ("cls %0.8h,%1.8h"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vclsq_s32 (int32x4_t a)
-{
-  int32x4_t result;
-  __asm__ ("cls %0.4s,%1.4s"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vcnt_p8 (poly8x8_t a)
-{
-  poly8x8_t result;
-  __asm__ ("cnt %0.8b,%1.8b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vcnt_s8 (int8x8_t a)
-{
-  int8x8_t result;
-  __asm__ ("cnt %0.8b,%1.8b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcnt_u8 (uint8x8_t a)
-{
-  uint8x8_t result;
-  __asm__ ("cnt %0.8b,%1.8b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vcntq_p8 (poly8x16_t a)
-{
-  poly8x16_t result;
-  __asm__ ("cnt %0.16b,%1.16b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vcntq_s8 (int8x16_t a)
-{
-  int8x16_t result;
-  __asm__ ("cnt %0.16b,%1.16b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcntq_u8 (uint8x16_t a)
-{
-  uint8x16_t result;
-  __asm__ ("cnt %0.16b,%1.16b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
 #define vcopyq_lane_f32(a, b, c, d) \
   __extension__ \
 ({  \
@@ -14082,6 +13950,44 @@ vcltzd_f64 (float64_t __a)
   return __a < 0.0 ? -1ll : 0ll;
 }
 
+/* vcls.  */
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vcls_s8 (int8x8_t __a)
+{
+  return 

[PING][PATCH] [AARCH64, NEON] Improve vcls(q?) vcnt(q?) and vld1(q?)_dup intrinsics

2014-11-16 Thread Yangfei (Felix)
PING?  
BTW: It seems that Alan's way of improving vld1(q?)_dup intrinsic is more 
elegant.  
So is the improvement of vcls(q?) vcnt(q?) OK for trunk?  Thanks.  


 
 Hi,
 This patch converts vcls(q?) vcnt(q?) and vld1(q?)_dup intrinsics to use
 builtin functions instead of the previous inline assembly syntax.
 Regtested with aarch64-linux-gnu on QEMU.  Also passed the glorious
 testsuite of Christophe Lyon.
 OK for the trunk?
 
 
 Index: gcc/ChangeLog
 ===
 --- gcc/ChangeLog (revision 217394)
 +++ gcc/ChangeLog (working copy)
 @@ -1,3 +1,21 @@
 +2014-11-13  Felix Yang  felix.y...@huawei.com
 + Jiji Jiang  jiangj...@huawei.com
 + Shanyao Chen  chenshan...@huawei.com
 +
 + * config/aarch64/aarch64-simd-builtins.def (clrsb, popcount, ld1r): New
 + builtins.
 + * config/aarch64/aarch64-simd.md (aarch64_ld1r<mode>): New expand.
 + (clrsb<mode>2, popcount<mode>2): New patterns.
 + (*aarch64_simd_ld1r<mode>): Renamed to aarch64_simd_ld1r<mode>.
 + * config/aarch64/arm_neon.h (vcls_s8, vcls_s16, vcls_s32, vclsq_s8,
 + vclsq_s16, vclsq_s32, vcnt_p8, vcnt_s8, vcnt_u8, vcntq_p8, vcntq_s8,
 + vcntq_u8, vld1_dup_f32, vld1_dup_f64, vld1_dup_p8, vld1_dup_p16,
 + vld1_dup_s8, vld1_dup_s16, vld1_dup_s32, vld1_dup_s64, vld1_dup_u8,
 + vld1_dup_u16, vld1_dup_u32, vld1_dup_u64, vld1q_dup_f32,
 vld1q_dup_f64,
 + vld1q_dup_p8, vld1q_dup_p16, vld1q_dup_s8, vld1q_dup_s16,
 vld1q_dup_s32,
 + vld1q_dup_s64, vld1q_dup_u8, vld1q_dup_u16, vld1q_dup_u32,
 + vld1q_dup_u64): Rewrite using builtin functions.
 +
  2014-11-11  Andrew Pinski  apin...@cavium.com
 
   Bug target/61997
 Index: gcc/config/aarch64/arm_neon.h
 ===
 --- gcc/config/aarch64/arm_neon.h (revision 217394)
 +++ gcc/config/aarch64/arm_neon.h (working copy)
 @@ -5317,138 +5317,6 @@ vaddlvq_u32 (uint32x4_t a)
return result;
  }
 
 -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 -vcls_s8 (int8x8_t a)
 -{
 -  int8x8_t result;
 -  __asm__ ("cls %0.8b,%1.8b"
 -   : "=w"(result)
 -   : "w"(a)
 -   : /* No clobbers */);
 -  return result;
 -}
 -
 -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 -vcls_s16 (int16x4_t a)
 -{
 -  int16x4_t result;
 -  __asm__ ("cls %0.4h,%1.4h"
 -   : "=w"(result)
 -   : "w"(a)
 -   : /* No clobbers */);
 -  return result;
 -}
 -
 -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
 -vcls_s32 (int32x2_t a)
 -{
 -  int32x2_t result;
 -  __asm__ ("cls %0.2s,%1.2s"
 -   : "=w"(result)
 -   : "w"(a)
 -   : /* No clobbers */);
 -  return result;
 -}
 -
 -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 -vclsq_s8 (int8x16_t a)
 -{
 -  int8x16_t result;
 -  __asm__ ("cls %0.16b,%1.16b"
 -   : "=w"(result)
 -   : "w"(a)
 -   : /* No clobbers */);
 -  return result;
 -}
 -
 -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
 -vclsq_s16 (int16x8_t a)
 -{
 -  int16x8_t result;
 -  __asm__ ("cls %0.8h,%1.8h"
 -   : "=w"(result)
 -   : "w"(a)
 -   : /* No clobbers */);
 -  return result;
 -}
 -
 -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
 -vclsq_s32 (int32x4_t a)
 -{
 -  int32x4_t result;
 -  __asm__ ("cls %0.4s,%1.4s"
 -   : "=w"(result)
 -   : "w"(a)
 -   : /* No clobbers */);
 -  return result;
 -}
 -
 -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 -vcnt_p8 (poly8x8_t a)
 -{
 -  poly8x8_t result;
 -  __asm__ ("cnt %0.8b,%1.8b"
 -   : "=w"(result)
 -   : "w"(a)
 -   : /* No clobbers */);
 -  return result;
 -}
 -
 -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 -vcnt_s8 (int8x8_t a)
 -{
 -  int8x8_t result;
 -  __asm__ ("cnt %0.8b,%1.8b"
 -   : "=w"(result)
 -   : "w"(a)
 -   : /* No clobbers */);
 -  return result;
 -}
 -
 -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 -vcnt_u8 (uint8x8_t a)
 -{
 -  uint8x8_t result;
 -  __asm__ ("cnt %0.8b,%1.8b"
 -   : "=w"(result)
 -   : "w"(a)
 -   : /* No clobbers */);
 -  return result;
 -}
 -
 -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
 -vcntq_p8 (poly8x16_t a)
 -{
 -  poly8x16_t result;
 -  __asm__ ("cnt %0.16b,%1.16b"
 -   : "=w"(result)
 -   : "w"(a)
 -   : /* No clobbers */);
 -  return result;
 -}
 -
 -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 -vcntq_s8 (int8x16_t a)
 -{
 -  int8x16_t result;
 -  __asm__ ("cnt %0.16b,%1.16b"
 -   : "=w"(result)
 -   : "w"(a)
 -   : /* No clobbers */);
 -  return result;
 -}
 -
 -__extension__ static __inline uint8x16_t __attribute__