[PATCH,committed] [MAINTAINERS] Update email address

2018-08-06 Thread Hurugalawadi, Naveen
Hi,

Updating my email address in the MAINTAINERS file.

Thanks,
Naveen
Index: ChangeLog
===
--- ChangeLog	(revision 263324)
+++ ChangeLog	(working copy)
@@ -1,3 +1,7 @@
+2018-08-06  Naveen H.S  
+
+	* MAINTAINERS: Update my email address.
+
 2018-07-19  DJ Delorie  
 
 	* MAINTAINERS (m32c, msp43, rl78, libiberty, build): Remove myself
Index: MAINTAINERS
===
--- MAINTAINERS	(revision 263324)
+++ MAINTAINERS	(working copy)
@@ -414,7 +414,7 @@
 Andrew John Hughes
 Dominique d'Humieres
 Andy Hutchinson	
-Naveen H.S	
+Naveen H.S	
 Meador Inge	
 Bernardo Innocenti
 Alexander Ivchenko


Re: [PING] [PATCH] [AArch64] Add addr_type attribute

2017-08-10 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-07/msg01634.html

Thanks,
Naveen


Re: [PING] [PATCH] [AArch64] vec_pack_trunc_ should split after register allocator

2017-08-10 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-07/msg01529.html

Thanks,
Naveen



  

[PING 5] [PATCH][AArch64] Add neon_pairwise_add & neon_pairwise_add_q types

2017-08-10 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00505.html

Thanks,
Naveen





    

Re: [PING 4] [PATCH][AArch64] Add neon_pairwise_add & neon_pairwise_add_q types

2017-07-27 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00505.html

Thanks,
Naveen





    

Re: [PATCH][AArch64] Add addr_type attribute

2017-07-26 Thread Hurugalawadi, Naveen
Hi James,

Thanks for the review and comments on the patch.

>> What am I missing - you add a new function which is never called?
>> Should this have been in series with a scheduling model change?

Sorry, you are right. This patch is one of a series adding scheduling
support and attributes to improve performance.
The function is part of another patch, which will be posted after testing.

>> Note you need to include the POST ones for AARCH64 but
>> it should be similar enough.

Modified the patch as per your suggestion, following what is done for PowerPC.

Please review the patch and let me know your comments on it.
Bootstrapped and Regression tested on aarch64-thunder-linux.
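
For orientation, here is a hedged sketch of the addressing forms the new
index/update/index_shift attributes are meant to classify; the C function
and the AArch64 assembly in its comments are illustrative, not taken from
the patch:

/* Illustrative only: the addressing styles the new attributes track.  */
long
addr_forms (long *base, long idx)
{
  long a = base[idx];                      /* ldr x2, [x0, x1, lsl 3] -> "index_shift" */
  long b = *(long *)((char *)base + idx);  /* ldr x3, [x0, x1]        -> "index"       */
  /* Pre/post-increment forms such as  ldr x0, [x1], 8  would set "update".  */
  return a + b;
}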

Thanks,
Naveen

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index f876a2b..0fb62fc 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -212,6 +212,30 @@
 ;; no predicated insns.
 (define_attr "predicated" "yes,no" (const_string "no"))
 
+;; Does this instruction use indexed (that is, reg+reg) addressing?
+;; This is used for load and store insns.  If operand 0 or 1 is a MEM
+;; it is automatically set based on that.  If a load or store instruction
+;; has fewer than two operands it needs to set this attribute manually
+;; or the compiler will crash.
+(define_attr "index" "no,yes"
+  (if_then_else (ior (match_operand 0 "index_address_mem")
+ (match_operand 1 "index_address_mem"))
+(const_string "yes")
+(const_string "no")))
+
+;; Does this instruction use update addressing?
+;; This is used for load and store insns.  See the comments for "index".
+(define_attr "update" "no,yes"
+  (if_then_else (ior (match_operand 0 "update_address_mem")
+ (match_operand 1 "update_address_mem"))
+(const_string "yes")
+(const_string "no")))
+
+(define_attr "index_shift" "no,yes"
+  (if_then_else (ior (match_operand 0 "index_shift_address_mem")
+ (match_operand 1 "index_shift_address_mem"))
+(const_string "yes")
+(const_string "no")))
 ;; ---
 ;; Pipeline descriptions and scheduling
 ;; ---
@@ -546,7 +570,19 @@
 operands[0] = gen_rtx_MEM (DImode, operands[0]);
 return pftype[INTVAL(operands[1])][locality];
   }
-  [(set_attr "type" "load1")]
+  [(set_attr "type" "load1")
+   (set (attr "update")
+	(if_then_else (match_operand 0 "update_address_mem")
+		  (const_string "yes")
+		  (const_string "no")))
+   (set (attr "index")
+	(if_then_else (match_operand 0 "index_address_mem")
+		  (const_string "yes")
+		  (const_string "no")))
+   (set (attr "index_shift")
+	(if_then_else (match_operand 0 "index_shift_address_mem")
+		  (const_string "yes")
+		  (const_string "no")))]
 )
 
 (define_insn "trap"
@@ -1192,7 +1228,19 @@
ldp\\t%w0, %w2, %1
ldp\\t%s0, %s2, %1"
   [(set_attr "type" "load2,neon_load1_2reg")
-   (set_attr "fp" "*,yes")]
+   (set_attr "fp" "*,yes")
+   (set (attr "update")
+	(if_then_else (match_operand 1 "update_address_mem")
+		  (const_string "yes")
+		  (const_string "no")))
+   (set (attr "index")
+	(if_then_else (match_operand 1 "index_address_mem")
+		  (const_string "yes")
+		  (const_string "no")))
+   (set (attr "index_shift")
+	(if_then_else (match_operand 1 "index_shift_address_mem")
+		  (const_string "yes")
+		  (const_string "no")))]
 )
 
 (define_insn "load_pairdi"
@@ -1208,7 +1256,19 @@
ldp\\t%x0, %x2, %1
ldp\\t%d0, %d2, %1"
   [(set_attr "type" "load2,neon_load1_2reg")
-   (set_attr "fp" "*,yes")]
+   (set_attr "fp" "*,yes")
+   (set (attr "update")
+	(if_then_else (match_operand 1 "update_address_mem")
+		  (const_string "yes")
+		  (const_string "no")))
+   (set (attr "index")
+	(if_then_else (match_operand 1 "index_address_mem")
+		  (const_string "yes")
+		  (const_string "no")))
+   (set (attr "index_shift")
+	(if_then_else (match_operand 1 "index_shift_address_mem")
+		  (const_string "yes")
+		  (const_string "no")))]
 )
 
 
@@ -1227,7 +1287,19 @@
stp\\t%w1, %w3, %0
stp\\t%s1, %s3, %0"
   [(set_attr "type" "store2,neon_store1_2reg")
-   (set_attr "fp" "*,yes")]
+   (set_attr "fp" "*,yes")
+   (set (attr "update")
+	(if_then_else (match_operand 0 "update_address_mem")
+		  (const_string "yes")
+		  (const_string "no")))
+   (set (attr "index")
+	(if_then_else (match_operand 0 "index_address_mem")
+		  (const_string "yes")
+		  (const_string "no")))
+   (set (attr "index_shift")
+	(if_then_else (match_operand 0 "index_shift_address_mem")
+		  (const_string "yes")
+		  (const_string "no")))]
 )
 
 (define_insn "store_pairdi"
@@ -1243,7 +1315,19 @@
stp\\t%x1, %x3, %0
stp\\t%d1, %d3, %0"
   [(set_attr "type" "store2,neon_store1_2reg")
-   (set_attr "fp" "*,yes")]
+   

Re: [PATCH][AArch64] vec_pack_trunc_ should split after register allocator

2017-07-25 Thread Hurugalawadi, Naveen
Hi,

>> I haven't been clear in what I was asking for

Sorry. We understood the first comment correctly, but the second
part confused us a bit :).

>> Could you switch this back to an insn_and_split as it was in the previous
>> patch, and just drop the && reload_completed ?

Done.

Bootstrapped and regression tested on aarch64-thunder-linux.
Please review the patch and let me know if it's okay.

Thanks,
Naveen   

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 1cb6eeb..0011040 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1291,6 +1291,18 @@
   [(set_attr "type" "neon_shift_imm_narrow_q")]
 )
 
+(define_insn "aarch64_simd_vec_pack_trunc_hi_"
+ [(set (match_operand: 0 "register_operand" "=w")
+   (vec_concat:
+	 (truncate: (match_operand:VQN 1 "register_operand" "w"))
+	 (vec_select:
+	   (match_operand: 3 "register_operand" "0")
+	   (match_operand: 2 "vect_par_cnst_hi_half" ""]
+ "TARGET_SIMD"
+ "xtn2\\t%0., %1."
+  [(set_attr "type" "neon_shift_imm_narrow_q")]
+)
+
 (define_expand "vec_pack_trunc_"
  [(match_operand: 0 "register_operand" "")
   (match_operand:VDN 1 "register_operand" "")
@@ -1309,17 +1321,41 @@
 
 ;; For quads.
 
-(define_insn "vec_pack_trunc_"
+(define_insn_and_split "vec_pack_trunc_"
  [(set (match_operand: 0 "register_operand" "=")
(vec_concat:
 	 (truncate: (match_operand:VQN 1 "register_operand" "w"))
 	 (truncate: (match_operand:VQN 2 "register_operand" "w"]
  "TARGET_SIMD"
+ "#"
+ ""
+ [(const_int 0)]
  {
if (BYTES_BIG_ENDIAN)
- return "xtn\\t%0., %2.\;xtn2\\t%0., %1.";
+ {
+   rtx low_part = gen_lowpart (mode, operands[0]);
+   emit_insn (gen_aarch64_simd_vec_pack_trunc_ (low_part,
+			  operands[2]));
+   rtx high_part = aarch64_simd_vect_par_cnst_half (mode,
+			true);
+   emit_insn (gen_aarch64_simd_vec_pack_trunc_hi_ (operands[0],
+			 operands[1],
+			 high_part,
+			 operands[0]));
+ }
else
- return "xtn\\t%0., %1.\;xtn2\\t%0., %2.";
+ {
+   rtx low_part = gen_lowpart (mode, operands[0]);
+   emit_insn (gen_aarch64_simd_vec_pack_trunc_ (low_part,
+			  operands[1]));
+   rtx high_part = aarch64_simd_vect_par_cnst_half (mode,
+			true);
+   emit_insn (gen_aarch64_simd_vec_pack_trunc_hi_ (operands[0],
+			 operands[2],
+			 high_part,
+			 operands[0]));
+ }
+   DONE;
  }
   [(set_attr "type" "multiple")
(set_attr "length" "8")]


Re: [PATCH][AArch64] vec_pack_trunc_ should split after register allocator

2017-07-25 Thread Hurugalawadi, Naveen
Hi,

>> I think we can split this whenever we like, and
>> that there isn't any benefit in keeping the pair together?

Thanks for the review and your views.

The patch is modified as per your suggestion.

Please review the patch and let me know if it's okay.

Bootstrapped and regression tested on aarch64-thunder-linux.

Thanks,
Naveen

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 1cb6eeb..a41edad 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1291,6 +1291,18 @@
   [(set_attr "type" "neon_shift_imm_narrow_q")]
 )
 
+(define_insn "aarch64_simd_vec_pack_trunc_hi_"
+ [(set (match_operand: 0 "register_operand" "=w")
+   (vec_concat:
+	 (truncate: (match_operand:VQN 1 "register_operand" "w"))
+	 (vec_select:
+	   (match_operand: 3 "register_operand" "0")
+	   (match_operand: 2 "vect_par_cnst_hi_half" ""]
+ "TARGET_SIMD"
+ "xtn2\\t%0., %1."
+  [(set_attr "type" "neon_shift_imm_narrow_q")]
+)
+
 (define_expand "vec_pack_trunc_"
  [(match_operand: 0 "register_operand" "")
   (match_operand:VDN 1 "register_operand" "")
@@ -1309,20 +1321,39 @@
 
 ;; For quads.
 
-(define_insn "vec_pack_trunc_"
- [(set (match_operand: 0 "register_operand" "=")
+(define_expand "vec_pack_trunc_"
+ [(set (match_operand: 0 "register_operand" "")
(vec_concat:
-	 (truncate: (match_operand:VQN 1 "register_operand" "w"))
-	 (truncate: (match_operand:VQN 2 "register_operand" "w"]
+	 (truncate: (match_operand:VQN 1 "register_operand" ""))
+	 (truncate: (match_operand:VQN 2 "register_operand" ""]
  "TARGET_SIMD"
  {
if (BYTES_BIG_ENDIAN)
- return "xtn\\t%0., %2.\;xtn2\\t%0., %1.";
+ {
+   rtx low_part = gen_lowpart (mode, operands[0]);
+   emit_insn (gen_aarch64_simd_vec_pack_trunc_ (low_part,
+			  operands[2]));
+   rtx high_part = aarch64_simd_vect_par_cnst_half (mode,
+			true);
+   emit_insn (gen_aarch64_simd_vec_pack_trunc_hi_ (operands[0],
+			 operands[1],
+			 high_part,
+			 operands[0]));
+ }
else
- return "xtn\\t%0., %1.\;xtn2\\t%0., %2.";
+ {
+   rtx low_part = gen_lowpart (mode, operands[0]);
+   emit_insn (gen_aarch64_simd_vec_pack_trunc_ (low_part,
+			  operands[1]));
+   rtx high_part = aarch64_simd_vect_par_cnst_half (mode,
+			true);
+   emit_insn (gen_aarch64_simd_vec_pack_trunc_hi_ (operands[0],
+			 operands[2],
+			 high_part,
+			 operands[0]));
+ }
+   DONE;
  }
-  [(set_attr "type" "multiple")
-   (set_attr "length" "8")]
 )
 
 ;; Widening operations.


Re: [PING] [PATCH] Transform (m1 > m2) * d into m1 > m2 ? d : 0

2017-07-18 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-07/msg00178.html

Thanks,
Naveen




Re: [PING 5] [PATCH] [AArch64] vec_pack_trunc_ should split after register allocator

2017-07-18 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-04/msg01334.html

Thanks,
Naveen



    

Re: [PATCH] [AArch64] Fix PR71112

2017-07-06 Thread Hurugalawadi, Naveen
Hi Ramana,

>> PR71112 is still open - should this be backported to GCC-6 ?

Ported the patch to gcc-6-branch and committed as:
https://gcc.gnu.org/viewcvs/gcc?view=revision&revision=250014

Bootstrapped and Regression Tested gcc-6-branch for AArch64
on aarch64-thunder-linux.

Thanks,
Naveen


Re: [PATCH] Transform (m1 > m2) * d into m1 > m2 ? d : 0

2017-07-04 Thread Hurugalawadi, Naveen
Hi,

Thanks for the review and comments on the patch.

>> The proposed patch handled both the same.  This means the pattern
>> shouldn't use range-info but instead match a more complex

The patch handles this as discussed, by matching the pattern
in match.pd.

Bootstrapped and Regression tested on AArch64 and X86_64.
Please review the patch and let us know if it's okay.

Thanks,
Naveen

2017-07-04  Naveen H.S  

gcc
* match.pd (((m1 >/</>=/<= m2) * d -> (m1 >/</>=/<= m2) ? d : 0)): New
pattern.

gcc/testsuite
* gcc.dg/tree-ssa/vrp116.c: New test.

diff --git a/gcc/match.pd b/gcc/match.pd
index 4c64b21..d914db1 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1088,6 +1088,12 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   && tree_nop_conversion_p (type, TREE_TYPE (@1)))
   (convert (bit_and (bit_not @1) @0
 
+/* (m1 CMP m2) * d -> (m1 CMP m2) ? d : 0  */
+(for cmp (gt lt ge le)
+(simplify
+ (mult (convert (cmp @0 @1)) @2)
+  (cond (cmp @0 @1) @2 { build_zero_cst (type); })))
+
 /* For integral types with undefined overflow and C != 0 fold
x * C EQ/NE y * C into x EQ/NE y.  */
 (for cmp (eq ne)
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vrp116.c b/gcc/testsuite/gcc.dg/tree-ssa/vrp116.c
new file mode 100644
index 000..d9d7b23
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/vrp116.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-vrp1" } */
+
+int
+f (int m1, int m2, int c)
+{
+  int d = m1 > m2;
+  int e = d * c;
+  return e ? m1 : m2;
+}
+
+/* { dg-final { scan-tree-dump-times "\\? c_\[0-9\]\\(D\\) : 0" 1 "vrp1" } } */


Re: [PING 4] [PATCH] [AArch64] vec_pack_trunc_ should split after register allocator

2017-06-30 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-04/msg01334.html

Thanks,
Naveen



    

[PATCH] Transform (m1 > m2) * d into m1> m2 ? d : 0

2017-06-28 Thread Hurugalawadi, Naveen
Hi, 

The expression (m1 > m2) * d should be optimized to m1 > m2 ? d : 0.

The patch performs the optimization in tree-vrp.c when simplifying with
range information inside simplify_stmt_using_ranges. If a multiply is
found and either operand has the range [0,1], it is transformed.

For example, d * c, where d has a range of [0,1], is transformed into
COND_EXPR(d != 0, c, 0). Later optimization passes should then
propagate m1 > m2.
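
To make the intent concrete, here is a minimal before/after sketch; it is
illustrative only (the transform itself operates on GIMPLE, and the
function names are made up):

/* Before: multiply by a value VRP knows is in [0,1].  */
int
before (int m1, int m2, int c)
{
  int d = m1 > m2;   /* range [0,1] */
  return d * c;
}

/* After: the multiply becomes a COND_EXPR, i.e. d != 0 ? c : 0.  */
int
after (int m1, int m2, int c)
{
  int d = m1 > m2;
  return d != 0 ? c : 0;
}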

Bootstrapped and Regression tested on AArch64 and X86_64.
Please review the patch and let us know if it's okay.

Thanks,
Naveen

2017-06-28  Naveen H.S  

gcc
* tree-vrp.c (simplify_stmt_using_ranges): Add a case for
optimizing multiplication.
(simplify_mult_ops_using_ranges): New.

gcc/testsuite
* gcc.dg/tree-ssa/vrp116.c: New Test.


diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vrp116.c b/gcc/testsuite/gcc.dg/tree-ssa/vrp116.c
new file mode 100644
index 000..d9d7b23
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/vrp116.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-vrp1" } */
+
+int
+f (int m1, int m2, int c)
+{
+  int d = m1 > m2;
+  int e = d * c;
+  return e ? m1 : m2;
+}
+
+/* { dg-final { scan-tree-dump-times "\\? c_\[0-9\]\\(D\\) : 0" 1 "vrp1" } } */
diff --git a/gcc/tree-vrp.c b/gcc/tree-vrp.c
index 9ca3924..291b87f 100644
--- a/gcc/tree-vrp.c
+++ b/gcc/tree-vrp.c
@@ -9146,6 +9146,46 @@ vrp_visit_phi_node (gphi *phi)
   return SSA_PROP_NOT_INTERESTING;
 }
 
+static bool
+simplify_mult_ops_using_ranges (gimple_stmt_iterator * gsi, gimple *stmt)
+{
+  enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
+  tree op0, op1, lhs;
+
+  op0 = gimple_assign_rhs1 (stmt);
+  op1 = gimple_assign_rhs2 (stmt);
+  lhs = gimple_assign_lhs (stmt);
+
+  if (!op_with_boolean_value_range_p (op0)
+  && !op_with_boolean_value_range_p (op1))
+return false;
+
+  if (rhs_code == MULT_EXPR)
+{
+  if (op_with_boolean_value_range_p (op0))
+	{
+	  tree t = build_int_cst (TREE_TYPE (lhs), 0);
+	  tree tmp = build3 (COND_EXPR, TREE_TYPE (lhs),
+			 build2 (NE_EXPR, boolean_type_node, op0, t),
+			 op1, t);
+	  gimple *new_assign = gimple_build_assign (lhs, tmp);
+	  gsi_replace (gsi, new_assign, true);
+	  return true;
+	}
+  if (op_with_boolean_value_range_p (op1))
+	{
+	  tree t = build_int_cst (TREE_TYPE (lhs), 0);
+	  tree tmp = build3 (COND_EXPR, TREE_TYPE (lhs),
+			 build2 (NE_EXPR, boolean_type_node, op1, t),
+			 op0, t);
+	  gimple *new_assign = gimple_build_assign (lhs, tmp);
+	  gsi_replace (gsi, new_assign, true);
+	  return true;
+	}
+}
+  return false;
+}
+
 /* Simplify boolean operations if the source is known
to be already a boolean.  */
 static bool
@@ -10345,6 +10385,11 @@ simplify_stmt_using_ranges (gimple_stmt_iterator *gsi)
 	return simplify_div_or_mod_using_ranges (gsi, stmt);
 	  break;
 
+	case MULT_EXPR:
+	  if (INTEGRAL_TYPE_P (TREE_TYPE (rhs1)))
+	return simplify_mult_ops_using_ranges (gsi, stmt);
+	  break;
+
   /* Transform ABS (X) into X or -X as appropriate.  */
 	case ABS_EXPR:
 	  if (TREE_CODE (rhs1) == SSA_NAME


Re: [PING][PATCH] Move the check for any_condjump_p from sched-deps to target macros

2017-06-26 Thread Hurugalawadi, Naveen
Hi Jeff,

Thanks for the review and your approval for final patch.
Sorry, it was a long weekend and hence I could not get back to your
comments earlier.

>> You need a ChangeLog entry, but I think that's it.  Can you
>> please repost with a ChangeLog entry for final approval?

Please find the final patch and ChangeLog entry updated as required.
Please review it and let me know if it's okay to commit.
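
For reference, a hedged sketch of the x86 instruction pair the hook now
vets; the function is made up and the assembly comment shows the expected
codegen as an assumption, not verified output:

/* Illustrative: a flag-setting instruction immediately followed by a
   conditional branch on those flags is a fusion candidate.
   Expected x86 codegen (assumption):
       test  edi, edi    ; condgen sets the condition-code register
       jne   .L3         ; condjmp reads it  */
int
cmp_branch (int x)
{
  return x != 0 ? 1 : 2;
}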

Thanks,
Naveen

2017-06-27  Naveen H.S  

* config/aarch64/aarch64.c (aarch_macro_fusion_pair_p): Push the
check for CC usage into AARCH64_FUSE_CMP_BRANCH.
* config/i386/i386.c (ix86_macro_fusion_pair_p): Push the check for
CC usage from generic code to here.
* sched-deps.c (sched_macro_fuse_insns): Move the condition for
CC usage into the target macros.

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2e385c4..b38b8b7 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13973,13 +13973,23 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
 {
   enum attr_type prev_type = get_attr_type (prev);
 
-  /* FIXME: this misses some which is considered simple arthematic
- instructions for ThunderX.  Simple shifts are missed here.  */
-  if (prev_type == TYPE_ALUS_SREG
-  || prev_type == TYPE_ALUS_IMM
-  || prev_type == TYPE_LOGICS_REG
-  || prev_type == TYPE_LOGICS_IMM)
-return true;
+  unsigned int condreg1, condreg2;
+  rtx cc_reg_1;
+  aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
+  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+
+  if (reg_referenced_p (cc_reg_1, PATTERN (curr))
+	  && prev
+	  && modified_in_p (cc_reg_1, prev))
+	{
+	  /* FIXME: this misses some which is considered simple arthematic
+	 instructions for ThunderX.  Simple shifts are missed here.  */
+	  if (prev_type == TYPE_ALUS_SREG
+	  || prev_type == TYPE_ALUS_IMM
+	  || prev_type == TYPE_LOGICS_REG
+	  || prev_type == TYPE_LOGICS_IMM)
+	return true;
+	}
 }
 
   return false;
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 0b2fa1b..af14c90 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -29483,6 +29483,15 @@ ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
   if (!any_condjump_p (condjmp))
 return false;
 
+  unsigned int condreg1, condreg2;
+  rtx cc_reg_1;
+  ix86_fixed_condition_code_regs (&condreg1, &condreg2);
+  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
+  || !condgen
+  || !modified_in_p (cc_reg_1, condgen))
+return false;
+
   if (get_attr_type (condgen) != TYPE_TEST
   && get_attr_type (condgen) != TYPE_ICMP
   && get_attr_type (condgen) != TYPE_INCDEC
diff --git a/gcc/sched-deps.c b/gcc/sched-deps.c
index b2393bf..4c459e6 100644
--- a/gcc/sched-deps.c
+++ b/gcc/sched-deps.c
@@ -2834,34 +2834,30 @@ static void
 sched_macro_fuse_insns (rtx_insn *insn)
 {
   rtx_insn *prev;
-
+  prev = prev_nonnote_nondebug_insn (insn);
+  if (!prev)
+return;
+ 
   if (any_condjump_p (insn))
 {
   unsigned int condreg1, condreg2;
   rtx cc_reg_1;
   targetm.fixed_condition_code_regs (&condreg1, &condreg2);
   cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
-  prev = prev_nonnote_nondebug_insn (insn);
-  if (!reg_referenced_p (cc_reg_1, PATTERN (insn))
-  || !prev
-  || !modified_in_p (cc_reg_1, prev))
-return;
+  if (reg_referenced_p (cc_reg_1, PATTERN (insn))
+	  && modified_in_p (cc_reg_1, prev))
+	{
+	  if (targetm.sched.macro_fusion_pair_p (prev, insn))
+	SCHED_GROUP_P (insn) = 1;
+	  return;
+	}
 }
-  else
-{
-  rtx insn_set = single_set (insn);
-
-  prev = prev_nonnote_nondebug_insn (insn);
-  if (!prev
-  || !insn_set
-  || !single_set (prev))
-return;
 
+  if (single_set (insn) && single_set (prev))
+{
+  if (targetm.sched.macro_fusion_pair_p (prev, insn))
+	SCHED_GROUP_P (insn) = 1;
 }
-
-  if (targetm.sched.macro_fusion_pair_p (prev, insn))
-SCHED_GROUP_P (insn) = 1;
-
 }
 
 /* Get the implicit reg pending clobbers for INSN and save them in TEMP.  */


Re: [PATCH][AArch64] Add crypto_pmull attribute

2017-06-20 Thread Hurugalawadi, Naveen
Hi Ramana,

Thanks for the review and approval.

>> Please update the ARM backend with the new attribute too
>> (define_insn "crypto_vmullp64"

It's already been updated in the patch posted at:
https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00504.html

>> Ok with that change and checking that you can build cc1 for arm-none-eabi .

Checked and built the arm toolchain successfully with the patch.

Patch has been committed at:
https://gcc.gnu.org/viewcvs/gcc?view=revision&revision=249433

Thanks,
Naveen


Re: [PATCH][AArch64] Add crypto_pmull attribute

2017-06-20 Thread Hurugalawadi, Naveen
Hi James,

Thanks for the approval.

>> From an AArch64 perspective, this is OK - but please wait for an ARM
>> approval before you commit it.

Could anyone from ARM comment on the patch, so that it can be committed
upstream if there are no issues?

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00504.html

Thanks,
Naveen



Re: [PATCH, AArch64] Add x86 intrinsic headers to GCC AArch64 target

2017-06-20 Thread Hurugalawadi, Naveen
Hi Joseph,

Thanks for your review and valuable comments on this issue.

Please find attached the patch that merges the x86 intrinsics for the
AArch64 and PPC architectures.

>> it would seem to me to be a bad idea to duplicate the 
>> implementation for more and more architectures.
Merged the implementation for AArch64 and PPC architectures.

The testcases have not been merged yet. We will do that after
receiving comments on the current implementation approach.

Please check the patch and let me know your comments.

Bootstrapped and Regression tested on aarch64-thunder-linux and PPC.

Thanks,
Naveen

2017-06-20  Naveen H.S  

[gcc]
* config.gcc (aarch64*-*-*): Add bmi2intrin.h, bmiintrin.h,
adxintrin.h and x86intrin.h in Config folder.
(powerpc*-*-*): Move bmi2intrin.h, bmiintrin.h and x86intrin.h into
Config folder.
* config/adxintrin.h: New file.
* config/bmi2intrin.h: New file.
* config/bmiintrin.h: New file.
* config/x86intrin.h: New file.
* config/rs6000/bmi2intrin.h: Delete file.
* config/rs6000/bmiintrin.h: Likewise.
* config/rs6000/x86intrin.h: Likewise.

[gcc/testsuite]

* gcc.target/aarch64/adx-addcarryx32-1.c: New file.
* gcc.target/aarch64/adx-addcarryx32-2.c: New file.
* gcc.target/aarch64/adx-addcarryx32-3.c: New file.
* gcc.target/aarch64/adx-addcarryx64-1.c: New file.
* gcc.target/aarch64/adx-addcarryx64-2.c: New file.
* gcc.target/aarch64/adx-addcarryx64-3.c: New file.
* gcc.target/aarch64/adx-check.h: New file.
* gcc.target/aarch64/bmi-andn-1.c: New file.
* gcc.target/aarch64/bmi-andn-2.c: New file.
* gcc.target/aarch64/bmi-bextr-1.c: New file.
* gcc.target/aarch64/bmi-bextr-2.c: New file.
* gcc.target/aarch64/bmi-bextr-4.c: New file.
* gcc.target/aarch64/bmi-bextr-5.c: New file.
* gcc.target/aarch64/bmi-blsi-1.c: New file.
* gcc.target/aarch64/bmi-blsi-2.c: New file.
* gcc.target/aarch64/bmi-blsmsk-1.c: New file.
* gcc.target/aarch64/bmi-blsmsk-2.c: New file.
* gcc.target/aarch64/bmi-blsr-1.c: New file.
* gcc.target/aarch64/bmi-blsr-2.c: New file.
* gcc.target/aarch64/bmi-check.h: New file.
* gcc.target/aarch64/bmi-tzcnt-1.c: New file.
* gcc.target/aarch64/bmi-tzcnt-2.c: New file.
* gcc.target/aarch64/bmi2-bzhi32-1.c: New file.
* gcc.target/aarch64/bmi2-bzhi64-1.c: New file.
* gcc.target/aarch64/bmi2-bzhi64-1a.c: New file.
* gcc.target/aarch64/bmi2-check.h: New file.
* gcc.target/aarch64/bmi2-mulx32-1.c: New file.
* gcc.target/aarch64/bmi2-mulx32-2.c: New file.
* gcc.target/aarch64/bmi2-mulx64-1.c: New file.
* gcc.target/aarch64/bmi2-mulx64-2.c: New file.
* gcc.target/aarch64/bmi2-pdep32-1.c: New file.
* gcc.target/aarch64/bmi2-pdep64-1.c: New file.
* gcc.target/aarch64/bmi2-pext32-1.c: New file.
* gcc.target/aarch64/bmi2-pext64-1.c: New file.
* gcc.target/aarch64/bmi2-pext64-1a.c: New file.

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 8b00e66..18d0bd8 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -301,6 +301,8 @@ m32c*-*-*)
 aarch64*-*-*)
 	cpu_type=aarch64
 	extra_headers="arm_fp16.h arm_neon.h arm_acle.h"
+	extra_headers="${extra_headers} ../bmi2intrin.h ../bmiintrin.h ../x86intrin.h"
+	extra_headers="${extra_headers} ../adxintrin.h"
 	c_target_objs="aarch64-c.o"
 	cxx_target_objs="aarch64-c.o"
 	extra_objs="aarch64-builtins.o aarch-common.o cortex-a57-fma-steering.o"
@@ -455,7 +457,7 @@ powerpc*-*-*spe*)
 powerpc*-*-*)
 	cpu_type=rs6000
 	extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
-	extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h x86intrin.h"
+	extra_headers="${extra_headers} ../bmi2intrin.h ../bmiintrin.h ../x86intrin.h"
 	extra_headers="${extra_headers} ppu_intrinsics.h spu2vmx.h vec_types.h si2vmx.h"
 	extra_headers="${extra_headers} paired.h"
 	case x$with_cpu in
diff --git a/gcc/config/adxintrin.h b/gcc/config/adxintrin.h
new file mode 100644
index 000..6ba326e
--- /dev/null
+++ b/gcc/config/adxintrin.h
@@ -0,0 +1,99 @@
+/* Copyright (C) 2012-2017 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software 

Re: [PING 3] [PATCH] [AArch64] vec_pack_trunc_ should split after register allocator

2017-06-14 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-04/msg01334.html

Thanks,
Naveen



    

Re: [PING 3] [PATCH] [AArch64] Implement ALU_BRANCH fusion

2017-06-14 Thread Hurugalawadi, Naveen
Hi Wilco,

>> That looks good to me now.

Thanks for the review and your approval of the patch.

Please consider this a personal reminder regarding the patch
at the following link, and let me know if it's okay to commit.

https://gcc.gnu.org/ml/gcc-patches/2017-04/msg01333.html

Thanks,
Naveen

Re: [PING 3][PATCH] Move the check for any_condjump_p from sched-deps to target macros

2017-06-14 Thread Hurugalawadi, Naveen
Hi Wilco,

>> That looks good to me now.

Thanks for the review and your approval of the patch.

Please consider this a personal reminder regarding the patch
at the following link, and let me know if it's okay to commit.

https://gcc.gnu.org/ml/gcc-patches/2017-05/msg00839.html

Thanks,
Naveen

Re: [PATCH] [AArch64] PR target/71663 Improve Vector Initialization

2017-06-14 Thread Hurugalawadi, Naveen
Hi James,

>> Could you make the testcase a bit more comprehensive? 

Modified the testcase to cover all the possible cases, and split
it up into separate tests based on the different scenarios.

Please review the patch and let us know if it's okay.

Thanks,
Naveen

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index bce490f..239ba72 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11707,6 +11707,57 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   return;
 }
 
+  enum insn_code icode = optab_handler (vec_set_optab, mode);
+  gcc_assert (icode != CODE_FOR_nothing);
+
+  /* If there are only variable elements, try to optimize
+ the insertion using dup for the most common element
+ followed by insertions.  */
+
+  /* The algorithm will fill matches[*][0] with the earliest matching element,
+ and matches[X][1] with the count of duplicate elements (if X is the
+ earliest element which has duplicates).  */
+
+  if (n_var == n_elts && n_elts <= 16)
+{
+  int matches[16][2] = {0};
+  for (int i = 0; i < n_elts; i++)
+	{
+	  for (int j = 0; j <= i; j++)
+	{
+	  if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
+		{
+		  matches[i][0] = j;
+		  matches[j][1]++;
+		  break;
+		}
+	}
+	}
+  int maxelement = 0;
+  int maxv = 0;
+  for (int i = 0; i < n_elts; i++)
+	if (matches[i][1] > maxv)
+	  {
+	maxelement = i;
+	maxv = matches[i][1];
+	  }
+
+  /* Create a duplicate of the most common element.  */
+  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+  aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
+
+  /* Insert the rest.  */
+  for (int i = 0; i < n_elts; i++)
+	{
+	  rtx x = XVECEXP (vals, 0, i);
+	  if (matches[i][0] == maxelement)
+	continue;
+	  x = copy_to_mode_reg (inner_mode, x);
+	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+	}
+  return;
+}
+
   /* Initialise a vector which is part-variable.  We want to first try
  to build those lanes which are constant in the most efficient way we
  can.  */
@@ -11740,10 +11791,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 }
 
   /* Insert the variable lanes directly.  */
-
-  enum insn_code icode = optab_handler (vec_set_optab, mode);
-  gcc_assert (icode != CODE_FOR_nothing);
-
   for (int i = 0; i < n_elts; i++)
 {
   rtx x = XVECEXP (vals, 0, i);
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-init-1.c b/gcc/testsuite/gcc.target/aarch64/vect-init-1.c
new file mode 100644
index 000..90ba3ae
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-init-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b, float c, float d)
+{
+  return (vector float) { a, b, c, d };
+}
+
+/* { dg-final { scan-assembler-not "movi\t" } } */
+/* { dg-final { scan-assembler-not "orr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-init-2.c b/gcc/testsuite/gcc.target/aarch64/vect-init-2.c
new file mode 100644
index 000..0444675
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-init-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b, float d)
+{
+  return (vector float) { a, b, a, d };
+}
+
+/* { dg-final { scan-assembler-not "movi\t" } } */
+/* { dg-final { scan-assembler-not "orr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-init-3.c b/gcc/testsuite/gcc.target/aarch64/vect-init-3.c
new file mode 100644
index 000..b5822b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-init-3.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b)
+{
+  return (vector float) { a, b, a, b };
+}
+
+/* { dg-final { scan-assembler-not "movi\t" } } */
+/* { dg-final { scan-assembler-not "orr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-init-4.c b/gcc/testsuite/gcc.target/aarch64/vect-init-4.c
new file mode 100644
index 000..09a0095
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-init-4.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b)
+{
+  return (vector float) { a, b, b, a };
+}
+
+/* { dg-final { scan-assembler-not "movi\t" } } */
+/* { dg-final { scan-assembler-not "orr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-init-5.c b/gcc/testsuite/gcc.target/aarch64/vect-init-5.c
new file mode 100644
index 000..76d5502
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-init-5.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine 

Re: [PATCH] [AArch64] PR target/71663 Improve Vector Initialization

2017-06-13 Thread Hurugalawadi, Naveen
Hi James,

Thanks for your review and useful comments.

>> If you could try to keep one reply chain for each patch series
Will keep that in mind for sure :-)

>> Very minor, but what is wrong with:
>> int matches[16][2] = {0};
Done.

>> nummatches is unused.
Removed.

>> This search algorithm is tough to follow
Updated as per your comments.

>> Put braces round this and write it as two statements
Done.

>> Move your new code above the part-variable case.
Done.

>> c is unused.
Removed.

Bootstrapped and Regression tested on aarch64-thunder-linux.

Please review the patch and let us know if you have any comments or suggestions.

Thanks,
Naveen
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index bce490f..239ba72 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11707,6 +11707,57 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   return;
 }
 
+  enum insn_code icode = optab_handler (vec_set_optab, mode);
+  gcc_assert (icode != CODE_FOR_nothing);
+
+  /* If there are only variable elements, try to optimize
+ the insertion using dup for the most common element
+ followed by insertions.  */
+
+  /* The algorithm will fill matches[*][0] with the earliest matching element,
+ and matches[X][1] with the count of duplicate elements (if X is the
+ earliest element which has duplicates).  */
+
+  if (n_var == n_elts && n_elts <= 16)
+{
+  int matches[16][2] = {0};
+  for (int i = 0; i < n_elts; i++)
+	{
+	  for (int j = 0; j <= i; j++)
+	{
+	  if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
+		{
+		  matches[i][0] = j;
+		  matches[j][1]++;
+		  break;
+		}
+	}
+	}
+  int maxelement = 0;
+  int maxv = 0;
+  for (int i = 0; i < n_elts; i++)
+	if (matches[i][1] > maxv)
+	  {
+	maxelement = i;
+	maxv = matches[i][1];
+	  }
+
+  /* Create a duplicate of the most common element.  */
+  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+  aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
+
+  /* Insert the rest.  */
+  for (int i = 0; i < n_elts; i++)
+	{
+	  rtx x = XVECEXP (vals, 0, i);
+	  if (matches[i][0] == maxelement)
+	continue;
+	  x = copy_to_mode_reg (inner_mode, x);
+	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+	}
+  return;
+}
+
   /* Initialise a vector which is part-variable.  We want to first try
  to build those lanes which are constant in the most efficient way we
  can.  */
@@ -11740,10 +11791,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 }
 
   /* Insert the variable lanes directly.  */
-
-  enum insn_code icode = optab_handler (vec_set_optab, mode);
-  gcc_assert (icode != CODE_FOR_nothing);
-
   for (int i = 0; i < n_elts; i++)
 {
   rtx x = XVECEXP (vals, 0, i);
diff --git a/gcc/testsuite/gcc.target/aarch64/pr71663.c b/gcc/testsuite/gcc.target/aarch64/pr71663.c
new file mode 100644
index 000..65f368d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr71663.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b, float d)
+{
+  return (vector float) { a, b, a, d };
+}
+
+/* { dg-final { scan-assembler-not "movi\t" } } */
+/* { dg-final { scan-assembler-not "orr\t" } } */
+/* { dg-final { scan-assembler-times "ins\t" 2 } } */
+/* { dg-final { scan-assembler-times "dup\t" 1 } } */
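
To make the expected output concrete, here is a hedged sketch; the function
mirrors the test above, and the assembly in the comment is the intended
expansion (register choices are made up):

#define vector __attribute__((vector_size(16)))

/* Same shape as pr71663.c above; expected AArch64 expansion (assumption):
     dup  v0.4s, v0.s[0]    // broadcast the repeated element, a
     ins  v0.s[1], v1.s[0]  // insert b into lane 1
     ins  v0.s[3], v2.s[0]  // insert d into lane 3
   One dup plus two ins, matching the scan-assembler-times checks.  */
vector float
combine_dup (float a, float b, float d)
{
  return (vector float) { a, b, a, d };
}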


[PATCH, AArch64] Add x86 intrinsic headers to GCC AArch64 target

2017-05-29 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that adds the first set of x86 intrinsic
headers to the AArch64 target.
The implementation is based on similar work targeted at PPC64LE:
https://gcc.gnu.org/ml/gcc-patches/2017-05/msg00550.html

We are reusing the corresponding DejaGnu tests, as was done for PowerPC,
copying them from gcc/testsuite/gcc.target/i386/ to
gcc/testsuite/gcc.target/aarch64, since the source remains the same.
The only modifications are target-related, as appropriate.

Bootstrapped and Regression tested on aarch64-thunder-linux.

Please review the patch and let us know if you have any comments or suggestions.

Thanks,
Naveen

2017-05-29  Naveen H.S  

[gcc]
* config.gcc (aarch64*-*-*): Add bmi2intrin.h, bmiintrin.h,
and x86intrin.h
* config/aarch64/bmi2intrin.h: New file.
* config/aarch64/bmiintrin.h: New file.
* config/aarch64/x86intrin.h: New file.

[gcc/testsuite]

* gcc.target/aarch64/bmi-andn-1.c: New file.
* gcc.target/aarch64/bmi-andn-2.c: New file.
* gcc.target/aarch64/bmi-bextr-1.c: New file.
* gcc.target/aarch64/bmi-bextr-2.c: New file.
* gcc.target/aarch64/bmi-bextr-4.c: New file.
* gcc.target/aarch64/bmi-bextr-5.c: New file.
* gcc.target/aarch64/bmi-blsi-1.c: New file.
* gcc.target/aarch64/bmi-blsi-2.c: New file.
* gcc.target/aarch64/bmi-blsmsk-1.c: New file.
* gcc.target/aarch64/bmi-blsmsk-2.c: New file.
* gcc.target/aarch64/bmi-blsr-1.c: New file.
* gcc.target/aarch64/bmi-blsr-2.c: New file.
* gcc.target/aarch64/bmi-check.h: New file.
* gcc.target/aarch64/bmi-tzcnt-1.c: New file.
* gcc.target/aarch64/bmi-tzcnt-2.c: New file.
* gcc.target/aarch64/bmi2-bzhi32-1.c: New file.
* gcc.target/aarch64/bmi2-bzhi64-1.c: New file.
* gcc.target/aarch64/bmi2-bzhi64-1a.c: New file.
* gcc.target/aarch64/bmi2-check.h: New file.
* gcc.target/aarch64/bmi2-mulx32-1.c: New file.
* gcc.target/aarch64/bmi2-mulx32-2.c: New file.
* gcc.target/aarch64/bmi2-mulx64-1.c: New file.
* gcc.target/aarch64/bmi2-mulx64-2.c: New file.
* gcc.target/aarch64/bmi2-pdep32-1.c: New file.
* gcc.target/aarch64/bmi2-pdep64-1.c: New file.
* gcc.target/aarch64/bmi2-pext32-1.c: New file.
* gcc.target/aarch64/bmi2-pext64-1.c: New file.
* gcc.target/aarch64/bmi2-pext64-1a.c: New file.

diff --git a/gcc/config.gcc b/gcc/config.gcc
index f55dcaa..9eac70e 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -301,6 +301,7 @@ m32c*-*-*)
 aarch64*-*-*)
 	cpu_type=aarch64
 	extra_headers="arm_fp16.h arm_neon.h arm_acle.h"
+	extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h x86intrin.h"
 	c_target_objs="aarch64-c.o"
 	cxx_target_objs="aarch64-c.o"
 	extra_objs="aarch64-builtins.o aarch-common.o cortex-a57-fma-steering.o"
diff --git a/gcc/config/aarch64/bmi2intrin.h b/gcc/config/aarch64/bmi2intrin.h
new file mode 100644
index 000..c797f22
--- /dev/null
+++ b/gcc/config/aarch64/bmi2intrin.h
@@ -0,0 +1,148 @@
+/* Copyright (C) 2011-2017 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This header is distributed to simplify porting x86_64 code that
+   makes explicit use of Intel intrinsics to Aarch64.
+   It is the user's responsibility to determine if the results are
+   acceptable and make additional changes as necessary.
+   Note that much code that uses Intel intrinsics can be rewritten in
+   standard C or GNU C extensions, which are more portable and better
+   optimized across multiple targets.  */
+
+#if !defined _X86INTRIN_H_INCLUDED
+# error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _BMI2INTRIN_H_INCLUDED
+#define _BMI2INTRIN_H_INCLUDED
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u32 (unsigned int __X, unsigned int __Y)
+{
+  return ((__X << (32 - __Y)) >> (32 - __Y));
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, 
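
As a quick sanity check on the generic _bzhi_u32 above, here is a hedged
usage sketch; the values are made up, and the formula assumes 1 <= y <= 32:

#include <stdio.h>

/* Standalone mirror of the generic _bzhi_u32 above (illustrative).  */
static unsigned int
bzhi_u32 (unsigned int x, unsigned int y)
{
  return (x << (32 - y)) >> (32 - y);  /* keep the low y bits of x */
}

int
main (void)
{
  printf ("%#x\n", bzhi_u32 (0xdeadbeefu, 16));  /* prints 0xbeef */
  return 0;
}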

Re: [PING 2] [PATCH] [AArch64] Implement ALU_BRANCH fusion

2017-05-26 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-04/msg01333.html

Thanks,
Naveen
   

Re: [PING 2] [PATCH] [AArch64] vec_pack_trunc_ should split after register allocator

2017-05-26 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-04/msg01334.html

Thanks,
Naveen





Re: [PING 2] [PATCH] [AArch64] PR target/71663 Improve Vector Initialization

2017-05-26 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-04/msg01260.html

Thanks,
Naveen





Re: [PING 3] [PATCH][AArch64] Add neon_pairwise_add & neon_pairwise_add_q types

2017-05-26 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00505.html

Thanks,
Naveen





    

Re: [PING 3][PATCH][AArch64] Add crypto_pmull attribute

2017-05-26 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00504.html

Thanks,
Naveen


    

Re: [PING 3][PATCH] [AArch64] Implement automod load and store for Thunderx2t99

2017-05-26 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00226.html

Thanks,
Naveen

    

Re: [PING 3][PATCH][AArch64] Add addr_type attribute

2017-05-26 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00222.html

Thanks,
Naveen


    

Re: [PING 2][PATCH] Move the check for any_condjump_p from sched-deps to target macros

2017-05-26 Thread Hurugalawadi, Naveen
Hi, 

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-05/msg00839.html

Thanks,
Naveen
    

Re: [PING] [PATCH] [AArch64] vec_pack_trunc_ should split after register allocator

2017-05-10 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-04/msg01334.html

Thanks,
Naveen





Re: [PING] [PATCH] [AArch64] Implement ALU_BRANCH fusion

2017-05-10 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-04/msg01333.html

Thanks,
Naveen
   

Re: [PING] [PATCH] [AArch64] PR target/71663 Improve Vector Initialization

2017-05-10 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-04/msg01260.html

Thanks,
Naveen





Re: [PING2] [PATCH][AArch64] Add neon_pairwise_add & neon_pairwise_add_q types

2017-05-10 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00505.html

Thanks,
Naveen





    

Re: [PING2][PATCH][AArch64] Add crypto_pmull attribute

2017-05-10 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00504.html

Thanks,
Naveen


    

Re: [PING2][PATCH] [AArch64] Implement automod load and store for Thunderx2t99

2017-05-10 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00226.html

Thanks,
Naveen



Re: [PING2][PATCH][AArch64] Add addr_type attribute

2017-05-10 Thread Hurugalawadi, Naveen
Hi,  

Please consider this a personal reminder to review the patch
at the following link and let me know your comments on it.

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00222.html

Thanks,
Naveen


    


Re: [PING][PATCH] Move the check for any_condjump_p from sched-deps to target macros

2017-05-10 Thread Hurugalawadi, Naveen
Hi,

>> Doesn't this avoid calling the target hook in cases where it used to 
>> call it before?

Yes. Thanks for pointing it out.

>> Consider a conditional jump inside a parallel that is not a single set.
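
For context, a hedged sketch of the RTL shape the reviewer means, written
as a C comment (illustrative, not taken from the patch):

/* A conditional jump inside a PARALLEL that is not a single_set, e.g. a
   decrement-and-branch style insn (made-up shape):

     (parallel [(set (pc) (if_then_else (ne (reg) (const_int 0))
                                        (label_ref L) (pc)))
                (set (reg) (plus (reg) (const_int -1)))])

   single_set () returns NULL_RTX for this, so the condjump path must not
   require it.  */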

Please find attached the modified patch that handles the case mentioned.

Bootstrapped and regression tested on AArch64 and x86_64.
Please review the patch and let us know if it's okay.

Thanks,
Naveen
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2e385c4..b38b8b7 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13973,13 +13973,23 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
 {
   enum attr_type prev_type = get_attr_type (prev);
 
-  /* FIXME: this misses some which is considered simple arthematic
- instructions for ThunderX.  Simple shifts are missed here.  */
-  if (prev_type == TYPE_ALUS_SREG
-  || prev_type == TYPE_ALUS_IMM
-  || prev_type == TYPE_LOGICS_REG
-  || prev_type == TYPE_LOGICS_IMM)
-return true;
+  unsigned int condreg1, condreg2;
+  rtx cc_reg_1;
+  aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
+  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+
+  if (reg_referenced_p (cc_reg_1, PATTERN (curr))
+	  && prev
+	  && modified_in_p (cc_reg_1, prev))
+	{
+	  /* FIXME: this misses some which is considered simple arthematic
+	 instructions for ThunderX.  Simple shifts are missed here.  */
+	  if (prev_type == TYPE_ALUS_SREG
+	  || prev_type == TYPE_ALUS_IMM
+	  || prev_type == TYPE_LOGICS_REG
+	  || prev_type == TYPE_LOGICS_IMM)
+	return true;
+	}
 }
 
   return false;
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 0b2fa1b..af14c90 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -29483,6 +29483,15 @@ ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
   if (!any_condjump_p (condjmp))
 return false;
 
+  unsigned int condreg1, condreg2;
+  rtx cc_reg_1;
+  ix86_fixed_condition_code_regs (&condreg1, &condreg2);
+  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
+  || !condgen
+  || !modified_in_p (cc_reg_1, condgen))
+return false;
+
   if (get_attr_type (condgen) != TYPE_TEST
   && get_attr_type (condgen) != TYPE_ICMP
   && get_attr_type (condgen) != TYPE_INCDEC
diff --git a/gcc/sched-deps.c b/gcc/sched-deps.c
index b2393bf..4c459e6 100644
--- a/gcc/sched-deps.c
+++ b/gcc/sched-deps.c
@@ -2834,34 +2834,30 @@ static void
 sched_macro_fuse_insns (rtx_insn *insn)
 {
   rtx_insn *prev;
-
+  prev = prev_nonnote_nondebug_insn (insn);
+  if (!prev)
+return;
+ 
   if (any_condjump_p (insn))
 {
   unsigned int condreg1, condreg2;
   rtx cc_reg_1;
   targetm.fixed_condition_code_regs (&condreg1, &condreg2);
   cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
-  prev = prev_nonnote_nondebug_insn (insn);
-  if (!reg_referenced_p (cc_reg_1, PATTERN (insn))
-  || !prev
-  || !modified_in_p (cc_reg_1, prev))
-return;
+  if (reg_referenced_p (cc_reg_1, PATTERN (insn))
+	  && modified_in_p (cc_reg_1, prev))
+	{
+	  if (targetm.sched.macro_fusion_pair_p (prev, insn))
+	SCHED_GROUP_P (insn) = 1;
+	  return;
+	}
 }
-  else
-{
-  rtx insn_set = single_set (insn);
-
-  prev = prev_nonnote_nondebug_insn (insn);
-  if (!prev
-  || !insn_set
-  || !single_set (prev))
-return;
 
+  if (single_set (insn) && single_set (prev))
+{
+  if (targetm.sched.macro_fusion_pair_p (prev, insn))
+	SCHED_GROUP_P (insn) = 1;
 }
-
-  if (targetm.sched.macro_fusion_pair_p (prev, insn))
-SCHED_GROUP_P (insn) = 1;
-
 }
 
 /* Get the implicit reg pending clobbers for INSN and save them in TEMP.  */


[PATCH][AArch64] vec_pack_trunc_ should split after register allocator

2017-04-26 Thread Hurugalawadi, Naveen
Hi,

The instruction "vec_pack_trunc_" should be split after the register
allocator for scheduling reasons. Currently the instruction is marked as type
"multiple", which means it will be scheduled as single-issued. However,
nothing can be scheduled alongside either xtn or xtn2, which is a problem
in some cases.

The patch splits the instruction and fixes the issue.
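
For context, a hedged example of the kind of loop that reaches this pattern
when auto-vectorized (the function is made up; assume something like -O3
for aarch64):

/* Truncating int -> short across a loop is vectorized through the
   vec_pack_trunc pattern and expands to an xtn/xtn2 pair (assumption).  */
void
narrow (short *restrict dst, const int *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = (short) src[i];
}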

Please review the patch and let me know if it's okay.
Bootstrapped and Regression tested on aarch64-thunder-linux.

2017-04-27  Naveen H.S  

* config/aarch64/aarch64-simd.md
(aarch64_simd_vec_pack_trunc_hi_): New pattern.
(vec_pack_trunc_): Split the instruction pattern.

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c462164..9b5135c 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1278,6 +1278,18 @@
   [(set_attr "type" "neon_shift_imm_narrow_q")]
 )
 
+(define_insn "aarch64_simd_vec_pack_trunc_hi_"
+ [(set (match_operand: 0 "register_operand" "=w")
+   (vec_concat:
+	 (truncate: (match_operand:VQN 1 "register_operand" "w"))
+	 (vec_select:
+	   (match_operand: 3 "register_operand" "0")
+	   (match_operand: 2 "vect_par_cnst_hi_half" ""]
+ "TARGET_SIMD"
+ "xtn2\\t%0., %1."
+  [(set_attr "type" "neon_shift_imm_narrow_q")]
+)
+
 (define_expand "vec_pack_trunc_"
  [(match_operand: 0 "register_operand" "")
   (match_operand:VDN 1 "register_operand" "")
@@ -1296,17 +1308,41 @@
 
 ;; For quads.
 
-(define_insn "vec_pack_trunc_"
+(define_insn_and_split "vec_pack_trunc_"
  [(set (match_operand: 0 "register_operand" "=")
(vec_concat:
 	 (truncate: (match_operand:VQN 1 "register_operand" "w"))
 	 (truncate: (match_operand:VQN 2 "register_operand" "w"]
  "TARGET_SIMD"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
  {
if (BYTES_BIG_ENDIAN)
- return "xtn\\t%0., %2.\;xtn2\\t%0., %1.";
+ {
+   rtx low_part = gen_lowpart (mode, operands[0]);
+   emit_insn (gen_aarch64_simd_vec_pack_trunc_ (low_part,
+			  operands[2]));
+   rtx high_part = aarch64_simd_vect_par_cnst_half (mode,
+			true);
+   emit_insn (gen_aarch64_simd_vec_pack_trunc_hi_ (operands[0],
+			 operands[1],
+			 high_part,
+			 operands[0]));
+ }
else
- return "xtn\\t%0., %1.\;xtn2\\t%0., %2.";
+ {
+   rtx low_part = gen_lowpart (mode, operands[0]);
+   emit_insn (gen_aarch64_simd_vec_pack_trunc_ (low_part,
+			  operands[1]));
+   rtx high_part = aarch64_simd_vect_par_cnst_half (mode,
+			true);
+   emit_insn (gen_aarch64_simd_vec_pack_trunc_hi_ (operands[0],
+			 operands[2],
+			 high_part,
+			 operands[0]));
+ }
+   DONE;
  }
   [(set_attr "type" "multiple")
(set_attr "length" "8")]


Re: [PING][PATCH][AArch64] Implement ALU_BRANCH fusion

2017-04-26 Thread Hurugalawadi, Naveen
Hi Wilco,

>> You should only return true if there is a match, not if there is
>> not a match.

Done.

Bootstrapped and Regression tested on AArch64 and X86_64.
Please review the patch and let us know if it's okay.

Thanks,
Naveen
   diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def b/gcc/config/aarch64/aarch64-fusion-pairs.def
index f0e6dbc..300cd00 100644
--- a/gcc/config/aarch64/aarch64-fusion-pairs.def
+++ b/gcc/config/aarch64/aarch64-fusion-pairs.def
@@ -34,5 +34,6 @@ AARCH64_FUSION_PAIR ("movk+movk", MOVK_MOVK)
 AARCH64_FUSION_PAIR ("adrp+ldr", ADRP_LDR)
 AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH)
 AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC)
+AARCH64_FUSION_PAIR ("alu+branch", ALU_BRANCH)
 
 #undef AARCH64_FUSION_PAIR
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 1e58e9d..d3b66f2 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -792,7 +792,8 @@ static const struct tune_params thunderx2t99_tunings =
   _approx_modes,
   4, /* memmov_cost.  */
   4, /* issue_rate.  */
-  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
+  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
+   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
   16,	/* function_align.  */
   8,	/* jump_align.  */
   16,	/* loop_align.  */
@@ -14031,6 +14032,49 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
 return true;
 }
 
+  if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
+  && any_condjump_p (curr))
+{
+  /* We're trying to match:
+	  prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
+	  curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
+			 (const_int 0))
+		 (label_ref ("SYM"))
+		 (pc))  */
+  if (SET_DEST (curr_set) == (pc_rtx)
+	  && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
+	  && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
+	  && REG_P (SET_DEST (prev_set))
+	  && REGNO (SET_DEST (prev_set))
+	 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
+	{
+	  /* Fuse ALU operations followed by conditional branch instruction.  */
+	  switch (get_attr_type (prev))
+	{
+	case TYPE_ALU_IMM:
+	case TYPE_ALU_SREG:
+	case TYPE_ADC_REG:
+	case TYPE_ADC_IMM:
+	case TYPE_ADCS_REG:
+	case TYPE_ADCS_IMM:
+	case TYPE_LOGIC_REG:
+	case TYPE_LOGIC_IMM:
+	case TYPE_CSEL:
+	case TYPE_ADR:
+	case TYPE_MOV_IMM:
+	case TYPE_SHIFT_REG:
+	case TYPE_SHIFT_IMM:
+	case TYPE_BFM:
+	case TYPE_RBIT:
+	case TYPE_REV:
+	case TYPE_EXTEND:
+	  return true;
+
+	default:;
+	}
+	}
+}
+
   return false;
 }
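
To ground the shape being matched, a hedged example of an alu+branch
candidate; the assembly in the comment is the expected pairing, stated as
an assumption rather than verified output:

/* Illustrative: an ALU op writing w0 immediately followed by a cbnz on w0
   can fuse under AARCH64_FUSE_ALU_BRANCH.  Expected codegen (assumption):
       add  w0, w0, w1
       cbnz w0, .L4  */
int
alu_branch (int a, int b)
{
  if (a + b != 0)
    return 1;
  return 0;
}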
 


Re: [PING][PATCH] Move the check for any_condjump_p from sched-deps to target macros

2017-04-26 Thread Hurugalawadi, Naveen
Hi Wilco,

>> I suggest you check the logic and follow the existing patterns in
>> aarch_macro_fusion_pair_p.

Done.

Bootstrapped and Regression tested on AArch64 and X86_64.
Please review the patch and let us know if it's okay?

Thanks,
Naveen
 diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 1e58e9d..9f838f5 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14022,13 +14022,23 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
 {
   enum attr_type prev_type = get_attr_type (prev);
 
-  /* FIXME: this misses some which is considered simple arthematic
- instructions for ThunderX.  Simple shifts are missed here.  */
-  if (prev_type == TYPE_ALUS_SREG
-  || prev_type == TYPE_ALUS_IMM
-  || prev_type == TYPE_LOGICS_REG
-  || prev_type == TYPE_LOGICS_IMM)
-return true;
+  unsigned int condreg1, condreg2;
+  rtx cc_reg_1;
+  aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
+  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+
+  if (reg_referenced_p (cc_reg_1, PATTERN (curr))
+	  && prev
+	  && modified_in_p (cc_reg_1, prev))
+	{
+	  /* FIXME: this misses some which is considered simple arthematic
+	 instructions for ThunderX.  Simple shifts are missed here.  */
+	  if (prev_type == TYPE_ALUS_SREG
+	  || prev_type == TYPE_ALUS_IMM
+	  || prev_type == TYPE_LOGICS_REG
+	  || prev_type == TYPE_LOGICS_IMM)
+	return true;
+	}
 }
 
   return false;
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index d985657..3352189 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -29610,6 +29610,15 @@ ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
   if (!any_condjump_p (condjmp))
 return false;
 
+  unsigned int condreg1, condreg2;
+  rtx cc_reg_1;
+  ix86_fixed_condition_code_regs (&condreg1, &condreg2);
+  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
+  || !condgen
+  || !modified_in_p (cc_reg_1, condgen))
+return false;
+
   if (get_attr_type (condgen) != TYPE_TEST
   && get_attr_type (condgen) != TYPE_ICMP
   && get_attr_type (condgen) != TYPE_INCDEC
diff --git a/gcc/sched-deps.c b/gcc/sched-deps.c
index b2393bf..b15a865 100644
--- a/gcc/sched-deps.c
+++ b/gcc/sched-deps.c
@@ -2835,33 +2835,16 @@ sched_macro_fuse_insns (rtx_insn *insn)
 {
   rtx_insn *prev;
 
-  if (any_condjump_p (insn))
-{
-  unsigned int condreg1, condreg2;
-  rtx cc_reg_1;
-  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
-  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
-  prev = prev_nonnote_nondebug_insn (insn);
-  if (!reg_referenced_p (cc_reg_1, PATTERN (insn))
-  || !prev
-  || !modified_in_p (cc_reg_1, prev))
-return;
-}
-  else
-{
-  rtx insn_set = single_set (insn);
-
-  prev = prev_nonnote_nondebug_insn (insn);
-  if (!prev
-  || !insn_set
-  || !single_set (prev))
-return;
+  rtx insn_set = single_set (insn);
 
-}
+  prev = prev_nonnote_nondebug_insn (insn);
+  if (!prev
+  || !insn_set
+  || !single_set (prev))
+return;
 
   if (targetm.sched.macro_fusion_pair_p (prev, insn))
 SCHED_GROUP_P (insn) = 1;
-
 }
 
 /* Get the implicit reg pending clobbers for INSN and save them in TEMP.  */


Re: [PING][PATCH][AArch64] Implement ALU_BRANCH fusion

2017-04-26 Thread Hurugalawadi, Naveen
Hi Wilco,

>> Same comment for this part, we want to return true if we match:

Thanks for pointing out about the confusion.

>> Note writing these complex conditions using positive logic makes them much
>> more readable - if you have to negate use !(X && Y && Z) rather than
>> !X || !Y || !Z.

Modified the code as required.

Bootstrapped and Regression tested on AArch64 and X86_64.
Please review the patch and let us know if it's okay?

Thanks,
Naveen

diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def b/gcc/config/aarch64/aarch64-fusion-pairs.def
index f0e6dbc..300cd00 100644
--- a/gcc/config/aarch64/aarch64-fusion-pairs.def
+++ b/gcc/config/aarch64/aarch64-fusion-pairs.def
@@ -34,5 +34,6 @@ AARCH64_FUSION_PAIR ("movk+movk", MOVK_MOVK)
 AARCH64_FUSION_PAIR ("adrp+ldr", ADRP_LDR)
 AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH)
 AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC)
+AARCH64_FUSION_PAIR ("alu+branch", ALU_BRANCH)
 
 #undef AARCH64_FUSION_PAIR
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2e385c4..1a63ad0 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -792,7 +792,8 @@ static const struct tune_params thunderx2t99_tunings =
   &generic_approx_modes,
   4, /* memmov_cost.  */
   4, /* issue_rate.  */
-  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
+  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
+   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
   16,	/* function_align.  */
   8,	/* jump_align.  */
   16,	/* loop_align.  */
@@ -13982,6 +13992,49 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
 return true;
 }
 
+  if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
+  && any_condjump_p (curr))
+{
+  /* We're trying to match:
+	  prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
+	  curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
+			 (const_int 0))
+		 (label_ref ("SYM"))
+		 (pc))  */
+  if (! (SET_DEST (curr_set) == (pc_rtx)
+	 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
+	 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
+	 && REG_P (SET_DEST (prev_set))
+	 && REGNO (SET_DEST (prev_set))
+		== REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0
+	return true;
+
+  /* Fuse ALU operations followed by conditional branch instruction.  */
+  switch (get_attr_type (prev))
+	{
+	case TYPE_ALU_IMM:
+	case TYPE_ALU_SREG:
+	case TYPE_ADC_REG:
+	case TYPE_ADC_IMM:
+	case TYPE_ADCS_REG:
+	case TYPE_ADCS_IMM:
+	case TYPE_LOGIC_REG:
+	case TYPE_LOGIC_IMM:
+	case TYPE_CSEL:
+	case TYPE_ADR:
+	case TYPE_MOV_IMM:
+	case TYPE_SHIFT_REG:
+	case TYPE_SHIFT_IMM:
+	case TYPE_BFM:
+	case TYPE_RBIT:
+	case TYPE_REV:
+	case TYPE_EXTEND:
+	  return true;
+
+	default:;
+	}
+}
+
   return false;
 }
 



Re: [PING][PATCH] Move the check for any_condjump_p from sched-deps to target macros

2017-04-26 Thread Hurugalawadi, Naveen
Hi Wilco,

Thanks for reviewing the patch.

>> The return false seems incorrect - it means a core can either have
>> FUSE_CMP_BRANCH or FUSE_ALU_BRANCH but not both.

Thanks for pointing out about the confusion.
Modified the code as required.

Bootstrapped and Regression tested on AArch64 and X86_64.
Please review the patch and let us know if it's okay?

Thanks,
Naveen

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2e385c4..1a63ad0 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13973,6 +13974,15 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
 {
   enum attr_type prev_type = get_attr_type (prev);
 
+  unsigned int condreg1, condreg2;
+  rtx cc_reg_1;
+  aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
+  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+  if (!reg_referenced_p (cc_reg_1, PATTERN (curr))
+	  || !prev
+	  || !modified_in_p (cc_reg_1, prev))
+	return true;
+
   /* FIXME: this misses some which is considered simple arthematic
  instructions for ThunderX.  Simple shifts are missed here.  */
   if (prev_type == TYPE_ALUS_SREG
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 0b2fa1b..af14c90 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -29483,6 +29483,15 @@ ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
   if (!any_condjump_p (condjmp))
 return false;
 
+  unsigned int condreg1, condreg2;
+  rtx cc_reg_1;
+  ix86_fixed_condition_code_regs (&condreg1, &condreg2);
+  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
+  || !condgen
+  || !modified_in_p (cc_reg_1, condgen))
+return false;
+
   if (get_attr_type (condgen) != TYPE_TEST
   && get_attr_type (condgen) != TYPE_ICMP
   && get_attr_type (condgen) != TYPE_INCDEC
diff --git a/gcc/sched-deps.c b/gcc/sched-deps.c
index b2393bf..b15a865 100644
--- a/gcc/sched-deps.c
+++ b/gcc/sched-deps.c
@@ -2835,33 +2835,16 @@ sched_macro_fuse_insns (rtx_insn *insn)
 {
   rtx_insn *prev;
 
-  if (any_condjump_p (insn))
-{
-  unsigned int condreg1, condreg2;
-  rtx cc_reg_1;
-  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
-  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
-  prev = prev_nonnote_nondebug_insn (insn);
-  if (!reg_referenced_p (cc_reg_1, PATTERN (insn))
-  || !prev
-  || !modified_in_p (cc_reg_1, prev))
-return;
-}
-  else
-{
-  rtx insn_set = single_set (insn);
-
-  prev = prev_nonnote_nondebug_insn (insn);
-  if (!prev
-  || !insn_set
-  || !single_set (prev))
-return;
+  rtx insn_set = single_set (insn);
 
-}
+  prev = prev_nonnote_nondebug_insn (insn);
+  if (!prev
+  || !insn_set
+  || !single_set (prev))
+return;
 
   if (targetm.sched.macro_fusion_pair_p (prev, insn))
 SCHED_GROUP_P (insn) = 1;
-
 }
 
 /* Get the implicit reg pending clobbers for INSN and save them in TEMP.  */


Re: [PATCH] [AArch64] PR target/71663 Improve Vector Initialization

2017-04-26 Thread Hurugalawadi, Naveen
Hi Kyrill,

Thanks for the review and your comments.

>> It would be useful if you expanded a bit on the approach used to
>> generate the improved codegen

The patch creates a duplicate of the most common element and optimizes the
insertion by emitting a dup for that element followed by inserts for the
remaining lanes; e.g. for an initializer like { a, b, a, d }, a is duplicated
and only b and d need individual inserts.

Current code:

moviv2.4s, 0
ins v2.s[0], v0.s[0]
ins v2.s[1], v1.s[0]
ins v2.s[2], v0.s[0]
orr v0.16b, v2.16b, v2.16b
ins v0.s[3], v3.s[0]
ret


Code after the patch:

dup v0.4s, v0.s[0]
ins v0.s[1], v1.s[0]
ins v0.s[3], v3.s[0]
ret


>> Some typos

Modified as required

>> worth adding a testcase where one of the vector elements appears more than
>> the others?

Modified the testcase as required using common element.

Please review the patch and let us know if it's okay?
Bootstrapped and Regression tested on aarch64-thunder-linux.

Thanks,
Naveen

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2e385c4..8747a23 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11671,11 +11671,54 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   aarch64_expand_vector_init (target, copy);
 }
 
-  /* Insert the variable lanes directly.  */
-
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
+  /* If there are only variable elements, try to optimize
+ the insertion using dup for the most common element
+ followed by insertions.  */
+  if (n_var == n_elts && n_elts <= 16)
+{
+  int matches[16][2];
+  int nummatches = 0;
+  memset (matches, 0, sizeof(matches));
+  for(int i = 0; i < n_elts; i++)
+	{
+	  for (int j = 0; j <= i; j++)
+	{
+	  if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
+		{
+		  matches[i][0] = j;
+		  matches[j][1]++;
+		  if (i != j)
+		nummatches++;
+		  break;
+		}
+	}
+	}
+  int maxelement = 0;
+  int maxv = 0;
+  for (int i = 0; i < n_elts; i++)
+	if (matches[i][1] > maxv)
+	  maxelement = i, maxv = matches[i][1];
+
+  /* Create a duplicate of the most common element.  */
+  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+  aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
+
+  /* Insert the rest.  */
+  for (int i = 0; i < n_elts; i++)
+	{
+	  rtx x = XVECEXP (vals, 0, i);
+	  if (matches[i][0] == maxelement)
+	continue;
+	  x = copy_to_mode_reg (inner_mode, x);
+	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+	}
+  return;
+}
+
+  /* Insert the variable lanes directly.  */
   for (int i = 0; i < n_elts; i++)
 {
   rtx x = XVECEXP (vals, 0, i);
diff --git a/gcc/testsuite/gcc.target/aarch64/pr71663.c b/gcc/testsuite/gcc.target/aarch64/pr71663.c
new file mode 100644
index 000..a043a21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr71663.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b, float c, float d)
+{
+  return (vector float) { a, b, a, d };
+}
+
+/* { dg-final { scan-assembler-not "movi\t" } } */
+/* { dg-final { scan-assembler-not "orr\t" } } */
+/* { dg-final { scan-assembler-times "ins\t" 2 } } */
+/* { dg-final { scan-assembler-times "dup\t" 1 } } */


Re: [PING][PATCH][AArch64] Implement ALU_BRANCH fusion

2017-04-25 Thread Hurugalawadi, Naveen
Hi,  

Please consider this as a personal reminder to review the patch
at the following link and let me know your comments on the same.

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg01369.html

Thanks,
Naveen





[PING][PATCH] Move the check for any_condjump_p from sched-deps to target macros

2017-04-25 Thread Hurugalawadi, Naveen
Hi,  

Please consider this as a personal reminder to review the patch
at the following link and let me know your comments on the same.

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg01368.html

Thanks,
Naveen




[PING][PATCH][AArch64] Add neon_pairwise_add & neon_pairwise_add_q types

2017-04-25 Thread Hurugalawadi, Naveen
Hi,  

Please consider this as a personal reminder to review the patch
at the following link and let me know your comments on the same.

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00505.html

Thanks,
Naveen







[PING][PATCH][AArch64] Add crypto_pmull attribute

2017-04-25 Thread Hurugalawadi, Naveen
Hi,  

Please consider this as a personal reminder to review the patch
at the following link and let me know your comments on the same.

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00504.html

Thanks,
Naveen




[PING][PATCH] [AArch64] Implement automod load and store for Thunderx2t99

2017-04-25 Thread Hurugalawadi, Naveen
Hi,  

Please consider this as a personal reminder to review the patch
at the following link and let me know your comments on the same.

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00226.html

Thanks,
Naveen



[PING][PATCH][AArch64] Add addr_type attribute

2017-04-25 Thread Hurugalawadi, Naveen
Hi,  

Please consider this as a personal reminder to review the patch
at the following link and let me know your comments on the same.

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00222.html

Thanks,
Naveen




[PING] [PATCH] [AArch64] PR target/71663 Improve Vector Initialization

2017-04-25 Thread Hurugalawadi, Naveen
Hi,

Please consider this as a personal reminder to review the patch
at the following link and let me know your comments on the same.

https://gcc.gnu.org/ml/gcc-patches/2016-12/msg00718.html

Thanks,
Naveen






Re: [PATCH][AArch64] Implement ALU_BRANCH fusion

2017-03-26 Thread Hurugalawadi, Naveen
Hi,

Thanks for the review and suggestions.

> I think the patch isn't quite complete yet. You will also need changes in
> generic code. Currently sched_macro_fuse_insns() does:

Modified the sched_macro_fuse_insns() as required.

> Basically the idea is to push the check for CC usage into target macros

Done. Pushed the check into target macros.

The modifications were generic and quite different from ALU+BRANCH
fusion; a separate patch is posted with the above 2 modifications at:-
https://gcc.gnu.org/ml/gcc-patches/2017-03/msg01368.html

> Also in aarch64.c's macro fusion you need to check that the branch
> instruction uses the same register

Added to check that same registers are used in ALU and Branch instruction.

Bootstrapped and Regression tested on AArch64.

Please review the patch and let us know if it's okay?

Thanks,
Naveen

diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def b/gcc/config/aarch64/aarch64-fusion-pairs.def
index f0e6dbc..300cd00 100644
--- a/gcc/config/aarch64/aarch64-fusion-pairs.def
+++ b/gcc/config/aarch64/aarch64-fusion-pairs.def
@@ -34,5 +34,6 @@ AARCH64_FUSION_PAIR ("movk+movk", MOVK_MOVK)
 AARCH64_FUSION_PAIR ("adrp+ldr", ADRP_LDR)
 AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH)
 AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC)
+AARCH64_FUSION_PAIR ("alu+branch", ALU_BRANCH)
 
 #undef AARCH64_FUSION_PAIR
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4f769a4..31bc5f4 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -792,7 +792,8 @@ static const struct tune_params thunderx2t99_tunings =
   &generic_approx_modes,
   4, /* memmov_cost.  */
   4, /* issue_rate.  */
-  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
+  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
+   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
   16,	/* function_align.  */
   8,	/* jump_align.  */
   16,	/* loop_align.  */
@@ -13981,6 +13982,50 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
 return true;
 }
 
+  if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
+  && any_condjump_p (curr))
+{
+  /* We're trying to match:
+	  prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
+	  curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
+			 (const_int 0))
+		 (label_ref ("SYM"))
+		 (pc))  */
+
+  if (SET_DEST (curr_set) != (pc_rtx)
+	  || GET_CODE (SET_SRC (curr_set)) != IF_THEN_ELSE
+	  || ! REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
+	  || ! REG_P (SET_DEST (prev_set))
+	  || REGNO (SET_DEST (prev_set))
+	 != REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
+	return false;
+
+  /* Fuse ALU operations followed by conditional branch instruction.  */
+  switch (get_attr_type (prev))
+	{
+	case TYPE_ALU_IMM:
+	case TYPE_ALU_SREG:
+	case TYPE_ADC_REG:
+	case TYPE_ADC_IMM:
+	case TYPE_ADCS_REG:
+	case TYPE_ADCS_IMM:
+	case TYPE_LOGIC_REG:
+	case TYPE_LOGIC_IMM:
+	case TYPE_CSEL:
+	case TYPE_ADR:
+	case TYPE_MOV_IMM:
+	case TYPE_SHIFT_REG:
+	case TYPE_SHIFT_IMM:
+	case TYPE_BFM:
+	case TYPE_RBIT:
+	case TYPE_REV:
+	case TYPE_EXTEND:
+	  return true;
+
+	default:;
+	}
+}
+
   return false;
 }
 


[PATCH] Move the check for any_condjump_p from sched-deps to target macros

2017-03-26 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that moves the check for CC usage in
any_condjump_p from sched-deps to target macros.

Currently the check is used only by i386 and AArch64.
The generic condition checks that the fusion candidates use/modify the CC
register (cc_reg_1). However, ALU+branch fusion on AArch64 links the two
instructions through an arbitrary general-purpose register, and hence does
not satisfy that condition.
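
As a rough illustration (a sketch of mine, not part of the patch; exact
codegen may differ), the two shapes look like this in C:

/* Hypothetical functions, compiled at -O2 for AArch64.  */
extern void f (void);

void
through_cc (int a, int b)
{
  if (a > b)    /* cmp w0, w1 ; b.le .L1 -- the pair communicates via the
                   CC register, so the generic check in sched-deps accepts
                   it for cmp+branch fusion.  */
    f ();
}

void
through_gpr (int a, int b)
{
  if (a + b)    /* add w0, w0, w1 ; cbz w0, .L3 -- the pair communicates
                   via w0 rather than CC, so the generic check rejects it
                   even though the target wants ALU+branch fusion.  */
    f ();
}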

Bootstrapped and Regression tested on AArch64 and X86_64.

Please review the patch and let us know if it's okay?

Thanks,
Naveen

2017-03-27  Naveen H.S  

* config/aarch64/aarch64.c (aarch_macro_fusion_pair_p): Push the
check for CC usage into AARCH64_FUSE_CMP_BRANCH.
* config/i386/i386.c (ix86_macro_fusion_pair_p): Push the check for
CC usage from generic code.
* sched-deps.c (sched_macro_fuse_insns): Move the condition for
any_condjump_p into the target macros.

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4f769a4..ec0a3ec 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13972,6 +13972,15 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
 {
   enum attr_type prev_type = get_attr_type (prev);
 
+  unsigned int condreg1, condreg2;
+  rtx cc_reg_1;
+  aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
+  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+  if (!reg_referenced_p (cc_reg_1, PATTERN (curr))
+	  || !prev
+	  || !modified_in_p (cc_reg_1, prev))
+	return false;
+
   /* FIXME: this misses some which is considered simple arthematic
  instructions for ThunderX.  Simple shifts are missed here.  */
   if (prev_type == TYPE_ALUS_SREG
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index bb0debf..3dcbe37 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -29490,6 +29490,15 @@ ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
   if (!any_condjump_p (condjmp))
 return false;
 
+  unsigned int condreg1, condreg2;
+  rtx cc_reg_1;
+  ix86_fixed_condition_code_regs (&condreg1, &condreg2);
+  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
+  || !condgen
+  || !modified_in_p (cc_reg_1, condgen))
+return false;
+
   if (get_attr_type (condgen) != TYPE_TEST
   && get_attr_type (condgen) != TYPE_ICMP
   && get_attr_type (condgen) != TYPE_INCDEC
diff --git a/gcc/sched-deps.c b/gcc/sched-deps.c
index b2393bf..b15a865 100644
--- a/gcc/sched-deps.c
+++ b/gcc/sched-deps.c
@@ -2835,33 +2835,16 @@ sched_macro_fuse_insns (rtx_insn *insn)
 {
   rtx_insn *prev;
 
-  if (any_condjump_p (insn))
-{
-  unsigned int condreg1, condreg2;
-  rtx cc_reg_1;
-  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
-  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
-  prev = prev_nonnote_nondebug_insn (insn);
-  if (!reg_referenced_p (cc_reg_1, PATTERN (insn))
-  || !prev
-  || !modified_in_p (cc_reg_1, prev))
-return;
-}
-  else
-{
-  rtx insn_set = single_set (insn);
-
-  prev = prev_nonnote_nondebug_insn (insn);
-  if (!prev
-  || !insn_set
-  || !single_set (prev))
-return;
+  rtx insn_set = single_set (insn);
 
-}
+  prev = prev_nonnote_nondebug_insn (insn);
+  if (!prev
+  || !insn_set
+  || !single_set (prev))
+return;
 
   if (targetm.sched.macro_fusion_pair_p (prev, insn))
 SCHED_GROUP_P (insn) = 1;
-
 }
 
 /* Get the implicit reg pending clobbers for INSN and save them in TEMP.  */


Re: [PATCH][AArch64] Implement ALU_BRANCH fusion

2017-03-15 Thread Hurugalawadi, Naveen
Hi Kyrill,

>> I suggest you reword the whole comment and not talk about particular CPUs
>> but rather about the kinds of instructions you want to fuse

Modified as per the comments. I had modified the earlier version of the
patch, which still had the vulcan reservation, before James's comments.

Please find attached the modified patch with comments incorporated.

Thanks,
Naveen


diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def b/gcc/config/aarch64/aarch64-fusion-pairs.def
index f0e6dbc..300cd00 100644
--- a/gcc/config/aarch64/aarch64-fusion-pairs.def
+++ b/gcc/config/aarch64/aarch64-fusion-pairs.def
@@ -34,5 +34,6 @@ AARCH64_FUSION_PAIR ("movk+movk", MOVK_MOVK)
 AARCH64_FUSION_PAIR ("adrp+ldr", ADRP_LDR)
 AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH)
 AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC)
+AARCH64_FUSION_PAIR ("alu+branch", ALU_BRANCH)
 
 #undef AARCH64_FUSION_PAIR
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index a069427..3af0b1a 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -792,7 +792,8 @@ static const struct tune_params thunderx2t99_tunings =
   &generic_approx_modes,
   4, /* memmov_cost.  */
   4, /* issue_rate.  */
-  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
+  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
+   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
   16,	/* function_align.  */
   8,	/* jump_align.  */
   16,	/* loop_align.  */
@@ -13981,6 +13982,35 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
 return true;
 }
 
+  if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
+  && any_condjump_p (curr))
+{
+  /* Fuse ALU operations followed by conditional branch instruction.  */
+  switch (get_attr_type (prev))
+	{
+	case TYPE_ALU_IMM:
+	case TYPE_ALU_SREG:
+	case TYPE_ADC_REG:
+	case TYPE_ADC_IMM:
+	case TYPE_ADCS_REG:
+	case TYPE_ADCS_IMM:
+	case TYPE_LOGIC_REG:
+	case TYPE_LOGIC_IMM:
+	case TYPE_CSEL:
+	case TYPE_ADR:
+	case TYPE_MOV_IMM:
+	case TYPE_SHIFT_REG:
+	case TYPE_SHIFT_IMM:
+	case TYPE_BFM:
+	case TYPE_RBIT:
+	case TYPE_REV:
+	case TYPE_EXTEND:
+	  return true;
+
+	default:;
+	}
+}
+
   return false;
 }
 


Re: [PATCH][AArch64] Implement ALU_BRANCH fusion

2017-03-14 Thread Hurugalawadi, Naveen
Hi James,

>> My reason for asking is that the instruction fusion implemented in LLVM
>> ( lib/Target/AArch64/AArch64MacroFusion.cpp::shouldScheduleAdjacent )

Sorry. There seems to be some confusion in the branch instructions.
The branch should be conditional for ALU_BRANCH fusion.

Please find attached the modified patch that fuses ALU instructions and
conditional branches.

Bootstrapped and Regression tested on aarch64-thunder-linux.
Please review the patch and let us know if it's okay?

Thanks,
Naveen
diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def b/gcc/config/aarch64/aarch64-fusion-pairs.def
index f0e6dbc..300cd00 100644
--- a/gcc/config/aarch64/aarch64-fusion-pairs.def
+++ b/gcc/config/aarch64/aarch64-fusion-pairs.def
@@ -34,5 +34,6 @@ AARCH64_FUSION_PAIR ("movk+movk", MOVK_MOVK)
 AARCH64_FUSION_PAIR ("adrp+ldr", ADRP_LDR)
 AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH)
 AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC)
+AARCH64_FUSION_PAIR ("alu+branch", ALU_BRANCH)
 
 #undef AARCH64_FUSION_PAIR
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index a069427..f76a2ff 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -792,7 +792,8 @@ static const struct tune_params thunderx2t99_tunings =
   &generic_approx_modes,
   4, /* memmov_cost.  */
   4, /* issue_rate.  */
-  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
+  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
+   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
   16,	/* function_align.  */
   8,	/* jump_align.  */
   16,	/* loop_align.  */
@@ -13981,6 +13982,37 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
 return true;
 }
 
+  if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
+  && any_condjump_p (curr))
+{
+  /* These types correspond to the reservation "vulcan_alu_basic" for
+	 Broadcom Vulcan: these are ALU operations that produce a single uop
+	 during instruction decoding.  */
+  switch (get_attr_type (prev))
+	{
+	case TYPE_ALU_IMM:
+	case TYPE_ALU_SREG:
+	case TYPE_ADC_REG:
+	case TYPE_ADC_IMM:
+	case TYPE_ADCS_REG:
+	case TYPE_ADCS_IMM:
+	case TYPE_LOGIC_REG:
+	case TYPE_LOGIC_IMM:
+	case TYPE_CSEL:
+	case TYPE_ADR:
+	case TYPE_MOV_IMM:
+	case TYPE_SHIFT_REG:
+	case TYPE_SHIFT_IMM:
+	case TYPE_BFM:
+	case TYPE_RBIT:
+	case TYPE_REV:
+	case TYPE_EXTEND:
+	  return true;
+
+	default:;
+	}
+}
+
   return false;
 }
 


Re: [PATCH][AArch64] Add neon_pairwise_add & neon_pairwise_add_q types

2017-03-09 Thread Hurugalawadi, Naveen
Hi James,

> The whitespace in various places in this patch is inconsistent with the
> whitespace around the modified line. For example:

Fixed the whitespace.

>> So this patch isn't OK without fixes for the models
>> in cortex-a53.md and exynos-m1.md

Thanks for pointing out the missing cores in patch.
Added the support as per your comments.

Please find attached the modified patch and let us know
if it's okay for stage1?

Thanks,
Naveen

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7ad3a76..4e378d3 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2101,7 +2101,7 @@
 		UNSPEC_ADDV))]
  "TARGET_SIMD"
  "add\\t%0, %1."
-  [(set_attr "type" "neon_reduc_add")]
+  [(set_attr "type" "neon__add")]
 )
 
 (define_insn "aarch64_reduc_plus_internalv2si"
@@ -2110,7 +2110,7 @@
 		UNSPEC_ADDV))]
  "TARGET_SIMD"
  "addp\\t%0.2s, %1.2s, %1.2s"
-  [(set_attr "type" "neon_reduc_add")]
+  [(set_attr "type" "neon_pairwise_add")]
 )
 
 (define_insn "reduc_plus_scal_"
@@ -4405,7 +4405,7 @@
   UNSPEC_ADDP))]
   "TARGET_SIMD"
   "addp\t%0, %1, %2"
-  [(set_attr "type" "neon_reduc_add")]
+  [(set_attr "type" "neon_pairwise_add")]
 )
 
 (define_insn "aarch64_addpdi"
@@ -4415,7 +4415,7 @@
   UNSPEC_ADDP))]
   "TARGET_SIMD"
   "addp\t%d0, %1.2d"
-  [(set_attr "type" "neon_reduc_add")]
+  [(set_attr "type" "neon_pairwise_add")]
 )
 
 ;; sqrt
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 1ddf6ad..41f2f4c 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -793,6 +793,12 @@
 		  (V2SF "p") (V4SF  "v")
 		  (V4HF "v") (V8HF  "v")])
 
+(define_mode_attr reduc_pairwise [(V8QI "reduc") (V16QI "reduc")
+  (V4HI "reduc") (V8HI "reduc")
+  (V2SI "pairwise") (V4SI "reduc")
+  (V2DI "pairwise") (V2DF "pairwise")
+  (V2SF "pairwise") (V4SF "reduc")])
+
 (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
 (define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
 
diff --git a/gcc/config/aarch64/thunderx.md b/gcc/config/aarch64/thunderx.md
index b67671d..95bfad4 100644
--- a/gcc/config/aarch64/thunderx.md
+++ b/gcc/config/aarch64/thunderx.md
@@ -266,7 +266,8 @@
 
 (define_insn_reservation "thunderx_neon_add" 4
   (and (eq_attr "tune" "thunderx")
-   (eq_attr "type" "neon_reduc_add, neon_reduc_minmax, neon_fp_reduc_add_s, \
+   (eq_attr "type" "neon_reduc_add, neon_pairwise_add, neon_reduc_minmax,\
+			neon_fp_reduc_add_s, \
 			neon_fp_reduc_add_d, neon_fp_to_int_s, neon_fp_to_int_d, \
 			neon_add_halve, neon_sub_halve, neon_qadd, neon_compare, \
 			neon_compare_zero, neon_minmax, neon_abd, neon_add, neon_sub, \
@@ -280,7 +281,8 @@
 
 (define_insn_reservation "thunderx_neon_add_q" 5
   (and (eq_attr "tune" "thunderx")
-   (eq_attr "type" "neon_reduc_add_q, neon_reduc_minmax_q, neon_fp_reduc_add_s_q, \
+   (eq_attr "type" "neon_reduc_add_q, neon_pairwise_add_q,\
+			neon_reduc_minmax_q, neon_fp_reduc_add_s_q, \
 			neon_fp_reduc_add_d_q, neon_fp_to_int_s_q, neon_fp_to_int_d_q, \
 			neon_add_halve_q, neon_sub_halve_q, neon_qadd_q, neon_compare_q, \
 			neon_compare_zero_q, neon_minmax_q, neon_abd_q, neon_add_q, neon_sub_q, \
diff --git a/gcc/config/aarch64/thunderx2t99.md b/gcc/config/aarch64/thunderx2t99.md
index 0dd7199..eb5e02a 100644
--- a/gcc/config/aarch64/thunderx2t99.md
+++ b/gcc/config/aarch64/thunderx2t99.md
@@ -231,6 +231,7 @@
 			neon_abs,neon_abs_q,\
 			neon_add,neon_add_q,\
 			neon_neg,neon_neg_q,\
+			neon_pairwise_add,neon_pairwise_add_q,\
 			neon_add_long,neon_add_widen,\
 			neon_add_halve,neon_add_halve_q,\
 			neon_sub_long,neon_sub_widen,\
diff --git a/gcc/config/arm/cortex-a15-neon.md b/gcc/config/arm/cortex-a15-neon.md
index 73ee84c..e3731ea 100644
--- a/gcc/config/arm/cortex-a15-neon.md
+++ b/gcc/config/arm/cortex-a15-neon.md
@@ -48,6 +48,7 @@
   (eq_attr "type" "neon_add, neon_add_q, neon_add_long,\
neon_add_widen, neon_neg, neon_neg_q,\
neon_reduc_add, neon_reduc_add_q,\
+   neon_pairwise_add, neon_pairwise_add_q,\
neon_reduc_add_long, neon_sub, neon_sub_q,\
neon_sub_long, neon_sub_widen, neon_logic,\
neon_logic_q, neon_tst, neon_tst_q")
diff --git a/gcc/config/arm/cortex-a17-neon.md b/gcc/config/arm/cortex-a17-neon.md
index 29d08de..0eaf6fc 100644
--- a/gcc/config/arm/cortex-a17-neon.md
+++ b/gcc/config/arm/cortex-a17-neon.md
@@ -47,6 +47,7 @@
   (eq_attr "type" "neon_add, neon_add_q, neon_add_long,\
neon_add_widen, neon_neg, neon_neg_q,\
neon_reduc_add, neon_reduc_add_q,\
+   neon_pairwise_add, neon_pairwise_add_q,\
neon_reduc_add_long, neon_sub, neon_sub_q,\
neon_sub_long, 

Re: [PATCH][AArch64] Add crypto_pmull attribute

2017-03-09 Thread Hurugalawadi, Naveen
Hi James,

>> You need to do this for all cores which might be affected by this change,
>> i.e. all those which model neon_mul_d_long.

Thanks for pointing out the missing cores in patch.
Added the support as per your comments.

Please find attached the modified patch and let us know
if it's okay for stage1?

Thanks,
Naveen

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7ad3a76..1aa1b96 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5818,7 +5818,7 @@
 		UNSPEC_PMULL))]
  "TARGET_SIMD && TARGET_CRYPTO"
  "pmull\\t%0.1q, %1.1d, %2.1d"
-  [(set_attr "type" "neon_mul_d_long")]
+  [(set_attr "type" "crypto_pmull")]
 )
 
 (define_insn "aarch64_crypto_pmullv2di"
@@ -5828,5 +5828,5 @@
 		  UNSPEC_PMULL2))]
   "TARGET_SIMD && TARGET_CRYPTO"
   "pmull2\\t%0.1q, %1.2d, %2.2d"
-  [(set_attr "type" "neon_mul_d_long")]
+  [(set_attr "type" "crypto_pmull")]
 )
diff --git a/gcc/config/aarch64/thunderx2t99.md b/gcc/config/aarch64/thunderx2t99.md
index 0dd7199..67011ac 100644
--- a/gcc/config/aarch64/thunderx2t99.md
+++ b/gcc/config/aarch64/thunderx2t99.md
@@ -441,3 +441,8 @@
   (and (eq_attr "tune" "thunderx2t99")
(eq_attr "type" "neon_store2_one_lane,neon_store2_one_lane_q"))
   "thunderx2t99_ls01,thunderx2t99_f01")
+
+(define_insn_reservation "thunderx2t99_pmull" 5
+  (and (eq_attr "tune" "thunderx2t99")
+   (eq_attr "type" "crypto_pmull"))
+  "thunderx2t99_f1")
diff --git a/gcc/config/arm/cortex-a53.md b/gcc/config/arm/cortex-a53.md
index 7cf5fc5..049ac85 100644
--- a/gcc/config/arm/cortex-a53.md
+++ b/gcc/config/arm/cortex-a53.md
@@ -379,7 +379,7 @@
 		 neon_sat_mul_b_long, neon_sat_mul_h_long,\
 		 neon_sat_mul_s_long, neon_sat_mul_h_scalar_q,\
 		 neon_sat_mul_s_scalar_q, neon_sat_mul_h_scalar_long,\
-		 neon_sat_mul_s_scalar_long, neon_mla_b_q,\
+		 neon_sat_mul_s_scalar_long, crypto_pmull, neon_mla_b_q,\
 		 neon_mla_h_q, neon_mla_s_q, neon_mla_b_long,\
 		 neon_mla_h_long, neon_mla_s_long,\
 		 neon_mla_h_scalar_q, neon_mla_s_scalar_q,\
diff --git a/gcc/config/arm/cortex-a57.md b/gcc/config/arm/cortex-a57.md
index fd30758..ebf4a49 100644
--- a/gcc/config/arm/cortex-a57.md
+++ b/gcc/config/arm/cortex-a57.md
@@ -76,7 +76,7 @@
 			   neon_mul_h_scalar_long, neon_mul_s_scalar_long,\
 			   neon_sat_mul_b_long, neon_sat_mul_h_long,\
 			   neon_sat_mul_s_long, neon_sat_mul_h_scalar_long,\
-			   neon_sat_mul_s_scalar_long")
+			   neon_sat_mul_s_scalar_long, crypto_pmull")
 	(const_string "neon_multiply")
 	  (eq_attr "type" "neon_mul_b_q, neon_mul_h_q, neon_mul_s_q,\
 			   neon_mul_h_scalar_q, neon_mul_s_scalar_q,\
diff --git a/gcc/config/arm/crypto.md b/gcc/config/arm/crypto.md
index 46b0715..a5e558b 100644
--- a/gcc/config/arm/crypto.md
+++ b/gcc/config/arm/crypto.md
@@ -81,7 +81,7 @@
  UNSPEC_VMULLP64))]
   "TARGET_CRYPTO"
   "vmull.p64\\t%q0, %P1, %P2"
-  [(set_attr "type" "neon_mul_d_long")]
+  [(set_attr "type" "crypto_pmull")]
 )
 
 (define_insn "crypto_"
diff --git a/gcc/config/arm/exynos-m1.md b/gcc/config/arm/exynos-m1.md
index 5d397cc..b54d4c8 100644
--- a/gcc/config/arm/exynos-m1.md
+++ b/gcc/config/arm/exynos-m1.md
@@ -78,7 +78,7 @@
 			   neon_sat_mul_s_scalar, neon_sat_mul_s_scalar_q,\
 			   neon_sat_mul_b_long, neon_sat_mul_h_long,\
 			   neon_sat_mul_s_long, neon_sat_mul_h_scalar_long,\
-			   neon_sat_mul_s_scalar_long")
+			   neon_sat_mul_s_scalar_long, crypto_pmull")
 	(const_string "neon_multiply")
 
 	  (eq_attr "type" "neon_mla_b, neon_mla_h, neon_mla_s,\
diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
index b0b375c..253f496 100644
--- a/gcc/config/arm/types.md
+++ b/gcc/config/arm/types.md
@@ -539,6 +539,7 @@
 ; crypto_sha1_slow
 ; crypto_sha256_fast
 ; crypto_sha256_slow
+; crypto_pmull
 ;
 ; The classification below is for coprocessor instructions
 ;
@@ -1078,6 +1079,7 @@
   crypto_sha1_slow,\
   crypto_sha256_fast,\
   crypto_sha256_slow,\
+  crypto_pmull,\
   coproc"
(const_string "untyped"))
 
diff --git a/gcc/config/arm/xgene1.md b/gcc/config/arm/xgene1.md
index 62a0732..34a13f4 100644
--- a/gcc/config/arm/xgene1.md
+++ b/gcc/config/arm/xgene1.md
@@ -527,5 +527,6 @@
 (define_insn_reservation "xgene1_neon_pmull" 5
   (and (eq_attr "tune" "xgene1")
(eq_attr "type" "neon_mul_d_long,\
-   "))
+			crypto_pmull,\
+		   "))
   "xgene1_decode2op")


Re: [PATCH][AArch64] Implement ALU_BRANCH fusion

2017-03-08 Thread Hurugalawadi, Naveen
Hi James,

Thanks for the review and your comments.

>> I'd need more detail on what types of instruction pairs you
>> are trying to fuse. 

The documentation mentions it as follows:-
Single uop ALU instruction may fuse with adjacent branch instruction in the 
same bundle

>> This comment looks incorrect - there is no vulcan_alu_basic reservation

Modified as per comment.

Please let us know if the description is sufficient?

Thanks,
Naveen

[PATCH][AArch64] Fix type for 1-element load

2017-03-05 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that fixes the type for the 1-element load in AArch64.
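
For context, the alternative being retyped is the one that loads a single
element straight into a vector lane; a minimal sketch (my example, not from
the patch):

#include <arm_neon.h>

/* Typically emits ld1 {v0.s}[1], [x0] -- a one-lane load, hence
   neon_load1_one_lane rather than the whole-register neon_load1_1reg.  */
float32x4_t
set_lane (float32x4_t v, const float *p)
{
  return vld1q_lane_f32 (p, v, 1);
}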

Bootstrapped and Regression tested on aarch64-thunder-linux.
Please review the patch and let us know if it's okay for Stage-1?

Thanks,
Naveen

2017-03-06  Julian Brown  
Naveen H.S  

* config/aarch64/aarch64-simd.md (aarch64_simd_vec_set): Fix
type for 1-element load.

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 878f86a..0443a86 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -561,7 +561,7 @@
 	gcc_unreachable ();
  }
   }
-  [(set_attr "type" "neon_from_gp, neon_ins, neon_load1_1reg")]
+  [(set_attr "type" "neon_from_gp, neon_ins, neon_load1_one_lane")]
 )
 
 (define_insn "*aarch64_simd_vec_copy_lane"


[PATCH][AArch64] Add neon_pairwise_add & neon_pairwise_add_q types

2017-03-05 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that adds the "neon_pairwise_add" &
"neon_pairwise_add_q" types for AArch64.

The patch doesn't change SPEC but improves other benchmarks.
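
To illustrate the distinction being introduced (an example of mine, not part
of the patch): a pairwise add combines adjacent elements of its sources,
while a full reduction folds a vector down to a scalar, and the two can have
different latencies on a given core:

#include <arm_neon.h>

/* Typically emits addp v0.2s, v0.2s, v1.2s -- the new
   neon_pairwise_add type.  */
int32x2_t
pairwise (int32x2_t a, int32x2_t b)
{
  return vpadd_s32 (a, b);
}

/* Typically emits addv s0, v0.4s -- remains neon_reduc_add.  */
int32_t
reduce (int32x4_t a)
{
  return vaddvq_s32 (a);
}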

Bootstrapped and Regression tested on aarch64-thunder-linux.
Please review the patch and let us know if it's okay for Stage-1?

Thanks,
Naveen

2017-03-06  Julian Brown  
Naveen H.S  

* config/aarch64/aarch64-simd.md (aarch64_reduc_plus_internal)
(aarch64_reduc_plus_internalv2si, aarch64_addp, aarch64_addpdi):
Use neon_pairwise_add/neon_pairwise_add_q as appropriate.
* config/aarch64/iterators.md (reduc_pairwise): New mode attribute.
* config/aarch64/thunderx.md (thunderx_neon_add, thunderx_neon_add_q):
Tweak for neon_pairwise_add split.
* config/aarch64/thunderx2t99.md (thunderx2t99_asimd_int): Add
neon_pairwise_add/neon_pairwise_add_q types.
* config/arm/cortex-a15-neon.md (cortex_a15_neon_type): Likewise.
* config/arm/cortex-a17-neon.md (cortex_a17_neon_type): Likewise.
* config/arm/cortex-a57.md (cortex_a57_neon_type): Likewise.
* config/arm/cortex-a8-neon.md (cortex_a8_neon_type): Likewise.
* config/arm/cortex-a9-neon.md (cortex_a9_neon_type): Likewise.
* config/arm/xgene1.md (xgene1_neon_arith): Likewise.
* config/arm/types.md (neon_pairwise_add, neon_pairwise_add_q): Add.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 338b9f8..878f86a 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2101,7 +2101,7 @@
 		UNSPEC_ADDV))]
  "TARGET_SIMD"
  "add\\t%0, %1."
-  [(set_attr "type" "neon_reduc_add")]
+  [(set_attr "type" "neon__add")]
 )
 
 (define_insn "aarch64_reduc_plus_internalv2si"
@@ -2110,7 +2110,7 @@
 		UNSPEC_ADDV))]
  "TARGET_SIMD"
  "addp\\t%0.2s, %1.2s, %1.2s"
-  [(set_attr "type" "neon_reduc_add")]
+  [(set_attr "type" "neon_pairwise_add")]
 )
 
 (define_insn "reduc_plus_scal_"
@@ -4405,7 +4405,7 @@
   UNSPEC_ADDP))]
   "TARGET_SIMD"
   "addp\t%0, %1, %2"
-  [(set_attr "type" "neon_reduc_add")]
+  [(set_attr "type" "neon_pairwise_add")]
 )
 
 (define_insn "aarch64_addpdi"
@@ -4415,7 +4415,7 @@
   UNSPEC_ADDP))]
   "TARGET_SIMD"
   "addp\t%d0, %1.2d"
-  [(set_attr "type" "neon_reduc_add")]
+  [(set_attr "type" "neon_pairwise_add")]
 )
 
 ;; sqrt
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index c59d31e..c829cb5 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -790,6 +790,12 @@
 		  (V2SF "p") (V4SF  "v")
 		  (V4HF "v") (V8HF  "v")])
 
+(define_mode_attr reduc_pairwise [(V8QI "reduc") (V16QI "reduc")
+  (V4HI "reduc") (V8HI "reduc")
+  (V2SI "pairwise") (V4SI "reduc")
+  (V2DI "pairwise") (V2DF "pairwise")
+  (V2SF "pairwise") (V4SF "reduc")])
+
 (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
 (define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
 
diff --git a/gcc/config/aarch64/thunderx.md b/gcc/config/aarch64/thunderx.md
index b67671d..95bfad4 100644
--- a/gcc/config/aarch64/thunderx.md
+++ b/gcc/config/aarch64/thunderx.md
@@ -266,7 +266,8 @@
 
 (define_insn_reservation "thunderx_neon_add" 4
   (and (eq_attr "tune" "thunderx")
-   (eq_attr "type" "neon_reduc_add, neon_reduc_minmax, neon_fp_reduc_add_s, \
+   (eq_attr "type" "neon_reduc_add, neon_pairwise_add, neon_reduc_minmax,\
+			neon_fp_reduc_add_s, \
 			neon_fp_reduc_add_d, neon_fp_to_int_s, neon_fp_to_int_d, \
 			neon_add_halve, neon_sub_halve, neon_qadd, neon_compare, \
 			neon_compare_zero, neon_minmax, neon_abd, neon_add, neon_sub, \
@@ -280,7 +281,8 @@
 
 (define_insn_reservation "thunderx_neon_add_q" 5
   (and (eq_attr "tune" "thunderx")
-   (eq_attr "type" "neon_reduc_add_q, neon_reduc_minmax_q, neon_fp_reduc_add_s_q, \
+   (eq_attr "type" "neon_reduc_add_q, neon_pairwise_add_q,\
+			neon_reduc_minmax_q, neon_fp_reduc_add_s_q, \
 			neon_fp_reduc_add_d_q, neon_fp_to_int_s_q, neon_fp_to_int_d_q, \
 			neon_add_halve_q, neon_sub_halve_q, neon_qadd_q, neon_compare_q, \
 			neon_compare_zero_q, neon_minmax_q, neon_abd_q, neon_add_q, neon_sub_q, \
diff --git a/gcc/config/aarch64/thunderx2t99.md b/gcc/config/aarch64/thunderx2t99.md
index 67011ac..f807547 100644
--- a/gcc/config/aarch64/thunderx2t99.md
+++ b/gcc/config/aarch64/thunderx2t99.md
@@ -231,6 +231,7 @@
 			neon_abs,neon_abs_q,\
 			neon_add,neon_add_q,\
 			neon_neg,neon_neg_q,\
+			neon_pairwise_add,neon_pairwise_add_q,\
 			neon_add_long,neon_add_widen,\
 			neon_add_halve,neon_add_halve_q,\
 			neon_sub_long,neon_sub_widen,\
diff --git a/gcc/config/arm/cortex-a15-neon.md b/gcc/config/arm/cortex-a15-neon.md
index 73ee84c..1a02fa2 100644
--- a/gcc/config/arm/cortex-a15-neon.md
+++ b/gcc/config/arm/cortex-a15-neon.md
@@ -48,6 +48,7 @@
   (eq_attr "type" "neon_add, neon_add_q, 

[PATCH] [AArch64] Implement automod load and store for Thunderx2t99

2017-03-05 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that implements automod load and store for
Thunderx2t99.
The patch doesn't change SPEC but improves other benchmarks.
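
The scheduling property being modelled (illustrated below with a sketch of
mine; actual codegen may vary) is that a post-modify load produces two
results with different latencies: the written-back address is ready much
earlier than the loaded value.

/* With -O2 the loop body typically becomes ldr x2, [x0], #8 followed by
   an add: the next iteration's address (x0) depends only on the
   write-back, not on the loaded value (x2), so it need not wait for the
   full load latency -- which the new bypasses express.  */
long
sum (const long *p, long n)
{
  long s = 0;
  while (n--)
    s += *p++;
  return s;
}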

Bootstrapped and Regression tested on aarch64-thunder-linux.
Please review the patch and let us know if it's okay for Stage-1?

Thanks,
Naveen

2017-03-06  Julian Brown  
Naveen H.S  

* config/aarch64/aarch64-protos.h (aarch64_automod_addr_only_dep): Add
prototype.
* config/aarch64/aarch64.c (aarch64_automod_addr_only_dep): New
function.
* config/aarch64/thunderx2t99.md (thunderx2t99_load_basic)
(thunderx2t99_store_basic, thunderx2t99_storepair_basic)
(thunderx2t99_fp_load_basic, thunderx2t99_fp_loadpair_basic)
(thunderx2t99_fp_storepair_basic): Add aarch64_mem_type_p test.
(thunderx2t99_load_automod, thunderx2t99_load_regoffset)
(thunderx2t99_load_scale_ext, thunderx2t99_store_automod)
(thunderx2t99_store_regoffset_scale_ext, thunderx2t99_fp_load_automod)
(thunderx2t99_storepair_automod, thunderx2t99_fp_load_regoffset)
(thunderx2t99_fp_load_scale_ext, thunderx2t99_fp_loadpair_automod)
(thunderx2t99_fp_store_automod, thunderx2t99_fp_storepair_automod)
(thunderx2t99_fp_store_regoffset_scale_ext): New insn reservations.
(thunderx2t99_load_automod, thunderx2t99_fp_load_automod)
(thunderx2t99_fp_loadpair_automod): Add bypass for output address-only
dependencies.
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index e045df8..7472d98 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -488,5 +488,6 @@ std::string aarch64_get_extension_string_for_isa_flags (unsigned long,
 			unsigned long);
 
 rtl_opt_pass *make_pass_fma_steering (gcc::context *ctxt);
+int aarch64_automod_addr_only_dep (rtx_insn *, rtx_insn *);
 
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 62f5461..c674c51 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14875,6 +14875,94 @@ aarch64_run_selftests (void)
 
 #endif /* #if CHECKING_P */
 
+/* Return nonzero if the CONSUMER has a dependency only on an automodify
+   address in PRODUCER (a load instruction, i.e. the dependency is not on the
+   loaded value).  */
+
+int
+aarch64_automod_addr_only_dep (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx prod_set = single_set (producer);
+
+  if (prod_set)
+{
+  rtx dst, src = SET_SRC (prod_set);
+
+  if (GET_CODE (src) == ZERO_EXTEND || GET_CODE (src) == SIGN_EXTEND)
+	src = XEXP (src, 0);
+
+  gcc_assert (MEM_P (src));
+
+  dst = XEXP (prod_set, 0);
+
+  rtx cons_set = single_set (consumer);
+  rtx cons_pat = PATTERN (consumer);
+
+  if (cons_set)
+	return !reg_overlap_mentioned_p (dst, cons_set);
+  else if (GET_CODE (cons_pat) == PARALLEL)
+	{
+	  for (int i = 0; i < XVECLEN (cons_pat, 0); i++)
+	{
+	  rtx set = XVECEXP (cons_pat, 0, i);
+
+	  if (GET_CODE (set) != SET)
+		continue;
+
+	  if (reg_overlap_mentioned_p (dst, set))
+		return 0;
+	}
+	}
+  else
+	return 0;
+}
+  else if (GET_CODE (PATTERN (producer)) == PARALLEL)
+{
+  rtx prod_pat = PATTERN (producer);
+  rtx cons_set = single_set (consumer);
+  rtx cons_pat = PATTERN (consumer);
+
+  for (int i = 0; i < XVECLEN (prod_pat, 0); i++)
+	{
+	  prod_set = XVECEXP (prod_pat, 0, i);
+
+	  if (GET_CODE (prod_set) == SET)
+	{
+	  rtx src = XEXP (prod_set, 1), dst = XEXP (prod_set, 0);
+
+	  if (GET_CODE (src) == ZERO_EXTEND
+		  || GET_CODE (src) == SIGN_EXTEND)
+		src = XEXP (src, 0);
+
+	  gcc_assert (MEM_P (src));
+
+	  if (cons_set)
+		{
+		  if (reg_overlap_mentioned_p (dst, cons_set))
+		return 0;
+		}
+	  else if (GET_CODE (cons_pat) == PARALLEL)
+		{
+		  for (int i = 0; i < XVECLEN (cons_pat, 0); i++)
+		{
+		  rtx set = XVECEXP (cons_pat, 0, i);
+
+		  if (GET_CODE (set) != SET)
+		continue;
+
+		  if (reg_overlap_mentioned_p (dst, set))
+			return 0;
+		}
+		}
+	  else
+		return 0;
+	}
+	}
+}
+
+  return 1;
+}
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
diff --git a/gcc/config/aarch64/thunderx2t99.md b/gcc/config/aarch64/thunderx2t99.md
index 936078c..add3707 100644
--- a/gcc/config/aarch64/thunderx2t99.md
+++ b/gcc/config/aarch64/thunderx2t99.md
@@ -123,24 +123,73 @@
 
 (define_insn_reservation "thunderx2t99_load_basic" 4
   (and (eq_attr "tune" "thunderx2t99")
-   (eq_attr "type" "load1"))
+   (eq_attr "type" "load1")
+   (match_test "aarch64_mem_type_p (insn, AARCH64_ADDR_SYMBOLIC
+	  | AARCH64_ADDR_REG_IMM
+	  | AARCH64_ADDR_LO_SUM)"))
   "thunderx2t99_ls01")
 
+(define_insn_reservation "thunderx2t99_load_automod" 4
+  (and 

[PATCH][AArch64] Add aes and sha reservations for Thunderx2t99

2017-03-05 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that adds aes and sha reservations for
Thunderx2t99.

Bootstrapped and Regression tested on aarch64-thunder-linux.
Please review the patch and let us know if it's okay for Stage-1?

Thanks,
Naveen

2017-03-06  Julian Brown  
    Naveen H.S  

* config/aarch64/thunderx2t99.md (thunderx2t99_aes, thunderx2t99_sha): New
reservations.

diff --git a/gcc/config/aarch64/thunderx2t99.md b/gcc/config/aarch64/thunderx2t99.md
index f807547..2eb136b 100644
--- a/gcc/config/aarch64/thunderx2t99.md
+++ b/gcc/config/aarch64/thunderx2t99.md
@@ -443,7 +443,22 @@
(eq_attr "type" "neon_store2_one_lane,neon_store2_one_lane_q"))
   "thunderx2t99_ls01,thunderx2t99_f01")
 
+;; Crypto extensions.
+
+; FIXME: Forwarding path for aese/aesmc or aesd/aesimc pairs?
+
+(define_insn_reservation "thunderx2t99_aes" 5
+  (and (eq_attr "tune" "thunderx2t99")
+   (eq_attr "type" "crypto_aese,crypto_aesmc"))
+  "thunderx2t99_f1")
+
 (define_insn_reservation "thunderx2t99_pmull" 5
   (and (eq_attr "tune" "thunderx2t99")
(eq_attr "type" "crypto_pmull"))
   "thunderx2t99_f1")
+
+(define_insn_reservation "thunderx2t99_sha" 7
+  (and (eq_attr "tune" "thunderx2t99")
+   (eq_attr "type" "crypto_sha1_fast,crypto_sha1_xor,crypto_sha1_slow,\
+			crypto_sha256_fast,crypto_sha256_slow"))
+  "thunderx2t99_f1")


[PATCH][AArch64] Implement ALU_BRANCH fusion

2017-03-05 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that implements alu_branch fusion
for AArch64.
The patch doesn't change SPEC but improves other benchmarks.

Bootstrapped and Regression tested on aarch64-thunder-linux.
Please review the patch and let us know if it's okay for Stage-1?

Thanks,
Naveen

2017-03-06  Julian Brown  
Naveen H.S  

* config/aarch64/aarch64-fusion-pairs.def: Add ALU_BRANCH entry.
* config/aarch64/aarch64.c (AARCH64_FUSE_ALU_BRANCH): New fusion type.
(thunderx2t99_tunings): Set AARCH64_FUSE_ALU_BRANCH flag.
(aarch_macro_fusion_pair_p): Add support for AARCH64_FUSE_ALU_BRANCH.
diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def b/gcc/config/aarch64/aarch64-fusion-pairs.def
index f0e6dbc..300cd00 100644
--- a/gcc/config/aarch64/aarch64-fusion-pairs.def
+++ b/gcc/config/aarch64/aarch64-fusion-pairs.def
@@ -34,5 +34,6 @@ AARCH64_FUSION_PAIR ("movk+movk", MOVK_MOVK)
 AARCH64_FUSION_PAIR ("adrp+ldr", ADRP_LDR)
 AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH)
 AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC)
+AARCH64_FUSION_PAIR ("alu+branch", ALU_BRANCH)
 
 #undef AARCH64_FUSION_PAIR
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index fa25d43..62f5461 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -792,7 +792,8 @@ static const struct tune_params thunderx2t99_tunings =
   &generic_approx_modes,
   4, /* memmov_cost.  */
   4, /* issue_rate.  */
-  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
+  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
+   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
   16,	/* function_align.  */
   8,	/* jump_align.  */
   16,	/* loop_align.  */
@@ -14063,6 +14064,37 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
 return true;
 }
 
+  if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
+  && any_uncondjump_p (curr))
+{
+  /* These types correspond to the reservation "vulcan_alu_basic" for
+	 Broadcom Vulcan: these are ALU operations that produce a single uop
+	 during instruction decoding.  */
+  switch (get_attr_type (prev))
+	{
+	case TYPE_ALU_IMM:
+	case TYPE_ALU_SREG:
+	case TYPE_ADC_REG:
+	case TYPE_ADC_IMM:
+	case TYPE_ADCS_REG:
+	case TYPE_ADCS_IMM:
+	case TYPE_LOGIC_REG:
+	case TYPE_LOGIC_IMM:
+	case TYPE_CSEL:
+	case TYPE_ADR:
+	case TYPE_MOV_IMM:
+	case TYPE_SHIFT_REG:
+	case TYPE_SHIFT_IMM:
+	case TYPE_BFM:
+	case TYPE_RBIT:
+	case TYPE_REV:
+	case TYPE_EXTEND:
+	  return true;
+
+	default:;
+	}
+}
+
   return false;
 }
 


[PATCH][AArch64] Add crypto_pmull attribute

2017-03-05 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that adds "crypto_pmull" for AArch64.
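
For reference (my illustration, not part of the patch), the instructions
being reclassified are the polynomial multiplies reachable through the ACLE
intrinsics:

#include <arm_neon.h>

/* Requires a +crypto target.  Typically emits
   pmull v0.1q, v0.1d, v1.1d, previously typed neon_mul_d_long and now
   crypto_pmull.  */
poly128_t
pmull_low (poly64_t a, poly64_t b)
{
  return vmull_p64 (a, b);
}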

Bootstrapped and Regression tested on aarch64-thunder-linux.

Please review the patch and let us know if it's okay for Stage-1?

Thanks,
Naveen

2017-03-06  Julian Brown  
Naveen H.S  

* config/aarch64/aarch64-simd.md (aarch64_crypto_pmulldi)
(aarch64_crypto_pmullv2di): Change type attribute to crypto_pmull.
* config/aarch64/thunderx2t99.md (thunderx2t99_pmull): New
reservation.
* config/arm/cortex-a57.md (cortex_a57_neon_type): Add crypto_pmull to
attribute type list for neon_multiply.
* config/arm/crypto.md (crypto_vmullp64): Change type to crypto_pmull.
* config/arm/types.md (crypto_pmull): Add.
* config/arm/xgene1.md (xgene1_neon_pmull): Add crypto_pmull to
attribute type list.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index b61f79a..338b9f8 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5818,7 +5818,7 @@
 		UNSPEC_PMULL))]
  "TARGET_SIMD && TARGET_CRYPTO"
  "pmull\\t%0.1q, %1.1d, %2.1d"
-  [(set_attr "type" "neon_mul_d_long")]
+  [(set_attr "type" "crypto_pmull")]
 )
 
 (define_insn "aarch64_crypto_pmullv2di"
@@ -5828,5 +5828,5 @@
 		  UNSPEC_PMULL2))]
   "TARGET_SIMD && TARGET_CRYPTO"
   "pmull2\\t%0.1q, %1.2d, %2.2d"
-  [(set_attr "type" "neon_mul_d_long")]
+  [(set_attr "type" "crypto_pmull")]
 )
diff --git a/gcc/config/aarch64/thunderx2t99.md b/gcc/config/aarch64/thunderx2t99.md
index 0dd7199..67011ac 100644
--- a/gcc/config/aarch64/thunderx2t99.md
+++ b/gcc/config/aarch64/thunderx2t99.md
@@ -441,3 +441,8 @@
   (and (eq_attr "tune" "thunderx2t99")
(eq_attr "type" "neon_store2_one_lane,neon_store2_one_lane_q"))
   "thunderx2t99_ls01,thunderx2t99_f01")
+
+(define_insn_reservation "thunderx2t99_pmull" 5
+  (and (eq_attr "tune" "thunderx2t99")
+   (eq_attr "type" "crypto_pmull"))
+  "thunderx2t99_f1")
diff --git a/gcc/config/arm/cortex-a57.md b/gcc/config/arm/cortex-a57.md
index fd30758..ebf4a49 100644
--- a/gcc/config/arm/cortex-a57.md
+++ b/gcc/config/arm/cortex-a57.md
@@ -76,7 +76,7 @@
 			   neon_mul_h_scalar_long, neon_mul_s_scalar_long,\
 			   neon_sat_mul_b_long, neon_sat_mul_h_long,\
 			   neon_sat_mul_s_long, neon_sat_mul_h_scalar_long,\
-			   neon_sat_mul_s_scalar_long")
+			   neon_sat_mul_s_scalar_long, crypto_pmull")
 	(const_string "neon_multiply")
 	  (eq_attr "type" "neon_mul_b_q, neon_mul_h_q, neon_mul_s_q,\
 			   neon_mul_h_scalar_q, neon_mul_s_scalar_q,\
diff --git a/gcc/config/arm/crypto.md b/gcc/config/arm/crypto.md
index 46b0715..a5e558b 100644
--- a/gcc/config/arm/crypto.md
+++ b/gcc/config/arm/crypto.md
@@ -81,7 +81,7 @@
  UNSPEC_VMULLP64))]
   "TARGET_CRYPTO"
   "vmull.p64\\t%q0, %P1, %P2"
-  [(set_attr "type" "neon_mul_d_long")]
+  [(set_attr "type" "crypto_pmull")]
 )
 
 (define_insn "crypto_"
diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
index b0b375c..253f496 100644
--- a/gcc/config/arm/types.md
+++ b/gcc/config/arm/types.md
@@ -539,6 +539,7 @@
 ; crypto_sha1_slow
 ; crypto_sha256_fast
 ; crypto_sha256_slow
+; crypto_pmull
 ;
 ; The classification below is for coprocessor instructions
 ;
@@ -1078,6 +1079,7 @@
   crypto_sha1_slow,\
   crypto_sha256_fast,\
   crypto_sha256_slow,\
+  crypto_pmull,\
   coproc"
(const_string "untyped"))
 
diff --git a/gcc/config/arm/xgene1.md b/gcc/config/arm/xgene1.md
index 62a0732..34a13f4 100644
--- a/gcc/config/arm/xgene1.md
+++ b/gcc/config/arm/xgene1.md
@@ -527,5 +527,6 @@
 (define_insn_reservation "xgene1_neon_pmull" 5
   (and (eq_attr "tune" "xgene1")
(eq_attr "type" "neon_mul_d_long,\
-   "))
+			crypto_pmull,\
+		   "))
   "xgene1_decode2op")


[PATCH][AArch64] Add crc reservations for Thunderx2t99

2017-03-05 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that adds crc reservations for Thunderx2t99.

Bootstrapped and Regression tested on aarch64-thunder-linux.
Please review the patch and let us know if it's okay for Stage-1?

Thanks,
Naveen

2017-03-06  Julian Brown  
Naveen H.S  

* config/aarch64/thunderx2t99.md (thunderx2t99_crc): New Reservation.
diff --git a/gcc/config/aarch64/thunderx2t99.md b/gcc/config/aarch64/thunderx2t99.md
index 2eb136b..936078c 100644
--- a/gcc/config/aarch64/thunderx2t99.md
+++ b/gcc/config/aarch64/thunderx2t99.md
@@ -462,3 +462,10 @@
(eq_attr "type" "crypto_sha1_fast,crypto_sha1_xor,crypto_sha1_slow,\
 			crypto_sha256_fast,crypto_sha256_slow"))
   "thunderx2t99_f1")
+
+;; CRC extension.
+
+(define_insn_reservation "thunderx2t99_crc" 4
+  (and (eq_attr "tune" "thunderx2t99")
+   (eq_attr "type" "crc"))
+  "thunderx2t99_i1")


[PATCH][AArch64] Add addr_type attribute

2017-03-05 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that adds "addr_type" attribute 
for AArch64.

The patch doesn't change SPEC but improves other benchmarks.

Bootstrapped and Regression tested on aarch64-thunder-linux.
Please review the patch and let us know if it's okay for Stage-1?

Thanks,
Naveen

2017-03-06  Julian Brown  
Naveen H.S  

* config/aarch64/aarch64-protos.h (AARCH64_ADDR_REG_IMM)
(AARCH64_ADDR_REG_WB, AARCH64_ADDR_REG_REG, AARCH64_ADDR_REG_SHIFT)
(AARCH64_ADDR_REG_EXT, AARCH64_ADDR_REG_SHIFT_EXT, AARCH64_ADDR_LO_SUM)
(AARCH64_ADDR_SYMBOLIC): New.
(aarch64_mem_type_p): Add prototype.
* config/aarch64/aarch64.c (aarch64_mem_type_p): New function.
* config/aarch64/aarch64.md (addr_type): New attribute.
(prefetch, *mov_aarch64, *movsi_aarch64, *movdi_aarch64)
(*movti_aarch64, *movtf_aarch64, *movsf_aarch64, *movdf_aarch64)
(load_pairsi, load_pairdi, store_pairsi, store_pairdi, load_pairsf)
(load_pairdf, store_pairsf)
(store_pairdf, loadwb_pair_)
(storewb_pair_, extendsidi2_aarch64)
(*load_pair_extendsidi2_aarch64, *zero_extendsidi2_aarch64)
(*load_pair_zero_extendsidi2_aarch64)
(*extend2_aarch64)
(*zero_extend2_aarch64)
(ldr_got_small_, ldr_got_small_sidi, ldr_got_tiny)
(tlsie_small_, tlsie_small_sidi): Add addr_type attribute.

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 9543f8c..e045df8 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -299,6 +299,19 @@ enum aarch64_parse_opt_result
 
 extern struct tune_params aarch64_tune_params;
 
+/* Mask bits to use for aarch64_mem_type_p.  Unshifted/shifted index
+   register variants are separated for scheduling purposes because the
+   distinction matters on some cores.  */
+
+#define AARCH64_ADDR_REG_IMM		0x01
+#define AARCH64_ADDR_REG_WB		0x02
+#define AARCH64_ADDR_REG_REG		0x04
+#define AARCH64_ADDR_REG_SHIFT		0x08
+#define AARCH64_ADDR_REG_EXT		0x10
+#define AARCH64_ADDR_REG_SHIFT_EXT	0x20
+#define AARCH64_ADDR_LO_SUM		0x40
+#define AARCH64_ADDR_SYMBOLIC		0x80
+
 HOST_WIDE_INT aarch64_initial_elimination_offset (unsigned, unsigned);
 int aarch64_get_condition_code (rtx);
 bool aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode);
@@ -347,6 +360,7 @@ bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
 bool aarch64_simd_valid_immediate (rtx, machine_mode, bool,
    struct simd_immediate_info *);
 bool aarch64_split_dimode_const_store (rtx, rtx);
+bool aarch64_mem_type_p (rtx_insn *, unsigned HOST_WIDE_INT);
 bool aarch64_symbolic_address_p (rtx);
 bool aarch64_uimm12_shift (HOST_WIDE_INT);
 bool aarch64_use_return_insn_p (void);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 714bb79..fa25d43 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -4551,6 +4551,88 @@ aarch64_classify_address (struct aarch64_address_info *info,
 }
 }
 
+/* Return TRUE if INSN uses an address that satisfies any of the (non-strict)
+   addressing modes specified by MASK.  This is intended for use in scheduling
+   models that are sensitive to the form of address used by some particular
+   instruction.  */
+
+bool
+aarch64_mem_type_p (rtx_insn *insn, unsigned HOST_WIDE_INT mask)
+{
+  aarch64_address_info info;
+  bool valid;
+  attr_addr_type addr_type;
+  rtx mem, addr;
+  machine_mode mode;
+
+  addr_type = get_attr_addr_type (insn);
+
+  switch (addr_type)
+{
+case ADDR_TYPE_WB:
+  info.type = ADDRESS_REG_WB;
+  break;
+
+case ADDR_TYPE_LO_SUM:
+  info.type = ADDRESS_LO_SUM;
+  break;
+
+case ADDR_TYPE_OP0:
+case ADDR_TYPE_OP1:
+  extract_insn_cached (insn);
+
+  mem = recog_data.operand[(addr_type == ADDR_TYPE_OP0) ? 0 : 1];
+
+  gcc_assert (MEM_P (mem));
+  
+  addr = XEXP (mem, 0);
+  mode = GET_MODE (mem);
+
+classify:
+      valid = aarch64_classify_address (&info, addr, mode, MEM, false);
+  if (!valid)
+	return false;
+
+  break;
+
+case ADDR_TYPE_OP0ADDR:
+case ADDR_TYPE_OP1ADDR:
+  extract_insn_cached (insn);
+
+  addr = recog_data.operand[(addr_type == ADDR_TYPE_OP0ADDR) ? 0 : 1];
+  mode = DImode;
+  goto classify;
+
+case ADDR_TYPE_NONE:
+  return false;
+}
+
+  switch (info.type)
+{
+case ADDRESS_REG_IMM:
+  return (mask & AARCH64_ADDR_REG_IMM) != 0;
+case ADDRESS_REG_WB:
+  return (mask & AARCH64_ADDR_REG_WB) != 0;
+case ADDRESS_REG_REG:
+  if (info.shift == 0)
+	return (mask & AARCH64_ADDR_REG_REG) != 0;
+  else
+return (mask & AARCH64_ADDR_REG_SHIFT) != 0;
+case ADDRESS_REG_UXTW:
+case ADDRESS_REG_SXTW:
+  if (info.shift == 0)
+	return (mask & AARCH64_ADDR_REG_EXT) != 0;
+  else
+	return (mask & AARCH64_ADDR_REG_SHIFT_EXT) != 0;

Re: [PATCH] [AArch64] PR target/71663 Improve Vector Initialization

2017-02-05 Thread Hurugalawadi, Naveen
Hi,

Please consider this as a personal reminder to review the patch
at the following link and let me know your comments on the same.

https://gcc.gnu.org/ml/gcc-patches/2016-12/msg00718.html

Thanks,
Naveen






Re: [PATCH] [AArch64] Implement popcount pattern

2017-02-02 Thread Hurugalawadi, Naveen
Hi Andrew,

Thanks for clearing the confusion.

> I don't understand this comment and how it relates to your updated patch

foo, foo1 and foo2 generate calls to "popcountdi2" which should have
been "popcountsi2" for foo1. When Kyrill commented on using the
popcountsi2; I was confused :).

Hence, the testcase generally checks for the absence of a call to
"popcount" and the presence of the "cnt" instruction instead.

>> Now of course what should change still is the argument 
>> types to foo1/foo2 

The arguments to foo1 and foo2 are modified as required.

Bootstrapped and regression tested on aarch64-linux-gnu with no regressions.

Please let us know if it's okay for stage 1?

Thanks,
Naveendiff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index a693a3b..684a833 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -3778,6 +3778,39 @@
   }
 )
 
+;; Pop count can be done via the "CNT" instruction in AdvSIMD.
+;;
+;; MOV	v.1d, x0
+;; CNT	v1.8b, v.8b
+;; ADDV b2, v1.8b
+;; MOV	w0, v2.b[0]
+
+(define_expand "popcount2"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "register_operand")]
+  "TARGET_SIMD"
+{
+  rtx v = gen_reg_rtx (V8QImode);
+  rtx v1 = gen_reg_rtx (V8QImode);
+  rtx r = gen_reg_rtx (QImode);
+  rtx in = operands[1];
+  rtx out = operands[0];
+  if (<MODE>mode == SImode)
+{
+  rtx tmp;
+  tmp = gen_reg_rtx (DImode);
+  /* If we have SImode, zero extend to DImode, pop count does
+ not change if we have extra zeros. */
+  emit_insn (gen_zero_extendsidi2 (tmp, in));
+  in = tmp;
+}
+  emit_move_insn (v, gen_lowpart (V8QImode, in));
+  emit_insn (gen_popcountv8qi2 (v1, v));
+  emit_insn (gen_reduc_plus_scal_v8qi (r, v1));
+  emit_insn (gen_zero_extendqi<mode>2 (out, r));
+  DONE;
+})
+
 (define_insn "clrsb2"
   [(set (match_operand:GPI 0 "register_operand" "=r")
 (clrsb:GPI (match_operand:GPI 1 "register_operand" "r")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt.c b/gcc/testsuite/gcc.target/aarch64/popcnt.c
new file mode 100644
index 000..7e95796
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+int
+foo (int x)
+{
+  return __builtin_popcount (x);
+}
+
+long
+foo1 (long x)
+{
+  return __builtin_popcountl (x);
+}
+
+long long
+foo2 (long long x)
+{
+  return __builtin_popcountll (x);
+}
+
+/* { dg-final { scan-assembler-not "popcount" } } */
+/* { dg-final { scan-assembler-times "cnt\t" 3 } } */


Re: [PATCH/AARCH64] Add scheduler for Thunderx2t99

2017-02-01 Thread Hurugalawadi, Naveen
Hi James,

Thanks for reviewing the patch and comments.

>> I wonder whether the current modeling of:
>> (define_insn_reservation "thunderx2t99_asimd_load4_elts" 6
>> Actually benefits the schedule in a meaningful way, or if it just increases

Done. Removed the scheduler modeling for thunderx2t99_asimd_load*_mult and
thunderx2t99_asimd_load*_elts for ld3/ld4 and st3/st4 which are rarely used.

The automaton size has come down drastically without that and hopefully
should be okay.

Automaton `thunderx2t99'
  184 NDFA states,838 NDFA arcs
  184 DFA states, 838 DFA arcs
  184 minimal DFA states, 838 minimal DFA arcs
  360 all insns  8 insn equivalence classes
0 locked states
 1016 transition comb vector els,  1472 trans table els: use simple vect
 1472 min delay table els, compression factor 4

Automaton `thunderx2t99_advsimd'
  453 NDFA states,   1966 NDFA arcs
  453 DFA states,1966 DFA arcs
  351 minimal DFA states,1562 minimal DFA arcs
  360 all insns  7 insn equivalence classes
0 locked states
 1901 transition comb vector els,  2457 trans table els: use simple vect
 2457 min delay table els, compression factor 2

Automaton `thunderx2t99_ldst'
   41 NDFA states,163 NDFA arcs
   41 DFA states, 163 DFA arcs
   14 minimal DFA states,  78 minimal DFA arcs
  360 all insns  8 insn equivalence classes
0 locked states
   83 transition comb vector els,   112 trans table els: use simple vect
  112 min delay table els, compression factor 4

Automaton `thunderx2t99_mult'
2 NDFA states,  5 NDFA arcs
2 DFA states,   5 DFA arcs
2 minimal DFA states,   5 minimal DFA arcs
  360 all insns  3 insn equivalence classes
0 locked states
6 transition comb vector els, 6 trans table els: use simple vect
6 min delay table els, compression factor 8


>> You'll want to update this to use your new scheduling model :-).

Done. I had overlooked it :-).

>> you should be changing vulcan to use the new thunderx2t99 model. 

Done. Using the new thunderx2t99 model.

Please review the modified patch and let us know your comments on the same.

Thanks,
Naveen

thunderx2t99-scheduler.patch
Description: thunderx2t99-scheduler.patch


Re: [PATCH] [AArch64] Enable AES and cmp_branch fusion for Thunderx2t99

2017-02-01 Thread Hurugalawadi, Naveen
Hi Kyrill,

Thanks for the review and comments.

>> but there are a couple of issues with the ChangeLog

2017-02-02  Naveen H.S  

* config/aarch64/aarch64.c (thunderx2t99_tunings): Enable AES and
cmp_branch fusion.

Thanks,
Naveen

Re: [PATCH] [AArch64] Implement popcount pattern

2017-02-01 Thread Hurugalawadi, Naveen
Hi James and Kyrill,

Thanks for the review and comments on the patch.

>> On ILP32 systems this would still use the SImode patterns, 
>> so I suggest you use __builtin_popcountll and
>> an unsigned long long return type to ensure you always exercise the 64-bit 
>> code.

Sorry for not commenting on this part.
The issue is that the code generates "__popcountdi2" for all the
functions in the testcase for both the LP64 and ILP32 variants:
__builtin_popcount, __builtin_popcountl and __builtin_popcountll.

Hence, modified the patch to check for "popcount".

Bootstrapped and regression tested on AArch64-Thunderx-Linux machine.

Please find attached the modified patch and let us know if it's okay?

Thanks,
Naveen



popcount-2.patch
Description: popcount-2.patch


[PATCH] [AArch64] Enable AES and cmp_branch fusion for Thunderx2t99

2017-01-24 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that adds AES and CMP_BRANCH
fusion for Thunderx2t99.
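
For reference, what the two flags ask the scheduler to keep adjacent
(illustrative assembly, hand-written rather than taken from a build):

/* AARCH64_FUSE_AES_AESMC keeps an AES round next to its mix-columns
   step so the core can fuse the pair:
       aese   v0.16b, v1.16b
       aesmc  v0.16b, v0.16b
   AARCH64_FUSE_CMP_BRANCH keeps a compare next to the conditional
   branch that consumes its flags:
       cmp    x0, x1
       b.ne   .Lfound  */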

Bootstrapped and Regression tested on aarch64-thunderx2t99.
Please review the patch and let us know if it's okay?

2017-01-25  Naveen H.S 

gcc
* config/aarch64/aarch64.c (thunderx2t99_tunings):
Improve vector initialization code gen.

Thanks,
Naveendiff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index f343d92..acaa975 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -780,7 +780,7 @@ static const struct tune_params thunderx2t99_tunings =
   &generic_approx_modes,
   4, /* memmov_cost.  */
   4, /* issue_rate.  */
-  AARCH64_FUSE_NOTHING, /* fuseable_ops.  */
+  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
   16,	/* function_align.  */
   8,	/* jump_align.  */
   16,	/* loop_align.  */


[PATCH/AARCH64] Add scheduler for Thunderx2t99

2017-01-11 Thread Hurugalawadi, Naveen
Hi James,

The scheduling patch for vulcan was posted at the following link:-
https://gcc.gnu.org/ml/gcc-patches/2016-07/msg01205.html

We are working on the patch and addressed the comments for thunderx2t99.

>> I tried lowering the repeat expressions as so:
Done.

>>split off the AdvSIMD/FP model from the main pipeline
Done.

>> A change like wiring the vulcan_f0 and vulcan_f1 reservations
>> to be cpu_units of a new define_automaton "vulcan_advsimd"
Done.

>> simplifying some of the remaining large expressions
>> (vulcan_asimd_load*_mult, vulcan_asimd_load*_elts) can bring the size down
Did not understand much about this comment.
Can you please let me know about the simplification?

Please find attached the modified patch as per your suggestions and comments.
Please review the patch and let us know if it's okay?

Thanks,
Naveendiff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index a7a4b33..4d39673 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -75,7 +75,7 @@ AARCH64_CORE("xgene1",  xgene1,xgene1,8A,  AARCH64_FL_FOR_ARCH8, xge
 
 /* Broadcom ('B') cores. */
 AARCH64_CORE("thunderx2t99",  thunderx2t99, cortexa57, 8_1A,  AARCH64_FL_FOR_ARCH8_1 | AARCH64_FL_CRYPTO, thunderx2t99, 0x42, 0x516, -1)
-AARCH64_CORE("vulcan",  vulcan, cortexa57, 8_1A,  AARCH64_FL_FOR_ARCH8_1 | AARCH64_FL_CRYPTO, thunderx2t99, 0x42, 0x516, -1)
+AARCH64_CORE("vulcan",  vulcan, vulcan, 8_1A,  AARCH64_FL_FOR_ARCH8_1 | AARCH64_FL_CRYPTO, thunderx2t99, 0x42, 0x516, -1)
 
 /* V8 big.LITTLE implementations.  */
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index bde4231..063559c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -220,6 +220,7 @@
 (include "../arm/exynos-m1.md")
 (include "thunderx.md")
 (include "../arm/xgene1.md")
+(include "thunderx2t99.md")
 
 ;; ---
 ;; Jumps and other miscellaneous insns
diff --git a/gcc/config/aarch64/thunderx2t99.md b/gcc/config/aarch64/thunderx2t99.md
new file mode 100644
index 000..00d40f8
--- /dev/null
+++ b/gcc/config/aarch64/thunderx2t99.md
@@ -0,0 +1,513 @@
+;; Cavium ThunderX 2 CN99xx pipeline description
+;; Copyright (C) 2016-2017 Free Software Foundation, Inc.
+;;
+;; Contributed by Cavium, Broadcom and Mentor Embedded.
+
+;; This file is part of GCC.
+
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; .
+
+(define_automaton "thunderx2t99, thunderx2t99_advsimd, thunderx2t99_ldst")
+(define_automaton "thunderx2t99_mult")
+
+(define_cpu_unit "thunderx2t99_i0" "thunderx2t99")
+(define_cpu_unit "thunderx2t99_i1" "thunderx2t99")
+(define_cpu_unit "thunderx2t99_i2" "thunderx2t99")
+
+(define_cpu_unit "thunderx2t99_ls0" "thunderx2t99_ldst")
+(define_cpu_unit "thunderx2t99_ls1" "thunderx2t99_ldst")
+(define_cpu_unit "thunderx2t99_sd" "thunderx2t99_ldst")
+
+; Pseudo-units for multiply pipeline.
+
+(define_cpu_unit "thunderx2t99_i1m1" "thunderx2t99_mult")
+(define_cpu_unit "thunderx2t99_i1m2" "thunderx2t99_mult")
+(define_cpu_unit "thunderx2t99_i1m3" "thunderx2t99_mult")
+
+; Pseudo-units for load delay (assuming dcache hit).
+
+(define_cpu_unit "thunderx2t99_ls0d1" "thunderx2t99_ldst")
+(define_cpu_unit "thunderx2t99_ls0d2" "thunderx2t99_ldst")
+(define_cpu_unit "thunderx2t99_ls0d3" "thunderx2t99_ldst")
+
+(define_cpu_unit "thunderx2t99_ls1d1" "thunderx2t99_ldst")
+(define_cpu_unit "thunderx2t99_ls1d2" "thunderx2t99_ldst")
+(define_cpu_unit "thunderx2t99_ls1d3" "thunderx2t99_ldst")
+
+; Make some aliases for f0/f1.
+(define_cpu_unit "thunderx2t99_f0" "thunderx2t99_advsimd")
+(define_cpu_unit "thunderx2t99_f1" "thunderx2t99_advsimd")
+
+(define_reservation "thunderx2t99_i012" "thunderx2t99_i0|thunderx2t99_i1|thunderx2t99_i2")
+(define_reservation "thunderx2t99_ls01" "thunderx2t99_ls0|thunderx2t99_ls1")
+(define_reservation "thunderx2t99_f01" "thunderx2t99_f0|thunderx2t99_f1")
+
+(define_reservation "thunderx2t99_ls_both" "thunderx2t99_ls0+thunderx2t99_ls1")
+
+; A load with delay in the ls0/ls1 pipes.
+(define_reservation "thunderx2t99_l0delay" "thunderx2t99_ls0,\
+  thunderx2t99_ls0d1,thunderx2t99_ls0d2,\
+  thunderx2t99_ls0d3")
+(define_reservation "thunderx2t99_l1delay" "thunderx2t99_ls1,\
+  thunderx2t99_ls1d1,thunderx2t99_ls1d2,\
+  thunderx2t99_ls1d3")

[PATCH] [Match & Simplify] Optimize some minmax patterns

2016-12-15 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that optimizes some min/max patterns
on the same variable combined with constants.
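
A worked example of the folds (illustrative; they rely on signed
overflow being undefined, so a + 8 > a and a - 8 < a always hold):

/* max (a, a + 8) -> a + 8        min (a, a + 8) -> a
   max (a, a - 8) -> a            min (a, a - 8) -> a - 8  */
int fold_max_pos (int a) { return a < a + 8 ? a + 8 : a; } /* a + 8 */
int fold_min_neg (int a) { return a < a - 8 ? a : a - 8; } /* a - 8 */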

Bootstrapped and Regression tested on x86_64 & aarch64-thunder-linux.

Please review the patch and let us know if it's okay?

2016-12-15  Andrew Pinski  
 Naveen H.S 
gcc
* match.pd (max:c @0 (plus@2 @0 INTEGER_CST@1)): New Pattern.
(min:c @0 (plus@2 @0 INTEGER_CST@1)) : New Pattern.
gcc/testsuite
* gcc.dg/max.c: New Testcase.
* gcc.dg/min.c: New Testcase.
   

diff --git a/gcc/match.pd b/gcc/match.pd
index f4cc2d8..ff5e97b 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1324,6 +1324,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 
 /* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax().  */
 
+/* max (a, a + CST) -> a + CST where CST is positive.  */
+/* max (a, a + CST) -> a where CST is negative.  */
+(simplify
+ (max:c @0 (plus@2 @0 INTEGER_CST@1))
+  (if (TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (@0)))
+   (if (tree_int_cst_sgn (@1) > 0)
+    @2
+    @0)))
+
+/* min (a, a + CST) -> a where CST is positive.  */
+/* min (a, a + CST) -> a + CST where CST is negative. */
+(simplify
+ (min:c @0 (plus@2 @0 INTEGER_CST@1))
+  (if (TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (@0)))
+   (if (tree_int_cst_sgn (@1) > 0)
+    @0
+    @2)))
+
 (for minmax (min max FMIN FMAX)
  (simplify
   (minmax @0 @0)
diff --git a/gcc/testsuite/gcc.dg/max.c b/gcc/testsuite/gcc.dg/max.c
new file mode 100644
index 000..e979810
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/max.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+static inline int
+max (int a, int b)
+{
+  return a < b ? b : a;
+}
+
+int
+test_00 (int a)
+{
+  return max (a, a + 8);
+}
+
+int
+test_01 (int a)
+{
+  return max (a, a - 8);
+}
+
+/* { dg-final { scan-tree-dump-not "MAX_EXPR" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/min.c b/gcc/testsuite/gcc.dg/min.c
new file mode 100644
index 000..d847270
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/min.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+static inline int
+min (int a, int b)
+{
+  return a < b ? a : b;
+}
+
+int
+test_00 (int a)
+{
+  return min (a, a + 8);
+}
+
+int
+test_01 (int a)
+{
+  return min (a, a - 8);
+}
+
+/* { dg-final { scan-tree-dump-not "MIN_EXPR" "optimized" } } */


Re: [PATCH] [AArch64] Implement popcount pattern

2016-12-13 Thread Hurugalawadi, Naveen
Hi Kyrill,

Thanks for reviewing the patch and your useful comments.

>> looks good to me if it has gone through the normal required
>> bootstrap and testing, but I can't approve.

Bootstrapped and Regression Tested on aarch64-thunderx-linux.

>> The rest of the MD file uses the term AdvSIMD. Also, the instruction
>> is CNT rather than "pop count".

Done.

>> __builtin_popcount takes an unsigned int, so this should be 
>> scanning for absence of popcountsi2 instead?

Done.

Please find attached the modified patch as per review comments
and let me know if it's okay for Stage-1 or the current branch.

Thanks,
Naveendiff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 65eb326..0acb3f0 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -3785,6 +3785,39 @@
   }
 )
 
+;; Pop count can be done via the "CNT" instruction in AdvSIMD.
+;;
+;; MOV	v.1d, x0
+;; CNT	v1.8b, v.8b
+;; ADDV b2, v1.8b
+;; MOV	w0, v2.b[0]
+
+(define_expand "popcount2"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "register_operand")]
+  "TARGET_SIMD"
+{
+  rtx v = gen_reg_rtx (V8QImode);
+  rtx v1 = gen_reg_rtx (V8QImode);
+  rtx r = gen_reg_rtx (QImode);
+  rtx in = operands[1];
+  rtx out = operands[0];
+  if (<MODE>mode == SImode)
+{
+  rtx tmp;
+  tmp = gen_reg_rtx (DImode);
+  /* If we have SImode, zero extend to DImode, pop count does
+ not change if we have extra zeros. */
+  emit_insn (gen_zero_extendsidi2 (tmp, in));
+  in = tmp;
+}
+  emit_move_insn (v, gen_lowpart (V8QImode, in));
+  emit_insn (gen_popcountv8qi2 (v1, v));
+  emit_insn (gen_reduc_plus_scal_v8qi (r, v1));
+  emit_insn (gen_zero_extendqi<mode>2 (out, r));
+  DONE;
+})
+
 (define_insn "clrsb2"
   [(set (match_operand:GPI 0 "register_operand" "=r")
 (clrsb:GPI (match_operand:GPI 1 "register_operand" "r")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt.c b/gcc/testsuite/gcc.target/aarch64/popcnt.c
new file mode 100644
index 000..37cf4b9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+int
+foo (int x)
+{
+  return __builtin_popcount (x);
+}
+
+long
+foo1 (int x)
+{
+  return __builtin_popcountl (x);
+}
+
+/* { dg-final { scan-assembler-not "popcount" } } */
+/* { dg-final { scan-assembler-times "cnt\t" 2 } } */


[PATCH] [AArch64] Implement popcount pattern

2016-12-11 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that implements the support for popcount
patterns in AArch64.

The implementation improves OVS-DPDK on ThunderX by 3%. It would have a
similar effect on other AArch64 targets.
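
For context, the change in code-gen I would expect for
"int foo (int x) { return __builtin_popcount (x); }" (hand-written
approximation; exact register numbers may differ):

/* before (libcall):          after (this pattern):
       bl   __popcountdi2         uxtw  x0, w0
                                  fmov  d0, x0
                                  cnt   v0.8b, v0.8b
                                  addv  b0, v0.8b
                                  umov  w0, v0.b[0]  */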

Please review the patch and let us know if it's okay?

2016-12-12  Andrew Pinski  

gcc
* config/aarch64/aarch64.md (popcount2): New pattern.

gcc/testsuite
* gcc.target/aarch64/popcount.c : New Testcase.diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 65eb326..c688ddc 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -3785,6 +3785,39 @@
   }
 )
 
+/* Pop count can be done via the pop count instruction in NEON. */
+/*
+  mov v.1d, x0
+  Cnt v1.8b, v.8b
+  Addv b2, v1.8b
+  Mov w0, v2.b[0]
+*/
+(define_expand "popcount2"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "register_operand")]
+  "TARGET_SIMD"
+{
+  rtx v = gen_reg_rtx (V8QImode);
+  rtx v1 = gen_reg_rtx (V8QImode);
+  rtx r = gen_reg_rtx (QImode);
+  rtx in = operands[1];
+  rtx out = operands[0];
+  if (<MODE>mode == SImode)
+{
+  rtx tmp;
+  tmp = gen_reg_rtx (DImode);
+  /* If we have SImode, zero extend to DImode, pop count does
+ not change if we have extra zeros. */
+  emit_insn (gen_zero_extendsidi2 (tmp, in));
+  in = tmp;
+}
+  emit_move_insn (v, gen_lowpart (V8QImode, in));
+  emit_insn (gen_popcountv8qi2 (v1, v));
+  emit_insn (gen_reduc_plus_scal_v8qi (r, v1));
+  emit_insn (gen_zero_extendqi<mode>2 (out, r));
+  DONE;
+})
+
 (define_insn "clrsb2"
   [(set (match_operand:GPI 0 "register_operand" "=r")
 (clrsb:GPI (match_operand:GPI 1 "register_operand" "r")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/popcount.c b/gcc/testsuite/gcc.target/aarch64/popcount.c
new file mode 100644
index 000..2d71168
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcount.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+int foo(int x)
+{
+  return __builtin_popcount(x);
+}
+
+/* { dg-final { scan-assembler-not "popcountdi2" } } */
+/* { dg-final { scan-assembler "cnt\t" } } */


Re: [PATCH] [AArch64] PR target/71663 Improve Vector Initialization

2016-12-08 Thread Hurugalawadi, Naveen
Hi,

Sorry. Missed out the testcase in patch submission.
Added the missing testcase along with the ChangeLog.
Please review the same and let us know if thats okay?

2016-12-09  Andrew Pinski  

gcc
    * config/aarch64/aarch64.c (aarch64_expand_vector_init):
    Improve vector initialization code gen.
gcc/testsuite
* gcc.target/aarch64/pr71663.c: New Testcase.diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e87831f..da5b6fa 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11609,11 +11609,54 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   aarch64_expand_vector_init (target, copy);
 }
 
-  /* Insert the variable lanes directly.  */
-
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
+  /* If there are only variables, try to optimize
+     the insertion using dup for the most common element
+     followed by insertions.  */
+  if (n_var == n_elts && n_elts <= 16)
+{
+  int matches[16][2];
+  int nummatches = 0;
+  memset (matches, 0, sizeof(matches));
+  for(int i = 0; i < n_elts; i++)
+	{
+	  for (int j = 0; j <= i; j++)
+	{
+	  if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
+		{
+		  matches[i][0] = j;
+		  matches[j][1]++;
+		  if (i != j)
+		nummatches++;
+		  break;
+		}
+	}
+	}
+  int maxelement = 0;
+  int maxv = 0;
+  for (int i = 0; i < n_elts; i++)
+	if (matches[i][1] > maxv)
+	  maxelement = i, maxv = matches[i][1];
+
+  /* Create a duplicate of the most common element. */
+  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+  aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
+  /* Insert the rest. */
+  for (int i = 0; i < n_elts; i++)
+	{
+	  rtx x = XVECEXP (vals, 0, i);
+	  if (matches[i][0] == maxelement)
+	continue;
+	  x = copy_to_mode_reg (inner_mode, x);
+	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+	}
+  return;
+}
+
+  /* Insert the variable lanes directly.  */
+
   for (int i = 0; i < n_elts; i++)
 {
   rtx x = XVECEXP (vals, 0, i);
diff --git a/gcc/testsuite/gcc.target/aarch64/pr71663.c b/gcc/testsuite/gcc.target/aarch64/pr71663.c
new file mode 100644
index 000..c8df847
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr71663.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b, float c, float d)
+{
+  return (vector float) { a, b, c, d };
+}
+
+/* { dg-final { scan-assembler-not "movi\t" } } */
+/* { dg-final { scan-assembler-not "orr\t" } } */
+/* { dg-final { scan-assembler-times "ins\t" 3 } } */
+/* { dg-final { scan-assembler-times "dup\t" 1 } } */


[PATCH] [AArch64] PR target/71663 Improve Vector Initialization

2016-12-08 Thread Hurugalawadi, Naveen
Hi,

The AArch64 vector initialization sequence can be optimized to generate
better code. The attached patch handles the case where the vector
contains only variables. It checks for the common elements in the
vector and inserts the values in an optimized way.
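
For illustration (hand-written sketch, not actual compiler output):
for "(vector float) { a, b, c, d }" with the arguments in s0-s3, the
old expansion built the vector with one lane insert per element, while
the new one seeds every lane from the most common element and only
fixes up the remaining lanes:

/* before:  movi v4.4s, #0           after:  dup  v4.4s, v0.s[0]
            ins  v4.s[0], v0.s[0]            ins  v4.s[1], v1.s[0]
            ins  v4.s[1], v1.s[0]            ins  v4.s[2], v2.s[0]
            ins  v4.s[2], v2.s[0]            ins  v4.s[3], v3.s[0]
            ins  v4.s[3], v3.s[0]                                  */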

Bootstrapped and Regression tested on aarch64-thunder-linux.
Please review the patch and let us know if it's okay?

2016-12-09  Andrew Pinski  

gcc
* config/aarch64/aarch64.c (aarch64_expand_vector_init):
Improve vector initialization code gen.diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e87831f..da5b6fa 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11609,11 +11609,54 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   aarch64_expand_vector_init (target, copy);
 }
 
-  /* Insert the variable lanes directly.  */
-
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
+  /* If there are only variables, try to optimize
+     the insertion using dup for the most common element
+     followed by insertions.  */
+  if (n_var == n_elts && n_elts <= 16)
+{
+  int matches[16][2];
+  int nummatches = 0;
+  memset (matches, 0, sizeof(matches));
+  for(int i = 0; i < n_elts; i++)
+	{
+	  for (int j = 0; j <= i; j++)
+	{
+	  if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
+		{
+		  matches[i][0] = j;
+		  matches[j][1]++;
+		  if (i != j)
+		nummatches++;
+		  break;
+		}
+	}
+	}
+  int maxelement = 0;
+  int maxv = 0;
+  for (int i = 0; i < n_elts; i++)
+	if (matches[i][1] > maxv)
+	  maxelement = i, maxv = matches[i][1];
+
+  /* Create a duplicate of the most common element. */
+  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+  aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
+  /* Insert the rest. */
+  for (int i = 0; i < n_elts; i++)
+	{
+	  rtx x = XVECEXP (vals, 0, i);
+	  if (matches[i][0] == maxelement)
+	continue;
+	  x = copy_to_mode_reg (inner_mode, x);
+	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+	}
+  return;
+}
+
+  /* Insert the variable lanes directly.  */
+
   for (int i = 0; i < n_elts; i++)
 {
   rtx x = XVECEXP (vals, 0, i);


Re: [PATCH] [AArch64] Fix PR78382

2016-12-06 Thread Hurugalawadi, Naveen
Hi James,

Thanks for the review and suggestions regarding the testcase.

>> Why limit the ABI and endianness here

Extra options have been dropped and the testcase will check across
all variants and endianness.

Please find attached the modified patch as per the comments and let
me know if its okay?

Thanks,
Naveen

    diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index dab46b5..2b61897 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1378,10 +1378,14 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
 case SYMBOL_SMALL_TLSGD:
   {
 	rtx_insn *insns;
-	rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
+	machine_mode mode = GET_MODE (dest);
+	rtx result = gen_rtx_REG (mode, R0_REGNUM);
 
 	start_sequence ();
-	aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
+	if (TARGET_ILP32)
+	  aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
+	else
+	  aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
 	insns = get_insns ();
 	end_sequence ();
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 1e6b6f5..9d89ee8 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5173,20 +5173,20 @@
 ;; The TLS ABI specifically requires that the compiler does not schedule
 ;; instructions in the TLS stubs, in order to enable linker relaxation.
 ;; Therefore we treat the stubs as an atomic sequence.
-(define_expand "tlsgd_small"
+(define_expand "tlsgd_small_"
  [(parallel [(set (match_operand 0 "register_operand" "")
   (call (mem:DI (match_dup 2)) (const_int 1)))
-	 (unspec:DI [(match_operand:DI 1 "aarch64_valid_symref" "")] UNSPEC_GOTSMALLTLS)
+	 (unspec:DI [(match_operand:PTR 1 "aarch64_valid_symref" "")] UNSPEC_GOTSMALLTLS)
 	 (clobber (reg:DI LR_REGNUM))])]
  ""
 {
   operands[2] = aarch64_tls_get_addr ();
 })
 
-(define_insn "*tlsgd_small"
+(define_insn "*tlsgd_small_"
   [(set (match_operand 0 "register_operand" "")
 	(call (mem:DI (match_operand:DI 2 "" "")) (const_int 1)))
-   (unspec:DI [(match_operand:DI 1 "aarch64_valid_symref" "S")] UNSPEC_GOTSMALLTLS)
+   (unspec:DI [(match_operand:PTR 1 "aarch64_valid_symref" "S")] UNSPEC_GOTSMALLTLS)
(clobber (reg:DI LR_REGNUM))
   ]
   ""
diff --git a/gcc/testsuite/gcc.target/aarch64/pr78382.c b/gcc/testsuite/gcc.target/aarch64/pr78382.c
new file mode 100644
index 000..febe7bc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr78382.c
@@ -0,0 +1,10 @@
+/* { dg-require-effective-target fpic } */
+/* { dg-options "-mtls-dialect=trad -fpic" } */
+
+__thread int abc;
+void
+foo ()
+{
+  int *p;
+  p = &abc;
+}


Re: [PATCH] [AArch64] Fix PR71112

2016-12-06 Thread Hurugalawadi, Naveen
Hi James,

Thanks for the review and suggestions regarding the testcase.

>> Why limit the ABI and endianness here, and if you do plan to do that

Extra options have been dropped and the testcase will check across
all variants and endianness.

Please find attached the modified patch as per the comments and let
me know if its okay?

Thanks,
Naveen

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index dab46b5..9fce849 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1302,7 +1302,8 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
 	emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
 
 	if (mode != GET_MODE (gp_rtx))
-	  gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
+	  gp_rtx = gen_lowpart (mode, gp_rtx);
+
 	  }
 
 	if (mode == ptr_mode)
diff --git a/gcc/testsuite/gcc.c-torture/compile/pr71112.c b/gcc/testsuite/gcc.c-torture/compile/pr71112.c
new file mode 100644
index 000..69e2df6
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/compile/pr71112.c
@@ -0,0 +1,10 @@
+/* PR target/71112.  */
+/* { dg-additional-options "-fpie" { target pie } } */
+
+extern int dbs[100];
+void f (int *);
+int nscd_init (void)
+{
+  f (dbs);
+  return 0;
+}


[PING][PATCH] [AArch64] Fix PR71112

2016-12-05 Thread Hurugalawadi, Naveen
Hi,

Please consider this as a personal reminder to review the patch
at the following link and let me know your comments on the same.

https://gcc.gnu.org/ml/gcc-patches/2016-11/msg02305.html

Thanks,
Naveen



[PING][PATCH] [AArch64] Fix PR78382

2016-12-05 Thread Hurugalawadi, Naveen
Hi,

Please consider this as a personal reminder to review the patch
at the following link and let me know your comments on the same.

https://gcc.gnu.org/ml/gcc-patches/2016-11/msg02078.html

Thanks,
Naveen



[PING] [PATCH] [AArch64] Fix PR71727

2016-12-05 Thread Hurugalawadi, Naveen
Hi,

Please consider this as a personal reminder to review the patch
at the following link and let me know your comments on the same.

https://gcc.gnu.org/ml/gcc-patches/2016-11/msg00697.html

Thanks,
Naveen  

Re: [PING] [PATCH] Fix PR31096

2016-11-29 Thread Hurugalawadi, Naveen
Hi Jeff,

>> I believe Richi asked for a small change after which you can consider 
>> the patch approved:

Yeah. Thanks for all the comments and reviews.
Patch committed after the modification as:-

https://gcc.gnu.org/ml/gcc-cvs/2016-11/msg01019.html

Thanks,
Naveen

[PING] [PATCH] Fix PR71727

2016-11-22 Thread Hurugalawadi, Naveen
Hi,

Please consider this as a personal reminder to review the patch
at the following link and let me know your comments on the same.

https://gcc.gnu.org/ml/gcc-patches/2016-11/msg00697.html

Thanks,
Naveen



[PATCH] [AArch64] Fix PR71112

2016-11-22 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that fixes PR71112.

The current implementation that handles SYMBOL_SMALL_GOT_28K in
aarch64_load_symref_appropriately accesses the high part of the RTX in
big-endian mode, which results in an ICE for ILP32.

The attached patch modifies it by accessing the lower part for both
endiannesses, which fixes the issue.
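
For reviewers, a sketch of why the two calls differ on big-endian
(illustrative, using the SImode-from-DImode case this code hits under
ILP32):

/* simplify_gen_subreg (SImode, gp_rtx, DImode, 0) builds
   (subreg:SI (reg:DI gp) 0); on a big-endian target byte 0 of a
   DImode register holds the HIGH 32 bits, so this names the wrong
   half.  gen_lowpart (SImode, gp_rtx) computes the byte offset with
   subreg_lowpart_offset (0 on little-endian, 4 on big-endian) and
   therefore always names the low 32 bits.  */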

Please review the patch and let me know if it's okay?


2016-11-23  Andrew Pinski  

gcc
* config/aarch64/aarch64.c (aarch64_load_symref_appropriately):
Access the lower part of RTX appropriately.

gcc/testsuite
* gcc.target/aarch64/pr71112.c : New Testcase.diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index efcba83..4d87953 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1298,7 +1298,8 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
 	emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
 
 	if (mode != GET_MODE (gp_rtx))
-	  gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
+	  gp_rtx = gen_lowpart (mode, gp_rtx);
+
 	  }
 
 	if (mode == ptr_mode)
diff --git a/gcc/testsuite/gcc.target/aarch64/pr71112.c b/gcc/testsuite/gcc.target/aarch64/pr71112.c
new file mode 100644
index 000..5bb9dee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr71112.c
@@ -0,0 +1,12 @@
+/* PR target/71112 */
+/* { dg-do compile } */
+/* { dg-options "-mabi=ilp32 -mbig-endian -fpie" } */
+
+extern int dbs[100];
+void f (int *);
+int
+nscd_init (void)
+{
+  f (dbs);
+  return 0;
+}


[PING] [PATCH] Fix PR31096

2016-11-22 Thread Hurugalawadi, Naveen
Hi,

Please consider this as a personal reminder to review the patch
at the following link and let me know your comments on the same.

https://gcc.gnu.org/ml/gcc-patches/2016-11/msg01049.html

Thanks,
Naveen

[PATCH] [AArch64] Fix PR77635

2016-11-22 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that fixes PR77635.

Some load-pair testcases fail when gcc is configured with
"--with-cpu=thunderx", as -mcpu=generic is missing from them.
The attached patch modifies the testcases to use -mcpu=generic.

Please review the patch and let me know if it's okay?

2016-11-23  Naveen H.S  

* gcc.target/aarch64/ldp_stp_1.c : Add -mcpu=generic.
* gcc.target/aarch64/store-pair-1.c : Likewise.

diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_1.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_1.c
index 9de4e77..89550e0 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_1.c
@@ -1,4 +1,4 @@
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -mcpu=generic" } */
 
 int arr[4][4];
 
diff --git a/gcc/testsuite/gcc.target/aarch64/store-pair-1.c b/gcc/testsuite/gcc.target/aarch64/store-pair-1.c
index a90fc61..b8e762b 100644
--- a/gcc/testsuite/gcc.target/aarch64/store-pair-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/store-pair-1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -mcpu=generic" } */
 
 int f(int *a, int b)
 {


[PATCH] [AArch64] Fix PR77634

2016-11-22 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that fixes PR77634.

Some testcases do not use -fno-vect-cost-model and hence fail when gcc
is configured with "--with-cpu=thunderx".
The attached patch modifies the testcases to use -fno-vect-cost-model.

Please review the patch and let me know if it's okay?


2016-11-23  Naveen H.S  

* gcc.target/aarch64/fmaxmin.c : Add -fno-vect-cost-model.
* gcc.target/aarch64/fmul_fcvt_2.c : Likewise.
* gcc.target/aarch64/vect-abs-compile.c : Likewise.
* gcc.target/aarch64/vect-clz.c : Likewise.
* gcc.target/aarch64/vect-fcm-eq-d.c : Likewise.
* gcc.target/aarch64/vect-fcm-ge-d.c : Likewise.
* gcc.target/aarch64/vect-fcm-gt-d.c : Likewise.
* gcc.target/aarch64/vect-fmovd-zero.c : Likewise.
* gcc.target/aarch64/vect-fmovd.c : Likewise.
* gcc.target/aarch64/vect-fmovf-zero.c : Likewise.
* gcc.target/aarch64/vect-fmovf.c : Likewise.
* gcc.target/aarch64/vect_ctz_1.c : Likewise.diff --git a/gcc/testsuite/gcc.target/aarch64/fmaxmin.c b/gcc/testsuite/gcc.target/aarch64/fmaxmin.c
index 7654955..4447e33 100644
--- a/gcc/testsuite/gcc.target/aarch64/fmaxmin.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmaxmin.c
@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -ftree-vectorize -fno-inline -save-temps" } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -fno-vect-cost-model -save-temps" } */
 
 
 extern void abort (void);
diff --git a/gcc/testsuite/gcc.target/aarch64/fmul_fcvt_2.c b/gcc/testsuite/gcc.target/aarch64/fmul_fcvt_2.c
index d8a9335..4ac3ab7 100644
--- a/gcc/testsuite/gcc.target/aarch64/fmul_fcvt_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmul_fcvt_2.c
@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-save-temps -O2 -ftree-vectorize -fno-inline" } */
+/* { dg-options "-save-temps -O2 -ftree-vectorize -fno-inline -fno-vect-cost-model" } */
 
 #define N 1024
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-abs-compile.c b/gcc/testsuite/gcc.target/aarch64/vect-abs-compile.c
index 27146b8..19082d7 100644
--- a/gcc/testsuite/gcc.target/aarch64/vect-abs-compile.c
+++ b/gcc/testsuite/gcc.target/aarch64/vect-abs-compile.c
@@ -1,6 +1,6 @@
 
 /* { dg-do compile } */
-/* { dg-options "-O3" } */
+/* { dg-options "-O3 -fno-vect-cost-model" } */
 
 #define N 16
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-clz.c b/gcc/testsuite/gcc.target/aarch64/vect-clz.c
index 4c7321f..044fa9e 100644
--- a/gcc/testsuite/gcc.target/aarch64/vect-clz.c
+++ b/gcc/testsuite/gcc.target/aarch64/vect-clz.c
@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O3 -save-temps -fno-inline" } */
+/* { dg-options "-O3 -save-temps -fno-inline -fno-vect-cost-model" } */
 
 extern void abort ();
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-fcm-eq-d.c b/gcc/testsuite/gcc.target/aarch64/vect-fcm-eq-d.c
index d91cca2..4640f57 100644
--- a/gcc/testsuite/gcc.target/aarch64/vect-fcm-eq-d.c
+++ b/gcc/testsuite/gcc.target/aarch64/vect-fcm-eq-d.c
@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline" } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline -fno-vect-cost-model" } */
 
 #define FTYPE double
 #define ITYPE long
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-fcm-ge-d.c b/gcc/testsuite/gcc.target/aarch64/vect-fcm-ge-d.c
index c3c4fb3..f5b6329 100644
--- a/gcc/testsuite/gcc.target/aarch64/vect-fcm-ge-d.c
+++ b/gcc/testsuite/gcc.target/aarch64/vect-fcm-ge-d.c
@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline" } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline -fno-vect-cost-model" } */
 
 #define FTYPE double
 #define ITYPE long
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-fcm-gt-d.c b/gcc/testsuite/gcc.target/aarch64/vect-fcm-gt-d.c
index 9ef5f1c..28d7ab6 100644
--- a/gcc/testsuite/gcc.target/aarch64/vect-fcm-gt-d.c
+++ b/gcc/testsuite/gcc.target/aarch64/vect-fcm-gt-d.c
@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline" } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline -fno-vect-cost-model" } */
 
 #define FTYPE double
 #define ITYPE long
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-fmovd-zero.c b/gcc/testsuite/gcc.target/aarch64/vect-fmovd-zero.c
index f8ef3ac..bfd327c 100644
--- a/gcc/testsuite/gcc.target/aarch64/vect-fmovd-zero.c
+++ b/gcc/testsuite/gcc.target/aarch64/vect-fmovd-zero.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-vect-cost-model" } */
 
 #define N 32
 
diff --git 

Re: [PATCH] [AArch64] Fix PR78382

2016-11-20 Thread Hurugalawadi, Naveen
Hi Kugan,

>> Why don't you use the mode of dest as done in other similar places. Like:

Thanks for the pointer. Modified the patch as per your suggestion.

Please find attached the modified patch and let me know your comments.

Bootstrapped and regression tested on Thunderx.

Thanks,
Naveen

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index bd97c5b..4aea578 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1374,10 +1374,14 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
 case SYMBOL_SMALL_TLSGD:
   {
 	rtx_insn *insns;
-	rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
+	machine_mode mode = GET_MODE (dest);
+	rtx result = gen_rtx_REG (mode, R0_REGNUM);
 
 	start_sequence ();
-	aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
+	if (TARGET_ILP32)
+	  aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
+	else
+	  aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
 	insns = get_insns ();
 	end_sequence ();
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index a652a7c..4833c7f 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5089,20 +5089,20 @@
 ;; The TLS ABI specifically requires that the compiler does not schedule
 ;; instructions in the TLS stubs, in order to enable linker relaxation.
 ;; Therefore we treat the stubs as an atomic sequence.
-(define_expand "tlsgd_small"
+(define_expand "tlsgd_small_"
  [(parallel [(set (match_operand 0 "register_operand" "")
   (call (mem:DI (match_dup 2)) (const_int 1)))
-	 (unspec:DI [(match_operand:DI 1 "aarch64_valid_symref" "")] UNSPEC_GOTSMALLTLS)
+	 (unspec:DI [(match_operand:PTR 1 "aarch64_valid_symref" "")] UNSPEC_GOTSMALLTLS)
 	 (clobber (reg:DI LR_REGNUM))])]
  ""
 {
   operands[2] = aarch64_tls_get_addr ();
 })
 
-(define_insn "*tlsgd_small"
+(define_insn "*tlsgd_small_"
   [(set (match_operand 0 "register_operand" "")
 	(call (mem:DI (match_operand:DI 2 "" "")) (const_int 1)))
-   (unspec:DI [(match_operand:DI 1 "aarch64_valid_symref" "S")] UNSPEC_GOTSMALLTLS)
+   (unspec:DI [(match_operand:PTR 1 "aarch64_valid_symref" "S")] UNSPEC_GOTSMALLTLS)
(clobber (reg:DI LR_REGNUM))
   ]
   ""
diff --git a/gcc/testsuite/gcc.target/aarch64/pr78382.c b/gcc/testsuite/gcc.target/aarch64/pr78382.c
new file mode 100644
index 000..6c98e5e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr78382.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -fpic -mabi=ilp32 -mtls-dialect=trad" } */
+
+__thread int abc;
+void
+foo ()
+{
+  int *p;
+  p = &abc;
+}


Re: [PATCH] [AArch64] Fix PR78382

2016-11-18 Thread Hurugalawadi, Naveen
Hi Kyrill,

Thanks for the comment.

Bootstrapped successfully on AArch64 (thunder) system.
And also regression tested on AArch64(thunder) with no regressions.

Thanks,
Naveen

[PATCH] [AArch64] Fix PR78382

2016-11-17 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that fixes PR78382.

The "SYMBOL_SMALL_TLSGD" was not handled for ILP32. 
Hence it generates error when compiled for ILP32.
The attached patch adds the support and handles it properly as expected 
for ILP32.
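
My understanding of the failure mode (illustrative sketch): under
-mabi=ilp32 the destination of the TLS address load has ptr_mode
(SImode), while the old code hard-wired the __tls_get_addr result
register to Pmode:

/* old:  rtx result = gen_rtx_REG (Pmode, R0_REGNUM);  // DImode
   The later copy into DEST then pairs an SImode destination with a
   DImode source; taking the mode from DEST (and providing an SImode
   variant of the tlsgd pattern) keeps the modes consistent.  */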

Please review the patch and let me know if it's okay?

Regression tested on AArch64 with no regressions.

Thanks,
Naveen

2016-11-18  Naveen H.S  

* config/aarch64/aarch64.c (aarch64_load_symref_appropriately):
Handle SYMBOL_SMALL_TLSGD for ILP32.
* config/aarch64/aarch64.md: tlsgd_small modified into
tlsgd_small_<mode> to support SImode and DImode.
*tlsgd_small modified into *tlsgd_small_<mode> to support SImode and
DImode.

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 11d41cf..1688f0d 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1374,10 +1374,17 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
 case SYMBOL_SMALL_TLSGD:
   {
 	rtx_insn *insns;
-	rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
+	rtx result;
+	if (TARGET_ILP32)
+	  result = gen_rtx_REG (SImode, R0_REGNUM);
+	else
+	  result = gen_rtx_REG (DImode, R0_REGNUM);
 
 	start_sequence ();
-	aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
+	if (TARGET_ILP32)
+	  aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
+	else
+	  aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
 	insns = get_insns ();
 	end_sequence ();
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index a652a7c..4833c7f 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5089,20 +5089,20 @@
 ;; The TLS ABI specifically requires that the compiler does not schedule
 ;; instructions in the TLS stubs, in order to enable linker relaxation.
 ;; Therefore we treat the stubs as an atomic sequence.
-(define_expand "tlsgd_small"
+(define_expand "tlsgd_small_"
  [(parallel [(set (match_operand 0 "register_operand" "")
   (call (mem:DI (match_dup 2)) (const_int 1)))
-	 (unspec:DI [(match_operand:DI 1 "aarch64_valid_symref" "")] UNSPEC_GOTSMALLTLS)
+	 (unspec:DI [(match_operand:PTR 1 "aarch64_valid_symref" "")] UNSPEC_GOTSMALLTLS)
 	 (clobber (reg:DI LR_REGNUM))])]
  ""
 {
   operands[2] = aarch64_tls_get_addr ();
 })
 
-(define_insn "*tlsgd_small"
+(define_insn "*tlsgd_small_"
   [(set (match_operand 0 "register_operand" "")
 	(call (mem:DI (match_operand:DI 2 "" "")) (const_int 1)))
-   (unspec:DI [(match_operand:DI 1 "aarch64_valid_symref" "S")] UNSPEC_GOTSMALLTLS)
+   (unspec:DI [(match_operand:PTR 1 "aarch64_valid_symref" "S")] UNSPEC_GOTSMALLTLS)
(clobber (reg:DI LR_REGNUM))
   ]
   ""
diff --git a/gcc/testsuite/gcc.target/aarch64/pr78382.c b/gcc/testsuite/gcc.target/aarch64/pr78382.c
new file mode 100644
index 000..6c98e5e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr78382.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -fpic -mabi=ilp32 -mtls-dialect=trad" } */
+
+__thread int abc;
+void
+foo ()
+{
+  int *p;
+  p = &abc;
+}


Re: [PATCH] Fix PR31096

2016-11-11 Thread Hurugalawadi, Naveen
Hi,

Sorry for a very late reply as the mail was missed or overlooked.

>> could now move the test  tree_expr_nonzero_p next to 
>> tree_expr_nonnegative_p (it is redundant for  the last case). 

Done.

>> Often just a comment can really help here. 

Comments updated as per the suggestion.

>> when C is zero and verify this transformation doesn't fire on that case.

Updated test to check with zero.

>> verifying that the operand orders change appropriately when dealing 
>> with a negative constant.

Done.

>> verify nothing happens with floating point or vector types.

Done.
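
To make those review points concrete (illustrative examples, not taken
from the patch itself):

/* C == 0:  a * 0 == b * 0  is  0 == 0, always true, so folding it to
            a == b would be wrong; tree_expr_nonzero_p (@1) blocks it.
   C < 0:   for signed a, b with undefined overflow,
            a * -2 < b * -2  folds to  b < a  (the operands swap).
   float/vector:  a * 54.0 is inexact and unsigned a * -2u wraps, so
            the INTEGRAL_TYPE_P and TYPE_OVERFLOW_UNDEFINED guards
            keep the patterns from firing there.  */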

Please review the patch and let me know if any modifications are required.
Regression tested on X86 and AArch64.

Thanks,
Naveen

2016-11-11  Naveen H.S  
gcc
* fold-const.c (tree_expr_nonzero_p): Make non-static.
* fold-const.h (tree_expr_nonzero_p): Declare.
* match.pd (cmp (mult:c @0 @1) (mult:c @2 @1)): New Pattern.
* match.pd (cmp (mult:c @0 @1) (mult:c @2 @1)): New Pattern.
gcc/testsuite
* gcc.dg/pr31096.c: New testcase.
* gcc.dg/pr31096-1.c: New testcase.diff --git a/gcc/fold-const.c b/gcc/fold-const.c
index e14471e..8f13807 100644
--- a/gcc/fold-const.c
+++ b/gcc/fold-const.c
@@ -9015,7 +9015,7 @@ tree_expr_nonzero_warnv_p (tree t, bool *strict_overflow_p)
 /* Return true when T is an address and is known to be nonzero.
Handle warnings about undefined signed overflow.  */
 
-static bool
+bool
 tree_expr_nonzero_p (tree t)
 {
   bool ret, strict_overflow_p;
diff --git a/gcc/fold-const.h b/gcc/fold-const.h
index 46dcd28..fbe1328 100644
--- a/gcc/fold-const.h
+++ b/gcc/fold-const.h
@@ -169,6 +169,7 @@ extern tree size_diffop_loc (location_t, tree, tree);
 #define non_lvalue(T) non_lvalue_loc (UNKNOWN_LOCATION, T)
 extern tree non_lvalue_loc (location_t, tree);
 
+extern bool tree_expr_nonzero_p (tree);
 extern bool tree_expr_nonnegative_p (tree);
 extern bool tree_expr_nonnegative_warnv_p (tree, bool *, int = 0);
 extern tree make_range (tree, int *, tree *, tree *, bool *);
diff --git a/gcc/match.pd b/gcc/match.pd
index 29ddcd8..eecfe23 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -31,6 +31,7 @@ along with GCC; see the file COPYING3.  If not see
zerop
CONSTANT_CLASS_P
tree_expr_nonnegative_p
+   tree_expr_nonzero_p
integer_valued_real_p
integer_pow2p
HONOR_NANS)
@@ -1017,7 +1018,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   && tree_nop_conversion_p (type, TREE_TYPE (@1)))
  (convert (bit_and (bit_not @1) @0))))
 
+/* For integral types with undefined overflow and C != 0 fold
+   x * C EQ/NE y * C into x EQ/NE y.  */
+(for cmp (eq ne)
+ (simplify
+  (cmp (mult:c @0 @1) (mult:c @2 @1))
+  (if (INTEGRAL_TYPE_P (TREE_TYPE (@1))
+   && TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (@0))
+   && tree_expr_nonzero_p (@1))
+   (cmp @0 @2))))
+
+/* For integral types with undefined overflow and C != 0 fold
+   x * C RELOP y * C into:
+
+   x RELOP y for nonnegative C
+   y RELOP x for negative C  */
+(for cmp (lt gt le ge)
+ (simplify
+  (cmp (mult:c @0 @1) (mult:c @2 @1))
+  (if (INTEGRAL_TYPE_P (TREE_TYPE (@1))
+   && TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (@0)))
+   (if (tree_expr_nonnegative_p (@1) && tree_expr_nonzero_p (@1))
+    (cmp @0 @2)
+    (if (TREE_CODE (@1) == INTEGER_CST
+	 && wi::lt_p (@1, 0, TYPE_SIGN (TREE_TYPE (@1))))
+     (cmp @2 @0))))))
 
 /* ((X inner_op C0) outer_op C1)
With X being a tree where value_range has reasoned certain bits to always be
diff --git a/gcc/testsuite/gcc.dg/pr31096-1.c b/gcc/testsuite/gcc.dg/pr31096-1.c
new file mode 100644
index 000..e681f0f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr31096-1.c
@@ -0,0 +1,51 @@
+/* PR middle-end/31096 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define zero(name, op) \
+int name (int a, int b) \
+{ return a * 0 op b * 0; }
+
+zero(zeq, ==) zero(zne, !=) zero(zlt, <)
+zero(zgt, >)  zero(zge, >=) zero(zle, <=)
+
+#define unsign_pos(name, op) \
+int name (unsigned a, unsigned b) \
+{ return a * 4 op b * 4; }
+
+unsign_pos(upeq, ==) unsign_pos(upne, !=) unsign_pos(uplt, <)
+unsign_pos(upgt, >)  unsign_pos(upge, >=) unsign_pos(uple, <=)
+
+#define unsign_neg(name, op) \
+int name (unsigned a, unsigned b) \
+{ return a * -2 op b * -2; }
+
+unsign_neg(uneq, ==) unsign_neg(unne, !=) unsign_neg(unlt, <)
+unsign_neg(ungt, >)  unsign_neg(unge, >=) unsign_neg(unle, <=)
+
+#define float(name, op) \
+int name (float a, float b) \
+{ return a * 5 op b * 5; }
+
+float(feq, ==) float(fne, !=) float(flt, <)
+float(fgt, >)  float(fge, >=) float(fle, <=)
+
+#define float_val(name, op) \
+int name (int a, int b) \
+{ return a * 54.0 op b * 54.0; }
+
+float_val(fveq, ==) float_val(fvne, !=) float_val(fvlt, <)
+float_val(fvgt, >)  float_val(fvge, >=) float_val(fvle, <=)
+
+#define vec(name, op) \
+int name (int a, int b) \
+{ int c[10]; return a * c[1] op b * c[1]; }
+
+vec(veq, ==) vec(vne, !=) vec(vlt, <)
+vec(vgt, 

Re: [PATCH] [AArch64] Fix PR71727

2016-11-08 Thread Hurugalawadi, Naveen
Hi Kyrill,

Thanks for the review and suggestions.

>> It's a good idea to CC the AArch64 maintainers and reviewers
>> on aarch64 patches, or at least

Thanks for CCing the maintainers. Added [AArch64] in the subject line.

>> New functions need a function comment describing their arguments and their 
>> result.

Done.

>> Some more information about why the current behaviour is wrong
>> and how the patch fixes it would be useful in reviewing.

The support_vector_misalignment target hook behaves incorrectly when
STRICT_ALIGNMENT is true for AArch64.
The patch implements the hook and rectifies the behavior.
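
A summary of the decision the new hook makes, as I read the patch
(illustrative):

/* STRICT_ALIGNMENT && TARGET_SIMD:
     no movmisalign pattern for MODE            -> false
     misalignment unknown (-1):
       natural alignment reachable and element
       size is not 64 bits                      -> true
       otherwise                                -> false
     misalignment known at compile time         -> default hook
   !STRICT_ALIGNMENT (or !TARGET_SIMD)          -> default hook  */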

Please find attached the modified patch as per suggestions.

Thanks,
Naveendiff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b7d4640..5a0eff5 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -141,6 +141,10 @@ static bool aarch64_vector_mode_supported_p (machine_mode);
 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
 		 const unsigned char *sel);
 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
+static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
+			 const_tree type,
+			 int misalignment,
+			 bool is_packed);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -11148,6 +11152,37 @@ aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
   return true;
 }
 
+/* Return true if the vector misalignment factor is supported by the
+   target.  */
+static bool
+aarch64_builtin_support_vector_misalignment (machine_mode mode,
+	 const_tree type, int misalignment,
+	 bool is_packed)
+{
+  if (TARGET_SIMD && STRICT_ALIGNMENT)
+{
+  /* Return if movmisalign pattern is not supported for this mode.  */
+  if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
+return false;
+
+  if (misalignment == -1)
+	{
+	  /* Misalignment factor is unknown at compile time but we know
+	 it's word aligned.  */
+	  if (aarch64_simd_vector_alignment_reachable (type, is_packed))
+{
+  int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
+
+  if (element_size != 64)
+return true;
+}
+	  return false;
+	}
+}
+  return default_builtin_support_vector_misalignment (mode, type, misalignment,
+		  is_packed);
+}
+
 /* If VALS is a vector constant that can be loaded into a register
using DUP, generate instructions to do so and return an RTX to
assign to the register.  Otherwise return NULL_RTX.  */
@@ -14398,6 +14433,10 @@ aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
 #undef TARGET_VECTOR_MODE_SUPPORTED_P
 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
 
+#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
+#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
+  aarch64_builtin_support_vector_misalignment
+
 #undef TARGET_ARRAY_MODE_SUPPORTED_P
 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
 
diff --git a/gcc/testsuite/gcc.target/aarch64/pr71727.c b/gcc/testsuite/gcc.target/aarch64/pr71727.c
new file mode 100644
index 000..05eef3e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr71727.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-mstrict-align -O3" } */
+
+struct test_struct_s
+{
+  long a;
+  long b;
+  long c;
+  long d;
+  unsigned long e;
+};
+
+
+char _a;
+struct test_struct_s xarray[128];
+
+void
+_start (void)
+{
+  struct test_struct_s *new_entry;
+
+  new_entry = &xarray[0];
+  new_entry->a = 1;
+  new_entry->b = 2;
+  new_entry->c = 3;
+  new_entry->d = 4;
+  new_entry->e = 5;
+
+  return;
+}
+
+/* { dg-final { scan-assembler-times "mov\tx" 5 {target lp64} } } */
+/* { dg-final { scan-assembler-not "add\tx0, x0, :" {target lp64} } } */


[PATCH] Fix PR71727

2016-11-07 Thread Hurugalawadi, Naveen
Hi,

Please find attached the patch that fixes PR71727.
Please review the patch and let me know if it's okay?

Regression tested on Aarch64 with no regressions.

Thanks,
Naveen

2016-11-08  Naveen H.S  

* config/aarch64/aarch64.c
(aarch64_builtin_support_vector_misalignment): New.
(TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT): Define.
* gcc.target/aarch64/pr71727.c : New Testcase.
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b7d4640..2649951 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -141,6 +141,10 @@ static bool aarch64_vector_mode_supported_p (machine_mode);
 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
 		 const unsigned char *sel);
 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
+static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
+			 const_tree type,
+			 int misalignment,
+			 bool is_packed);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -11148,6 +11152,35 @@ aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
   return true;
 }
 
+static bool
+aarch64_builtin_support_vector_misalignment (machine_mode mode,
+	 const_tree type, int misalignment,
+	 bool is_packed)
+{
+  if (TARGET_SIMD && STRICT_ALIGNMENT)
+{
+  /* Return if movmisalign pattern is not supported for this mode.  */
+  if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
+return false;
+
+  if (misalignment == -1)
+	{
+	  /* Misalignment factor is unknown at compile time but we know
+	 it's word aligned.  */
+	  if (aarch64_simd_vector_alignment_reachable (type, is_packed))
+{
+  int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
+
+  if (element_size != 64)
+return true;
+}
+	  return false;
+	}
+}
+  return default_builtin_support_vector_misalignment (mode, type, misalignment,
+		  is_packed);
+}
+
 /* If VALS is a vector constant that can be loaded into a register
using DUP, generate instructions to do so and return an RTX to
assign to the register.  Otherwise return NULL_RTX.  */
@@ -14398,6 +14431,10 @@ aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
 #undef TARGET_VECTOR_MODE_SUPPORTED_P
 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
 
+#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
+#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
+  aarch64_builtin_support_vector_misalignment
+
 #undef TARGET_ARRAY_MODE_SUPPORTED_P
 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
 
diff --git a/gcc/testsuite/gcc.target/aarch64/pr71727.c b/gcc/testsuite/gcc.target/aarch64/pr71727.c
new file mode 100644
index 000..05eef3e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr71727.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-mstrict-align -O3" } */
+
+struct test_struct_s
+{
+  long a;
+  long b;
+  long c;
+  long d;
+  unsigned long e;
+};
+
+
+char _a;
+struct test_struct_s xarray[128];
+
+void
+_start (void)
+{
+  struct test_struct_s *new_entry;
+
+  new_entry = &xarray[0];
+  new_entry->a = 1;
+  new_entry->b = 2;
+  new_entry->c = 3;
+  new_entry->d = 4;
+  new_entry->e = 5;
+
+  return;
+}
+
+/* { dg-final { scan-assembler-times "mov\tx" 5 {target lp64} } } */
+/* { dg-final { scan-assembler-not "add\tx0, x0, :" {target lp64} } } */

