Hi All,

This patch adds support for SLP vectorization of Complex number arithmetic with
rotations along the Argand plane.

For this to work it has to recognize two statements in parallel as it needs to 
match
against operations towards both the real and imaginary numbers.  The 
instructions generated
by this change is also only available in their vector forms.  As such I add 
them as pattern
statements to the stmt.  The BB is left untouched and so the scalar loop is 
untouched.

The instructions also require the loads to be contiguous and so when a match is 
made, and
the code decides it is able to do the replacement it re-organizes the SLP tree 
such that the
loads are contiguous.  Since this operation cannot be undone it only does this
if it is sure that
the resulting loads can be made contiguous.

It gets this guarantee by only allowing the replacement if between the matched 
expression and the
loads there are no other expressions it doesn't know aside from type casts.

When a match occurs over multiple expressions, the dead statements are 
immediately removed from the
tree to prevent verification failures later.

Because the pattern matching is done after SLP analysis has analyzed the usage 
of the instruction it
also marks the instructions as used and the old ones as unused.

When a replacement is done a new internal function is generated which the 
back-end has to expand to
the proper instruction sequences.

For now, this patch only adds support for Complex addition with rotate and 
Complex FMLA
with rotation of 0 and 180. However it is the intention to in the future add 
support for
Complex subtraction and Complex multiplication.

Concretely, this generates

        ldr     q1, [x1, x3]
        ldr     q2, [x0, x3]
        ldr     q0, [x2, x3]
        fcmla   v0.2d, v1.2d, v2.2d, #180
        fcmla   v0.2d, v1.2d, v2.2d, #90
        str     q0, [x2, x3]
        add     x3, x3, 16
        cmp     x3, 3200
        bne     .L2
        ret

now instead of

        add     x3, x2, 31
        sub     x4, x3, x1
        sub     x3, x3, x0
        cmp     x4, 62
        mov     x4, 62
        ccmp    x3, x4, 0, hi
        bls     .L5
        mov     x3, x0
        mov     x0, x1
        add     x1, x2, 3200
        .p2align 3,,7
.L3:
        ld2     {v16.2d - v17.2d}, [x2]
        ld2     {v2.2d - v3.2d}, [x3], 32
        ld2     {v0.2d - v1.2d}, [x0], 32
        mov     v7.16b, v17.16b
        fmul    v6.2d, v0.2d, v3.2d
        fmla    v7.2d, v1.2d, v3.2d
        fmla    v6.2d, v1.2d, v2.2d
        fmls    v7.2d, v2.2d, v0.2d
        fadd    v4.2d, v6.2d, v16.2d
        mov     v5.16b, v7.16b
        st2     {v4.2d - v5.2d}, [x2], 32
        cmp     x2, x1
        bne     .L3
        ret
.L5:
        add     x4, x2, 8
        add     x6, x0, 8
        add     x5, x1, 8
        mov     x3, 0
        .p2align 3,,7
.L2:
        ldr     d1, [x6, x3]
        ldr     d4, [x1, x3]
        ldr     d5, [x5, x3]
        ldr     d3, [x0, x3]
        fmul    d2, d4, d1
        ldr     d0, [x4, x3]
        fmadd   d0, d5, d1, d0
        ldr     d1, [x2, x3]
        fmadd   d2, d5, d3, d2
        fmsub   d0, d4, d3, d0
        fadd    d1, d1, d2
        str     d1, [x2, x3]
        str     d0, [x4, x3]
        add     x3, x3, 16
        cmp     x3, 3200
        bne     .L2
        ret

Bootstrap and Regtests on aarch64-none-linux-gnu, arm-none-gnueabihf and 
x86_64-pc-linux-gnu
are still ongoing but a previous patch showed no regressions.

The instructions have also been tested on aarch64-none-elf and arm-none-eabi on 
a Armv8.3-a model
and -march=Armv8.3-a+fp16 and all tests pass.

Ok for trunk?

Thanks,
Tamar

gcc/ChangeLog:

2018-11-11  Tamar Christina  <tamar.christ...@arm.com>

        * doc/md.texi (fcadd, fcmla): New.
        * doc/sourcebuild.texi (vect_complex_rot_): New.
        * internal-fn.def (FCOMPLEX_ADD_ROT90): New.
        (FCOMPLEX_ADD_ROT270): New.
        (FCOMPLEX_FMA_ROT0): New.
        (FCOMPLEX_FMA_ROT180): New.
        * optabs.def (fcadd90_optab, fcadd270_optab,
        fcmla0_optab, fcmla180_optab):  New.
        * tree-vect-patterns.c (vect_mark_pattern_stmts): Export function.
        * tree-vect-slp.c (vect_create_complex_patt_stmt): New.
        (vect_is_complex_transform_valid): New.
        (vect_reorder_based_on_load_chain): New.
        (vect_determine_place_in_load_1): New.
        (vect_determine_place_in_load): New.
        (vect_balance_load_nodes_1): New.
        (vect_balance_load_nodes): New.
        (vect_match_expression_p): New.
        (vect_detect_pair_op): New.
        (vect_delete_slp_nodes_1): New.
        (vect_delete_slp_nodes): New.
        (vect_match_slp_patterns_1): New.
        (vect_match_call_complex_add): New.
        (vect_match_call_complex_mla_1): New.
        (vect_match_call_complex_mla): New.
        (vect_match_slp_patterns): New.
        (vect_analyze_slp_instance): Use it.
        * tree-vectorizer.h (vect_mark_pattern_stmts): Export function.

-- 
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 510321ef12a167f79875cd13a8683271ee590ebe..dd5e2eb0a1eb37897d4f8d5e6b9d9edc2c8278cd 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6006,6 +6006,30 @@ and the sign of operand 2 into operand 0.  All operands have mode
 
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{fcadd@var{m}@var{n}3} instruction pattern
+@item @samp{fcadd@var{m}@var{n}3}
+Perform a vector addition of complex numbers in operand 1 with operand 2
+and rotation @var{m} storing the result in operand 0.
+The instruction will perform the required permute on its operands, which
+requires that the operands be loaded contiguously into the vectors.
+The operation is only supported for vector modes @var{n} and with
+rotations @var{m} of 90 or 270.
+
+This pattern is not allowed to @code{FAIL}.
+
+@cindex @code{fcmla@var{m}@var{n}4} instruction pattern
+@item @samp{fcmla@var{m}@var{n}4}
+Perform a vector floating point multiply and accumulate of complex numbers
+in operand 0, operand 1 and 2 with rotation @var{m} storing the result in
+operand 0.
+
+The instruction will perform the required permute on its operands, which
+requires that the operands be loaded contiguously into the vectors.
+The operation is only supported for vector modes @var{n} and with
+rotations @var{m} of 0, or 180.
+
+This pattern is not allowed to @code{FAIL}.
+
 @cindex @code{ffs@var{m}2} instruction pattern
 @item @samp{ffs@var{m}2}
 Store into operand 0 one plus the index of the least significant 1-bit
diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index 748797762d2568f361c2c06268aabd1aac1fede0..00f9fe8664e0a861bda9a35e00b4ef596ef1ed2c 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -1602,6 +1602,10 @@ Target supports a vector dot-product of @code{signed short}.
 @item vect_udot_hi
 Target supports a vector dot-product of @code{unsigned short}.
 
+@item vect_complex_rot_@var{n}
+Target supports a vector complex addition and complex fma of mode @var{n}.
+Possible values of @var{n} are @code{hf}, @code{sf}, @code{df}.
+
 @item vect_pack_trunc
 Target supports a vector demotion (packing) of @code{short} to @code{char}
 and from @code{int} to @code{short} using modulo arithmetic.
@@ -1834,6 +1838,16 @@ ARM target supports executing instructions from ARMv8.2-A with the Dot
 Product extension. Some multilibs may be incompatible with these options.
 Implies arm_v8_2a_dotprod_neon_ok.
 
+@item arm_v8_3a_complex_neon_ok
+@anchor{arm_v8_3a_complex_neon_ok}
+ARM target supports options to generate complex number arithmetic instructions
+from ARMv8.3-A.  Some multilibs may be incompatible with these options.
+
+@item arm_v8_3a_complex_neon_hw
+ARM target supports executing complex arithmetic instructions from ARMv8.3-A.
+Some multilibs may be incompatible with these options.
+Implies arm_v8_3a_complex_neon_ok.
+
 @item arm_fp16fml_neon_ok
 @anchor{arm_fp16fml_neon_ok}
 ARM target supports extensions to generate the @code{VFMAL} and @code{VFMLS}
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index cda314e112155ae638db7cdb60a51df20b96ade3..220a8f42271ff2c04c363aec87fd18cece0011c1 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -236,12 +236,16 @@ DEF_INTERNAL_FLT_FN (SCALB, ECF_CONST, scalb, binary)
 DEF_INTERNAL_FLT_FLOATN_FN (FMIN, ECF_CONST, fmin, binary)
 DEF_INTERNAL_FLT_FLOATN_FN (FMAX, ECF_CONST, fmax, binary)
 DEF_INTERNAL_OPTAB_FN (XORSIGN, ECF_CONST, xorsign, binary)
+DEF_INTERNAL_OPTAB_FN (FCOMPLEX_ADD_ROT90, ECF_CONST, fcadd90, binary)
+DEF_INTERNAL_OPTAB_FN (FCOMPLEX_ADD_ROT270, ECF_CONST, fcadd270, binary)
 
 /* FP scales.  */
 DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp, binary)
 
 /* Ternary math functions.  */
 DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)
+DEF_INTERNAL_OPTAB_FN (FCOMPLEX_FMA_ROT0, ECF_CONST, fcmla0, ternary)
+DEF_INTERNAL_OPTAB_FN (FCOMPLEX_FMA_ROT180, ECF_CONST, fcmla180, ternary)
 
 /* Unary integer ops.  */
 DEF_INTERNAL_INT_FN (CLRSB, ECF_CONST | ECF_NOTHROW, clrsb, unary)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 5a67f5eed5e457e6dea34f9969534f8497e620f6..ab01e103191a1f5cda7846336dfc5d2b2860ac8c 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -278,6 +278,10 @@ OPTAB_D (atan2_optab, "atan2$a3")
 OPTAB_D (atan_optab, "atan$a2")
 OPTAB_D (copysign_optab, "copysign$F$a3")
 OPTAB_D (xorsign_optab, "xorsign$F$a3")
+OPTAB_D (fcadd90_optab, "fcadd90$F$a3")
+OPTAB_D (fcadd270_optab, "fcadd270$F$a3")
+OPTAB_D (fcmla0_optab, "fcmla0$F$a4")
+OPTAB_D (fcmla180_optab, "fcmla180$F$a4")
 OPTAB_D (cos_optab, "cos$a2")
 OPTAB_D (exp10_optab, "exp10$a2")
 OPTAB_D (exp2_optab, "exp2$a2")
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index 3053642b24108350d092fb6955beb5f9752086ca..b11868915ba456c1e04461a732a9f49f63181ba5 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -4682,7 +4682,7 @@ const unsigned int NUM_PATTERNS = ARRAY_SIZE (vect_vect_recog_func_ptrs);
 
 /* Mark statements that are involved in a pattern.  */
 
-static inline void
+void
 vect_mark_pattern_stmts (stmt_vec_info orig_stmt_info, gimple *pattern_stmt,
                          tree pattern_vectype)
 {
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index e7e5d252c00cc0a054d3db3f9afaee87403dac51..2e533804b776cdc59ccbe63719249f8563f84e6b 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -1872,6 +1872,759 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
   return exact_div (common_multiple (nunits, group_size), group_size);
 }
 
+/* Create a replacement pattern statement for STMT_INFO and inserts the new
+   statement into NODE.  The statement is created as call to internal function
+   IFN with arguments ARGS.  The arity of IFN needs to match the amount of
+   elements in ARGS.  The scalar type of the statement is given as TYPE and the
+   corresponding vector type VECTYPE.  These two types are used to construct
+   the new vector only replacement pattern statement.
+
+   Furthermore the new pattern is also added to the vectorization information
+   structure VINFO and the old statement STMT_INFO is marked as unused while
+   the new statement is marked as used and the number of SLP uses of the new
+   statement is incremented.
+
+   The newly created gimple call is returned and the BB remains unchanged.  */
+
+static gcall*
+vect_create_complex_patt_stmt (slp_tree node, stmt_vec_info stmt_info,
+			       internal_fn ifn, vec<tree> args,
+			       vec_info *vinfo, tree type, tree vectype)
+{
+  gcall *call_stmt = gimple_build_call_internal_vec (ifn, args);
+  tree var = make_temp_ssa_name (type, call_stmt, "patt");
+  gimple_call_set_lhs (call_stmt, var);
+  gimple_set_location (call_stmt,
+		       gimple_location (STMT_VINFO_STMT (stmt_info)));
+  gimple_call_set_nothrow (call_stmt, true);
+
+  stmt_vec_info call_stmt_info = vinfo->add_stmt (call_stmt);
+  vect_mark_pattern_stmts (stmt_info, call_stmt, vectype);
+  STMT_VINFO_RELEVANT (stmt_info) = vect_unused_in_scope;
+  STMT_VINFO_RELEVANT (call_stmt_info) = vect_used_in_scope;
+  STMT_VINFO_NUM_SLP_USES (call_stmt_info)++;
+
+  SLP_TREE_SCALAR_STMTS (node).safe_push (call_stmt_info);
+
+  return call_stmt;
+}
+
+/* Validate that the given slp tree rooted in NODE can be safely transformed
+   into a call to one of the new complex number builtins.  This is a pre-check
+   because the transformation cannot be undone if it is later discovered that
+   the conversion is not possible.
+
+   A transformation is valid if between the NODE that has been matched and the
+   leaf of the tree (which contains the loads) there are no other statements
+   other than type conversions or other expressions that are part of the complex
+   operation.  These other expressions that shouldn't block the transformation
+   are given in EXEMPT.  */
+
+static bool
+vect_is_complex_transform_valid (slp_tree node, hash_set<stmt_vec_info> exempt)
+{
+  int i, j;
+  stmt_vec_info stmt_info;
+  slp_tree child;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt_info)
+      {
+	gimple *stmt = STMT_VINFO_STMT (stmt_info);
+	if (!gimple_assign_load_p (stmt)
+	    && !gimple_assign_cast_p (stmt)
+	    && !exempt.contains (stmt_info))
+	  return false;
+      }
+
+  return true;
+}
+
+/* Delegate function that can be used to sort stmt_vec_info's inside a vec<>
+   in an order that would result in them being sequential in the load chain,
+   such that they would not require a load permutation.
+
+   The difference between the place of A - B in the load chain is returned.  */
+
+static int
+vect_reorder_based_on_load_chain (const void *a, const void *b)
+{
+  const stmt_vec_info ax = *(const stmt_vec_info*)a;
+  const stmt_vec_info bx = *(const stmt_vec_info*)b;
+  stmt_vec_info first_stmt_info
+    = DR_GROUP_FIRST_ELEMENT (ax);
+  int load_place_a
+    = vect_get_place_in_interleaving_chain (ax, first_stmt_info);
+  int load_place_b
+    = vect_get_place_in_interleaving_chain (bx, first_stmt_info);
+  return load_place_a - load_place_b;
+}
+
+/* Helper method of vect_determine_place_in_load.
+
+   Determines the location in the interleaving chain of the declaration of the
+   given SSA name.  */
+
+static int
+vect_determine_place_in_load_1 (vec_info *vinfo, tree lhs)
+{
+  gimple *stmt = SSA_NAME_DEF_STMT (lhs);
+  if (!gimple_assign_load_p (stmt))
+    {
+      if (gimple_assign_cast_p (stmt))
+	/* Look through the cast by following its source operand.  Recursing
+	   on the LHS would revisit this very definition (the def stmt of the
+	   LHS is STMT itself) and never terminate.  */
+	return vect_determine_place_in_load_1 (vinfo,
+					       gimple_assign_rhs1 (stmt));
+      else
+	return -1;
+    }
+
+  stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
+  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+  return vect_get_place_in_interleaving_chain (stmt_info, first_stmt_info);
+}
+
+/* Re-arrange the arguments A and B such that they are in the order the stmts
+   they refer to access the load chain.  This is used to normalize the order
+   or the arguments to the complex pattern recognizer.  If an ordering cannot be
+   determined then FALSE is returned, otherwise TRUE.
+
+   If the entries are added in the proper placein V1 and V2 if the function is
+   successful.  */
+
+static bool
+vect_determine_place_in_load (vec_info *vinfo, tree a, tree b,
+			      vec<tree> *v1, vec<tree> *v2)
+{
+  int pos_a = vect_determine_place_in_load_1 (vinfo, a);
+  int pos_b = vect_determine_place_in_load_1 (vinfo, b);
+  if (pos_a < 0 && pos_b < 0)
+    return false;
+
+  if (pos_a > pos_b)
+    {
+      v1->quick_push (b);
+      v2->quick_push (a);
+    }
+  else
+    {
+      v1->quick_push (a);
+      v2->quick_push (b);
+    }
+
+  return true;
+}
+
+/* Helper function of vect_balance_load_nodes that sorts and re-orders
+   loads rooted in NODE in such a way that they don't require load permutes.
+
+   The load statements are sorted using vect_reorder_based_on_load_chain and
+   loads following typecasts are also sorted.
+
+   This function is intended to be used with the complex instructions in which
+   you may end up with the following situation:
+
+                       +----------+
+                       | express1 |
+           +-----------+ express2 +-----------+
+           |           +----------+           |
+           |                 |                |
+           |                 |                |
+      +----v-----+    +------v---+     +------v----+
+      | load a2  |    | (cast)b1 |     | (cast) b2 |
+      | load a1  |    | (cast)b1 |     | (cast) b2 |
+      +----------+    +----------+     +-----------+
+                             |                |
+                             |                |
+                      +------v---+     +------v----+
+                      | load b1  |     | load b2   |
+                      | load b1  |     | load b2   |
+                      +----------+     +-----------+
+
+   which will get transformed into
+
+                       +----------+
+                       | express1 |
+           +-----------+ express2 +-----------+
+           |           +----------+           |
+           |                 |                |
+           |                 |                |
+      +----v-----+    +------v---+     +------v----+
+      | load a1  |    | (cast)b1 |     | (cast) b2 |
+      | load a2  |    | (cast)b1 |     | (cast) b2 |
+      +----------+    +----------+     +-----------+
+                             |                |
+                             |                |
+                      +------v---+     +------v----+
+                      | load b1  |     | load b1   |
+                      | load b2  |     | load b2   |
+                      +----------+     +-----------+
+
+   in order to remove any load permutations.  The order of the nodes are not
+   changed during merging.  The re-ordering is done based on GROUP_SIZE
+   statements expected for each slp_tree node and a list of so far seen
+   "unbalanced" nodes (e.g. nodes that don't load the entire vector) so far
+   is kept in UNBALANCED.  This function does not keep track of which vector
+   the unbalanced nodes come from, and is thus only safe when we can expect
+   at most one vector being "unbalanced".  */
+
+static void
+vect_balance_load_nodes_1 (slp_tree node, unsigned int group_size,
+			   vec<slp_tree> unbalanced)
+{
+  stmt_vec_info stmt_info;
+  slp_tree child, entry;
+  unsigned i, j, k;
+  auto_vec<int> load_pos;
+  load_pos.create (group_size);
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    {
+      if (SLP_TREE_SCALAR_STMTS (child).length () == 0)
+	continue;
+
+      stmt_info = SLP_TREE_SCALAR_STMTS (child).last ();
+      bool is_load_node = gimple_assign_load_p (STMT_VINFO_STMT (stmt_info));
+      if (is_load_node)
+	{
+	  bool duplicate_place = false;
+	  load_pos.truncate (0);
+	  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt_info)
+	    {
+	      int load_place
+		= vect_get_place_in_interleaving_chain (stmt_info,
+							first_stmt_info);
+	      if (load_pos.contains (load_place))
+		{
+		  duplicate_place = true;
+		  break;
+		}
+	      else
+		load_pos.safe_push (load_place);
+	    }
+
+	  if (duplicate_place)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "Unbalanced load %p found\n", child);
+	      unbalanced.safe_push (child);
+
+	      /* If we've reached the threshold, re-arrange the statements.  */
+	      if (unbalanced.length () == group_size)
+		{
+		  vec<stmt_vec_info> scalar_stmts;
+		  scalar_stmts.create (group_size);
+
+		  FOR_EACH_VEC_ELT (unbalanced, j, entry)
+		    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (entry), k,
+				      stmt_info)
+		      if (!scalar_stmts.contains (stmt_info))
+			scalar_stmts.safe_push (stmt_info);
+
+		   scalar_stmts.qsort (vect_reorder_based_on_load_chain);
+		   FOR_EACH_VEC_ELT (unbalanced, j, entry)
+		     {
+		       SLP_TREE_SCALAR_STMTS (entry).truncate (0);
+		       SLP_TREE_SCALAR_STMTS (entry).safe_splice (scalar_stmts);
+		     }
+		   unbalanced.truncate (0);
+		   if (dump_enabled_p ())
+		     dump_printf_loc (MSG_NOTE, vect_location,
+				      "Re-distributed loads across nodes and "
+				      "made contigous.\n");
+		}
+	    }
+	}
+
+      vect_balance_load_nodes_1 (child, group_size, unbalanced);
+
+      /* Re-order the nodes at this level if it was a load node.  */
+      if (is_load_node)
+	SLP_TREE_SCALAR_STMTS (child).qsort (vect_reorder_based_on_load_chain);
+    }
+}
+
+/* This function attempts to rebalance and re-order loads rooted in the SLP tree
+   NODE with group size GROUP_SIZE in such a way that they are guaranteed to not
+   require any load permutations.
+
+   It is similar to vect_attempt_slp_rearrange_stmts with the big difference
+   that it can run before SLP_INSTANCE_LOADS is populated.
+
+   The function is also a bit more aggressive in that it will also force
+   grouping and re-ordering of loads in order to make them sequential if it is
+   safe to do so.  In the case of the complex multiplication instructions this
+   is always the case if vect_is_complex_transform_valid holds.  */
+
+static void
+vect_balance_load_nodes (slp_tree node, unsigned int group_size)
+{
+  auto_vec<slp_tree> unbalanced;
+  unbalanced.reserve_exact (group_size);
+
+  vect_balance_load_nodes_1 (node, group_size, unbalanced);
+
+  gcc_assert (unbalanced.is_empty ());
+}
+
+/* The COMPLEX_OPERATION enum denotes the possible pair of operations that can
+   be matched when looking for expressions that we are interested in matching
+   for complex number addition and mla.  */
+
+typedef enum _complex_operation {
+  PLUS_PLUS, MINUS_PLUS, PLUS_MINUS, MULT_MULT, NONE
+} complex_operation_t;
+
+/* Checks to see if the expression EXPR is a gimple assign with code CODE and if
+   this is the case the two operands of EXPR is returned in OP1 and OP2.
+
+   If the matching and extraction is successful TRUE is returned otherwise FALSE
+   in which case the value of OP1 and OP2 will not have been touched.  */
+
+static bool
+vect_match_expression_p (tree_code code, gimple *expr, tree *op1, tree* op2)
+{
+  if (!is_gimple_assign (expr)
+      || gimple_expr_code (expr) != code)
+    return false;
+
+  *op1 = gimple_assign_rhs1 (expr);
+  *op2 = gimple_assign_rhs2 (expr);
+  return true;
+}
+
+/* This function will match two gimple expressions STMT_0 and STMT_1 in parallel
+   and returns the pair operation that represents the two expressions in the
+   two statements.
+
+   If match is successful then the corresponding complex_operation is returned
+   and the arguments to the two matched operations are returned in OPS.
+
+   If unsuccessful then NONE is returned and OPS is untouched.
+
+   e.g. the following gimple statements
+
+   stmt 0 _39 = _37 + _12;
+   stmt 1 _6 = _38 - _36;
+
+   will return PLUS_MINUS along with OPS containing {_37, _12, _38, _36}.  */
+
+static complex_operation_t
+vect_detect_pair_op (gimple* stmt_0, gimple* stmt_1, vec<tree> ops)
+{
+  tree op1, op2, op3, op4;
+  complex_operation_t result = NONE;
+  if (vect_match_expression_p (MINUS_EXPR, stmt_0, &op1, &op2)
+      && vect_match_expression_p (PLUS_EXPR, stmt_1, &op3, &op4))
+    result = MINUS_PLUS;
+  else if (vect_match_expression_p (PLUS_EXPR, stmt_0, &op1, &op2)
+	   && vect_match_expression_p (MINUS_EXPR, stmt_1, &op3, &op4))
+    result = PLUS_MINUS;
+  else if (vect_match_expression_p (PLUS_EXPR, stmt_0, &op1, &op2)
+	   && vect_match_expression_p (PLUS_EXPR, stmt_1, &op3, &op4))
+    result = PLUS_PLUS;
+  else if (vect_match_expression_p (MULT_EXPR, stmt_0, &op1, &op2)
+	   && vect_match_expression_p (MULT_EXPR, stmt_1, &op3, &op4))
+    result = MULT_MULT;
+
+  if (result != NONE)
+    {
+      ops.safe_push (op1);
+      ops.safe_push (op2);
+      ops.safe_push (op3);
+      ops.safe_push (op4);
+    }
+  return result;
+}
+
+/* Helper function for vect_delete_slp_nodes.
+
+   Delete any nodes rooted in NODE that after the complex number internal
+   function replacement pattern statements have been made are now redundant and
+   unused.  They also would remain in the tree until DSE removed them, but we
+   can prune them much earlier and simplify the SLP tree.
+
+   When a node is removed, it's children are merged in PARENT but the order will
+   not be changed.  In order to prevent cyclic traversals a list of visited
+   nodes is kept in VISITED.  Any removed statements will be marked as unused.
+   The nodes are only removed from the SLP tree and the BB remains unchanged.
+
+   This function cannot delete the root which will contain the top level
+   expression we would want to replace.  */
+
+static bool
+vect_delete_slp_nodes_1 (slp_tree *parent, slp_tree node,
+			 hash_set<slp_tree> *visited)
+{
+  unsigned i;
+  slp_tree child;
+  if (visited->contains (node))
+    return false;
+
+  auto_vec<unsigned, 4> to_remove;
+  gimple *stmt = STMT_VINFO_STMT (SLP_TREE_SCALAR_STMTS (node).last ());
+  bool keep = gimple_assign_load_p (stmt)
+	      || gimple_assign_cast_p (stmt)
+	      || *parent == node;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    if (vect_delete_slp_nodes_1 (keep ? &node : parent, child, visited))
+      to_remove.safe_push (i);
+
+  unsigned idx;
+  FOR_EACH_VEC_ELT_REVERSE (to_remove, i, idx)
+    SLP_TREE_CHILDREN (node).ordered_remove (idx);
+
+  if (keep)
+    return false;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "Deleting node %p and re-linking children to %p\n", node,
+		     *parent);
+  SLP_TREE_CHILDREN (*parent).safe_splice (SLP_TREE_CHILDREN (node));
+  stmt_vec_info stmt_info;
+  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
+    STMT_VINFO_RELEVANT (stmt_info) = vect_unused_in_scope;
+
+  SLP_TREE_CHILDREN (node).release ();
+  SLP_TREE_SCALAR_STMTS (node).release ();
+  SLP_TREE_VEC_STMTS (node).release ();
+  SLP_TREE_LOAD_PERMUTATION (node).release ();
+  visited->add (node);
+  free (node);
+  return true;
+}
+
+/* Deletes any unneeded nodes from NODE and if needed merge them into PARENT.
+
+   See vect_delete_slp_nodes_1.  */
+
+static void
+vect_delete_slp_nodes (slp_tree *parent, slp_tree node)
+{
+  hash_set<slp_tree> visited;
+  vect_delete_slp_nodes_1 (parent, node, &visited);
+  visited.empty ();
+}
+
+/* Holds different supported pattern functions and their names.  */
+typedef struct  _complex_pattern
+{
+  bool (*patt_fn) (gimple *, gimple *, internal_fn *, vec<tree> *, vec<tree>*,
+		   vec_info *,hash_set<stmt_vec_info> *);
+  char name[50];
+  char optab[50];
+} complex_pattern_t;
+
+/* Helper function of vect_match_slp_patterns.
+
+   Attempts to match the given pattern PATT_INFO against the slp tree rooted in
+   NODE using VINFO and GROUP_SIZE.
+
+   If matching is successful the value in NODE is updated and returned, if not
+   then it is returned unchanged.  */
+
+static slp_tree
+vect_match_slp_patterns_1 (slp_tree node, vec_info *vinfo,
+			   unsigned int group_size, complex_pattern_t patt_info)
+{
+  int i;
+  stmt_vec_info stmt_info;
+  hash_set<stmt_vec_info> exempt;
+  auto_vec<tree> v1, v2;
+  vec<stmt_vec_info> scalar_stmts = SLP_TREE_SCALAR_STMTS (node);
+  bool keep_looking = false;
+
+  if (group_size != 2)
+    return node;
+
+  FOR_EACH_VEC_ELT_FROM (scalar_stmts, i, stmt_info, 1)
+    {
+      exempt.empty ();
+      stmt_vec_info opstmt_info_0 = scalar_stmts[i-1];
+      stmt_vec_info opstmt_info_1 = scalar_stmts[i];
+
+      gimple *stmt_0 = STMT_VINFO_STMT (opstmt_info_0);
+      gimple *stmt_1 = STMT_VINFO_STMT (opstmt_info_1);
+
+      gcc_assert (opstmt_info_0 && opstmt_info_1);
+
+      if (gimple_expr_type (stmt_0) != gimple_expr_type (stmt_1))
+	continue;
+
+      keep_looking = gimple_assign_cast_p (stmt_0)
+		     || gimple_assign_load_p (stmt_0)
+		     || gimple_store_p (stmt_0);
+      tree type = gimple_expr_type (stmt_0);
+      tree vectype = get_vectype_for_scalar_type (type);
+      internal_fn ifn;
+
+      if (!patt_info.patt_fn (stmt_0, stmt_1, &ifn, &v1, &v2, vinfo, &exempt)
+	  || ifn == IFN_LAST)
+	continue;
+
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "Found %s pattern in SLP tree\n", patt_info.name);
+
+      if (vectype
+	  && direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED))
+	{
+	  if (!vect_is_complex_transform_valid (node, exempt))
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "SLP Pattern failed. Unknown instruction in "
+				 "between the instruction and the loads, "
+				 "cannot guarantee contiguous vectors\n");
+	      continue;
+	    }
+
+	if (dump_enabled_p ())
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "Target supports complex vectorization, replacing "
+			   "nodes\n");
+
+	  /* Now push new statements.  */
+	  SLP_TREE_SCALAR_STMTS (node).truncate (0);
+
+	  gcall *call_stmt_0
+	    = vect_create_complex_patt_stmt (node, opstmt_info_0, ifn,
+					     v1, vinfo, type, vectype);
+	  gcall *call_stmt_1
+	    = vect_create_complex_patt_stmt (node, opstmt_info_1, ifn,
+					     v2, vinfo, type, vectype);
+
+	  /* Re-order the loads, first in the SLP tree.  */
+	  vect_delete_slp_nodes (&node, node);
+	  vect_balance_load_nodes (node, group_size);
+	  SLP_TREE_TWO_OPERATORS (node) = false;
+
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "Created patterns\n\tstmt(0): %G\tstmt(1): %G",
+			     call_stmt_0, call_stmt_1);
+	}
+      else
+        {
+	  if (dump_enabled_p ())
+	    {
+	      if (!vectype)
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "Target does not support vector type for %T\n",
+				 type);
+	      else
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "Target does not support %s for "
+				 "vector type %T\n", patt_info.optab, vectype);
+	    }
+        }
+      v1.release ();
+      v2.release ();
+    }
+
+  exempt.empty ();
+  /* If we haven't matched anything, look deeper.  */
+  if (keep_looking)
+    {
+      slp_tree child;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+	SLP_TREE_CHILDREN (node)[i]
+	  = vect_match_slp_patterns_1 (child, vinfo, group_size, patt_info);
+    }
+
+  return node;
+}
+
+/* Pattern matcher for trying to match complex addition pattern in SLP tree
+   using the two statements STMT_0 and STMT_1 as the root statements.  If the
+   operation matches then IFN is set to the operation it matched and the
+   arguments to the two replacement statements are put in V1 and V2.
+
+   If no match is found then IFN is set to IFN_LAST and V1 and V2 are unchanged.
+
+   This function matches the patterns shaped as:
+
+      c[i] = a[i] - b[i+1];
+      c[i+1] = a[i+1] + b[i];
+
+   If a match occurred then TRUE is returned, else FALSE.  The statements that
+   make up the body of the expression are returned in EXEMPT.  */
+
+static bool
+vect_match_call_complex_add (gimple *stmt_0, gimple *stmt_1, internal_fn *ifn,
+			     vec<tree> *v1, vec<tree> *v2,
+			     vec_info *vinfo ATTRIBUTE_UNUSED,
+			     hash_set<stmt_vec_info> *exempt ATTRIBUTE_UNUSED)
+{
+  *ifn = IFN_LAST;
+  auto_vec<tree, 4> args;
+  complex_operation_t op = vect_detect_pair_op (stmt_0, stmt_1, args);
+
+  /* Find the two components.  Rotation in the complex plane will modify
+     the operations:
+
+     * Rotation  0: + +
+     * Rotation 90: - +
+     * Rotation 180: - -
+     * Rotation 270: + -
+
+     Rotation 0 and 180 can be handled by normal SIMD code, so we don't need
+     to care about them here.  */
+  if (op == MINUS_PLUS)
+    *ifn = IFN_FCOMPLEX_ADD_ROT90;
+  else if (op == PLUS_MINUS)
+    *ifn = IFN_FCOMPLEX_ADD_ROT270;
+
+  if (*ifn == IFN_LAST)
+    return false;
+
+  /* If the two operands are the same, we've matched some intermediate
+     computation and it is not vectorizable using complex add, bail out.  */
+  if (args[0] == args[3] || args[2] == args[1])
+    return false;
+
+  v1->create (2);
+  v2->create (2);
+  v1->quick_push (args[0]);
+  v1->quick_push (args[3]);
+  v2->quick_push (args[2]);
+  v2->quick_push (args[1]);
+
+  return true;
+}
+
+/* Helper function of vect_match_call_complex_mla that looks up the definition
+   of LHS_0 and LHS_1 and tries to match the definition against pair ops.
+
+   If the match is successful then ARGS will contain the operands matched and
+   the complex_operation_t type is returned.  If match is not successful then
+   NONE is returned and ARGS is left unmodified.
+
+   Any statements inspected along the way are added to EXEMPT so the caller
+   can later mark them as consumed by the pattern.  */
+
+static complex_operation_t
+vect_match_call_complex_mla_1 (vec_info *vinfo, tree lhs_0, tree lhs_1,
+			       vec<tree> args, hash_set<stmt_vec_info> *exempt)
+{
+  /* Only SSA names have defining statements we can chase.  */
+  if (TREE_CODE (lhs_0) != SSA_NAME || TREE_CODE (lhs_1) != SSA_NAME)
+    return NONE;
+
+  gimple *stmt_0 = SSA_NAME_DEF_STMT (lhs_0);
+  gimple *stmt_1 = SSA_NAME_DEF_STMT (lhs_1);
+
+  /* Both lanes of the pair must compute values of the same type.  */
+  if (gimple_expr_type (stmt_0) != gimple_expr_type (stmt_1))
+    return NONE;
+
+  /* Record the statements making up the matched expression body.
+     NOTE(review): lookup_stmt returns NULL for statements outside the
+     region being vectorized; that NULL would be inserted into EXEMPT.
+     Confirm the defs can never come from outside the region, or guard
+     the additions.  */
+  exempt->add (vinfo->lookup_stmt (stmt_0));
+  exempt->add (vinfo->lookup_stmt (stmt_1));
+
+  return vect_detect_pair_op (stmt_0, stmt_1, args);
+}
+
+/* Pattern matcher for trying to match complex multiply and accumulate pattern
+   in SLP tree using the two statements STMT_0 and STMT_1 as the root
+   statements.  If the operation matches then IFN is set to the operation it
+   matched and the arguments to the two replacement statements are put in V1
+   and V2.
+
+   If no match is found then IFN is set to IFN_LAST and V1 and V2 are unchanged.
+
+   This function matches the patterns shaped as:
+
+      double ax = (b[i+1] * a[i]) + (b[i] * a[i]);
+      double bx = (a[i+1] * b[i]) - (a[i+1] * b[i+1]);
+
+      c[i] = c[i] - ax;
+      c[i+1] = c[i+1] + bx;
+
+   If a match occurred then TRUE is returned, else FALSE.  The statement that
+   make up the body of the expression is returned in EXEMPT.  */
+
+static bool
+vect_match_call_complex_mla (gimple *stmt_0, gimple *stmt_1, internal_fn *ifn,
+			     vec<tree> *v1, vec<tree> *v2, vec_info *vinfo,
+			     hash_set<stmt_vec_info> *exempt)
+{
+  *ifn = IFN_LAST;
+  /* Find the two components.  Rotation in the complex plane will modify
+     the operations:
+
+     * Rotation  0: + +
+     * Rotation 90: - +
+     * Rotation 180: - -
+     * Rotation 270: + -.  */
+  auto_vec<tree, 4> args1;
+  complex_operation_t op1 = vect_detect_pair_op (stmt_0, stmt_1, args1);
+
+  if (op1 == NONE)
+    return false;
+
+  /* Now operand2+4 must lead to another expression.  */
+  auto_vec<tree, 4> args2;
+  complex_operation_t op2
+    = vect_match_call_complex_mla_1 (vinfo, args1[1], args1[3], args2, exempt);
+
+  if (op2 != MINUS_PLUS && op2 != PLUS_MINUS)
+    return false;
+
+  /* The rotation is fully determined by OP1 and OP2, so decide it before
+     touching V1 and V2: this keeps the documented guarantee that V1 and V2
+     are unchanged when FALSE is returned, and avoids leaking the vector
+     storage the previous ordering allocated on the late failure paths.  */
+  if (op1 == PLUS_PLUS && op2 == MINUS_PLUS)
+    *ifn = IFN_FCOMPLEX_FMA_ROT0;
+  else if (op1 == PLUS_MINUS && op2 == MINUS_PLUS)
+    *ifn = IFN_FCOMPLEX_FMA_ROT180;
+
+  if (*ifn == IFN_LAST)
+    return false;
+
+  /* Now operand1+3 must lead to another expression.  */
+  auto_vec<tree, 4> args3;
+  complex_operation_t op3
+    = vect_match_call_complex_mla_1 (vinfo, args2[0], args2[2], args3, exempt);
+
+  if (op3 != MULT_MULT)
+    return false;
+
+  /* Now operand2+4 must lead to another expression.  */
+  auto_vec<tree, 4> args4;
+  complex_operation_t op4
+    = vect_match_call_complex_mla_1 (vinfo, args2[1], args2[3], args4, exempt);
+
+  if (op4 != MULT_MULT)
+    return false;
+
+  v1->create (3);
+  v2->create (3);
+
+  /* Canonicalize the arguments.  If the loads cannot be put in a usable
+     order, release V1 and V2 again so they are unchanged on failure.  */
+  if (!vect_determine_place_in_load (vinfo, args1[0], args1[2], v1, v2)
+      || !vect_determine_place_in_load (vinfo, args4[3], args3[2], v1, v2)
+      || !vect_determine_place_in_load (vinfo, args3[3], args4[2], v1, v2))
+    {
+      v1->release ();
+      v2->release ();
+      *ifn = IFN_LAST;
+      return false;
+    }
+
+  return true;
+}
+
+/* Applies pattern matching to the given SLP tree rooted in NODE using vec_info
+   VINFO and group size GROUP_SIZE.
+
+   The modified tree is returned.  Patterns are tried in order and multiple
+   patterns may match.  */
+
+static slp_tree
+vect_match_slp_patterns (slp_tree node, vec_info *vinfo,
+			 unsigned int group_size)
+{
+  DUMP_VECT_SCOPE ("vect_match_slp_patterns");
+
+  /* Table of recognizers, tried in this order; every entry is attempted
+     even after an earlier one has matched.  */
+  static complex_pattern_t patterns[2] = {
+    { vect_match_call_complex_mla, "Complex FMA", "IFN_COMPLEX_FMA" },
+    { vect_match_call_complex_add, "Complex Addition", "IFN_COMPLEX_ADD" },
+  };
+
+  complex_pattern_t *patt;
+  complex_pattern_t *end = patterns + sizeof (patterns) / sizeof (patterns[0]);
+  for (patt = patterns; patt != end; patt++)
+    node = vect_match_slp_patterns_1 (node, vinfo, group_size, *patt);
+
+  gcc_assert (node);
+  return node;
+}
+
 /* Analyze an SLP instance starting from a group of grouped stores.  Call
    vect_build_slp_tree to build a tree of packed stmts if possible.
    Return FALSE if it's impossible to SLP any stmt in the loop.  */
@@ -1994,6 +2747,9 @@ vect_analyze_slp_instance (vec_info *vinfo,
 	}
       else
 	{
+
+	  node = vect_match_slp_patterns (node, vinfo, group_size);
+
 	  /* Create a new SLP instance.  */
 	  new_instance = XNEW (struct _slp_instance);
 	  SLP_INSTANCE_TREE (new_instance) = node;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index e1292aa6eb6b743844869c5343b2c462429757f2..396ecb58889ff6a082bd7d2556194b2feec0bcbe 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1611,6 +1611,8 @@ extern void duplicate_and_interleave (gimple_seq *, tree, vec<tree>,
 extern int vect_get_place_in_interleaving_chain (stmt_vec_info, stmt_vec_info);
 
 /* In tree-vect-patterns.c.  */
+extern void vect_mark_pattern_stmts (stmt_vec_info, gimple *, tree);
+
 /* Pattern recognition functions.
    Additional pattern recognition functions can (and will) be added
    in the future.  */

Reply via email to