The following makes SSA rewrite (update-address-taken) recognize
sets of aligned sub-vectors in aligned position
(v2qi into v16qi, but esp. v8qi into v16qi).  It uses the
BIT_INSERT_EXPR support for this, enabling that for vector
typed values.  This makes us turn for example

typedef unsigned char v16qi __attribute__((vector_size(16)));
v16qi load (const void *p)
{
  v16qi r;
  __builtin_memcpy (&r, p, 8);
  return r;
}

into the following

load (const void * p)
{
  v16qi r;
  long unsigned int _3;
  v16qi _5;
  vector(8) unsigned char _7;

  <bb 2> :
  _3 = MEM[(char * {ref-all})p_2(D)];
  _7 = VIEW_CONVERT_EXPR<vector(8) unsigned char>(_3);
  r_9 = BIT_INSERT_EXPR <r_8(D), _7, 0 (64 bits)>;
  _5 = r_9;
  return _5;

This isn't yet nicely expanded, since the BIT_INSERT_EXPR
expansion simply goes through store_bit_field and there's
no vector-mode vec_set.

Similar to the single-element insert that SSA rewrite already
handles, the transform is conditional on the involved
vector types having a non-BLKmode mode.  This is somewhat bad
since the transform is supposed to enable SSA optimizations
by rewriting memory vectors into SSA form.  Since splitting
of larger generic vectors only happens much later,
this pessimizes their use.  But the BIT_INSERT_EXPR
expansion doesn't cope with BLKmode entities (source or
destination).

Extending BIT_INSERT_EXPR this way seems natural given
the support of CONSTRUCTORs with smaller vectors.
BIT_FIELD_REF isn't particularly restricted so can be
used to extract sub-vectors as well.

Code generation is as bad as before (RTL expansion eventually
spills) but SSA optimizations are enabled on less trivial
testcases.

Bootstrap / regtest running on x86_64-unknown-linux-gnu.

Comments?

Richard.

2019-05-14  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/90424
        * tree-ssa.c (non_rewritable_lvalue_p): Handle inserts from
        aligned subvectors.
        (execute_update_addresses_taken): Likewise.
        * tree-cfg.c (verify_gimple_assign_ternary): Likewise.

        * g++.target/i386/pr90424-1.C: New testcase.
        * g++.target/i386/pr90424-2.C: Likewise.

Index: gcc/tree-ssa.c
===================================================================
--- gcc/tree-ssa.c      (revision 271155)
+++ gcc/tree-ssa.c      (working copy)
@@ -1521,14 +1521,28 @@ non_rewritable_lvalue_p (tree lhs)
       if (DECL_P (decl)
          && VECTOR_TYPE_P (TREE_TYPE (decl))
          && TYPE_MODE (TREE_TYPE (decl)) != BLKmode
-         && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
-                             TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (decl))), 0)
+         && multiple_of_p (sizetype,
+                           TYPE_SIZE_UNIT (TREE_TYPE (decl)),
+                           TYPE_SIZE_UNIT (TREE_TYPE (lhs)))
          && known_ge (mem_ref_offset (lhs), 0)
          && known_gt (wi::to_poly_offset (TYPE_SIZE_UNIT (TREE_TYPE (decl))),
                       mem_ref_offset (lhs))
          && multiple_of_p (sizetype, TREE_OPERAND (lhs, 1),
                            TYPE_SIZE_UNIT (TREE_TYPE (lhs))))
-       return false;
+       {
+         if (operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
+                              TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (decl))),
+                              0))
+           return false;
+         /* For sub-vector inserts the insert vector mode has to be
+            supported.  */
+         tree vtype = build_vector_type
+             (TREE_TYPE (TREE_TYPE (decl)),
+              tree_to_uhwi (TYPE_SIZE (TREE_TYPE (lhs)))
+              / tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (decl)))));
+         if (TYPE_MODE (vtype) != BLKmode)
+           return false;
+       }
     }
 
   /* A vector-insert using a BIT_FIELD_REF is rewritable using
@@ -1866,9 +1880,9 @@ execute_update_addresses_taken (void)
                    && bitmap_bit_p (suitable_for_renaming, DECL_UID (sym))
                    && VECTOR_TYPE_P (TREE_TYPE (sym))
                    && TYPE_MODE (TREE_TYPE (sym)) != BLKmode
-                   && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
-                                       TYPE_SIZE_UNIT
-                                         (TREE_TYPE (TREE_TYPE (sym))), 0)
+                   && multiple_of_p (sizetype,
+                                     TYPE_SIZE_UNIT (TREE_TYPE (sym)),
+                                     TYPE_SIZE_UNIT (TREE_TYPE (lhs)))
                    && tree_fits_uhwi_p (TREE_OPERAND (lhs, 1))
                    && tree_int_cst_lt (TREE_OPERAND (lhs, 1),
                                        TYPE_SIZE_UNIT (TREE_TYPE (sym)))
@@ -1879,7 +1893,16 @@ execute_update_addresses_taken (void)
                    if (! types_compatible_p (TREE_TYPE (val),
                                              TREE_TYPE (TREE_TYPE (sym))))
                      {
-                       tree tem = make_ssa_name (TREE_TYPE (TREE_TYPE (sym)));
+                       tree temtype = TREE_TYPE (TREE_TYPE (sym));
+                       if (!operand_equal_p (TYPE_SIZE
+                                               (TREE_TYPE (TREE_TYPE (sym))),
+                                             TYPE_SIZE (TREE_TYPE (lhs)), 0))
+                         temtype = build_vector_type
+                           (temtype,
+                            tree_to_uhwi (TYPE_SIZE (TREE_TYPE (lhs)))
+                            / tree_to_uhwi (TYPE_SIZE (TREE_TYPE
+                                                         (TREE_TYPE (sym)))));
+                       tree tem = make_ssa_name (temtype);
                        gimple *pun
                          = gimple_build_assign (tem,
                                                 build1 (VIEW_CONVERT_EXPR,
Index: gcc/tree-cfg.c
===================================================================
--- gcc/tree-cfg.c      (revision 271155)
+++ gcc/tree-cfg.c      (working copy)
@@ -4263,8 +4263,17 @@ verify_gimple_assign_ternary (gassign *s
        }
       if (! ((INTEGRAL_TYPE_P (rhs1_type)
              && INTEGRAL_TYPE_P (rhs2_type))
+            /* Vector element insert.  */
             || (VECTOR_TYPE_P (rhs1_type)
-                && types_compatible_p (TREE_TYPE (rhs1_type), rhs2_type))))
+                && types_compatible_p (TREE_TYPE (rhs1_type), rhs2_type))
+            /* Aligned sub-vector insert.  */
+            || (VECTOR_TYPE_P (rhs1_type)
+                && VECTOR_TYPE_P (rhs2_type)
+                && types_compatible_p (TREE_TYPE (rhs1_type),
+                                       TREE_TYPE (rhs2_type))
+                && multiple_p (TYPE_VECTOR_SUBPARTS (rhs1_type),
+                               TYPE_VECTOR_SUBPARTS (rhs2_type))
+                && multiple_of_p (bitsizetype, rhs3, TYPE_SIZE (rhs2_type)))))
        {
          error ("not allowed type combination in BIT_INSERT_EXPR");
          debug_generic_expr (rhs1_type);
Index: gcc/testsuite/g++.target/i386/pr90424-1.C
===================================================================
--- gcc/testsuite/g++.target/i386/pr90424-1.C   (nonexistent)
+++ gcc/testsuite/g++.target/i386/pr90424-1.C   (working copy)
@@ -0,0 +1,32 @@
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */
+
+template <class T>
+using V [[gnu::vector_size(16)]] = T;
+
+template <class T, unsigned M = sizeof(V<T>)>
+V<T> load(const void *p) {
+  using W = V<T>;
+  W r;
+  __builtin_memcpy(&r, p, M);
+  return r;
+}
+
+// movq or movsd
+template V<char> load<char, 8>(const void *);     // bad
+template V<short> load<short, 8>(const void *);   // bad
+template V<int> load<int, 8>(const void *);       // bad
+template V<long> load<long, 8>(const void *);     // good
+// the following is disabled because V2SF isn't a supported mode
+// template V<float> load<float, 8>(const void *);   // bad
+template V<double> load<double, 8>(const void *); // good (movsd?)
+
+// movd or movss
+template V<char> load<char, 4>(const void *);   // bad
+template V<short> load<short, 4>(const void *); // bad
+template V<int> load<int, 4>(const void *);     // good
+template V<float> load<float, 4>(const void *); // good
+
+/* We should end up with one load and one insert for each function.  */
+/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 9 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "MEM" 9 "optimized" } } */
Index: gcc/testsuite/g++.target/i386/pr90424-2.C
===================================================================
--- gcc/testsuite/g++.target/i386/pr90424-2.C   (nonexistent)
+++ gcc/testsuite/g++.target/i386/pr90424-2.C   (working copy)
@@ -0,0 +1,31 @@
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */
+
+template <class T>
+using V [[gnu::vector_size(16)]] = T;
+
+template <class T, unsigned M = sizeof(V<T>)>
+V<T> load(const void *p) {
+  V<T> r = {};
+  __builtin_memcpy(&r, p, M);
+  return r;
+}
+
+// movq or movsd
+template V<char> load<char, 8>(const void *);     // bad
+template V<short> load<short, 8>(const void *);   // bad
+template V<int> load<int, 8>(const void *);       // bad
+template V<long> load<long, 8>(const void *);     // good
+// the following is disabled because V2SF isn't a supported mode
+// template V<float> load<float, 8>(const void *);   // bad
+template V<double> load<double, 8>(const void *); // good (movsd?)
+
+// movd or movss
+template V<char> load<char, 4>(const void *);   // bad
+template V<short> load<short, 4>(const void *); // bad
+template V<int> load<int, 4>(const void *);     // good
+template V<float> load<float, 4>(const void *); // good
+
+/* We should end up with one load and one insert for each function.  */
+/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 9 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "MEM" 9 "optimized" } } */

Reply via email to