The following makes SSA rewrite (update-address-taken) recognize sets of aligned sub-vectors in aligned position (v2qi into v16qi, but esp. v8qi into v16qi). It uses the BIT_INSERT_EXPR support for this, enabling that for vector typed values. This makes us turn for example
typedef unsigned char v16qi __attribute__((vector_size(16))); v16qi load (const void *p) { v16qi r; __builtin_memcpy (&r, p, 8); return r; } into the following load (const void * p) { v16qi r; long unsigned int _3; v16qi _5; vector(8) unsigned char _7; <bb 2> : _3 = MEM[(char * {ref-all})p_2(D)]; _7 = VIEW_CONVERT_EXPR<vector(8) unsigned char>(_3); r_9 = BIT_INSERT_EXPR <r_8(D), _7, 0 (64 bits)>; _5 = r_9; return _5; this isn't yet nicely expanded since the BIT_INSERT_EXPR expansion simply goes through store_bit_field and there's no vector-mode vec_set. Similar as to the single-element insert SSA rewrite already handles the transform is conditional on the involved vector types having non-BLKmode. This is somewhat bad since the transform is supposed to enable SSA optimizations by rewriting memory vectors into SSA form. Since splitting of larger generic vectors happens very much later only this pessimizes their use. But the BIT_INSERT_EXPR expansion doesn't cope with BLKmode entities (source or destination). Extending BIT_INSERT_EXPR this way seems natural given the support of CONSTRUCTORs with smaller vectors. BIT_FIELD_REF isn't particularly restricted so can be used to extract sub-vectors as well. Code generation is as bad as before (RTL expansion eventually spills) but SSA optimizations are enabled on less trivial testcases. Boostrap / regtest running on x86_64-unknown-linux-gnu. Comments? Richard. 2019-05-14 Richard Biener <rguent...@suse.de> PR tree-optimization/90424 * tree-ssa.c (non_rewritable_lvalue_p): Handle inserts from aligned subvectors. (execute_update_addresses_taken): Likewise. * tree-cfg.c (verify_gimple_assign_ternary): Likewise. * g++.target/i386/pr90424-1.C: New testcase. * g++.target/i386/pr90424-2.C: Likewise. Index: gcc/tree-ssa.c =================================================================== --- gcc/tree-ssa.c (revision 271155) +++ gcc/tree-ssa.c (working copy) @@ -1521,14 +1521,28 @@ non_rewritable_lvalue_p (tree lhs) if (DECL_P (decl) && VECTOR_TYPE_P (TREE_TYPE (decl)) && TYPE_MODE (TREE_TYPE (decl)) != BLKmode - && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)), - TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (decl))), 0) + && multiple_of_p (sizetype, + TYPE_SIZE_UNIT (TREE_TYPE (decl)), + TYPE_SIZE_UNIT (TREE_TYPE (lhs))) && known_ge (mem_ref_offset (lhs), 0) && known_gt (wi::to_poly_offset (TYPE_SIZE_UNIT (TREE_TYPE (decl))), mem_ref_offset (lhs)) && multiple_of_p (sizetype, TREE_OPERAND (lhs, 1), TYPE_SIZE_UNIT (TREE_TYPE (lhs)))) - return false; + { + if (operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)), + TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (decl))), + 0)) + return false; + /* For sub-vector inserts the insert vector mode has to be + supported. */ + tree vtype = build_vector_type + (TREE_TYPE (TREE_TYPE (decl)), + tree_to_uhwi (TYPE_SIZE (TREE_TYPE (lhs))) + / tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (decl))))); + if (TYPE_MODE (vtype) != BLKmode) + return false; + } } /* A vector-insert using a BIT_FIELD_REF is rewritable using @@ -1866,9 +1880,9 @@ execute_update_addresses_taken (void) && bitmap_bit_p (suitable_for_renaming, DECL_UID (sym)) && VECTOR_TYPE_P (TREE_TYPE (sym)) && TYPE_MODE (TREE_TYPE (sym)) != BLKmode - && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)), - TYPE_SIZE_UNIT - (TREE_TYPE (TREE_TYPE (sym))), 0) + && multiple_of_p (sizetype, + TYPE_SIZE_UNIT (TREE_TYPE (sym)), + TYPE_SIZE_UNIT (TREE_TYPE (lhs))) && tree_fits_uhwi_p (TREE_OPERAND (lhs, 1)) && tree_int_cst_lt (TREE_OPERAND (lhs, 1), TYPE_SIZE_UNIT (TREE_TYPE (sym))) @@ -1879,7 +1893,16 @@ execute_update_addresses_taken (void) if (! types_compatible_p (TREE_TYPE (val), TREE_TYPE (TREE_TYPE (sym)))) { - tree tem = make_ssa_name (TREE_TYPE (TREE_TYPE (sym))); + tree temtype = TREE_TYPE (TREE_TYPE (sym)); + if (!operand_equal_p (TYPE_SIZE + (TREE_TYPE (TREE_TYPE (sym))), + TYPE_SIZE (TREE_TYPE (lhs)), 0)) + temtype = build_vector_type + (temtype, + tree_to_uhwi (TYPE_SIZE (TREE_TYPE (lhs))) + / tree_to_uhwi (TYPE_SIZE (TREE_TYPE + (TREE_TYPE (sym))))); + tree tem = make_ssa_name (temtype); gimple *pun = gimple_build_assign (tem, build1 (VIEW_CONVERT_EXPR, Index: gcc/tree-cfg.c =================================================================== --- gcc/tree-cfg.c (revision 271155) +++ gcc/tree-cfg.c (working copy) @@ -4263,8 +4263,17 @@ verify_gimple_assign_ternary (gassign *s } if (! ((INTEGRAL_TYPE_P (rhs1_type) && INTEGRAL_TYPE_P (rhs2_type)) + /* Vector element insert. */ || (VECTOR_TYPE_P (rhs1_type) - && types_compatible_p (TREE_TYPE (rhs1_type), rhs2_type)))) + && types_compatible_p (TREE_TYPE (rhs1_type), rhs2_type)) + /* Aligned sub-vector insert. */ + || (VECTOR_TYPE_P (rhs1_type) + && VECTOR_TYPE_P (rhs2_type) + && types_compatible_p (TREE_TYPE (rhs1_type), + TREE_TYPE (rhs2_type)) + && multiple_p (TYPE_VECTOR_SUBPARTS (rhs1_type), + TYPE_VECTOR_SUBPARTS (rhs2_type)) + && multiple_of_p (bitsizetype, rhs3, TYPE_SIZE (rhs2_type))))) { error ("not allowed type combination in BIT_INSERT_EXPR"); debug_generic_expr (rhs1_type); Index: gcc/testsuite/g++.target/i386/pr90424-1.C =================================================================== --- gcc/testsuite/g++.target/i386/pr90424-1.C (nonexistent) +++ gcc/testsuite/g++.target/i386/pr90424-1.C (working copy) @@ -0,0 +1,32 @@ +/* { dg-do compile { target c++11 } } */ +/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */ + +template <class T> +using V [[gnu::vector_size(16)]] = T; + +template <class T, unsigned M = sizeof(V<T>)> +V<T> load(const void *p) { + using W = V<T>; + W r; + __builtin_memcpy(&r, p, M); + return r; +} + +// movq or movsd +template V<char> load<char, 8>(const void *); // bad +template V<short> load<short, 8>(const void *); // bad +template V<int> load<int, 8>(const void *); // bad +template V<long> load<long, 8>(const void *); // good +// the following is disabled because V2SF isn't a supported mode +// template V<float> load<float, 8>(const void *); // bad +template V<double> load<double, 8>(const void *); // good (movsd?) + +// movd or movss +template V<char> load<char, 4>(const void *); // bad +template V<short> load<short, 4>(const void *); // bad +template V<int> load<int, 4>(const void *); // good +template V<float> load<float, 4>(const void *); // good + +/* We should end up with one load and one insert for each function. */ +/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 9 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "MEM" 9 "optimized" } } */ Index: gcc/testsuite/g++.target/i386/pr90424-2.C =================================================================== --- gcc/testsuite/g++.target/i386/pr90424-2.C (nonexistent) +++ gcc/testsuite/g++.target/i386/pr90424-2.C (working copy) @@ -0,0 +1,31 @@ +/* { dg-do compile { target c++11 } } */ +/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */ + +template <class T> +using V [[gnu::vector_size(16)]] = T; + +template <class T, unsigned M = sizeof(V<T>)> +V<T> load(const void *p) { + V<T> r = {}; + __builtin_memcpy(&r, p, M); + return r; +} + +// movq or movsd +template V<char> load<char, 8>(const void *); // bad +template V<short> load<short, 8>(const void *); // bad +template V<int> load<int, 8>(const void *); // bad +template V<long> load<long, 8>(const void *); // good +// the following is disabled because V2SF isn't a supported mode +// template V<float> load<float, 8>(const void *); // bad +template V<double> load<double, 8>(const void *); // good (movsd?) + +// movd or movss +template V<char> load<char, 4>(const void *); // bad +template V<short> load<short, 4>(const void *); // bad +template V<int> load<int, 4>(const void *); // good +template V<float> load<float, 4>(const void *); // good + +/* We should end up with one load and one insert for each function. */ +/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 9 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "MEM" 9 "optimized" } } */